[viennacl] 01/09: New upstream version 1.5.1

Toby St Clere Smithe tsmithe-guest at moszumanska.debian.org
Wed Feb 19 19:09:52 UTC 2014


This is an automated email from the git hooks/post-receive script.

tsmithe-guest pushed a commit to branch master
in repository viennacl.

commit d19020e60ba4975326d49dc4eb6e36200ffada34
Author: Toby Smithe <git at tsmithe.net>
Date:   Wed Feb 19 16:10:36 2014 +0000

    New upstream version 1.5.1
---
 CL/cl_gl_ext.h                                     |    2 +-
 CMakeLists.txt                                     |   10 +-
 LICENSE                                            |    7 +-
 README                                             |   53 +-
 auxiliary/CMakeLists.txt                           |  303 -
 auxiliary/README                                   |   17 -
 .../compressed_matrix/align1/bicgstab_kernel1.cl   |   54 -
 .../compressed_matrix/align1/bicgstab_kernel2.cl   |   81 -
 auxiliary/compressed_matrix/align1/jacobi.cl       |   28 -
 .../compressed_matrix/align1/jacobi_precond.cl     |   26 -
 auxiliary/compressed_matrix/align1/lu_backward.cl  |  115 -
 auxiliary/compressed_matrix/align1/lu_forward.cl   |  107 -
 .../compressed_matrix/align1/row_scaling_1.cl      |   20 -
 .../compressed_matrix/align1/row_scaling_2.cl      |   24 -
 auxiliary/compressed_matrix/align1/vec_mul.cl      |   21 -
 auxiliary/compressed_matrix/align4/vec_mul.cl      |   37 -
 auxiliary/compressed_matrix/align8/vec_mul.cl      |   42 -
 auxiliary/compressed_matrix/matrix.old_cl          |  226 -
 auxiliary/converter.cpp                            |  379 -
 auxiliary/coordinate_matrix/align1/vec_mul.cl      |  126 -
 auxiliary/coordinate_matrix/align128/dummy         |    1 -
 auxiliary/coordinate_matrix/matrix.old_cl          |  822 --
 auxiliary/fft/align1/bluestein_post.cl             |   23 -
 auxiliary/fft/align1/bluestein_pre.cl              |   34 -
 auxiliary/fft/align1/complex_to_real.cl            |    8 -
 auxiliary/fft/align1/fft_div_vec_scalar.cl         |    7 -
 auxiliary/fft/align1/fft_mult_vec.cl               |   13 -
 auxiliary/fft/align1/real_to_complex.cl            |   11 -
 auxiliary/fft/align1/reverse_inplace.cl            |   11 -
 auxiliary/fft/align1/transpose.cl                  |   16 -
 auxiliary/fft/align1/transpose_inplace.cl          |   22 -
 auxiliary/fft/align1/vandermonde_prod.cl           |   19 -
 auxiliary/fft/align1/zero2.cl                      |   11 -
 auxiliary/generate-blas3-prod-align1.cpp           |  277 -
 auxiliary/generate-blas3-solve-align1.cpp          |  183 -
 auxiliary/matrix_col/align1/add.cl                 |   29 -
 auxiliary/matrix_col/align1/assign.cl              |   22 -
 auxiliary/matrix_col/align1/clear.cl               |   14 -
 auxiliary/matrix_col/align1/cpu_inplace_mult.cl    |   16 -
 auxiliary/matrix_col/align1/fft_direct.cl          |   29 -
 auxiliary/matrix_col/align1/fft_radix2.cl          |   39 -
 auxiliary/matrix_col/align1/fft_radix2_local.cl    |   74 -
 auxiliary/matrix_col/align1/fft_reorder.cl         |   38 -
 auxiliary/matrix_col/align1/inplace_add.cl         |   21 -
 auxiliary/matrix_col/align1/inplace_divide.cl      |   17 -
 auxiliary/matrix_col/align1/inplace_mult.cl        |   18 -
 auxiliary/matrix_col/align1/inplace_sub.cl         |   22 -
 .../align1/lower_triangular_substitute_inplace.cl  |   26 -
 auxiliary/matrix_col/align1/lu_factorize.cl        |   27 -
 auxiliary/matrix_col/align1/rank1_update.cl        |   21 -
 auxiliary/matrix_col/align1/scaled_rank1_update.cl |   22 -
 auxiliary/matrix_col/align1/sub.cl                 |   29 -
 .../trans_lower_triangular_substitute_inplace.cl   |   26 -
 ...ans_unit_lower_triangular_substitute_inplace.cl |   22 -
 ...ans_unit_upper_triangular_substitute_inplace.cl |   24 -
 .../trans_upper_triangular_substitute_inplace.cl   |   28 -
 auxiliary/matrix_col/align1/trans_vec_mul.cl       |   22 -
 .../unit_lower_triangular_substitute_inplace.cl    |   22 -
 .../unit_upper_triangular_substitute_inplace.cl    |   23 -
 .../align1/upper_triangular_substitute_inplace.cl  |   27 -
 auxiliary/matrix_col/align1/vec_mul.cl             |   22 -
 auxiliary/matrix_col/align16/dummy                 |    1 -
 auxiliary/matrix_col/matrix.old_cl                 |  120 -
 auxiliary/matrix_row/align1/add.cl                 |   30 -
 auxiliary/matrix_row/align1/assign.cl              |   22 -
 auxiliary/matrix_row/align1/clear.cl               |   14 -
 auxiliary/matrix_row/align1/cpu_inplace_mult.cl    |   15 -
 auxiliary/matrix_row/align1/fft_direct.cl          |   32 -
 auxiliary/matrix_row/align1/fft_radix2.cl          |   46 -
 auxiliary/matrix_row/align1/fft_radix2_local.cl    |   72 -
 auxiliary/matrix_row/align1/fft_reorder.cl         |   42 -
 auxiliary/matrix_row/align1/inplace_add.cl         |   22 -
 auxiliary/matrix_row/align1/inplace_divide.cl      |   16 -
 auxiliary/matrix_row/align1/inplace_mult.cl        |   18 -
 auxiliary/matrix_row/align1/inplace_sub.cl         |   21 -
 .../align1/lower_triangular_substitute_inplace.cl  |   26 -
 auxiliary/matrix_row/align1/lu_factorize.cl        |   31 -
 auxiliary/matrix_row/align1/rank1_update.cl        |   23 -
 auxiliary/matrix_row/align1/scaled_rank1_update.cl |   24 -
 auxiliary/matrix_row/align1/sub.cl                 |   29 -
 .../trans_lower_triangular_substitute_inplace.cl   |   26 -
 ...ans_unit_lower_triangular_substitute_inplace.cl |   22 -
 ...ans_unit_upper_triangular_substitute_inplace.cl |   24 -
 .../trans_upper_triangular_substitute_inplace.cl   |   28 -
 auxiliary/matrix_row/align1/trans_vec_mul.cl       |   22 -
 .../unit_lower_triangular_substitute_inplace.cl    |   22 -
 .../unit_upper_triangular_substitute_inplace.cl    |   23 -
 .../align1/upper_triangular_substitute_inplace.cl  |   27 -
 auxiliary/matrix_row/align1/vec_mul.cl             |   22 -
 auxiliary/matrix_row/align16/dummy                 |    1 -
 auxiliary/matrix_row/matrix.old_cl                 |  120 -
 auxiliary/scalar/align1/add.cl                     |   10 -
 auxiliary/scalar/align1/cpu_add.cl                 |   10 -
 auxiliary/scalar/align1/cpu_div.cl                 |   10 -
 auxiliary/scalar/align1/cpu_inplace_add.cl         |    9 -
 auxiliary/scalar/align1/cpu_inplace_div.cl         |   10 -
 auxiliary/scalar/align1/cpu_inplace_mul.cl         |    9 -
 auxiliary/scalar/align1/cpu_inplace_sub.cl         |   10 -
 auxiliary/scalar/align1/cpu_mul.cl                 |   10 -
 auxiliary/scalar/align1/cpu_sub.cl                 |   10 -
 auxiliary/scalar/align1/divide.cl                  |   12 -
 auxiliary/scalar/align1/inplace_add.cl             |    9 -
 auxiliary/scalar/align1/inplace_div.cl             |    9 -
 auxiliary/scalar/align1/inplace_mul.cl             |    9 -
 auxiliary/scalar/align1/inplace_sub.cl             |    9 -
 auxiliary/scalar/align1/mul.cl                     |   10 -
 auxiliary/scalar/align1/sub.cl                     |   10 -
 auxiliary/spai/align1/assemble_blocks.cl           |   60 -
 auxiliary/spai/align1/block_bv_assembly.cl         |   33 -
 auxiliary/spai/align1/block_least_squares.cl       |   68 -
 auxiliary/spai/align1/block_q_mult.cl              |   74 -
 auxiliary/spai/align1/block_qr.cl                  |  130 -
 auxiliary/spai/align1/block_qr_assembly.cl         |   57 -
 auxiliary/spai/align1/block_qr_assembly_1.cl       |   36 -
 auxiliary/spai/align1/block_r_assembly.cl          |   68 -
 auxiliary/vector/align1/add.cl                     |   16 -
 auxiliary/vector/align1/clear.cl                   |   10 -
 auxiliary/vector/align1/cpu_inplace_mul_add.cl     |   14 -
 auxiliary/vector/align1/cpu_inplace_mult.cl        |   11 -
 auxiliary/vector/align1/cpu_mul_add.cl             |   18 -
 auxiliary/vector/align1/cpu_mult.cl                |   15 -
 auxiliary/vector/align1/diag_precond.cl            |   12 -
 auxiliary/vector/align1/divide.cl                  |   16 -
 auxiliary/vector/align1/index_norm_inf.cl          |   56 -
 auxiliary/vector/align1/inner_prod.cl              |   56 -
 auxiliary/vector/align1/inplace_add.cl             |   13 -
 auxiliary/vector/align1/inplace_div_add.cl         |   15 -
 auxiliary/vector/align1/inplace_div_sub.cl         |   16 -
 auxiliary/vector/align1/inplace_divide.cl          |   12 -
 auxiliary/vector/align1/inplace_mul_add.cl         |   16 -
 auxiliary/vector/align1/inplace_mul_sub.cl         |   16 -
 auxiliary/vector/align1/inplace_mult.cl            |   13 -
 auxiliary/vector/align1/inplace_sub.cl             |   13 -
 auxiliary/vector/align1/mul_add.cl                 |   20 -
 auxiliary/vector/align1/mul_sub.cl                 |   20 -
 auxiliary/vector/align1/mult.cl                    |   15 -
 auxiliary/vector/align1/norm_1.cl                  |   45 -
 auxiliary/vector/align1/norm_2.cl                  |   48 -
 auxiliary/vector/align1/norm_inf.cl                |   39 -
 auxiliary/vector/align1/plane_rotation.cl          |   26 -
 auxiliary/vector/align1/sqrt_sum.cl                |   21 -
 auxiliary/vector/align1/sub.cl                     |   16 -
 auxiliary/vector/align1/sum.cl                     |   20 -
 auxiliary/vector/align1/swap.cl                    |   21 -
 auxiliary/vector/align1/vmax.cl                    |   20 -
 auxiliary/vector/align16/add.cl                    |   18 -
 auxiliary/vector/align16/cpu_inplace_mul.cl        |   12 -
 auxiliary/vector/align16/cpu_mult.cl               |   15 -
 auxiliary/vector/align16/divide.cl                 |   18 -
 auxiliary/vector/align16/inplace_add.cl            |   14 -
 auxiliary/vector/align16/inplace_divide.cl         |   14 -
 auxiliary/vector/align16/inplace_mult.cl           |   13 -
 auxiliary/vector/align16/inplace_sub.cl            |   15 -
 auxiliary/vector/align16/mult.cl                   |   16 -
 auxiliary/vector/align16/sub.cl                    |   18 -
 auxiliary/vector/align4/cpu_inplace_mul_add.cl     |   15 -
 auxiliary/vector/align4/cpu_mul_add.cl             |   18 -
 auxiliary/vector/align4/inner_prod.cl_disabled     |   40 -
 auxiliary/vector/align4/inplace_div_add.cl         |   18 -
 auxiliary/vector/align4/inplace_div_sub.cl         |   18 -
 auxiliary/vector/align4/inplace_mul_add.cl         |   16 -
 auxiliary/vector/align4/inplace_mul_sub.cl         |   17 -
 auxiliary/vector/align4/mul_add.cl                 |   19 -
 auxiliary/vector/align4/norm_2.cl_disabled         |   47 -
 changelog                                          |  154 +-
 cmake/FindMTL.cmake                                |   11 +-
 cmake/FindOpenCL.cmake                             |    6 +
 cmake/ViennaCLCommon.cmake                         |   59 +-
 doc/CMakeLists.txt                                 |    3 +
 doc/Doxyfile.in                                    | 1312 +--
 doc/manual/additional-algorithms.tex               |  221 +
 doc/manual/algorithms.tex                          |  316 +-
 doc/manual/benchmarks.tex                          |    2 +-
 doc/manual/changelogs.tex                          |  192 +-
 doc/manual/contributors.tex                        |   34 +-
 doc/manual/cover.tex                               |   10 +-
 doc/manual/custom-contexts.tex                     |   13 +-
 doc/manual/custom-kernels.tex                      |   30 +-
 doc/manual/design.tex                              |   11 +-
 doc/manual/figures/TU_Signet_CMYK.eps              |   10 +-
 doc/manual/installation.tex                        |  206 +-
 doc/manual/introduction.tex                        |   51 +-
 doc/manual/kernel-generation.tex                   |   39 +
 doc/manual/keywords.tex                            |    4 +-
 doc/manual/license.tex                             |   10 +-
 doc/manual/memory-model.tex                        |   46 +
 doc/manual/multi-device.tex                        |   34 +-
 doc/manual/operations.tex                          |  202 +-
 doc/manual/other-libs.tex                          |   36 +-
 doc/manual/setup.tex                               |    2 +-
 doc/manual/shared-lib.tex                          |   14 +
 doc/manual/structured-matrices.tex                 |   98 +
 doc/manual/tuning.tex                              |    4 +-
 doc/manual/types.tex                               |  282 +-
 doc/manual/versioning.tex                          |    2 +-
 doc/manual/viennacl.bib                            |   41 +-
 doc/manual/viennacl.tex                            |   78 +-
 examples/CMakeLists.txt                            |    4 +-
 examples/autotuner/CMakeLists.txt                  |   12 +
 examples/autotuner/command-line-utils.hpp          |   55 +
 examples/autotuner/dot_autotuning.cpp              |  270 +
 examples/autotuner/dump_default_kernels.cpp        |   81 +
 examples/autotuner/gemm_autotuning.cpp             |  382 +
 examples/autotuner/gemv_autotuning.cpp             |  266 +
 examples/autotuner/vector-axpy_autotuning.cpp      |  270 +
 examples/benchmarks/CMakeLists.txt                 |   67 +-
 examples/benchmarks/benchmark-utils.hpp            |  197 +-
 examples/benchmarks/blas3.cpp                      |  388 +-
 examples/benchmarks/blas3.cu                       |  246 +
 examples/benchmarks/copy.cpp                       |  189 +
 examples/benchmarks/copy.cu                        |  189 +
 examples/benchmarks/generator_blas1.cpp            |  135 +
 examples/benchmarks/generator_blas2.cpp            |  127 +
 examples/benchmarks/generator_blas3.cpp            |  129 +
 examples/benchmarks/iccs_qr.cpp                    |  139 -
 examples/benchmarks/io.hpp                         |  229 +-
 examples/benchmarks/opencl.cpp                     |  288 +-
 examples/benchmarks/qr.cpp                         |   19 +-
 examples/benchmarks/{qr.cpp => qr.cu}              |   19 +-
 examples/benchmarks/{opencl.cpp => scheduler.cpp}  |  293 +-
 examples/benchmarks/solver.cpp                     | 1046 ++-
 examples/benchmarks/solver.cu                      |  645 ++
 examples/benchmarks/sparse.cpp                     |  557 +-
 examples/benchmarks/{sparse.cpp => sparse.cu}      |  557 +-
 examples/benchmarks/vector.cpp                     |  522 +-
 examples/benchmarks/{vector.cpp => vector.cu}      |  522 +-
 examples/parameters/CMakeLists.txt                 |   25 -
 examples/parameters/benchmark-utils.hpp            |   98 -
 examples/parameters/common.hpp                     |  251 -
 examples/parameters/common_vprof.hpp               |   53 -
 examples/parameters/matrix.cpp                     |  267 -
 examples/parameters/matrix_functors.hpp            |   91 -
 examples/parameters/parameter_reader.cpp           |   65 -
 examples/parameters/sparse.cpp                     |  245 -
 examples/parameters/vector.cpp                     |  254 -
 examples/parameters/vector_functors.hpp            |  204 -
 examples/testdata/eigen/nsm1.example               |    6 +
 examples/testdata/eigen/nsm2.example               |   12 +
 examples/testdata/eigen/nsm3.example               |  273 +
 examples/testdata/eigen/nsm4.example               |  921 ++
 examples/testdata/eigen/symm1.example              |    6 +
 examples/testdata/eigen/symm2.example              |   12 +
 examples/testdata/eigen/symm3.example              |  922 ++
 examples/testdata/svd/pysvd.example                |   11 +
 examples/testdata/svd/qr.example                   |    6 +
 examples/testdata/svd/random.example               | 1003 ++
 examples/testdata/svd/wiki.example                 |    7 +
 examples/testdata/svd/wiki.qr.example              |    6 +
 examples/tutorial/CMakeLists.txt                   |  109 +-
 examples/tutorial/CMakeLists.txt~                  |   32 -
 examples/tutorial/Random.hpp                       |  104 +-
 examples/tutorial/amg.cpp                          |  120 +-
 examples/tutorial/bandwidth-reduction.cpp          |  109 +-
 examples/tutorial/blas1.cpp                        |  421 +-
 examples/tutorial/{blas1.cpp => blas1.cu}          |  421 +-
 examples/tutorial/blas2.cpp                        |  521 +-
 examples/tutorial/{blas2.cpp => blas2.cu}          |  521 +-
 examples/tutorial/blas3.cpp                        |  378 +-
 examples/tutorial/{blas3.cpp => blas3.cu}          |  378 +-
 examples/tutorial/blas3range.cpp                   |  424 +-
 .../tutorial/{blas3range.cpp => blas3range.cu}     |  424 +-
 examples/tutorial/custom-context.cpp               |  562 +-
 examples/tutorial/custom-kernels.cpp               |  276 +-
 examples/tutorial/eigen-with-viennacl.cpp          |   27 +-
 examples/tutorial/fft.cpp                          |   40 +-
 examples/tutorial/iterative-eigen.cpp              |   48 +-
 examples/tutorial/iterative-mtl4.cpp               |   69 +-
 examples/tutorial/iterative-ublas.cpp              |  312 +-
 examples/tutorial/iterative.cpp                    |  472 +-
 examples/tutorial/{iterative.cpp => iterative.cu}  |  472 +-
 examples/tutorial/lanczos.cpp                      |   85 +
 examples/tutorial/lanczos.cu                       |   85 +
 examples/tutorial/least-squares.cpp                |  144 +
 examples/tutorial/least-squares.cu                 |  144 +
 examples/tutorial/libviennacl.cpp                  |  105 +
 examples/tutorial/libviennacl.cu                   |  105 +
 examples/tutorial/matrix-range.cpp                 |   56 +-
 .../tutorial/{matrix-range.cpp => matrix-range.cu} |   56 +-
 examples/tutorial/mtl4-with-viennacl.cpp           |   13 +-
 examples/tutorial/multithreaded.cpp                |  127 +
 examples/tutorial/multithreaded_cg.cpp             |  185 +
 examples/tutorial/power-iter.cpp                   |   79 +
 examples/tutorial/power-iter.cu                    |   79 +
 examples/tutorial/qr.cpp                           |   83 +-
 examples/tutorial/{qr.cpp => qr.cu}                |   83 +-
 examples/tutorial/rand.cpp                         |   70 +
 examples/tutorial/scheduler.cpp                    |  130 +
 examples/tutorial/spai.cpp                         |  102 +-
 examples/tutorial/sparse.cpp                       |  235 +-
 examples/tutorial/{sparse.cpp => sparse.cu}        |  235 +-
 examples/tutorial/structured-matrices.cpp          |   36 +-
 examples/tutorial/vector-io.hpp                    |  345 +-
 examples/tutorial/vector-range.cpp                 |   55 +-
 .../tutorial/{vector-range.cpp => vector-range.cu} |   55 +-
 examples/tutorial/viennacl-info.cpp                |   92 +-
 examples/tutorial/wrap-cuda-buffer.cu              |  121 +
 examples/tutorial/wrap-host-buffer.cpp             |   86 +
 external/pugixml/src/pugiconfig.hpp                |   62 -
 external/pugixml/src/pugixml.cpp                   | 9576 --------------------
 external/pugixml/src/pugixml.hpp                   | 1131 ---
 external/tclap/Arg.h                               |  692 ++
 external/tclap/ArgException.h                      |  200 +
 external/tclap/ArgTraits.h                         |   87 +
 external/tclap/COPYING                             |   25 +
 external/tclap/CmdLine.h                           |  633 ++
 external/tclap/CmdLineInterface.h                  |  150 +
 external/tclap/CmdLineOutput.h                     |   74 +
 external/tclap/Constraint.h                        |   68 +
 external/tclap/DocBookOutput.h                     |  299 +
 external/tclap/HelpVisitor.h                       |   76 +
 external/tclap/IgnoreRestVisitor.h                 |   52 +
 external/tclap/MultiArg.h                          |  433 +
 external/tclap/MultiSwitchArg.h                    |  216 +
 external/tclap/OptionalUnlabeledTracker.h          |   62 +
 external/tclap/StandardTraits.h                    |  208 +
 external/tclap/StdOutput.h                         |  298 +
 external/tclap/SwitchArg.h                         |  266 +
 external/tclap/UnlabeledMultiArg.h                 |  301 +
 external/tclap/UnlabeledValueArg.h                 |  340 +
 external/tclap/ValueArg.h                          |  425 +
 external/tclap/ValuesConstraint.h                  |  148 +
 external/tclap/VersionVisitor.h                    |   81 +
 external/tclap/Visitor.h                           |   53 +
 external/tclap/XorHandler.h                        |  166 +
 external/tclap/ZshCompletionOutput.h               |  323 +
 libviennacl/CMakeLists.txt                         |   35 +
 libviennacl/include/viennacl.hpp                   |  607 ++
 libviennacl/src/backend.cpp                        |   46 +
 libviennacl/src/backend.cu                         |   46 +
 libviennacl/src/blas1.cpp                          |  402 +
 libviennacl/src/blas1.cu                           |  402 +
 libviennacl/src/blas1_cuda.cu                      |  264 +
 libviennacl/src/blas1_host.cpp                     |  257 +
 libviennacl/src/blas1_host.cu                      |  257 +
 libviennacl/src/blas1_opencl.cpp                   |  261 +
 libviennacl/src/blas1_opencl.cu                    |  261 +
 libviennacl/src/blas2.cpp                          |  309 +
 libviennacl/src/blas2.cu                           |  309 +
 libviennacl/src/blas2_cuda.cu                      |  286 +
 libviennacl/src/blas2_host.cpp                     |  283 +
 libviennacl/src/blas2_host.cu                      |  283 +
 libviennacl/src/blas2_opencl.cpp                   |  283 +
 libviennacl/src/blas2_opencl.cu                    |  283 +
 libviennacl/src/blas3.cpp                          |  970 ++
 libviennacl/src/blas3.cu                           |  970 ++
 libviennacl/src/blas3.hpp                          |   60 +
 libviennacl/src/blas3_cuda.cu                      |  249 +
 libviennacl/src/blas3_host.cpp                     |  243 +
 libviennacl/src/blas3_host.cu                      |  243 +
 libviennacl/src/blas3_opencl.cpp                   |  249 +
 libviennacl/src/blas3_opencl.cu                    |  249 +
 libviennacl/src/init_matrix.hpp                    |  101 +
 libviennacl/src/init_vector.hpp                    |  101 +
 libviennacl/src/viennacl_private.hpp               |  141 +
 tests/CMakeLists.txt                               |  123 +-
 {examples/tutorial => tests/src}/Random.hpp        |  105 +-
 tests/src/blas3_prod_double.cpp                    |   65 +
 tests/src/blas3_prod_double.cu                     |   65 +
 tests/src/blas3_prod_float.cpp                     |   61 +
 tests/src/blas3_prod_float.cu                      |   61 +
 tests/src/blas3_prod_float_double.hpp              |  855 ++
 tests/src/{blas3.cpp => blas3_solve_double.cpp}    |  510 +-
 .../src/{blas3range.cpp => blas3_solve_double.cu}  |  519 +-
 tests/src/{blas3.cpp => blas3_solve_float.cpp}     |  510 +-
 tests/src/{blas3.cpp => blas3_solve_float.cu}      |  510 +-
 tests/src/blas3_solve_float_double.hpp             |  514 ++
 tests/src/external_1.cpp                           |   47 +-
 tests/src/{external_1.cpp => external_1.cu}        |   47 +-
 tests/src/external_2.cpp                           |   37 +-
 tests/src/{external_1.cpp => external_2.cu}        |   52 +-
 tests/src/fft.cpp                                  |  169 +-
 tests/src/generator_blas1.cpp                      |  524 ++
 tests/src/generator_blas2.cpp                      |  261 +
 tests/src/generator_blas3.cpp                      |  424 +
 tests/src/global_variables.cpp                     |   85 +
 tests/src/global_variables.cu                      |   85 +
 tests/src/iterators.cpp                            |   21 +-
 tests/src/{iterators.cpp => iterators.cu}          |   21 +-
 tests/src/libviennacl_blas1.cpp                    |  668 ++
 tests/src/libviennacl_blas1.cu                     |  668 ++
 tests/src/libviennacl_blas2.cpp                    |  265 +
 tests/src/libviennacl_blas2.cu                     |  265 +
 tests/src/libviennacl_blas3.cpp                    |  623 ++
 tests/src/libviennacl_blas3.cu                     |  623 ++
 tests/src/matrix.cpp                               |  532 --
 tests/src/matrix_col_double.cpp                    |   52 +
 tests/src/matrix_col_double.cu                     |   52 +
 tests/src/matrix_col_float.cpp                     |   45 +
 tests/src/matrix_col_float.cu                      |   45 +
 tests/src/matrix_col_int.cpp                       |   48 +
 tests/src/matrix_col_int.cu                        |   48 +
 tests/src/matrix_float_double.hpp                  | 1304 +++
 tests/src/matrix_int.hpp                           | 1107 +++
 tests/src/matrix_range.cpp                         |  489 -
 tests/src/matrix_row_double.cpp                    |   51 +
 tests/src/matrix_row_double.cu                     |   51 +
 tests/src/matrix_row_float.cpp                     |   44 +
 tests/src/matrix_row_float.cu                      |   44 +
 tests/src/matrix_row_int.cpp                       |   48 +
 tests/src/matrix_row_int.cu                        |   48 +
 tests/src/matrix_vector.cpp                        | 1146 +++
 tests/src/matrix_vector.cu                         | 1146 +++
 tests/src/matrix_vector_int.cpp                    |  823 ++
 tests/src/matrix_vector_int.cu                     |  823 ++
 tests/src/nmf.cpp                                  |  120 +
 tests/src/qr_method.cpp                            |  277 +
 tests/src/scalar.cpp                               |  390 +-
 tests/src/scalar.cu                                |  461 +
 tests/src/scheduler_matrix.cpp                     |  920 ++
 tests/src/scheduler_matrix_matrix.cpp              |  954 ++
 tests/src/scheduler_matrix_vector.cpp              |  945 ++
 tests/src/scheduler_sparse.cpp                     |  456 +
 tests/src/scheduler_vector.cpp                     |  697 ++
 tests/src/sparse.cpp                               | 1016 ++-
 tests/src/sparse.cu                                |  891 ++
 tests/src/spmdm.cpp                                |  339 +
 tests/src/spmdm.cu                                 |  339 +
 tests/src/structured-matrices.cpp                  |  216 +-
 tests/src/svd.cpp                                  |  311 +
 tests/src/vector.cpp                               |  705 --
 tests/src/vector_double.cpp                        |   66 +
 tests/src/vector_double.cu                         |   66 +
 tests/src/vector_float.cpp                         |   62 +
 tests/src/vector_float.cu                          |   62 +
 tests/src/vector_float_double.hpp                  | 1717 ++++
 tests/src/vector_int.cpp                           | 1523 ++++
 tests/src/vector_int.cu                            | 1523 ++++
 tests/src/vector_multi_inner_prod.cpp              |  584 ++
 tests/src/vector_multi_inner_prod.cu               |  584 ++
 tests/src/vector_range.cpp                         |  254 -
 tests/src/vector_uint.cpp                          |  966 ++
 tests/src/vector_uint.cu                           |  966 ++
 viennacl/backend/cpu_ram.hpp                       |  143 +
 viennacl/backend/cuda.hpp                          |  190 +
 viennacl/backend/mem_handle.hpp                    |  225 +
 viennacl/backend/memory.hpp                        |  630 ++
 viennacl/backend/opencl.hpp                        |  146 +
 viennacl/backend/util.hpp                          |  280 +
 viennacl/circulant_matrix.hpp                      |  194 +-
 viennacl/compressed_compressed_matrix.hpp          |  588 ++
 viennacl/compressed_matrix.hpp                     | 1522 ++--
 viennacl/context.hpp                               |   88 +
 viennacl/coordinate_matrix.hpp                     |  812 +-
 viennacl/ell_matrix.hpp                            |  296 +
 viennacl/fft.hpp                                   |  402 +-
 viennacl/forwards.h                                |  851 +-
 viennacl/generator/autotune.hpp                    |  208 +
 viennacl/generator/forwards.h                      |  142 +
 viennacl/generator/generate.hpp                    |  408 +
 viennacl/generator/helpers.hpp                     |  286 +
 viennacl/generator/map_functor.hpp                 |  170 +
 viennacl/generator/mapped_objects.hpp              |  343 +
 viennacl/generator/matrix_product.hpp              |  716 ++
 viennacl/generator/profile_base.hpp                |  194 +
 viennacl/generator/profiles.hpp                    |  340 +
 viennacl/generator/saxpy.hpp                       |  210 +
 viennacl/generator/scalar_reduction.hpp            |  362 +
 viennacl/generator/set_arguments_functor.hpp       |  139 +
 .../generator/statement_representation_functor.hpp |  172 +
 viennacl/generator/utils.hpp                       |  274 +
 viennacl/generator/vector_reduction.hpp            |  243 +
 viennacl/hankel_matrix.hpp                         |  181 +-
 viennacl/hyb_matrix.hpp                            |  368 +
 viennacl/io/kernel_parameters.hpp                  |  446 -
 viennacl/io/matrix_market.hpp                      |  113 +-
 viennacl/linalg/amg.hpp                            |  472 +-
 viennacl/linalg/bicgstab.hpp                       |  238 +-
 viennacl/linalg/bisect.hpp                         |  176 +
 viennacl/linalg/cg.hpp                             |  122 +-
 viennacl/linalg/circulant_matrix_operations.hpp    |  175 +-
 viennacl/linalg/compressed_matrix_operations.hpp   |  265 -
 viennacl/linalg/coordinate_matrix_operations.hpp   |  222 -
 viennacl/linalg/cuda/common.hpp                    |  189 +
 viennacl/linalg/cuda/direct_solve.hpp              |  523 ++
 viennacl/linalg/cuda/matrix_operations.hpp         | 2539 ++++++
 viennacl/linalg/cuda/matrix_operations_col.hpp     | 1423 +++
 viennacl/linalg/cuda/matrix_operations_prod.hpp    | 2886 ++++++
 viennacl/linalg/cuda/matrix_operations_row.hpp     | 1419 +++
 viennacl/linalg/cuda/misc_operations.hpp           |   93 +
 viennacl/linalg/cuda/scalar_operations.hpp         |  380 +
 viennacl/linalg/cuda/sparse_matrix_operations.hpp  | 1831 ++++
 .../linalg/cuda/sparse_matrix_operations_solve.hpp |  761 ++
 viennacl/linalg/cuda/vector_operations.hpp         | 2790 ++++++
 viennacl/linalg/detail/amg/amg_base.hpp            |  712 +-
 viennacl/linalg/detail/amg/amg_coarse.hpp          |  921 +-
 viennacl/linalg/detail/amg/amg_debug.hpp           |   31 +-
 viennacl/linalg/detail/amg/amg_interpol.hpp        |  555 +-
 viennacl/linalg/detail/ilu/block_ilu.hpp           |  463 +
 viennacl/linalg/detail/ilu/common.hpp              |  263 +
 viennacl/linalg/detail/ilu/ilu0.hpp                |  381 +
 viennacl/linalg/detail/ilu/ilut.hpp                |  486 +
 viennacl/linalg/detail/op_applier.hpp              |  103 +
 viennacl/linalg/detail/op_executor.hpp             |   85 +
 viennacl/linalg/detail/spai/block_matrix.hpp       |   49 +-
 viennacl/linalg/detail/spai/block_vector.hpp       |   38 +-
 viennacl/linalg/detail/spai/fspai.hpp              |  219 +-
 viennacl/linalg/detail/spai/qr.hpp                 |  304 +-
 viennacl/linalg/detail/spai/small_matrix.hpp       |   37 +-
 viennacl/linalg/detail/spai/spai-dynamic.hpp       |  893 +-
 viennacl/linalg/detail/spai/spai-static.hpp        |  232 +-
 viennacl/linalg/detail/spai/spai.hpp               |  599 +-
 viennacl/linalg/detail/spai/spai_tag.hpp           |  105 +-
 viennacl/linalg/detail/spai/sparse_vector.hpp      |   60 +-
 viennacl/linalg/direct_solve.hpp                   |  779 +-
 .../bandwidth_reduction.hpp => linalg/eig.hpp}     |   28 +-
 viennacl/linalg/gmres.hpp                          |  345 +-
 viennacl/linalg/hankel_matrix_operations.hpp       |  176 +-
 viennacl/linalg/host_based/common.hpp              |  166 +
 viennacl/linalg/host_based/direct_solve.hpp        |  418 +
 viennacl/linalg/host_based/matrix_operations.hpp   | 1177 +++
 viennacl/linalg/host_based/misc_operations.hpp     |   80 +
 viennacl/linalg/host_based/scalar_operations.hpp   |  162 +
 .../linalg/host_based/sparse_matrix_operations.hpp | 1603 ++++
 viennacl/linalg/host_based/sse_blas.hpp            | 1013 +++
 viennacl/linalg/host_based/sse_kernels.hpp         |  590 ++
 viennacl/linalg/host_based/vector_operations.hpp   |  621 ++
 viennacl/linalg/ichol.hpp                          |  228 +
 viennacl/linalg/ilu.hpp                            |  380 +-
 viennacl/linalg/inner_prod.hpp                     |  194 +-
 viennacl/linalg/jacobi_precond.hpp                 |  140 +-
 viennacl/linalg/lanczos.hpp                        |  490 +
 viennacl/linalg/lu.hpp                             |  227 +
 viennacl/linalg/matrix_operations.hpp              | 1344 +--
 viennacl/linalg/misc_operations.hpp                |   94 +
 viennacl/linalg/mixed_precision_cg.hpp             |  254 +
 viennacl/linalg/nmf.hpp                            |  200 +
 viennacl/linalg/norm_1.hpp                         |   74 +-
 viennacl/linalg/norm_2.hpp                         |  157 +-
 viennacl/linalg/norm_frobenius.hpp                 |   73 +
 viennacl/linalg/norm_inf.hpp                       |   79 +-
 viennacl/linalg/opencl/common.hpp                  |   95 +
 viennacl/linalg/opencl/direct_solve.hpp            |  232 +
 .../kernels/compressed_compressed_matrix.hpp       |   89 +
 .../linalg/opencl/kernels/compressed_matrix.hpp    | 1096 +++
 .../linalg/opencl/kernels/coordinate_matrix.hpp    |  382 +
 viennacl/linalg/opencl/kernels/ell_matrix.hpp      |  195 +
 viennacl/linalg/opencl/kernels/fft.hpp             |  294 +
 viennacl/linalg/opencl/kernels/hyb_matrix.hpp      |  214 +
 viennacl/linalg/opencl/kernels/ilu.hpp             |   90 +
 viennacl/linalg/opencl/kernels/matrix.hpp          |  932 ++
 viennacl/linalg/opencl/kernels/matrix_element.hpp  |  138 +
 viennacl/linalg/opencl/kernels/matrix_prod.hpp     |  485 +
 viennacl/linalg/opencl/kernels/matrix_solve.hpp    |  212 +
 viennacl/linalg/opencl/kernels/nmf.hpp             |   82 +
 viennacl/linalg/opencl/kernels/scalar.hpp          |  266 +
 viennacl/linalg/opencl/kernels/spai.hpp            |  614 ++
 viennacl/linalg/opencl/kernels/svd.hpp             |  560 ++
 viennacl/linalg/opencl/kernels/vector.hpp          |  688 ++
 viennacl/linalg/opencl/kernels/vector_element.hpp  |  155 +
 viennacl/linalg/opencl/matrix_operations.hpp       |  998 ++
 viennacl/linalg/opencl/misc_operations.hpp         |   72 +
 viennacl/linalg/opencl/scalar_operations.hpp       |  201 +
 .../linalg/opencl/sparse_matrix_operations.hpp     |  940 ++
 .../opencl/vandermonde_matrix_operations.hpp       |   68 +
 viennacl/linalg/opencl/vector_operations.hpp       |  975 ++
 viennacl/linalg/power_iter.hpp                     |  118 +
 viennacl/linalg/prod.hpp                           |  285 +-
 viennacl/linalg/qr-method-common.hpp               |  225 +
 viennacl/linalg/qr-method.hpp                      |  952 ++
 viennacl/linalg/qr.hpp                             | 1036 +--
 viennacl/linalg/row_scaling.hpp                    |  217 +-
 viennacl/linalg/scalar_operations.hpp              |  242 +
 viennacl/linalg/spai.hpp                           |  151 +-
 viennacl/linalg/sparse_matrix_operations.hpp       |  375 +
 viennacl/linalg/svd.hpp                            |  532 ++
 viennacl/linalg/toeplitz_matrix_operations.hpp     |  190 +-
 viennacl/linalg/tred2.hpp                          |   68 +
 viennacl/linalg/vandermonde_matrix_operations.hpp  |  185 +-
 viennacl/linalg/vector_operations.hpp              | 1554 ++--
 viennacl/matrix.hpp                                | 4092 ++++++---
 viennacl/matrix_proxy.hpp                          |  740 +-
 viennacl/meta/enable_if.hpp                        |   90 +-
 viennacl/meta/predicate.hpp                        |  626 +-
 viennacl/meta/result_of.hpp                        |  850 +-
 viennacl/meta/tag_of.hpp                           |  115 +-
 viennacl/misc/bandwidth_reduction.hpp              |   13 +-
 viennacl/misc/cuthill_mckee.hpp                    |  772 +-
 viennacl/misc/gibbs_poole_stockmeyer.hpp           |  187 +-
 viennacl/ocl/backend.hpp                           |  116 +-
 viennacl/ocl/command_queue.hpp                     |  182 +-
 viennacl/ocl/context.hpp                           |  461 +-
 viennacl/ocl/device.hpp                            | 1722 +++-
 viennacl/ocl/device_utils.hpp                      |  155 +
 viennacl/ocl/enqueue.hpp                           |  270 +-
 viennacl/ocl/error.hpp                             | 1260 +--
 viennacl/ocl/forwards.h                            |   17 +-
 viennacl/ocl/handle.hpp                            |  422 +-
 viennacl/ocl/infos.hpp                             |  268 +
 viennacl/ocl/kernel.hpp                            | 1390 +--
 viennacl/ocl/local_mem.hpp                         |   21 +-
 viennacl/ocl/platform.hpp                          |   76 +-
 viennacl/ocl/program.hpp                           |   66 +-
 viennacl/ocl/utils.hpp                             |   48 +-
 viennacl/rand/gaussian.hpp                         |   54 +
 viennacl/rand/uniform.hpp                          |   56 +
 viennacl/rand/utils.hpp                            |   71 +
 viennacl/range.hpp                                 |   27 +-
 viennacl/scalar.hpp                                | 1269 +--
 viennacl/scheduler/execute.hpp                     |  247 +
 viennacl/scheduler/execute_axbx.hpp                |  379 +
 viennacl/scheduler/execute_elementwise.hpp         |  466 +
 viennacl/scheduler/execute_generic_dispatcher.hpp  |  135 +
 viennacl/scheduler/execute_matrix_dispatcher.hpp   |  210 +
 viennacl/scheduler/execute_matrix_prod.hpp         |  498 +
 viennacl/scheduler/execute_scalar_assign.hpp       |  189 +
 viennacl/scheduler/execute_scalar_dispatcher.hpp   |  131 +
 viennacl/scheduler/execute_util.hpp                |  253 +
 viennacl/scheduler/execute_vector_dispatcher.hpp   |  191 +
 viennacl/scheduler/forwards.h                      |  710 ++
 viennacl/scheduler/io.hpp                          |  290 +
 viennacl/{range.hpp => slice.hpp}                  |   60 +-
 viennacl/toeplitz_matrix.hpp                       |  191 +-
 viennacl/tools/adapter.hpp                         |  262 +-
 viennacl/tools/entry_proxy.hpp                     |  144 +-
 viennacl/tools/matrix_kernel_class_deducer.hpp     |   67 -
 .../tools/matrix_prod_kernel_class_deducer.hpp     |  160 -
 viennacl/tools/matrix_size_deducer.hpp             |  201 +-
 .../tools/matrix_solve_kernel_class_deducer.hpp    |   77 -
 viennacl/tools/shared_ptr.hpp                      |  163 +
 viennacl/tools/timer.hpp                           |  122 +
 viennacl/tools/tools.hpp                           |  628 +-
 viennacl/traits/clear.hpp                          |  147 +-
 viennacl/traits/context.hpp                        |   66 +
 viennacl/traits/fill.hpp                           |  139 +-
 viennacl/traits/handle.hpp                         |  320 +-
 viennacl/traits/size.hpp                           |  545 +-
 viennacl/traits/start.hpp                          |  198 +-
 viennacl/traits/stride.hpp                         |   75 +
 viennacl/vandermonde_matrix.hpp                    |  182 +-
 viennacl/vector.hpp                                | 4988 ++++++----
 viennacl/vector_proxy.hpp                          |  293 +-
 631 files changed, 133555 insertions(+), 47453 deletions(-)

diff --git a/CL/cl_gl_ext.h b/CL/cl_gl_ext.h
index 26e4782..7c9b64c 100644
--- a/CL/cl_gl_ext.h
+++ b/CL/cl_gl_ext.h
@@ -41,7 +41,7 @@ extern "C" {
 
 /*
  * For each extension, follow this template
- * /* cl_VEN_extname extension  */
+ *  cl_VEN_extname extension  */
 /* #define cl_VEN_extname 1
  * ... define new types, if any
  * ... define new tokens, if any
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ffe5f6a..bd38b04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,7 +21,7 @@ SET(VIENNACL_SRC_DIST ON)
 
 # For out-of-the-box support on MacOS:
 IF(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-  INCLUDE_DIRECTORIES("/opt/local/include") 
+  INCLUDE_DIRECTORIES("/opt/local/include")
   set(CMAKE_EXE_LINKER_FLAGS "-framework OpenCL")
 ENDIF(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 
@@ -30,7 +30,7 @@ ENDIF(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 ################
 
 set(VERSION_MAJOR 1)
-set(VERSION_MINOR 2)
+set(VERSION_MINOR 5)
 set(VERSION_PATCH 1)
 set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH})
 
@@ -73,8 +73,6 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR})
 # Subdirectories
 ################
 
-add_subdirectory(auxiliary)
-
 if(BUILD_EXAMPLES)
    add_subdirectory(examples)
 endif()
@@ -85,9 +83,11 @@ endif()
 
 add_subdirectory(doc)
 
+add_subdirectory(libviennacl)
+
 # Install
 #########
 
-install(DIRECTORY viennacl
+install(DIRECTORY viennacl ${CMAKE_CURRENT_BINARY_DIR}/viennacl
    DESTINATION ${INSTALL_INCLUDE_DIR} COMPONENT dev
    FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")
diff --git a/LICENSE b/LICENSE
index fd650fb..297e11d 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,9 @@
-Copyright (c) 2010, Institute for Microelectronics, TU Wien
+Copyright (c) 2010-2014 Institute for Microelectronics,
+                        Institute for Analysis and Scientific Computing, TU Wien.
+Portions of this software are copyright by UChicago Argonne, LLC.
+Argonne National Laboratory, with facilities in the state of Illinois,
+is owned by The United States Government, and operated by UChicago Argonne, LLC
+under provision of a contract with the Department of Energy.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README b/README
index a7cf5f9..b4ea993 100644
--- a/README
+++ b/README
@@ -26,9 +26,12 @@ ViennaCL requires the following:
 The first step is to extract the file:
 
 Unix-based OS:
-$> gunzip ViennaCL-1.2.1.tar.gz
-$> tar -xf ViennaCL-1.2.1.tar
-$> cd ViennaCL-1.2.1
+$> gunzip ViennaCL-1.5.1.tar.gz
+$> tar -xf ViennaCL-1.5.1.tar
+$> cd ViennaCL-1.5.1
+
+Windows:
+Extract the file using your favorite compressor/decompressor, e.g. 7-zip.
 
 ViennaCL is a header-only library, therefore it is sufficient to copy the subfolder viennacl/ (holding the header files) into you project directory or your system include directory. For instructions on how to set the include paths correctly, please refer to the documentation of your compiler.
 
@@ -37,45 +40,67 @@ For building the examples, proceed as follows:
 Unix-based clients:
  * change to the build directory:
    $> cd build
+
  * call cmake
    $> cmake ..
+
+ * If CMake cannot find the OpenCL files, e.g. 'Could NOT find OPENCL (missing:  OPENCL_INCLUDE_DIR)', please set the paths manually, e.g.
+   $> cmake -DOPENCL_INCLUDE_DIR=../CL ..
+   Or, use the CMake GUI.
+   $> cmake-gui
+   Or, disable openCL if you are not planning to use parallel programming components
+   $> cmake .. -DENABLE_OPENCL=0
+
  * Use 'make' to build all examples:
    $> make
    (alternatively, you can build them individually via 'make blas1', 'make viennacl-info', etc.
+
  * Start the tutorials (optional)
-   $> ./blas1
-   $> ./custom-kernels
-   $> ./viennacl-info
+   $> examples/blas1
+   $> examples/custom-kernels
+   $> examples/viennacl-info
    (...)
 
+Windows:
+ * Open the CMake GUI
+ * Set the source code location ('Where is the source code:') to the extracted ViennaCL-1.x.x folder
+ * Set the build folder ('Where to build the binaries:') to the subfolder build/ in the ViennaCL-1.x.x folder.
+ * Click on 'Configure' and select your Compiler
+ * Click on 'Configure' again
+ * Click on 'Generate'
+ * Navigate to the build/ folder, open the generated project files with your favorite IDE, and build them.
 
 3. Project Layout
 -----------------
 
 ---- ViennaCL-1.X.X
    |
-   |-- auxiliary/ - Auxiliary files (i.e. the OpenCL source code tree and the converter for the header files)
-   | 
+   |-- auxiliary/ - (only in src-Edition) Auxiliary files (i.e. the OpenCL source code tree and the converter for the header files)
+   |
+   |-- build/ - Build directory for building the examples
+   |
    |-- CL/ - The OpenCL headers
    |
    |-- cmake/ - Additional CMake configuration files
    |
    |-- doc/ - Documentation (LaTeX and doxygen)
-   | 
+   |
    |-- examples/ - Tutorial and benchmarking applications
-        | 
+        |
         |-- testdata/ - Test data for the tutorials and benchmarks
-        | 
+        |
         |-- benchmarks/ - A small benchmarking suite
         |
         |-- tutorial/ - Some tutorials explaining the usage of ViennaCL
         |
         |-- parameters/ - Parameter optimization environment
-   | 
+   |
    |-- external/ - External libraries
    |
+   |-- libviennacl/ - Shared library for interfacing some BLAS functionality of ViennaCL from languages other than C++
+   |
    |-- tests/ - Automated test suite using CTest
-   | 
+   |
    |-- viennacl/ - The library source code
 
 
@@ -84,7 +109,7 @@ Unix-based clients:
 
 For any technical questions related to ViennaCL, please use our mailing list: viennacl-support at lists.sourceforge.net
 You may also use the forum provided by sourceforge.net: http://sourceforge.net/projects/viennacl/
-For any other issues, please contact the project head Karl Rupp  at rupp at iue.tuwien.ac.at.
+For any other issues, please contact the project head Karl Rupp at rupp at iue.tuwien.ac.at.
 
 ViennaCL was developed under the aegis of the 'Institute for Microelectronics' at the 'Vienna University of Technology'.
 
diff --git a/auxiliary/CMakeLists.txt b/auxiliary/CMakeLists.txt
deleted file mode 100644
index d009a3f..0000000
--- a/auxiliary/CMakeLists.txt
+++ /dev/null
@@ -1,303 +0,0 @@
-include_directories(${Boost_INCLUDE_DIRS})
-
-add_executable(generate-blas3-solve-align1 generate-blas3-solve-align1.cpp)
-add_executable(generate-blas3-prod-align1 generate-blas3-prod-align1.cpp)
-
-function(generate_blas3_prod_align1 outvar)
-   set(crstr_0 col)
-   set(crstr_1 row)
-   set(ATstr_0 A)
-   set(ATstr_1 T)
-   set(outfiles)
-
-   foreach(ar 0 1) # A is column/row major
-   foreach(br 0 1) # B is column/row major
-   foreach(cr 0 1) # C is column/row major
-   foreach(at 0 1) # A is (not) transposed
-   foreach(bt 0 1) # B is (not) transposed
-      set(d "${CMAKE_CURRENT_BINARY_DIR}")
-      set(d "${d}/matrix_prod_${crstr_${ar}}_${crstr_${br}}_${crstr_${cr}}")
-      set(d "${d}/align1")
-      file(MAKE_DIRECTORY "${d}")
-      set(o "${d}/prod_${ATstr_${at}}${ATstr_${bt}}.cl")
-      file(RELATIVE_PATH ro "${CMAKE_CURRENT_BINARY_DIR}" "${o}")
-      add_custom_command(OUTPUT "${o}"
-         COMMAND generate-blas3-prod-align1
-            ${ar} ${br} ${cr} ${at} ${bt} > "${o}"
-         COMMENT "Generating ${ro}"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-         VERBATIM)
-      list(APPEND outfiles "${o}")
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   set(${outvar} "${outfiles}" PARENT_SCOPE)
-endfunction()
-
-function(generate_blas3_solve_align1 outvar)
-   set(crstr_0 col)
-   set(crstr_1 row)
-   set(tstr_0)
-   set(tstr_1 trans_)
-   set(ulstr_0 lower)
-   set(ulstr_1 upper)
-   set(unitstr_0)
-   set(unitstr_1 unit_)
-   set(outfiles)
-
-   foreach(ar 0 1) # A is column/row major
-   foreach(br 0 1) # A is column/row major
-   foreach(at 0 1) # A is transposed
-   foreach(bt 0 1) # B is transposed
-   foreach(ul 0 1) # upper/lower
-   foreach(un 0 1) # unit
-      set(d "${CMAKE_CURRENT_BINARY_DIR}")
-      set(d "${d}/matrix_solve_${crstr_${ar}}_${crstr_${br}}")
-      set(d "${d}/align1")
-      file(MAKE_DIRECTORY "${d}")
-      set(o "${d}/${tstr_${at}}${unitstr_${un}}${ulstr_${ul}}_${tstr_${bt}}solve.cl")
-      file(RELATIVE_PATH ro "${CMAKE_CURRENT_BINARY_DIR}" "${o}")
-      add_custom_command(OUTPUT "${o}"
-         COMMAND generate-blas3-solve-align1
-            ${ar} ${br} ${at} ${bt} ${ul} ${un} > "${o}"
-         COMMENT "Generating ${ro}"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-         VERBATIM)
-      list(APPEND outfiles "${o}")
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   endforeach()
-   set(${outvar} "${outfiles}" PARENT_SCOPE)
-endfunction()
-
-# Matrix-Matrix products
-generate_blas3_prod_align1(MATRIX_PROD_SRCS)
-
-# Matrix-Matrix triangular solver
-generate_blas3_solve_align1(MATRIX_SOLVE_SRCS)
-
-set(COMPRESSED_MATRIX_SRCS
-   compressed_matrix/align1/bicgstab_kernel1.cl
-   compressed_matrix/align1/bicgstab_kernel2.cl
-   compressed_matrix/align1/jacobi.cl
-   compressed_matrix/align1/jacobi_precond.cl
-   compressed_matrix/align1/lu_backward.cl
-   compressed_matrix/align1/lu_forward.cl
-   compressed_matrix/align1/row_scaling_1.cl
-   compressed_matrix/align1/row_scaling_2.cl
-   compressed_matrix/align1/vec_mul.cl
-   compressed_matrix/align4/vec_mul.cl
-   compressed_matrix/align8/vec_mul.cl)
-
-set(COORDINATE_MATRIX_SRCS
-   coordinate_matrix/align1/vec_mul.cl
-   coordinate_matrix/align128/dummy)
-
-set(MATRIX_COL_SRCS
-   matrix_col/align1/add.cl
-   matrix_col/align1/assign.cl
-   matrix_col/align1/clear.cl
-   matrix_col/align1/cpu_inplace_mult.cl
-   matrix_col/align1/fft_direct.cl
-   matrix_col/align1/fft_radix2.cl
-   matrix_col/align1/fft_radix2_local.cl
-   matrix_col/align1/fft_reorder.cl
-   matrix_col/align1/inplace_add.cl
-   matrix_col/align1/inplace_divide.cl
-   matrix_col/align1/inplace_mult.cl
-   matrix_col/align1/inplace_sub.cl
-   matrix_col/align1/lower_triangular_substitute_inplace.cl
-   matrix_col/align1/lu_factorize.cl
-   matrix_col/align1/rank1_update.cl
-   matrix_col/align1/scaled_rank1_update.cl
-   matrix_col/align1/sub.cl
-   matrix_col/align1/trans_lower_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_unit_lower_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_unit_upper_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_upper_triangular_substitute_inplace.cl
-   matrix_col/align1/trans_vec_mul.cl
-   matrix_col/align1/unit_lower_triangular_substitute_inplace.cl
-   matrix_col/align1/unit_upper_triangular_substitute_inplace.cl
-   matrix_col/align1/upper_triangular_substitute_inplace.cl
-   matrix_col/align1/vec_mul.cl
-   matrix_col/align16/dummy)
-
-set(MATRIX_ROW_SRCS
-   matrix_row/align1/add.cl
-   matrix_row/align1/assign.cl
-   matrix_row/align1/clear.cl
-   matrix_row/align1/cpu_inplace_mult.cl
-   matrix_row/align1/fft_direct.cl
-   matrix_row/align1/fft_radix2.cl
-   matrix_row/align1/fft_radix2_local.cl
-   matrix_row/align1/fft_reorder.cl
-   matrix_row/align1/inplace_add.cl
-   matrix_row/align1/inplace_divide.cl
-   matrix_row/align1/inplace_mult.cl
-   matrix_row/align1/inplace_sub.cl
-   matrix_row/align1/lower_triangular_substitute_inplace.cl
-   matrix_row/align1/lu_factorize.cl
-   matrix_row/align1/rank1_update.cl
-   matrix_row/align1/scaled_rank1_update.cl
-   matrix_row/align1/sub.cl
-   matrix_row/align1/trans_lower_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_unit_lower_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_unit_upper_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_upper_triangular_substitute_inplace.cl
-   matrix_row/align1/trans_vec_mul.cl
-   matrix_row/align1/unit_lower_triangular_substitute_inplace.cl
-   matrix_row/align1/unit_upper_triangular_substitute_inplace.cl
-   matrix_row/align1/upper_triangular_substitute_inplace.cl
-   matrix_row/align1/vec_mul.cl
-   matrix_row/align16/dummy)
-
-set(SCALAR_SRCS
-   scalar/align1/add.cl
-   scalar/align1/cpu_add.cl
-   scalar/align1/cpu_div.cl
-   scalar/align1/cpu_inplace_add.cl
-   scalar/align1/cpu_inplace_div.cl
-   scalar/align1/cpu_inplace_mul.cl
-   scalar/align1/cpu_inplace_sub.cl
-   scalar/align1/cpu_mul.cl
-   scalar/align1/cpu_sub.cl
-   scalar/align1/divide.cl
-   scalar/align1/inplace_add.cl
-   scalar/align1/inplace_div.cl
-   scalar/align1/inplace_mul.cl
-   scalar/align1/inplace_sub.cl
-   scalar/align1/mul.cl
-   scalar/align1/sub.cl)
-
-set(VECTOR_SRCS
-   vector/align16/add.cl
-   vector/align16/cpu_inplace_mul.cl
-   vector/align16/cpu_mult.cl
-   vector/align16/divide.cl
-   vector/align16/inplace_add.cl
-   vector/align16/inplace_divide.cl
-   vector/align16/inplace_mult.cl
-   vector/align16/inplace_sub.cl
-   vector/align16/mult.cl
-   vector/align16/sub.cl
-   vector/align1/add.cl
-   vector/align1/clear.cl
-   vector/align1/cpu_inplace_mul_add.cl
-   vector/align1/cpu_inplace_mult.cl
-   vector/align1/cpu_mul_add.cl
-   vector/align1/cpu_mult.cl
-   vector/align1/diag_precond.cl
-   vector/align1/divide.cl
-   vector/align1/index_norm_inf.cl
-   vector/align1/inner_prod.cl
-   vector/align1/inplace_add.cl
-   vector/align1/inplace_div_add.cl
-   vector/align1/inplace_divide.cl
-   vector/align1/inplace_div_sub.cl
-   vector/align1/inplace_mul_add.cl
-   vector/align1/inplace_mul_sub.cl
-   vector/align1/inplace_mult.cl
-   vector/align1/inplace_sub.cl
-   vector/align1/mul_add.cl
-   vector/align1/mul_sub.cl
-   vector/align1/mult.cl
-   vector/align1/norm_1.cl
-   vector/align1/norm_2.cl
-   vector/align1/norm_inf.cl
-   vector/align1/plane_rotation.cl
-   vector/align1/sqrt_sum.cl
-   vector/align1/sub.cl
-   vector/align1/sum.cl
-   vector/align1/swap.cl
-   vector/align1/vmax.cl
-   vector/align4/cpu_inplace_mul_add.cl
-   vector/align4/cpu_mul_add.cl
-   vector/align4/inplace_div_add.cl
-   vector/align4/inplace_div_sub.cl
-   vector/align4/inplace_mul_add.cl
-   vector/align4/inplace_mul_sub.cl
-   vector/align4/mul_add.cl)
-
-set(FFT_SRCS
-   fft/align1/bluestein_post.cl
-   fft/align1/bluestein_pre.cl
-   fft/align1/complex_to_real.cl
-   fft/align1/fft_div_vec_scalar.cl
-   fft/align1/fft_mult_vec.cl
-   fft/align1/real_to_complex.cl
-   fft/align1/reverse_inplace.cl
-   fft/align1/transpose.cl
-   fft/align1/transpose_inplace.cl
-   fft/align1/vandermonde_prod.cl
-   fft/align1/zero2.cl
-   )
-
-set(SPAI_SRCS
-   spai/align1/assemble_blocks.cl
-   spai/align1/block_bv_assembly.cl
-   spai/align1/block_least_squares.cl
-   spai/align1/block_q_mult.cl
-   spai/align1/block_qr.cl
-   spai/align1/block_qr_assembly.cl
-   spai/align1/block_qr_assembly_1.cl
-   spai/align1/block_r_assembly.cl
-   )
-
-set(CL_SRCS)
-foreach(f IN LISTS COMPRESSED_MATRIX_SRCS COORDINATE_MATRIX_SRCS
-      MATRIX_COL_SRCS MATRIX_ROW_SRCS SCALAR_SRCS VECTOR_SRCS FFT_SRCS SPAI_SRCS)
-   get_filename_component(d "${CMAKE_CURRENT_BINARY_DIR}/${f}" PATH)
-   file(MAKE_DIRECTORY "${d}")
-   configure_file(${f} "${CMAKE_CURRENT_BINARY_DIR}/${f}" COPYONLY)
-   list(APPEND CL_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${f}")
-endforeach()
-list(APPEND CL_SRCS ${MATRIX_PROD_SRCS} ${MATRIX_SOLVE_SRCS})
-
-add_executable(converter converter.cpp)
-target_link_libraries(converter ${Boost_LIBRARIES})
-
-set(KERNEL_HDRS)
-set(KERNEL_SRCS)
-foreach(d
-      compressed_matrix
-      coordinate_matrix
-      matrix_col
-      matrix_prod_col_col_col
-      matrix_prod_col_col_row
-      matrix_prod_col_row_col
-      matrix_prod_col_row_row
-      matrix_prod_row_col_col
-      matrix_prod_row_col_row
-      matrix_prod_row_row_col
-      matrix_prod_row_row_row
-      matrix_row
-      matrix_solve_col_col
-      matrix_solve_col_row
-      matrix_solve_row_col
-      matrix_solve_row_row
-      scalar
-      vector
-      fft
-      spai
-      )
-   set(f "${PROJECT_SOURCE_DIR}/viennacl/linalg/kernels/${d}")
-   list(APPEND KERNEL_HDRS "${f}_kernels.h")
-   list(APPEND KERNEL_SRCS "${f}_source.h")
-endforeach()
-
-file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/viennacl/linalg/kernels")
-
-add_custom_command(OUTPUT ${KERNEL_HDRS} ${KERNEL_SRCS}
-   COMMAND converter
-   DEPENDS ${CL_SRCS}
-   WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-   COMMENT "Generating kernel headers and sources"
-   VERBATIM)
-
-add_custom_target(kernels ALL
-   DEPENDS ${KERNEL_HDRS} ${KERNEL_SRCS})
diff --git a/auxiliary/README b/auxiliary/README
deleted file mode 100644
index f672a19..0000000
--- a/auxiliary/README
+++ /dev/null
@@ -1,17 +0,0 @@
-******************************************
-* Naming convention for kernel functions *
-******************************************
-
-The generic location of the raw .cl files in auxiliary-folder is is:
-[type]/align[alignment]/[kernelname].cl
-
-where 
-[type]       ... the ViennaCL class the kernel is defined for (see VCLKernels.h)
-[alignment]  ... Alignment of data for vectorization (1,2,4,8,16,...)
-[kernelname] ... name of the kernel as defined in (see VCLKernels.h)
-
-Note that kernels for double precision are generated automatically by a string replace of "float" -> "double"
-
-The converter program generates the static strings in viennacl/linalg/kernels/ from the cl-files in the header files
-
-Use the convert.sh script to inject all kernels into the ViennaCL source tree!
diff --git a/auxiliary/compressed_matrix/align1/bicgstab_kernel1.cl b/auxiliary/compressed_matrix/align1/bicgstab_kernel1.cl
deleted file mode 100644
index 342325d..0000000
--- a/auxiliary/compressed_matrix/align1/bicgstab_kernel1.cl
+++ /dev/null
@@ -1,54 +0,0 @@
-void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(0) < stride)
-      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];
-  }
-}
-
-//////// inner products:
-float bicgstab_kernel1_inner_prod(
-          __global const float * vec1,
-          __global const float * vec2,
-          unsigned int size,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);
-  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))
-  {
-    if (i < size)
-      tmp += vec1[i] * vec2[i];
-  }
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  return tmp_buffer[0];
-}
-
-
-__kernel void bicgstab_kernel1(
-          __global const float * tmp0,
-          __global const float * r0star, 
-          __global const float * residual,
-          __global float * s,
-          __global float * alpha,
-          __global const float * ip_rr0star,
-          __local float * tmp_buffer,
-          unsigned int size) 
-{ 
-  float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);
-  
-  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))
-    s[i] = residual[i] - alpha_local * tmp0[i];
-  
-  if (get_global_id(0) == 0)
-    alpha[0] = alpha_local;
-}
-
-
diff --git a/auxiliary/compressed_matrix/align1/bicgstab_kernel2.cl b/auxiliary/compressed_matrix/align1/bicgstab_kernel2.cl
deleted file mode 100644
index c6d2803..0000000
--- a/auxiliary/compressed_matrix/align1/bicgstab_kernel2.cl
+++ /dev/null
@@ -1,81 +0,0 @@
-void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(0) < stride)
-      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];
-  }
-}
-
-//////// inner products:
-float bicgstab_kernel2_inner_prod(
-          __global const float * vec1,
-          __global const float * vec2,
-          unsigned int size,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);
-  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))
-  {
-    if (i < size)
-      tmp += vec1[i] * vec2[i];
-  }
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  return tmp_buffer[0];
-}
-
-
-__kernel void bicgstab_kernel2(
-          __global const float * tmp0,
-          __global const float * tmp1,
-          __global const float * r0star, 
-          __global const float * s, 
-          __global float * p, 
-          __global float * result,
-          __global float * residual,
-          __global const float * alpha,
-          __global float * ip_rr0star,
-          __global float * error_estimate,
-          __local float * tmp_buffer,
-          unsigned int size) 
-{ 
-  float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);
-  float alpha_local = alpha[0];
-  
-  //result += alpha * p + omega * s;
-  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))
-    result[i] += alpha_local * p[i] + omega_local * s[i];
-
-  //residual = s - omega * tmp1;
-  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))
-    residual[i] = s[i] - omega_local * tmp1[i];
-
-  //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
-  float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);
-  float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);
-  
-  //p = residual + beta * (p - omega*tmp0);
-  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))
-    p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);
-
-  //compute norm of residual:
-  float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
-
-  //update values:
-  if (get_global_id(0) == 0)
-  {
-    error_estimate[0] = new_error_estimate;
-    ip_rr0star[0] = new_ip_rr0star;
-  }
-}
-
-
diff --git a/auxiliary/compressed_matrix/align1/jacobi.cl b/auxiliary/compressed_matrix/align1/jacobi.cl
deleted file mode 100644
index 6623ceb..0000000
--- a/auxiliary/compressed_matrix/align1/jacobi.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-__kernel void jacobi(
- __global const unsigned int * row_indices,
- __global const unsigned int * column_indices,
- __global const float * elements,
- float weight,
- __global const float * old_result,
- __global float * new_result,
- __global const float * rhs,
- unsigned int size)
- {
-  float sum, diag=1;
-  int col;
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-  {
-    sum = 0;
-    for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++)
-    {
-      col = column_indices[j];
-      if (i == col)
-	diag = elements[j];
-      else 
-	sum += elements[j] * old_result[col]; 
-    } 
-      new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; 
-   } 
- } 
diff --git a/auxiliary/compressed_matrix/align1/jacobi_precond.cl b/auxiliary/compressed_matrix/align1/jacobi_precond.cl
deleted file mode 100644
index c668f8e..0000000
--- a/auxiliary/compressed_matrix/align1/jacobi_precond.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-__kernel void jacobi_precond(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __global float * diag_M_inv,
-          unsigned int size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    float diag = 1.0f;
-    unsigned int row_end = row_indices[row+1];
-    for (unsigned int i = row_indices[row]; i < row_end; ++i)
-    {
-      if (row == column_indices[i])
-      {
-        diag = elements[i];
-        break;
-      }
-    }
-    diag_M_inv[row] = 1.0f / diag;
-  }
-}
-
-
diff --git a/auxiliary/compressed_matrix/align1/lu_backward.cl b/auxiliary/compressed_matrix/align1/lu_backward.cl
deleted file mode 100644
index 4844585..0000000
--- a/auxiliary/compressed_matrix/align1/lu_backward.cl
+++ /dev/null
@@ -1,115 +0,0 @@
-
-
-// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
-__kernel void lu_backward(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __local  int * buffer,                              
-          __local  float * vec_entries,   //a memory block from vector
-          __global float * vector,
-          unsigned int size) 
-{
-  int waiting_for; //block index that must be finished before the current thread can start
-  unsigned int waiting_for_index;
-  unsigned int block_offset;
-  unsigned int col;
-  unsigned int row;
-  unsigned int row_index_end;
-  float diagonal_entry = 42;
-  
-  //forward substitution: one thread per row in blocks of get_global_size(0)
-  for (int block_num = size / get_global_size(0); block_num > -1; --block_num)
-  {
-    block_offset = block_num * get_global_size(0);
-    row = block_offset + get_global_id(0);
-    buffer[get_global_id(0)] = 0; //set flag to 'undone'
-    waiting_for = -1;
-    
-    if (row < size)
-    {
-      vec_entries[get_global_id(0)] = vector[row];
-      waiting_for_index = row_indices[row];
-      row_index_end = row_indices[row+1];
-      diagonal_entry = column_indices[waiting_for_index];
-    }
-    
-    if (get_global_id(0) == 0)
-       buffer[get_global_size(0)] = 1;
-
-
-    //try to eliminate all lines in the block. 
-    //in worst case scenarios, in each step only one line can be substituted, thus loop
-    for (unsigned int k = 0; k<get_global_size(0); ++k)
-    {
-      barrier(CLK_LOCAL_MEM_FENCE);
-      if (row < size) //valid index?
-      {
-        if (waiting_for >= 0)
-        {
-          if (buffer[waiting_for] == 1)
-            waiting_for = -1;
-        }
-        
-        if (waiting_for == -1) //substitution not yet done, check whether possible
-        {
-          //check whether reduction is possible:
-          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)
-          {
-            col = column_indices[j];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            if (col >= block_offset + get_global_size(0))  //index valid, but not from current block
-              vec_entries[get_global_id(0)] -= elements[j] * vector[col];
-            else if (col > row)  //index is from current block
-            {
-              if (buffer[col - block_offset] == 0) //entry is not yet calculated
-              {
-                waiting_for = col - block_offset;
-                waiting_for_index = j;
-                break;
-              }
-              else  //updated entry is available in shared memory:
-                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];
-            }
-            else if (col == row)
-              diagonal_entry = elements[j];
-          }
-          
-          if (waiting_for == -1)  //this row is done
-          {
-            if (row == 0)
-              vec_entries[get_global_id(0)] /= elements[0];
-            else
-              vec_entries[get_global_id(0)] /= diagonal_entry;
-            buffer[get_global_id(0)] = 1;
-            waiting_for = -2; //magic number: thread is finished
-          }
-        } 
-      } //row < size
-      else
-        buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)
-      
-      ///////// check whether all threads are done. If yes, exit loop /////////////
-      if (buffer[get_global_id(0)] == 0)
-        buffer[get_global_size(0)] = 0;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      
-      if (buffer[get_global_size(0)] > 0)  //all threads break the loop simultaneously
-        break;
-
-      if (get_global_id(0) == 0)
-        buffer[get_global_size(0)] = 1;
-    } //for k
-
-    if (row < size)
-      vector[row] = vec_entries[get_global_id(0)];
-      //vector[row] = diagonal_entry;
-    
-    //if (row == 0)
-      //vector[0] = diagonal_entry;
-      //vector[0] = elements[0];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  } //for block_num
-}
-
diff --git a/auxiliary/compressed_matrix/align1/lu_forward.cl b/auxiliary/compressed_matrix/align1/lu_forward.cl
deleted file mode 100644
index 1387cc9..0000000
--- a/auxiliary/compressed_matrix/align1/lu_forward.cl
+++ /dev/null
@@ -1,107 +0,0 @@
-
-
-
- 
-// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
-__kernel void lu_forward(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __local  int * buffer,                              
-          __local  float * vec_entries,   //a memory block from vector
-          __global float * vector,
-          unsigned int size) 
-{
-  int waiting_for; //block index that must be finished before the current thread can start
-  unsigned int waiting_for_index;
-  int block_offset;
-  unsigned int col;
-  unsigned int row;
-  unsigned int row_index_end;
-  
-  //backward substitution: one thread per row in blocks of get_global_size(0)
-  for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)
-  {
-    block_offset = block_num * get_global_size(0);
-    row = block_offset + get_global_id(0);
-    buffer[get_global_id(0)] = 0; //set flag to 'undone'
-    waiting_for = -1;
-
-    if (row < size)
-    {
-      vec_entries[get_global_id(0)] = vector[row];
-      waiting_for_index = row_indices[row];
-      row_index_end = row_indices[row+1];
-    }
-    
-    if (get_global_id(0) == 0)
-      buffer[get_global_size(0)] = 1;
-
-
-    //try to eliminate all lines in the block. 
-    //in worst case scenarios, in each step only one line can be substituted, thus loop
-    for (unsigned int k = 0; k<get_global_size(0); ++k)
-    {
-      barrier(CLK_LOCAL_MEM_FENCE);
-      if (row < size) //valid index?
-      {
-        if (waiting_for >= 0)
-        {
-          if (buffer[waiting_for] == 1)
-            waiting_for = -1;
-        }
-        
-        if (waiting_for == -1) //substitution not yet done, check whether possible
-        {
-          //check whether reduction is possible:
-          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)
-          {
-            col = column_indices[j];
-            if (col < block_offset) //index valid, but not from current block
-              vec_entries[get_global_id(0)] -= elements[j] * vector[col];
-            else if (col < row)  //index is from current block
-            {
-              if (buffer[col - block_offset] == 0) //entry is not yet calculated
-              {
-                waiting_for = col - block_offset;
-                waiting_for_index = j;
-                break;
-              }
-              else  //updated entry is available in shared memory:
-                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];
-            }
-          }
-          
-          if (waiting_for == -1)  //this row is done
-          {
-            buffer[get_global_id(0)] = 1;
-            waiting_for = -2; //magic number: thread is finished
-          }
-        } 
-      } //row < size
-      else
-        buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)
-
-      ///////// check whether all threads are done. If yes, exit loop /////////////
-      
-      if (buffer[get_global_id(0)] == 0)
-        buffer[get_global_size(0)] = 0;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      
-      if (buffer[get_global_size(0)] > 0)  //all threads break this loop simultaneously
-        break;
-
-      if (get_global_id(0) == 0)
-        buffer[get_global_size(0)] = 1;
-
-    } //for k
-    
-    //write to vector:
-    if (row < size)
-      vector[row] = vec_entries[get_global_id(0)];
-    
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  } //for block_num
-}
-
-
diff --git a/auxiliary/compressed_matrix/align1/row_scaling_1.cl b/auxiliary/compressed_matrix/align1/row_scaling_1.cl
deleted file mode 100644
index 8de3bca..0000000
--- a/auxiliary/compressed_matrix/align1/row_scaling_1.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-__kernel void row_scaling_1(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __global float * diag_M_inv,
-          unsigned int size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    unsigned int row_end = row_indices[row+1];
-    for (unsigned int i = row_indices[row]; i < row_end; ++i)
-      dot_prod += fabs(elements[i]);
-    diag_M_inv[row] = 1.0f / dot_prod;
-  }
-}
-
-
diff --git a/auxiliary/compressed_matrix/align1/row_scaling_2.cl b/auxiliary/compressed_matrix/align1/row_scaling_2.cl
deleted file mode 100644
index a92db4a..0000000
--- a/auxiliary/compressed_matrix/align1/row_scaling_2.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-__kernel void row_scaling_2(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __global float * diag_M_inv,
-          unsigned int size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    float temp = 0.0f;
-    unsigned int row_end = row_indices[row+1];
-    for (unsigned int i = row_indices[row]; i < row_end; ++i)
-    {
-      temp = elements[i];
-      dot_prod += temp * temp;
-    }
-    diag_M_inv[row] = 1.0f / sqrt(dot_prod);
-  }
-}
-
-
diff --git a/auxiliary/compressed_matrix/align1/vec_mul.cl b/auxiliary/compressed_matrix/align1/vec_mul.cl
deleted file mode 100644
index e4b3408..0000000
--- a/auxiliary/compressed_matrix/align1/vec_mul.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-__kernel void vec_mul(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    unsigned int row_end = row_indices[row+1];
-    for (unsigned int i = row_indices[row]; i < row_end; ++i)
-      dot_prod += elements[i] * vector[column_indices[i]];
-    result[row] = dot_prod;
-  }
-}
-
-
diff --git a/auxiliary/compressed_matrix/align4/vec_mul.cl b/auxiliary/compressed_matrix/align4/vec_mul.cl
deleted file mode 100644
index 0e0eae7..0000000
--- a/auxiliary/compressed_matrix/align4/vec_mul.cl
+++ /dev/null
@@ -1,37 +0,0 @@
-
-
-__kernel void vec_mul(
-          __global const unsigned int * row_indices,
-          __global const uint4 * column_indices, 
-          __global const float4 * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{ 
-  float dot_prod;
-  unsigned int start, next_stop;
-  uint4 col_idx;
-  float4 tmp_vec;
-  float4 tmp_entries;
-
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    dot_prod = 0.0f;
-    start = row_indices[row] / 4;
-    next_stop = row_indices[row+1] / 4;
-
-    for (unsigned int i = start; i < next_stop; ++i)
-    {
-      col_idx = column_indices[i];
-
-      tmp_entries = elements[i];
-      tmp_vec.x = vector[col_idx.x];
-      tmp_vec.y = vector[col_idx.y];
-      tmp_vec.z = vector[col_idx.z];
-      tmp_vec.w = vector[col_idx.w];
-
-      dot_prod += dot(tmp_entries, tmp_vec);
-    }
-    result[row] = dot_prod;
-  }
-}
diff --git a/auxiliary/compressed_matrix/align8/vec_mul.cl b/auxiliary/compressed_matrix/align8/vec_mul.cl
deleted file mode 100644
index 9fa35b4..0000000
--- a/auxiliary/compressed_matrix/align8/vec_mul.cl
+++ /dev/null
@@ -1,42 +0,0 @@
-
-
-__kernel void vec_mul(
-          __global const unsigned int * row_indices,
-          __global const uint8 * column_indices, 
-          __global const float8 * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{ 
-  float dot_prod;
-  unsigned int start, next_stop;
-  uint8 col_idx;
-  float8 tmp_vec;
-  float8 tmp_entries;
-
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    dot_prod = 0.0f;
-    start = row_indices[row] / 8;
-    next_stop = row_indices[row+1] / 8;
-
-    for (unsigned int i = start; i < next_stop; ++i)
-    {
-      col_idx = column_indices[i];
-
-      tmp_entries = elements[i];
-      tmp_vec.s0 = vector[col_idx.s0];
-      tmp_vec.s1 = vector[col_idx.s1];
-      tmp_vec.s2 = vector[col_idx.s2];
-      tmp_vec.s3 = vector[col_idx.s3];
-      tmp_vec.s4 = vector[col_idx.s4];
-      tmp_vec.s5 = vector[col_idx.s5];
-      tmp_vec.s6 = vector[col_idx.s6];
-      tmp_vec.s7 = vector[col_idx.s7];
-
-      dot_prod += dot(tmp_entries.lo, tmp_vec.lo);
-      dot_prod += dot(tmp_entries.hi, tmp_vec.hi);
-    }
-    result[row] = dot_prod;
-  }
-}
diff --git a/auxiliary/compressed_matrix/matrix.old_cl b/auxiliary/compressed_matrix/matrix.old_cl
deleted file mode 100644
index d8d2f4b..0000000
--- a/auxiliary/compressed_matrix/matrix.old_cl
+++ /dev/null
@@ -1,226 +0,0 @@
-//helper:
-void helper_float_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-};
-
-
-/////////////////////////// MATRIX OPERATIONS ///////////////////////////////
-
-
-__kernel void float_packed_sparse_matrix_vector_mul_align1(
-          unsigned int elements_per_row,
-          __global const unsigned int * row_element_count,
-          __global const uint * column_indices, 
-          __global const float * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{
-  unsigned int row = get_global_id(0);
-  __global const float * cur_row_elements = elements + row*elements_per_row;
-  __global const uint * cur_row_column_indices = column_indices + row*elements_per_row;
-  for (; row < size; row += get_global_size(0))
-  {
-    cur_row_elements = elements + row*elements_per_row;
-    cur_row_column_indices = column_indices + row*elements_per_row;
-
-    float dot_prod = 0.0f;
-    for (unsigned int i = 0; i < row_element_count[row]; ++i)
-      dot_prod += cur_row_elements[i] * vector[cur_row_column_indices[i]];
-    result[row] = dot_prod;
-  }
-};
-
-//
-//__kernel void float_packed_sparse_matrix_vector_mul_align4(
-//          unsigned int elements_per_row,
-//          __global const unsigned int * row_element_count,
-//          __global const uint * column_indices, 
-//          __global const float * elements,
-//          __global const float * vector,  
-//          __global float * result,
-//          unsigned int size)
-//{
-//  unsigned int row = get_global_id(0);
-//  __global const float * cur_row_elements = elements + row*elements_per_row*4;
-//  __global const uint * cur_row_column_indices = column_indices + row*elements_per_row*4;
-//  for (; row < size; row += get_global_size(0))
-//  {
-//    cur_row_elements = elements + row*elements_per_row*4;
-//    cur_row_column_indices = column_indices + row*elements_per_row*4;
-//
-//    float dot_prod = 0.0f;
-//    for (unsigned int i = 0; i < row_element_count[row]*4; ++i)
-//      dot_prod += cur_row_elements[i] * vector[cur_row_column_indices[i]];
-//    result[row] = dot_prod;
-//  }
-//};
-
-
-__kernel void float_packed_sparse_matrix_vector_mul_align4(
-          unsigned int elements_per_row,
-          __global const unsigned int * row_element_count,
-          __global const uint4 * column_indices, 
-          __global const float4 * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{ 
-  unsigned int row = get_global_id(0);
-  __global const float4 * cur_row_elements = elements + row*elements_per_row;
-  __global const uint4 * cur_row_column_indices = column_indices + row*elements_per_row;
-  for (; row < size; row += get_global_size(0))
-  {
-    cur_row_elements = elements + row*elements_per_row;
-    cur_row_column_indices = column_indices + row*elements_per_row;
-
-    float dot_prod = 0.0f;
-    for (unsigned int i = 0; i < row_element_count[row]; ++i)
-    {
-      float4 tmp0;
-      float4 tmp1 = cur_row_elements[i];
-      uint4 ind0 = cur_row_column_indices[i];
-
-      tmp0.x = vector[ind0.x];
-      tmp0.y = vector[ind0.y];
-      tmp0.z = vector[ind0.z];
-      tmp0.w = vector[ind0.w];
-      
-      dot_prod += dot(tmp0, tmp1);
-    }
-    result[row] = dot_prod;
-  }
-};
-
-__kernel void float_packed_sparse_matrix_vector_mul_align8(
-          unsigned int elements_per_row,
-          __global const unsigned int * row_element_count,
-          __global const uint8 * column_indices, 
-          __global const float8 * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{ 
-  unsigned int row = get_global_id(0);
-  __global const float8 * cur_row_elements = elements + row*elements_per_row/8;
-  __global const uint8 * cur_row_column_indices = column_indices + row*elements_per_row/8;
-  for (; row < size; row += get_global_size(0))
-  {
-    cur_row_elements = elements + row*elements_per_row/8;
-    cur_row_column_indices = column_indices + row*elements_per_row/8;
-
-    float dot_prod = 0.0f;
-    for (unsigned int i = 0; i < row_element_count[row]/8; ++i)
-    {
-      float8 tmp0;
-      float8 tmp1 = cur_row_elements[i];
-      uint8 ind0 = cur_row_column_indices[i];
-
-      tmp0.s0 = vector[ind0.s0];
-      tmp0.s1 = vector[ind0.s1];
-      tmp0.s2 = vector[ind0.s2];
-      tmp0.s3 = vector[ind0.s3];
-      tmp0.s4 = vector[ind0.s4];
-      tmp0.s5 = vector[ind0.s5];
-      tmp0.s6 = vector[ind0.s6];
-      tmp0.s7 = vector[ind0.s7];
-
-      dot_prod += dot(tmp0.lo, tmp1.lo);
-      dot_prod += dot(tmp0.hi, tmp1.hi);
-
-      /*float4 tmp0;
-      float4 tmp1 = cur_row_elements[i];
-      float4 tmp2;
-      float4 tmp3 = cur_row_elements[i+1];
-      uint4 ind0 = cur_row_column_indices[i];
-      uint4 ind2 = cur_row_column_indices[i+1];
-
-      tmp0.x = vector[ind0.x];
-      tmp0.y = vector[ind0.y];
-      tmp0.z = vector[ind0.z];
-      tmp0.w = vector[ind0.w];
-
-      tmp2.x = vector[ind2.x];
-      tmp2.y = vector[ind2.y];
-      tmp2.z = vector[ind2.z];
-      tmp2.w = vector[ind2.w];
-      
-      dot_prod += dot(tmp0, tmp1);
-      dot_prod += dot(tmp2, tmp3);*/
-    }
-    result[row] = dot_prod;
-  }
-};
-//
-
-//
-//__kernel void float_sparse_matrix_vector_packed_mul(
-//          __global const unsigned int * row_indices,
-//          __global const unsigned int * column_indices, 
-//          __global const float * elements,
-//          __global const float * vector,  
-//          __global float * result,
-//          unsigned int size) 
-//{ 
-//  __global const float4 * vector_f4 = elements;
-//  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-//  {
-//    float dot_prod = 0.0f;
-//    unsigned int start = row_indices[row];
-//    unsigned int end = row_indices[row+1];
-//    
-//    for (unsigned int i = start/4; i < end/4; i+=2)
-//    {
-//      float4 tmp0;
-//      float4 tmp1 = vector_f4[i];
-//      float4 tmp2;
-//      float4 tmp3 = vector_f4[i+1];
-//      
-//      tmp0.x = vector[column_indices[8*i+0]];
-//      tmp0.y = vector[column_indices[8*i+1]];
-//      tmp0.z = vector[column_indices[8*i+2]];
-//      tmp0.w = vector[column_indices[8*i+3]];
-//      
-//      tmp2.x = vector[column_indices[8*i+4]];
-//      tmp2.y = vector[column_indices[8*i+5]];
-//      tmp2.z = vector[column_indices[8*i+6]];
-//      tmp2.w = vector[column_indices[8*i+7]];
-//      
-//      dot_prod += dot(tmp0, tmp1);
-//      dot_prod += dot(tmp2, tmp3);
-//    }
-//    
-//    result[row] = dot_prod;
-//  }
-//};
-//
-//float float_vector_inner_prod_impl(
-//          __global const float * vec1,
-//          __global const float * vec2,
-//          unsigned int size,
-//          __local float * tmp_buffer)
-//{
-//  float tmp = 0;
-//  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-//    tmp += vec1[i]*vec2[i];
-//  tmp_buffer[get_global_id(0)] = tmp;
-//  
-//  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-//  {
-//    barrier(CLK_LOCAL_MEM_FENCE);
-//    if (get_global_id(0) < stride)
-//      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-//  }
-//  
-//  return tmp_buffer[0];
-//}
-
-
-
-
diff --git a/auxiliary/converter.cpp b/auxiliary/converter.cpp
deleted file mode 100644
index d858822..0000000
--- a/auxiliary/converter.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
-* Converts OpenCL sources to header file string constants
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-
-#define BOOST_FILESYSTEM_VERSION 2
-
-#include <boost/filesystem/operations.hpp>
-#include <boost/filesystem/path.hpp>
-#include <iostream>
-
-namespace fs = boost::filesystem;
-
-void writeSourceFile(std::ofstream & out_file, std::string & filename, const char * dirname, const char * alignment)
-{
-    std::string fullpath(dirname);
-    fullpath += "/";
-    fullpath += alignment;
-    fullpath += "/";
-    fullpath += filename;
-    std::ifstream in_file(fullpath.c_str());
-    std::string tmp;
-
-    if (in_file.is_open())
-    {
-        //write variable declaration:
-        out_file << "const char * const " << dirname << "_" << alignment << "_" << filename.substr(0, filename.size()-3) << " = " << std::endl;
-    
-        //write source string:
-        while (getline(in_file, tmp, '\n'))
-        {
-            if (tmp.size() > 0)
-            {
-      	        //out_file << "\"" << tmp.replace(tmp.end()-1, tmp.end(), "\\n\"") << std::endl;
-                if ( *(tmp.end()-1) == '\r')  //Windows line delimiter, \r\n
-                    out_file << "\"" << tmp.replace(tmp.end()-1, tmp.end(), "\\n\"") << std::endl;
-                else //Unix line delimiter \n
-                    out_file << "\"" << tmp.append("\\n\"") << std::endl;
-            }
-        }
-        out_file << "; //" << dirname << "_" << alignment << "_" << filename.substr(0, filename.size()-3)  << std::endl << std::endl;
-        
-    }
-    else
-        std::cerr << "Failed to open file " << filename << std::endl;
-}
-
-void createSourceFile(const char * dirname)
-{
-    //Step 1: Open source file
-    std::string header_name(dirname);
-    std::ofstream source_file(("../../viennacl/linalg/kernels/" + header_name + "_source.h").c_str());
-
-    //Step 2: Write source header file preamble
-    std::string dirname_uppercase(dirname);
-    std::transform(dirname_uppercase.begin(), dirname_uppercase.end(), dirname_uppercase.begin(), toupper);
-    source_file << "#ifndef VIENNACL_LINALG_KERNELS_" << dirname_uppercase << "_SOURCE_HPP_" << std::endl;
-    source_file << "#define VIENNACL_LINALG_KERNELS_" << dirname_uppercase << "_SOURCE_HPP_" << std::endl;
-    source_file << "//Automatically generated file from auxiliary-directory, do not edit manually!" << std::endl;
-    source_file << "namespace viennacl" << std::endl;
-    source_file << "{" << std::endl;
-    source_file << " namespace linalg" << std::endl;
-    source_file << " {" << std::endl;
-    source_file << "  namespace kernels" << std::endl;
-    source_file << "  {" << std::endl;
-
-    //Step 3: Write all OpenCL kernel sources into header file
-    fs::path filepath = fs::system_complete( fs::path( dirname ) );
-    if ( fs::is_directory( filepath ) )
-    {
-        //std::cout << "\n In directory " << filepath.directory_string() << std::endl;
-
-        fs::directory_iterator end_iter;
-        //write and register single precision sources:
-        for ( fs::directory_iterator alignment_itr( filepath );
-              alignment_itr != end_iter;
-              ++alignment_itr )
-        {
-            if (fs::is_directory( alignment_itr->path() ))
-            {
-                std::cout << "\nGenerating kernels from directory " << alignment_itr->path().directory_string() << std::endl;
-
-                //write and register single precision sources:
-                for ( fs::directory_iterator cl_itr( alignment_itr->path() );
-                      cl_itr != end_iter;
-                      ++cl_itr )
-                {
-                    std::string fname = cl_itr->path().filename();
-                    std::string alignment = alignment_itr->path().filename();
-
-                    size_t pos = fname.find(".cl");
-                    if ( pos == std::string::npos )
-                      continue;
-
-                    if (fname.substr(fname.size()-3, 3) == ".cl")
-                        writeSourceFile(source_file, fname, dirname, alignment.c_str());
-                        //std::cout << alignment_itr->path().filename() << "/" << fname << std::endl;
-                } //for                
-            } //if is_directory
-        } //for alignment_iterator
-    } //if is_directory
-    else
-        std::cerr << "Cannot access directory " << dirname << std::endl;
-
-    //Final Step: Write file tail:
-    source_file << "  }  //namespace kernels" << std::endl;
-    source_file << " }  //namespace linalg" << std::endl;
-    source_file << "}  //namespace viennacl" << std::endl;
-    source_file << "#endif" << std::endl;
-    source_file.close();
-}
-
-
-unsigned int getBestKernel(const char * dirname, std::string & kernel_name, unsigned int alignment)
-{
-    unsigned int search_alignment = alignment;
-    //std::cout << "Searching for best match for " << kernel_name << " with alignment " << alignment << std::endl;
-
-    while (search_alignment > 1)
-    {
-        std::ostringstream oss;
-        oss << dirname << "/align" << search_alignment;
-        //std::cout << "Searching " << oss.str() << std::endl;
-
-        //try to find kernel in directory:
-        fs::path filepath = fs::system_complete( fs::path( oss.str() ) );
-        if ( fs::is_directory( filepath ) ) //directory exists?
-        {
-            fs::directory_iterator end_iter;
-            for ( fs::directory_iterator cl_itr( filepath );
-                  cl_itr != end_iter;
-                  ++cl_itr )
-            {
-                std::string fname = cl_itr->path().filename();
-                if (fname == kernel_name)
-                {
-                  //std::cout << "Found matching kernel for " << kernel_name << " with alignment " << alignment << " at alignment " << search_alignment << std::endl;
-                    return search_alignment;
-                }
-            }
-        }
-
-        search_alignment /= 2;
-    }
-
-    //std::cout << "Found alignment 1 only..." << std::endl;
-    //nothing found: return alignment 1:
-    return 1;
-}
-
-
-void writeKernelInit(std::ostream & kernel_file, const char * dirname, std::string & subfolder, bool is_float)
-{
-    //extract alignment information from subfolder string:
-    std::istringstream stream(subfolder.substr(5, subfolder.size()-5));
-    unsigned int alignment = 0;
-    stream >> alignment;
-    if (alignment == 0)
-        std::cerr << "ERROR: Could not extract alignment from " << subfolder << std::endl;
-
-    kernel_file << "   template <>" << std::endl;
-    kernel_file << "   struct " << dirname;
-    if (is_float)
-        kernel_file << "<float, ";
-    else
-        kernel_file << "<double, ";
-    kernel_file << alignment << ">" << std::endl;
-    kernel_file << "   {" << std::endl;
-
-    kernel_file << "    static std::string program_name()" << std::endl;
-    kernel_file << "    {" << std::endl;
-    kernel_file << "      return \"";
-    if (is_float)
-        kernel_file << "f";
-    else
-        kernel_file << "d";
-    kernel_file << "_" << dirname << "_" << alignment << "\";" << std::endl;
-    kernel_file << "    }" << std::endl;
-    
-    kernel_file << "    static void init()" << std::endl;
-    kernel_file << "    {" << std::endl;
-    if (is_float)
-      kernel_file << "      viennacl::ocl::DOUBLE_PRECISION_CHECKER<float>::apply();" << std::endl;
-    else
-      kernel_file << "      viennacl::ocl::DOUBLE_PRECISION_CHECKER<double>::apply();" << std::endl;
-    kernel_file << "      static std::map<cl_context, bool> init_done;" << std::endl;
-    kernel_file << "      viennacl::ocl::context & context_ = viennacl::ocl::current_context();" << std::endl;
-    kernel_file << "      if (!init_done[context_.handle().get()])" << std::endl;
-    kernel_file << "      {" << std::endl;
-    kernel_file << "        std::string source;" << std::endl;
-    if (!is_float)
-      kernel_file << "        std::string fp64_ext = viennacl::ocl::current_device().double_support_extension();" << std::endl;
-
-    //iterate over all kernels in align1-folder:
-    std::string current_dir(dirname);
-    current_dir += "/align1";
-    fs::path filepath = fs::system_complete( fs::path( current_dir ) );
-
-    fs::directory_iterator end_iter;
-    //write and register single precision sources:
-    for ( fs::directory_iterator cl_itr( filepath );
-          cl_itr != end_iter;
-          ++cl_itr )
-    {
-        std::string fname = cl_itr->path().filename();
-        size_t pos = fname.find(".cl");
-        if ( pos == std::string::npos )
-          continue;
-
-        if (fname.substr(fname.size()-3, 3) == ".cl")
-        {
-            //add kernel source to program string:
-            kernel_file << "        source.append(";
-            if (!is_float)
-                kernel_file << "viennacl::tools::make_double_kernel(";
-            kernel_file << dirname << "_align" << getBestKernel(dirname, fname, alignment) << "_" << fname.substr(0, fname.size()-3);
-            if (!is_float)
-                kernel_file << ", fp64_ext)";
-            kernel_file << ");" << std::endl;
-        }
-    } //for                
-    
-    kernel_file << "        std::string prog_name = program_name();" << std::endl;
-    kernel_file << "        #ifdef VIENNACL_BUILD_INFO" << std::endl;
-    kernel_file << "        std::cout << \"Creating program \" << prog_name << std::endl;" << std::endl;
-    kernel_file << "        #endif" << std::endl;
-    kernel_file << "        context_.add_program(source, prog_name);" << std::endl;
-    kernel_file << "        viennacl::ocl::program & prog_ = context_.get_program(prog_name);" << std::endl;
-    
-    //write and register single precision sources:
-    for ( fs::directory_iterator cl_itr( filepath );
-          cl_itr != end_iter;
-          ++cl_itr )
-    {
-        std::string fname = cl_itr->path().filename();
-        size_t pos = fname.find(".cl");
-        if ( pos == std::string::npos )
-          continue;
-
-        if (fname.substr(fname.size()-3, 3) == ".cl")
-        {
-            //initialize kernel:
-            kernel_file << "        prog_.add_kernel(\"" << fname.substr(0, fname.size()-3) << "\");" << std::endl;
-        }
-    } //for                
-    
-    kernel_file << "        init_done[context_.handle().get()] = true;" << std::endl;
-    kernel_file << "       } //if" << std::endl;
-    kernel_file << "     } //init" << std::endl;
-    kernel_file << "    }; // struct" << std::endl << std::endl;
-}
-
-
-
-
-void createKernelFile(const char * dirname)
-{
-    //Step 1: Open kernel file
-    std::string header_name(dirname);
-    std::ofstream kernel_file(("../../viennacl/linalg/kernels/" + header_name + "_kernels.h").c_str());
-
-    //Step 2: Write kernel header file preamble
-    std::string dirname_uppercase(dirname);
-    std::transform(dirname_uppercase.begin(), dirname_uppercase.end(), dirname_uppercase.begin(), toupper);
-    kernel_file << "#ifndef _VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl;
-    kernel_file << "#define _VIENNACL_" << dirname_uppercase << "_KERNELS_HPP_" << std::endl;
-    kernel_file << "#include \"viennacl/tools/tools.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/ocl/kernel.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/ocl/platform.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/ocl/utils.hpp\"" << std::endl;
-    kernel_file << "#include \"viennacl/linalg/kernels/" << dirname << "_source.h\"" << std::endl;
-    kernel_file << std::endl;
-    kernel_file << "//Automatically generated file from aux-directory, do not edit manually!" << std::endl;
-    kernel_file << "namespace viennacl" << std::endl;
-    kernel_file << "{" << std::endl;
-    kernel_file << " namespace linalg" << std::endl;
-    kernel_file << " {" << std::endl;
-    kernel_file << "  namespace kernels" << std::endl;
-    kernel_file << "  {" << std::endl;
-
-    //Step 3: Write class information:
-    kernel_file << "   template<class TYPE, unsigned int alignment>" << std::endl;
-    kernel_file << "   struct " << dirname << ";" << std::endl << std::endl;
-    
-    //Step 4: Write single precision kernels
-    std::string dir(dirname);
-    kernel_file << std::endl << "    /////////////// single precision kernels //////////////// " << std::endl;
-    fs::path filepath = fs::system_complete( fs::path( dir ) );
-    if ( fs::is_directory( filepath ) )
-    {
-        //std::cout << "\nIn directory: " << filepath.directory_string() << std::endl;
-
-        fs::directory_iterator end_iter;
-        //write and register single precision sources:
-        for ( fs::directory_iterator alignment_itr( filepath );
-              alignment_itr != end_iter;
-              ++alignment_itr )
-        {
-            if (fs::is_directory( alignment_itr->path() ))
-            {
-                std::string subfolder = alignment_itr->path().filename();
-                if( subfolder.find("align") == std::string::npos )
-                  continue;
-                writeKernelInit(kernel_file, dirname, subfolder, true);
-            } //if is_directory
-        } //for alignment_iterator
-        kernel_file << std::endl;
-    } //if is_directory
-    else
-        std::cerr << "Cannot access directory " << dirname << std::endl;
-
-    //Step 5: Write double precision kernels
-    kernel_file << std::endl << "    /////////////// double precision kernels //////////////// " << std::endl;
-    filepath = fs::system_complete( fs::path( dir ) );
-    if ( fs::is_directory( filepath ) )
-    {
-        //std::cout << "\nIn directory: " << filepath.directory_string() << std::endl;
-
-        fs::directory_iterator end_iter;
-        //write and register single precision sources:
-        for ( fs::directory_iterator alignment_itr( filepath );
-              alignment_itr != end_iter;
-              ++alignment_itr )
-        {
-            if (fs::is_directory( alignment_itr->path() ))
-            {
-                std::string subfolder = alignment_itr->path().filename();
-                if( subfolder.find("align") == std::string::npos )
-                  continue;
-                writeKernelInit(kernel_file, dirname, subfolder, false);
-            } //if is_directory
-        } //for alignment_iterator
-        kernel_file << std::endl;
-    } //if is_directory
-    else
-        std::cerr << "Cannot access directory " << dirname << std::endl;
-
-    //Final Step: Write file tail:
-    kernel_file << "  }  //namespace kernels" << std::endl;
-    kernel_file << " }  //namespace linalg" << std::endl;
-    kernel_file << "}  //namespace viennacl" << std::endl;
-    kernel_file << "#endif" << std::endl;
-    kernel_file.close();
-}
-
-void createHeaders(const char * dirname)
-{
-    createKernelFile(dirname);
-    createSourceFile(dirname);
-}
-
-int main(int args, char * argsv[])
-{
-    createHeaders("compressed_matrix");
-    createHeaders("coordinate_matrix");
-    createHeaders("matrix_row");
-    createHeaders("matrix_col");
-    createHeaders("matrix_prod_row_row_row");
-    createHeaders("matrix_prod_row_row_col");
-    createHeaders("matrix_prod_row_col_row");
-    createHeaders("matrix_prod_row_col_col");
-    createHeaders("matrix_prod_col_row_row");
-    createHeaders("matrix_prod_col_row_col");
-    createHeaders("matrix_prod_col_col_row");
-    createHeaders("matrix_prod_col_col_col");
-    createHeaders("matrix_solve_col_col");
-    createHeaders("matrix_solve_col_row");
-    createHeaders("matrix_solve_row_col");
-    createHeaders("matrix_solve_row_row");
-    createHeaders("scalar");
-    createHeaders("vector");
-    createHeaders("fft");
-    createHeaders("spai");
-}
-
diff --git a/auxiliary/coordinate_matrix/align1/vec_mul.cl b/auxiliary/coordinate_matrix/align1/vec_mul.cl
deleted file mode 100644
index 6528bd4..0000000
--- a/auxiliary/coordinate_matrix/align1/vec_mul.cl
+++ /dev/null
@@ -1,126 +0,0 @@
-
-//segmented parallel reduction. At present restricted to up to 256 threads
-void segmented_parallel_reduction(unsigned int row, 
-                                  float val, 
-                                  __local unsigned int * shared_rows, 
-                                  __local float * inter_results) 
-{ 
-  //barrier(CLK_LOCAL_MEM_FENCE); 
-  shared_rows[get_local_id(0)] = row; 
-  inter_results[get_local_id(0)] = val; 
-  float left = 0;
- 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  if( get_local_id(0) >=  1 && row == shared_rows[get_local_id(0) -  1] ) { left = inter_results[get_local_id(0) -  1]; }  
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >=  2 && row == shared_rows[get_local_id(0) -  2] ) { left = inter_results[get_local_id(0) -  2]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >=  4 && row == shared_rows[get_local_id(0) -  4] ) { left = inter_results[get_local_id(0) -  4]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >=  8 && row == shared_rows[get_local_id(0) -  8] ) { left = inter_results[get_local_id(0) -  8]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >= 16 && row == shared_rows[get_local_id(0) - 16] ) { left = inter_results[get_local_id(0) - 16]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >= 32 && row == shared_rows[get_local_id(0) - 32] ) { left = inter_results[get_local_id(0) - 32]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >= 64 && row == shared_rows[get_local_id(0) - 64] ) { left = inter_results[get_local_id(0) - 64]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  if( get_local_id(0) >= 128 && row == shared_rows[get_local_id(0) - 128] ) { left = inter_results[get_local_id(0) - 128]; } 
-  barrier(CLK_LOCAL_MEM_FENCE); 
-  inter_results[get_local_id(0)] += left; left = 0;
-  barrier(CLK_LOCAL_MEM_FENCE); 
-
-  //if( get_local_id(0) >= 256 && row == shared_rows[get_local_id(0) - 256] ) { left = inter_results[get_local_id(0) - 256]; } 
-  //barrier(CLK_LOCAL_MEM_FENCE);  
-  //inter_results[get_local_id(0)] += left; left = 0;
-  //barrier(CLK_LOCAL_MEM_FENCE); 
-}
-
-
-__kernel void vec_mul( 
-          __global const uint2 * coords, //(row_index, column_index) 
-          __global const float * elements, 
-          __global const uint  * group_boundaries,
-          __global const float * vector,  
-          __global float * result, 
-          __local unsigned int * shared_rows, 
-          __local float * inter_results) 
-{ 
-  uint2 tmp; 
-  float val;
-  uint last_index = get_local_size(0) - 1;
-  uint group_start = group_boundaries[get_group_id(0)];
-  uint group_end = group_boundaries[get_group_id(0) + 1];
-  uint k_end = 1 + (group_end - group_start - 1) / get_local_size(0);   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
-
-  uint local_index = 0;
-
-  for (uint k = 0; k < k_end; ++k)
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); 
-    
-    local_index = group_start + k * get_local_size(0) + get_local_id(0); 
-  
-    if (local_index < group_end)
-    {
-      tmp = coords[local_index]; 
-      val = elements[local_index] * vector[tmp.y]; 
-    }
-    else
-    {
-      tmp.x = 0;
-      tmp.y = 0;
-      val = 0;
-    }
-
-    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); 
-
-    //check for carry from previous loop run: 
-    if (get_local_id(0) == 0 && k > 0)
-    { 
-      if (tmp.x == shared_rows[last_index]) 
-        val += inter_results[last_index]; 
-      else 
-        result[shared_rows[last_index]] += inter_results[last_index]; 
-    } 
-
-    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); 
-
-    segmented_parallel_reduction(tmp.x, val, shared_rows, inter_results); //all threads have to enter this function
-
-    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); 
-
-    if (get_local_id(0) != last_index &&
-        shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] &&
-        inter_results[get_local_id(0)] != 0) 
-    { 
-      result[tmp.x] += inter_results[get_local_id(0)]; 
-    }
-   
-    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); 
-  } //for k
-   
-  if (get_local_id(0) == last_index && inter_results[last_index] != 0) 
-    result[tmp.x] += inter_results[last_index]; 
-}
\ No newline at end of file
diff --git a/auxiliary/coordinate_matrix/align128/dummy b/auxiliary/coordinate_matrix/align128/dummy
deleted file mode 100644
index 061329f..0000000
--- a/auxiliary/coordinate_matrix/align128/dummy
+++ /dev/null
@@ -1 +0,0 @@
-this is a dummy file for the software versioning system
diff --git a/auxiliary/coordinate_matrix/matrix.old_cl b/auxiliary/coordinate_matrix/matrix.old_cl
deleted file mode 100644
index f13708b..0000000
--- a/auxiliary/coordinate_matrix/matrix.old_cl
+++ /dev/null
@@ -1,822 +0,0 @@
-//helper:
-void helper_float_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-};
-
-
-/////////////////////////// MATRIX OPERATIONS ///////////////////////////////
-
-
-__kernel void float_packed_sparse_matrix_vector_mul_align1(
-          unsigned int elements_per_row,
-          __global const unsigned int * row_element_count,
-          __global const uint * column_indices, 
-          __global const float * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{
-  unsigned int row = get_global_id(0);
-  __global const float * cur_row_elements = elements + row*elements_per_row;
-  __global const uint * cur_row_column_indices = column_indices + row*elements_per_row;
-  for (; row < size; row += get_global_size(0))
-  {
-    cur_row_elements = elements + row*elements_per_row;
-    cur_row_column_indices = column_indices + row*elements_per_row;
-
-    float dot_prod = 0.0f;
-    for (unsigned int i = 0; i < row_element_count[row]; ++i)
-      dot_prod += cur_row_elements[i] * vector[cur_row_column_indices[i]];
-    result[row] = dot_prod;
-  }
-};
-
-//
-//__kernel void float_packed_sparse_matrix_vector_mul_align4(
-//          unsigned int elements_per_row,
-//          __global const unsigned int * row_element_count,
-//          __global const uint * column_indices, 
-//          __global const float * elements,
-//          __global const float * vector,  
-//          __global float * result,
-//          unsigned int size)
-//{
-//  unsigned int row = get_global_id(0);
-//  __global const float * cur_row_elements = elements + row*elements_per_row*4;
-//  __global const uint * cur_row_column_indices = column_indices + row*elements_per_row*4;
-//  for (; row < size; row += get_global_size(0))
-//  {
-//    cur_row_elements = elements + row*elements_per_row*4;
-//    cur_row_column_indices = column_indices + row*elements_per_row*4;
-//
-//    float dot_prod = 0.0f;
-//    for (unsigned int i = 0; i < row_element_count[row]*4; ++i)
-//      dot_prod += cur_row_elements[i] * vector[cur_row_column_indices[i]];
-//    result[row] = dot_prod;
-//  }
-//};
-
-
-__kernel void float_packed_sparse_matrix_vector_mul_align4(
-          unsigned int elements_per_row,
-          __global const unsigned int * row_element_count,
-          __global const uint4 * column_indices, 
-          __global const float4 * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{ 
-  unsigned int row = get_global_id(0);
-  __global const float4 * cur_row_elements = elements + row*elements_per_row;
-  __global const uint4 * cur_row_column_indices = column_indices + row*elements_per_row;
-  for (; row < size; row += get_global_size(0))
-  {
-    cur_row_elements = elements + row*elements_per_row;
-    cur_row_column_indices = column_indices + row*elements_per_row;
-
-    float dot_prod = 0.0f;
-    for (unsigned int i = 0; i < row_element_count[row]; ++i)
-    {
-      float4 tmp0;
-      float4 tmp1 = cur_row_elements[i];
-      uint4 ind0 = cur_row_column_indices[i];
-
-      tmp0.x = vector[ind0.x];
-      tmp0.y = vector[ind0.y];
-      tmp0.z = vector[ind0.z];
-      tmp0.w = vector[ind0.w];
-      
-      dot_prod += dot(tmp0, tmp1);
-    }
-    result[row] = dot_prod;
-  }
-};
-
-__kernel void float_packed_sparse_matrix_vector_mul_align8(
-          unsigned int elements_per_row,
-          __global const unsigned int * row_element_count,
-          __global const uint8 * column_indices, 
-          __global const float8 * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size)
-{ 
-  unsigned int row = get_global_id(0);
-  __global const float8 * cur_row_elements = elements + row*elements_per_row/8;
-  __global const uint8 * cur_row_column_indices = column_indices + row*elements_per_row/8;
-  for (; row < size; row += get_global_size(0))
-  {
-    cur_row_elements = elements + row*elements_per_row/8;
-    cur_row_column_indices = column_indices + row*elements_per_row/8;
-
-    float dot_prod = 0.0f;
-    for (unsigned int i = 0; i < row_element_count[row]/8; ++i)
-    {
-      float8 tmp0;
-      float8 tmp1 = cur_row_elements[i];
-      uint8 ind0 = cur_row_column_indices[i];
-
-      tmp0.s0 = vector[ind0.s0];
-      tmp0.s1 = vector[ind0.s1];
-      tmp0.s2 = vector[ind0.s2];
-      tmp0.s3 = vector[ind0.s3];
-      tmp0.s4 = vector[ind0.s4];
-      tmp0.s5 = vector[ind0.s5];
-      tmp0.s6 = vector[ind0.s6];
-      tmp0.s7 = vector[ind0.s7];
-
-      dot_prod += dot(tmp0.lo, tmp1.lo);
-      dot_prod += dot(tmp0.hi, tmp1.hi);
-
-      /*float4 tmp0;
-      float4 tmp1 = cur_row_elements[i];
-      float4 tmp2;
-      float4 tmp3 = cur_row_elements[i+1];
-      uint4 ind0 = cur_row_column_indices[i];
-      uint4 ind2 = cur_row_column_indices[i+1];
-
-      tmp0.x = vector[ind0.x];
-      tmp0.y = vector[ind0.y];
-      tmp0.z = vector[ind0.z];
-      tmp0.w = vector[ind0.w];
-
-      tmp2.x = vector[ind2.x];
-      tmp2.y = vector[ind2.y];
-      tmp2.z = vector[ind2.z];
-      tmp2.w = vector[ind2.w];
-      
-      dot_prod += dot(tmp0, tmp1);
-      dot_prod += dot(tmp2, tmp3);*/
-    }
-    result[row] = dot_prod;
-  }
-};
-//
-__kernel void float_compressed_mat_vec_mul_align1(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size) 
-{ 
-  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    for (unsigned int i = row_indices[row]; i < row_indices[row+1]; ++i)
-      dot_prod += elements[i] * vector[column_indices[i]];
-    result[row] = dot_prod;
-  }
-};
-
-
-
-//segmented parallel reduction. At present restricted to up to 128 threads
-void helper_float_segmented_parallel_reduction( unsigned int row, float val, __local unsigned int * shared_rows, __local float * inter_results )
-{
-  barrier(CLK_LOCAL_MEM_FENCE);
-  shared_rows[get_global_id(0)] = row;
-  inter_results[get_global_id(0)] = val;
-
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >=  1 && row == shared_rows[get_global_id(0) -  1] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) -  1]; } 
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >=  2 && row == shared_rows[get_global_id(0) -  2] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) -  2]; }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >=  4 && row == shared_rows[get_global_id(0) -  4] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) -  4]; }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >=  8 && row == shared_rows[get_global_id(0) -  8] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) -  8]; }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >= 16 && row == shared_rows[get_global_id(0) - 16] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) - 16]; }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >= 32 && row == shared_rows[get_global_id(0) - 32] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) - 32]; }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if( get_global_id(0) >= 64 && row == shared_rows[get_global_id(0) - 64] ) { inter_results[get_global_id(0)] += inter_results[get_global_id(0) - 64]; }
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-};
-
-
-__kernel void float_coord_mat_vec_mul_align1(
-          __global const uint2 * coords, //(row_index, column_index)
-          __global const float * elements,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int size,
-          __local unsigned int * shared_rows,
-          __local float * inter_results)
-{ 
-  uint2 tmp;
-  float val;
-  const uint last_index = get_global_size(0) - 1;
-  shared_rows[last_index] = 1;  //will prevent if-trigger in first loop run
-  inter_results[last_index] = 0.0f;
-  
-  for (unsigned int index = get_global_id(0); index < size; index += get_global_size(0))
-  {
-    tmp = coords[index];
-    val = elements[index] * vector[tmp.y];
-    
-    if (get_global_id(0) == 0)
-    {
-      //check for carry from previous loop run:
-      if (tmp.x == shared_rows[last_index])
-        val += inter_results[last_index];
-      else
-        result[shared_rows[last_index]] += inter_results[last_index];
-    }
-
-    helper_float_segmented_parallel_reduction(tmp.x, val, shared_rows, inter_results);
-    
-    if (get_global_id(0) != last_index && 
-         shared_rows[get_global_id(0)] != shared_rows[get_global_id(0) + 1])
-    {
-      result[tmp.x] += inter_results[get_global_id(0)];
-    }
-  } //for
-  
-  barrier(CLK_GLOBAL_MEM_FENCE);
-  
-  if (get_global_id(0) == last_index)
-  {
-    result[shared_rows[get_global_id(0)]] += inter_results[get_global_id(0)];
-  }
-};
-
-//
-//__kernel void float_sparse_matrix_vector_packed_mul(
-//          __global const unsigned int * row_indices,
-//          __global const unsigned int * column_indices, 
-//          __global const float * elements,
-//          __global const float * vector,  
-//          __global float * result,
-//          unsigned int size) 
-//{ 
-//  __global const float4 * vector_f4 = elements;
-//  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-//  {
-//    float dot_prod = 0.0f;
-//    unsigned int start = row_indices[row];
-//    unsigned int end = row_indices[row+1];
-//    
-//    for (unsigned int i = start/4; i < end/4; i+=2)
-//    {
-//      float4 tmp0;
-//      float4 tmp1 = vector_f4[i];
-//      float4 tmp2;
-//      float4 tmp3 = vector_f4[i+1];
-//      
-//      tmp0.x = vector[column_indices[8*i+0]];
-//      tmp0.y = vector[column_indices[8*i+1]];
-//      tmp0.z = vector[column_indices[8*i+2]];
-//      tmp0.w = vector[column_indices[8*i+3]];
-//      
-//      tmp2.x = vector[column_indices[8*i+4]];
-//      tmp2.y = vector[column_indices[8*i+5]];
-//      tmp2.z = vector[column_indices[8*i+6]];
-//      tmp2.w = vector[column_indices[8*i+7]];
-//      
-//      dot_prod += dot(tmp0, tmp1);
-//      dot_prod += dot(tmp2, tmp3);
-//    }
-//    
-//    result[row] = dot_prod;
-//  }
-//};
-//
-//float float_vector_inner_prod_impl(
-//          __global const float * vec1,
-//          __global const float * vec2,
-//          unsigned int size,
-//          __local float * tmp_buffer)
-//{
-//  float tmp = 0;
-//  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-//    tmp += vec1[i]*vec2[i];
-//  tmp_buffer[get_global_id(0)] = tmp;
-//  
-//  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-//  {
-//    barrier(CLK_LOCAL_MEM_FENCE);
-//    if (get_global_id(0) < stride)
-//      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-//  }
-//  
-//  return tmp_buffer[0];
-//}
-
-
-////// solver kernels for upper triangular systems
-
-__kernel void float_upper_triangular_substitute_inplace(
-          __global const float * matrix,
-          __global float * vector,
-          unsigned int row_length,
-          unsigned int size)
-{
-  float temp;
-  for (int row = size-1; row > -1; --row)
-  {
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row*row_length+row];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-    //eliminate column with index 'row' in parallel:
-    for  (int row_elim = get_global_id(0); row_elim < row; row_elim += get_global_size(0))
-      vector[row_elim] -= temp * matrix[row_elim*row_length+row];
-  }
-  
-};
-
-//transposed lower triangular matrix
-__kernel void float_trans_upper_triangular_substitute_inplace(
-          __global const float * matrix,
-          __global float * vector,
-          unsigned int row_length,
-          unsigned int size)
-{
-  float temp;
-  for (int row = size-1; row > -1; --row)
-  {
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*row_length];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int row_elim = get_global_id(0); row_elim < row; row_elim += get_global_size(0))
-      vector[row_elim] -= temp * matrix[row*row_length+row_elim];
-  }
-};
-
-
-////// solver kernels for lower triangular systems /////////////
-
-__kernel void float_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          __global float * vector,
-          unsigned int row_length,
-          unsigned int size)
-{
-  float temp;
-  for (int row = 0; row < size; ++row)
-  {
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*row_length];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int row_elim = row + get_global_id(0) + 1; row_elim < size; row_elim += get_global_size(0))
-      vector[row_elim] -= temp * matrix[row_elim*row_length+row];
-  }
-};
-
-//transposed upper triangular matrix
-__kernel void float_trans_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          __global float * vector,
-          unsigned int row_length,
-          unsigned int size)
-{
-  float temp;
-  for (int row = 0; row < size; ++row)
-  {
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*row_length];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int row_elim = row + get_global_id(0) + 1; row_elim < size; row_elim += get_global_size(0))
-      vector[row_elim] -= temp * matrix[row*row_length+row_elim];
-  }
-};
-
-
-// __kernel void float_trans_matvec_mul_align1(
-//           __global const float * matrix, //matrix is not transposed in memory!
-//           __global const float * vector,  
-//           __global float * result,
-//           unsigned int row_length,
-//           unsigned int size) 
-// { 
-//   for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-//   {
-//     float dot_prod = 0.0f;
-//     for (unsigned int col = 0; col < row_length; ++col)
-//       dot_prod += matrix[row + col*size] * vector[col];
-//     result[row] = dot_prod;
-//   }
-// };
-
-
-__kernel void float_trans_matrix_vector_mul_align1(
-          __global const float * matrix,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int matrix_row_length, //keep transpose operation in mind!
-          unsigned int vector_length, //keep transpose operation in mind!
-          unsigned int result_length) 
-{ 
-  for (unsigned int row = get_global_id(0); row < result_length; row += get_global_size(0))
-  {
-    float dot_prod2 = 0.0f;
-    for (unsigned int col = 0; col < vector_length; ++col)
-      dot_prod2 += matrix[row + col*matrix_row_length] * vector[col];
-    result[row] = dot_prod2;
-  }
-};
-
-
-__kernel void float_matrix_vector_mul_align1(
-          __global const float * matrix,
-          __global const float * vector,  
-          __global float * result,
-          unsigned int matrix_row_length, //keep transpose operation in mind!
-          unsigned int vector_length,
-          unsigned int result_length) 
-{ 
-  for (unsigned int row = get_global_id(0); row < result_length; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    for (unsigned int col = 0; col < vector_length; ++col)
-      dot_prod += matrix[row*matrix_row_length+col] * vector[col];
-    result[row] = dot_prod;
-  }
-};
-
-
-
-
-//perform a rank-1 update of the matrix, i.e. A += x * x^T
-__kernel void float_rank_1_update(
-          __global float * matrix,
-          __global const float * vector1,  
-          __global const float * vector2,  
-          unsigned int matrix_row_length,
-          unsigned int row_length,
-          unsigned int col_length) 
-{ 
-  float tmp;
-  unsigned int offset;
-
-  for (unsigned int row = get_global_id(0); row < row_length; row += get_global_size(0))
-  {
-    tmp = vector1[row];
-    offset = row*matrix_row_length;
-    for (unsigned int col = 0; col < col_length; ++col)
-      matrix[offset+col] += tmp * vector2[col];
-  }
-};
-
-__kernel void float_scaled_rank_1_update(
-          __global float * matrix,
-          float val,
-          __global const float * vector1,  
-          __global const float * vector2,  
-          unsigned int matrix_row_length,
-          unsigned int row_length,
-          unsigned int col_length) 
-{ 
-  float tmp;
-  unsigned int offset;
-
-  for (unsigned int row = get_global_id(0); row < row_length; row += get_global_size(0))
-  {
-    tmp = val * vector1[row];
-    offset = row*matrix_row_length;
-    for (unsigned int col = 0; col < col_length; ++col)
-      matrix[offset+col] += tmp * vector2[col];
-  }
-};
-
-
-
-//lu factorization of a matrix without pivoting:
-
-
-__kernel void float_matrix_lu_factorize(
-          __global float * matrix,
-          unsigned int matrix_row_length,
-          unsigned int size) 
-{ 
-  float temp;
-  unsigned rowi;
-  unsigned rowk;
-  for (unsigned int i=1; i<size; ++i)
-  {
-    rowi = i * matrix_row_length;
-    for (unsigned int k=0; k<i; ++k)
-    {
-      rowk = k * matrix_row_length;
-      if (get_global_id(0) == 0)
-        matrix[rowi + k] /= matrix[rowk + k];
-
-      barrier(CLK_GLOBAL_MEM_FENCE);
-      temp = matrix[rowi + k];
-      
-      //parallel subtraction:
-      for (unsigned int j=k+1 + get_global_id(0); j<size; j += get_global_size(0))
-        matrix[rowi + j] -= temp * matrix[rowk + j];
-    }
-  }
-} 
-
-
-/*
-__kernel void float_matrix_lu_factorize(
-          __global float * matrix,
-          __local float * buffer,                              
-          unsigned int matrix_row_length,
-          unsigned int size) 
-{ 
-  float temp;
-  unsigned rowi;
-  unsigned rowk;
-  for (unsigned int i=1; i<size; ++i)
-  {
-    rowi = i * matrix_row_length;
-    
-    //first step: obtain a_ik from a triangular solution step:
-    for (unsigned int k=0; k<i; ++k)
-    {
-      rowk = k * matrix_row_length;
-      if (get_global_id(0) == 0)
-        matrix[rowi + k] = matrix[rowi + k] / matrix[rowk + k];
-      barrier(CLK_GLOBAL_MEM_FENCE);
-      
-      temp = matrix[rowi + k];
-      
-      for  (unsigned int j = k + 1 + get_global_id(0); j < i; j += get_global_size(0))
-        matrix[rowi + j] -= temp * matrix[rowk + j];
-    }
-
-
-    //second step: subtract block A(k,j) with k=0..i-1 and j=i+1...size-1
-    if (i < get_global_size(0))
-    {
-      //condense column down to matrix(i,j):
-      for (unsigned int j=i+get_global_id(0); j<size; j += get_global_size(0))
-      {
-        temp = 0.0;      
-        //subtraction of A(j, 0:i-1) from A(j,i):
-        for (unsigned int k=0; k<i; ++k)
-          temp += matrix[rowi + k] * matrix[k * matrix_row_length + j];
-        matrix[rowi + j] -= temp;
-      } 
-    }
-    else
-    {
-      //parallel columns:
-      for (unsigned int j=i; j<size; ++j)
-      {
-        temp = 0.0;
-        for (unsigned int k=0; k<= i / get_global_size(0); ++k)
-        {
-          rowk = k*get_global_size(0) + get_global_id(0); //reused as row index k in matrix
-          if (rowk < i)
-            buffer[get_global_id(0)] = matrix[rowi + rowk] * matrix[rowk * matrix_row_length + j];
-          else
-            buffer[get_global_id(0)] = 0.0;
-          helper_float_parallel_reduction(buffer);
-          if (get_global_id(0) == 0)
-            temp += buffer[0];
-        }
-        
-        if (get_global_id(0) == 0)
-          matrix[rowi + j] -= temp;
-      } //for j
-    } //if 
-    
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-} */
-
-
-
-//solve LUx = y
-__kernel void float_matrix_lu_substitute(
-          __global float * matrix,
-          __global float * vector,
-          unsigned int matrix_row_length,
-          unsigned int size) 
-{ 
-  float temp;
-  
-  //forward substitution Lz = y
-  for (int row = 0; row < size; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-    temp = vector[row];
-
-    for  (int row_elim = row + get_global_id(0) + 1; row_elim < size; row_elim += get_global_size(0))
-      vector[row_elim] -= temp * matrix[row_elim*matrix_row_length+row];
-  }
-
-
-  //backward substitution: Ux = z
-  float_upper_triangular_substitute_inplace(matrix, vector, matrix_row_length, size);
-  
-}
-
- 
-// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
-__kernel void float_compressed_ilu_forward_substitute(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __local  int * buffer,                              
-          __local  float * vec_entries,   //a memory block from vector
-          __global float * vector,
-          unsigned int size) 
-{
-  int waiting_for; //block index that must be finished before the current thread can start
-  unsigned int waiting_for_index;
-  int block_offset;
-  unsigned int col;
-  unsigned int row;
-  unsigned int row_index_end;
-  
-  //forward substitution: one thread per row in blocks of get_global_size(0)
-  for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)
-  {
-    block_offset = block_num * get_global_size(0);
-    row = block_offset + get_global_id(0);
-    buffer[get_global_id(0)] = 0; //set flag to 'undone'
-    waiting_for = -1;
-
-    if (row < size)
-    {
-      vec_entries[get_global_id(0)] = vector[row];
-      waiting_for_index = row_indices[row];
-      row_index_end = row_indices[row+1];
-    }
-
-    //try to eliminate all lines in the block. 
-    //in worst case scenarios, in each step only one line can be substituted, thus loop
-    for (unsigned int k = 0; k<get_global_size(0); ++k)
-    {
-      //barrier(CLK_LOCAL_MEM_FENCE);
-      if (row < size) //valid index?
-      {
-        if (waiting_for >= 0)
-        {
-          if (buffer[waiting_for] == 1)
-            waiting_for = -1;
-        }
-        
-        if (waiting_for == -1) //substitution not yet done, check whether possible
-        {
-          //check whether reduction is possible:
-          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)
-          {
-            col = column_indices[j];
-            if (col < block_offset)
-              vec_entries[get_global_id(0)] -= elements[j] * vector[col];
-            else if (col < row)  //index is from current block
-            {
-              if (buffer[col - block_offset] == 0) //entry is not yet calculated
-              {
-                waiting_for = col - block_offset;
-                waiting_for_index = j;
-                break;
-              }
-              else  //updated entry is available in shared memory:
-                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];
-            }
-            else  //not a relevant entry
-              continue;
-          }
-          
-          if (waiting_for == -1)  //this row is done
-          {
-            buffer[get_global_id(0)] = 1;
-            waiting_for = -2; //magic number: thread is finished
-          }
-        } 
-      } //row < size
-      else
-        buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)
-
-      ///////// check whether all threads are done. If yes, exit loop /////////////
-      /*
-      if (get_global_id(0) == 0)
-        buffer[get_global_size(0) + 1] = 1;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      if (buffer[get_global_id(0)] == 0)
-        buffer[get_global_size(0) + 1] = 0;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      
-      if (buffer[get_global_size(0) + 1] > 0)  //all threads break this loop simultaneously
-        break; */
-      
-    } //for k
-    
-    //write to vector:
-    if (row < size)
-      vector[row] = vec_entries[get_global_id(0)];
-    
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  } //for block_num
-}
-
-// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
-__kernel void float_compressed_ilu_backward_substitute(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __local  int * buffer,                              
-          __global float * vector,
-          unsigned int size) 
-{
-  int waiting_for; //block index that must be finished before the current thread can start
-  int block_offset;
-  unsigned int col;
-  unsigned int row;
-  float row_entry;
-  
-  //forward substitution: one thread per row in blocks of get_global_size(0)
-  for (int block_num = size / get_global_size(0); block_num > -1; --block_num)
-  {
-    block_offset = block_num * get_global_size(0);
-    row = block_offset + get_global_id(0);
-    buffer[get_global_id(0)] = 0; //set flag to 'undone'
-    waiting_for = -1;
-    
-    if (row < size)
-      row_entry = vector[row];
-
-    //try to eliminate all lines in the block. 
-    //in worst case scenarios, in each step only one line can be substituted, thus loop
-    for (unsigned int k = 0; k<get_global_size(0); ++k)
-    {
-      barrier(CLK_LOCAL_MEM_FENCE);
-      if (row < size) //valid index?
-      {
-        if (waiting_for >= 0)
-        {
-          if (buffer[waiting_for] == 1)
-            waiting_for = -1;
-        }
-        
-        if (waiting_for == -1) //substitution not yet done, check whether possible
-        {
-          //check whether reduction is possible:
-          for (unsigned int j = row_indices[row] + 1; j < row_indices[row+1]; ++j)
-          {
-            col = column_indices[j];
-            if (col < block_offset + get_global_size(0))  //is index from current block?
-            {
-              if (buffer[col - block_offset] == 0) //entry is not yet calculated
-              {
-                waiting_for = col - block_offset;
-                break;
-              }
-            }
-          }
-        }
-        
-        //carry out reduction if possible:
-        if (waiting_for == -1)
-        {
-          for (unsigned int j = row_indices[row] + 1; j < row_indices[row+1]; ++j)
-            row_entry -= elements[j] * vector[column_indices[j]];
-          vector[row] = row_entry / elements[row_indices[row]];
-        
-          //this row is done
-          buffer[get_global_id(0)] = 1;
-          waiting_for = -2; //magic number: thread is finished
-        } //if 
-      } //row < size
-      else
-        buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)
-      
-      ///////// check whether all threads are done. If yes, exit loop /////////////
-      
-      if (get_global_id(0) == 0)
-        buffer[get_global_size(0) + 1] = 1;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      if (buffer[get_global_id(0)] == 0)
-        buffer[get_global_size(0) + 1] = 0;
-      barrier(CLK_LOCAL_MEM_FENCE);
-      
-      if (buffer[get_global_size(0) + 1] > 0)  //all threads break the loop simultaneously
-        break;
-
-      barrier(CLK_GLOBAL_MEM_FENCE);
-    } //for k
-  } //for block_num
-}
-
diff --git a/auxiliary/fft/align1/bluestein_post.cl b/auxiliary/fft/align1/bluestein_post.cl
deleted file mode 100644
index 563d9ea..0000000
--- a/auxiliary/fft/align1/bluestein_post.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-// Postprocessing phase of Bluestein algorithm
-__kernel void bluestein_post(__global float2* Z,
-                             __global float2* out,
-                             unsigned int size) 
-{
-    unsigned int glb_id = get_global_id(0);
-    unsigned int glb_sz = get_global_size(0);
-
-    unsigned int double_size = size << 1;
-    float sn_a, cs_a;
-    const float NUM_PI = 3.14159265358979323846;
-
-    for(unsigned int i = glb_id; i < size; i += glb_sz) {
-        unsigned int rm = i * i % (double_size);
-        float angle = (float)rm / size * (-NUM_PI);
-
-        sn_a = sincos(angle, &cs_a);
-
-        float2 b_i = (float2)(cs_a, sn_a);
-        out[i] = (float2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x);
-    }
-}
-
diff --git a/auxiliary/fft/align1/bluestein_pre.cl b/auxiliary/fft/align1/bluestein_pre.cl
deleted file mode 100644
index fe64e80..0000000
--- a/auxiliary/fft/align1/bluestein_pre.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-// Preprocessing phase of Bluestein algorithm
-__kernel void bluestein_pre(__global float2* input,
-                            __global float2* A,
-                            __global float2* B,
-                            unsigned int size,
-                            unsigned int ext_size
-                           ) {
-    unsigned int glb_id = get_global_id(0);
-    unsigned int glb_sz = get_global_size(0);
-
-    unsigned int double_size = size << 1;
-
-    float sn_a, cs_a;
-    const float NUM_PI = 3.14159265358979323846;
-
-    for(unsigned int i = glb_id; i < size; i += glb_sz) {
-        unsigned int rm = i * i % (double_size);
-        float angle = (float)rm / size * NUM_PI;
-
-        sn_a = sincos(-angle, &cs_a);
-
-        float2 a_i = (float2)(cs_a, sn_a);
-        float2 b_i = (float2)(cs_a, -sn_a);
-
-        A[i] = (float2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x);
-
-        B[i] = b_i;
-
-        // very bad instruction, to be fixed
-        if(i) 
-          B[ext_size - i] = b_i;
-    }
-}
-
diff --git a/auxiliary/fft/align1/complex_to_real.cl b/auxiliary/fft/align1/complex_to_real.cl
deleted file mode 100644
index 313d78a..0000000
--- a/auxiliary/fft/align1/complex_to_real.cl
+++ /dev/null
@@ -1,8 +0,0 @@
-__kernel void complex_to_real(__global float2* in,
-                              __global float* out,
-                              unsigned int size) {
-    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-        out[i] = in[i].x;
-    }
-}
-
diff --git a/auxiliary/fft/align1/fft_div_vec_scalar.cl b/auxiliary/fft/align1/fft_div_vec_scalar.cl
deleted file mode 100644
index 3c21cf7..0000000
--- a/auxiliary/fft/align1/fft_div_vec_scalar.cl
+++ /dev/null
@@ -1,7 +0,0 @@
-// divide a vector by a scalar (to be removed...)
-__kernel void fft_div_vec_scalar(__global float2* input1, unsigned int size, float factor) {
-    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-        input1[i] /= factor;
-    }
-}
-
diff --git a/auxiliary/fft/align1/fft_mult_vec.cl b/auxiliary/fft/align1/fft_mult_vec.cl
deleted file mode 100644
index 8fce0b4..0000000
--- a/auxiliary/fft/align1/fft_mult_vec.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-// elementwise product of two complex vectors
-__kernel void fft_mult_vec(__global const float2* input1,
-                          __global const float2* input2,
-                          __global float2* output,
-                          unsigned int size) {
-    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-        float2 in1 = input1[i];
-        float2 in2 = input2[i];
-
-        output[i] = (float2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
-    }
-}
-
diff --git a/auxiliary/fft/align1/real_to_complex.cl b/auxiliary/fft/align1/real_to_complex.cl
deleted file mode 100644
index 73888bc..0000000
--- a/auxiliary/fft/align1/real_to_complex.cl
+++ /dev/null
@@ -1,11 +0,0 @@
-// embedd a real-valued vector into a complex one
-__kernel void real_to_complex(__global float* in,
-                              __global float2* out,
-                              unsigned int size) {
-    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-        float2 val = 0;
-        val.x = in[i];
-        out[i] = val;
-    }
-}
-
diff --git a/auxiliary/fft/align1/reverse_inplace.cl b/auxiliary/fft/align1/reverse_inplace.cl
deleted file mode 100644
index 33c0a76..0000000
--- a/auxiliary/fft/align1/reverse_inplace.cl
+++ /dev/null
@@ -1,11 +0,0 @@
-// reverses the entries in a vector
-__kernel void reverse_inplace(__global float* vec, uint size) {
-    for(uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) {
-        float val1 = vec[i];
-        float val2 = vec[size - i - 1];
-
-        vec[i] = val2;
-        vec[size - i - 1] = val1;
-    }
-}
-
diff --git a/auxiliary/fft/align1/transpose.cl b/auxiliary/fft/align1/transpose.cl
deleted file mode 100644
index 5660fea..0000000
--- a/auxiliary/fft/align1/transpose.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-// simplistic matrix transpose function
-__kernel void transpose(__global float2* input,
-                        __global float2* output,
-                        unsigned int row_num,
-                        unsigned int col_num) {
-    unsigned int size = row_num * col_num;
-    for(unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) {
-        unsigned int row = i / col_num;
-        unsigned int col = i - row*col_num;
-
-        unsigned int new_pos = col * row_num + row;
-
-        output[new_pos] = input[i];
-    }
-}
-
diff --git a/auxiliary/fft/align1/transpose_inplace.cl b/auxiliary/fft/align1/transpose_inplace.cl
deleted file mode 100644
index fcbece7..0000000
--- a/auxiliary/fft/align1/transpose_inplace.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-// inplace-transpose of a matrix
-__kernel void transpose_inplace(__global float2* input,
-                        unsigned int row_num,
-                        unsigned int col_num) {
-    unsigned int size = row_num * col_num;
-    for(unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) {
-        unsigned int row = i / col_num;
-        unsigned int col = i - row*col_num;
-
-        unsigned int new_pos = col * row_num + row;
-
-        //new_pos = col < row?0:1;
-        //input[i] = new_pos;
-
-        if(i < new_pos) {
-            float2 val = input[i];
-            input[i] = input[new_pos];
-            input[new_pos] = val;
-        }
-    }
-}
-
diff --git a/auxiliary/fft/align1/vandermonde_prod.cl b/auxiliary/fft/align1/vandermonde_prod.cl
deleted file mode 100644
index 9c2cadc..0000000
--- a/auxiliary/fft/align1/vandermonde_prod.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-// computes the matrix vector product with a Vandermonde matrix
-__kernel void vandermonde_prod(__global float* vander,
-                                __global float* vector,
-                                __global float* result,
-                                uint size) {
-    for(uint i = get_global_id(0); i < size; i+= get_global_size(0)) {
-        float mul = vander[i];
-        float pwr = 1;
-        float val = 0;
-
-        for(uint j = 0; j < size; j++) {
-            val = val + pwr * vector[j];
-            pwr *= mul;
-        }
-            
-        result[i] = val;
-    }
-}
-
diff --git a/auxiliary/fft/align1/zero2.cl b/auxiliary/fft/align1/zero2.cl
deleted file mode 100644
index ff8d6aa..0000000
--- a/auxiliary/fft/align1/zero2.cl
+++ /dev/null
@@ -1,11 +0,0 @@
-// Zero two complex vectors (to avoid kernel launch overhead)
-__kernel void zero2(__global float2* input1,
-                    __global float2* input2,
-                    unsigned int size) {
-    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) {
-        input1[i] = 0;
-        input2[i] = 0;
-    }
-
-}
-
diff --git a/auxiliary/generate-blas3-prod-align1.cpp b/auxiliary/generate-blas3-prod-align1.cpp
deleted file mode 100755
index 3223c24..0000000
--- a/auxiliary/generate-blas3-prod-align1.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
-* Generates BLAS level 3 routines
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-
-#include <iostream>
-#include <stdlib.h>
-
-//generate code for C = op1(A) * op2(B), where A, B, C can have different storage layouts and opX(D) = D or trans(D)
-void printMatrixMatrixProduct(bool row_major_A, bool row_major_B, bool row_major_C,
-                              bool transpose_A, bool transpose_B)
-{
-  //write header:
-  std::cout << "// file automatically generated - do not edit!" << std::endl;
-  std::cout << "// matrix-matrix multiplication C = ";
-  if (transpose_A)
-    std::cout << "A^T * ";
-  else
-    std::cout << "A * ";
-  if (transpose_B)
-    std::cout << "B^T" << std::endl;
-  else
-    std::cout << "B" << std::endl;
-  std::cout << "// matrix layouts: ";
-  if (row_major_C)
-    std::cout << "C...row_major, ";
-  else
-    std::cout << "C...col_major, ";
-  if (row_major_A)
-    std::cout << "A...row_major, ";
-  else
-    std::cout << "A...col_major, ";
-  if (row_major_B)
-    std::cout << "B...row_major" << std::endl;
-  else
-    std::cout << "B...col_major" << std::endl;
-  
-  //start OpenCL code:
-  std::cout << "__kernel void prod_";
-  if (transpose_A)
-    std::cout << "T";
-  else
-    std::cout << "A";
-  if (transpose_B)
-    std::cout << "T";
-  else
-    std::cout << "A";
-  
-  std::cout << "(" << std::endl;
-  std::cout << "          __global const float * A," << std::endl;
-  std::cout << "          unsigned int A_row_start," << std::endl;
-  std::cout << "          unsigned int A_col_start," << std::endl;
-  std::cout << "          unsigned int A_row_size," << std::endl;   //number of elements starting from row_start!
-  std::cout << "          unsigned int A_col_size," << std::endl;
-  std::cout << "          unsigned int A_internal_rows," << std::endl;
-  std::cout << "          unsigned int A_internal_cols," << std::endl;
-  std::cout << "          __global const float * B,  " << std::endl;
-  std::cout << "          unsigned int B_row_start," << std::endl;
-  std::cout << "          unsigned int B_col_start," << std::endl;
-  std::cout << "          unsigned int B_row_size," << std::endl;
-  std::cout << "          unsigned int B_col_size," << std::endl;
-  std::cout << "          unsigned int B_internal_rows," << std::endl;
-  std::cout << "          unsigned int B_internal_cols," << std::endl;
-  std::cout << "          __global float * C," << std::endl;
-  std::cout << "          unsigned int C_row_start," << std::endl;
-  std::cout << "          unsigned int C_col_start," << std::endl;
-  std::cout << "          unsigned int C_row_size," << std::endl;
-  std::cout << "          unsigned int C_col_size," << std::endl;
-  std::cout << "          unsigned int C_internal_rows," << std::endl;
-  std::cout << "          unsigned int C_internal_cols," << std::endl;
-  std::cout << "          __local float * bufA," << std::endl;
-  std::cout << "          __local float * bufB) " << std::endl;
-  std::cout << "{ " << std::endl;
-  //do not forgot to change block_size !!!
-  std::cout << "  size_t block_size = 16;//get_local_size(0);" << std::endl;
-  std::cout << "  size_t row_block_id = get_group_id(0);" << std::endl;
-  std::cout << "  size_t col_block_id = get_group_id(1);" << std::endl;
-  std::cout << "  size_t row_thread_id = get_local_id(0);" << std::endl;
-  std::cout << "  size_t col_thread_id = get_local_id(1);" << std::endl;
-  std::cout << "  size_t row_block_id_ = get_local_id(1);" << std::endl;
-  
-  //traverse block row of A (taking mem layout and transpose operation into account)
-  if (row_major_A && transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size + A_col_start) + A_row_start * A_internal_cols;" << std::endl;
-    std::cout << "  size_t aStep = block_size * A_internal_cols;" << std::endl;
-  }
-  else if (row_major_A && !transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size + A_row_start) * A_internal_cols + A_col_start;" << std::endl;
-    std::cout << "  size_t aStep = block_size;" << std::endl;
-  }
-  else if (!row_major_A && transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size + A_col_start) * A_internal_rows + A_row_start;" << std::endl;
-    std::cout << "  size_t aStep = block_size;" << std::endl;
-  }
-  else if (!row_major_A && !transpose_A)
-  {
-    std::cout << "  size_t aBegin = (row_block_id * block_size + A_row_start) + A_col_start * A_internal_rows;" << std::endl;
-    std::cout << "  size_t aStep = block_size * A_internal_rows;" << std::endl;
-  }
-
-
-  if (row_major_B && transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size + B_row_start) * B_internal_cols + B_col_start;" << std::endl;
-    std::cout << "  size_t bStep = block_size;" << std::endl;
-  }
-  else if (row_major_B && !transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size + B_col_start) + B_row_start * B_internal_cols;" << std::endl;
-    std::cout << "  size_t bStep = block_size * B_internal_cols;" << std::endl;
-  }
-  else if (!row_major_B && transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size + B_row_start) + B_col_start * B_internal_rows;" << std::endl;
-    std::cout << "  size_t bStep = block_size * B_internal_rows;" << std::endl;
-  }
-  else if (!row_major_B && !transpose_B)
-  {
-    std::cout << "  size_t bBegin = (col_block_id * block_size + B_col_start) * B_internal_rows + B_row_start;" << std::endl;
-    std::cout << "  size_t bStep = block_size;" << std::endl;
-  }
-
-
-  if (transpose_A)
-    std::cout << "  size_t block_num = (A_row_size + block_size - 1) / block_size;" << std::endl;
-  else
-    std::cout << "  size_t block_num = (A_col_size + block_size - 1) / block_size;" << std::endl;
-    
-  std::cout << "  float Csub = 0;" << std::endl;
-  
-  //offset of the the memory access by the thread relative to the beginning of the block:
-  if (row_major_A)
-    std::cout << "  size_t aOffset = row_thread_id + col_thread_id * A_internal_cols;" << std::endl;
-  else
-    std::cout << "  size_t aOffset = row_thread_id + col_thread_id * A_internal_rows;" << std::endl;
-
-  if (row_major_B)
-    std::cout << "  size_t bOffset = row_thread_id + col_thread_id * B_internal_cols;" << std::endl;
-  else
-    std::cout << "  size_t bOffset = row_thread_id + col_thread_id * B_internal_rows;" << std::endl;
-
-  std::cout << std::endl;  
-  
-  std::cout << "  size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);" << std::endl;
-  std::cout << "  size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);" << std::endl;
-
-  std::cout << "  for (size_t block = 0;" << std::endl;
-  std::cout << "           block < block_num;" << std::endl;
-  std::cout << "           ++block)" << std::endl;
-  std::cout << "  {" << std::endl;
-  
-  //read block from A and check for access within matrix:
-/*  if (transpose_A)
-    std::cout << "    if (block * block_size + col_thread_id < A_rows && get_global_id(0) < A_cols)" << std::endl;
-  else 
-    std::cout << "    if (block * block_size + col_thread_id < A_cols && get_global_id(0) < A_rows)" << std::endl;
-  
-  std::cout << "      bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];" << std::endl;
-  std::cout << "    else" << std::endl;
-  std::cout << "      bufA[row_thread_id * block_size + col_thread_id] = 0;" << std::endl;*/
-
-  //new code:
-  if (transpose_A && row_major_A)
-    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else if (transpose_A && !row_major_A)
-    std::cout << "    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else if (!transpose_A && row_major_A)
-    std::cout << "    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else if (!transpose_A && !row_major_A)
-    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
-
-
-  if (transpose_B && row_major_B)
-    std::cout << "    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else if (transpose_B && !row_major_B)
-    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else if (!transpose_B && row_major_B)
-    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else if (!transpose_B && !row_major_B)
-    std::cout << "    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
-
-  //computation of block-matrix-matrix product is the same for all cases:
-  std::cout << "    barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
-  //std::cout << "    for (size_t k = 0; k < block_size; ++k)" << std::endl;
-  //std::cout << "      Csub += bufA[row_thread_id_times_block_size + k] * bufB[k * block_size + col_thread_id];" << std::endl;
-  //loop unrolling:
-  std::cout << "    __local float * bufAptr = bufA + row_thread_id_times_block_size;" << std::endl;
-  std::cout << "    __local float * bufBptr = bufB + col_thread_id_times_block_size;" << std::endl;
-  //std::cout << "      Csub += bufA[row_thread_id_times_block_size] * bufB[col_thread_id * block_size];" << std::endl;
-  // code in following line depends on block size and must be changed in case of block_size changes
-  std::cout << "      for(int i = 0; i < 4; i++) {" << std::endl;
-  for (size_t unroll = 0; unroll < 4; ++unroll) {
-    std::cout << "      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;" << std::endl;
-  }
-  std::cout << "     }" << std::endl;
-    //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << "  + col_thread_id * block_size];" << std::endl;
-    //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << " * block_size + col_thread_id];" << std::endl;
-    //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << "];" << std::endl;
-  std::cout << "    barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
-  std::cout << "    aBegin += aStep;" << std::endl;
-  std::cout << "    bBegin += bStep;" << std::endl;
-  std::cout << "  }" << std::endl;
-  
-  
-  if (transpose_A)
-    std::cout << "  if (get_global_id(0) < A_col_size && ";
-  else
-    std::cout << "  if (get_global_id(0) < A_row_size && ";
-  
-  if (transpose_B)
-    std::cout << "get_global_id(1) < B_row_size)" << std::endl;
-  else
-    std::cout << "get_global_id(1) < B_col_size)" << std::endl;
-  
-  if (row_major_C)
-    std::cout << "    C[(get_global_id(0) + C_row_start) * C_internal_cols + get_global_id(1) + C_col_start] = Csub;" << std::endl;
-  else
-    std::cout << "    C[get_global_id(0) + C_row_start + (get_global_id(1) + C_col_start) * C_internal_rows] = Csub;" << std::endl;
-  std::cout << "}" << std::endl;
-  
-}
-
-void printUsage()
-{
-  std::cout << "Must have five parameters for C = A * B:" << std::endl;
-  std::cout << " 0/1 : storage layout for A (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for B (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for C (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : transpose for A (no/yes)" << std::endl;
-  std::cout << " 0/1 : transpose for B (no/yes)" << std::endl;
-}
-
-void readParameter(bool & param, char input)
-{
-  if (input == '0')
-    param = false;
-  else if (input == '1')
-    param = true;
-  else
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-}
-
-int main(int args, char * argsv[])
-{
-  if (args != 6)
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-  
-  //the following flags are 'true' for row_major layout
-  bool layout_A;
-  bool layout_B;
-  bool layout_C;
-
-  readParameter(layout_A, argsv[1][0]);
-  readParameter(layout_B, argsv[2][0]);
-  readParameter(layout_C, argsv[3][0]);
-  
-  bool transpose_A;
-  bool transpose_B;
-  readParameter(transpose_A, argsv[4][0]);
-  readParameter(transpose_B, argsv[5][0]);
-  
-  
-  printMatrixMatrixProduct(layout_A, layout_B, layout_C, transpose_A, transpose_B);
-}
diff --git a/auxiliary/generate-blas3-solve-align1.cpp b/auxiliary/generate-blas3-solve-align1.cpp
deleted file mode 100644
index 9e4ec1d..0000000
--- a/auxiliary/generate-blas3-solve-align1.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-* Generates BLAS level 3 routines for direct solve
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-
-#include <iostream>
-#include <stdlib.h>
-
-//generate code for inplace_solve(op1(A), op2(B), tag) where A and B can have different storage layouts and opX(D) = D or trans(D)
-void printMatrixMatrixSolve(bool row_major_A, bool row_major_B,
-                            bool transpose_A, bool transpose_B,
-                            bool upper_solve, bool unit_diagonal)
-{
-  //write header:
-  std::cout << "// file automatically generated - do not edit!" << std::endl;
-  std::cout << "// inplace solve ";
-  if (transpose_A)
-    std::cout << "A^T \\\\ ";
-  else
-    std::cout << "A \\\\ ";
-  if (transpose_B)
-    std::cout << "B^T" << std::endl;
-  else
-    std::cout << "B" << std::endl;
-  std::cout << "// matrix layouts: ";
-  if (row_major_A)
-    std::cout << "A...row_major, ";
-  else
-    std::cout << "A...col_major, ";
-  if (row_major_B)
-    std::cout << "B...row_major" << std::endl;
-  else
-    std::cout << "B...col_major" << std::endl;
-  
-  //start OpenCL code:
-  std::cout << "__kernel void ";
-  if (transpose_A)
-    std::cout << "trans_";
-  if (unit_diagonal)
-    std::cout << "unit_";
-  if (upper_solve)
-    std::cout << "upper_";
-  else
-    std::cout << "lower_";
-  if (transpose_B)
-    std::cout << "trans_";
-  std::cout << "solve";
-  
-  std::cout << "(" << std::endl;
-  std::cout << "          __global const float * A," << std::endl;
-  std::cout << "          unsigned int A_rows," << std::endl;
-  std::cout << "          unsigned int A_cols," << std::endl;
-  std::cout << "          unsigned int A_internal_rows," << std::endl;
-  std::cout << "          unsigned int A_internal_cols," << std::endl;
-  std::cout << "          __global float * B,  " << std::endl;
-  std::cout << "          unsigned int B_rows," << std::endl;
-  std::cout << "          unsigned int B_cols," << std::endl;
-  std::cout << "          unsigned int B_internal_rows," << std::endl;
-  std::cout << "          unsigned int B_internal_cols)" << std::endl;
-  std::cout << "{ " << std::endl;
-  std::cout << "  float temp; " << std::endl;
-  if (upper_solve)
-  {
-    //Note: A is square, thus A_rows == A_cols and no dispatch for transposedness needed
-    std::cout << "  for (int row = A_rows-1; row > -1; --row) " << std::endl;
-  }
-  else //lower triangular solve
-  {
-    std::cout << "  for (int row = 0; row < A_rows; ++row) " << std::endl;
-  }
-  std::cout << "  { " << std::endl;
-  if (!unit_diagonal)
-  {
-    std::cout << "    barrier(CLK_GLOBAL_MEM_FENCE); " << std::endl;
-    std::cout << "    if (get_local_id(0) == 0) " << std::endl;
-    //Note: A is square, thus A_internal_rows == A_internal_cols and no dispatch for transposedness needed
-    if (row_major_B && transpose_B)
-      std::cout << "      B[row + get_group_id(0) * B_internal_cols] /= A[row + row*A_internal_cols]; " << std::endl;
-    else if (row_major_B && !transpose_B)
-      std::cout << "      B[row * B_internal_cols + get_group_id(0)] /= A[row + row*A_internal_cols]; " << std::endl;
-    else if (!row_major_B && transpose_B)
-      std::cout << "      B[row * B_internal_rows + get_group_id(0)] /= A[row + row*A_internal_cols]; " << std::endl;
-    else if (!row_major_B && !transpose_B)
-      std::cout << "      B[row + get_group_id(0) * B_internal_rows] /= A[row + row*A_internal_cols]; " << std::endl;
-  }
-  
-  std::cout << "    barrier(CLK_GLOBAL_MEM_FENCE); " << std::endl;
-  
-  if (row_major_B && transpose_B)
-    std::cout << "      temp = B[row + get_group_id(0) * B_internal_cols]; " << std::endl;
-  else if (row_major_B && !transpose_B)
-    std::cout << "      temp = B[row * B_internal_cols + get_group_id(0)]; " << std::endl;
-  else if (!row_major_B && transpose_B)
-    std::cout << "      temp = B[row * B_internal_rows + get_group_id(0)]; " << std::endl;
-  else if (!row_major_B && !transpose_B)
-    std::cout << "      temp = B[row + get_group_id(0) * B_internal_rows]; " << std::endl;
-
-  std::cout << "    //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
-  if (upper_solve)
-    std::cout << "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) " << std::endl;
-  else
-    std::cout << "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) " << std::endl;
-  
-  if (row_major_B && transpose_B)
-    std::cout << "      B[elim + get_group_id(0) * B_internal_cols] -= temp * ";
-  else if (row_major_B && !transpose_B)
-    std::cout << "      B[elim * B_internal_cols + get_group_id(0)] -= temp * ";
-  else if (!row_major_B && transpose_B)
-    std::cout << "      B[elim * B_internal_rows + get_group_id(0)] -= temp * ";
-  else if (!row_major_B && !transpose_B)
-    std::cout << "      B[elim + get_group_id(0) * B_internal_rows] -= temp * ";
-  
-  if (row_major_A && transpose_A)
-    std::cout << "A[elim + row * A_internal_cols];" << std::endl;
-  else if (row_major_A && !transpose_A)
-    std::cout << "A[elim * A_internal_cols + row];" << std::endl;
-  else if (!row_major_A && transpose_A)
-    std::cout << "A[elim * A_internal_rows + row];" << std::endl;
-  else if (!row_major_A && !transpose_A)
-    std::cout << "A[elim + row * A_internal_rows];" << std::endl;
-  
-  std::cout << "   }" << std::endl;
-  std::cout << "}" << std::endl;
-  
-}
-
-void printUsage()
-{
-  std::cout << "Must have six parameters for A \\ B:" << std::endl;
-  std::cout << " 0/1 : storage layout for A (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : storage layout for B (column_major/row_major)" << std::endl;
-  std::cout << " 0/1 : transpose for A (no/yes)" << std::endl;
-  std::cout << " 0/1 : transpose for B (no/yes)" << std::endl;
-  std::cout << " 0/1 : upper triangular system (no/yes)" << std::endl;
-  std::cout << " 0/1 : has unit diagonal (no/yes)" << std::endl;
-}
-
-void readParameter(bool & param, char input)
-{
-  if (input == '0')
-    param = false;
-  else if (input == '1')
-    param = true;
-  else
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-}
-
-int main(int args, char * argsv[])
-{
-  if (args != 7)
-  {
-    printUsage();
-    exit(EXIT_FAILURE);
-  }
-  
-  //the following flags are 'true' for row_major layout
-  bool layout_A;
-  bool layout_B;
-
-  readParameter(layout_A, argsv[1][0]);
-  readParameter(layout_B, argsv[2][0]);
-  
-  bool transpose_A;
-  bool transpose_B;
-  readParameter(transpose_A, argsv[3][0]);
-  readParameter(transpose_B, argsv[4][0]);
-  
-  bool upper_solve;
-  bool unit_diagonal;
-  readParameter(upper_solve,   argsv[5][0]);
-  readParameter(unit_diagonal, argsv[6][0]);
-  
-  printMatrixMatrixSolve(layout_A, layout_B,
-                         transpose_A, transpose_B,
-                         upper_solve, unit_diagonal);
-}
diff --git a/auxiliary/matrix_col/align1/add.cl b/auxiliary/matrix_col/align1/add.cl
deleted file mode 100644
index 2a09d6f..0000000
--- a/auxiliary/matrix_col/align1/add.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-
-__kernel void add(  // C = A + B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[i + C_row_start + (j + C_col_start) * C_internal_rows] =  A[i + A_row_start + (j + A_col_start) * A_internal_rows]
-                                                                  + B[i + B_row_start + (j + B_col_start) * B_internal_rows];
-}
diff --git a/auxiliary/matrix_col/align1/assign.cl b/auxiliary/matrix_col/align1/assign.cl
deleted file mode 100644
index 55a678f..0000000
--- a/auxiliary/matrix_col/align1/assign.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void assign( // A <- B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] = B[i + B_row_start + (j + B_col_start) * B_internal_rows];
-}
-
diff --git a/auxiliary/matrix_col/align1/clear.cl b/auxiliary/matrix_col/align1/clear.cl
deleted file mode 100644
index bb11f6a..0000000
--- a/auxiliary/matrix_col/align1/clear.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void clear( // A <- 0
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] = 0;
-}
diff --git a/auxiliary/matrix_col/align1/cpu_inplace_mult.cl b/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
deleted file mode 100644
index 833b51b..0000000
--- a/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void cpu_inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] *= factor;
-}
-
diff --git a/auxiliary/matrix_col/align1/fft_direct.cl b/auxiliary/matrix_col/align1/fft_direct.cl
deleted file mode 100644
index 58818ef..0000000
--- a/auxiliary/matrix_col/align1/fft_direct.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-// Direct FFT computation (quadratic complexity - use for reference only)
-__kernel void fft_direct(__global float2* input,
-                         __global float2* output,
-                         unsigned int size,
-                         unsigned int stride,
-                         unsigned int batch_num,
-                         float sign) {
-                         
-    const float NUM_PI = 3.14159265358979323846;
-    
-    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
-        for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {
-            float2 f = 0.0f;
-
-            for(unsigned int n = 0; n < size; n++) {
-                float2 in = input[n * stride + batch_id]; //input index here
-
-                float sn, cs;
-                float arg = sign * 2 * NUM_PI * k / size * n;
-                sn = sincos(arg, &cs);
-
-                float2 ex = (float2)(cs, sn);
-                f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);
-            }
-
-            output[k * stride + batch_id] = f;// output index here
-        }
-    }
-}
diff --git a/auxiliary/matrix_col/align1/fft_radix2.cl b/auxiliary/matrix_col/align1/fft_radix2.cl
deleted file mode 100644
index 998cabc..0000000
--- a/auxiliary/matrix_col/align1/fft_radix2.cl
+++ /dev/null
@@ -1,39 +0,0 @@
-__kernel void fft_radix2(__global float2* input,
-                         unsigned int s,
-                         unsigned int bit_size,
-                         unsigned int size,
-                         unsigned int stride,
-                         unsigned int batch_num,
-                         float sign) {
-
-    unsigned int ss = 1 << s;
-    unsigned int half_size = size >> 1;
-
-    float cs, sn;
-    const float NUM_PI = 3.14159265358979323846;
-
-    unsigned int glb_id = get_global_id(0);
-    unsigned int glb_sz = get_global_size(0);
-	
-    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
-        for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {
-            unsigned int group = (tid & (ss - 1));
-            unsigned int pos = ((tid >> s) << (s + 1)) + group;
-
-            unsigned int offset = pos * stride + batch_id;
-            float2 in1 = input[offset];//index
-            float2 in2 = input[offset + ss * stride];//index
-
-            float arg = group * sign * NUM_PI / ss;
-
-            sn = sincos(arg, &cs);
-            float2 ex = (float2)(cs, sn);
-
-            float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);
-
-            input[offset + ss * stride] = in1 - tmp;//index
-            input[offset] = in1 + tmp;//index
-        }
-    }
-}
-
diff --git a/auxiliary/matrix_col/align1/fft_radix2_local.cl b/auxiliary/matrix_col/align1/fft_radix2_local.cl
deleted file mode 100644
index c6e2f5d..0000000
--- a/auxiliary/matrix_col/align1/fft_radix2_local.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-
-unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    v = (v >> 16) | (v << 16);
-
-    v = v >> (32 - bit_size);
-
-    return v;
-}
-
-__kernel void fft_radix2_local(__global float2* input,
-                                __local float2* lcl_input,
-                                unsigned int bit_size,
-                                unsigned int size,
-                                unsigned int stride,
-                                unsigned int batch_num,
-                                float sign) {
-
-    unsigned int grp_id = get_group_id(0);
-    unsigned int grp_num = get_num_groups(0);
-
-    unsigned int lcl_sz = get_local_size(0);
-    unsigned int lcl_id = get_local_id(0);
-    const float NUM_PI = 3.14159265358979323846;
-
-    for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {
-        //unsigned int base_offset = stride * batch_id;
-        //copy chunk of global memory to local
-
-        for(unsigned int p = lcl_id; p < size; p += lcl_sz) {
-            unsigned int v = get_reorder_num(p, bit_size);
-            lcl_input[v] = input[p * stride + batch_id];//index
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        //performs Cooley-Tukey FFT on local array
-        for(unsigned int s = 0; s < bit_size; s++) {
-            unsigned int ss = 1 << s;
-
-            float cs, sn;
-
-            for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) {
-                unsigned int group = (tid & (ss - 1));
-                unsigned int pos = ((tid >> s) << (s + 1)) + group;
-
-                float2 in1 = lcl_input[pos];
-                float2 in2 = lcl_input[pos + ss];
-
-                float arg = group * sign * NUM_PI / ss;
-
-                sn = sincos(arg, &cs);
-                float2 ex = (float2)(cs, sn);
-
-                float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);
-
-                lcl_input[pos + ss] = in1 - tmp;
-                lcl_input[pos] = in1 + tmp;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        //copy local array back to global memory
-        for(unsigned int p = lcl_id; p < size; p += lcl_sz) {
-            input[p * stride + batch_id] = lcl_input[p];//index
-        }
-
-    }
-}
-
diff --git a/auxiliary/matrix_col/align1/fft_reorder.cl b/auxiliary/matrix_col/align1/fft_reorder.cl
deleted file mode 100644
index 118e6f4..0000000
--- a/auxiliary/matrix_col/align1/fft_reorder.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
-* Performs reordering of input data in bit-reversal order
-* Probably it's better to do in host side,
-*/
-
-unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    v = (v >> 16) | (v << 16);
-
-    v = v >> (32 - bit_size);
-
-    return v;
-}
-
-__kernel void fft_reorder(__global float2* input,
-                          unsigned int bit_size,
-                          unsigned int size,
-                          unsigned int stride,
-                          int batch_num) {
-    unsigned int glb_id = get_global_id(0);
-    unsigned int glb_sz = get_global_size(0);
-	
-    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
-        for(unsigned int i = glb_id; i < size; i += glb_sz) {
-            unsigned int v = get_reorder_num_2(i, bit_size);
-
-            if(i < v) {
-                float2 tmp = input[i * stride + batch_id]; // index
-                input[i * stride + batch_id] = input[v * stride + batch_id]; //index
-                input[v * stride + batch_id] = tmp; //index
-            }
-        }
-    }
-}
-
diff --git a/auxiliary/matrix_col/align1/inplace_add.cl b/auxiliary/matrix_col/align1/inplace_add.cl
deleted file mode 100644
index db947d0..0000000
--- a/auxiliary/matrix_col/align1/inplace_add.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-__kernel void inplace_add( // A += B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] += B[i + B_row_start + (j + B_col_start) * B_internal_rows];
-}
diff --git a/auxiliary/matrix_col/align1/inplace_divide.cl b/auxiliary/matrix_col/align1/inplace_divide.cl
deleted file mode 100644
index 640a0dc..0000000
--- a/auxiliary/matrix_col/align1/inplace_divide.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void inplace_divide( // A /= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] /= factor;
-}
-
diff --git a/auxiliary/matrix_col/align1/inplace_mult.cl b/auxiliary/matrix_col/align1/inplace_mult.cl
deleted file mode 100644
index 9c24038..0000000
--- a/auxiliary/matrix_col/align1/inplace_mult.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] *= factor;
-}
-
-
diff --git a/auxiliary/matrix_col/align1/inplace_sub.cl b/auxiliary/matrix_col/align1/inplace_sub.cl
deleted file mode 100644
index dc86b4c..0000000
--- a/auxiliary/matrix_col/align1/inplace_sub.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void inplace_sub( // A -= B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[i + A_row_start + (j + A_col_start) * A_internal_rows] -= B[i + B_row_start + (j + B_col_start) * B_internal_rows];
-}
-
diff --git a/auxiliary/matrix_col/align1/lower_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/lower_triangular_substitute_inplace.cl
deleted file mode 100644
index acd4ef8..0000000
--- a/auxiliary/matrix_col/align1/lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-__kernel void lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*matrix_internal_rows];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align1/lu_factorize.cl b/auxiliary/matrix_col/align1/lu_factorize.cl
deleted file mode 100644
index 9f1b38f..0000000
--- a/auxiliary/matrix_col/align1/lu_factorize.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-__kernel void lu_factorize(
-          __global float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols) 
-{ 
-  float temp;
-  for (unsigned int i=1; i<matrix_rows; ++i)
-  {
-    for (unsigned int k=0; k<i; ++k)
-    {
-      if (get_global_id(0) == 0)
-        matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows];
-
-      barrier(CLK_GLOBAL_MEM_FENCE);
-      temp = matrix[i + k*matrix_internal_rows];
-      
-      //parallel subtraction:
-      for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0))
-        matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows];
-    }
-  }
-} 
-
-
diff --git a/auxiliary/matrix_col/align1/rank1_update.cl b/auxiliary/matrix_col/align1/rank1_update.cl
deleted file mode 100644
index c476483..0000000
--- a/auxiliary/matrix_col/align1/rank1_update.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-//perform a rank-1 update of the matrix, i.e. A += x * x^T
-__kernel void rank1_update(
-          __global float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global const float * vector1,  
-          __global const float * vector2) 
-{ 
-  float tmp;
-
-  for (unsigned int row= get_global_id(0); row < matrix_rows; row += get_global_size(0))
-  {
-    tmp = vector1[row];
-    for (unsigned int col = 0; col < matrix_cols; ++col)
-      matrix[row + col * matrix_internal_rows] += tmp * vector2[col];
-  }
-}
-
diff --git a/auxiliary/matrix_col/align1/scaled_rank1_update.cl b/auxiliary/matrix_col/align1/scaled_rank1_update.cl
deleted file mode 100644
index b0db8df..0000000
--- a/auxiliary/matrix_col/align1/scaled_rank1_update.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void scaled_rank1_update(
-          __global float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          float val,
-          __global const float * vector1,  
-          __global const float * vector2) 
-{ 
-  float tmp;
-
-  for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))
-  {
-    tmp = val * vector1[row];
-    for (unsigned int col = 0; col < matrix_cols; ++col)
-      matrix[row + col*matrix_internal_rows] += tmp * vector2[col];
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align1/sub.cl b/auxiliary/matrix_col/align1/sub.cl
deleted file mode 100644
index 4699866..0000000
--- a/auxiliary/matrix_col/align1/sub.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-
-__kernel void sub(  // C = A - B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[i + C_row_start + (j + C_col_start) * C_internal_rows] =  A[i + A_row_start + (j + A_col_start) * A_internal_rows]
-                                                                  - B[i + B_row_start + (j + B_col_start) * B_internal_rows];
-}
diff --git a/auxiliary/matrix_col/align1/trans_lower_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/trans_lower_triangular_substitute_inplace.cl
deleted file mode 100644
index 3b90add..0000000
--- a/auxiliary/matrix_col/align1/trans_lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-__kernel void trans_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*matrix_internal_rows];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align1/trans_unit_lower_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/trans_unit_lower_triangular_substitute_inplace.cl
deleted file mode 100644
index a4fec7b..0000000
--- a/auxiliary/matrix_col/align1/trans_unit_lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void trans_unit_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align1/trans_unit_upper_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/trans_unit_upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 722c9d3..0000000
--- a/auxiliary/matrix_col/align1/trans_unit_upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-//transposed lower triangular matrix
-__kernel void trans_unit_upper_triangular_substitute_inplace(
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-    
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[row + elim  * matrix_internal_rows]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_col/align1/trans_upper_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/trans_upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 1278da1..0000000
--- a/auxiliary/matrix_col/align1/trans_upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-//transposed lower triangular matrix
-__kernel void trans_upper_triangular_substitute_inplace(
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-    if (get_global_id(0) == 0) 
-      vector[row] /= matrix[row + row*matrix_internal_rows]; 
- 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_col/align1/trans_vec_mul.cl b/auxiliary/matrix_col/align1/trans_vec_mul.cl
deleted file mode 100644
index eeca1e8..0000000
--- a/auxiliary/matrix_col/align1/trans_vec_mul.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-__kernel void trans_vec_mul(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global const float * vector,  
-          __global float * result) 
-{ 
-  //row and col indicate indices within transposed matrix
-  for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))
-  {
-    float dot_prod2 = 0.0f;
-    for (unsigned int col = 0; col < matrix_rows; ++col)
-      dot_prod2 += matrix[row * matrix_internal_rows + col] * vector[col];
-    result[row] = dot_prod2;
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align1/unit_lower_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/unit_lower_triangular_substitute_inplace.cl
deleted file mode 100644
index d881866..0000000
--- a/auxiliary/matrix_col/align1/unit_lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void unit_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align1/unit_upper_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/unit_upper_triangular_substitute_inplace.cl
deleted file mode 100644
index f44cf13..0000000
--- a/auxiliary/matrix_col/align1/unit_upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-
-__kernel void unit_upper_triangular_substitute_inplace( 
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-    
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[elim + row  * matrix_internal_rows]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_col/align1/upper_triangular_substitute_inplace.cl b/auxiliary/matrix_col/align1/upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 3e371b8..0000000
--- a/auxiliary/matrix_col/align1/upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-
-__kernel void upper_triangular_substitute_inplace( 
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-    if (get_global_id(0) == 0) 
-      vector[row] /= matrix[row + row*matrix_internal_rows]; 
- 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[elim + row  * matrix_internal_rows]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_col/align1/vec_mul.cl b/auxiliary/matrix_col/align1/vec_mul.cl
deleted file mode 100644
index 470d325..0000000
--- a/auxiliary/matrix_col/align1/vec_mul.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-
-__kernel void vec_mul(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global const float * vector,  
-          __global float * result) 
-{ 
-  for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    for (unsigned int col = 0; col < matrix_cols; ++col)
-      dot_prod += matrix[row + col*matrix_internal_rows] * vector[col];
-    result[row] = dot_prod;
-  }
-}
-
-
diff --git a/auxiliary/matrix_col/align16/dummy b/auxiliary/matrix_col/align16/dummy
deleted file mode 100644
index 046cf28..0000000
--- a/auxiliary/matrix_col/align16/dummy
+++ /dev/null
@@ -1 +0,0 @@
-This is a dummy file for the versioning system
diff --git a/auxiliary/matrix_col/matrix.old_cl b/auxiliary/matrix_col/matrix.old_cl
deleted file mode 100644
index 46ee717..0000000
--- a/auxiliary/matrix_col/matrix.old_cl
+++ /dev/null
@@ -1,120 +0,0 @@
-//helper:
-void helper_float_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-};
-
-
-/////////////////////////// MATRIX OPERATIONS ///////////////////////////////
-
-
-
-
-
-////// solver kernels for lower triangular systems /////////////
-
-//transposed upper triangular matrix
-
-// __kernel void float_trans_matvec_mul_align1(
-//           __global const float * matrix, //matrix is not transposed in memory!
-//           __global const float * vector,  
-//           __global float * result,
-//           unsigned int row_length,
-//           unsigned int size) 
-// { 
-//   for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-//   {
-//     float dot_prod = 0.0f;
-//     for (unsigned int col = 0; col < row_length; ++col)
-//       dot_prod += matrix[row + col*size] * vector[col];
-//     result[row] = dot_prod;
-//   }
-// };
-
-
-
-
-
-
-
-
-
-//lu factorization of a matrix without pivoting:
-
-
-
-/*
-__kernel void float_matrix_lu_factorize(
-          __global float * matrix,
-          __local float * buffer,                              
-          unsigned int matrix_row_length,
-          unsigned int size) 
-{ 
-  float temp;
-  unsigned rowi;
-  unsigned rowk;
-  for (unsigned int i=1; i<size; ++i)
-  {
-    rowi = i * matrix_row_length;
-    
-    //first step: obtain a_ik from a triangular solution step:
-    for (unsigned int k=0; k<i; ++k)
-    {
-      rowk = k * matrix_row_length;
-      if (get_global_id(0) == 0)
-        matrix[rowi + k] = matrix[rowi + k] / matrix[rowk + k];
-      barrier(CLK_GLOBAL_MEM_FENCE);
-      
-      temp = matrix[rowi + k];
-      
-      for  (unsigned int j = k + 1 + get_global_id(0); j < i; j += get_global_size(0))
-        matrix[rowi + j] -= temp * matrix[rowk + j];
-    }
-
-
-    //second step: subtract block A(k,j) with k=0..i-1 and j=i+1...size-1
-    if (i < get_global_size(0))
-    {
-      //condense column down to matrix(i,j):
-      for (unsigned int j=i+get_global_id(0); j<size; j += get_global_size(0))
-      {
-        temp = 0.0;      
-        //subtraction of A(j, 0:i-1) from A(j,i):
-        for (unsigned int k=0; k<i; ++k)
-          temp += matrix[rowi + k] * matrix[k * matrix_row_length + j];
-        matrix[rowi + j] -= temp;
-      } 
-    }
-    else
-    {
-      //parallel columns:
-      for (unsigned int j=i; j<size; ++j)
-      {
-        temp = 0.0;
-        for (unsigned int k=0; k<= i / get_global_size(0); ++k)
-        {
-          rowk = k*get_global_size(0) + get_global_id(0); //reused as row index k in matrix
-          if (rowk < i)
-            buffer[get_global_id(0)] = matrix[rowi + rowk] * matrix[rowk * matrix_row_length + j];
-          else
-            buffer[get_global_id(0)] = 0.0;
-          helper_float_parallel_reduction(buffer);
-          if (get_global_id(0) == 0)
-            temp += buffer[0];
-        }
-        
-        if (get_global_id(0) == 0)
-          matrix[rowi + j] -= temp;
-      } //for j
-    } //if 
-    
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-} */
-
-
diff --git a/auxiliary/matrix_row/align1/add.cl b/auxiliary/matrix_row/align1/add.cl
deleted file mode 100644
index eae5ba5..0000000
--- a/auxiliary/matrix_row/align1/add.cl
+++ /dev/null
@@ -1,30 +0,0 @@
-
-__kernel void add(  // C = A + B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[(i + C_row_start) * C_internal_cols + j + C_col_start] =  A[(i + A_row_start) * A_internal_cols + j + A_col_start]
-                                                                  + B[(i + B_row_start) * B_internal_cols + j + B_col_start];
-}
-
diff --git a/auxiliary/matrix_row/align1/assign.cl b/auxiliary/matrix_row/align1/assign.cl
deleted file mode 100644
index e4dce74..0000000
--- a/auxiliary/matrix_row/align1/assign.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void assign( // A <- B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] = B[(i + B_row_start) * B_internal_cols + j + B_col_start];
-}
-
diff --git a/auxiliary/matrix_row/align1/clear.cl b/auxiliary/matrix_row/align1/clear.cl
deleted file mode 100644
index 89806f2..0000000
--- a/auxiliary/matrix_row/align1/clear.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void clear( // A <- 0
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] = 0;
-}
diff --git a/auxiliary/matrix_row/align1/cpu_inplace_mult.cl b/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
deleted file mode 100644
index 8785a53..0000000
--- a/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void cpu_inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] *= factor;
-}
diff --git a/auxiliary/matrix_row/align1/fft_direct.cl b/auxiliary/matrix_row/align1/fft_direct.cl
deleted file mode 100644
index 0642a64..0000000
--- a/auxiliary/matrix_row/align1/fft_direct.cl
+++ /dev/null
@@ -1,32 +0,0 @@
-// naive fourier transform (quadratic complexity, use for reference only)
-__kernel void fft_direct(__global float2* input,
-                         __global float2* output,
-                         unsigned int size,
-                         unsigned int stride,
-                         unsigned int batch_num,
-                         float sign) {
-
-//    unsigned int base_offset = 0;
-    const float NUM_PI = 3.14159265358979323846;
-    
-    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
-        for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {
-            float2 f = 0.0f;
-
-            for(unsigned int n = 0; n < size; n++) {
-                float2 in = input[batch_id * stride + n]; //input index here
-
-                float sn, cs;
-                float arg = sign * 2 * NUM_PI * k / size * n;
-                sn = sincos(arg, &cs);
-
-                float2 ex = (float2)(cs, sn);
-                f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);
-            }
-
-            output[batch_id * stride + k] = f;// output index here
-        }
-
-//        base_offset += stride;
-    }
-}
diff --git a/auxiliary/matrix_row/align1/fft_radix2.cl b/auxiliary/matrix_row/align1/fft_radix2.cl
deleted file mode 100644
index 348a54b..0000000
--- a/auxiliary/matrix_row/align1/fft_radix2.cl
+++ /dev/null
@@ -1,46 +0,0 @@
-__kernel void fft_radix2(__global float2* input,
-                         unsigned int s,
-                         unsigned int bit_size,
-                         unsigned int size,
-                         unsigned int stride,
-                         unsigned int batch_num,
-                         float sign) {
-
-    unsigned int ss = 1 << s;
-    unsigned int half_size = size >> 1;
-
-    float cs, sn;
-    const float NUM_PI = 3.14159265358979323846;
-
-    unsigned int glb_id = get_global_id(0);
-    unsigned int glb_sz = get_global_size(0);
-	
-//    unsigned int base_offset = 0;
-	
-    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
-        for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {
-            unsigned int group = (tid & (ss - 1));
-            unsigned int pos = ((tid >> s) << (s + 1)) + group;
-
-            unsigned int offset = batch_id * stride + pos;
-            float2 in1 = input[offset];//index
-            float2 in2 = input[offset + ss];//index
-
-            float arg = group * sign * NUM_PI / ss;
-
-            sn = sincos(arg, &cs);
-            //sn = native_sin(arg);
-            //cs = native_cos(arg);
-
-            float2 ex = (float2)(cs, sn);
-
-            float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);
-
-            input[offset + ss] = in1 - tmp;//index
-            input[offset] = in1 + tmp;//index
-        }
-
-//        base_offset += stride;
-    }
-}
-
diff --git a/auxiliary/matrix_row/align1/fft_radix2_local.cl b/auxiliary/matrix_row/align1/fft_radix2_local.cl
deleted file mode 100644
index 0dbe4e1..0000000
--- a/auxiliary/matrix_row/align1/fft_radix2_local.cl
+++ /dev/null
@@ -1,72 +0,0 @@
-
-unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    v = (v >> 16) | (v << 16);
-
-    v = v >> (32 - bit_size);
-
-    return v;
-}
-
-__kernel void fft_radix2_local(__global float2* input,
-                                __local float2* lcl_input,
-                                unsigned int bit_size,
-                                unsigned int size,
-                                unsigned int stride,
-                                unsigned int batch_num,
-                                float sign) {
-
-    unsigned int grp_id = get_group_id(0);
-    unsigned int grp_num = get_num_groups(0);
-
-    unsigned int lcl_sz = get_local_size(0);
-    unsigned int lcl_id = get_local_id(0);
-    const float NUM_PI = 3.14159265358979323846;
-
-    for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {
-        //unsigned int base_offset = stride * batch_id;
-        //copy chunk of global memory to local
-        for(unsigned int p = lcl_id; p < size; p += lcl_sz) {
-            unsigned int v = get_reorder_num(p, bit_size);
-            lcl_input[v] = input[batch_id * stride + p];//index
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-		
-        //performs Cooley-Tukey FFT on local array
-        for(unsigned int s = 0; s < bit_size; s++) {
-            unsigned int ss = 1 << s;
-
-            float cs, sn;
-
-            for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) {
-                unsigned int group = (tid & (ss - 1));
-                unsigned int pos = ((tid >> s) << (s + 1)) + group;
-
-                float2 in1 = lcl_input[pos];
-                float2 in2 = lcl_input[pos + ss];
-
-                float arg = group * sign * NUM_PI / ss;
-
-                sn = sincos(arg, &cs);
-                float2 ex = (float2)(cs, sn);
-
-                float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);
-
-                lcl_input[pos + ss] = in1 - tmp;
-                lcl_input[pos] = in1 + tmp;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-		
-        //copy local array back to global memory
-        for(unsigned int p = lcl_id; p < size; p += lcl_sz) {
-            input[batch_id * stride + p] = lcl_input[p];//index
-        }
-    }
-}
-
diff --git a/auxiliary/matrix_row/align1/fft_reorder.cl b/auxiliary/matrix_row/align1/fft_reorder.cl
deleted file mode 100644
index 0fafd0a..0000000
--- a/auxiliary/matrix_row/align1/fft_reorder.cl
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-* Performs reordering of input data in bit-reversal order
-* Probably it's better to do in host side,
-*/
-unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    v = (v >> 16) | (v << 16);
-
-    v = v >> (32 - bit_size);
-
-    return v;
-}
-
-__kernel void fft_reorder(__global float2* input,
-                          unsigned int bit_size,
-                          unsigned int size,
-                          unsigned int stride,
-                          int batch_num) {
-    //unsigned int base_offset = 0;
-
-    unsigned int glb_id = get_global_id(0);
-    unsigned int glb_sz = get_global_size(0);
-	
-    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
-        for(unsigned int i = glb_id; i < size; i += glb_sz) {
-            unsigned int v = get_reorder_num_2(i, bit_size);
-
-            if(i < v) {
-                float2 tmp = input[batch_id * stride + i]; // index
-                input[batch_id * stride + i] = input[batch_id * stride + v]; //index
-                input[batch_id * stride + v] = tmp; //index
-            }
-        }
-
-        //base_offset += stride;
-    }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/inplace_add.cl b/auxiliary/matrix_row/align1/inplace_add.cl
deleted file mode 100644
index 2f28b34..0000000
--- a/auxiliary/matrix_row/align1/inplace_add.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void inplace_add( // A += B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] += B[(i + B_row_start) * B_internal_cols + j + B_col_start];
-}
-
diff --git a/auxiliary/matrix_row/align1/inplace_divide.cl b/auxiliary/matrix_row/align1/inplace_divide.cl
deleted file mode 100644
index 42630d4..0000000
--- a/auxiliary/matrix_row/align1/inplace_divide.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void inplace_divide( // A /= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] /= factor;
-}
diff --git a/auxiliary/matrix_row/align1/inplace_mult.cl b/auxiliary/matrix_row/align1/inplace_mult.cl
deleted file mode 100644
index 9246841..0000000
--- a/auxiliary/matrix_row/align1/inplace_mult.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_mult( // A *= const
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] *= factor;
-}
-
-
diff --git a/auxiliary/matrix_row/align1/inplace_sub.cl b/auxiliary/matrix_row/align1/inplace_sub.cl
deleted file mode 100644
index 97760f9..0000000
--- a/auxiliary/matrix_row/align1/inplace_sub.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-__kernel void inplace_sub( // A -= B
-          __global float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,  
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols)
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      A[(i + A_row_start) * A_internal_cols + j + A_col_start] -= B[(i + B_row_start) * B_internal_cols + j + B_col_start];
-}
diff --git a/auxiliary/matrix_row/align1/lower_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/lower_triangular_substitute_inplace.cl
deleted file mode 100644
index a3d56f4..0000000
--- a/auxiliary/matrix_row/align1/lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-__kernel void lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*matrix_internal_cols];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/lu_factorize.cl b/auxiliary/matrix_row/align1/lu_factorize.cl
deleted file mode 100644
index cfde387..0000000
--- a/auxiliary/matrix_row/align1/lu_factorize.cl
+++ /dev/null
@@ -1,31 +0,0 @@
-
-__kernel void lu_factorize(
-          __global float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols) 
-{ 
-  float temp;
-  unsigned rowi;
-  unsigned rowk;
-  for (unsigned int i=1; i<matrix_rows; ++i)
-  {
-    rowi = i * matrix_internal_cols;
-    for (unsigned int k=0; k<i; ++k)
-    {
-      rowk = k * matrix_internal_cols;
-      if (get_global_id(0) == 0)
-        matrix[rowi + k] /= matrix[rowk + k];
-
-      barrier(CLK_GLOBAL_MEM_FENCE);
-      temp = matrix[rowi + k];
-      
-      //parallel subtraction:
-      for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))
-        matrix[rowi + j] -= temp * matrix[rowk + j];
-    }
-  }
-} 
-
-
diff --git a/auxiliary/matrix_row/align1/rank1_update.cl b/auxiliary/matrix_row/align1/rank1_update.cl
deleted file mode 100644
index f338fdf..0000000
--- a/auxiliary/matrix_row/align1/rank1_update.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-//perform a rank-1 update of the matrix, i.e. A += x * x^T
-__kernel void rank1_update(
-          __global float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global const float * vector1,  
-          __global const float * vector2) 
-{ 
-  float tmp;
-  unsigned int offset;
-
-  for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))
-  {
-    tmp = vector1[row];
-    offset = row*matrix_internal_cols;
-    for (unsigned int col = 0; col < matrix_cols; ++col)
-      matrix[offset+col] += tmp * vector2[col];
-  }
-}
-
diff --git a/auxiliary/matrix_row/align1/scaled_rank1_update.cl b/auxiliary/matrix_row/align1/scaled_rank1_update.cl
deleted file mode 100644
index 1dd39b8..0000000
--- a/auxiliary/matrix_row/align1/scaled_rank1_update.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-
-__kernel void scaled_rank1_update(
-          __global float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          float val,
-          __global const float * vector1,  
-          __global const float * vector2) 
-{ 
-  float tmp;
-  unsigned int offset;
-
-  for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))
-  {
-    tmp = val * vector1[row];
-    offset = row * matrix_internal_cols;
-    for (unsigned int col = 0; col < matrix_cols; ++col)
-      matrix[offset+col] += tmp * vector2[col];
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/sub.cl b/auxiliary/matrix_row/align1/sub.cl
deleted file mode 100644
index 5bd03a1..0000000
--- a/auxiliary/matrix_row/align1/sub.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-
-__kernel void sub(  // C = A - B
-          __global const float * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          __global const float * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          __global float * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols) 
-{ 
-  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
-    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
-      C[(i + C_row_start) * C_internal_cols + j + C_col_start] =  A[(i + A_row_start) * A_internal_cols + j + A_col_start]
-                                                                  - B[(i + B_row_start) * B_internal_cols + j + B_col_start];
-}
diff --git a/auxiliary/matrix_row/align1/trans_lower_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/trans_lower_triangular_substitute_inplace.cl
deleted file mode 100644
index 2eaebfd..0000000
--- a/auxiliary/matrix_row/align1/trans_lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-__kernel void trans_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-    if (get_global_id(0) == 0)
-      vector[row] /= matrix[row+row*matrix_internal_cols];
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/trans_unit_lower_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/trans_unit_lower_triangular_substitute_inplace.cl
deleted file mode 100644
index e977fff..0000000
--- a/auxiliary/matrix_row/align1/trans_unit_lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void trans_unit_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/trans_unit_upper_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/trans_unit_upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 99ed52c..0000000
--- a/auxiliary/matrix_row/align1/trans_unit_upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-//transposed lower triangular matrix
-__kernel void trans_unit_upper_triangular_substitute_inplace(
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_row/align1/trans_upper_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/trans_upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 21772dd..0000000
--- a/auxiliary/matrix_row/align1/trans_upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-//transposed lower triangular matrix
-__kernel void trans_upper_triangular_substitute_inplace(
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-    if (get_global_id(0) == 0) 
-      vector[row] /= matrix[row*matrix_internal_cols + row]; 
- 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_row/align1/trans_vec_mul.cl b/auxiliary/matrix_row/align1/trans_vec_mul.cl
deleted file mode 100644
index 86e5525..0000000
--- a/auxiliary/matrix_row/align1/trans_vec_mul.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-__kernel void trans_vec_mul(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global const float * vector,  
-          __global float * result) 
-{ 
-  //row and col indicate indices within transposed matrix
-  for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))
-  {
-    float dot_prod2 = 0.0f;
-    for (unsigned int col = 0; col < matrix_rows; ++col)
-      dot_prod2 += matrix[row + col*matrix_internal_cols] * vector[col];
-    result[row] = dot_prod2;
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/unit_lower_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/unit_lower_triangular_substitute_inplace.cl
deleted file mode 100644
index 240e4e0..0000000
--- a/auxiliary/matrix_row/align1/unit_lower_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-__kernel void unit_lower_triangular_substitute_inplace(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector)
-{
-  float temp;
-  for (int row = 0; row < matrix_rows; ++row)
-  {
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    temp = vector[row];
-
-    for  (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))
-      vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align1/unit_upper_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/unit_upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 13a3dd9..0000000
--- a/auxiliary/matrix_row/align1/unit_upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-
-
-__kernel void unit_upper_triangular_substitute_inplace( 
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_row/align1/upper_triangular_substitute_inplace.cl b/auxiliary/matrix_row/align1/upper_triangular_substitute_inplace.cl
deleted file mode 100644
index 8e9cc2a..0000000
--- a/auxiliary/matrix_row/align1/upper_triangular_substitute_inplace.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-
-
-__kernel void upper_triangular_substitute_inplace( 
-          __global const float * matrix, 
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global float * vector) 
-{ 
-  float temp; 
-  for (int row = matrix_rows-1; row > -1; --row) 
-  { 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-    if (get_global_id(0) == 0) 
-      vector[row] /= matrix[row*matrix_internal_cols + row]; 
- 
-    barrier(CLK_GLOBAL_MEM_FENCE); 
-
-    temp = vector[row]; 
-    //eliminate column with index 'row' in parallel: 
-    for  (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) 
-      vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; 
-  } 
-   
-}
-
diff --git a/auxiliary/matrix_row/align1/vec_mul.cl b/auxiliary/matrix_row/align1/vec_mul.cl
deleted file mode 100644
index 9897353..0000000
--- a/auxiliary/matrix_row/align1/vec_mul.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-
-__kernel void vec_mul(
-          __global const float * matrix,
-          unsigned int matrix_rows,
-          unsigned int matrix_cols,
-          unsigned int matrix_internal_rows,
-          unsigned int matrix_internal_cols,
-          __global const float * vector,  
-          __global float * result) 
-{ 
-  for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))
-  {
-    float dot_prod = 0.0f;
-    for (unsigned int col = 0; col < matrix_cols; ++col)
-      dot_prod += matrix[row*matrix_internal_cols + col] * vector[col];
-    result[row] = dot_prod;
-  }
-}
-
-
diff --git a/auxiliary/matrix_row/align16/dummy b/auxiliary/matrix_row/align16/dummy
deleted file mode 100644
index 046cf28..0000000
--- a/auxiliary/matrix_row/align16/dummy
+++ /dev/null
@@ -1 +0,0 @@
-This is a dummy file for the versioning system
diff --git a/auxiliary/matrix_row/matrix.old_cl b/auxiliary/matrix_row/matrix.old_cl
deleted file mode 100644
index 46ee717..0000000
--- a/auxiliary/matrix_row/matrix.old_cl
+++ /dev/null
@@ -1,120 +0,0 @@
-//helper:
-void helper_float_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-};
-
-
-/////////////////////////// MATRIX OPERATIONS ///////////////////////////////
-
-
-
-
-
-////// solver kernels for lower triangular systems /////////////
-
-//transposed upper triangular matrix
-
-// __kernel void float_trans_matvec_mul_align1(
-//           __global const float * matrix, //matrix is not transposed in memory!
-//           __global const float * vector,  
-//           __global float * result,
-//           unsigned int row_length,
-//           unsigned int size) 
-// { 
-//   for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))
-//   {
-//     float dot_prod = 0.0f;
-//     for (unsigned int col = 0; col < row_length; ++col)
-//       dot_prod += matrix[row + col*size] * vector[col];
-//     result[row] = dot_prod;
-//   }
-// };
-
-
-
-
-
-
-
-
-
-//lu factorization of a matrix without pivoting:
-
-
-
-/*
-__kernel void float_matrix_lu_factorize(
-          __global float * matrix,
-          __local float * buffer,                              
-          unsigned int matrix_row_length,
-          unsigned int size) 
-{ 
-  float temp;
-  unsigned rowi;
-  unsigned rowk;
-  for (unsigned int i=1; i<size; ++i)
-  {
-    rowi = i * matrix_row_length;
-    
-    //first step: obtain a_ik from a triangular solution step:
-    for (unsigned int k=0; k<i; ++k)
-    {
-      rowk = k * matrix_row_length;
-      if (get_global_id(0) == 0)
-        matrix[rowi + k] = matrix[rowi + k] / matrix[rowk + k];
-      barrier(CLK_GLOBAL_MEM_FENCE);
-      
-      temp = matrix[rowi + k];
-      
-      for  (unsigned int j = k + 1 + get_global_id(0); j < i; j += get_global_size(0))
-        matrix[rowi + j] -= temp * matrix[rowk + j];
-    }
-
-
-    //second step: subtract block A(k,j) with k=0..i-1 and j=i+1...size-1
-    if (i < get_global_size(0))
-    {
-      //condense column down to matrix(i,j):
-      for (unsigned int j=i+get_global_id(0); j<size; j += get_global_size(0))
-      {
-        temp = 0.0;      
-        //subtraction of A(j, 0:i-1) from A(j,i):
-        for (unsigned int k=0; k<i; ++k)
-          temp += matrix[rowi + k] * matrix[k * matrix_row_length + j];
-        matrix[rowi + j] -= temp;
-      } 
-    }
-    else
-    {
-      //parallel columns:
-      for (unsigned int j=i; j<size; ++j)
-      {
-        temp = 0.0;
-        for (unsigned int k=0; k<= i / get_global_size(0); ++k)
-        {
-          rowk = k*get_global_size(0) + get_global_id(0); //reused as row index k in matrix
-          if (rowk < i)
-            buffer[get_global_id(0)] = matrix[rowi + rowk] * matrix[rowk * matrix_row_length + j];
-          else
-            buffer[get_global_id(0)] = 0.0;
-          helper_float_parallel_reduction(buffer);
-          if (get_global_id(0) == 0)
-            temp += buffer[0];
-        }
-        
-        if (get_global_id(0) == 0)
-          matrix[rowi + j] -= temp;
-      } //for j
-    } //if 
-    
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-} */
-
-
diff --git a/auxiliary/scalar/align1/add.cl b/auxiliary/scalar/align1/add.cl
deleted file mode 100644
index dec4be6..0000000
--- a/auxiliary/scalar/align1/add.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void add(
-          __global const float * val1,
-          __global const float * val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 + *val2;
-}
- 
diff --git a/auxiliary/scalar/align1/cpu_add.cl b/auxiliary/scalar/align1/cpu_add.cl
deleted file mode 100644
index eaa9b74..0000000
--- a/auxiliary/scalar/align1/cpu_add.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void cpu_add(
-          __global const float * val1,
-          float val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 + val2;
-}
- 
diff --git a/auxiliary/scalar/align1/cpu_div.cl b/auxiliary/scalar/align1/cpu_div.cl
deleted file mode 100644
index 95ba8b0..0000000
--- a/auxiliary/scalar/align1/cpu_div.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void cpu_div(
-          __global const float * val1,
-          float val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 / val2;
-}
- 
diff --git a/auxiliary/scalar/align1/cpu_inplace_add.cl b/auxiliary/scalar/align1/cpu_inplace_add.cl
deleted file mode 100644
index 139dfd4..0000000
--- a/auxiliary/scalar/align1/cpu_inplace_add.cl
+++ /dev/null
@@ -1,9 +0,0 @@
- 
-__kernel void cpu_inplace_add(
-          __global float * val1,
-          float val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 += val2;
-}
- 
diff --git a/auxiliary/scalar/align1/cpu_inplace_div.cl b/auxiliary/scalar/align1/cpu_inplace_div.cl
deleted file mode 100644
index 61f2222..0000000
--- a/auxiliary/scalar/align1/cpu_inplace_div.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void cpu_inplace_div(
-          __global float * val1,
-          float val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 /= val2;
-}
-
- 
diff --git a/auxiliary/scalar/align1/cpu_inplace_mul.cl b/auxiliary/scalar/align1/cpu_inplace_mul.cl
deleted file mode 100644
index fdc2322..0000000
--- a/auxiliary/scalar/align1/cpu_inplace_mul.cl
+++ /dev/null
@@ -1,9 +0,0 @@
- 
-__kernel void cpu_inplace_mul(
-          __global float * val1,
-          float val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 *= val2;
-}
- 
diff --git a/auxiliary/scalar/align1/cpu_inplace_sub.cl b/auxiliary/scalar/align1/cpu_inplace_sub.cl
deleted file mode 100644
index dc4dabe..0000000
--- a/auxiliary/scalar/align1/cpu_inplace_sub.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void cpu_inplace_sub(
-          __global float * val1,
-          float val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 -= val2;
-}
- 
-
diff --git a/auxiliary/scalar/align1/cpu_mul.cl b/auxiliary/scalar/align1/cpu_mul.cl
deleted file mode 100644
index f87969d..0000000
--- a/auxiliary/scalar/align1/cpu_mul.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void cpu_mul(
-          __global const float * val1,
-          float val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 * val2;
-}
- 
diff --git a/auxiliary/scalar/align1/cpu_sub.cl b/auxiliary/scalar/align1/cpu_sub.cl
deleted file mode 100644
index dbb4d60..0000000
--- a/auxiliary/scalar/align1/cpu_sub.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void cpu_sub(
-          __global const float * val1,
-          float val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 - val2;
-}
- 
diff --git a/auxiliary/scalar/align1/divide.cl b/auxiliary/scalar/align1/divide.cl
deleted file mode 100644
index bb68deb..0000000
--- a/auxiliary/scalar/align1/divide.cl
+++ /dev/null
@@ -1,12 +0,0 @@
- 
-// note: 'div' seems to produce some name clashes with the OpenCL jit-compiler, thus using 'divide'
-__kernel void divide(
-          __global const float * val1,
-          __global const float * val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 / *val2;
-}
-
- 
diff --git a/auxiliary/scalar/align1/inplace_add.cl b/auxiliary/scalar/align1/inplace_add.cl
deleted file mode 100644
index 672303a..0000000
--- a/auxiliary/scalar/align1/inplace_add.cl
+++ /dev/null
@@ -1,9 +0,0 @@
- 
-__kernel void inplace_add(
-          __global float * val1,
-          __global const float * val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 += *val2;
-}
- 
diff --git a/auxiliary/scalar/align1/inplace_div.cl b/auxiliary/scalar/align1/inplace_div.cl
deleted file mode 100644
index 02646ef..0000000
--- a/auxiliary/scalar/align1/inplace_div.cl
+++ /dev/null
@@ -1,9 +0,0 @@
- 
-__kernel void inplace_div(
-          __global float * val1,
-          __global const float * val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 /= *val2;
-}
- 
diff --git a/auxiliary/scalar/align1/inplace_mul.cl b/auxiliary/scalar/align1/inplace_mul.cl
deleted file mode 100644
index 26b259f..0000000
--- a/auxiliary/scalar/align1/inplace_mul.cl
+++ /dev/null
@@ -1,9 +0,0 @@
- 
-__kernel void inplace_mul(
-          __global float * val1,
-          __global const float * val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 *= *val2;
-}
- 
diff --git a/auxiliary/scalar/align1/inplace_sub.cl b/auxiliary/scalar/align1/inplace_sub.cl
deleted file mode 100644
index 4347f9f..0000000
--- a/auxiliary/scalar/align1/inplace_sub.cl
+++ /dev/null
@@ -1,9 +0,0 @@
- 
-__kernel void inplace_sub(
-          __global float * val1,
-          __global const float * val2) 
-{ 
-  if (get_global_id(0) == 0)
-    *val1 -= *val2;
-}
- 
diff --git a/auxiliary/scalar/align1/mul.cl b/auxiliary/scalar/align1/mul.cl
deleted file mode 100644
index d3cf904..0000000
--- a/auxiliary/scalar/align1/mul.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void mul(
-          __global const float * val1,
-          __global const float * val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 * *val2;
-}
- 
diff --git a/auxiliary/scalar/align1/sub.cl b/auxiliary/scalar/align1/sub.cl
deleted file mode 100644
index 145b9ee..0000000
--- a/auxiliary/scalar/align1/sub.cl
+++ /dev/null
@@ -1,10 +0,0 @@
- 
-__kernel void sub(
-          __global const float * val1,
-          __global const float * val2, 
-          __global float * result) 
-{ 
-  if (get_global_id(0) == 0)
-    *result = *val1 - *val2;
-}
- 
diff --git a/auxiliary/spai/align1/assemble_blocks.cl b/auxiliary/spai/align1/assemble_blocks.cl
deleted file mode 100644
index cdf6b92..0000000
--- a/auxiliary/spai/align1/assemble_blocks.cl
+++ /dev/null
@@ -1,60 +0,0 @@
-
-float get_element(__global const unsigned int * row_indices,
-					 __global const unsigned int * column_indices,
-					 __global const float * elements,
-					 unsigned int row,
-					 unsigned int col
-					 )
-{
-	unsigned int row_end = row_indices[row+1];
-	for(unsigned int i = row_indices[row]; i < row_end; ++i){
-		if(column_indices[i] == col)
-			return elements[i];
-		if(column_indices[i] > col)
-			return 0.0;
-	}
-	return 0.0;						
-}
-
-void block_assembly(__global const unsigned int * row_indices,
-					__global const unsigned int * column_indices, 
-					__global const float * elements,
-					__global const unsigned int * matrix_dimensions,
-					__global const unsigned int * set_I,
-					__global const unsigned int * set_J, 
-					unsigned int matrix_ind,
-					__global float * com_A_I_J)
-{
-	unsigned int row_n = matrix_dimensions[2*matrix_ind];
-	unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];
-	
-	for(unsigned int i = 0; i < col_n; ++i){
-				//start row index
-				for(unsigned int j = 0; j < row_n; j++){
-					com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]);
-				}
-			}
-						
-}
-
-__kernel void assemble_blocks(
-          __global const unsigned int * row_indices,
-          __global const unsigned int * column_indices, 
-          __global const float * elements,
-          __global const unsigned int * set_I,
-  		  __global const unsigned int * set_J,
- 		  __global const unsigned int * i_ind,
-		  __global const unsigned int * j_ind,
-	      __global const unsigned int * block_ind,
-	      __global const unsigned int * matrix_dimensions,
-		  __global float * com_A_I_J,
-		  __global unsigned int * g_is_update,
-                   unsigned int  block_elems_num) 
-{ 
-  	for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){
-        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-			
-            block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]);
-        }
-    }
-}
\ No newline at end of file
diff --git a/auxiliary/spai/align1/block_bv_assembly.cl b/auxiliary/spai/align1/block_bv_assembly.cl
deleted file mode 100644
index 251671d..0000000
--- a/auxiliary/spai/align1/block_bv_assembly.cl
+++ /dev/null
@@ -1,33 +0,0 @@
-void assemble_bv(__global float * g_bv_r, __global float * g_bv, unsigned int col_n){
-	for(unsigned int i = 0; i < col_n; ++i){
-		g_bv_r[i] = g_bv[ i];
-	}
-}
-
-void assemble_bv_block(__global float * g_bv_r, __global float * g_bv, unsigned int col_n,
- 					   __global float * g_bv_u, unsigned int col_n_u)
-{
-	assemble_bv(g_bv_r, g_bv, col_n);
-	assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u);
-						
-}
-
-__kernel void block_bv_assembly(__global float * g_bv,
-						__global unsigned int * start_bv_ind,
-						__global unsigned int * matrix_dimensions,
-						__global float * g_bv_u,
-						__global unsigned int * start_bv_u_ind,
-						__global unsigned int * matrix_dimensions_u,
-						__global float * g_bv_r,
-						__global unsigned int * start_bv_r_ind,
-						__global unsigned int * matrix_dimensions_r,
-						__global unsigned int * g_is_update,
-						//__local  float * local_gb,
-						unsigned int  block_elems_num)
-{ 
-	for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){
-		if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-			assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]);
-		}
-	}
-}
\ No newline at end of file
diff --git a/auxiliary/spai/align1/block_least_squares.cl b/auxiliary/spai/align1/block_least_squares.cl
deleted file mode 100644
index 326e0c7..0000000
--- a/auxiliary/spai/align1/block_least_squares.cl
+++ /dev/null
@@ -1,68 +0,0 @@
-
-void custom_dot_prod_ls(__global float * A, unsigned int row_n, __global float * v, unsigned int ind, float *res){
-            *res = 0.0;
-            for(unsigned int j = ind; j < row_n; ++j){
-                if(j == ind){
-                    *res += v[ j];
-                }else{
-                    *res += A[ j + ind*row_n]*v[ j];
-                }
-            }
-        }
-
-void backwardSolve(__global float * R,  unsigned int row_n, unsigned int col_n, __global float * y, __global float * x){
-	for (int i = col_n-1; i >= 0 ; i--) {
-		x[ i] = y[ i];
-		for (int j = i+1; j < col_n; ++j) {
-			x[ i] -= R[ i + j*row_n]*x[ j];
-		}
-		x[i] /= R[ i + i*row_n];
-	}
-	
-}
-
-		
-void apply_q_trans_vec_ls(__global float * R, unsigned int row_n, unsigned int col_n, __global const float * b_v,  __global float * y){
-            float inn_prod = 0;
-            for(unsigned int i = 0; i < col_n; ++i){
-                custom_dot_prod_ls(R, row_n, y, i, &inn_prod);
-                for(unsigned int j = i; j < row_n; ++j){
-                    if(i == j){
-                        y[ j] -= b_v[ i]*inn_prod;
-                    }
-                    else{
-                        y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n];
-                    }
-                }
-                //std::cout<<y<<std::endl;
-            }
-        }
-
-void ls(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __global float * m_v, __global float * y_v){
-	
-	apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v);
-	//m_new - is m_v now
-	backwardSolve(R, row_n, col_n, y_v, m_v);
-}
-
-__kernel void block_least_squares(
-          __global float * global_R,
-		  __global unsigned int * block_ind,
-          __global float * b_v,
-	      __global unsigned int * start_bv_inds,
-		  __global float * m_v,
-		  __global float * y_v,
-		  __global unsigned int * start_y_inds,
-	      __global unsigned int * matrix_dimensions,
-	      __global unsigned int * g_is_update,
-          //__local  float * local_R,
-            unsigned int  block_elems_num) 
-{ 
-  	for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){
-        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-			
-            ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] );
-			
-        }
-    }
-}
diff --git a/auxiliary/spai/align1/block_q_mult.cl b/auxiliary/spai/align1/block_q_mult.cl
deleted file mode 100644
index 0810155..0000000
--- a/auxiliary/spai/align1/block_q_mult.cl
+++ /dev/null
@@ -1,74 +0,0 @@
-
-void custom_dot_prod(__global float * A, unsigned int row_n, __local float * v, unsigned int ind, float *res){
-            *res = 0.0;
-            for(unsigned int j = ind; j < row_n; ++j){
-                if(j == ind){
-                    *res += v[j];
-                }else{
-                    *res += A[j + ind*row_n]*v[j];
-                }
-            }
-        }
-
-void apply_q_trans_vec(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * y){
-            float inn_prod = 0;
-            for(unsigned int i = 0; i < col_n; ++i){
-                custom_dot_prod(R, row_n, y, i, &inn_prod);
-                for(unsigned int j = i; j < row_n; ++j){
-                    if(i == j){
-                        y[j] -= b_v[ i]*inn_prod;
-                    }
-                    else{
-                        y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n];
-                    }
-                }
-            }
-        }
-
-void q_mult(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * R_u, unsigned int col_n_u){
-				for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){
-					apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i);
-				}				
-}
-
-void matrix_from_global_to_local(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){
-	for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){
-		for(unsigned int j = 0; j < row_n; ++j){
-			l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];
-		}
-	}
-}
-
-void matrix_from_local_to_global(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){
-	for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){
-		for(unsigned int j = 0; j < row_n; ++j){
-			g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];
-		}
-	}
-}
-
-
-
-__kernel void block_q_mult(__global float * global_R,
-  __global unsigned int * block_ind,
-  __global float * global_R_u,
-  __global unsigned int *block_ind_u,
-  __global float * b_v,
-  __global unsigned int * start_bv_inds,
-  __global unsigned int * matrix_dimensions,
-  __global unsigned int * matrix_dimensions_u,
-  __global unsigned int * g_is_update,
-  __local  float * local_R_u,
-    unsigned int  block_elems_num){
-		for(unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){
-	        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){
-				//matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);
-				matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]);
-				barrier(CLK_LOCAL_MEM_FENCE);
-	            q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, 
-	 				   matrix_dimensions_u[2*i + 1]);
-				barrier(CLK_LOCAL_MEM_FENCE);
-	            matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]);
-	        }
-	    }
-}
\ No newline at end of file
diff --git a/auxiliary/spai/align1/block_qr.cl b/auxiliary/spai/align1/block_qr.cl
deleted file mode 100644
index c4b39b1..0000000
--- a/auxiliary/spai/align1/block_qr.cl
+++ /dev/null
@@ -1,130 +0,0 @@
-void dot_prod(__local const float* A, unsigned int n, unsigned int beg_ind, float* res){
-    *res = 0;
-    for(unsigned int i = beg_ind; i < n; ++i){
-        *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i];
-    }
-}
- 
-void vector_div(__global float* v, unsigned int beg_ind, float b, unsigned int n){
-    for(unsigned int i = beg_ind; i < n; ++i){
-        v[i] /= b;
-    }
-}
-
-void copy_vector(__local const float* A, __global float* v, const unsigned int beg_ind, const unsigned int n){
-    for(unsigned int i = beg_ind; i < n; ++i){
-        v[i] = A[(beg_ind-1)*n + i];
-    }
-}
- 
- 
-void householder_vector(__local const float* A, unsigned int j, unsigned int n, __global float* v, __global float* b){
-    float sg;
-    dot_prod(A, n, j+1, &sg); 
-    copy_vector(A, v, j+1, n);
-    float mu;
-    v[j] = 1.0;
-    //print_contigious_vector(v, v_start_ind, n);
-    if(sg == 0){
-        *b = 0;
-    }
-    else{
-        mu = sqrt(A[j*n + j]*A[ j*n + j] + sg);
-        if(A[ j*n + j] <= 0){
-            v[j] = A[ j*n + j] - mu;
-        }else{
-            v[j] = -sg/(A[ j*n + j] + mu);
-        }
-		*b = 2*(v[j]*v[j])/(sg + v[j]*v[j]);
-        //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));
-        vector_div(v, j, v[j], n);
-        //print_contigious_vector(v, v_start_ind, n);
-    }
-}
-
-void custom_inner_prod(__local const float* A, __global float* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, float* res){
-    for(unsigned int i = start_ind; i < row_num; ++i){
-        *res += A[col_ind*row_num + i]*v[i];  
-    }
-}
-// 
-void apply_householder_reflection(__local float* A,  unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global float* v, float b){
-    float in_prod_res;
-    for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){
-        in_prod_res = 0.0;
-        custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res);
-        for(unsigned int j = iter_cnt; j < row_n; ++j){
-            A[ i*row_n + j] -= b*in_prod_res* v[j];
-        }
-    }
-    
-}
-
-void store_householder_vector(__local float* A,  unsigned int ind, unsigned int n, __global float* v){
-    for(unsigned int i = ind; i < n; ++i){
-        A[ (ind-1)*n + i] = v[i];
-    }
-}
-
-void single_qr( __local float* R, __global unsigned int* matrix_dimensions, __global float* b_v, __global float* v, unsigned int matrix_ind){
-    				//matrix_dimensions[0] - number of rows
-       				//matrix_dimensions[1] - number of columns
-	unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];
-	unsigned int row_n = matrix_dimensions[2*matrix_ind];
-	
-	if((col_n == row_n)&&(row_n == 1)){
-		b_v[0] = 0.0;
-	    return;
-	}
-       for(unsigned int i = 0; i < col_n; ++i){
-				if(get_local_id(0) == 0){
-               		householder_vector(R, i, row_n, v, b_v + i);
-				}
-				barrier(CLK_LOCAL_MEM_FENCE);
-               	apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]);
-                barrier(CLK_LOCAL_MEM_FENCE);
-				if(get_local_id(0) == 0){
-               		if(i < matrix_dimensions[2*matrix_ind]){
-                   		store_householder_vector(R, i+1, row_n, v);
-               		}
-				}
-           }
-}
-
-void matrix_from_global_to_local_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){
-	for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){
-		for(unsigned int j = 0; j < row_n; ++j){
-			l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];
-		}
-	}
-}
-void matrix_from_local_to_global_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){
-	for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){
-		for(unsigned int j = 0; j < row_n; ++j){
-			g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];
-		}
-	}
-}
-
-
-__kernel void block_qr(
-			__global float* R, 	 
-			__global unsigned int* matrix_dimensions, 
-			__global float* b_v, 
-			__global float* v, 
-			__global unsigned int* start_matrix_inds, 
-			__global unsigned int* start_bv_inds, 
-			__global unsigned int* start_v_inds,
-			__global unsigned int * g_is_update,  
-			__local float* local_buff_R,
-			unsigned int block_elems_num){
-    for(unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){
-        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-			matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);
-			barrier(CLK_LOCAL_MEM_FENCE);
-            single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i);
-			barrier(CLK_LOCAL_MEM_FENCE);
-            matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);
-        }
-    }
-}
diff --git a/auxiliary/spai/align1/block_qr_assembly.cl b/auxiliary/spai/align1/block_qr_assembly.cl
deleted file mode 100644
index a702f37..0000000
--- a/auxiliary/spai/align1/block_qr_assembly.cl
+++ /dev/null
@@ -1,57 +0,0 @@
-void assemble_upper_part(__global float * R_q,
- 						unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, 
-						unsigned int row_n_u, unsigned int col_n_u,
-						unsigned int col_n, unsigned int diff){
-            for(unsigned int i = 0; i < col_n_q; ++i){
-                for(unsigned int j = 0; j < diff; ++j){
-					R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ];
-                }
-            }
-        }
-
-
-void assemble_lower_part(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u_u, 
-						 unsigned int row_n_u_u, unsigned int col_n_u_u, 
-						 unsigned int diff){
-	for(unsigned int i = 0; i < col_n_u_u; ++i){
-		for(unsigned int j = 0; j < row_n_u_u; ++j){
-			R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j];
-		}
-	}	
-}
-
-
-
-void assemble_qr_block(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,
-						unsigned int col_n_u, __global float * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){
-						unsigned int diff = row_n_u - col_n;
-						assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);
-						if(diff > 0){
-							assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff);
-						}
-}
-
-__kernel void block_qr_assembly(
-          __global unsigned int * matrix_dimensions,
-	      __global float * R_u,
-		  __global unsigned int * block_ind_u,
-		  __global unsigned int * matrix_dimensions_u,
-		  __global float * R_u_u,
-	      __global unsigned int * block_ind_u_u,
-		  __global unsigned int * matrix_dimensions_u_u,
-		  __global float * R_q,
-		  __global unsigned int * block_ind_q,
-		  __global unsigned int * matrix_dimensions_q,
-		  __global unsigned int * g_is_update,
-          //__local  float * local_R_q,
-            unsigned int  block_elems_num) 
-{ 
-  	for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){
-        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-			//
-            assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], 
-							matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]);
-
-        }
-    }
-}
diff --git a/auxiliary/spai/align1/block_qr_assembly_1.cl b/auxiliary/spai/align1/block_qr_assembly_1.cl
deleted file mode 100644
index e3b9e1e..0000000
--- a/auxiliary/spai/align1/block_qr_assembly_1.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-void assemble_upper_part_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, 
-						 unsigned int row_n_u, unsigned int col_n_u,
-						 unsigned int col_n, unsigned int diff){
-            for(unsigned int i = 0; i < col_n_q; ++i){
-                for(unsigned int j = 0; j < diff; ++j){
-					R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ];
-                }
-            }
-        }
-
-
-void assemble_qr_block_1(__global float * R_q,  unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,
-						unsigned int col_n_u, unsigned int col_n){
-						unsigned int diff = row_n_u - col_n;
-						assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);
-}
-
-__kernel void block_qr_assembly_1(
-          __global unsigned int * matrix_dimensions,
-	      __global float * R_u,
-		  __global unsigned int * block_ind_u,
-		  __global unsigned int * matrix_dimensions_u,
-		  __global float * R_q,
-		  __global unsigned int * block_ind_q,
-		  __global unsigned int * matrix_dimensions_q,
-		  __global unsigned int * g_is_update,
-          //__local  float * local_R_q,
-            unsigned int  block_elems_num) 
-{ 
-  	for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){
-        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-            assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], 
-							matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]);
-        }
-    }
-}
diff --git a/auxiliary/spai/align1/block_r_assembly.cl b/auxiliary/spai/align1/block_r_assembly.cl
deleted file mode 100644
index d00fc35..0000000
--- a/auxiliary/spai/align1/block_r_assembly.cl
+++ /dev/null
@@ -1,68 +0,0 @@
-void assemble_r(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, 
-				unsigned int row_n, unsigned int col_n)
-{
-  for(unsigned int i = 0; i < col_n; ++i){
-     for(unsigned int j = 0; j < row_n; ++j){
-		gR[i*row_n_r + j] = R[i*row_n + j ];
-     }
-  }
-}
-
-void assemble_r_u(__global float * gR,
- 				  unsigned int row_n_r, unsigned int col_n_r, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, 
-				  unsigned int col_n)
-{
-	for(unsigned int i = 0; i < col_n_u; ++i){
-		for(unsigned int j = 0; j < col_n; ++j){
-			gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j];
-		}
-	}				
-}
-
-
-void assemble_r_u_u(__global float * gR,  unsigned int row_n_r, unsigned int col_n_r, __global float * R_u_u, unsigned int row_n_u_u, 
-					unsigned int col_n_u_u, unsigned int col_n)
-{
-	for(unsigned int i = 0; i < col_n_u_u; ++i){
-		for(unsigned int j = 0; j < row_n_u_u; ++j){
-			gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j];
-		}
-	}					
-}
-
-void assemble_r_block(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, unsigned int row_n, 
-				unsigned int col_n, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, __global float * R_u_u, 
-				unsigned int row_n_u_u, unsigned int col_n_u_u){
-				assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n);				
-				assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n);
-				assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n);
-}
-
-
-__kernel void block_r_assembly(
-          					__global float * R,
-	      					__global unsigned int * block_ind,
-		  					__global unsigned int * matrix_dimensions,
-		  					__global float * R_u,
-	      					__global unsigned int * block_ind_u,
-		  					__global unsigned int * matrix_dimensions_u,
-		  					__global float * R_u_u,
-		  					__global unsigned int * block_ind_u_u,
-		  					__global unsigned int * matrix_dimensions_u_u,
-		  					__global float * g_R,
-		  					__global unsigned int * block_ind_r,
-		  					__global unsigned int * matrix_dimensions_r,
-						    __global unsigned int * g_is_update,
-          					//__local  float * local_gR,
-            				unsigned int  block_elems_num) 
-{ 
-  	for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){
-        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){
-			
-            assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], 
-							matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1],
-							R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]);
-			
-        }
-    }
-}
\ No newline at end of file
diff --git a/auxiliary/vector/align1/add.cl b/auxiliary/vector/align1/add.cl
deleted file mode 100644
index 0ab59cf..0000000
--- a/auxiliary/vector/align1/add.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void add(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] + vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align1/clear.cl b/auxiliary/vector/align1/clear.cl
deleted file mode 100644
index 78d488d..0000000
--- a/auxiliary/vector/align1/clear.cl
+++ /dev/null
@@ -1,10 +0,0 @@
-
-__kernel void clear(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int size1) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i+start1] = 0;
-}
-
diff --git a/auxiliary/vector/align1/cpu_inplace_mul_add.cl b/auxiliary/vector/align1/cpu_inplace_mul_add.cl
deleted file mode 100644
index d020712..0000000
--- a/auxiliary/vector/align1/cpu_inplace_mul_add.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void cpu_inplace_mul_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] += vec2[i+start2] * factor;
-}
-
diff --git a/auxiliary/vector/align1/cpu_inplace_mult.cl b/auxiliary/vector/align1/cpu_inplace_mult.cl
deleted file mode 100644
index 9c4d3ff..0000000
--- a/auxiliary/vector/align1/cpu_inplace_mult.cl
+++ /dev/null
@@ -1,11 +0,0 @@
-
-__kernel void cpu_inplace_mult(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          float factor) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i+start1] *= factor;
-}
-
diff --git a/auxiliary/vector/align1/cpu_mul_add.cl b/auxiliary/vector/align1/cpu_mul_add.cl
deleted file mode 100644
index bab4419..0000000
--- a/auxiliary/vector/align1/cpu_mul_add.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void cpu_mul_add(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          float factor,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3
-          ) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align1/cpu_mult.cl b/auxiliary/vector/align1/cpu_mult.cl
deleted file mode 100644
index 478cabc..0000000
--- a/auxiliary/vector/align1/cpu_mult.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void cpu_mult(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          float factor, 
-          __global float * result,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start2] = vec[i+start1] * factor;
-}
-
-
diff --git a/auxiliary/vector/align1/diag_precond.cl b/auxiliary/vector/align1/diag_precond.cl
deleted file mode 100644
index c9e30d0..0000000
--- a/auxiliary/vector/align1/diag_precond.cl
+++ /dev/null
@@ -1,12 +0,0 @@
-
-__kernel void diag_precond(
-          __global const float * diag_A_inv, 
-          unsigned int start1,
-          unsigned int size1,
-          __global float * x, 
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    x[i+start2] *= diag_A_inv[i+start1];
-}
diff --git a/auxiliary/vector/align1/divide.cl b/auxiliary/vector/align1/divide.cl
deleted file mode 100644
index 2782dc6..0000000
--- a/auxiliary/vector/align1/divide.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-// Note: name 'div' is not allowed by the jit-compiler
-__kernel void divide(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac,  //note: CPU variant is mapped to prod_scalar
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3)  
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec[i+start1] / factor;
-}
-
diff --git a/auxiliary/vector/align1/index_norm_inf.cl b/auxiliary/vector/align1/index_norm_inf.cl
deleted file mode 100644
index 32590d1..0000000
--- a/auxiliary/vector/align1/index_norm_inf.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-//index_norm_inf:
-unsigned int float_vector1_index_norm_inf_impl(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __local float * float_buffer,
-          __local unsigned int * index_buffer)
-{
-  //step 1: fill buffer:
-  float cur_max = 0.0f;
-  float tmp;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-  {
-    tmp = fabs(vec[i+start1]);
-    if (cur_max < tmp)
-    {
-      float_buffer[get_global_id(0)] = tmp;
-      index_buffer[get_global_id(0)] = i;
-      cur_max = tmp;
-    }
-  }
-  
-  //step 2: parallel reduction:
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-    {
-      //find the first occurring index
-      if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])
-      {
-        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];
-        float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];
-      }
-      
-      //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];
-      //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);
-    }
-  }
-  
-  return index_buffer[0];
-}
-
-__kernel void index_norm_inf(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __local float * float_buffer,
-          __local unsigned int * index_buffer,
-          global unsigned int * result) 
-{ 
-  unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);
-  if (get_global_id(0) == 0) *result = tmp;
-}
-
-
diff --git a/auxiliary/vector/align1/inner_prod.cl b/auxiliary/vector/align1/inner_prod.cl
deleted file mode 100644
index 5f5628e..0000000
--- a/auxiliary/vector/align1/inner_prod.cl
+++ /dev/null
@@ -1,56 +0,0 @@
-
-//helper:
-void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(0) < stride)
-      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];
-  }
-}
-
-//////// inner products:
-float impl_inner_prod(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))
-    tmp += vec1[i+start1] * vec2[i+start2];
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_inner_prod_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-}
-
-
-__kernel void inner_prod(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_inner_prod(vec1,
-                              (      get_group_id(0) * size1) / get_num_groups(0) + start1,
-                              ((get_group_id(0) + 1) * size1) / get_num_groups(0) - (      get_group_id(0) * size1) / get_num_groups(0),
-                              vec2,
-                              (      get_group_id(0) * size2) / get_num_groups(0) + start2,
-                              ((get_group_id(0) + 1) * size2) / get_num_groups(0) - (      get_group_id(0) * size2) / get_num_groups(0),
-                              tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;
-  
-}
-
diff --git a/auxiliary/vector/align1/inplace_add.cl b/auxiliary/vector/align1/inplace_add.cl
deleted file mode 100644
index ab7c23d..0000000
--- a/auxiliary/vector/align1/inplace_add.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void inplace_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] += vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align1/inplace_div_add.cl b/auxiliary/vector/align1/inplace_div_add.cl
deleted file mode 100644
index 77820cd..0000000
--- a/auxiliary/vector/align1/inplace_div_add.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-///// divide add:
-__kernel void inplace_div_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2] / factor;
-}
\ No newline at end of file
diff --git a/auxiliary/vector/align1/inplace_div_sub.cl b/auxiliary/vector/align1/inplace_div_sub.cl
deleted file mode 100644
index 0f2a10f..0000000
--- a/auxiliary/vector/align1/inplace_div_sub.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-///// divide substract:
-__kernel void inplace_div_sub(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2] / factor;
-}
-
diff --git a/auxiliary/vector/align1/inplace_divide.cl b/auxiliary/vector/align1/inplace_divide.cl
deleted file mode 100644
index 5c935f1..0000000
--- a/auxiliary/vector/align1/inplace_divide.cl
+++ /dev/null
@@ -1,12 +0,0 @@
-
-__kernel void inplace_divide(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac)  //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i+start1] /= factor;
-}
-
diff --git a/auxiliary/vector/align1/inplace_mul_add.cl b/auxiliary/vector/align1/inplace_mul_add.cl
deleted file mode 100644
index 544f5fc..0000000
--- a/auxiliary/vector/align1/inplace_mul_add.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void inplace_mul_add(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] += vec2[i+start2] * factor;
-}
-
-
diff --git a/auxiliary/vector/align1/inplace_mul_sub.cl b/auxiliary/vector/align1/inplace_mul_sub.cl
deleted file mode 100644
index f0c05b2..0000000
--- a/auxiliary/vector/align1/inplace_mul_sub.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void inplace_mul_sub(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2] * factor;
-}
-
-
diff --git a/auxiliary/vector/align1/inplace_mult.cl b/auxiliary/vector/align1/inplace_mult.cl
deleted file mode 100644
index f8a8210..0000000
--- a/auxiliary/vector/align1/inplace_mult.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void inplace_mult(
-          __global float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec[i+start1] *= factor;
-}
-
-
diff --git a/auxiliary/vector/align1/inplace_sub.cl b/auxiliary/vector/align1/inplace_sub.cl
deleted file mode 100644
index 50bffd2..0000000
--- a/auxiliary/vector/align1/inplace_sub.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void inplace_sub(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align1/mul_add.cl b/auxiliary/vector/align1/mul_add.cl
deleted file mode 100644
index 72cc286..0000000
--- a/auxiliary/vector/align1/mul_add.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-__kernel void mul_add(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3
-          ) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];
-}
-
-
diff --git a/auxiliary/vector/align1/mul_sub.cl b/auxiliary/vector/align1/mul_sub.cl
deleted file mode 100644
index 860e867..0000000
--- a/auxiliary/vector/align1/mul_sub.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-///// multiply subtract:
-__kernel void mul_sub(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac,
-          __global const float * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3
-          ) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align1/mult.cl b/auxiliary/vector/align1/mult.cl
deleted file mode 100644
index 1915983..0000000
--- a/auxiliary/vector/align1/mult.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void mult(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac, 
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3) 
-{ 
-  float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec[i+start1] * factor;
-}
-
diff --git a/auxiliary/vector/align1/norm_1.cl b/auxiliary/vector/align1/norm_1.cl
deleted file mode 100644
index 006ab3a..0000000
--- a/auxiliary/vector/align1/norm_1.cl
+++ /dev/null
@@ -1,45 +0,0 @@
-//helper:
-void helper_norm1_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-}
-
-////// norm_1
-float impl_norm_1(
-          __global const float * vec,
-          unsigned int start_index,
-          unsigned int end_index,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))
-    tmp += fabs(vec[i]);
-  
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_norm1_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-};
-
-__kernel void norm_1(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_norm_1(vec,
-                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,
-                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,
-                          tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;  
-}
-
diff --git a/auxiliary/vector/align1/norm_2.cl b/auxiliary/vector/align1/norm_2.cl
deleted file mode 100644
index bc45d17..0000000
--- a/auxiliary/vector/align1/norm_2.cl
+++ /dev/null
@@ -1,48 +0,0 @@
-//helper:
-void helper_norm2_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-}
-
-////// norm_2
-float impl_norm_2(
-          __global const float * vec,
-          unsigned int start_index,
-          unsigned int end_index,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  float vec_entry = 0;
-  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))
-  {
-    vec_entry = vec[i];
-    tmp += vec_entry * vec_entry;
-  }
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  helper_norm2_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-};
-
-__kernel void norm_2(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_norm_2(vec,
-                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,
-                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,
-                          tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;  
-}
-
diff --git a/auxiliary/vector/align1/norm_inf.cl b/auxiliary/vector/align1/norm_inf.cl
deleted file mode 100644
index 5eaf719..0000000
--- a/auxiliary/vector/align1/norm_inf.cl
+++ /dev/null
@@ -1,39 +0,0 @@
-
-////// norm_inf
-float impl_norm_inf(
-          __global const float * vec,
-          unsigned int start_index,
-          unsigned int end_index,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))
-    tmp = fmax(fabs(vec[i]), tmp);
-  tmp_buffer[get_local_id(0)] = tmp;
-  
-  //step 2: parallel reduction:
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);
-  }
-  
-  return tmp_buffer[0];
-}
-
-__kernel void norm_inf(
-          __global const float * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __local float * tmp_buffer,
-          global float * group_buffer)
-{
-  float tmp = impl_norm_inf(vec,
-                          (      get_group_id(0) * size1) / get_num_groups(0) + start1,
-                          ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,
-                          tmp_buffer);
-  
-  if (get_local_id(0) == 0)
-    group_buffer[get_group_id(0)] = tmp;  
-}
diff --git a/auxiliary/vector/align1/plane_rotation.cl b/auxiliary/vector/align1/plane_rotation.cl
deleted file mode 100644
index 28d6c91..0000000
--- a/auxiliary/vector/align1/plane_rotation.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-
-////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)
-__kernel void plane_rotation(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global float * vec2, 
-          unsigned int start2,
-          unsigned int size2,
-          float alpha,
-          float beta) 
-{ 
-  float tmp1 = 0;
-  float tmp2 = 0;
-
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-  {
-    tmp1 = vec1[i+start1];
-    tmp2 = vec2[i+start2];
-    
-    vec1[i+start1] = alpha * tmp1 + beta * tmp2;
-    vec2[i+start2] = alpha * tmp2 - beta * tmp1;
-  }
-
-}
-
diff --git a/auxiliary/vector/align1/sqrt_sum.cl b/auxiliary/vector/align1/sqrt_sum.cl
deleted file mode 100644
index 3debe8a..0000000
--- a/auxiliary/vector/align1/sqrt_sum.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-__kernel void sqrt_sum(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global float * result) 
-{ 
-  //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    if (get_global_id(0) < stride)
-      vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-  
-  if (get_global_id(0) == 0)
-    *result = sqrt(vec1[start1]);
-  
-}
-
diff --git a/auxiliary/vector/align1/sub.cl b/auxiliary/vector/align1/sub.cl
deleted file mode 100644
index 48d3317..0000000
--- a/auxiliary/vector/align1/sub.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void sub(
-          __global const float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * vec2, 
-          unsigned int start2,
-          unsigned int size2,
-          __global float * result,
-          unsigned int start3,
-          unsigned int size3)
-{ 
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] - vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align1/sum.cl b/auxiliary/vector/align1/sum.cl
deleted file mode 100644
index 7e51c52..0000000
--- a/auxiliary/vector/align1/sum.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-__kernel void sum(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global float * result) 
-{ 
-  //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    if (get_global_id(0) < stride)
-      vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-  
-  if (get_global_id(0) == 0)
-    *result = vec1[0];  
-}
-
diff --git a/auxiliary/vector/align1/swap.cl b/auxiliary/vector/align1/swap.cl
deleted file mode 100644
index b1bc41e..0000000
--- a/auxiliary/vector/align1/swap.cl
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-////// swap:
-__kernel void swap(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global float * vec2,
-          unsigned int start2,
-          unsigned int size2
-          ) 
-{ 
-  float tmp;
-  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))
-  {
-    tmp = vec2[i+start2];
-    vec2[i+start2] = vec1[i+start1];
-    vec1[i+start1] = tmp;
-  }
-}
- 
diff --git a/auxiliary/vector/align1/vmax.cl b/auxiliary/vector/align1/vmax.cl
deleted file mode 100644
index 0888b53..0000000
--- a/auxiliary/vector/align1/vmax.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-__kernel void vmax(
-          __global float * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global float * result) 
-{ 
-  //parallel reduction on global memory (make sure that size is a power of 2)
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    if (get_global_id(0) < stride)
-      vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  }
-  
-  if (get_global_id(0) == 0)
-    *result = vec1[start1];
-}
-
diff --git a/auxiliary/vector/align16/add.cl b/auxiliary/vector/align16/add.cl
deleted file mode 100644
index 6e786b0..0000000
--- a/auxiliary/vector/align16/add.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void add(
-          __global const float16 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float16 * vec2, 
-          unsigned int start2,
-          unsigned int size2,
-          __global float16 * result,
-          unsigned int start3,
-          unsigned int size3)
-{ 
-  unsigned int i_end = size/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] + vec2[i+start2];
-}
-
-
diff --git a/auxiliary/vector/align16/cpu_inplace_mul.cl b/auxiliary/vector/align16/cpu_inplace_mul.cl
deleted file mode 100644
index 2eba4bf..0000000
--- a/auxiliary/vector/align16/cpu_inplace_mul.cl
+++ /dev/null
@@ -1,12 +0,0 @@
-
-__kernel void cpu_inplace_mult(
-          __global float16 * vec,
-          unsigned int start1,
-          unsigned int size1,
-          float factor) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec[i+start1] *= factor;
-}
-
diff --git a/auxiliary/vector/align16/cpu_mult.cl b/auxiliary/vector/align16/cpu_mult.cl
deleted file mode 100644
index 6c36326..0000000
--- a/auxiliary/vector/align16/cpu_mult.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void cpu_mult(
-          __global const float16 * vec,
-          unsigned int start1,
-          unsigned int size1,
-          float factor, 
-          __global float16 * result,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start2] = vec[i+start1] * factor;
-}
-
diff --git a/auxiliary/vector/align16/divide.cl b/auxiliary/vector/align16/divide.cl
deleted file mode 100644
index a3b3119..0000000
--- a/auxiliary/vector/align16/divide.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-//Note: 'div' cannot be used because of complaints by the jit-compiler
-__kernel void divide(
-          __global const float16 * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac,  //note: CPU variant is mapped to prod_scalar
-          __global float16 * result,
-          unsigned int start2,
-          unsigned int size2)  
-{ 
-  float factor = *fac;
-  unsigned int i_end = size/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start2] = vec[i+start1] / factor;
-}
-
-
diff --git a/auxiliary/vector/align16/inplace_add.cl b/auxiliary/vector/align16/inplace_add.cl
deleted file mode 100644
index 76f4e45..0000000
--- a/auxiliary/vector/align16/inplace_add.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-__kernel void inplace_add(
-          __global float16 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float16 * vec2,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i+start1] += vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align16/inplace_divide.cl b/auxiliary/vector/align16/inplace_divide.cl
deleted file mode 100644
index 01150e8..0000000
--- a/auxiliary/vector/align16/inplace_divide.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-__kernel void inplace_divide(
-          __global float16 * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac)  //note: CPU variant is mapped to prod_scalar
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec[i+start1] /= factor;
-}
-
diff --git a/auxiliary/vector/align16/inplace_mult.cl b/auxiliary/vector/align16/inplace_mult.cl
deleted file mode 100644
index 51df457..0000000
--- a/auxiliary/vector/align16/inplace_mult.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-
-__kernel void inplace_mult(
-          __global float16 * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec[i+start1] *= factor;
-}
-
diff --git a/auxiliary/vector/align16/inplace_sub.cl b/auxiliary/vector/align16/inplace_sub.cl
deleted file mode 100644
index 4e67918..0000000
--- a/auxiliary/vector/align16/inplace_sub.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void inplace_sub(
-          __global float16 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float16 * vec2,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2];
-}
-
-
diff --git a/auxiliary/vector/align16/mult.cl b/auxiliary/vector/align16/mult.cl
deleted file mode 100644
index 8af3ff7..0000000
--- a/auxiliary/vector/align16/mult.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void mult(
-          __global const float16 * vec,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac, 
-          __global float16 * result,
-          unsigned int start2,
-          unsigned int size2) 
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start2] = vec[i+start1] * factor;
-}
-
diff --git a/auxiliary/vector/align16/sub.cl b/auxiliary/vector/align16/sub.cl
deleted file mode 100644
index bcccdc6..0000000
--- a/auxiliary/vector/align16/sub.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void sub(
-          __global const float16 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float16 * vec2, 
-          unsigned int start2,
-          unsigned int size2,
-          __global float16 * result,
-          unsigned int start3,
-          unsigned int size3)
-{ 
-  unsigned int i_end = size1 / 16;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] - vec2[i+start2];
-}
-
-
diff --git a/auxiliary/vector/align4/cpu_inplace_mul_add.cl b/auxiliary/vector/align4/cpu_inplace_mul_add.cl
deleted file mode 100644
index bddcf43..0000000
--- a/auxiliary/vector/align4/cpu_inplace_mul_add.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-
-__kernel void cpu_inplace_mul_add(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          float factor) 
-{ 
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i+start1] += vec2[i+start2] * factor;
-}
-
diff --git a/auxiliary/vector/align4/cpu_mul_add.cl b/auxiliary/vector/align4/cpu_mul_add.cl
deleted file mode 100644
index 0232f8f..0000000
--- a/auxiliary/vector/align4/cpu_mul_add.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void cpu_mul_add(
-          __global const float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          float factor,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global float4 * result,
-          unsigned int start3,
-          unsigned int size3) 
-{ 
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align4/inner_prod.cl_disabled b/auxiliary/vector/align4/inner_prod.cl_disabled
deleted file mode 100644
index 9f5148f..0000000
--- a/auxiliary/vector/align4/inner_prod.cl_disabled
+++ /dev/null
@@ -1,40 +0,0 @@
-
-//helper:
-void helper_inner4_float_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-}
-
-//////// inner products:
-float float_vector4_inner_prod_impl(
-          __global const float4 * vec1,
-          __global const float4 * vec2,
-          unsigned int size,
-          __local float * tmp_buffer)
-{
-  float tmp = 0;
-  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))
-    tmp += dot(vec1[i],vec2[i]);
-  tmp_buffer[get_global_id(0)] = tmp;
-  
-  helper_inner4_float_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-}
-
-__kernel void inner_prod(
-          __global const float4 * vec1,
-          __global const float4 * vec2,
-          unsigned int size,
-          __local float * tmp_buffer,
-          global float * result)
-{
-  float tmp = float_vector4_inner_prod_impl(vec1, vec2, size, tmp_buffer);
-  if (get_global_id(0) == 0) *result = tmp;
-}
-
diff --git a/auxiliary/vector/align4/inplace_div_add.cl b/auxiliary/vector/align4/inplace_div_add.cl
deleted file mode 100644
index dd36960..0000000
--- a/auxiliary/vector/align4/inplace_div_add.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-__kernel void inplace_div_add(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1 / 4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2] / factor;
-}
-
-
-
diff --git a/auxiliary/vector/align4/inplace_div_sub.cl b/auxiliary/vector/align4/inplace_div_sub.cl
deleted file mode 100644
index 40ff851..0000000
--- a/auxiliary/vector/align4/inplace_div_sub.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-
-
-__kernel void inplace_div_sub(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2] / factor;
-}
-
-
diff --git a/auxiliary/vector/align4/inplace_mul_add.cl b/auxiliary/vector/align4/inplace_mul_add.cl
deleted file mode 100644
index aab7484..0000000
--- a/auxiliary/vector/align4/inplace_mul_add.cl
+++ /dev/null
@@ -1,16 +0,0 @@
-
-__kernel void inplace_mul_add(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac) 
-{ 
-  float factor = *fac;
-  unsigned int size_div_4 = size1/4;
-  for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))
-    vec1[i+start1] += vec2[i+start2] * factor;
-}
-
diff --git a/auxiliary/vector/align4/inplace_mul_sub.cl b/auxiliary/vector/align4/inplace_mul_sub.cl
deleted file mode 100644
index 5c5f750..0000000
--- a/auxiliary/vector/align4/inplace_mul_sub.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-
-__kernel void inplace_mul_sub(
-          __global float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global const float * fac)   //CPU variant is mapped to mult_add
-{ 
-  float factor = *fac;
-  unsigned int i_end = size/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    vec1[i+start1] -= vec2[i+start2] * factor;
-}
-
-
diff --git a/auxiliary/vector/align4/mul_add.cl b/auxiliary/vector/align4/mul_add.cl
deleted file mode 100644
index 39327bc..0000000
--- a/auxiliary/vector/align4/mul_add.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-
-__kernel void mul_add(
-          __global const float4 * vec1,
-          unsigned int start1,
-          unsigned int size1,
-          __global const float * fac,
-          __global const float4 * vec2,
-          unsigned int start2,
-          unsigned int size2,
-          __global float4 * result,
-          unsigned int start3,
-          unsigned int size3) 
-{ 
-  float factor = *fac;
-  unsigned int i_end = size1/4;
-  for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))
-    result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];
-}
-
diff --git a/auxiliary/vector/align4/norm_2.cl_disabled b/auxiliary/vector/align4/norm_2.cl_disabled
deleted file mode 100644
index 18e29d8..0000000
--- a/auxiliary/vector/align4/norm_2.cl_disabled
+++ /dev/null
@@ -1,47 +0,0 @@
-
-//helper:
-void helper_norm24_float_parallel_reduction( __local float * tmp_buffer )
-{
-  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)
-  {
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_global_id(0) < stride)
-      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];
-  }
-};
-
-
-////// norm_2
-float float_vector4_norm_2_impl(
-          __global const float4 * vec,
-          unsigned int size,
-          __local float * tmp_buffer)
-{
-  //step 1: fill buffer:
-  float tmp = 0;
-  float4 veci;
-  unsigned int steps = size/4;
-  for (unsigned int i = get_global_id(0); i < steps; i += get_global_size(0))
-  {
-    veci = vec[i];
-    tmp += dot(veci, veci);
-  }
-  tmp_buffer[get_global_id(0)] = tmp;
-  
-  //step 2: parallel reduction:
-  helper_norm24_float_parallel_reduction(tmp_buffer);
-  
-  return tmp_buffer[0];
-};
-
-__kernel void norm_2(
-          __global float4 * vec,
-          unsigned int size,
-          __local float * tmp_buffer,
-          global float * result) 
-{ 
-  float tmp = float_vector4_norm_2_impl(vec, size, tmp_buffer);
-  if (get_global_id(0) == 0) *result = sqrt(tmp);
-};
-
-
diff --git a/changelog b/changelog
index bdfcbbc..c3fdc04 100644
--- a/changelog
+++ b/changelog
@@ -2,6 +2,155 @@
 **** ViennaCL Change Logs ****
 ******************************
 
+*** Version 1.5.x ***
+
+-- Version 1.5.1 --
+This maintenance release fixes a few nasty bugs:
+ - Fixed a memory leak in the OpenCL kernel generator. Thanks to GitHub user dxyzab for spotting this.
+ - Added compatibility of the mixed precision CG implementation with older AMD GPUs. Thanks to Andreas Rost for the input.
+ - Fixed an error when running the QR factorization for matrices with less rows than columns. Thanks to Karol Polko for reporting.
+ - Readded accidentally removed chapters on additional algorithms and structured matrices to the manual. Thanks to Sajjadul Islam for the hint.
+ - Fixed buggy OpenCL kernels for matrix additions and subtractions for column-major matrices. Thanks to Tom Nicholson for reporting.
+ - Fixed an invalid default kernel parameter set for matrix-matrix multiplications on CPUs when using the OpenCL backend. Thanks again to Tom Nicholson.
+ - Corrected a weak check used in two tests. Thanks to Walter Mascarenhas for providing a fix.
+ - Fixed a wrong global work size inside the SPAI preconditioner. Thanks to Andreas Rost.
+
+
+-- Version 1.5.0 --
+This new minor release number update focuses on a more powerful API, and on first steps in making ViennaCL more accessible from languages other than C++.
+In addition to many internal improvements both in terms of performance and flexibility, the following changes are visible to users:
+ - API-change: User-provided OpenCL kernels extract their kernels automatically. A call to add_kernel() is now obsolete, hence the function was removed.
+ - API-change: Device class has been extended and supports all information defined in the OpenCL 1.1 standard through member functions. Duplicate compute_units() and max_work_group_size() have been removed (thanks to Shantanu Agarwal for the input).
+ - API-change: viennacl::copy() from a ViennaCL object to an object of non-ViennaCL type no longer tries to resize the object accordingly. An assertion is thrown if the sizes are incorrect in order to provide a consistent behavior across many different types.
+ - Datastructure change: Vectors and matrices are now padded with zeros by default, resulting in higher performance particularly for matrix operations. This padding needs to be taken into account when using fast_copy(), particularly for matrices.
+ - Fixed problems with CUDA and CMake+CUDA on Visual Studio.
+ - coordinate_matrix<> now also behaves correctly for tiny matrix dimensions.
+ - CMake 2.6 as new minimum requirement instead of CMake 2.8.
+ - Vectors and matrices can be instantiated with integer template types (long, int, short, char).
+ - Added support for element_prod() and element_div() for dense matrices.
+ - Added element_pow() for vectors and matrices.
+ - Added norm_frobenius() for computing the Frobenius norm of dense matrices.
+ - Added unary element-wise operations for vectors and dense matrices: element_sin(), element_sqrt(), etc.
+ - Multiple OpenCL contexts can now be used in a multi-threaded setting (one thread per context).
+ - Multiple inner products with a common vector can now be computed efficiently via e.g. inner_prod(x, tie(y, z));
+ - Added support for prod(A, B), where A is a sparse matrix type and B is a dense matrix (thanks to Albert Zaharovits for providing parts of the implementation).
+ - Added diag() function for extracting the diagonal of a vector to a matrix, or for generating a square matrix from a vector with the vector elements on a diagonal (similar to MATLAB).
+ - Added row() and column() functions for extracting a certain row or column of a matrix to a vector.
+ - Sparse matrix-vector products now also work with vector strides and ranges.
+ - Added async_copy() for vectors to allow for a better overlap of computation and communication.
+ - Added compressed_compressed_matrix type for the efficient representation of CSR matrices with only few nonzero rows.
+ - Added possibility to switch command queues in OpenCL contexts.
+ - Improved performance of Block-ILU by removing one spurious conversion step.
+ - Improved performance of Cuthill-McKee algorithm by about 40 percent.
+ - Improved performance of power iteration by avoiding the creation of temporaries in each step.
+ - Removed spurious status message to cout in matrix market reader and nonnegative matrix factorization.
+ - The OpenCL kernel launch logic no longer attempts to re-launch the kernel with smaller work sizes if an error is encountered (thanks to Peter Burka for pointing this out).
+ - Reduced overhead for lengthy expressions involving temporaries (at the cost of increased compilation times).
+ - vector and matrix are now padded to dimensions being multiples of 128 per default. This greatly improves GEMM performance for arbitrary sizes.
+ - Loop indices for OpenMP parallelization are now all signed, increasing compatibility with older OpenMP implementations (thanks to Mrinal Deo for the hint).
+ - Complete rewrite of the generator. Now uses the scheduler for specifying the operation. Includes a full device database for portable high performance of GEMM kernels.
+ - Added micro-scheduler for attaching the OpenCL kernel generator to the user API.
+ - Certain BLAS functionality in ViennaCL is now also available through a shared library (libviennacl).
+ - Removed the external kernel parameter tuning facility, which is to be replaced by an internal device database through the kernel generator.
+ - Completely eliminated the OpenCL kernel conversion step in the developer repository and the source-release. One can now use the developer version without the need for a Boost installation.
+
+
+*** Version 1.4.x ***
+
+-- Version 1.4.2 --
+This is a maintenance release, particularly resolving compilation problems with Visual Studio 2012.
+- Largely refactored the internal code base, unifying code for vector, vector_range, and vector_slice.
+  Similar code refactoring was applied to matrix, matrix_range, and matrix_slice.
+  This not only resolves the problems in VS 2012, but also leads to shorter compilation times and a smaller code base.
+- Improved performance of matrix-vector products of compressed_matrix on CPUs using OpenCL.
+- Resolved a bug which shows up if certain rows and columns of a compressed_matrix are empty and the matrix is copied back to host.
+- Fixed a bug and improved performance of GMRES. Thanks to Ivan Komarov for reporting via sourceforge.
+- Added additional Doxygen documentation.
+
+-- Version 1.4.1 --
+This release focuses on improved stability and performance on AMD devices rather than introducing new features:
+- Included fast matrix-matrix multiplication kernel for AMD's Tahiti GPUs if matrix dimensions are a multiple of 128.
+  Our sample HD7970 reaches over 1.3 TFLOPs in single precision and 200 GFLOPs in double precision (counting multiplications and additions as separate operations).
+- All benchmark FLOPs are now using the common convention of counting multiplications and additions separately (ignoring fused multiply-add).
+- Fixed a bug for matrix-matrix multiplication with matrix_slice<> when slice dimensions are multiples of 64.
+- Improved detection logic for Intel OpenCL SDK.
+- Fixed issues when resizing an empty compressed_matrix.
+- Fixes and improved support for BLAS-1-type operations on dense matrices and vectors.
+- Vector expressions can now be passed to inner_prod() and norm_1(), norm_2() and norm_inf() directly.
+- Improved performance when using OpenMP.
+- Better support for Intel Xeon Phi (MIC).
+- Resolved problems when using OpenCL for CPUs if the number of cores is not a power of 2.
+- Fixed a flaw when using AMG in debug mode. Thanks to Jakub Pola for reporting.
+- Removed accidental external linkage (invalidating header-only model) of SPAI-related functions. Thanks again to Jakub Pola.
+- Fixed issues with copy back to host when OpenCL handles are passed to CTORs of vector, matrix, or compressed_matrix. Thanks again to Jakub Pola.
+- Added fix for segfaults on program exit when providing custom OpenCL queues. Thanks to Denis Demidov for reporting.
+- Fixed bug in copy() to hyb_matrix as reported by Denis Demidov (thanks!).
+- Added an overload for result_of::alignment for vector_expression. Thanks again to Denis Demidov.
+- Added SSE-enabled code contributed by Alex Christensen.
+
+-- Version 1.4.0 --
+The transition from 1.3.x to 1.4.x features the largest number of additions, improvements, and cleanups since the initial release.
+In particular, host-, OpenCL-, and CUDA-based execution is now supported. OpenCL now needs to be enabled explicitly!
+New features and feature improvements are as follows:
+- Added host-based and CUDA-enabled operations on ViennaCL objects. The default is now a host-based execution for reasons of compatibility.
+  Enable OpenCL- or CUDA-based execution by defining the preprocessor constant VIENNACL_WITH_OPENCL and VIENNACL_WITH_CUDA respectively.
+  Note that CUDA-based execution requires the use of nvcc.
+- Added mixed-precision CG solver (OpenCL-based).
+- Greatly improved performance of ILU0 and ILUT preconditioners (up to 10-fold). Also fixed a bug in ILUT.
+- Added initializer types from Boost.uBLAS (unit_vector, zero_vector, scalar_vector, identity_matrix, zero_matrix, scalar_matrix).
+  Thanks to Karsten Ahnert for suggesting the feature.
+- Added incomplete Cholesky factorization preconditioner.
+- Added element-wise operations for vectors as available in Boost.uBLAS (element_prod, element_div).
+- Added restart-after-N-cycles option to BiCGStab.
+- Added level-scheduling for ILU-preconditioners. Performance strongly depends on matrix pattern.
+- Added least-squares example including a function inplace_qr_apply_trans_Q() to compute the right hand side vector Q^T b without rebuilding Q.
+- Improved performance of LU-factorization of dense matrices.
+- Improved dense matrix-vector multiplication performance (thanks to Philippe Tillet).
+- Reduced overhead when copying to/from ublas::compressed_matrix.
+- ViennaCL objects (scalar, vector, etc.) can now be used as global variables (thanks to an anonymous user on the support-mailinglist).
+- Refurbished OpenCL vector kernels backend.
+  All operations of the type v1 = a v2 @ b v3 with vectors v1, v2, v3 and scalars a and b including += and -= instead of = are now temporary-free. Similarly for matrices.
+- matrix_range and matrix_slice as well as vector_range and vector_slice can now be used and mixed completely seamlessly with all standard operations except lu_factorize().
+- Fixed a bug when using copy() with iterators on vector proxy objects.
+- Final reduction step in inner_prod() and norms is now computed on CPU if the result is a CPU scalar.
+- Reduced kernel launch overhead of simple vector kernels by packing multiple kernel arguments together.
+- Updated SVD code and added routines for the computation of symmetric eigenvalues using OpenCL.
+- custom_operation's constructor now supports multiple arguments, allowing multiple expressions to be packed in the same kernel for improved performance.
+  However, all the datastructures in the multiple operations must have the same size.
+- Further improvements to the OpenCL kernel generator: Added a repeat feature for generating loops inside a kernel, added element-wise products and division, added support for every one-argument OpenCL function.
+- The name of the operation is now a mandatory argument of the constructor of custom_operation.
+- Improved performance of the generated matrix-vector product code.
+- Updated interfacing code for the Eigen library, now working with Eigen 3.x.y.
+- Converter in source-release now depends on Boost.filesystem3 instead of Boost.filesystem2, thus requiring Boost 1.44 or above.
+
+*** Version 1.3.x ***
+
+-- Version 1.3.1 --
+The following bugfixes and enhancements have been applied:
+- Fixed a compilation problem with GCC 4.7 caused by the wrong order of function declarations. Also removed unnecessary indirections and unused variables.
+- Improved out-of-source build in the src-version (for packagers).
+- Added virtual destructor in the runtime_wrapper-class in the kernel generator.
+- Extended flexibility of submatrix and subvector proxies (ranges, slices).
+- Block-ILU for compressed_matrix is now applied on the GPU during the solver cycle phase. However, for the moment the implementation file in viennacl/linalg/detail/ilu/opencl block ilu.hpp needs to be included separately in order to avoid an OpenCL dependency for all ILU implementations.
+- SVD now supports double precision.
+- Slightly adjusted the interface for NMF. The approximation rank is now specified by the supplied matrices W and H.
+- Fixed a problem with matrix-matrix products if the result matrix is not initialized properly (thanks to Laszlo Marak for finding the issue and a fix).
+- The operations C += prod(A, B) and C -= prod(A, B) for matrices A, B, and C no longer introduce temporaries if the three matrices are distinct.
+
+-- Version 1.3.0 --
+Several new features enter this new minor version release.
+Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
+ - Full support for ranges and slices for dense matrices and vectors (no longer experimental)
+ - QR factorization now possible for arbitrary matrix sizes (no longer experimental)
+ - Further improved matrix-matrix multiplication performance for matrix dimensions which are a multiple of 64 (particularly improves performance for NVIDIA GPUs)
+ - Added Lanczos and power iteration method for eigenvalue computations of dense and sparse matrices (experimental, contributed by Guenther Mader and Astrid Rupp)
+ - Added singular value decomposition in single precision (experimental, contributed by Volodymyr Kysenko)
+ - Two new ILU-preconditioners added: ILU0 (contributed by Evan Bollig) and a block-diagonal ILU preconditioner using either ILUT or ILU0 for each block. Both preconditioners are computed entirely on the CPU.
+ - Automated OpenCL kernel generator based on high-level operation specifications added (many thanks to Philippe Tillet who had a lot of /fun fun fun/ working on this)
+ - Two new sparse matrix types (by Volodymyr Kysenko): ell_matrix for the ELL format and hyb_matrix for a hybrid format (contributed by Volodymyr Kysenko).
+ - Added possibility to specify the OpenCL platform used by a context
+ - Build options for the OpenCL compiler can now be supplied to a context (thanks to Krzysztof Bzowski for the suggestion)
+ - Added nonnegative matrix factorization by Lee and Seoung (contributed by Volodymyr Kysenko).
 
 *** Version 1.2.x ***
 
@@ -17,7 +166,6 @@ The main changes (in addition to some internal adjustments) are as follows:
  - Fixed incorrect matrix dimensions obtained with the transfer of non-square sparse Eigen and MTL matrices to ViennaCL objects (thanks to sourceforge.net user ggrocca for pointing at this)
 
 -- Version 1.2.0 --
-
 Many new features from the Google Summer of Code and the IuE Summer of Code enter this release.
 Due to their complexity, they are for the moment still in experimental state (see the respective chapters for details) and are expected to reach maturity with the 1.3.0 release.
 Shorter release cycles are planned for the near future.
@@ -73,7 +221,7 @@ storage.
  - Dense and sparse matrix types now now be filled using STL-emulated types (std::vector< std::vector<NumericT> > and std::vector< std::map< unsigned int, NumericT> >)
  - BLAS level 3 functionality is now complete. We are very happy with the general out-of-the-box performance of matrix-matrix-products, even though it cannot beat the extremely tuned implementations tailored to certain matrix sizes on a particular device yet.
  - An automated performance tuning environment allows an optimization of the kernel parameters for the library user's machine. Best parameters can be obtained from a tuning run and stored in a XML file and read at program startup using pugixml.
- - Two now preconditioners are now included: A Jacobi preconditioner and a row-scaling preconditioner. In contrast to ILUT, they are applied on the OpenCL device directly.
+ - Two new preconditioners are now included: A Jacobi preconditioner and a row-scaling preconditioner. In contrast to ILUT, they are applied on the OpenCL device directly.
  - Clean compilation of all examples under Visual Studio 2005 (we recommend newer compilers though...).
  - Error handling is now carried out using C++ exceptions.
  - Matrix Market now uses index base 1 per default (thanks to Evan Bollig for reporting that)
@@ -106,7 +254,7 @@ The main improvements in this release are:
  - Support for multi-core CPUs with ATI Stream SDK (thanks to Riccardo Rossi, UPC. BARCELONA TECH, for suggesting this)
  - inner_prod is now up to a factor of four faster (thanks to Serban Georgescu, ETH, for pointing the poor performance of the old implementation out)
  - Fixed a bug with plane_rotation that caused system freezes with ATI GPUs.
- - Extended the doxygen generated reference documentation 
+ - Extended the doxygen generated reference documentation
 
 
 -- Version 1.0.2 --
diff --git a/cmake/FindMTL.cmake b/cmake/FindMTL.cmake
index 9f45d6f..f3d07c0 100644
--- a/cmake/FindMTL.cmake
+++ b/cmake/FindMTL.cmake
@@ -1,7 +1,14 @@
-SET(MTL_INCLUDE_DIRS "${MTL_DIR}/../../include")
+#SET(MTL_INCLUDE_DIRS "${MTL_DIR}/../../include")
 find_package(Boost 1.36 REQUIRED)
 if(Boost_FOUND)
 	LIST(APPEND MTL_INCLUDE_DIRS ${Boost_INCLUDE_DIRS})
 endif(Boost_FOUND)
 
-include_directories(${MTL_INCLUDE_DIRS})
+# find MTL
+find_path(MTL_INCLUDE_DIR boost/numeric/itl)
+if(NOT MTL_INCLUDE_DIR)
+  message(SEND_ERROR "Failed to find MTL")
+endif()
+mark_as_advanced(MTL_INCLUDE_DIR)
+
+include_directories(${MTL_INCLUDE_DIRS} ${MTL_INCLUDE_DIR})
diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake
index a237116..c755511 100644
--- a/cmake/FindOpenCL.cmake
+++ b/cmake/FindOpenCL.cmake
@@ -18,6 +18,11 @@ if(ENV_AMDAPPSDKROOT)
  set(ENV_OPENCLROOT $ENV{AMDAPPSDKROOT})
 endif(ENV_AMDAPPSDKROOT)
 
+set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
+if(ENV_INTELOCLSDKROOT)
+ set(ENV_OPENCLROOT $ENV{INTELOCLSDKROOT})
+endif(ENV_INTELOCLSDKROOT)
+
 set(ENV_OPENCLROOT2 $ENV{OPENCLROOT})
 if(ENV_OPENCLROOT2)
  set(ENV_OPENCLROOT $ENV{OPENCLROOT})
@@ -52,6 +57,7 @@ else(ENV_OPENCLROOT)
   find_path(
     OPENCL_INCLUDE_DIR
     NAMES CL/cl.h OpenCL/cl.h
+    PATHS ${PROJECT_SOURCE_DIR}      #use the CL/ include folder provided with ViennaCL
     )
 
   find_library(
diff --git a/cmake/ViennaCLCommon.cmake b/cmake/ViennaCLCommon.cmake
index 3939b4f..4d228d6 100644
--- a/cmake/ViennaCLCommon.cmake
+++ b/cmake/ViennaCLCommon.cmake
@@ -1,11 +1,4 @@
 
-# do not build tests by default, since they require Boost
-if (VIENNACL_SRC_DIST)
- option(BUILD_TESTING "Build the tests " ON)
-else (VIENNACL_SRC_DIST)
- option(BUILD_TESTING "Build the tests " OFF)
-endif(VIENNACL_SRC_DIST)
-
 include(CTest)
 include(CMakeDependentOption)
 
@@ -36,8 +29,14 @@ file(RELATIVE_PATH CONF_REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}"
 # User options
 ##############
 
+option(ENABLE_CUDA "Use the CUDA backend" OFF)
+
 option(BUILD_EXAMPLES "Build example programs" ON)
 
+option(ENABLE_OPENCL "Use the OpenCL backend" ON)
+
+option(ENABLE_OPENMP "Use OpenMP acceleration" OFF)
+
 # If you are interested in the impact of different kernel parameters on
 # performance, you may want to give ViennaProfiler a try (see
 # http://sourceforge.net/projects/viennaprofiler/) Set your connection
@@ -59,10 +58,9 @@ cmake_dependent_option(ENABLE_EIGEN "Enable examples that use Eigen" OFF
 cmake_dependent_option(ENABLE_MTL4 "Enable examples that use MTL4" OFF
    BUILD_EXAMPLES OFF)
 
-cmake_dependent_option(ENABLE_PEDANTIC_FLAGS "Enable pedantic compiler flags"
-   ON CMAKE_COMPILER_IS_GNUCXX OFF)
+option(ENABLE_PEDANTIC_FLAGS "Enable pedantic compiler flags (GCC and Clang only)" OFF)
 
-mark_as_advanced(BOOSTPATH ENABLE_VIENNAPROFILER ENABLE_UBLAS ENABLE_EIGEN
+mark_as_advanced(BOOSTPATH ENABLE_VIENNAPROFILER ENABLE_EIGEN
    ENABLE_MTL4 ENABLE_PEDANTIC_FLAGS)
 
 # Find prerequisites
@@ -76,13 +74,33 @@ IF (BOOSTPATH)
 ENDIF (BOOSTPATH)
 
 
-if(ENABLE_UBLAS OR BUILD_TESTING OR VIENNACL_SRC_DIST)
+if(ENABLE_UBLAS OR BUILD_TESTING)
    set(Boost_USE_MULTITHREADED TRUE)
-   find_package(Boost REQUIRED COMPONENTS filesystem system)
+   find_package(Boost)
+   if (Boost_MINOR_VERSION LESS 34)
+     find_package(Boost REQUIRED COMPONENTS thread)
+   elseif (Boost_MINOR_VERSION LESS 47)
+     find_package(Boost REQUIRED COMPONENTS date_time serialization system thread)
+   else ()
+     find_package(Boost REQUIRED COMPONENTS chrono date_time serialization system thread)
+   endif()
 endif()
 
-find_package(OpenCL REQUIRED)
-find_package(OpenMP)
+if (ENABLE_CUDA)
+   find_package(CUDA REQUIRED)
+   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_13 -DVIENNACL_WITH_CUDA)
+endif(ENABLE_CUDA)
+
+if (ENABLE_OPENCL)
+   find_package(OpenCL REQUIRED)
+endif(ENABLE_OPENCL)
+
+if (ENABLE_OPENMP)
+   find_package(OpenMP REQUIRED)
+   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -DVIENNACL_WITH_OPENMP")
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -DVIENNACL_WITH_OPENMP")
+   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+endif(ENABLE_OPENMP)
 
 if(ENABLE_VIENNAPROFILER)
    find_package(ViennaProfiler REQUIRED)
@@ -103,15 +121,20 @@ if(ENABLE_MTL4)
 endif()
 
 include_directories(
-   ${PROJECT_BINARY_DIR}
    ${PROJECT_SOURCE_DIR}
    ${OPENCL_INCLUDE_DIRS})
 
 # Set high warning level on GCC
 if(ENABLE_PEDANTIC_FLAGS)
-   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic")
+endif()
+
+# Disable Warning 4996 (std::copy is unsafe ...) on Visual Studio
+if (MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4996")
 endif()
 
+
 # Export
 ########
 
@@ -124,7 +147,9 @@ configure_file(cmake/ViennaCLConfig.cmake.in
 configure_file(cmake/ViennaCLConfigVersion.cmake.in
    ${PROJECT_BINARY_DIR}/ViennaCLConfigVersion.cmake @ONLY)
 
-export(PACKAGE ViennaCL)
+if (CMAKE_MINOR_VERSION GREATER 6)  # export(PACKAGE ...) introduced with CMake 2.8.0
+  export(PACKAGE ViennaCL)
+endif()
 
 # Install
 #########
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 1b80cc8..b6a873f 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -34,6 +34,7 @@ if(BUILD_MANUAL)
       manual/figures/tip.eps
       manual/figures/TU_Signet_CMYK.eps
       manual/IEEEtran_v1.13.bst
+      manual/additional-algorithms.tex
       manual/installation.tex
       manual/introduction.tex
       manual/keywords.tex
@@ -41,7 +42,9 @@ if(BUILD_MANUAL)
       manual/multi-device.tex
       manual/operations.tex
       manual/other-libs.tex
+      manual/shared-lib.tex
       manual/setup.tex
+      manual/structured-matrices.tex
       manual/tuning.tex
       manual/types.tex
       manual/versioning.tex
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index a4a1b32..2edec42 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -14,76 +14,76 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file 
-# that follow. The default is UTF-8 which is also the encoding used for all 
-# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
-# iconv built into libc) for the transcoding. See 
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
 # http://www.gnu.org/software/libiconv for the list of possible encodings.
 
 DOXYFILE_ENCODING      = UTF-8
 
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
 # by quotes) that should identify the project.
 
 PROJECT_NAME           = "ViennaCL - The Vienna Computing Library"
 
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
-# This could be handy for archiving the generated documentation or 
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER         = 1.2.1
+PROJECT_NUMBER         = 1.5.1
 
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
-# base path where the generated documentation will be put. 
-# If a relative path is entered, it will be relative to the location 
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
 # where doxygen was started. If left blank the current directory will be used.
 
 OUTPUT_DIRECTORY       = doxygen/
 
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
-# 4096 sub-directories (in 2 levels) under the output directory of each output 
-# format and will distribute the generated files over these directories. 
-# Enabling this option can be useful when feeding doxygen a huge amount of 
-# source files, where putting all generated files in the same directory would 
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
 # otherwise cause performance problems for the file system.
 
 CREATE_SUBDIRS         = NO
 
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
-# documentation generated by doxygen is written. Doxygen will use this 
-# information to generate all constant output in the proper language. 
-# The default language is English, other supported languages are: 
-# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
-# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek, 
-# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages), 
-# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish, 
-# Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, Slovene, 
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek,
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish,
+# Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, Slovene,
 # Spanish, Swedish, and Ukrainian.
 
 OUTPUT_LANGUAGE        = English
 
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
-# include brief member descriptions after the members that are listed in 
-# the file and class documentation (similar to JavaDoc). 
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
 # Set to NO to disable this.
 
 BRIEF_MEMBER_DESC      = YES
 
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
-# the brief description of a member or function before the detailed description. 
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
 # brief descriptions will be completely suppressed.
 
 REPEAT_BRIEF           = YES
 
-# This tag implements a quasi-intelligent brief description abbreviator 
-# that is used to form the text in various listings. Each string 
-# in this list, if found as the leading text of the brief description, will be 
-# stripped from the text and the result after processing the whole list, is 
-# used as the annotated text. Otherwise, the brief description is used as-is. 
-# If left blank, the following values are used ("$name" is automatically 
-# replaced with the name of the entity): "The $name class" "The $name widget" 
-# "The $name file" "is" "provides" "specifies" "contains" 
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
 # "represents" "a" "an" "the"
 
 ABBREVIATE_BRIEF       = "The $name class" \
@@ -98,202 +98,202 @@ ABBREVIATE_BRIEF       = "The $name class" \
                          an \
                          the
 
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
-# Doxygen will generate a detailed section even if there is only a brief 
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
 # description.
 
 ALWAYS_DETAILED_SEC    = NO
 
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
-# inherited members of a class in the documentation of that class as if those 
-# members were ordinary class members. Constructors, destructors and assignment 
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
 # operators of the base classes will not be shown.
 
 INLINE_INHERITED_MEMB  = NO
 
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
-# path before files name in the file list and in the header files. If set 
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
 # to NO the shortest path that makes the file name unique will be used.
 
 FULL_PATH_NAMES        = YES
 
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
-# can be used to strip a user-defined part of the path. Stripping is 
-# only done if one of the specified strings matches the left-hand part of 
-# the path. The tag can be used to show relative paths in the file list. 
-# If left blank the directory from which doxygen is run is used as the 
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
 # path to strip.
 
-STRIP_FROM_PATH        = 
+STRIP_FROM_PATH        =
 
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
-# the path mentioned in the documentation of a class, which tells 
-# the reader which header file to include in order to use a class. 
-# If left blank only the name of the header file containing the class 
-# definition is used. Otherwise one should specify the include paths that 
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
 # are normally passed to the compiler using the -I flag.
 
-STRIP_FROM_INC_PATH    = 
+STRIP_FROM_INC_PATH    =
 
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
-# (but less readable) file names. This can be useful is your file systems 
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
 # doesn't support long names like on DOS, Mac, or CD-ROM.
 
 SHORT_NAMES            = NO
 
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
-# will interpret the first line (until the first dot) of a JavaDoc-style 
-# comment as the brief description. If set to NO, the JavaDoc 
-# comments will behave just like regular Qt-style comments 
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
 # (thus requiring an explicit @brief command for a brief description.)
 
 JAVADOC_AUTOBRIEF      = NO
 
-# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
-# interpret the first line (until the first dot) of a Qt-style 
-# comment as the brief description. If set to NO, the comments 
-# will behave just like regular Qt-style comments (thus requiring 
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
 # an explicit \brief command for a brief description.)
 
 QT_AUTOBRIEF           = NO
 
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
-# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
-# comments) as a brief description. This used to be the default behaviour. 
-# The new default is to treat a multi-line C++ comment block as a detailed 
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
 # description. Set this tag to YES if you prefer the old behaviour instead.
 
 MULTILINE_CPP_IS_BRIEF = NO
 
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
-# member inherits the documentation from any documented member that it 
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
 # re-implements.
 
 INHERIT_DOCS           = YES
 
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
-# a new page for each member. If set to NO, the documentation of a member will 
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
 # be part of the file/class/namespace that contains it.
 
 SEPARATE_MEMBER_PAGES  = NO
 
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
 # Doxygen uses this value to replace tabs by spaces in code fragments.
 
 TAB_SIZE               = 8
 
-# This tag can be used to specify a number of aliases that acts 
-# as commands in the documentation. An alias has the form "name=value". 
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
-# put the command \sideeffect (or @sideeffect) in the documentation, which 
-# will result in a user-defined paragraph with heading "Side Effects:". 
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
 # You can put \n's in the value part of an alias to insert newlines.
 
-ALIASES                = 
+ALIASES                =
 
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
-# sources only. Doxygen will then generate output that is more tailored for C. 
-# For instance, some of the names that are used will be different. The list 
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
 # of all members will be omitted, etc.
 
 OPTIMIZE_OUTPUT_FOR_C  = YES
 
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
-# sources only. Doxygen will then generate output that is more tailored for 
-# Java. For instance, namespaces will be presented as packages, qualified 
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
 # scopes will look different, etc.
 
 OPTIMIZE_OUTPUT_JAVA   = NO
 
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
-# sources only. Doxygen will then generate output that is more tailored for 
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
 # Fortran.
 
 OPTIMIZE_FOR_FORTRAN   = NO
 
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
-# sources. Doxygen will then generate output that is tailored for 
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
 # VHDL.
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
-# Doxygen selects the parser to use depending on the extension of the files it parses. 
-# With this tag you can assign which parser to use for a given extension. 
-# Doxygen has a built-in mapping, but you can override or extend it using this tag. 
-# The format is ext=language, where ext is a file extension, and language is one of 
-# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, 
-# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat 
-# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), 
+# Doxygen selects the parser to use depending on the extension of the files it parses.
+# With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this tag.
+# The format is ext=language, where ext is a file extension, and language is one of
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP,
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran),
 # use: inc=Fortran f=C
 
-EXTENSION_MAPPING      = 
+EXTENSION_MAPPING      =
 
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
-# to include (a tag file for) the STL sources as input, then you should 
-# set this tag to YES in order to let doxygen match functions declarations and 
-# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 
-# func(std::string) {}). This also make the inheritance and collaboration 
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
 # diagrams that involve STL classes more complete and accurate.
 
 BUILTIN_STL_SUPPORT    = NO
 
-# If you use Microsoft's C++/CLI language, you should set this option to YES to 
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
 # enable parsing support.
 
 CPP_CLI_SUPPORT        = NO
 
-# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
-# Doxygen will parse them like normal C++ but will assume all classes use public 
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
 # instead of private inheritance when no explicit protection keyword is present.
 
 SIP_SUPPORT            = NO
 
-# For Microsoft's IDL there are propget and propput attributes to indicate getter 
-# and setter methods for a property. Setting this option to YES (the default) 
-# will make doxygen to replace the get and set methods by a property in the 
-# documentation. This will only work if the methods are indeed getting or 
-# setting a simple type. If this is not the case, or you want to show the 
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen to replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
 # methods anyway, you should set this option to NO.
 
 IDL_PROPERTY_SUPPORT   = YES
 
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
-# tag is set to YES, then doxygen will reuse the documentation of the first 
-# member in the group (if any) for the other members of the group. By default 
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
 # all members of a group must be documented explicitly.
 
 DISTRIBUTE_GROUP_DOC   = NO
 
-# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
-# the same type (for instance a group of public functions) to be put as a 
-# subgroup of that type (e.g. under the Public Functions section). Set it to 
-# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
 # the \nosubgrouping command.
 
 SUBGROUPING            = YES
 
-# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
-# is documented as struct, union, or enum with the name of the typedef. So 
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
-# with name TypeT. When disabled the typedef will appear as a member of a file, 
-# namespace, or class. And the struct will be named TypeS. This can typically 
-# be useful for C code in case the coding convention dictates that all compound 
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
 # types are typedef'ed and only the typedef is referenced, never the tag name.
 
 TYPEDEF_HIDES_STRUCT   = NO
 
-# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to 
-# determine which symbols to keep in memory and which to flush to disk. 
-# When the cache is full, less often used symbols will be written to disk. 
-# For small to medium size projects (<1000 input files) the default value is 
-# probably good enough. For larger projects a too small cache size can cause 
-# doxygen to be busy swapping symbols to and from disk most of the time 
-# causing a significant performance penality. 
-# If the system has enough physical memory increasing the cache will improve the 
-# performance by keeping more symbols in memory. Note that the value works on 
-# a logarithmic scale so increasing the size by one will rougly double the 
-# memory usage. The cache size is given by this formula: 
-# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, 
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
 # corresponding to a cache size of 2^16 = 65536 symbols
 
 SYMBOL_CACHE_SIZE      = 0
@@ -302,292 +302,292 @@ SYMBOL_CACHE_SIZE      = 0
 # Build related configuration options
 #---------------------------------------------------------------------------
 
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
-# documentation are documented, even if no documentation was available. 
-# Private class members and static file members will be hidden unless 
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
 # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
 
 EXTRACT_ALL            = YES
 
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
 # will be included in the documentation.
 
 EXTRACT_PRIVATE        = NO
 
-# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
 # will be included in the documentation.
 
 EXTRACT_STATIC         = NO
 
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
-# defined locally in source files will be included in the documentation. 
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
 # If set to NO only classes defined in header files are included.
 
 EXTRACT_LOCAL_CLASSES  = YES
 
-# This flag is only useful for Objective-C code. When set to YES local 
-# methods, which are defined in the implementation section but not in 
-# the interface are included in the documentation. 
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
 # If set to NO (the default) only methods in the interface are included.
 
 EXTRACT_LOCAL_METHODS  = NO
 
-# If this flag is set to YES, the members of anonymous namespaces will be 
-# extracted and appear in the documentation as a namespace called 
-# 'anonymous_namespace{file}', where file will be replaced with the base 
-# name of the file that contains the anonymous namespace. By default 
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
 # anonymous namespace are hidden.
 
 EXTRACT_ANON_NSPACES   = NO
 
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
-# undocumented members of documented classes, files or namespaces. 
-# If set to NO (the default) these members will be included in the 
-# various overviews, but no documentation section is generated. 
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
 # This option has no effect if EXTRACT_ALL is enabled.
 
 HIDE_UNDOC_MEMBERS     = NO
 
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
-# undocumented classes that are normally visible in the class hierarchy. 
-# If set to NO (the default) these classes will be included in the various 
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
 # overviews. This option has no effect if EXTRACT_ALL is enabled.
 
 HIDE_UNDOC_CLASSES     = NO
 
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
-# friend (class|struct|union) declarations. 
-# If set to NO (the default) these declarations will be included in the 
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
 # documentation.
 
 HIDE_FRIEND_COMPOUNDS  = NO
 
-# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
-# documentation blocks found inside the body of a function. 
-# If set to NO (the default) these blocks will be appended to the 
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
 # function's detailed documentation block.
 
 HIDE_IN_BODY_DOCS      = NO
 
-# The INTERNAL_DOCS tag determines if documentation 
-# that is typed after a \internal command is included. If the tag is set 
-# to NO (the default) then the documentation will be excluded. 
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
 # Set it to YES to include the internal documentation.
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
-# file names in lower-case letters. If set to YES upper-case letters are also 
-# allowed. This is useful if you have classes or files whose names only differ 
-# in case and if your file system supports case sensitive file names. Windows 
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
 # and Mac users are advised to set this option to NO.
 
 CASE_SENSE_NAMES       = NO
 
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
-# will show members with their full class and namespace scopes in the 
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
 # documentation. If set to YES the scope will be hidden.
 
 HIDE_SCOPE_NAMES       = YES
 
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
-# will put a list of the files that are included by a file in the documentation 
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
 # of that file.
 
 SHOW_INCLUDE_FILES     = YES
 
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
 # is inserted in the documentation for inline members.
 
 INLINE_INFO            = YES
 
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
-# will sort the (detailed) documentation of file and class members 
-# alphabetically by member name. If set to NO the members will appear in 
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
 # declaration order.
 
 SORT_MEMBER_DOCS       = YES
 
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
-# brief documentation of file, namespace and class members alphabetically 
-# by member name. If set to NO (the default) the members will appear in 
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
 # declaration order.
 
 SORT_BRIEF_DOCS        = NO
 
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
-# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
 # the group names will appear in their defined order.
 
 SORT_GROUP_NAMES       = NO
 
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
-# sorted by fully-qualified names, including namespaces. If set to 
-# NO (the default), the class list will be sorted only by class name, 
-# not including the namespace part. 
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
-# Note: This option applies only to the class list, not to the 
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
 # alphabetical list.
 
 SORT_BY_SCOPE_NAME     = NO
 
-# The GENERATE_TODOLIST tag can be used to enable (YES) or 
-# disable (NO) the todo list. This list is created by putting \todo 
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
 # commands in the documentation.
 
 GENERATE_TODOLIST      = YES
 
-# The GENERATE_TESTLIST tag can be used to enable (YES) or 
-# disable (NO) the test list. This list is created by putting \test 
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
 # commands in the documentation.
 
 GENERATE_TESTLIST      = YES
 
-# The GENERATE_BUGLIST tag can be used to enable (YES) or 
-# disable (NO) the bug list. This list is created by putting \bug 
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
 # commands in the documentation.
 
 GENERATE_BUGLIST       = YES
 
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
-# disable (NO) the deprecated list. This list is created by putting 
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
 # \deprecated commands in the documentation.
 
 GENERATE_DEPRECATEDLIST= YES
 
-# The ENABLED_SECTIONS tag can be used to enable conditional 
+# The ENABLED_SECTIONS tag can be used to enable conditional
 # documentation sections, marked by \if sectionname ... \endif.
 
-ENABLED_SECTIONS       = 
+ENABLED_SECTIONS       =
 
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
-# the initial value of a variable or define consists of for it to appear in 
-# the documentation. If the initializer consists of more lines than specified 
-# here it will be hidden. Use a value of 0 to hide initializers completely. 
-# The appearance of the initializer of individual variables and defines in the 
-# documentation can be controlled using \showinitializer or \hideinitializer 
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
 # command in the documentation regardless of this setting.
 
 MAX_INITIALIZER_LINES  = 30
 
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
-# at the bottom of the documentation of classes and structs. If set to YES the 
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
 # list will mention the files that were used to generate the documentation.
 
 SHOW_USED_FILES        = YES
 
-# If the sources in your project are distributed over multiple directories 
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
 # in the documentation. The default is NO.
 
 SHOW_DIRECTORIES       = NO
 
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
-# This will remove the Files entry from the Quick Index and from the 
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
 # Folder Tree View (if specified). The default is YES.
 
 SHOW_FILES             = YES
 
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the 
-# Namespaces page.  This will remove the Namespaces entry from the Quick Index 
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.  This will remove the Namespaces entry from the Quick Index
 # and from the Folder Tree View (if specified). The default is YES.
 
 SHOW_NAMESPACES        = YES
 
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
-# doxygen should invoke to get the current version for each file (typically from 
-# the version control system). Doxygen will invoke the program by executing (via 
-# popen()) the command <command> <input-file>, where <command> is the value of 
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
-# provided by doxygen. Whatever the program writes to standard output 
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
 # is used as the file version. See the manual for examples.
 
-FILE_VERSION_FILTER    = 
+FILE_VERSION_FILTER    =
 
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by 
-# doxygen. The layout file controls the global structure of the generated output files 
-# in an output format independent way. The create the layout file that represents 
-# doxygen's defaults, run doxygen with the -l option. You can optionally specify a 
-# file name after the option, if omitted DoxygenLayout.xml will be used as the name 
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by
+# doxygen. The layout file controls the global structure of the generated output files
+# in an output format independent way. To create the layout file that represents
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name
 # of the layout file.
 
-LAYOUT_FILE            = 
+LAYOUT_FILE            =
 
 #---------------------------------------------------------------------------
 # configuration options related to warning and progress messages
 #---------------------------------------------------------------------------
 
-# The QUIET tag can be used to turn on/off the messages that are generated 
+# The QUIET tag can be used to turn on/off the messages that are generated
 # by doxygen. Possible values are YES and NO. If left blank NO is used.
 
 QUIET                  = NO
 
-# The WARNINGS tag can be used to turn on/off the warning messages that are 
-# generated by doxygen. Possible values are YES and NO. If left blank 
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
 # NO is used.
 
 WARNINGS               = YES
 
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
 # automatically be disabled.
 
 WARN_IF_UNDOCUMENTED   = YES
 
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
-# potential errors in the documentation, such as not documenting some 
-# parameters in a documented function, or documenting parameters that 
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
 # don't exist or using markup commands wrongly.
 
 WARN_IF_DOC_ERROR      = YES
 
-# This WARN_NO_PARAMDOC option can be abled to get warnings for 
-# functions that are documented, but have no documentation for their parameters 
-# or return value. If set to NO (the default) doxygen will only warn about 
-# wrong or incomplete parameter documentation, but not about the absence of 
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
 # documentation.
 
 WARN_NO_PARAMDOC       = NO
 
-# The WARN_FORMAT tag determines the format of the warning messages that 
-# doxygen can produce. The string should contain the $file, $line, and $text 
-# tags, which will be replaced by the file and line number from which the 
-# warning originated and the warning text. Optionally the format may contain 
-# $version, which will be replaced by the version of the file (if it could 
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
 # be obtained via FILE_VERSION_FILTER)
 
 WARN_FORMAT            = "$file:$line: $text"
 
-# The WARN_LOGFILE tag can be used to specify a file to which warning 
-# and error messages should be written. If left blank the output is written 
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
 # to stderr.
 
-WARN_LOGFILE           = 
+WARN_LOGFILE           =
 
 #---------------------------------------------------------------------------
 # configuration options related to the input files
 #---------------------------------------------------------------------------
 
-# The INPUT tag can be used to specify the files and/or directories that contain 
-# documented source files. You may enter file names like "myfile.cpp" or 
-# directories like "/usr/src/myproject". Separate the files or directories 
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
 # with spaces.
 
-INPUT                  = ../../viennacl/
+INPUT                  = ../../viennacl
 
-# This tag can be used to specify the character encoding of the source files 
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
-# also the default input encoding. Doxygen uses libiconv (or the iconv built 
-# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
 # the list of possible encodings.
 
 INPUT_ENCODING         = UTF-8
 
-# If the value of the INPUT tag contains directories, you can use the 
-# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
-# and *.h) to filter out the source-files in the directories. If left 
-# blank the following patterns are tested: 
-# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
 # *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
 
 FILE_PATTERNS          = *.c \
@@ -622,87 +622,87 @@ FILE_PATTERNS          = *.c \
                          *.vhd \
                          *.vhdl
 
-# The RECURSIVE tag can be used to turn specify whether or not subdirectories 
-# should be searched for input files as well. Possible values are YES and NO. 
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
 # If left blank NO is used.
 
 RECURSIVE              = YES
 
-# The EXCLUDE tag can be used to specify files and/or directories that should 
-# excluded from the INPUT source files. This way you can easily exclude a 
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
 # subdirectory from a directory tree whose root is specified with the INPUT tag.
 
-EXCLUDE                = 
+EXCLUDE                =
 
-# The EXCLUDE_SYMLINKS tag can be used select whether or not files or 
-# directories that are symbolic links (a Unix filesystem feature) are excluded 
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
 # from the input.
 
 EXCLUDE_SYMLINKS       = NO
 
-# If the value of the INPUT tag contains directories, you can use the 
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
-# certain files from those directories. Note that the wildcards are matched 
-# against the file with absolute path, so to exclude all test directories 
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
 # for example use the pattern */test/*
 
-EXCLUDE_PATTERNS       = 
+EXCLUDE_PATTERNS       =
 
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
-# (namespaces, classes, functions, etc.) that should be excluded from the 
-# output. The symbol name can be a fully qualified name, a word, or if the 
-# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
 # AClass::ANamespace, ANamespace::*Test
 
-EXCLUDE_SYMBOLS        = 
+EXCLUDE_SYMBOLS        =
 
-# The EXAMPLE_PATH tag can be used to specify one or more files or 
-# directories that contain example code fragments that are included (see 
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
 # the \include command).
 
-EXAMPLE_PATH           = 
+EXAMPLE_PATH           =
 
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
-# and *.h) to filter out the source-files in the directories. If left 
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
 # blank all files are included.
 
 EXAMPLE_PATTERNS       = *
 
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
-# searched for input files to be used with the \include or \dontinclude 
-# commands irrespective of the value of the RECURSIVE tag. 
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
 # Possible values are YES and NO. If left blank NO is used.
 
 EXAMPLE_RECURSIVE      = NO
 
-# The IMAGE_PATH tag can be used to specify one or more files or 
-# directories that contain image that are included in the documentation (see 
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
 # the \image command).
 
-IMAGE_PATH             = 
+IMAGE_PATH             =
 
-# The INPUT_FILTER tag can be used to specify a program that doxygen should 
-# invoke to filter for each input file. Doxygen will invoke the filter program 
-# by executing (via popen()) the command <filter> <input-file>, where <filter> 
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
-# input file. Doxygen will then use the output that the filter program writes 
-# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be
 # ignored.
 
-INPUT_FILTER           = 
+INPUT_FILTER           =
 
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
-# basis.  Doxygen will compare the file name with each pattern and apply the 
-# filter if there is a match.  The filters are a list of the form: 
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
-# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.  Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.  The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
 # is applied to all files.
 
-FILTER_PATTERNS        = 
+FILTER_PATTERNS        =
 
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
-# INPUT_FILTER) will be used to filter the input files when producing source 
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
 # files to browse (i.e. when SOURCE_BROWSER is set to YES).
 
 FILTER_SOURCE_FILES    = NO
@@ -711,53 +711,53 @@ FILTER_SOURCE_FILES    = NO
 # configuration options related to source browsing
 #---------------------------------------------------------------------------
 
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
-# be generated. Documented entities will be cross-referenced with these sources. 
-# Note: To get rid of all source code in the generated output, make sure also 
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
 # VERBATIM_HEADERS is set to NO.
 
 SOURCE_BROWSER         = NO
 
-# Setting the INLINE_SOURCES tag to YES will include the body 
+# Setting the INLINE_SOURCES tag to YES will include the body
 # of functions and classes directly in the documentation.
 
 INLINE_SOURCES         = NO
 
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
-# doxygen to hide any special comment blocks from generated source code 
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
 # fragments. Normal C and C++ comments will always remain visible.
 
 STRIP_CODE_COMMENTS    = YES
 
-# If the REFERENCED_BY_RELATION tag is set to YES 
-# then for each documented function all documented 
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
 # functions referencing it will be listed.
 
 REFERENCED_BY_RELATION = NO
 
-# If the REFERENCES_RELATION tag is set to YES 
-# then for each documented function all documented entities 
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
 # called/used by that function will be listed.
 
 REFERENCES_RELATION    = NO
 
-# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) 
-# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from 
-# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will 
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
 # link to the source code.  Otherwise they will link to the documentation.
 
 REFERENCES_LINK_SOURCE = YES
 
-# If the USE_HTAGS tag is set to YES then the references to source code 
-# will point to the HTML generated by the htags(1) tool instead of doxygen 
-# built-in source browser. The htags tool is part of GNU's global source 
-# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
 # will need version 4.8.6 or higher.
 
 USE_HTAGS              = NO
 
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
-# will generate a verbatim copy of the header file for each class for 
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
 # which an include is specified. Set to NO to disable this.
 
 VERBATIM_HEADERS       = YES
@@ -766,238 +766,238 @@ VERBATIM_HEADERS       = YES
 # configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
 
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
-# of all compounds will be generated. Enable this if the project 
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
 # contains a lot of classes, structs, unions or interfaces.
 
 ALPHABETICAL_INDEX     = NO
 
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
 # in which this list will be split (can be a number in the range [1..20])
 
 COLS_IN_ALPHA_INDEX    = 5
 
-# In case all classes in a project start with a common prefix, all 
-# classes will be put under the same header in the alphabetical index. 
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
 # should be ignored while generating the index headers.
 
-IGNORE_PREFIX          = 
+IGNORE_PREFIX          =
 
 #---------------------------------------------------------------------------
 # configuration options related to the HTML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
 # generate HTML output.
 
 GENERATE_HTML          = YES
 
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
 # put in front of it. If left blank `html' will be used as the default path.
 
 HTML_OUTPUT            = html
 
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
 # doxygen will generate files with .html extension.
 
 HTML_FILE_EXTENSION    = .html
 
-# The HTML_HEADER tag can be used to specify a personal HTML header for 
-# each generated HTML page. If it is left blank doxygen will generate a 
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
 # standard header.
 
-HTML_HEADER            = 
+HTML_HEADER            =
 
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
-# each generated HTML page. If it is left blank doxygen will generate a 
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
 # standard footer.
 
-HTML_FOOTER            = 
+HTML_FOOTER            =
 
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
-# style sheet that is used by each HTML page. It can be used to 
-# fine-tune the look of the HTML output. If the tag is left blank doxygen 
-# will generate a default style sheet. Note that doxygen will try to copy 
-# the style sheet file to the HTML output directory, so don't put your own 
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
 # stylesheet in the HTML output directory as well, or it will be erased!
 
-HTML_STYLESHEET        = 
+HTML_STYLESHEET        =
 
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
-# files or namespaces will be aligned in HTML using tables. If set to 
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
 # NO a bullet list will be used.
 
 HTML_ALIGN_MEMBERS     = YES
 
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
-# documentation will contain sections that can be hidden and shown after the 
-# page has loaded. For this to work a browser that supports 
-# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox 
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox
 # Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
 
 HTML_DYNAMIC_SECTIONS  = NO
 
-# If the GENERATE_DOCSET tag is set to YES, additional index files 
-# will be generated that can be used as input for Apple's Xcode 3 
-# integrated development environment, introduced with OSX 10.5 (Leopard). 
-# To create a documentation set, doxygen will generate a Makefile in the 
-# HTML output directory. Running make will produce the docset in that 
-# directory and running "make install" will install the docset in 
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
-# it at startup. 
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
 # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
 
 GENERATE_DOCSET        = NO
 
-# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
-# feed. A documentation feed provides an umbrella under which multiple 
-# documentation sets from a single provider (such as a company or product suite) 
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
 # can be grouped.
 
 DOCSET_FEEDNAME        = "Doxygen generated docs"
 
-# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
-# should uniquely identify the documentation set bundle. This should be a 
-# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
 # will append .docset to the name.
 
 DOCSET_BUNDLE_ID       = org.doxygen.Project
 
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
-# will be generated that can be used as input for tools like the 
-# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
 # of the generated HTML documentation.
 
 GENERATE_HTMLHELP      = NO
 
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
-# be used to specify the file name of the resulting .chm file. You 
-# can add a path in front of the file if the result should not be 
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
 # written to the html output directory.
 
-CHM_FILE               = 
+CHM_FILE               =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
-# be used to specify the location (absolute path including file name) of 
-# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
 # the HTML help compiler on the generated index.hhp.
 
-HHC_LOCATION           = 
+HHC_LOCATION           =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
-# controls if a separate .chi index file is generated (YES) or that 
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
 # it should be included in the master .chm file (NO).
 
 GENERATE_CHI           = NO
 
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING 
-# is used to encode HtmlHelp index (hhk), content (hhc) and project file 
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
 # content.
 
-CHM_INDEX_ENCODING     = 
+CHM_INDEX_ENCODING     =
 
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
-# controls whether a binary table of contents is generated (YES) or a 
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
 # normal table of contents (NO) in the .chm file.
 
 BINARY_TOC             = NO
 
-# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
 # to the contents of the HTML help documentation and to the tree view.
 
 TOC_EXPAND             = NO
 
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER 
-# are set, an additional index file will be generated that can be used as input for 
-# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated 
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER
+# are set, an additional index file will be generated that can be used as input for
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated
 # HTML documentation.
 
 GENERATE_QHP           = NO
 
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can 
-# be used to specify the file name of the resulting .qch file. 
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
 # The path specified is relative to the HTML output folder.
 
-QCH_FILE               = 
+QCH_FILE               =
 
-# The QHP_NAMESPACE tag specifies the namespace to use when generating 
-# Qt Help Project output. For more information please see 
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
 # http://doc.trolltech.com/qthelpproject.html#namespace
 
-QHP_NAMESPACE          = 
+QHP_NAMESPACE          =
 
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating 
-# Qt Help Project output. For more information please see 
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
 # http://doc.trolltech.com/qthelpproject.html#virtual-folders
 
 QHP_VIRTUAL_FOLDER     = doc
 
-# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. 
-# For more information please see 
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add.
+# For more information please see
 # http://doc.trolltech.com/qthelpproject.html#custom-filters
 
-QHP_CUST_FILTER_NAME   = 
+QHP_CUST_FILTER_NAME   =
 
-# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the custom filter to add.For more information please see 
+# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see
 # <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
 
-QHP_CUST_FILTER_ATTRS  = 
+QHP_CUST_FILTER_ATTRS  =
 
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's 
-# filter section matches. 
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's
+# filter section matches.
 # <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
 
-QHP_SECT_FILTER_ATTRS  = 
+QHP_SECT_FILTER_ATTRS  =
 
-# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can 
-# be used to specify the location of Qt's qhelpgenerator. 
-# If non-empty doxygen will try to run qhelpgenerator on the generated 
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
 # .qhp file.
 
-QHG_LOCATION           = 
+QHG_LOCATION           =
 
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
-# top of each HTML page. The value NO (the default) enables the index and 
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
 # the value YES disables it.
 
 DISABLE_INDEX          = NO
 
-# This tag can be used to set the number of enum values (range [1..20]) 
+# This tag can be used to set the number of enum values (range [1..20])
 # that doxygen will group on one line in the generated HTML documentation.
 
 ENUM_VALUES_PER_LINE   = 4
 
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index 
-# structure should be generated to display hierarchical information. 
-# If the tag value is set to FRAME, a side panel will be generated 
-# containing a tree-like index structure (just like the one that 
-# is generated for HTML Help). For this to work a browser that supports 
-# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, 
-# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are 
-# probably better off using the HTML help feature. Other possible values 
-# for this tag are: HIERARCHIES, which will generate the Groups, Directories, 
-# and Class Hierarchy pages using a tree view instead of an ordered list; 
-# ALL, which combines the behavior of FRAME and HIERARCHIES; and NONE, which 
-# disables this behavior completely. For backwards compatibility with previous 
-# releases of Doxygen, the values YES and NO are equivalent to FRAME and NONE 
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to FRAME, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature. Other possible values
+# for this tag are: HIERARCHIES, which will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list;
+# ALL, which combines the behavior of FRAME and HIERARCHIES; and NONE, which
+# disables this behavior completely. For backwards compatibility with previous
+# releases of Doxygen, the values YES and NO are equivalent to FRAME and NONE
 # respectively.
 
 GENERATE_TREEVIEW      = NONE
 
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
-# used to set the initial width (in pixels) of the frame in which the tree 
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
 # is shown.
 
 TREEVIEW_WIDTH         = 250
 
-# Use this tag to change the font size of Latex formulas included 
-# as images in the HTML documentation. The default is 10. Note that 
-# when you change the font size after a successful doxygen run you need 
-# to manually remove any form_*.png images from the HTML output directory 
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
 # to force them to be regenerated.
 
 FORMULA_FONTSIZE       = 10
@@ -1006,74 +1006,74 @@ FORMULA_FONTSIZE       = 10
 # configuration options related to the LaTeX output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
 # generate Latex output.
 
 GENERATE_LATEX         = NO
 
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
 # put in front of it. If left blank `latex' will be used as the default path.
 
 LATEX_OUTPUT           = latex
 
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked. If left blank `latex' will be used as the default command name.
 
 LATEX_CMD_NAME         = latex
 
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
-# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
 # default command name.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
-# LaTeX documents. This may be useful for small projects and may help to 
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
 # save some trees in general.
 
 COMPACT_LATEX          = NO
 
-# The PAPER_TYPE tag can be used to set the paper type that is used 
-# by the printer. Possible values are: a4, a4wide, letter, legal and 
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
 # executive. If left blank a4wide will be used.
 
 PAPER_TYPE             = a4wide
 
-# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX 
+# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX
 # packages that should be included in the LaTeX output.
 
-EXTRA_PACKAGES         = 
+EXTRA_PACKAGES         =
 
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
-# the generated latex document. The header should contain everything until 
-# the first chapter. If it is left blank doxygen will generate a 
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
 # standard header. Notice: only use this tag if you know what you are doing!
 
-LATEX_HEADER           = 
+LATEX_HEADER           =
 
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
-# contain links (just like the HTML output) instead of page references 
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
 # This makes the output suitable for online browsing using a pdf viewer.
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
-# plain latex in the generated Makefile. Set this option to YES to get a 
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
 # higher quality PDF documentation.
 
 USE_PDFLATEX           = YES
 
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. 
-# command to the generated LaTeX files. This will instruct LaTeX to keep 
-# running if errors occur, instead of asking the user for help. 
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
 # This option is also used when generating formulas in HTML.
 
 LATEX_BATCHMODE        = NO
 
-# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
-# include the index chapters (such as File Index, Compound Index, etc.) 
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
 # in the output.
 
 LATEX_HIDE_INDICES     = NO
@@ -1082,68 +1082,68 @@ LATEX_HIDE_INDICES     = NO
 # configuration options related to the RTF output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 
-# The RTF output is optimized for Word 97 and may not look very pretty with 
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
 # other RTF readers or editors.
 
 GENERATE_RTF           = NO
 
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
 # put in front of it. If left blank `rtf' will be used as the default path.
 
 RTF_OUTPUT             = rtf
 
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
-# RTF documents. This may be useful for small projects and may help to 
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
 # save some trees in general.
 
 COMPACT_RTF            = NO
 
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
-# will contain hyperlink fields. The RTF file will 
-# contain links (just like the HTML output) instead of page references. 
-# This makes the output suitable for online browsing using WORD or other 
-# programs which support those fields. 
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
 # Note: wordpad (write) and others do not support links.
 
 RTF_HYPERLINKS         = NO
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's 
-# config file, i.e. a series of assignments. You only have to provide 
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
 # replacements, missing definitions are set to their default value.
 
-RTF_STYLESHEET_FILE    = 
+RTF_STYLESHEET_FILE    =
 
-# Set optional variables used in the generation of an rtf document. 
+# Set optional variables used in the generation of an rtf document.
 # Syntax is similar to doxygen's config file.
 
-RTF_EXTENSIONS_FILE    = 
+RTF_EXTENSIONS_FILE    =
 
 #---------------------------------------------------------------------------
 # configuration options related to the man page output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
 # generate man pages
 
 GENERATE_MAN           = NO
 
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
 # put in front of it. If left blank `man' will be used as the default path.
 
 MAN_OUTPUT             = man
 
-# The MAN_EXTENSION tag determines the extension that is added to 
+# The MAN_EXTENSION tag determines the extension that is added to
 # the generated man pages (default is the subroutine's section .3)
 
 MAN_EXTENSION          = .3
 
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
-# then it will generate one additional man file for each entity 
-# documented in the real man page(s). These additional files 
-# only source the real man page, but without them the man command 
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
 # would be unable to find the correct page. The default is NO.
 
 MAN_LINKS              = NO
@@ -1152,33 +1152,33 @@ MAN_LINKS              = NO
 # configuration options related to the XML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_XML tag is set to YES Doxygen will 
-# generate an XML file that captures the structure of 
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
 # the code including all documentation.
 
 GENERATE_XML           = NO
 
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
 # put in front of it. If left blank `xml' will be used as the default path.
 
 XML_OUTPUT             = xml
 
-# The XML_SCHEMA tag can be used to specify an XML schema, 
-# which can be used by a validating XML parser to check the 
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_SCHEMA             = 
+XML_SCHEMA             =
 
-# The XML_DTD tag can be used to specify an XML DTD, 
-# which can be used by a validating XML parser to check the 
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_DTD                = 
+XML_DTD                =
 
-# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
-# dump the program listings (including syntax highlighting 
-# and cross-referencing information) to the XML output. Note that 
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
 # enabling this will significantly increase the size of the XML output.
 
 XML_PROGRAMLISTING     = YES
@@ -1187,10 +1187,10 @@ XML_PROGRAMLISTING     = YES
 # configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
-# generate an AutoGen Definitions (see autogen.sf.net) file 
-# that captures the structure of the code including all 
-# documentation. Note that this feature is still experimental 
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
 # and incomplete at the moment.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -1199,334 +1199,334 @@ GENERATE_AUTOGEN_DEF   = NO
 # configuration options related to the Perl module output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
-# generate a Perl module file that captures the structure of 
-# the code including all documentation. Note that this 
-# feature is still experimental and incomplete at the 
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
 # moment.
 
 GENERATE_PERLMOD       = NO
 
-# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
-# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
 # to generate PDF and DVI output from the Perl module output.
 
 PERLMOD_LATEX          = NO
 
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
-# nicely formatted so it can be parsed by a human reader.  This is useful 
-# if you want to understand what is going on.  On the other hand, if this 
-# tag is set to NO the size of the Perl module output will be much smaller 
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.  This is useful
+# if you want to understand what is going on.  On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
 # and Perl will parse it just the same.
 
 PERLMOD_PRETTY         = YES
 
-# The names of the make variables in the generated doxyrules.make file 
-# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
-# This is useful so different doxyrules.make files included by the same 
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
 # Makefile don't overwrite each other's variables.
 
-PERLMOD_MAKEVAR_PREFIX = 
+PERLMOD_MAKEVAR_PREFIX =
 
 #---------------------------------------------------------------------------
-# Configuration options related to the preprocessor   
+# Configuration options related to the preprocessor
 #---------------------------------------------------------------------------
 
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
-# evaluate all C-preprocessor directives found in the sources and include 
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
 # files.
 
 ENABLE_PREPROCESSING   = YES
 
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
-# names in the source code. If set to NO (the default) only conditional 
-# compilation will be performed. Macro expansion can be done in a controlled 
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
 # way by setting EXPAND_ONLY_PREDEF to YES.
 
 MACRO_EXPANSION        = NO
 
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
-# then the macro expansion is limited to the macros specified with the 
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
 # PREDEFINED and EXPAND_AS_DEFINED tags.
 
 EXPAND_ONLY_PREDEF     = NO
 
-# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files 
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
 # in the INCLUDE_PATH (see below) will be search if a #include is found.
 
 SEARCH_INCLUDES        = YES
 
-# The INCLUDE_PATH tag can be used to specify one or more directories that 
-# contain include files that are not input files but should be processed by 
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
 # the preprocessor.
 
-INCLUDE_PATH           = 
+INCLUDE_PATH           =
 
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
-# patterns (like *.h and *.hpp) to filter out the header-files in the 
-# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
 # be used.
 
-INCLUDE_FILE_PATTERNS  = 
+INCLUDE_FILE_PATTERNS  =
 
-# The PREDEFINED tag can be used to specify one or more macro names that 
-# are defined before the preprocessor is started (similar to the -D option of 
-# gcc). The argument of the tag is a list of macros of the form: name 
-# or name=definition (no spaces). If the definition and the = are 
-# omitted =1 is assumed. To prevent a macro definition from being 
-# undefined via #undef or recursively expanded use the := operator 
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
 # instead of the = operator.
 
-PREDEFINED             = 
+PREDEFINED             =
 
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
-# this tag can be used to specify a list of macro names that should be expanded. 
-# The macro definition that is found in the sources will be used. 
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
 # Use the PREDEFINED tag if you want to use a different macro definition.
 
-EXPAND_AS_DEFINED      = 
+EXPAND_AS_DEFINED      =
 
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
-# doxygen's preprocessor will remove all function-like macros that are alone 
-# on a line, have an all uppercase name, and do not end with a semicolon. Such 
-# function macros are typically used for boiler-plate code, and will confuse 
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
 # the parser if not removed.
 
 SKIP_FUNCTION_MACROS   = YES
 
 #---------------------------------------------------------------------------
-# Configuration::additions related to external references   
+# Configuration::additions related to external references
 #---------------------------------------------------------------------------
 
-# The TAGFILES option can be used to specify one or more tagfiles. 
-# Optionally an initial location of the external documentation 
-# can be added for each tagfile. The format of a tag file without 
-# this location is as follows: 
-#   TAGFILES = file1 file2 ... 
-# Adding location for the tag files is done as follows: 
-#   TAGFILES = file1=loc1 "file2 = loc2" ... 
-# where "loc1" and "loc2" can be relative or absolute paths or 
-# URLs. If a location is present for each tag, the installdox tool 
-# does not have to be run to correct the links. 
-# Note that each tag file must have a unique name 
-# (where the name does NOT include the path) 
-# If a tag file is not located in the directory in which doxygen 
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#   TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#   TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
 # is run, you must also specify the path to the tagfile here.
 
-TAGFILES               = 
+TAGFILES               =
 
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
 # a tag file that is based on the input files it reads.
 
-GENERATE_TAGFILE       = 
+GENERATE_TAGFILE       =
 
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
-# in the class index. If set to NO only the inherited external classes 
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
 # will be listed.
 
 ALLEXTERNALS           = NO
 
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
-# in the modules index. If set to NO, only the current project's groups will 
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
 # be listed.
 
 EXTERNAL_GROUPS        = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script 
+# The PERL_PATH should be the absolute path and name of the perl script
 # interpreter (i.e. the result of `which perl').
 
 PERL_PATH              = /usr/bin/perl
 
 #---------------------------------------------------------------------------
-# Configuration options related to the dot tool   
+# Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
-# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
-# or super classes. Setting the tag to NO turns the diagrams off. Note that 
-# this option is superseded by the HAVE_DOT option below. This is only a 
-# fallback. It is recommended to install and use dot, since it yields more 
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
 # powerful graphs.
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc 
-# command. Doxygen will then run the mscgen tool (see 
-# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
-# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
 # default search path.
 
-MSCGEN_PATH            = 
+MSCGEN_PATH            =
 
-# If set to YES, the inheritance and collaboration graphs will hide 
-# inheritance and usage relations if the target is undocumented 
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
 # or is not a class.
 
 HIDE_UNDOC_RELATIONS   = YES
 
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
-# available from the path. This tool is part of Graphviz, a graph visualization 
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
 # have no effect if this option is set to NO (the default)
 
 HAVE_DOT               = NO
 
-# By default doxygen will write a font called FreeSans.ttf to the output 
-# directory and reference it in all dot files that doxygen generates. This 
-# font does not include all possible unicode characters however, so when you need 
-# these (or just want a differently looking font) you can specify the font name 
-# using DOT_FONTNAME. You need need to make sure dot is able to find the font, 
-# which can be done by putting it in a standard location or by setting the 
-# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory 
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
 # containing the font.
 
 DOT_FONTNAME           = FreeSans
 
-# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. 
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
 # The default size is 10pt.
 
 DOT_FONTSIZE           = 10
 
-# By default doxygen will tell dot to use the output directory to look for the 
-# FreeSans.ttf font (which doxygen will put there itself). If you specify a 
-# different font using DOT_FONTNAME you can set the path where dot 
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
 # can find it using this tag.
 
-DOT_FONTPATH           = 
+DOT_FONTPATH           =
 
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
-# will generate a graph for each documented class showing the direct and 
-# indirect inheritance relations. Setting this tag to YES will force the 
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
 # the CLASS_DIAGRAMS tag to NO.
 
 CLASS_GRAPH            = YES
 
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
-# will generate a graph for each documented class showing the direct and 
-# indirect implementation dependencies (inheritance, containment, and 
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
 # class references variables) of the class with other documented classes.
 
 COLLABORATION_GRAPH    = YES
 
-# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
 # will generate a graph for groups, showing the direct groups dependencies
 
 GROUP_GRAPHS           = YES
 
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
-# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
 # Language.
 
 UML_LOOK               = NO
 
-# If set to YES, the inheritance and collaboration graphs will show the 
+# If set to YES, the inheritance and collaboration graphs will show the
 # relations between templates and their instances.
 
 TEMPLATE_RELATIONS     = NO
 
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
-# tags are set to YES then doxygen will generate a graph for each documented 
-# file showing the direct and indirect include dependencies of the file with 
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
 # other documented files.
 
 INCLUDE_GRAPH          = YES
 
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
-# documented header file showing the documented files that directly or 
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
 # indirectly include this file.
 
 INCLUDED_BY_GRAPH      = YES
 
-# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
-# doxygen will generate a call dependency graph for every global function 
-# or class method. Note that enabling this option will significantly increase 
-# the time of a run. So in most cases it will be better to enable call graphs 
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
 # for selected functions only using the \callgraph command.
 
 CALL_GRAPH             = NO
 
-# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
-# doxygen will generate a caller dependency graph for every global function 
-# or class method. Note that enabling this option will significantly increase 
-# the time of a run. So in most cases it will be better to enable caller 
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
 # graphs for selected functions only using the \callergraph command.
 
 CALLER_GRAPH           = NO
 
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
 # will generate a graphical hierarchy of all classes instead of a textual one.
 
 GRAPHICAL_HIERARCHY    = YES
 
-# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
-# then doxygen will show the dependencies a directory has on other directories 
-# in a graphical way. The dependency relations are determined by the #include 
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
 # relations between the files in the directories.
 
 DIRECTORY_GRAPH        = YES
 
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
-# generated by dot. Possible values are png, jpg, or gif 
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
 # If left blank png will be used.
 
 DOT_IMAGE_FORMAT       = png
 
-# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
 # found. If left blank, it is assumed the dot tool can be found in the path.
 
-DOT_PATH               = 
+DOT_PATH               =
 
-# The DOTFILE_DIRS tag can be used to specify one or more directories that 
-# contain dot files that are included in the documentation (see the 
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
 # \dotfile command).
 
-DOTFILE_DIRS           = 
+DOTFILE_DIRS           =
 
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
-# nodes that will be shown in the graph. If the number of nodes in a graph 
-# becomes larger than this value, doxygen will truncate the graph, which is 
-# visualized by representing a node as a red box. Note that doxygen if the 
-# number of direct children of the root node in a graph is already larger than 
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that doxygen if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
 # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
 
 DOT_GRAPH_MAX_NODES    = 50
 
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
-# graphs generated by dot. A depth value of 3 means that only nodes reachable 
-# from the root by following a path via at most 3 edges will be shown. Nodes 
-# that lay further from the root node will be omitted. Note that setting this 
-# option to 1 or 2 may greatly reduce the computation time needed for large 
-# code bases. Also note that the size of a graph can be further restricted by 
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
 # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
-# background. This is disabled by default, because dot on Windows does not 
-# seem to support this out of the box. Warning: Depending on the platform used, 
-# enabling this option may lead to badly anti-aliased labels on the edges of 
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
 # a graph (i.e. they become hard to read).
 
 DOT_TRANSPARENT        = NO
 
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output 
-# files in one run (i.e. multiple -o and -T options on the command line). This 
-# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
 # support this, this feature is disabled by default.
 
 DOT_MULTI_TARGETS      = NO
 
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
-# generate a legend page explaining the meaning of the various boxes and 
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
 # arrows in the dot generated graphs.
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
-# remove the intermediate dot files that are used to generate 
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
 # the various graphs.
 
 DOT_CLEANUP            = YES
@@ -1535,7 +1535,7 @@ DOT_CLEANUP            = YES
 # Options related to the search engine
 #---------------------------------------------------------------------------
 
-# The SEARCHENGINE tag specifies whether or not a search engine should be 
+# The SEARCHENGINE tag specifies whether or not a search engine should be
 # used. If set to NO the values of all tags below this one will be ignored.
 
 SEARCHENGINE           = NO
diff --git a/doc/manual/additional-algorithms.tex b/doc/manual/additional-algorithms.tex
new file mode 100644
index 0000000..ecb839e
--- /dev/null
+++ b/doc/manual/additional-algorithms.tex
@@ -0,0 +1,221 @@
+\chapter{Additional Algorithms} \label{chap:additional-algorithms}
+The following algorithms are not yet mature enough to be considered core-functionality, and/or are available with the {\OpenCL} backend only.
+
+\section{Additional Iterative Solvers}
+The following iterative solvers are only available on selected computing backends.
+
+\subsection{Mixed-Precision Conjugate Gradients}
+A two-stage mixed-precision CG algorithm is available as follows:
+\begin{lstlisting}
+viennacl::linalg::mixed_precision_cg_tag   mixed_prec_cg_config;
+vcl_result = viennacl::linalg::solve(vcl_matrix,
+                                     vcl_rhs,
+                                     mixed_prec_cg_config);
+\end{lstlisting}
+As usual, the first parameter to the constructor of \lstinline|mixed_precision_cg_tag| is the relative tolerance for the residual, while the second parameter denotes the maximum number of solver iterations.
+The third parameter denotes the relative tolerance for the inner low-precision CG iterations and defaults to $0.01$.
+
+\TIP{Have a look at \lstinline|examples/benchmarks/solver.cpp| for an example.}
+
+\NOTE{A mixed-precision solver makes sense only if the matrix and right-hand-side vector are supplied in \lstinline|double| precision.}
+
+\NOTE{The mixed-precision solver is currently available with the {\OpenCL} compute backend only.}
+
+\section{Additional Preconditioners}
+In addition to the preconditioners discussed in Sec.~\ref{sec:preconditioner}, two more preconditioners are available with the {\OpenCL} backend and are described in the following.
+
+\subsection{Algebraic Multigrid}
+\NOTE{Algebraic Multigrid preconditioners are only available with the {\OpenCL} backend and are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
+be included in future releases!}
+
+\NOTE{Algebraic Multigrid preconditioners depend on {\ublas}.}
+
+Algebraic multigrid mimics the behavior of geometric multigrid on the algebraic level and is thus suited for black-box purposes, where only the system matrix
+and the right hand side vector are available \cite{trottenberg:multigrid}. Many different flavors of the individual multigrid ingredients exist
+\cite{yang:parallel-amg}, of which the most common ones are implemented in {\ViennaCL}.
+
+The two main ingredients of algebraic multigrid are a coarsening algorithm and an interpolation algorithm. The available coarsening methods are listed in
+Tab.~\ref{tab:amg-coarsening}.
+\begin{table}[tbp]
+\begin{center}
+\begin{tabular}{l|l}
+Description & {\ViennaCL} option constant \\
+\hline
+Classical Ruge-St\"uben (RS) & \lstinline|VIENNACL_AMG_COARSE_RS| \\
+One-Pass & \lstinline|VIENNACL_AMG_COARSE_ONEPASS| \\
+RS0 & \lstinline|VIENNACL_AMG_COARSE_RS0| \\
+RS3 & \lstinline|VIENNACL_AMG_COARSE_RS3| \\
+Aggregation & \lstinline|VIENNACL_AMG_COARSE_AG| \\
+Smoothed aggregation & \lstinline|VIENNACL_AMG_COARSE_SA| \\
+\end{tabular}
+\caption{AMG coarsening methods available in {\ViennaCL}. Per default, classical RS coarsening is used.\label{tab:amg-coarsening}}
+\end{center}
+\end{table}
+The available interpolation methods are given in Tab.~\ref{tab:amg-interpolation}.
+\begin{table}[tbp]
+\begin{center}
+\begin{tabular}{l|l}
+Description & {\ViennaCL} option constant \\
+\hline
+Direct & \lstinline|VIENNACL_AMG_INTERPOL_DIRECT| \\
+Classic & \lstinline|VIENNACL_AMG_INTERPOL_ONEPASS| \\
+RS0 coarsening & \lstinline|VIENNACL_AMG_INTERPOL_RS0| \\
+RS3 coarsening & \lstinline|VIENNACL_AMG_INTERPOL_RS3| \\
+\end{tabular}
+\caption{AMG interpolation methods available in {\ViennaCL}. Per default, direct interpolation is used.\label{tab:amg-interpolation}}
+\end{center}
+\end{table}
+In addition, the following parameters can be controlled in the \lstinline|amg_tag| and can be passed to the constructor:
+\begin{itemize}
+ \item Strength of dependence threshold (default: $0.25$)
+ \item Interpolation weight (default: $1$)
+ \item Jacobi smoother weight (default: $1$)
+ \item Number of pre-smoothing steps (default: $1$)
+ \item Number of post-smoothing steps (default: $1$)
+ \item Number of coarse levels
+\end{itemize}
+
+\TIP{Note that the efficiency of the various AMG flavors are typically highly problem-specific. Therefore, failure of one method for a particular problem does
+NOT imply that other coarsening or interpolation strategies will fail as well.}
+
+\subsection{Sparse Approximate Inverses}
+
+\NOTE{Sparse Approximate Inverse preconditioners are only available with the {\OpenCL} backend and are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
+be included in future releases!}
+
+\NOTE{Sparse Approximate Inverse preconditioners depend on {\ublas}.}
+
+An alternative construction of a preconditioner for a sparse system matrix $A$ is to compute a matrix $M$ with a prescribed sparsity pattern such that
+\begin{align}
+ \Vert AM - I \Vert_F \rightarrow \min \ ,
+\end{align}
+where $\Vert \cdot \Vert_F$ denotes the Frobenius norm.
+This is the basic idea of the sparse approximate inverse (SPAI) preconditioner. It becomes increasingly attractive because of its inherent high degree of
+parallelism, since the minimization problem can be solved independently for each column of $M$. {\ViennaCL} provides two preconditioners of
+this family: The first is the classical SPAI algorithm as described by Grote and Huckle \cite{grote:spai}, the second is the factored SPAI (FSPAI) for symmetric
+matrices as proposed by Huckle \cite{huckle:fspai}.
+
+SPAI can be employed for a CPU matrix \lstinline|M| of type \lstinline|MatrixType| as follows:
+\begin{lstlisting}
+// setup SPAI preconditioner, purely CPU-based
+viennacl::linalg::spai_precond<MatrixType>
+  spai_cpu(M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
+
+//solve (e.g. using stab. Bi-conjugate gradient solver)
+vcl_result = viennacl::linalg::solve(M,
+                                     rhs,
+                                     viennacl::linalg::bicgstab_tag(),
+                                     spai_cpu);
+\end{lstlisting}
+The first parameter denotes the residual norm threshold for the full matrix, the second parameter the maximum number of pattern updates, and the third
+parameter is the threshold for the residual of each minimization problem.
+
+For GPU-matrices, only parts of the setup phase are computed on the CPU, because compute-intensive tasks can be carried out on the GPU:
+\begin{lstlisting}
+// setup SPAI preconditioner, GPU-assisted
+viennacl::linalg::spai_precond<GPUMatrixType>
+  spai_gpu(vcl_matrix, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
+
+//solve (e.g. using conjugate gradient solver)
+vcl_result = viennacl::linalg::solve(vcl_matrix,
+                                     vcl_rhs,
+                                     viennacl::linalg::bicgstab_tag(),
+                                     spai_gpu);
+\end{lstlisting}
+The \lstinline|GPUMatrixType| is typically a \lstinline|viennacl::compressed_matrix| type.
+
+For symmetric matrices, FSPAI can be used with the conjugate gradient solver:
+\begin{lstlisting}
+viennacl::linalg::fspai_precond<MatrixType> fspai_cpu(M, viennacl::linalg::fspai_tag());
+
+//solve (e.g. using stab. Bi-conjugate gradient solver)
+vcl_result = viennacl::linalg::solve(M,
+                                     rhs,
+                                     viennacl::linalg::cg_tag(),
+                                     fspai_cpu);
+\end{lstlisting}
+Our experience is that FSPAI is typically more efficient than SPAI when applied to the same matrix, both in computational effort and in terms of convergence
+acceleration of the iterative solvers.
+
+\NOTE{At present, there is no GPU-accelerated FSPAI included in {\ViennaCL}.}
+
+Note that FSPAI depends on the ordering of the unknowns, thus bandwidth reduction algorithms may be employed first, cf.~Sec.~\ref{sec:bandwidth-reduction}.
+
+
+\section{Fast Fourier Transform}
+\NOTE{The fast Fourier transform is experimental in {\ViennaCLversion} and available with the {\OpenCL} backend only.
+      Interface changes as well as considerable performance improvements may be included in future releases!}
+
+Since there is no standardized complex type in {\OpenCL} at the time of the release of {\ViennaCLversion}, vectors need to be set up with real- and imaginary
+part before computing a fast Fourier transform (FFT). In order to store complex numbers $z_0$, $z_1$, etc.~in a \lstinline|viennacl::vector|, say \lstinline|v|,
+the real and imaginary parts are mapped to even and odd entries of \lstinline|v| respectively: \lstinline|v[0] = Real(z_0)|, \lstinline|v[1] = Imag(z_0)|,
+\lstinline|v[2] = Real(z_1)|, \lstinline|v[3] = Imag(z_1)|, etc.
+
+The FFT of \lstinline|v| can then be computed either by writing to a second vector \lstinline|output| or by directly writing the result to \lstinline|v|
+\begin{lstlisting}
+ viennacl::fft(v, output);
+ viennacl::inplace_fft(v);
+\end{lstlisting}
+Conversely, the inverse FFT is computed as
+\begin{lstlisting}
+ viennacl::ifft(v, output);
+ viennacl::inplace_ifft(v);
+\end{lstlisting}
+
+\NOTE{In {\ViennaCLversion} the FFT with complexity $N \log N$ is computed for vectors with a size of a power of two only. For other vector sizes, a standard
+discrete Fourier transform with complexity $N^2$ is employed. This is subject to change in future versions.}
+
+\section{Bandwidth Reduction} \label{sec:bandwidth-reduction}
+\NOTE{Bandwidth reduction algorithms are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
+be included in future releases!}
+
+The bandwidth of a sparse matrix is defined as the maximum difference of the indices of nonzero entries in a row, taken over all rows. A low bandwidth
+typically allows for the use of efficient banded matrix solvers instead of iterative solvers. Moreover, better cache utilization as well as lower fill-in in
+LU-factorization based algorithms can be expected.
+
+For a given sparse matrix with large bandwidth, {\ViennaCL} provides routines for renumbering the unknowns such that the reordered system matrix shows much
+smaller bandwidth. Typical applications stem from the discretization of partial differential equations by means of the finite element or the finite difference
+method. The algorithms employed are as follows:
+\begin{itemize}
+ \item Classical Cuthill-McKee algorithm \cite{cuthill:reducing-bandwidth}
+ \item Iterated Cuthill-McKee algorithm with different seeds \cite{cuthill:reducing-bandwidth}
+ \item Gibbs-Poole-Stockmeyer algorithm, cf.~\cite{lewis:gps-algorithm}
+\end{itemize}
+The iterated Cuthill-McKee algorithm applies the classical Cuthill-McKee algorithm to different starting nodes, each with small, but not necessarily minimal, degree as the root node.
+While this iterated application is more expensive in terms of execution time, it may lead to better results than the classical Cuthill-McKee algorithm.
+A parameter $a \in [0,1]$ controls the number of nodes considered: All nodes with degree $d$ fulfilling
+\begin{align*}
+ d_{\min} \leq d \leq d_{\min} + a(d_{\max} - d_{\min})
+\end{align*}
+are considered, where $d_{\min}$ and $d_{\max}$ are the minimum and maximum nodal degrees in the graph.
+A second parameter \lstinline|gmax| specifies the number of additional root nodes considered.
+
+The algorithms are called for a \lstinline|matrix| of a type compatible with \lstinline|std::vector< std::map<int, double> >| by
+\begin{lstlisting}
+ r = viennacl::reorder(matrix, viennacl::cuthill_mckee_tag());
+ r = viennacl::reorder(matrix,
+                       viennacl::advanced_cuthill_mckee_tag(a, gmax));
+ r = viennacl::reorder(matrix, viennacl::gibbs_poole_stockmeyer_tag());
+\end{lstlisting}
+and return the permutation array. In {\ViennaCLversion}, the user then needs to manually reorder the sparse matrix based on the permutation array. Example code
+can be found in \lstinline|examples/tutorial/bandwidth-reduction.cpp|.
+
+
+\section{Nonnegative Matrix Factorization}
+\NOTE{Nonnegative Matrix Factorization is experimental in {\ViennaCLversion} and available with the {\OpenCL} backend only.
+      Interface changes as well as considerable performance improvements may be included in future releases!}
+
+In various fields such as text mining, a matrix $V$ needs to be factored into factors $W$ and $H$ such that the function
+\begin{align*}
+ f(W, H) = \Vert V - WH \Vert_{\mathrm{F}}^2
+\end{align*}
+is minimized. The algorithm proposed by Lee and Seung \cite{lee:nmf} is available in ViennaCL in the header file \texttt{viennacl/linalg/nmf.hpp} as
+\begin{lstlisting}
+ viennacl::matrix<ScalarType> V(size1, size2);
+ viennacl::matrix<ScalarType> W(size1, k);
+ viennacl::matrix<ScalarType> H(k, size2);
+
+ viennacl::linalg::nmf_config conf;
+ viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, conf);
+\end{lstlisting}
+For an overview of the parameters (tolerances) of the configuration object \lstinline|conf|, please refer to the Doxygen documentation in \texttt{doc/doxygen/}.
diff --git a/doc/manual/algorithms.tex b/doc/manual/algorithms.tex
index b38a64a..4dba1a5 100644
--- a/doc/manual/algorithms.tex
+++ b/doc/manual/algorithms.tex
@@ -21,7 +21,7 @@ The focus of {\ViennaCL} is on iterative solvers, for which {\ViennaCL} provides
 
   //solution of a full system right into the load vector vcl_rhs:
   lu_factorize(vcl_matrix);
-  lu_substitute(vcl_matrix, vcl_rhs);    
+  lu_substitute(vcl_matrix, vcl_rhs);
 \end{lstlisting}
 In {\ViennaCLminorversion} there is no pivoting included in the LU factorization
 process, hence the computation may break down or yield results with poor
@@ -50,7 +50,7 @@ It is also possible to solve for multiple right hand sides:
 matrices, listed in Tab.~\ref{tab:linear-solvers}. Unlike direct solvers, the
 convergence of iterative solvers relies on certain properties of the system
 matrix. Keep in mind that an iterative solver may fail to converge, especially
-if the matrix is ill conditioned or a wrong solver is chosen. 
+if the matrix is ill conditioned or a wrong solver is chosen.
 
 \TIP{For full details on linear solver calls, refer to the reference
 documentation located in \texttt{doc/doxygen/} and to the tutorials}
@@ -68,18 +68,18 @@ viennacl::vector<float>  vcl_result;
 
 //solution using conjugate gradient solver:
 vcl_result = viennacl::linalg::solve(vcl_matrix,
-				     vcl_rhs,
-				     viennacl::linalg::cg_tag());
+             vcl_rhs,
+             viennacl::linalg::cg_tag());
 
 //solution using BiCGStab solver:
 vcl_result = viennacl::linalg::solve(vcl_matrix,
-				     vcl_rhs,
-				     viennacl::linalg::bicgstab_tag());
+             vcl_rhs,
+             viennacl::linalg::bicgstab_tag());
 
 //solution using GMRES solver:
 vcl_result = viennacl::linalg::solve(vcl_matrix,
-				     vcl_rhs,
-				     viennacl::linalg::gmres_tag());
+             vcl_rhs,
+             viennacl::linalg::gmres_tag());
 \end{lstlisting}
 
 \begin{table}[tb]
@@ -116,23 +116,39 @@ The BiCGStab solver tag can be customized in exactly the same way. The GMRES
 solver tag takes as third argument the dimension of the Krylov space. Thus, a
 tag for GMRES(30) with tolerance $1\mathrm{E}\!-\!10$ and at most $100$ total
 iterations
-(hence, up to three restarts) can be set up by 
+(hence, up to three restarts) can be set up by
 \begin{lstlisting}
 viennacl::linalg::gmres_tag custom_gmres(1e-10, 100, 30);
 \end{lstlisting}
 
-\section{Preconditioners}
+\section{Preconditioners} \label{sec:preconditioner}
 {\ViennaCL} ships with a generic implementation of several preconditioners.
 The preconditioner setup is expect for simple diagonal preconditioners always carried out on the CPU host due to the need for dynamically allocating memory.
 Thus, one may not obtain an overall performance benefit if too much time is spent on the preconditioner setup.
 
 \TIP{The preconditioner also works for {\ublas} types!}
 
+An overview of preconditioners available for the various sparse matrix types is as follows:
+\begin{center}
+ \begin{tabular}{|l|c|c|c|c|c|c|}
+  \hline
+  Matrix Type & ICHOL & (Block-)ILU[0/T] & Jacobi & Row-scaling & AMG & SPAI \\
+  \hline
+  \lstinline|compressed_matrix| & yes & yes & yes & yes & yes & yes \\
+  \lstinline|coordinate_matrix| & no & no & yes & yes & no & no \\
+  \lstinline|ell_matrix| & no & no & no & no & no & no \\
+  \lstinline|hyb_matrix| & no & no & no & no & no & no \\
+  \hline
+ \end{tabular}
+\end{center}
+Broader support of preconditioners particularly for \lstinline|ell_matrix| and \lstinline|hyb_matrix| is scheduled for future releases.
+AMG and SPAI preconditioners are described in Chap.~\ref{chap:additional-algorithms}.
+
+
 In the following it is assumed that the sparse linear system of equations is given as follows:
 \begin{lstlisting}
-using viennacl::compressed_matrix;
+typedef viennacl::compressed_matrix<float>   SparseMatrix;
 
-typedef compressed_matrix<float>   SparseMatrix;
 SparseMatrix  vcl_matrix;
 viennacl::vector<float>  vcl_rhs;
 viennacl::vector<float>  vcl_result;
@@ -161,13 +177,37 @@ viennacl::vector<float>  vcl_result;
 \subsection{Incomplete LU Factorization with Threshold (ILUT)}
 The incomplete LU factorization preconditioner aims at computing sparse matrices lower and upper triangular matrices $L$ and $U$ such that the sparse system
 matrix is approximately given by $A \approx LU$. In order to control the sparsity pattern of $L$ and $U$, a threshold strategy is used (ILUT)
-\cite{saad-iterative-solution}. Due to the serial nature of the preconditioner, the setup as well as each application of ILUT to the residual is computed on
-the CPU.
+\cite{saad-iterative-solution}. Due to the serial nature of the preconditioner, the setup of ILUT is always computed on
+the CPU using the respective ViennaCL backend.
 
 \begin{lstlisting}
 //compute ILUT preconditioner:
-ilut_precond< SparseMatrix > vcl_ilut(vcl_matrix,
-                                      viennacl::linalg::ilut_tag());
+viennacl::linalg::ilut_tag ilut_config;
+viennacl::linalg::ilut_precond< SparseMatrix > vcl_ilut(vcl_matrix,
+                                                        ilut_config);
+
+//solve (e.g. using conjugate gradient solver)
+vcl_result = viennacl::linalg::solve(vcl_matrix,
+                                     vcl_rhs,
+                                     viennacl::linalg::bicgstab_tag(),
+                                     vcl_ilut);   //preconditioner here
+\end{lstlisting}
+The triangular substitutions may be applied in parallel on GPUs by enabling \emph{level-scheduling} \cite{saad-iterative-solution} via the member function call \lstinline|use_level_scheduling(true)| in the \lstinline|ilut_config| object.
+
+Three parameters can be passed to the constructor of \lstinline|ilut_tag|: The first specifies the maximum number of entries per row in $L$ and $U$, while the
+second parameter specifies the drop tolerance. The third parameter is the boolean specifying whether level scheduling should be used.
+
+\TIP{The performance of level scheduling depends strongly on the matrix pattern and is thus disabled by default.}
+
+\subsection{Incomplete LU Factorization with Static Pattern (ILU0)}
+Similar to ILUT, ILU0 computes an approximate LU factorization with sparse factors L and U.
+While ILUT determines the location of nonzero entries on the fly, ILU0 uses the sparsity pattern of A for the sparsity pattern of L and U \cite{saad-iterative-solution}.
+Due to the serial nature of the preconditioner, the setup of ILU0 is computed on the CPU.
+\begin{lstlisting}
+//compute ILU0 preconditioner:
+viennacl::linalg::ilu0_tag ilu0_config;
+viennacl::linalg::ilu0_precond< SparseMatrix > vcl_ilut(vcl_matrix,
+                                                        ilu0_config);
 
 //solve (e.g. using conjugate gradient solver)
 vcl_result = viennacl::linalg::solve(vcl_matrix,
@@ -175,9 +215,35 @@ vcl_result = viennacl::linalg::solve(vcl_matrix,
                                      viennacl::linalg::bicgstab_tag(),
                                      vcl_ilut);   //preconditioner here
 \end{lstlisting}
-Two parameters can be passed to the constructor of \lstinline|ilut_tag|: The first specifies the maximum number of entries per row in $L$ and $U$, while the
-second parameter specifies the drop tolerance.
+The triangular substitutions may be applied in parallel on GPUs by enabling \emph{level-scheduling} \cite{saad-iterative-solution} via the member function call \lstinline|use_level_scheduling(true)| in the \lstinline|ilu0_config| object.
+
+One parameter can be passed to the constructor of \lstinline|ilu0_tag|, being the boolean specifying whether level scheduling should be used.
 
+\TIP{The performance of level scheduling depends strongly on the matrix pattern and is thus disabled by default.}
+
+\subsection{Block-ILU}
+To overcome the serial nature of ILUT and ILU0 applied to the full system matrix,
+a parallel variant is to apply ILU to diagonal blocks of the system matrix.
+This is accomplished by the \lstinline|block_ilu| preconditioner, which takes
+the system matrix type as first template argument and the respective ILU-tag type as second template argument
+(either \lstinline|ilut_tag| or \lstinline|ilu0_tag|). Support for accelerators using {\CUDA} or {\OpenCL} is provided.
+
+\begin{lstlisting}
+//compute block-ILU preconditioner using ILU0 for each block:
+block_ilu_precond<SparseMatrix,
+                  ilu0_tag> vcl_block_ilu0(vcl_matrix,
+                                           ilu0_tag());
+
+//solve
+vcl_result = viennacl::linalg::solve(vcl_matrix,
+                                     vcl_rhs,
+                                     viennacl::linalg::bicgstab_tag(),
+                                     vcl_block_ilu0);
+\end{lstlisting}
+A third argument can be passed to the constructor of \lstinline|block_ilu_precond|:
+Either the number of blocks to be used (defaults to $8$), or an index vector with fine-grained control over the blocks. Refer to the Doxygen pages in doc/doxygen for details.
+
+\TIP{The number of blocks is a design parameter for your sparse linear system at hand. Higher number of blocks leads to better memory bandwidth utilization on GPUs, but may increase the number of solver iterations.}
 
 \subsection{Jacobi Preconditioner}
 A Jacobi preconditioner is a simple diagonal preconditioner given by the reciprocals of the diagonal entries of the system matrix $A$.
@@ -212,186 +278,63 @@ vcl_result = viennacl::linalg::solve(vcl_matrix,
 The tag \lstinline|viennacl::linalg::row_scaling_tag()| can be supplied with a parameter denoting the norm to be used. A value of \lstinline|1| specifies the
 $l^1$-norm, while a value of $2$ selects the $l^2$-norm (default).
 
-\subsection{Algebraic Multigrid}
-\NOTE{Algebraic Multigrid preconditioners are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
-
-\NOTE{Algebraic Multigrid preconditioners depend on {\ublas}.}
-
-Algebraic multigrid mimics the behavior of geometric multigrid on the algebraic level and is thus suited for black-box purposes, where only the system matrix
-and the right hand side vector are available \cite{trottenberg:multigrid}. Many different flavors of the individual multigrid ingredients exists
-\cite{yang:parallel-amg}, of which the most common ones are implemented in {\ViennaCL}.
 
-The two main ingredients of algebraic multigrid are a coarsening algorithm and an interpolation algorithm. The available coarsening methods are listed in
-Tab.~\ref{tab:amg-coarsening}.
-\begin{table}[tbp]
-\begin{center}
-\begin{tabular}{l|l}
-Description & {\ViennaCL} option constant \\
-\hline
-Classical Ruge-St\"uben (RS) & \lstinline|VIENNACL_AMG_COARSE_RS| \\
-One-Pass & \lstinline|VIENNACL_AMG_COARSE_ONEPASS| \\
-RS0 & \lstinline|VIENNACL_AMG_COARSE_RS0| \\
-RS3 & \lstinline|VIENNACL_AMG_COARSE_RS3| \\
-Aggregation & \lstinline|VIENNACL_AMG_COARSE_AG| \\
-Smoothed aggregation & \lstinline|VIENNACL_AMG_COARSE_SA| \\
-\end{tabular}
-\caption{AMG coarsening methods available in {\ViennaCL}. Per default, classical RS coarsening is used.\label{tab:amg-coarsening}}
-\end{center}
-\end{table}
-The available interpolation methods are given in Tab.~\ref{tab:amg-interpolation}.
-\begin{table}[tbp]
-\begin{center}
-\begin{tabular}{l|l}
-Description & {\ViennaCL} option constant \\
-\hline
-Direct & \lstinline|VIENNACL_AMG_INTERPOL_DIRECT| \\
-Classic & \lstinline|VIENNACL_AMG_INTERPOL_ONEPASS| \\
-RS0 coarsening & \lstinline|VIENNACL_AMG_INTERPOL_RS0| \\
-RS3 coarsening & \lstinline|VIENNACL_AMG_INTERPOL_RS3| \\
-\end{tabular}
-\caption{AMG interpolation methods available in {\ViennaCL}. Per default, direct interpolation is used.\label{tab:amg-interpolation}}
-\end{center}
-\end{table}
-In addition, the following parameters can be controlled in the \lstinline|amg_tag| and can be passed to the constructor:
+\section{Eigenvalue Computations}
+%{\ViennaCL}
+Two algorithms for the computations of the eigenvalues of a matrix $A$ are implemented in {\ViennaCL}:
 \begin{itemize}
- \item Strength of dependence threshold (default: $0.25$)
- \item Interpolation weight (default: $1$)
- \item Jacobi smoother weight (default: $1$)
- \item Number of pre-smoothing steps (default: $1$)
- \item Number of post-smoothing steps (default: $1$)
- \item Number of coarse levels
+\item The Power Iteration \cite{golub:matrix-computations}
+\item The Lanczos Algorithm \cite{simon:lanczos-pro}
 \end{itemize}
+Depending on the parameter \lstinline|tag| either one of them is called.
+Both algorithms can be used for either {\ublas} or {\ViennaCL} compressed matrices.\\
+In order to obtain the eigenvalue with the largest absolute value, the power iteration should be called. \\
+The Lanczos algorithm returns a vector of the largest eigenvalues with the same type as the entries of the matrix.
 
-\TIP{Note that the efficiency of the various AMG flavors are typically highly problem-specific. Therefore, failure of one method for a particular problem does
-NOT imply that other coarsening or interpolation strategies will fail as well.}
-
-\subsection{Sparse Approximate Inverses}
-
-\NOTE{Sparse Approximate Inverse preconditioners are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
-
-\NOTE{Sparse Approximate Inverse preconditioners depend on {\ublas}.}
-
-An alternative construction of a preconditioner for a sparse system matrix $A$ is to compute a matrix $M$ with a prescribed sparsity pattern such that
-\begin{align}
- \Vert AM - I \Vert_F \rightarrow \min \ ,
-\end{align}
-where $\Vert \cdot \Vert_F$ denotes the Frobenius norm.
-This is the basic idea of sparse approximate inverse (SPAI) preconditioner. It becomes increasingly attractive because of their inherent high degree of
-parallelism, since the minimization problem can be solved independently for each column of $M$. {\ViennaCL} provides two preconditioners of
-this family: The first is the classical SPAI algorithm as described by Grote and Huckle \cite{grote:spai}, the second is the factored SPAI (FSPAI) for symmetric
-matrices as proposed by Huckle \cite{huckle:fspai}.
-
-SPAI can be employed for a CPU matrix \lstinline|M| of type \lstinline|MatrixType| as follows:
-\begin{lstlisting}
-// setup SPAI preconditioner, purely CPU-based
-viennacl::linalg::spai_precond<MatrixType> 
-  spai_cpu(M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
-
-//solve (e.g. using stab. Bi-conjugate gradient solver)
-vcl_result = viennacl::linalg::solve(M,
-                                     rhs,
-                                     viennacl::linalg::bicgstab_tag(),
-                                     spai_cpu);
-\end{lstlisting}
-The first parameter denotes the residual norm threshold for the full matrix, the second parameter the maximum number of pattern updates, and the third
-parameter is the threshold for the residual of each minimization problem.
-
-For GPU-matrices, only parts of the setup phase are computed on the CPU, because compute-intensive tasks can be carried out on the GPU:
-\begin{lstlisting}
-// setup SPAI preconditioner, GPU-assisted
-viennacl::linalg::spai_precond<GPUMatrixType> 
-  spai_gpu(vcl_matrix, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
-
-//solve (e.g. using conjugate gradient solver)
-vcl_result = viennacl::linalg::solve(vcl_matrix,
-                                     vcl_rhs,
-                                     viennacl::linalg::bicgstab_tag(),
-                                     spai_gpu);
-\end{lstlisting}
-The \lstinline|GPUMatrixType| is typically a \lstinline|viennacl::compressed_matrix| type.
-
-For symmetric matrices, FSPAI can be used with the conjugate gradient solver:
+The algorithms are called for a matrix object \lstinline|A| by
 \begin{lstlisting}
-viennacl::linalg::fspai_precond<MatrixType> fspai_cpu(M, viennacl::linalg::fspai_tag());
-
-//solve (e.g. using stab. Bi-conjugate gradient solver)
-vcl_result = viennacl::linalg::solve(M,
-                                     rhs,
-                                     viennacl::linalg::cg_tag(),
-                                     fspai_cpu);
+std::vector<double> largest_eigenvalues = viennacl::linalg::eig(A, ltag);
+double largest_eigenvalue = viennacl::linalg::eig(A, ptag);
 \end{lstlisting}
-Our experience is that FSPAI is typically more efficient than SPAI when applied to the same matrix, both in computational effort and in terms of convergence
-acceleration of the iterative solvers. 
-
-\NOTE{At present, there is no GPU-accelerated FSPAI included in {\ViennaCL}.}
-
-Note that FSPAI depends on the ordering of the unknowns, thus bandwidth reduction algorithms may be employed first, cf.~Sec.~\ref{sec:bandwidth-reduction}.
-
-
-\section{Fast Fourier Transform}
-\NOTE{The fast Fourier transform is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
 
-Since there is no standardized complex type in {\OpenCL} at the time of the release of {\ViennaCLversion}, vectors need to be set up with real- and imaginary
-part before computing a fast Fourier tranform (FFT). In order to store complex numbers $z_0$, $z_1$, etc.~in a \lstinline|viennacl::vector|, say \lstinline|v|,
-the real and imaginary parts are mapped to even and odd entries of \lstinline|v| respectively: \lstinline|v[0] = Real(z_0)|, \lstinline|v[1] = Imag(z_0)|,
-\lstinline|v[2] = Real(z_1)|, \lstinline|v[3] = Imag(z_1)|, etc.
 
-The FFT of \lstinline|v| can then be computed either by writing to a second vector \lstinline|output| or by directly writing the result to \lstinline|v|
+\subsection{Power Iteration}
+The power iteration aims at computing the eigenvalues of a matrix by repeatedly calculating the product of the matrix and a vector, where the resulting vector is used for the next matrix-vector product and so on. The computation stops as soon as the norm of the vector converges. \\
+The final vector is the eigenvector corresponding to the eigenvalue with the largest absolute value.\\
+To call this algorithm, \lstinline|piter_tag| must be used.
+This tag has only one parameter: \\ \lstinline|terminationfactor| defines the accuracy of the computation, i.e. if the new norm of the eigenvector changes less than this parameter the computation stops and returns the corresponding eigenvalue (default: $1e-10$).\\
+The call of the constructor may look like the following:
 \begin{lstlisting}
- viennacl::fft(v, output);
- viennacl::inplace_fft(v);
-\end{lstlisting}
-Conversely, the inverse FFT is computed as
-\begin{lstlisting}
- viennacl::ifft(v, output);
- viennacl::inplace_ifft(v);
+viennacl::linalg::piter_tag ptag(1e-8);
 \end{lstlisting}
 
-\NOTE{In {\ViennaCLversion} the FFT with complexity $N \log N$ is computed for vectors with a size of a power of two only. For other vector sizes, a standard
-discrete Fourier transform with complexity $N^2$ is employed. This is subject to change in future versions.}
+\TIP{Example code can be found in \lstinline|examples/tutorial/power-iter.cpp|.}
 
-\section{Bandwidth Reduction} \label{sec:bandwidth-reduction}
-\NOTE{Bandwidth reduction algorithms are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
+\subsection{The Lanczos Algorithm}
+In order to compute the eigenvalues of a sparse high-dimensional matrix, the Lanczos algorithm can be used.
+This algorithm reformulates the given high-dimensional matrix in such a way that it can be rewritten as a tridiagonal matrix of much lower dimension.
+The eigenvalues of this tridiagonal matrix are equal to the largest eigenvalues of the original matrix. \\
+The eigenvalues of the tridiagonal matrix are calculated by using the bisection method \cite{golub:matrix-computations}. \\
+To call this Lanczos algorithm, \lstinline|lanczos_tag| must be used.
+This tag has several parameters that can be passed to the constructor:
 
-The bandwidth of a sparse matrix is defined as the maximum difference of the indices of nonzero entries in a row, taken over all rows. A low bandwidth
-typically allows for the use of efficient banded matrix solvers instead of iterative solvers. Moreover, better cache utilization as well as lower fill-in in
-LU-factorization based algorithms can be expected.
-
-For a given sparse matrix with large bandwidth, {\ViennaCL} provides routines for renumbering the unknowns such that the reordered system matrix shows much
-smaller bandwidth. Typical applications stem from the discretization of partial differential equations by means of the finite element or the finite difference
-method. The algorithms employed are as follows:
 \begin{itemize}
- \item Classical Cuthill-McKee algorithm \cite{cuthill:reducing-bandwidth}
- \item Modified Cuthill-McKee algorithm \cite{cuthill:reducing-bandwidth}
- \item Gibbs-Poole-Stockmeyer algorithm, cf.~\cite{lewis:gps-algorithm}
+ \item The exponent of epsilon for the tolerance of the reorthogonalization, defined by the parameter \lstinline|factor| (default: $0.75$)
+ \item The method of the Lanczos algorithm: $0$ uses partial reorthogonalization, $1$ full reorthogonalization and $2$ does not use reorthogonalization (default: $0$)
+ \item The number of eigenvalues that are returned is specified by \lstinline|num_eigenvalues| (default: $10$)
+ \item The size of the Krylov space used for the computations can be set by the parameter \lstinline|krylov_size| (default: $100$). The maximum number of iterations can be equal to or less than this parameter
 \end{itemize}
-The modified Cuthill-McKee algorithm also takes nodes with small, but not necessarily minimal degree as root node into account and may lead to better results
-than the classical Cuthill-McKee algorithm. A parameter $a \in [0,1]$ controls the number of nodes considered: All nodes with degree $d$ fulfilling
-\begin{align*}
- d_{\min} \leq d \leq d_{\min} + a(d_{\max} - d_{\min})
-\end{align*}
-are considered, where $d_{\min}$ and $d_{\max}$ are the miminum and maximum nodal degrees in the graph. A second parameter \lstinline|gmax| specifies the
-number of additional root nodes considered.
-
-The algorithms are called for a \lstinline|matrix| of a type compatible with \lstinline|std::vector< std::map<int, double> >| by
+The call of the constructor may look like the following:
 \begin{lstlisting}
- r = viennacl::reorder(matrix, viennacl::cuthill_mckee_tag());
- r = viennacl::reorder(matrix, 
-                       viennacl::advanced_cuthill_mckee_tag(a, gmax));
- r = viennacl::reorder(matrix, viennacl::gibbs_poole_stockmeyer_tag());
+viennacl::linalg::lanczos_tag ltag(0.85, 15, 0, 200);
 \end{lstlisting}
-and return the permutation array. In {\ViennaCLversion}, the user then needs to manually reorder the sparse matrix based on the permutation array. Example code
-can be found in \lstinline|examples/tutorial/bandwidth-reduction.cpp|.
 
+\TIP{Example code can be found in \lstinline|examples/tutorial/lanczos.cpp|.}
 
 
 \section{QR Factorization}
-\NOTE{The QR factorization is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
+
+\NOTE{The current QR factorization implementation depends on {\ublas}.}
 
 A matrix $A \in \mathbb{R}^{n\times m}$ can be factored into $A = Q R$, where $Q \in \mathbb{R}^{n\times n}$ is an
 orthogonal matrix and $R \in \mathbb{R}^{n \times m}$ is upper triangular. This so-called QR-factorization is important for eigenvalue computations as well as
@@ -400,20 +343,25 @@ Householder reflections in file \lstinline|viennacl/linalg/qr.hpp|. An example a
 
 The Householder reflectors $v_i$ defining the Householder reflection $I - \beta_i v_i v_i^{\mathrm{T}}$ are stored in the
 columns below the diagonal of the input matrix $A$ \cite{golub:matrix-computations}. The normalization coefficients $\beta_i$ are returned by the
-worker function \lstinline|inplace_qr|. The upper triangular matrix $R$ is directly written to the upper triangular part of $A$. 
+worker function \lstinline|inplace_qr|. The upper triangular matrix $R$ is directly written to the upper triangular part of $A$.
 \begin{lstlisting}
   std::vector<ScalarType> betas = viennacl::linalg::inplace_qr(A, 12);
 \end{lstlisting}
-If $A$ is a dense matrix from \ublas, the calculation is carried out on the CPU using a single thread. If $A$ is a 
+If $A$ is a dense matrix from \ublas, the calculation is carried out on the CPU using a single thread. If $A$ is a
 \lstinline|viennacl::matrix|, a hybrid implementation is used: The panel factorization is carried out using \ublas, while expensive BLAS level 3 operations
-are computed on the OpenCL device using multiple threads. 
-
-\NOTE{The number of columns of the input matrix must be a multiple of the block size in {\ViennaCLversion}.}
+are computed on the OpenCL device using multiple threads.
 
 Typically, the orthogonal matrix $Q$ is kept in implicit form for reasons of computational efficiency.
 However, if $Q$ and $R$ have to be computed explicitly, the function \lstinline|recoverQ| can be used:
 \begin{lstlisting}
-  viennacl::linalg::recoverQ(A, betas, Q, R); 
+  viennacl::linalg::recoverQ(A, betas, Q, R);
 \end{lstlisting}
 Here, \lstinline|A| is the inplace QR-factored matrix, \lstinline|betas| are the coefficients of the Householder reflectors as returned by
-\lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices.
+\lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices. However, the explicit formation of $Q$ is expensive and is usually avoided.
+For a number of applications of the QR factorization it is required to apply $Q^T$ to a vector $b$. This is accomplished by
+\begin{lstlisting}
+ viennacl::linalg::inplace_qr_apply_trans_Q(A, betas, b);
+\end{lstlisting}
+without setting up $Q$ (or $Q^T$) explicitly.
+
+\TIP{Have a look at \lstinline|examples/tutorial/least-squares.cpp| for a least-squares computation using QR factorizations.}
diff --git a/doc/manual/benchmarks.tex b/doc/manual/benchmarks.tex
index bb96358..f02eed3 100644
--- a/doc/manual/benchmarks.tex
+++ b/doc/manual/benchmarks.tex
@@ -1,6 +1,6 @@
 
 \chapter{Benchmark Results}
-We have compared the performance gain of {\ViennaCL} with standard CPU implementations using a single core. The code used for the benchmarks can be found in the folder \texttt{examples/benchmark/} within the source-release of {\ViennaCL}. Results are grouped by computational complexity and can be found in the subsequent sections. 
+We have compared the performance gain of {\ViennaCL} with standard CPU implementations using a single core. The code used for the benchmarks can be found in the folder \texttt{examples/benchmark/} within the source-release of {\ViennaCL}. Results are grouped by computational complexity and can be found in the subsequent sections.
 
 \begin{center}
 \begin{tabular}{|l|l|}
diff --git a/doc/manual/changelogs.tex b/doc/manual/changelogs.tex
index 2eae9a8..1daaafe 100644
--- a/doc/manual/changelogs.tex
+++ b/doc/manual/changelogs.tex
@@ -1,8 +1,182 @@
 
-\chapter*{Change Logs} \addcontentsline{toc}{chapter}{Change Logs}
+\chapter{Change Logs} %\addcontentsline{toc}{chapter}{Change Logs}
+
+\section*{Version 1.5.x}
+
+\subsection*{Version 1.5.1}
+This maintenance release fixes a few nasty bugs:
+\begin{itemize}
+ \item Fixed a memory leak in the OpenCL kernel generator. Thanks to GitHub user dxyzab for spotting this.
+ \item Added compatibility of the mixed precision CG implementation with older AMD GPUs. Thanks to Andreas Rost for the input.
+ \item Fixed an error when running the QR factorization for matrices with less rows than columns. Thanks to Karol Polko for reporting.
+ \item Readded accidentally removed chapters on additional algorithms and structured matrices to the manual. Thanks to Sajjadul Islam for the hint.
+ \item Fixed buggy OpenCL kernels for matrix additions and subtractions for column-major matrices. Thanks to Tom Nicholson for reporting.
+ \item Fixed an invalid default kernel parameter set for matrix-matrix multiplications on CPUs when using the OpenCL backend. Thanks again to Tom Nicholson.
+ \item Corrected a weak check used in two tests. Thanks to Walter Mascarenhas for providing a fix.
+ \item Fixed a wrong global work size inside the SPAI preconditioner. Thanks to Andreas Rost.
+\end{itemize}
+
+\subsection*{Version 1.5.0}
+This new minor release number update focuses on a more powerful API, and on first steps in making ViennaCL more accessible from languages other than C++.
+In addition to many internal improvements both in terms of performance and flexibility, the following changes are visible to users:
+\begin{itemize}
+ \item API-change: User-provided OpenCL kernels extract their kernels automatically. A call to \lstinline|add_kernel()| is now obsolete, hence the function was removed.
+ \item API-change: Device class has been extended and supports all information defined in the OpenCL 1.1 standard through member functions. Duplicate \lstinline|compute_units()| and \lstinline|max_work_group_size()| have been removed (thanks to Shantanu Agarwal for the input).
+ \item API-change: \lstinline|viennacl::copy()| from a ViennaCL object to an object of non-ViennaCL type no longer tries to resize the object accordingly. An assertion is thrown if the sizes are incorrect in order to provide a consistent behavior across many different types.
+ \item Datastructure change: Vectors and matrices are now padded with zeros by default, resulting in higher performance particularly for matrix operations. This padding needs to be taken into account when using \lstinline|fast_copy()|, particularly for matrices.
+ \item Fixed problems with CUDA and CMake+CUDA on Visual Studio.
+ \item \lstinline|coordinate_matrix<>| now also behaves correctly for tiny matrix dimensions.
+ \item CMake 2.6 as new minimum requirement instead of CMake 2.8.
+ \item Vectors and matrices can be instantiated with integer template types (long, int, short, char).
+ \item Added support for \lstinline|element_prod()| and \lstinline|element_div()| for dense matrices.
+ \item Added \lstinline|element_pow()| for vectors and matrices.
+ \item Added \lstinline|norm_frobenius()| for computing the Frobenius norm of dense matrices.
+ \item Added unary element-wise operations for vectors and dense matrices: \lstinline|element_sin()|, \lstinline|element_sqrt()|, etc.
+ \item Multiple OpenCL contexts can now be used in a multi-threaded setting (one thread per context).
+ \item Multiple inner products with a common vector can now be computed efficiently via e.g.~\lstinline|inner_prod(x, tie(y, z));|
+ \item Added support for \lstinline|prod(A, B)|, where \lstinline|A| is a sparse matrix type and \lstinline|B| is a dense matrix (thanks to Albert Zaharovits for providing parts of the implementation).
+ \item Added \lstinline|diag()| function for extracting the diagonal of a vector to a matrix, or for generating a square matrix from a vector with the vector elements on a diagonal (similar to MATLAB).
+ \item Added \lstinline|row()| and \lstinline|column()| functions for extracting a certain row or column of a matrix to a vector.
+ \item Sparse matrix-vector products now also work with vector strides and ranges.
+ \item Added \lstinline|async_copy()| for vectors to allow for a better overlap of computation and communication.
+ \item Added \lstinline|compressed_compressed_matrix| type for the efficient representation of CSR matrices with only few nonzero rows.
+ \item Added possibility to switch command queues in OpenCL contexts.
+ \item Improved performance of Block-ILU by removing one spurious conversion step.
+ \item Improved performance of Cuthill-McKee algorithm by about 40 percent.
+ \item Improved performance of power iteration by avoiding the creation of temporaries in each step.
+ \item Removed spurious status message to cout in matrix market reader and nonnegative matrix factorization.
+ \item The OpenCL kernel launch logic no longer attempts to re-launch the kernel with smaller work sizes if an error is encountered (thanks to Peter Burka for pointing this out).
+ \item Reduced overhead for lengthy expressions involving temporaries (at the cost of increased compilation times).
+ \item \lstinline|vector| and \lstinline|matrix| are now padded to dimensions being multiples of 128 per default. This greatly improves GEMM performance for arbitrary sizes.
+ \item Loop indices for OpenMP parallelization are now all signed, increasing compatibility with older OpenMP implementations (thanks to Mrinal Deo for the hint).
+ \item Complete rewrite of the generator. Now uses the scheduler for specifying the operation. Includes a full device database for portable high performance of GEMM kernels.
+ \item Added micro-scheduler for attaching the OpenCL kernel generator to the user API.
+ \item Certain BLAS functionality in ViennaCL is now also available through a shared library (libviennacl).
+ \item Removed the external kernel parameter tuning facility, which is to be replaced by an internal device database through the kernel generator.
+ \item Completely eliminated the OpenCL kernel conversion step in the developer repository and the source-release. One can now use the developer version without the need for a Boost installation.
+\end{itemize}
+
+
+\section*{Version 1.4.x}
+
+\subsection*{Version 1.4.2}
+This is a maintenance release, particularly resolving compilation problems with Visual Studio 2012.
+\begin{itemize}
+ \item Largely refactored the internal code base, unifying code for \lstinline|vector|, \lstinline|vector_range|, and \lstinline|vector_slice|.
+       Similar code refactoring was applied to \lstinline|matrix|, \lstinline|matrix_range|, and \lstinline|matrix_slice|.
+       This not only resolves the problems in VS 2012, but also leads to shorter compilation times and a smaller code base.
+ \item Improved performance of matrix-vector products of \lstinline|compressed_matrix| on CPUs using OpenCL.
+ \item Resolved a bug which shows up if certain rows and columns of a \lstinline|compressed_matrix| are empty and the matrix is copied back to host.
+ \item Fixed a bug and improved performance of GMRES. Thanks to Ivan Komarov for reporting via sourceforge.
+ \item Added additional Doxygen documentation.
+\end{itemize}
+
+
+\subsection*{Version 1.4.1}
+This release focuses on improved stability and performance on AMD devices rather than introducing new features:
+\begin{itemize}
+ \item Included fast matrix-matrix multiplication kernel for AMD's Tahiti GPUs if matrix dimensions are a multiple of 128.
+       Our sample HD7970 reaches over 1.3 TFLOPs in single precision and 200 GFLOPs in double precision (counting multiplications and additions as separate operations).
+ \item All benchmark FLOPs are now using the common convention of counting multiplications and additions separately (ignoring fused multiply-add).
+ \item Fixed a bug for matrix-matrix multiplication with \lstinline|matrix_slice<>| when slice dimensions are multiples of 64.
+ \item Improved detection logic for Intel OpenCL SDK.
+ \item Fixed issues when resizing an empty \lstinline|compressed_matrix|.
+ \item Fixes and improved support for BLAS-1-type operations on dense matrices and vectors.
+ \item Vector expressions can now be passed to \lstinline|inner_prod()| and \lstinline|norm_1()|, \lstinline|norm_2()| and \lstinline|norm_inf()| directly.
+ \item Improved performance when using OpenMP.
+ \item Better support for Intel Xeon Phi (MIC).
+ \item Resolved problems when using OpenCL for CPUs if the number of cores is not a power of 2.
+ \item Fixed a flaw when using AMG in debug mode. Thanks to Jakub Pola for reporting.
+ \item Removed accidental external linkage (invalidating header-only model) of SPAI-related functions. Thanks again to Jakub Pola.
+ \item Fixed issues with copy back to host when OpenCL handles are passed to CTORs of vector, matrix, or \lstinline|compressed_matrix|. Thanks again to Jakub Pola.
+ \item Added fix for segfaults on program exit when providing custom OpenCL queues. Thanks to Denis Demidov for reporting.
+ \item Fixed bug in \lstinline|copy()| to \lstinline|hyb_matrix| as reported by Denis Demidov (thanks!).
+ \item Added an overload for \lstinline|result_of::alignment| for \lstinline|vector_expression|. Thanks again to Denis Demidov.
+ \item Added SSE-enabled code contributed by Alex Christensen.
+\end{itemize}
+
+
+
+\subsection*{Version 1.4.0}
+The transition from 1.3.x to 1.4.x features the largest number of additions, improvements, and cleanups since the initial release.
+In particular, host-, OpenCL-, and CUDA-based execution is now supported. OpenCL now needs to be enabled explicitly!
+New features and feature improvements are as follows:
+\begin{itemize}
+ \item Added host-based and CUDA-enabled operations on ViennaCL objects. The default is now a host-based execution for reasons of compatibility.
+       Enable OpenCL- or CUDA-based execution by defining the preprocessor constant \lstinline|VIENNACL_WITH_OPENCL| and \lstinline|VIENNACL_WITH_CUDA| respectively.
+       Note that CUDA-based execution requires the use of nvcc.
+ \item Added mixed-precision CG solver (OpenCL-based).
+ \item Greatly improved performance of ILU0 and ILUT preconditioners (up to 10-fold). Also fixed a bug in ILUT.
+ \item Added initializer types from Boost.uBLAS (\lstinline|unit_vector|, \lstinline|zero_vector|, \lstinline|scalar_vector|, \lstinline|identity_matrix|, \lstinline|zero_matrix|, \lstinline|scalar_matrix|).
+       Thanks to Karsten Ahnert for suggesting the feature.
+ \item Added incomplete Cholesky factorization preconditioner.
+ \item Added element-wise operations for vectors as available in Boost.uBLAS (\lstinline|element_prod|, \lstinline|element_div|).
+ \item Added restart-after-N-cycles option to BiCGStab.
+ \item Added level-scheduling for ILU-preconditioners. Performance strongly depends on matrix pattern.
+ \item Added least-squares example including a function \lstinline|inplace_qr_apply_trans_Q()| to compute the right hand side vector $Q^T b$ without rebuilding $Q$.
+ \item Improved performance of LU-factorization of dense matrices.
+ \item Improved dense matrix-vector multiplication performance (thanks to Philippe Tillet).
+ \item Reduced overhead when copying to/from \lstinline|ublas::compressed_matrix|.
+ \item ViennaCL objects (scalar, vector, etc.) can now be used as global variables (thanks to an anonymous user on the support-mailinglist).
+ \item Refurbished OpenCL vector kernels backend.
+       All operations of the type v1 = a v2 @ b v3 with vectors v1, v2, v3 and scalars a and b including += and -= instead of = are now temporary-free. Similarly for matrices.
+ \item \lstinline|matrix_range| and \lstinline|matrix_slice| as well as \lstinline|vector_range| and \lstinline|vector_slice| can now be used and mixed completely seamlessly with all standard operations except \lstinline|lu_factorize()|.
+ \item Fixed a bug when using copy() with iterators on vector proxy objects.
+ \item Final reduction step in \lstinline|inner_prod()| and norms is now computed on CPU if the result is a CPU scalar.
+ \item Reduced kernel launch overhead of simple vector kernels by packing multiple kernel arguments together.
+ \item Updated SVD code and added routines for the computation of symmetric eigenvalues using OpenCL.
+ \item \lstinline|custom_operation|'s constructor now supports multiple arguments, allowing multiple expressions to be packed in the same kernel for improved performance. However, all the datastructures in the multiple operations must have the same size.
+ \item Further improvements to the OpenCL kernel generator: Added a repeat feature for generating loops inside a kernel,
+       added element-wise products and division, added support for every one-argument OpenCL function.
+ \item The name of the operation is now a mandatory argument of the constructor of \lstinline|custom_operation|.
+ \item Improved performance of the generated matrix-vector product code.
+ \item Updated interfacing code for the Eigen library, now working with Eigen 3.x.y.
+ \item Converter in source-release now depends on Boost.filesystem3 instead of Boost.filesystem2, thus requiring Boost 1.44 or above.
+\end{itemize}
+
+
+
+
+
+\section*{Version 1.3.x}
+
+\subsection*{Version 1.3.1}
+The following bugfixes and enhancements have been applied:
+\begin{itemize}
+ \item Fixed a compilation problem with GCC 4.7 caused by the wrong order of function declarations. Also removed unnecessary indirections and unused variables.
+ \item Improved out-of-source build in the src-version (for packagers).
+ \item Added virtual destructor in the \lstinline|runtime_wrapper|-class in the kernel generator.
+ \item Extended flexibility of submatrix and subvector proxies (ranges, slices).
+ \item Block-ILU for \lstinline|compressed_matrix| is now applied on the GPU during the solver cycle phase. However, for the moment the implementation file in \newline \texttt{viennacl/linalg/detail/ilu/opencl\_block\_ilu.hpp} needs to be included separately in order to avoid an OpenCL dependency for all ILU implementations.
+ \item SVD now supports double precision.
+ \item Slightly adjusted the interface for NMF. The approximation rank is now specified by the supplied matrices $W$ and $H$.
+ \item Fixed a problem with matrix-matrix products if the result matrix is not initialized properly (thanks to Laszlo Marak for finding the issue and a fix).
+ \item The operations $C += prod(A, B)$ and $C -= prod(A, B)$ for matrices A, B, and C no longer introduce temporaries if the three matrices are distinct.
+\end{itemize}
+
+
+
+\subsection*{Version 1.3.0}
+Several new features enter this new minor version release.
+Some of the experimental features introduced in 1.2.0 keep their experimental state in 1.3.x due to the short time since 1.2.0, with exceptions listed below along with the new features:
+\begin{itemize}
+ \item Full support for ranges and slices for dense matrices and vectors (no longer experimental)
+ \item QR factorization now possible for arbitrary matrix sizes (no longer experimental)
+ \item Further improved matrix-matrix multiplication performance for matrix dimensions which are a multiple of 64 (particularly improves performance for NVIDIA GPUs)
+ \item Added Lanczos and power iteration method for eigenvalue computations of dense and sparse matrices (experimental, contributed by G\"unther Mader and Astrid Rupp)
+ \item Added singular value decomposition in single precision (experimental, contributed by Volodymyr Kysenko)
+ \item Two new ILU-preconditioners added: ILU0 (contributed by Evan Bollig) and a block-diagonal ILU preconditioner using either ILUT or ILU0 for each block. Both preconditioners are computed entirely on the CPU.
+ \item Automated OpenCL kernel generator based on high-level operation specifications added (many thanks to Philippe Tillet who had a lot of \emph{fun fun fun} working on this)
+ \item Two new sparse matrix types (by Volodymyr Kysenko): \lstinline|ell_matrix| for the ELL format and \lstinline|hyb_matrix| for a hybrid format (contributed by Volodymyr Kysenko).
+ \item Added possibility to specify the OpenCL platform used by a context
+ \item Build options for the OpenCL compiler can now be supplied to a context (thanks to Krzysztof Bzowski for the suggestion)
+ \item Added nonnegative matrix factorization by Lee and Seoung (contributed by Volodymyr Kysenko).
+\end{itemize}
+
+
 
 \section*{Version 1.2.x}
- 
+
 \subsection*{Version 1.2.1}
 The current release mostly provides a few bug fixes for experimental features introduced in 1.2.0.
 In addition, performance improvements for matrix-matrix multiplications are applied.
@@ -84,7 +258,7 @@ storage.
 \item Dense and sparse matrix types can now be filled using STL-emulated types (\lstinline|std::vector< std::vector<NumericT> >| and \lstinline|std::vector< std::map< unsigned int, NumericT> >|)
  \item BLAS level 3 functionality is now complete. We are very happy with the general out-of-the-box performance of matrix-matrix-products, even though it cannot beat the extremely tuned implementations tailored to certain matrix sizes on a particular device yet.
  \item An automated performance tuning environment allows an optimization of the kernel parameters for the library user's machine. Best parameters can be obtained from a tuning run and stored in a XML file and read at program startup using pugixml.
- \item Two now preconditioners are now included: A Jacobi preconditioner and a row-scaling preconditioner. In contrast to ILUT, they are applied on the OpenCL device directly.
+ \item Two new preconditioners are now included: A Jacobi preconditioner and a row-scaling preconditioner. In contrast to ILUT, they are applied on the OpenCL device directly.
  \item Clean compilation of all examples under Visual Studio 2005 (we recommend newer compilers though...).
  \item Error handling is now carried out using C++ exceptions.
  \item Matrix Market now uses index base 1 per default (thanks to Evan Bollig for reporting that)
@@ -95,7 +269,7 @@ storage.
 
 \section*{Version 1.0.x}
 
-\subsection*{Version 1.0.5} 
+\subsection*{Version 1.0.5}
 This is the last 1.0.x release. The main changes are as follows:
 \begin{itemize}
  \item Added a reader and writer for MatrixMarket files (thanks to Evan Bollig for suggesting that)
@@ -105,7 +279,7 @@ This is the last 1.0.x release. The main changes are as follows:
  \item Disabled the use of reference counting for OpenCL handles on Mac OS X (caused seg faults on program exit)
 \end{itemize}
 
-\subsection*{Version 1.0.4} 
+\subsection*{Version 1.0.4}
 The changes in this release are:
 \begin{itemize}
  \item All tutorials now work out-of the box with Visual Studio 2008.
@@ -116,16 +290,16 @@ The changes in this release are:
  \item Corrected incorrect return values in the sparse matrix regression test suite (thanks to Klaus Schnass for the hint)
 \end{itemize}
 
-\subsection*{Version 1.0.3} 
+\subsection*{Version 1.0.3}
 The main improvements in this release are:
 \begin{itemize}
  \item Support for multi-core CPUs with ATI Stream SDK (thanks to Riccardo Rossi, UPC. BARCELONA TECH, for suggesting this)
  \item \lstinline|inner_prod| is now up to a factor of four faster (thanks to Serban Georgescu, ETH, for pointing the poor performance of the old implementation out)
  \item Fixed a bug with \lstinline|plane_rotation| that caused system freezes with ATI GPUs.
- \item Extended the doxygen generated reference documentation 
+ \item Extended the doxygen generated reference documentation
 \end{itemize}
 
-\subsection*{Version 1.0.2} 
+\subsection*{Version 1.0.2}
 A bug-fix release that resolves some problems with the Visual C++ compiler.
 \begin{itemize}
  \item Fixed some compilation problems under Visual C++ (version 2005 and 2008).
@@ -133,7 +307,7 @@ A bug-fix release that resolves some problems with the Visual C++ compiler.
  \item Renamed \texttt{aux/} folder to \texttt{auxiliary/} (caused some problems on windows machines)
 \end{itemize}
 
-\subsection*{Version 1.0.1} 
+\subsection*{Version 1.0.1}
 This is a quite large revision of \texttt{ViennaCL 1.0.0}, but mainly improves things under the hood.
 \begin{itemize}
  \item Fixed a bug in lu\_substitute for dense matrices
diff --git a/doc/manual/contributors.tex b/doc/manual/contributors.tex
index b8f047b..79f24f9 100644
--- a/doc/manual/contributors.tex
+++ b/doc/manual/contributors.tex
@@ -1,29 +1,39 @@
 
 \clearpage
 
-Copyright {\copyright} 2010--2011, Institute for Microelectronics, Vienna University of Technology.
+Copyright {\copyright} 2010--2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+Portions of this software are copyright by UChicago Argonne, LLC.
 
-\vspace{2.5cm}
+\vspace{2.cm}
 
-\textit{Project Head:}\\ 
+\textit{Project Head:}\\
 
 Karl Rupp\\
 
-\vspace{2.5cm}
+\vspace{2.cm}
 
 \textit{Code Contributors:} \\
 
-Philipp Grabenweger\\
-Volodymyr Kysenko\\
-Nikolay Lukash\\
-Florian Rudolf\\
-Markus Wagner\\
-Josef Weinbub\\
-Michael Wild
+Evan Bollig \\
+Alex Christensen (BYU) \\
+Philipp Grabenweger \\
+Volodymyr Kysenko \\
+Nikolay Lukash \\
+G\"unther Mader \\
+Vittorio Patriarca \\
+Florian Rudolf \\
+Astrid Rupp \\
+Toby St Clere Smithe \\
+Philippe Tillet \\
+Markus Wagner \\
+Josef Weinbub \\
+Michael Wild \\
 
 
 
-\vspace{5.0cm}
+\vspace{3.5cm}
 
 Institute for Microelectronics\newline
 Vienna University of Technology\newline
diff --git a/doc/manual/cover.tex b/doc/manual/cover.tex
index 7482e64..b9e5fe5 100644
--- a/doc/manual/cover.tex
+++ b/doc/manual/cover.tex
@@ -2,7 +2,7 @@
 \begin{titlepage}
 
 \vspace*{3cm}
-\Huge{ViennaCL 1.2.1} 
+\Huge{ViennaCL 1.5.1}
 \rule[0.0cm]{9.5cm}{0.05cm}
 \begin{flushright}
 \Large{User Manual}
@@ -24,11 +24,11 @@
       \vspace{0.5cm}
       \begin{center}
       Institute for Microelectronics\newline
-      Gu\ss hausstra\ss e 27-29 / E360\newline 
-      A-1040 Vienna, Austria\newline
+      Gu\ss hausstra\ss e 27-29 / E360\newline
+      A-1040 Wien, Austria\newline
       \end{center}
-   \end{minipage}   
-   \hfill   
+   \end{minipage}
+   \hfill
    \begin{minipage}{2.6cm}
       \epsfig{file=figures/logo_px200, scale=1.6}
    \end{minipage}
diff --git a/doc/manual/custom-contexts.tex b/doc/manual/custom-contexts.tex
index fa739cf..fd7be48 100644
--- a/doc/manual/custom-contexts.tex
+++ b/doc/manual/custom-contexts.tex
@@ -1,6 +1,6 @@
-\chapter{Using ViennaCL in User Provided OpenCL Contexts} \label{chap:custom-contexts}
+\chapter{Using ViennaCL in User-Provided OpenCL Contexts} \label{chap:custom-contexts}
 
-Many projects need similar basic linear algebra operations, but essentially operate in their own {\OpenCL} context. 
+Many projects need similar basic linear algebra operations, but essentially operate in their own {\OpenCL} context.
 To provide the functionality and convenience of {\ViennaCL} to such existing projects,
 existing contexts can be passed to {\ViennaCL} and memory objects can be wrapped into the basic linear algebra types \lstinline|vector|, \lstinline|matrix| and \lstinline|compressed_matrix|.
 This chapter is devoted to the description of the necessary steps to use {\ViennaCL} on contexts provided by the library user.
@@ -8,7 +8,7 @@ This chapter is devoted to the description of the necessary steps to use {\Vienn
 \TIP{An example of providing a custom context to {\ViennaCL} can be found in \texttt{examples/tutorial/custom-contexts.cpp}}
 
 \section{Passing Contexts to ViennaCL}
-{\ViennaCLversion} is able to handle an arbitrary number of contexts, which are identified by a key value of type \lstinline|long|. 
+{\ViennaCLversion} is able to handle an arbitrary number of contexts, which are identified by a key value of type \lstinline|long|.
 By default, {\ViennaCL} operates on the context identified by $0$, unless the user switches the context, cf.~Chapter \ref{chap:multi-devices}.
 
 According to the {\OpenCL} standard, a context contains devices and queues for each device. Thus, it is assumed in the following that
@@ -57,13 +57,14 @@ and the queues in a STL map:
  my_queues[my_device2].push_back(my_queue3);
  ...
 
- //supply existing context with multiple devices 
+ //supply existing context with multiple devices
  //and queues to ViennaCL using id '0':
  viennacl::ocl::setup_context(0, my_context, my_devices, my_queues);
 \end{lstlisting}
 It is not necessary to pass all devices and queues created within a particular context to {\ViennaCL}, only those which {\ViennaCL} should use have to be passed.
 {\ViennaCL} will by default use the first queue on each device. The user has to care for appropriate synchronization between different queues.
 
+\TIP{{\ViennaCL} does not destroy the provided context automatically upon exit. The user should thus call \lstinline|clReleaseContext()| as usual for destroying the context.}
 
 \section{Wrapping Existing Memory with ViennaCL Types}
 Now as the user provided context is supplied to {\ViennaCL}, user-created memory objects have to be wrapped into {\ViennaCL} data-types in order to use the full functionality.
@@ -74,7 +75,7 @@ Typically, one of the types \lstinline|scalar|, \lstinline|vector|, \lstinline|m
  cl_mem my_memory3 = ...;
  cl_mem my_memory4 = ...;
  cl_mem my_memory5 = ...;
- 
+
  //wrap my_memory1 into a vector of size 10
  viennacl::vector<float> my_vec(my_memory1, 10);
 
@@ -98,4 +99,4 @@ The following has to be emphasized:
 \NOTE{The user has to ensure that the provided memory is larger or equal to the size of the wrapped object.}
 
 \NOTE{Be aware that wrapping the same memory object into several different {\ViennaCL} objects can have unwanted side-effects.
-In particular, wrapping the same memory in two {\ViennaCL} vectors implies that if the entries of one of the vectors is modified, this is also the case for the second.}
\ No newline at end of file
+In particular, wrapping the same memory in two {\ViennaCL} vectors implies that if the entries of one of the vectors is modified, this is also the case for the second.}
diff --git a/doc/manual/custom-kernels.tex b/doc/manual/custom-kernels.tex
index 5ead68e..a1e0fa4 100644
--- a/doc/manual/custom-kernels.tex
+++ b/doc/manual/custom-kernels.tex
@@ -1,6 +1,6 @@
 
 
-\chapter{Custom Compute Kernels} \label{chap:custom}
+\chapter{Custom OpenCL Compute Kernels} \label{chap:custom}
 
 For custom algorithms the built-in functionality of {\ViennaCL} may not be sufficient or not fast enough. In such cases it can be desirable to write a custom {\OpenCL} compute kernel, which is explained in this chapter. The following steps are necessary and explained one after another:
 \begin{itemize}
@@ -10,10 +10,10 @@ For custom algorithms the built-in functionality of {\ViennaCL} may not be suffi
 \end{itemize}
 A tutorial on this topic can be found at \texttt{examples/tutorial/custom-kernels.cpp}.
 
-\section{Setting up the Source Code}
+\section{Setting up the {\OpenCL} Source Code}
 The {\OpenCL} source code has to be provided as a string. One can either write the source code directly into a string within C++ files, or one can read the {\OpenCL} source from a file. For demonstration purposes, we write the source directly as a string constant:
 \begin{lstlisting}
-const char * my_compute_program = 
+const char * my_compute_program =
 "__kernel void elementwise_prod(\n"
 "          __global const float * vec1,\n"
 "          __global const float * vec2, \n"
@@ -26,38 +26,33 @@ const char * my_compute_program =
 \end{lstlisting}
 The kernel takes three vector arguments \lstinline{vec1}, \lstinline{vec2} and \lstinline{result} and the vector length variable \lstinline{size} and computes the entry-wise product of the vectors \lstinline|vec1| and \lstinline|vec2| and writes the result to the vector \lstinline|result|. For a more detailed explanation of the {\OpenCL} source code, please refer to the specification available at the Khronos group webpage \cite{khronoscl}.
 
-\section{Compilation of the Source Code}
+\section{Compilation of the {\OpenCL} Source Code}
 The source code in the string constant \lstinline{my_compute_kernel} has to be compiled to an {\OpenCL} program.
 An {\OpenCL} program is a compilation unit and may contain several different compute kernels,
 so one could also include another kernel function \lstinline{inplace_elementwise_prod} which writes the result directly to one of the two operands \lstinline{vec1} or \lstinline{vec2} in the same program.
 \begin{lstlisting}
-viennacl::ocl::program & my_prog = 
+viennacl::ocl::program & my_prog =
   viennacl::ocl::current_context().add_program(my_compute_program,
                                                "my_compute_program");
 \end{lstlisting}
-The next step is to register the kernel \lstinline|elementwise_prod| included in the compiled program:
-\begin{lstlisting}
-my_prog.add_kernel("elementwise_prod");
-\end{lstlisting}
-Similarly, one proceeds with other kernels in the compiled program.
-The next step is to extract the kernel object \lstinline|my_kernel| from the compiled program:
+The next step is to extract the kernel object \lstinline|my_kernel| from the compiled program (an explicit kernel registration was needed prior to ViennaCL 1.5.0, but is no longer needed):
 \begin{lstlisting}
 viennacl::ocl::kernel & my_kernel = my_prog.get_kernel("elementwise_prod");
 \end{lstlisting}
 Now, the kernel is set up to use the function \lstinline|elementwise_prod| compiled into the program \lstinline|my_prog|.
 
-\NOTE{Note that C++ references to kernels and programs may become invalid as other kernels or programs are added. 
+\NOTE{Note that C++ references to kernels and programs may become invalid as other kernels or programs are added.
       Therefore, first allocate the required {\ViennaCL} objects and compile/add all custom kernels, before you start taking references to custom programs or kernels.}
 
 Instead of extracting references to programs and kernels directly at program compilation, one can obtain them at other places within the application source code by
 \begin{lstlisting}
-viennacl::ocl::program & prog = 
+viennacl::ocl::program & prog =
   viennacl::ocl::current_context().get_program("my_compute_program");
 viennacl::ocl::kernel & my_kernel = my_prog.get_kernel("elementwise_prod");
 \end{lstlisting}
 This simplifies application development considerably, since no program and kernel objects need to be passed around.
 
-\section{Launching the Kernel}
+\section{Launching the {\OpenCL} Kernel}
 Before launching the kernel, one may adjust the global and local work sizes (readers not familiar with that are encouraged to read the {\OpenCL} standard \cite{khronoscl}).
 The following code specifies a one-dimensional execution model with 16 local workers and 128 global workers:
 \begin{lstlisting}
@@ -71,9 +66,12 @@ In order to use a two-dimensional execution, additionally parameters for the sec
 \end{lstlisting}
 However, for the simple kernel in this example it is not necessary to specify any work sizes at all. The default work sizes (which can be found in \texttt{viennacl/ocl/kernel.hpp}) suffice for most cases.
 
-To launch the kernel, the kernel arguments are set in the same way as for ordinary functions. We assume that three {\ViennaCL} vectors \lstinline|vec1|, \lstinline|vec2| and \lstinline|result| have already been set up:
+To launch the kernel, the kernel arguments are set in the same way as for ordinary functions.
+We assume that three {\ViennaCL} vectors \lstinline|vec1|, \lstinline|vec2| and \lstinline|result| have already been set up:
 \begin{lstlisting}
-  viennacl::ocl::enqueue(my_kernel(vec1, vec2, result, vec1.size()));
+  viennacl::ocl::enqueue(my_kernel(vec1, vec2, result, cl_uint(vec1.size())));
 \end{lstlisting}
 Per default, the kernel is enqueued in the first queue of the currently active device. A custom queue can be specified as optional second argument, cf.~the reference documentation
 located in \texttt{doc/doxygen/}.
+
+\TIP{Integer arguments need to be provided using the corresponding OpenCL types \lstinline|cl_int|, \lstinline|cl_uint|, etc. Do not pass arguments of type \lstinline|size_t|, because \lstinline|size_t| might differ on the host and the compute device.}
\ No newline at end of file
diff --git a/doc/manual/design.tex b/doc/manual/design.tex
index 5be6757..83135a9 100644
--- a/doc/manual/design.tex
+++ b/doc/manual/design.tex
@@ -24,7 +24,7 @@ or
 \begin{lstlisting}
   gpu_vector[0] = 2.0f;
 \end{lstlisting}
-where one of the operands resides on the CPU and the other on the GPU. 
+where one of the operands resides on the CPU and the other on the GPU.
 Initialization of a separate type followed by a call to \lstinline|copy| is
 certainly not desired for the above examples.
 
@@ -102,7 +102,7 @@ classes could be introduced in future releases of {\ViennaCL}.
 A remedy for quick iteration over the entries of e.g.~a vector is the following:
 \begin{lstlisting}
  std::vector<double> temp(gpu_vector.size());
- copy(gpu_vector.begin(), gpu_vector.end(), temp.begin());  
+ copy(gpu_vector.begin(), gpu_vector.end(), temp.begin());
  for (std::vector<double>::iterator it = temp.begin();
       it != temp.end();
      ++it)
@@ -126,12 +126,13 @@ One possibility was to require a mandatory
 before using any other objects provided by {\ViennaCL}, but this approach was discarded for the following two reasons:
 \begin{itemize}
  \item If \lstinline|viennacl::init();| is accidentally forgotten by the user,
-the program will most likely terminate in a rather uncontrolled way. 
+the program will most likely terminate in a rather uncontrolled way.
  \item It requires the user to remember and write one extra line of code, even
 if the default settings are fine.
 \end{itemize}
-Initialization is instead done in the constructors of {\ViennaCL}
-objects. This allows a fine-grained control over which source code to compile
+Initialization is instead done in a lazy manner when requesting {\OpenCL} kernels.
+Kernels with similar functionality are grouped together in a common compilation unit.
+This allows a fine-grained control over which source code to compile
 where and when. For example, there is no reason to compile the sparse matrix
 compute kernels at program startup if there are no sparse matrices used at all.
 
diff --git a/doc/manual/figures/TU_Signet_CMYK.eps b/doc/manual/figures/TU_Signet_CMYK.eps
index 0cd7d0e..1b6c4e0 100644
--- a/doc/manual/figures/TU_Signet_CMYK.eps
+++ b/doc/manual/figures/TU_Signet_CMYK.eps
@@ -10,7 +10,7 @@
 %%LanguageLevel: 2
 %%DocumentData: Clean7Bit
 %ADOBeginClientInjection: DocumentHeader "AI11EPS"
-%%AI8_CreatorVersion: 14.0.0
%AI9_PrintingDataBegin
%ADO_BuildNumber: Adobe Illustrator(R) 14.0.0 x367 R agm 4.4890 ct 5.1541
%ADO_ContainsXMP: MainFirst
+%%AI8_CreatorVersion: 14.0.0
%AI9_PrintingDataBegin
%ADO_BuildNumber: Adobe Illustrator(R) 14.0.0 x367 R agm 4.4890 ct 5.1541
%ADO_ContainsXMP: MainFirst
 %ADOEndClientInjection: DocumentHeader "AI11EPS"
 %%Pages: 1
 %%DocumentNeededResources: 
@@ -4798,7 +4798,7 @@ currentdict Adobe_AGM_Utils eq {end} if
 %%EndPageComments
 %%BeginPageSetup
 %ADOBeginClientInjection: PageSetup Start "AI11EPS"
-%AI12_RMC_Transparency: Balance=75 RasterRes=300 GradRes=150 Text=0 Stroke=1 Clip=1 OP=0
+%AI12_RMC_Transparency: Balance=75 RasterRes=300 GradRes=150 Text=0 Stroke=1 Clip=1 OP=0
 %ADOEndClientInjection: PageSetup Start "AI11EPS"
 Adobe_AGM_Utils begin
 Adobe_AGM_Core/ps gx
@@ -4928,7 +4928,7 @@ Adobe_CoolType_Core/ps get exec
Adobe_AGM_Image/ps gx
                                                                                                     
                                                                                                     
                            
-<?xpacket end="w"?>
%  &&end XMP packet marker&&
[{ai_metadata_stream_123}
<</Type /Metadata /Subtype /XML>>
/PUT AI11_PDFMark5
[/Document
1 dict begin /Metadata {ai_metadata_stream_123} def
currentdict end /BDC AI11_PDFMark5
+<?xpacket end="w"?>
%  &&end XMP packet marker&&
[{ai_metadata_stream_123}
<</Type /Metadata /Subtype /XML>>
/PUT AI11_PDFMark5
[/Document
1 dict begin /Metadata {ai_metadata_stream_123} def
currentdict end /BDC AI11_PDFMark5
 %ADOEndClientInjection: PageSetup End "AI11EPS"
 %%EndPageSetup
 1 -1 scale 0 -226.771 translate
@@ -5051,14 +5051,14 @@ false sop
 1 sep
 ef
 %ADOBeginClientInjection: EndPageContent "AI11EPS"
-userdict /annotatepage 2 copy known {get exec}{pop pop} ifelse
+userdict /annotatepage 2 copy known {get exec}{pop pop} ifelse
 %ADOEndClientInjection: EndPageContent "AI11EPS"
 grestore
 grestore
 pgrs
 %%PageTrailer
 %ADOBeginClientInjection: PageTrailer Start "AI11EPS"
-[/EMC AI11_PDFMark5
[/NamespacePop AI11_PDFMark5
+[/EMC AI11_PDFMark5
[/NamespacePop AI11_PDFMark5
 %ADOEndClientInjection: PageTrailer Start "AI11EPS"
 [
 [/CSA [/0 ]]
diff --git a/doc/manual/installation.tex b/doc/manual/installation.tex
index 1ba2c1d..11c5736 100644
--- a/doc/manual/installation.tex
+++ b/doc/manual/installation.tex
@@ -6,7 +6,7 @@ platforms, but we could not check every possible combination of hardware,
 operating system and compiler. If you experience any trouble, please write to
the mailing list at \\
 \begin{center}
-\texttt{viennacl-support$@$lists.sourceforge.net} 
+\texttt{viennacl-support$@$lists.sourceforge.net}
 \end{center}
 
 
@@ -20,19 +20,19 @@ the maining list at \\
 Thus, before you proceed with the installation of {\ViennaCL}, make sure you
 have a recent version of {\CMake} installed.
 
-To use {\ViennaCL}, the following prerequisites have to be fulfilled:
+To use {\ViennaCL}, only the following minimal prerequisite has to be fulfilled:
 \begin{itemize}
- \item A recent C++ compiler (e.g.~{\GCC} version 4.2.x or above and Visual C++
-2010 are known to work)
- \item {\OpenCL}~\cite{khronoscl,nvidiacl} for accessing compute devices (GPUs);
-see Section~\ref{opencllibs} for details.
-(optional, since iterative solvers can also be used standalone with other libraries (\ublas, Eigen, MTL4))
+ \item A fairly recent C++ compiler (e.g.~{\GCC} version 4.2.x or above and Visual C++
+2005 and 2010 are known to work)
 \end{itemize}
 
 
-The full potential of {\ViennaCL} is only available with the following optional libraries:
+The full potential of {\ViennaCL} is available with the following optional libraries:
 \begin{itemize}
  \item {\CMake}~\cite{cmake} as build system (optional, but highly recommended for building examples)
+ \item {\OpenCL}~\cite{khronoscl,nvidiacl} for accessing compute devices (GPUs); see Section~\ref{opencllibs} for details.
+ \item {\CUDA}~\cite{nvidiacuda} for using CUDA-accelerated operations.
+ \item {\OpenMP}~\cite{openmp} for directive-based parallelism on CPUs.
  \item {\ublas} (shipped with {\Boost}~\cite{boost}) provides the same interface as {\ViennaCL} and allows to switch between CPU and GPU seamlessly, see the tutorials.
  \item Eigen \cite{eigen} can be used to fill {\ViennaCL} types directly. Moreover, the iterative solvers in {\ViennaCL} can directly be used with Eigen objects.
  \item MTL 4 \cite{mtl4} can be used to fill {\ViennaCL} types directly. Even though MTL 4 provides its own iterative solvers, the {\ViennaCL} solvers can also be used with MTL 4 objects.
@@ -41,20 +41,20 @@ The full potential of {\ViennaCL} is only available with the following optional
 %The use of {\OpenMP} for the benchmark suite allows fair comparisons between your multi-core CPU and your compute device (e.g.~GPU).
 
 \section{Generic Installation of ViennaCL} \label{sec:viennacl-installation}
-Since {\ViennaCL} is essentially a header-only library (the only exception is
-described in Chapter \ref{chap:tuning}), it is sufficient to copy the folder
+Since {\ViennaCL} is a header-only library, it is sufficient to copy the folder
 \lstinline|viennacl/| either into your project folder or to your global system
 include path. On Unix based systems, this is often \lstinline|/usr/include/| or
 \lstinline|/usr/local/include/|. If the OpenCL headers are not installed on your system,
 you should repeat the above procedure with the folder \lstinline|CL/|.
 
 On Windows, the situation strongly depends on your development environment. We advise users
-to consult the documentation of their compiler on how to set the include
-path correctly. With Visual Studio this is usually something like \texttt{C:$\setminus$Program Files$\setminus$Microsoft Visual Studio 9.0$\setminus$VC$\setminus$include}
-and can be set in \texttt{Tools -> Options -> Projects and Solutions -> VC++-\-Directories}. The include and library directories of your {\OpenCL} SDK should also be added there.
+to consult the documentation of their compiler on how to set the include path correctly.
+With Visual Studio this is usually something like \texttt{C:$\setminus$Program Files$\setminus$Microsoft Visual Studio 9.0$\setminus$VC$\setminus$include}
+and can be set in \texttt{Tools -> Options -> Projects and Solutions -> VC++-\-Directories}.
+For using the {\CUDA} backend, simply make sure that the {\CUDA} SDK is installed properly.
+If you wish to use the {\OpenCL} backend, the include and library directories of your {\OpenCL} SDK should also be added there.
 
-\NOTE{If multiple {\OpenCL} libraries are available on the host system, one has
-to ensure that the intended one is used.}
+\NOTE{If multiple {\OpenCL} libraries are available on the host system, {\ViennaCL} uses the first platform returned by the system. Consult Chap.~\ref{chap:multi-devices} for configuring the use of other platforms.}
 
 
 % -----------------------------------------------------------------------------
@@ -67,16 +67,16 @@ In order to compile and run {\OpenCL} applications, a corresponding library
 (e.g.~\texttt{libOpenCL.so} under Unix based systems) is required.
 If {\OpenCL} is to be used with GPUs, suitable drivers have to be installed. This section describes how these can be acquired.
 
-\TIP{Note, that for Mac OS X systems there is no need to install an {\OpenCL} 
-capable driver and the corresponding library. 
-The {\OpenCL} library is already present if a suitable graphics 
+\TIP{Note, that for Mac OS X systems there is no need to install an {\OpenCL}
+capable driver and the corresponding library.
+The {\OpenCL} library is already present if a suitable graphics
 card is present. The setup of {\ViennaCL} on Mac OS X is discussed in
 Section~\ref{apple}.}
 
 \subsection{\NVIDIA Driver}
-\NVIDIA provides the {\OpenCL} library with the GPU driver. Therefore, if a 
-\NVIDIA driver is present on the system, the library is too. However, 
-not all of the released drivers contain the {\OpenCL} library. 
+\NVIDIA provides the {\OpenCL} library with the GPU driver. Therefore, if a
+\NVIDIA driver is present on the system, the library is too. However,
+not all of the released drivers contain the {\OpenCL} library.
 A driver which is known to support {\OpenCL}, and hence providing the required
 library, is $260.19.21$. Note that the latest {\NVIDIA} drivers do not include
 the {\OpenCL} headers anymore. Therefore, the official {\OpenCL} headers from
@@ -84,39 +84,62 @@ the Khronos group \cite{khronoscl} are also shipped with {\ViennaCL} in the
 folder \lstinline|CL/|.
 
 \subsection{AMD Accelerated Parallel Processing SDK (formerly Stream SDK)} \label{sec:opencl-on-ati}
-AMD provides the {\OpenCL} library with the Accelerated Parallel Processing (APP)
-SDK~\cite{atistream}. At the release of {\ViennaCLversion}, the latest version of the
-SDK is $2.4$. If used with AMD GPUs, recent AMD GPU drivers are typically required. If {\ViennaCL} is to be run on multi-core CPUs,
+AMD has provided the {\OpenCL} library with the Accelerated Parallel Processing (APP)
+SDK~\cite{atistream} previously, now the {\OpenCL} library is also included in the GPU driver.
+At the release of {\ViennaCLversion}, the latest version of the
+SDK is $2.7$. If used with AMD GPUs, recent AMD GPU drivers are typically required. If {\ViennaCL} is to be run on multi-core CPUs,
 no additional GPU driver is required. The installation notes
 of the APP SDK provide guidance throughout the
-installation process~\cite{atistreamdocu}. 
+installation process~\cite{atistreamdocu}.
 
 \TIP{If the SDK is installed in a non-system wide location on UNIX-based systems, be sure to add the
 {\OpenCL} library path to the \texttt{LD\_LIBRARY\_PATH} environment variable.
 Otherwise, linker errors will occur as the required library cannot be found.}
 
 It is important to note that the AMD APP SDK may not provide {\OpenCL}
-certified double precision support~\cite{atidouble} on some CPUs and GPUs. In
-\ViennaCL 1.0.x, double precision was only experimentally available in
-{\ViennaCL} by defining one of the preprocessor constants
-\begin{lstlisting}
-// for CPUs:
-#define VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_CPU
-// for GPUs:
-#define VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
-\end{lstlisting}
-prior to any inclusion of {\ViennaCL} header files. With
-{\ViennaCLminorversion}, this is not necessary anymore and double precision
-support is enabled by default -- provided that it is available on the device.
+certified double precision support~\cite{atidouble} on some CPUs and GPUs.
 
-\NOTE{The functions \texttt{norm\_1}, \texttt{norm\_2}, \texttt{norm\_inf} and
-\texttt{index\_norm\_inf} are known to cause problems on GPUs in
-double precision using ATI Stream SDK v2.1.}
+\NOTE{Unfortunately, some versions of the AMD APP SDK are known to have bugs. For example, APP SDK 2.7 on Linux causes BiCGStab to fail on some devices.}
 
 \subsection{INTEL OpenCL SDK} \label{sec:opencl-on-intel}
  {\ViennaCL} works fine with the INTEL OpenCL SDK on Windows and Linux.
 The correct linker path is set automatically in \lstinline|CMakeLists.txt| when using the {\CMake} build system, cf.~Sec.~\ref{sec:viennacl-installation}.
 
+% -----------------------------------------------------------------------------
+% -----------------------------------------------------------------------------
+\section{Enabling OpenMP, OpenCL, or CUDA Backends} \label{sec:cuda-opencl-backends}
+% -----------------------------------------------------------------------------
+% -----------------------------------------------------------------------------
+
+\TIP{The new default behavior in {\ViennaCL} 1.4.0 is to use the CPU backend. {\OpenCL} and {\CUDA} backends need to be enabled by appropriate preprocessor \lstinline|#define|s.}
+
+By default, {\ViennaCL} now uses the single-threaded/OpenMP-enabled CPU backend.
+The {\OpenCL} and the {\CUDA}-backend need to be enabled explicitly by using preprocessor constants as follows:
+
+\begin{center}
+\begin{tabular}{|l|l|}
+ \hline
+   \textbf{Preprocessor} \lstinline|#define| & \textbf{Default computing backend} \\
+ \hline
+ none                              & CPU, single-threaded \\
+ \hline
+ \lstinline|VIENNACL_WITH_OPENMP|  & CPU with OpenMP (compiler flags required) \\
+ \hline
+ \lstinline|VIENNACL_WITH_OPENCL|  & OpenCL \\
+ \hline
+ \lstinline|VIENNACL_WITH_CUDA|  & CUDA \\
+ \hline
+\end{tabular}
+\end{center}
+
+The preprocessor constants can be either defined at the beginning of the source file (prior to any ViennaCL-includes), or passed to the compiler as command line argument.
+For example, on \lstinline|g++| the respective command line option for enabling the OpenCL backend is \lstinline|-DVIENNACL_WITH_OPENCL|.
+Note that CUDA requires the \lstinline|nvcc| compiler. Furthermore, the use of {\OpenMP} usually requires additional compiler flags (on \lstinline|g++| this is for example \lstinline|-fopenmp|).
+
+\TIP{The CUDA backend requires a compilation using \lstinline|nvcc|.}
+
+Multiple backends can be used simultaneously. In such a case, \lstinline|CUDA| has higher priority than \lstinline|OpenCL|, which in turn has higher priority than the CPU backend when it comes to selecting the default backend.
+
 
 % -----------------------------------------------------------------------------
 % -----------------------------------------------------------------------------
@@ -129,34 +152,46 @@ on your system. The other dependencies are listed in Tab.~\ref{tab:tutorial-depe
 \begin{table}[tb]
 \begin{center}
 \begin{tabular}{l|l}
-Tutorial No. & Dependencies\\
+Example/Tutorial & Dependencies\\
 \hline
 \texttt{tutorial/amg.cpp}        & {\OpenCL}, {\ublas} \\
 \texttt{tutorial/bandwidth-reduction.cpp} & - \\
-\texttt{tutorial/blas1.cpp}      & {\OpenCL} \\
-\texttt{tutorial/blas2.cpp}      & {\OpenCL}, {\ublas} \\
-\texttt{tutorial/blas3.cpp}      & {\OpenCL}, {\ublas} \\
+\texttt{tutorial/blas1.cpp/cu}      & - \\
+\texttt{tutorial/blas2.cpp/cu}      & {\ublas} \\
+\texttt{tutorial/blas3.cpp/cu}      & {\ublas} \\
 \texttt{tutorial/custom-kernels.cpp}       & {\OpenCL} \\
 \texttt{tutorial/custom-context.cpp}       & {\OpenCL} \\
-\texttt{tutorial/fft.cpp}        & {\OpenCL} \\
-\texttt{tutorial/iterative.cpp}  & {\OpenCL}, {\ublas} \\
+\texttt{tutorial/eigen-with-viennacl.cpp}  & {\Eigen} \\
+\texttt{tutorial/fft.cpp}                  & {\OpenCL} \\
+\texttt{tutorial/iterative.cpp/cu}         & {\ublas} \\
 \texttt{tutorial/iterative-ublas.cpp}      & {\ublas}  \\
 \texttt{tutorial/iterative-eigen.cpp}      & {\Eigen}   \\
 \texttt{tutorial/iterative-mtl4.cpp}       & {\MTL}    \\
-\texttt{tutorial/matrix-range.cpp}         & {\OpenCL}, {\ublas} \\
-\texttt{tutorial/qr.cpp}         & {\OpenCL}, {\ublas} \\
-\texttt{tutorial/spai.cpp}       & {\OpenCL}, {\ublas} \\
-\texttt{tutorial/eigen-with-viennacl.cpp}  & {\OpenCL}, {\Eigen} \\
-\texttt{tutorial/mtl4-with-viennacl.cpp}   & {\OpenCL}, {\MTL} \\
-\texttt{tutorial/vector-range.cpp}         & {\OpenCL}, {\ublas} \\
+\texttt{tutorial/lanczos.cpp/cu}           & {\ublas}    \\
+\texttt{tutorial/libviennacl.cpp/cu}       & - \\
+\texttt{tutorial/least-squares.cpp/cu}     & {\ublas}    \\
+\texttt{tutorial/matrix-range.cpp/cu}      & {\ublas} \\
+\texttt{tutorial/mtl4-with-viennacl.cpp}   & {\MTL} \\
+\texttt{tutorial/multithreaded.cpp/cu}     & {\Boost} \\
+\texttt{tutorial/multithreaded\_cg.cpp/cu} & {\Boost} \\
+\texttt{tutorial/power-iter.cpp/cu}        & {\ublas} \\
+\texttt{tutorial/qr.cpp/cu}                & {\ublas} \\
+\texttt{tutorial/scheduler.cpp}            & - \\
+\texttt{tutorial/spai.cpp}                 & {\OpenCL}, {\ublas} \\
+\texttt{tutorial/sparse.cpp/cu}            & {\ublas} \\
+\texttt{tutorial/structured-matrices.cpp}  & {\OpenCL}, {\ublas} \\
+\texttt{tutorial/vector-range.cpp/cu}      & {\ublas} \\
 \texttt{tutorial/viennacl-info.cpp}        & {\OpenCL} \\
-\texttt{benchmarks/blas3.cpp}   & {\OpenCL} \\
-\texttt{benchmarks/opencl.cpp}  & {\OpenCL} \\
-\texttt{benchmarks/solver.cpp}  & {\OpenCL}, {\ublas} \\
-\texttt{benchmarks/sparse.cpp}  & {\OpenCL}, {\ublas} \\
-\texttt{benchmarks/vector.cpp}  & {\OpenCL} \\
+\texttt{tutorial/wrap-cuda-buffer.cu}      & {\CUDA} \\
+\texttt{tutorial/wrap-host-buffer.cpp}     & - \\
+\texttt{benchmarks/blas3.cpp/cu}   & - \\
+\texttt{benchmarks/opencl.cpp}     & {\OpenCL} \\
+\texttt{benchmarks/solver.cpp/cu}  & {\ublas} \\
+\texttt{benchmarks/sparse.cpp/cu}  & {\ublas} \\
+\texttt{benchmarks/vector.cpp/cu}  & - \\
 \end{tabular}
-\caption{Dependencies for the examples in the \texttt{examples/} folder}
+\caption{Dependencies for the examples in the \texttt{examples/} folder. Examples using the CUDA-backend use the \lstinline|.cu| file extension.
+Note that all examples can be run using any of the CPU, OpenCL, and CUDA backends unless an explicit {\OpenCL}-dependency is stated.}
 \label{tab:tutorial-dependencies}
 \end{center}
 \end{table}
@@ -165,6 +200,24 @@ Before building the examples, customize \texttt{CMakeLists.txt} in the {\ViennaC
 Per default, all examples using {\ublas}, {Eigen} and {MTL4} are turned off.
 Please enable the respective examples based on the libraries available on your machine.
 Directions on how to accomplish this are given directly within the \texttt{CMakeLists.txt} file.
+A brief overview of the most important flags is as follows:
+
+\begin{center}
+ \begin{tabular}{|l|l|}
+  \hline
+  {\CMake} Flag & Purpose \\
+  \hline
+  \lstinline|ENABLE_CUDA|   & Builds examples with the {\CUDA} backend enabled\\
+  \lstinline|ENABLE_OPENCL| & Builds examples with the {\OpenCL} backend enabled\\
+  \lstinline|ENABLE_OPENMP| & Builds examples with {\OpenMP} for the CPU backend enabled\\
+  \hline
+  \lstinline|ENABLE_EIGEN|  & Builds examples depending on {\Eigen}\\
+  \lstinline|ENABLE_MTL4|   & Builds examples depending on {\MTL}\\
+  \lstinline|ENABLE_UBLAS|  & Builds examples depending on {\ublas}\\
+  \hline
+ \end{tabular}
+\end{center}
+
 
 \subsection{Linux}
 To build the examples, open a terminal and change to:
@@ -177,7 +230,7 @@ Execute
 \end{lstlisting}
 to obtain a Makefile and type
 \begin{lstlisting}
- $> make 
+ $> make
 \end{lstlisting}
 to build the examples. If some of the dependencies in Tab.~\ref{tab:tutorial-dependencies} are not fulfilled, you can build each example separately:
 \begin{lstlisting}
@@ -192,22 +245,24 @@ Execute the examples from the \lstinline|build/| folder as follows:
  $> examples/tutorial/blas1
  $> examples/benchmarks/vectorbench
 \end{lstlisting}
-Note that all benchmark executables carry the suffix \lstinline|bench|. 
+Note that all benchmark executables carry the suffix \lstinline|bench|.
+
+\TIP{Use the {\CMake}-GUI via \lstinline|cmake-gui ..| within the \lstinline|build/| folder in order to enable or disable optional libraries conveniently.}
 
 \subsection{Mac OS X}
 \label{apple}
-The tools mentioned in Section \ref{dependencies} are available on 
-Macintosh platforms too. 
+The tools mentioned in Section \ref{dependencies} are available on
+Macintosh platforms too.
 For the {\GCC} compiler the Xcode~\cite{xcode} package has to be installed.
-To install {\CMake} and {\Boost} external portation tools have to be used, 
-for example, Fink~\cite{fink}, DarwinPorts~\cite{darwinports} 
-or MacPorts~\cite{macports}. Such portation tools provide the 
-aforementioned packages, {\CMake} and {\Boost}, for macintosh platforms. 
-
-\TIP{If the {\CMake} build system has problems detecting your {\Boost} libraries, 
-determine the location of your {\Boost} folder. 
-Open the \texttt{CMakeLists.txt} file in the root directory of {\ViennaCL} and 
-add your {\Boost} path after the following entry: 
+To install {\CMake} and {\Boost} external portation tools have to be used,
+for example, Fink~\cite{fink}, DarwinPorts~\cite{darwinports}
+or MacPorts~\cite{macports}. Such portation tools provide the
+aforementioned packages, {\CMake} and {\Boost}, for Macintosh platforms.
+
+\TIP{If the {\CMake} build system has problems detecting your {\Boost} libraries,
+determine the location of your {\Boost} folder.
+Open the \texttt{CMakeLists.txt} file in the root directory of {\ViennaCL} and
+add your {\Boost} path after the following entry:
 \texttt{IF(\${CMAKE\_SYSTEM\_NAME} MATCHES "Darwin")} }
 
 The build process of {\ViennaCL} on Mac OS is similar to Linux.
@@ -218,10 +273,11 @@ In the following the procedure is outlined for \texttt{Visual Studio}: Assuming
 \item Open the {\CMake} GUI.
 \item Set the {\ViennaCL} base directory as source directory.
 \item Set the \texttt{build/} directory as build directory.
-\item Click on 'Configure' and select the appropriate generator (e.g.~\texttt{Visual Studio 9 2008})
-\item If either Boost or some OpenCL paths cannot be found, please select the advanced view and provide the required paths manually
-\item Click again an 'Configure'
-\item Click on 'Generate'
+\item Click on 'Configure' and select the appropriate generator (e.g.~\texttt{Visual Studio 9 2008}).
+\item If you set \lstinline|ENABLE_CUDA|, \lstinline|ENABLE_EIGEN|, \lstinline|ENABLE_MTL4|, or \lstinline|ENABLE_OPENCL| and the paths cannot be found, please select the advanced view and provide the required paths manually.
+\item If you set \lstinline|ENABLE_UBLAS| and the paths cannot be found, please select the advanced view and provide the required paths manually. You may have to specify the linker path for Boost manually within your Visual Studio IDE.
+\item Click again on 'Configure'. You should not receive an error at this point.
+\item Click on 'Generate'.
 \item The project files can now be found in the {\ViennaCL} build directory, where they can be opened and compiled with Visual Studio (provided that the include and library paths are set correctly, see Sec.~\ref{sec:viennacl-installation}).
 \end{itemize}
 
diff --git a/doc/manual/introduction.tex b/doc/manual/introduction.tex
index bd64996..0c037c3 100644
--- a/doc/manual/introduction.tex
+++ b/doc/manual/introduction.tex
@@ -4,7 +4,8 @@
 The Vienna Computing Library (\ViennaCL) is a scientific computing
 library written in C++. It allows simple, high-level access
 to the vast computing resources available on parallel architectures such as
-GPUs and multi-core CPUs by using {\OpenCL}. The primary focus is on common linear algebra
+GPUs and multi-core CPUs by using either a host-based computing backend, an {\OpenCL} computing backend, or {\CUDA}.
+The primary focus is on common linear algebra
 operations (BLAS levels 1, 2 and 3) and the solution of large sparse systems of equations by means of iterative
 methods. In {\ViennaCLminorversion}, the following iterative solvers are
 implemented (confer for example to the book of Y.~Saad \cite{saad-iterative-solution}):
@@ -13,28 +14,24 @@ implemented (confer for example to the book of Y.~Saad \cite{saad-iterative-solu
  \item Stabilized BiConjugate Gradient (BiCGStab)
  \item Generalized Minimum Residual (GMRES)
 \end{itemize}
-%An optional ILU preconditioner can be used, which is in {\ViennaCLversion}
-%precomputed and applied on a single CPU core and may thus not lead to overall
-%performance gains over a purely CPU based implementation.
-%Moreover, a Jacobi and a row-scaling preconditioner are available, which can be executed directly in parallel on the {\OpenCL} device.
 A number of preconditioners is provided with {\ViennaCLversion} in order to improve convergence of these solvers, cf.~Chap.~\ref{chap:algorithms}.
 
 The solvers and preconditioners can also be used with different
 libraries due to their generic implementation. At present, it is possible to
-use the solvers and preconditioners directly with types from the {\ublas} library, which is part of 
+use the solvers and preconditioners directly with types from the {\ublas} library, which is part of
 {\Boost} \cite{boost}. The iterative solvers can directly be used with Eigen \cite{eigen} and MTL 4 \cite{mtl4}.
 
-Under the hood, {\ViennaCL} uses {\OpenCL} \cite{khronoscl} for accessing and
-executing code on compute devices. Therefore, {\ViennaCL} is not tailored 
+Under the hood, {\ViennaCL} uses a unified layer to access {\CUDA} \cite{nvidiacuda}, {\OpenCL} \cite{khronoscl}, and/or {\OpenMP} \cite{openmp} for accessing and
+executing code on compute devices. Therefore, {\ViennaCL} is not tailored
 to products from a particular vendor and can be used on many different
-platforms. At present, {\ViennaCL} is known to work on modern GPUs from {NVIDIA}
-and AMD (see Tab.~\ref{tab:double-precision-GPUs}) as well as on CPUs 
-using either the AMD Accelerated Parallel Processing SDK (formerly ATI Stream SDK) or the Intel OpenCL SDK.
+platforms. At present, {\ViennaCL} is known to work on all current CPUs and modern GPUs from NVIDIA
+and AMD (see Tab.~\ref{tab:double-precision-GPUs}), CPUs
+using either the AMD Accelerated Parallel Processing (APP) SDK (formerly ATI Stream SDK) or the Intel OpenCL SDK, and Intel's MIC platform (Xeon Phi).
 
 \NOTE{Double precision arithmetic on GPUs is only possible if it is provided by the GPU. There is no double precision emulation in {\ViennaCL}.}
 
 \NOTE{Double precision arithmetic using the ATI Stream SDK or AMD APP SDK may not be fully
-OpenCL-certified. See Sec.~\ref{sec:opencl-on-ati} for details.}
+OpenCL-certified. Also, we have observed bugs in AMD APP SDK 2.7 which affect some algorithms in {\ViennaCL} (e.g.~BiCGStab).}
 
 \begin{table}[tb]
 \begin{center}
@@ -52,13 +49,9 @@ Compute Device & float & double \\
 \NVIDIA GTX 275    & ok & ok \\
 \NVIDIA GTX 280    & ok & ok \\
 \NVIDIA GTX 285    & ok & ok \\
-\NVIDIA GTX 465    & ok & ok \\
-\NVIDIA GTX 470    & ok & ok \\
-\NVIDIA GTX 480    & ok & ok \\
-\NVIDIA GTX 560    & ok & ok \\
-\NVIDIA GTX 570    & ok & ok \\
-\NVIDIA GTX 580    & ok & ok \\
-\NVIDIA GTX 590    & ok & ok \\
+\NVIDIA GTX 4XX    & ok & ok \\
+\NVIDIA GTX 5XX    & ok & ok \\
+\NVIDIA GTX 6XX    & ok & ok \\
 \NVIDIA Quadro FX 46XX & ok & - \\
 \NVIDIA Quadro FX 48XX & ok & ok \\
 \NVIDIA Quadro FX 56XX & ok & - \\
@@ -67,28 +60,24 @@ Compute Device & float & double \\
 \NVIDIA Tesla C10XX  & ok & ok \\
 \NVIDIA Tesla C20XX  & ok & ok \\
 \hline
-ATI Radeon HD 45XX   & ok & - \\
-ATI Radeon HD 46XX   & ok & - \\
-ATI Radeon HD 47XX   & ok & - \\
+ATI Radeon HD 4XXX   & ok & - \\
 ATI Radeon HD 48XX   & ok & essentially ok \\
-ATI Radeon HD 54XX   & ok & - \\
-ATI Radeon HD 55XX   & ok & - \\
-ATI Radeon HD 56XX   & ok & - \\
-ATI Radeon HD 57XX   & ok & - \\
+ATI Radeon HD 5XXX   & ok & - \\
 ATI Radeon HD 58XX   & ok & essentially ok \\
 ATI Radeon HD 59XX   & ok & essentially ok \\
 ATI Radeon HD 68XX   & ok & - \\
 ATI Radeon HD 69XX   & ok & essentially ok \\
+ATI Radeon HD 77XX   & ok & - \\
+ATI Radeon HD 78XX   & ok & - \\
+ATI Radeon HD 79XX   & ok & essentially ok \\
 ATI FireStream V92XX & ok & essentially ok \\
 ATI FirePro V78XX    & ok & essentially ok \\
 ATI FirePro V87XX    & ok & essentially ok \\
 ATI FirePro V88XX    & ok & essentially ok \\
 \end{tabular}
-\caption{Available arithmetics in {\ViennaCL} provided by selected GPUs. At the
-release of {\ViennaCLversion}, the Stream SDK (APP SDK) from AMD/ATI may not comply to
-the {\OpenCL} standard for double precision extensions, but we have not observed
-problems with the latest version of the SDK. Support for AMD devices is now
-per default enabled in {\ViennaCL}, see Sec.~\ref{sec:opencl-on-ati}.}
+\caption{Available arithmetics in {\ViennaCL} provided by selected GPUs.
+Some older versions of the Stream SDK (APP SDK) from AMD/ATI may not comply to
+the {\OpenCL} standard for double precision extensions.}
 \label{tab:double-precision-GPUs}
 \end{center}
 \end{table}
diff --git a/doc/manual/kernel-generation.tex b/doc/manual/kernel-generation.tex
new file mode 100644
index 0000000..f1d78a8
--- /dev/null
+++ b/doc/manual/kernel-generation.tex
@@ -0,0 +1,39 @@
+\chapter{Automated OpenCL User-Kernel Generation} \label{chap:kernel-generation}
+
+While {\ViennaCL} provides a convenient means of including custom {\OpenCL} compute kernels, cf.~Chap.~\ref{chap:custom},
+it can be rather tedious to come up with a good compute kernel, or to come up with many similar kernels differing in small details only.
+For the case of BLAS level 1 and level 2 operations, {\ViennaCL} now provides an automated kernel generator, which takes a high-level specification of the operations and creates one or more suitable OpenCL kernels.
+This allows for high-performance implementations of algorithms which may otherwise lead to spurious temporary objects.
+
+Consider the operation
+\begin{align*}
+\mathbf{x} = \mathbf{A} \times \bigl[ (\mathbf{y} \cdot (\mathbf{y}+\mathbf{z}))\mathbf{y} + \mathbf{z} \bigr] \ ,
+\end{align*}
+where $\mathbf{x}$, $\mathbf{y}$ and $\mathbf{z}$ denote vectors, $\mathbf{A}$ is a dense matrix, and the dot denotes the vector dot product.
+With the generator it is sufficient to write the following C++ code in order to obtain an OpenCL kernel:
+\begin{lstlisting}
+// Instantiation of the symbolic variables
+symbolic_vector<NumericT, 0> sX;
+symbolic_matrix<NumericT, 1> sA;
+symbolic_vector<NumericT, 2> sY;
+symbolic_vector<NumericT, 3> sZ;
+
+//Creation of the custom operation
+custom_operation my_op( sX = prod(sA, inner_prod(sY, sY+sZ) * sY + sZ),
+                        "operation_name" );
+\end{lstlisting}
+where \lstinline|NumericT| is either \lstinline|float| or \lstinline|double|.
+The string provided as second parameter is required and can be used to identify, manage and retrieve different kernels.
+No two \lstinline|custom_operation|s are allowed to be identified using the same string.
+
+The custom operation object \lstinline|my_op| can be enqueued like any other kernel:
+\begin{lstlisting}
+//Execution of the custom operation
+viennacl::ocl::enqueue(my_op(x,A,y,z));
+\end{lstlisting}
+Here, \lstinline|x|, \lstinline|y|, \lstinline|z| are of type \lstinline|viennacl::vector<NumericT>| and \lstinline|A| is of type \lstinline|viennacl::matrix<NumericT>|.
+
+\TIP{Sample code can be found in \lstinline|tests/src/generator_*.cpp|}
+
+\NOTE{ The kernel generator is still experimental, yet already able to generate rather complex compute kernels. }
+
diff --git a/doc/manual/keywords.tex b/doc/manual/keywords.tex
index 7da5bf9..807d40c 100644
--- a/doc/manual/keywords.tex
+++ b/doc/manual/keywords.tex
@@ -123,7 +123,7 @@
 \newenvironment{fixedwidthtableL}    [3] {\begin{mmnttableL} {|l|p{10cm}|} {#1}                                  {#2}{#3}} {\end{mmnttableL}}
 \newenvironment{fixedwidthTablep}    [3] {\begin{mmntTable}  {|l|p{#3}|}   {#1}                                  {#2}}     {\end{mmntTable}}
 \newenvironment{fixedwidthTableL}    [3] {\begin{mmntTableL} {|l|p{10cm}|} {#1}                                  {#2}{#3}} {\end{mmntTableL}}
-                                     
+
 \newenvironment{keydesctableII}      [1] {\begin{mmnttable}  {|l|p{10cm}|} {Keyword & Description}               {#1}}     {\end{mmnttable}}
 \newenvironment{keydesctableIIL}     [2] {\begin{mmnttableL} {|l|p{10cm}|} {Keyword & Description}               {#1}{#2}} {\end{mmnttableL}}
 \newenvironment{keydesctableIILp}    [3] {\begin{mmnttableL} {|l|p{#3}|}   {Keyword & Description}               {#1}{#2}} {\end{mmnttableL}}
@@ -143,7 +143,7 @@
 
 \newenvironment{keytypetableII}      [1] {\begin{mmnttable}  {|l|l|}       {Keyword & Type}                      {#1}}     {\end{mmnttable}}
 \newenvironment{keytypetableIIL}     [2] {\begin{mmnttableL} {|l|l|}       {Keyword & Type}                      {#1}{#2}} {\end{mmnttableL}}
-                                     
+
 \newenvironment{keyunittableI}       [1] {\begin{mmnttable}  {|l|l|l|}     {Keyword & Type & Unit}               {#1}}     {\end{mmnttable}}
 
 \newenvironment{parameterdescrtable} [1] {\begin{mmnttable}  {|l|l|}       {Parameter & Description}             {#1}}     {\end{mmnttable}}
diff --git a/doc/manual/license.tex b/doc/manual/license.tex
index 863771c..64a8953 100644
--- a/doc/manual/license.tex
+++ b/doc/manual/license.tex
@@ -1,7 +1,11 @@
 
-\chapter*{License}  \addcontentsline{toc}{chapter}{License}
+\chapter{License} % \addcontentsline{toc}{chapter}{License}
 
-Copyright (c) 2010, 2011 Institute for Microelectronics, TU Wien
+Copyright (c) 2010-2014 Institute for Microelectronics, Institute for Analysis and Scientific Computing, TU Wien.
+Portions of this software are copyright by UChicago Argonne, LLC.
+Argonne National Laboratory, with facilities in the state of Illinois,
+is owned by The United States Government, and operated by UChicago Argonne, LLC
+under provision of a contract with the Department of Energy.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -19,4 +23,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE.
diff --git a/doc/manual/memory-model.tex b/doc/manual/memory-model.tex
new file mode 100644
index 0000000..f34c104
--- /dev/null
+++ b/doc/manual/memory-model.tex
@@ -0,0 +1,46 @@
+\chapter{Memory Model}
+With the support of multiple compute backends, memory buffers need to be managed differently depending on whether {\CUDA}, {\OpenCL} or a plain host-based buffer is in use.
+These different \emph{memory domains} are abstracted in a class \lstinline|viennacl::backend::mem_handle|,
+which is able to refer to a buffer in all three backends, possibly at the same time. Objects of type \lstinline|mem_handle| are the building blocks of scalars, vectors and matrices in {\ViennaCL}, cf.~Chap.~\ref{chap:basic-types}.
+
+
+The raw handles for each memory domain can be obtained via the member functions
+\lstinline|cuda_handle()|, \lstinline|opencl_handle()| and \lstinline|ram_handle()|.
+Note that the former two may not be available if no support for the respective backend is activated using the preprocessor constants \lstinline|VIENNACL_WITH_CUDA| and \lstinline|VIENNACL_WITH_OPENCL|, cf.~Sec.~\ref{sec:cuda-opencl-backends}.
+
+\section{Memory Handle Operations}
+Each supported backend is required to support the following functions (arguments omitted for brevity, see reference documentation in \lstinline|doc/doxygen| for details):
+\begin{itemize}
+ \item \lstinline|memory_create()|: Create a memory buffer
+ \item \lstinline|memory_copy()|: Copy the (partial) contents of one buffer to another
+ \item \lstinline|memory_write()|: Write from a memory location in CPU RAM to the buffer
+ \item \lstinline|memory_read()|: Read from the buffer to a memory location in CPU RAM
+\end{itemize}
+A common interface layer in \lstinline|viennacl::backend| dispatches into the respective routines in the backend for the currently active memory domain of the handle.
+
+
+\section{Querying and Switching Active Memory Domains}
+A \lstinline|mem_handle| object creates its buffer according to the following prioritized list, whichever is available: {\CUDA}, {\OpenCL}, host runtime (CPU RAM).
+The current memory domain can be queried using the member function \lstinline|memory_domain()|, which returns one of the values \lstinline|MEMORY_NOT_INITIALIZED|, \lstinline|MAIN_MEMORY|, \lstinline|OPENCL_MEMORY|, or \lstinline|CUDA_MEMORY|
+defined in the struct \lstinline|viennacl::memory_types|.
+
+The currently active memory handle can be switched from outside using the member function \lstinline|switch_memory_domain()|.
+For example, to indicate that the memory referenced by a handle \lstinline|h| resides in main memory, the line
+\begin{lstlisting}
+ h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+\end{lstlisting}
+is sufficient. However, no memory is created, copied, or manipulated when switching the currently active handle,
+because a \lstinline|mem_handle| object does not know what the buffer content is referring to and is thus not able to convert data between different memory domains if required.
+
+In order to copy the contents of a memory buffer in one memory domain to a memory buffer in another memory domain within the same \lstinline|mem_handle|-object, the data type must be supplied.
+This is accomplished using the function \lstinline|viennacl::backend::switch_memory_domain(mem_handle, viennacl::memory_types)|, which takes the data type as template argument.
+Thus, in order to make current data of type \lstinline|float| available in CPU RAM for a handle \lstinline|h|, the function
+\begin{lstlisting}
+ viennacl::backend::switch_memory_domain<float>(h, viennacl::MAIN_MEMORY);
+\end{lstlisting}
+is sufficient.
+
+If data should be transferred from one memory handle \lstinline|h1| to another memory handle \lstinline|h2|, the function \lstinline|viennacl::backend::typesafe_memory_copy(h1, h2)| is provided.
+It takes the data type as template argument and ensures a data conversion between different memory domains if required (e.g. \lstinline|cl_uint| to \lstinline|unsigned int|).
+
+
diff --git a/doc/manual/multi-device.tex b/doc/manual/multi-device.tex
index d837a54..f8ce6d3 100644
--- a/doc/manual/multi-device.tex
+++ b/doc/manual/multi-device.tex
@@ -1,13 +1,15 @@
 
-\chapter{Configuring Contexts and Devices} \label{chap:multi-devices}
+\chapter{Configuring OpenCL Contexts and Devices} \label{chap:multi-devices}
 Support for multiple devices was officially added in {\OpenCL} 1.1.
 Among other things, this allows e.g.~to use all CPUs in a multi-socket CPU mainboard as a single {\OpenCL} compute device.
-Nevertheless, the efficient use of multiple {\OpenCL} devices is far from trivial, because algorithms have to be designed such that 
+Nevertheless, the efficient use of multiple {\OpenCL} devices is far from trivial, because algorithms have to be designed such that
 they take distributed memory and synchronization issues into account.
 
-Support for multiple devices and contexts was introduced in {\ViennaCL} with version 1.1.0. In the following we give a description of the 
+Support for multiple {\OpenCL} devices and contexts was introduced in {\ViennaCL} with version 1.1.0. In the following we give a description of the
 provided functionality.
 
+\NOTE{In {\ViennaCLversion} there is no native support for automatically executing operations over multiple GPUs. Partition of data is left to the user.}
+
 \section{Context Setup}
 Unless specified otherwise (see Chap.~\ref{chap:custom-contexts}), {\ViennaCL} silently creates its own context and adds all available default devices with a single queue per device to it.
 All operations are then carried out on this context, which can be obtained with the call
@@ -15,14 +17,21 @@ All operations are then carried out on this context, which can be obtained with
  viennacl::ocl::current_context();
 \end{lstlisting}
 This default context is identified by the ID $0$ (of type \lstinline|long|).
+{\ViennaCL} uses the first platform returned by the OpenCL backend for the context.
+If a different platform should be used on a machine with multiple platforms available,
+this can be achieved with
+\begin{lstlisting}
+ viennacl::ocl::set_context_platform_index(id, platform_index);
+\end{lstlisting}
+where the context ID is \lstinline|id| and \lstinline|platform_index| refers to the array index of the platform as returned by \lstinline|clGetPlatformIDs()|.
+
 By default, only the first device in the context is used for all operations. This device can be obtained via
 \begin{lstlisting}
  viennacl::ocl::current_context().current_device();
  viennacl::ocl::current_device(); //equivalent to above
 \end{lstlisting}
-
-A user may wish to use multiple contexts, where each context consists of a subset of the available devices. 
-To setup a context with ID \lstinline|id| with a particular device type only, the user has to specify this 
+A user may wish to use multiple {\OpenCL} contexts, where each context consists of a subset of the available devices.
+To setup a context with ID \lstinline|id| with a particular device type only, the user has to specify this
 prior to any other {\ViennaCL} related statements:
 \begin{lstlisting}
 //use only GPUs:
@@ -61,7 +70,7 @@ However, memory transfer between contexts (and thus devices) has to be done manu
 to keep track in which context the individual {\ViennaCL} objects have been created, because all operands are assumed to be in the currently active context.
 
 \section{Switching Contexts and Devices}
-{\ViennaCL} always uses the currently active context with the currently active device to enqueue compute kernels. The default context is identified by ID '$0$'.
+{\ViennaCL} always uses the currently active {\OpenCL} context with the currently active device to enqueue compute kernels. The default context is identified by ID '$0$'.
 The context with ID \lstinline|id| can be set as active context with the line.
 \begin{lstlisting}
 viennacl::ocl::switch_context(id);
@@ -79,3 +88,14 @@ std::vector<viennacl::ocl::device> const & devices =
 viennacl::ocl::current_context().switch_device(devices[1]);
 \end{lstlisting}
 If the supplied device is not part of the context, an error message is printed and the active device remains unchanged.
+
+
+\section{Setting OpenCL Compiler Flags}
+Each {\OpenCL} context provides a member function \lstinline|.build_options()|, which can be used to pass OpenCL compiler flags prior to compilation.
+Note that flags need to be passed to the context prior to the compilation of the respective kernels, i.e.~prior to the first instantiation of the respective matrix or vector types.
+
+To pass the \lstinline|-cl-mad-enable| flag to the current context, the line
+\begin{lstlisting}
+ viennacl::ocl::current_context().build_options("-cl-mad-enable");
+\end{lstlisting}
+is sufficient. Confer to the {\OpenCL} standard for a full list of flags.
diff --git a/doc/manual/operations.tex b/doc/manual/operations.tex
index 205654a..a4882aa 100644
--- a/doc/manual/operations.tex
+++ b/doc/manual/operations.tex
@@ -1,41 +1,67 @@
 \chapter{Basic Operations} \label{chap:operations}
 
 The basic types have been introduced in the previous chapter, so we move on with the description of the basic BLAS operations.
+Almost all operations supported by {\ublas} are available, including element-wise operations on vectors. Thus, consider the
+\href{http://www.boost.org/doc/libs/1_52_0/libs/numeric/ublas/doc/operations_overview.htm}{ublas-documentation} as a reference as well.
 
-\section{Vector-Vector Operations (BLAS Level 1)}
+\section{Vector-Vector and Elementary Matrix-Matrix Operations (BLAS Level 1)}
 
 {\ViennaCL} provides all vector-vector operations defined at level 1 of BLAS. Tab.~\ref{tab:blas-level-1} shows how these operations can be carried
 out in \ViennaCL. The function interface is compatible with {\ublas},
 thus allowing quick code migration for {\ublas} users.
-
+Element-wise operations and standard operator overloads are available for dense matrices as well.
+The only dense matrix norm provided is \lstinline|norm_frobenius()| for the Frobenius norm.
 
 \TIP{For full details on level 1 functions, refer to the reference documentation
 located in \texttt{doc/doxygen/}}
 
+\NOTE{Mixing operations between objects of different scalar types is not supported. Convert the data manually on the host if needed.}
+
 
 \begin{table}[tb]
 \begin{center}
 \begin{tabular}{l|l|p{6cm}}
 Verbal & Mathematics & ViennaCL\\
 \hline
-swap    & $x \leftrightarrow y$ & \texttt{swap(x,y);} \\
-stretch    & $x \leftarrow \alpha x$ & \texttt{x *= alpha;} \\
-assignment & $y \leftarrow x$ & \texttt{y = x;} \\
-multiply add & $y \leftarrow \alpha x + y$ & \texttt{y += alpha * x;} \\
-multiply subtract & $y \leftarrow \alpha x - y$ & \texttt{y -= alpha * x;} \\
-inner dot product & $\alpha \leftarrow x^{\mathrm{T}} y$ & \texttt{inner\_prod(x,y);} \\
-$L^1$ norm & $\alpha \leftarrow \Vert x \Vert_1$ & \texttt{alpha = norm\_1(x);} \\
-$L^2$ norm & $\alpha \leftarrow \Vert x \Vert_2$ & \texttt{alpha = norm\_2(x);} \\
-$L^\infty$ norm & $\alpha \leftarrow \Vert x \Vert_\infty$ & \texttt{alpha = norm\_inf(x);} \\
-$L^\infty$ norm index& $i \leftarrow \max_i \vert x_i \vert$ & \texttt{i = index\_norm\_inf(x);} \\
-plane rotation & $(x,y) \leftarrow (\alpha x + \beta y, -\beta x + \alpha y)$ &
-\texttt{plane\_rotation(alpha, beta, x, y);} \\
+swap    & $x \leftrightarrow y$ & \lstinline|swap(x,y);| \\
+stretch    & $x \leftarrow \alpha x$ & \lstinline|x *= alpha;| \\
+assignment & $y \leftarrow x$ & \lstinline|y = x;| \\
+multiply add & $y \leftarrow \alpha x + y$ & \lstinline|y += alpha * x;| \\
+multiply subtract & $y \leftarrow \alpha x - y$ & \lstinline|y -= alpha * x;| \\
+inner dot product & $\alpha \leftarrow x^{\mathrm{T}} y$ & \lstinline|inner_prod(x,y);| \\
+$L^1$ norm & $\alpha \leftarrow \Vert x \Vert_1$ & \lstinline|alpha = norm_1(x);| \\
+$L^2$ norm & $\alpha \leftarrow \Vert x \Vert_2$ & \lstinline|alpha = norm_2(x);| \\
+$L^\infty$ norm & $\alpha \leftarrow \Vert x \Vert_\infty$ & \lstinline|alpha = norm_inf(x);| \\
+$L^\infty$ norm index& $i \leftarrow \max_i \vert x_i \vert$ & \lstinline|i = index_norm_inf(x);| \\
+plane rotation & $(x,y) \leftarrow (\alpha x + \beta y, -\beta x + \alpha y)$ & \lstinline|plane_rotation(a, b, x, y);| \\
+\hline
+elementwise product  & $y_i \leftarrow x_i \cdot z_i$ & \lstinline|y = element_prod(x,z);| \\
+elementwise division & $y_i \leftarrow x_i / z_i$ & \lstinline|y = element_div(x,z);| \\
+elementwise power    & $y_i \leftarrow x_i^{z_i}$ & \lstinline|y = element_pow(x,z);| \\
+\hline
+elementwise modulus (ints)   & $y_i \leftarrow |x_i|$ & \lstinline|y = element_abs(x);| \\
+elementwise modulus (floats) & $y_i \leftarrow |x_i|$ & \lstinline|y = element_fabs(x);| \\
+elementwise acos  & $y_i \leftarrow \textrm{acos}(x_i)$ & \lstinline|y = element_acos(x);| \\
+elementwise asin  & $y_i \leftarrow \textrm{asin}(x_i)$ & \lstinline|y = element_asin(x);| \\
+elementwise atan  & $y_i \leftarrow \textrm{atan}(x_i)$ & \lstinline|y = element_atan(x);| \\
+elementwise ceil  & $y_i \leftarrow \lceil x_i \rceil$ & \lstinline|y = element_ceil(x);| \\
+elementwise cos   & $y_i \leftarrow \textrm{cos}(x_i)$ & \lstinline|y = element_cos(x);| \\
+elementwise cosh  & $y_i \leftarrow \textrm{cosh}(x_i)$ & \lstinline|y = element_cosh(x);| \\
+elementwise exp   & $y_i \leftarrow \textrm{exp}(x_i)$ & \lstinline|y = element_exp(x);| \\
+elementwise floor & $y_i \leftarrow \lfloor x_i \rfloor $ & \lstinline|y = element_floor(x);| \\
+elementwise log (base e)  & $y_i \leftarrow \textrm{ln}(x_i)$ & \lstinline|y = element_log(x);| \\
+elementwise log (base 10) & $y_i \leftarrow \textrm{log}_{10}(x_i)$ & \lstinline|y = element_log10(x);| \\
+elementwise sin  & $y_i \leftarrow \textrm{sin}(x_i)$ & \lstinline|y = element_sin(x);| \\
+elementwise sinh & $y_i \leftarrow \textrm{sinh}(x_i)$ & \lstinline|y = element_sinh(x);| \\
+elementwise sqrt & $y_i \leftarrow \textrm{sqrt}(x_i)$ & \lstinline|y = element_sqrt(x);| \\
+elementwise tan  & $y_i \leftarrow \textrm{tan}(x_i)$ & \lstinline|y = element_tan(x);| \\
+elementwise tanh & $y_i \leftarrow \textrm{tanh}(x_i)$ & \lstinline|y = element_tanh(x);| \\
 \end{tabular}
 \caption{BLAS level 1 routines mapped to {\ViennaCL}. Note that the free functions reside in namespace \texttt{viennacl::linalg}}
 \label{tab:blas-level-1}
 \end{center}
 \end{table}
- 
+
 \section{Matrix-Vector Operations (BLAS Level 2)}
 The interface for level 2 BLAS functions in {\ViennaCL} is similar to that of
 {\ublas} and shown in Tab.~\ref{tab:blas-level-2}.
@@ -43,6 +69,8 @@ The interface for level 2 BLAS functions in {\ViennaCL} is similar to that of
 \TIP{For full details on level 2 functions, refer to the reference documentation
 located in \texttt{doc/doxygen/}}
 
+\NOTE{Mixing operations between objects of different scalar types is not supported. Convert the data manually on the host if needed.}
+
 
 \begin{table}[tb]
 \begin{center}
@@ -50,24 +78,24 @@ located in \texttt{doc/doxygen/}}
 \begin{tabular}{p{4cm}|l|p{7cm}}
 Verbal & Mathematics & ViennaCL\\
 \hline
-matrix vector product & $y \leftarrow A x$ & \texttt{y = prod(A, x);} \\
-matrix vector product & $y \leftarrow A^\mathrm{T} x$ & \texttt{y = prod(trans(A), x);} \\
-inplace mv product & $x \leftarrow A x$ & \texttt{x = prod(A, x);} \\
-inplace mv product & $x \leftarrow A^\mathrm{T} x$ & \texttt{x = prod(trans(A), x);} \\
+matrix vector product & $y \leftarrow A x$ & \lstinline|y = prod(A, x);| \\
+matrix vector product & $y \leftarrow A^\mathrm{T} x$ & \lstinline|y = prod(trans(A), x);| \\
+inplace mv product & $x \leftarrow A x$ & \lstinline|x = prod(A, x);| \\
+inplace mv product & $x \leftarrow A^\mathrm{T} x$ & \lstinline|x = prod(trans(A), x);| \\
 \hline
-scaled product add & $y \leftarrow \alpha A x + \beta y$ & \texttt{y = alpha * prod(A, x) + beta * y} \\
-scaled product add & $y \leftarrow \alpha A^{\mathrm T} x + \beta y$ & \texttt{y = alpha * prod(trans(A), x) + beta * y} \\
+scaled product add & $y \leftarrow \alpha A x + \beta y$ & \lstinline|y = alpha * prod(A, x) + beta * y| \\
+scaled product add & $y \leftarrow \alpha A^{\mathrm T} x + \beta y$ & \lstinline|y = alpha * prod(trans(A), x) + beta * y| \\
 \hline
-tri. matrix solve & $y \leftarrow A^{-1} x$ & \texttt{y = solve(A, x, tag);} \\
-tri. matrix solve & $y \leftarrow A^\mathrm{T^{-1}} x$ & \texttt{y = solve(trans(A), x, tag);} \\
-inplace solve & $x \leftarrow A^{-1} x$ & \texttt{inplace\_solve(A, x, tag);} \\
-inplace solve & $x \leftarrow A^\mathrm{T^{-1}} x$ & \texttt{inplace\_solve(trans(A), x, tag);} \\
+tri. matrix solve & $y \leftarrow A^{-1} x$ & \lstinline|y = solve(A, x, tag);| \\
+tri. matrix solve & $y \leftarrow A^\mathrm{T^{-1}} x$ & \lstinline|y = solve(trans(A), x, tag);| \\
+inplace solve & $x \leftarrow A^{-1} x$ & \lstinline|inplace_solve(A, x, tag);| \\
+inplace solve & $x \leftarrow A^\mathrm{T^{-1}} x$ & \lstinline|inplace_solve(trans(A), x, tag);| \\
 \hline
-rank 1 update & $A \leftarrow \alpha x y^{\mathrm T} + A$ & \texttt{A += alpha * outer\_prod(x,y);} \\
-symm. rank 1 update & $A \leftarrow \alpha x x^{\mathrm T} + A$ & \texttt{A += alpha * outer\_prod(x,x);} \\
-rank 2 update & $A \leftarrow \alpha (x y^{\mathrm T} + y x^{\mathrm T}) + A$ & \texttt{A += alpha * outer\_prod(x,y); A += alpha * outer\_prod(y,x);} \\
+rank 1 update & $A \leftarrow \alpha x y^{\mathrm T} + A$ & \lstinline|A += alpha * outer_prod(x,y);| \\
+symm. rank 1 update & $A \leftarrow \alpha x x^{\mathrm T} + A$ & \lstinline|A += alpha * outer_prod(x,x);| \\
+rank 2 update & $A \leftarrow \alpha (x y^{\mathrm T} + y x^{\mathrm T}) + A$ & \lstinline|A += alpha * outer_prod(x,y);| \lstinline|A += alpha * outer_prod(y,x);| \\
 \end{tabular}
-\caption{BLAS level 2 routines mapped to \ViennaCL. Note that the free functions reside in namespace \texttt{viennacl::linalg}}
+\caption{BLAS level 2 routines mapped to \ViennaCL. Note that the free functions reside in namespace \texttt{viennacl::linalg}. \lstinline|tag| is one out of \lstinline|lower_tag|, \lstinline|unit_lower_tag|, \lstinline|upper_tag|, and \lstinline|unit_upper_tag|.}
 \label{tab:blas-level-2}
 \end{center}
 \end{table}
@@ -85,45 +113,38 @@ be carried out immediately, as is shown in the tutorial located in
 
 As for performance, {\ViennaCL} yields decent performance gains at BLAS level
 3 on mid- to high-end GPUs compared to CPU implementations using a single core
-only. Nevertheless, one must not expect to obtain the reported peak performance
-of hundreds of GFLOPs for the multiplication of arbitrary matrices. These rates
-can typically only obtained when tailoring the compute kernel(s) to a
-particular device and certain matrix dimensions, while {\ViennaCL} provides
+only. However, highest performance is usually obtained only with careful tuning to the respective target device.
+Generally, {\ViennaCL} provides
 kernels that represent a good compromise between efficiency and portability
 among a large number of different devices and device types.
 
+\TIP{
+For certain matrix dimensions, typically multiples of 64 or 128, {\ViennaCL} also provides tuned kernels reaching over 1 TFLOP in single precision (AMD HD 7970).
+}
+
+\NOTE{Mixing operations between objects of different scalar types is not supported. Convert the data manually on the host if needed.}
+
+
 \begin{table}[tb]
 \begin{center}
 \renewcommand{\arraystretch}{1.2}
 \begin{tabular}{p{4cm}|l|p{7.5cm}}
 Verbal & Mathematics & ViennaCL\\
 \hline
-matrix-matrix product & $C \leftarrow A \times B$ & \texttt{C = prod(A, B);}
-\\
-matrix-matrix product & $C \leftarrow A \times B^\mathrm{T}$ & \texttt{C =
-prod(A, trans(B));} \\
-matrix-matrix product & $C \leftarrow A^\mathrm{T} \times B$ & \texttt{C =
-prod(trans(A), B);} \\
-matrix-matrix product & $C \leftarrow A^\mathrm{T} \times B^\mathrm{T}$ &
-\texttt{C = prod(trans(A), trans(B));} \\
+matrix-matrix product & $C \leftarrow A \times B$ & \lstinline|C = prod(A, B);| \\
+matrix-matrix product & $C \leftarrow A \times B^\mathrm{T}$ & \lstinline|C = prod(A, trans(B));| \\
+matrix-matrix product & $C \leftarrow A^\mathrm{T} \times B$ & \lstinline|C = prod(trans(A), B);| \\
+matrix-matrix product & $C \leftarrow A^\mathrm{T} \times B^\mathrm{T}$ & \lstinline|C = prod(trans(A), trans(B));| \\
 \hline
-tri. matrix solve & $C \leftarrow A^{-1} B$ & \texttt{C = solve(A, B, tag);}
-\\
-tri. matrix solve & $C \leftarrow A^\mathrm{T^{-1}} B$ & \texttt{C =
-solve(trans(A), B, tag);} \\
-tri. matrix solve & $C \leftarrow A^{-1} B^\mathrm{T}$ & \texttt{C = solve(A,
-trans(B), tag);} \\
-tri. matrix solve & $C \leftarrow A^\mathrm{T^{-1}} B^\mathrm{T}$ & \texttt{C
-= solve(trans(A), trans(B), tag);} \\
+tri. matrix solve & $C \leftarrow A^{-1} B$ & \lstinline|C = solve(A, B, tag);| \\
+tri. matrix solve & $C \leftarrow A^\mathrm{T^{-1}} B$ & \lstinline|C = solve(trans(A), B, tag);| \\
+tri. matrix solve & $C \leftarrow A^{-1} B^\mathrm{T}$ & \lstinline|C = solve(A, trans(B), tag);| \\
+tri. matrix solve & $C \leftarrow A^\mathrm{T^{-1}} B^\mathrm{T}$ & \lstinline|C = solve(trans(A), trans(B), tag);| \\
 %
-inplace solve & $B \leftarrow A^{-1} B$ & \texttt{inplace\_solve(A, trans(B),
-tag);} \\
-inplace solve & $B \leftarrow A^\mathrm{T^{-1}} B$ &
-\texttt{inplace\_solve(trans(A), x, tag);} \\
-inplace solve & $B \leftarrow A^{-1} B^\mathrm{T}$ & \texttt{inplace\_solve(A,
-trans(B), tag);} \\
-inplace solve & $B \leftarrow A^\mathrm{T^{-1}} B^\mathrm{T}$ &
-\texttt{inplace\_solve(trans(A), x, tag);} \\
+inplace solve & $B \leftarrow A^{-1} B$ & \lstinline|inplace_solve(A, B, tag);| \\
+inplace solve & $B \leftarrow A^\mathrm{T^{-1}} B$ & \lstinline|inplace_solve(trans(A), B, tag);| \\
+inplace solve & $B \leftarrow A^{-1} B^\mathrm{T}$ & \lstinline|inplace_solve(A, trans(B), tag);| \\
+inplace solve & $B \leftarrow A^\mathrm{T^{-1}} B^\mathrm{T}$ & \lstinline|inplace_solve(trans(A), trans(B), tag);| \\
 \end{tabular}
 \caption{BLAS level 3 routines mapped to \ViennaCL. Note that the free functions
 reside in namespace \texttt{viennacl::linalg}}
@@ -131,3 +152,70 @@ reside in namespace \texttt{viennacl::linalg}}
 \end{center}
 \end{table}
 
+\section{Initializer Types}
+
+\NOTE{Initializer types in {\ViennaCLversion} can currently only be used for initializing vectors and matrices, not for computations!}
+
+In order to initialize vectors, the following initializer types are provided, again similar to {\ublas}:
+\begin{center}
+\begin{tabular}{|l|p{10cm}|}
+ \hline
+ \lstinline|unit_vector<T>(s, i)| & Unit vector of size $s$ with entry $1$ at index $i$, zero elsewhere. \\
+ \hline
+ \lstinline|zero_vector<T>(s)| & Vector of size $s$ with all entries being zero. \\
+ \hline
+ \lstinline|scalar_vector<T>(s, v)| & Vector of size $s$ with all entries equal to $v$. \\
+ \hline
+ \lstinline|random_vector<T>(s, d)| & Vector of size $s$ with all entries random according to the distribution specified by $d$. \\
+ \hline
+\end{tabular}
+\end{center}
+For example, to initialize a vector \lstinline|v1| with all $42$ entries being $42.0$, use
+\begin{lstlisting}
+ viennacl::vector<float> v1 = viennacl::scalar_vector<float>(42, 42.0f);
+\end{lstlisting}
+
+Similarly the following initializer types are available for matrices:
+\begin{center}
+\begin{tabular}{|l|p{10cm}|}
+ \hline
+ \lstinline|identity_matrix<T>(s)| & Identity matrix of dimension $s \times s$. \\
+ \hline
+ \lstinline|zero_matrix<T>(s1, s2)| & Matrix of size $s_1 \times s_2$ with all entries being zero. \\
+ \hline
+ \lstinline|scalar_matrix<T>(s1, s2, v)| & Matrix of size $s_1 \times s_2$ with all entries equal to $v$. \\
+ \hline
+ \lstinline|random_matrix<T>(s1, s2, d)| & Matrix of size $s_1 \times s_2$ with all entries random according to the distribution specified by $d$. \\
+ \hline
+\end{tabular}
+\end{center}
+
+\section{Row, Column, and Diagonal Extraction}
+For many algorithms it is of interest to extract a single row or column of a dense matrix, or to access the matrix diagonal.
+This is provided in the same way as for Boost.uBLAS through the free functions \lstinline|row()|, \lstinline|column()|, and \lstinline|diag()|:
+\begin{lstlisting}
+ // A is a viennacl::matrix<T>
+ // Extract 5-th row of A, then overwrite with 6-th row:
+ viennacl::vector<T> r = viennacl::row(A, 4);
+ r = viennacl::row(A, 5);
+ 
+ // Extract 4-th column of A, then overwrite with second column:
+ viennacl::vector<T> c = viennacl::column(A, 3);
+ c = viennacl::column(A, 1);
+
+ // Extract diagonal:
+ viennacl::vector<T> d = viennacl::diag(A);
+\end{lstlisting}
+The function \lstinline|diag| can also be used to create a matrix which has the provided vector entries in the off-diagonal:
+\begin{lstlisting}
+ // Create the matrix
+ // 0 1 0 0
+ // 0 0 2 0
+ // 0 0 0 3
+ // 0 0 0 0
+ viennacl::vector<float> v(3);
+ v[0] = 1.0f; v[1] = 2.0f; v[2] = 3.0f;
+ viennacl::matrix<float> A = viennacl::diag(v, 1);
+\end{lstlisting}
+This is similar to MATLAB's \lstinline|diag()| function.
+
+
diff --git a/doc/manual/other-libs.tex b/doc/manual/other-libs.tex
index 56de0a3..b4d71f9 100644
--- a/doc/manual/other-libs.tex
+++ b/doc/manual/other-libs.tex
@@ -8,13 +8,13 @@ Please feel free to suggest additional libraries for which an interface should b
 
 Since it is unlikely that all third-party libraries for which {\ViennaCL} provides interfaces are installed
 on the target machine, the wrappers are disabled by default. To selectively enable the wrappers,
-the appropriate preprocessor constants \lstinline|VIENNACL_HAVE_XXXX| have to be defined \emph{prior to any 
+the appropriate preprocessor constants \lstinline|VIENNACL_WITH_XXXX| have to be defined \emph{prior to any
 \lstinline|\#include| statements for {\ViennaCL} headers}. This can for example be assured by passing the
-preprocessor constant directly when launching the compiler. With \lstinline|GCC| this is for instance 
+preprocessor constant directly when launching the compiler. With \lstinline|GCC| this is for instance
 achieved by the \lstinline|-D| switch.
 
-\section{\ublas}
-Since all types in {\ViennaCL} have to same interface as their counterparts in {\ublas},
+\section{Boost.uBLAS}
+Since all types in {\ViennaCL} have the same interface as their counterparts in {\ublas},
 most code written for {\ViennaCL} objects remains valid when using {\ublas} objects.
 \begin{lstlisting}
 //Option 1: Using ViennaCL:
@@ -24,8 +24,8 @@ using namespace viennacl::linalg;
 //Option 2: Using ublas:
 //using namespace boost::numeric::ublas;
 
-matrix<float> dense_matrix(5,5);
-vector<float> dense_vector(5,5);
+matrix<float>            dense_matrix(5,5);
+vector<float>            dense_vector(5,5);
 compressed_matrix<float> sparse_matrix(1000, 1000);
 
 //fill with data:
@@ -41,17 +41,17 @@ The above code is valid for either the {\ViennaCL} namespace declarations, or th
 iterative solvers are not part of {\ublas} and therefore the explicit namespace specification is required. More examples
for the exchangeability of {\ublas} and {\ViennaCL} can be found in the tutorials in the \texttt{examples/tutorials/} folder.
 
-When using the iterative solvers, the preprocessor constant \texttt{VIENNACL\_HAVE\_UBLAS} must be defined prior to any other {\ViennaCL} include statements.
+When using the iterative solvers, the preprocessor constant \texttt{VIENNACL\_WITH\_UBLAS} must be defined prior to any other {\ViennaCL} include statements.
 This is essential for enabling the respective wrappers.
 
 \TIP{Refer in particular to \texttt{iterative-ublas.cpp} for a complete example on iterative solvers using {\ublas} types.}
 
 \section{\Eigen}
-To copy data from {\Eigen} \cite{eigen} objects to {\ViennaCL}, the \texttt{copy()}-functions are used just as for {\ublas} and STL types:
+To copy data from {\Eigen} \cite{eigen} objects (version 3.x.y) to {\ViennaCL}, the \texttt{copy()}-functions are used just as for {\ublas} and STL types:
 \begin{lstlisting}
  //from Eigen to ViennaCL
- viennacl::copy(eigen_vector, vcl_vector);
- viennacl::copy(eigen_densematrix, vcl_densematrix);
+ viennacl::copy(eigen_vector,       vcl_vector);
+ viennacl::copy(eigen_densematrix,  vcl_densematrix);
  viennacl::copy(eigen_sparsematrix, vcl_sparsematrix);
 \end{lstlisting}
 In addition, the STL-compliant iterator-version of \texttt{viennacl::copy()} taking three arguments can be used for copying vector data.
@@ -59,8 +59,8 @@ Here, all types prefixed with \texttt{eigen} are {\Eigen} types, the prefix \tex
 Similarly, the transfer from {\ViennaCL} back to {\Eigen} is accomplished by
 \begin{lstlisting}
  //from ViennaCL to Eigen
- viennacl::copy(vcl_vector, eigen_vector);
- viennacl::copy(vcl_densematrix, eigen_densematrix);
+ viennacl::copy(vcl_vector,       eigen_vector);
+ viennacl::copy(vcl_densematrix,  eigen_densematrix);
  viennacl::copy(vcl_sparsematrix, eigen_sparsematrix);
 \end{lstlisting}
 
@@ -71,7 +71,7 @@ The iterative solvers in {\ViennaCL} can also be used directly with {\Eigen} obj
   eigen_result = solve(eigen_matrix, eigen_rhs, bicgstab_tag());
   eigen_result = solve(eigen_matrix, eigen_rhs, gmres_tag());
 \end{lstlisting}
-When using the iterative solvers with {\Eigen}, the preprocessor constant \texttt{VIENNACL\_HAVE\_EIGEN} must be defined prior to any other {\ViennaCL} include statements.
+When using the iterative solvers with {\Eigen}, the preprocessor constant \texttt{VIENNACL\_WITH\_EIGEN} must be defined prior to any other {\ViennaCL} include statements.
 This is essential for enabling the respective wrappers.
 
 \TIP{Refer to \texttt{iterative-eigen.cpp} and \texttt{eigen-with-viennacl.cpp} for complete examples.}
@@ -82,8 +82,8 @@ This is essential for enabling the respective wrappers.
 The following lines demonstrate how {\ViennaCL} types are filled with data from {\MTL} \cite{mtl4} objects:
 \begin{lstlisting}
  //from MTL4 to ViennaCL
- viennacl::copy(mtl4_vector, vcl_vector);
- viennacl::copy(mtl4_densematrix, vcl_densematrix);
+ viennacl::copy(mtl4_vector,       vcl_vector);
+ viennacl::copy(mtl4_densematrix,  vcl_densematrix);
  viennacl::copy(mtl4_sparsematrix, vcl_sparsematrix);
 \end{lstlisting}
 In addition, the STL-compliant iterator-version of \texttt{viennacl::copy()} taking three arguments can be used for copying vector data.
@@ -91,8 +91,8 @@ Here, all types prefixed with \texttt{mtl4} are {\MTL} types, the prefix \texttt
 Similarly, the transfer from {\ViennaCL} back to {\MTL} is accomplished by
 \begin{lstlisting}
  //from ViennaCL to MTL4
- viennacl::copy(vcl_vector, mtl4_vector);
- viennacl::copy(vcl_densematrix, mtl4_densematrix);
+ viennacl::copy(vcl_vector,       mtl4_vector);
+ viennacl::copy(vcl_densematrix,  mtl4_densematrix);
  viennacl::copy(vcl_sparsematrix, mtl4_sparsematrix);
 \end{lstlisting}
 
@@ -105,7 +105,7 @@ Even though {\MTL} provides its own set of iterative solvers, the iterative solv
 \end{lstlisting}
 Our internal tests have shown that the execution time of {\MTL} solvers is equal to {\ViennaCL} solvers when using {\MTL} types.
 
-When using the iterative solvers with {\MTL}, the preprocessor constant \texttt{VIENNACL\_HAVE\_MTL4} must be defined prior to any other {\ViennaCL} include statements.
+When using the iterative solvers with {\MTL}, the preprocessor constant \texttt{VIENNACL\_WITH\_MTL4} must be defined prior to any other {\ViennaCL} include statements.
 This is essential for enabling the respective wrappers.
 
 \TIP{Refer to \texttt{iterative-mtl4.cpp} and \texttt{mtl4-with-viennacl.cpp} for complete examples.}
diff --git a/doc/manual/setup.tex b/doc/manual/setup.tex
index 3ac125b..06d059d 100644
--- a/doc/manual/setup.tex
+++ b/doc/manual/setup.tex
@@ -39,7 +39,7 @@
 \newrgbcolor{notebg}         {0.99609375 0.8828125 0.921875}  % PSTricks colors
 \newrgbcolor{tipbg}          {0.98828125 0.99609375 0.78515625}
 \newrgbcolor{ipdbg}          {0.85  0.85  0.93}
-\definecolor{headingfg} {rgb}{0.2,0.2,0.6}  
+\definecolor{headingfg} {rgb}{0.2,0.2,0.6}
 \newrgbcolor{headingfg}      {0.2 0.2 0.6}  % PSTricks colors
 
 \newlength{\IpdBoxWidth}
diff --git a/doc/manual/shared-lib.tex b/doc/manual/shared-lib.tex
new file mode 100644
index 0000000..1f31db6
--- /dev/null
+++ b/doc/manual/shared-lib.tex
@@ -0,0 +1,14 @@
+\chapter{Shared Library} \label{chap:shared-lib}
+In order to open up {\ViennaCL} to other languages such as C, FORTRAN, or Python,
+a shared library is under development in the subfolder \lstinline|libviennacl/|.
+Currently the different BLAS backends for dense linear algebra are available.
+Sparse linear algebra, iterative solvers, etc. will follow in future releases.
+
+The design and calling conventions are very similar to vendor BLAS libraries.
+All functions are prefixed 'ViennaCL'. The three backends provide their functionality
+through functions prefixed \lstinline|ViennaCLCUDA|, \lstinline|ViennaCLOpenCL|, and \lstinline|ViennaCLHost|, respectively.
+Since we consider the standard BLAS interface rather tedious and error-prone, an additional object-oriented interface is provided as well.
+
+Have a look at \lstinline|examples/tutorial/libviennacl.cpp| as well as the tests located at \lstinline|tests/src/libviennacl*| to get an impression on how to use these methods.
+Also, all callable functions in the shared library are listed in the public include file \lstinline|libviennacl/include/viennacl.hpp|.
+Additional documentation will be added incrementally.
\ No newline at end of file
diff --git a/doc/manual/structured-matrices.tex b/doc/manual/structured-matrices.tex
new file mode 100644
index 0000000..ee6abbf
--- /dev/null
+++ b/doc/manual/structured-matrices.tex
@@ -0,0 +1,98 @@
+\chapter{Structured Matrix Types}
+
+\NOTE{Structured matrix types are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may be included in
+future releases!}
+
+There are a number of structured dense matrices for which some algorithms such as matrix-vector products can be computed with much lower computational effort
+than for the general dense matrix case. In the following, four structured dense matrix types included in {\ViennaCL} are discussed.
+Example code can be found in \lstinline|examples/tutorial/structured-matrices.cpp|.
+
+\section{Circulant Matrix}
+A circulant matrix is a matrix of the form
+\begin{align*}
+ \left( \begin{array}{ccccc}
+         c_0 & c_{n-1} & \ldots & c_2 & c_1 \\
+         c_1 & c_0 & c_{n-1} & & c_2 \\
+         \vdots & c_1 & c_0 & \ddots & \vdots \\
+         c_{n-2} & & \ddots & \ddots & c_{n-1} \\
+         c_{n-1} & c_{n-2} & \ldots & c_1 & c_0 \\
+        \end{array} \right)
+\end{align*}
+and available in {\ViennaCL} via
+\begin{lstlisting}
+ #include "viennacl/circulant_matrix.hpp"
+
+ std::size_t s = 42;
+ viennacl::circulant_matrix circ_mat(s, s);
+\end{lstlisting}
+The \lstinline|circulant_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|. Note that writing to a single element of
+the matrix is structure-preserving, e.g.~changing \lstinline|circ_mat(1,2)| will automatically update \lstinline|circ_mat(0,1)|, \lstinline|circ_mat(2,3)| and
+so on.
+
+
+\section{Hankel Matrix}
+A Hankel matrix is a matrix of the form
+\begin{align*}
+ \left( \begin{array}{cccc}
+         a & b & c & d \\
+         b & c & d & e \\
+         c & d & e & f \\
+         d & e & f & g \\
+        \end{array} \right)
+\end{align*}
+and available in {\ViennaCL} via
+\begin{lstlisting}
+ #include "viennacl/hankel_matrix.hpp"
+
+ std::size_t s = 42;
+ viennacl::hankel_matrix hank_mat(s, s);
+\end{lstlisting}
+The \lstinline|hankel_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|. Note that writing to a single element of
+the matrix is structure-preserving, e.g.~changing \lstinline|hank_mat(1,2)| in the example above will also update \lstinline|hank_mat(0,3)|,
+\lstinline|hank_mat(2,1)| and
+\lstinline|hank_mat(3,0)|.
+
+\section{Toeplitz Matrix}
+A Toeplitz matrix is a matrix of the form
+\begin{align*}
+ \left( \begin{array}{cccc}
+         a & b & c & d \\
+         e & a & b & c \\
+         f & e & a & b \\
+         g & f & e & a \\
+        \end{array} \right)
+\end{align*}
+and available in {\ViennaCL} via
+\begin{lstlisting}
+ #include "viennacl/toeplitz_matrix.hpp"
+
+ std::size_t s = 42;
+ viennacl::toeplitz_matrix toep_mat(s, s);
+\end{lstlisting}
+The \lstinline|toeplitz_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|. Note that writing to a single element of
+the matrix is structure-preserving, e.g.~changing \lstinline|toep_mat(1,2)| in the example above will also update \lstinline|toep_mat(0,1)| and
+\lstinline|toep_mat(2,3)|.
+
+
+\section{Vandermonde Matrix}
+A Vandermonde matrix is a matrix of the form
+\begin{align*}
+ \left( \begin{array}{ccccc}
+         1 & \alpha_1 & \alpha_1^2 & \ldots & \alpha_1^{n-1} \\
+         1 & \alpha_2 & \alpha_2^2 & \ldots & \alpha_2^{n-1} \\
+         \vdots & \vdots & \vdots & \ddots & \vdots \\
+         1 & \alpha_m & \alpha_m^2 & \ldots & \alpha_m^{n-1} \\
+        \end{array} \right)
+\end{align*}
+and available in {\ViennaCL} via
+\begin{lstlisting}
+ #include "viennacl/vandermonde_matrix.hpp"
+
+ std::size_t s = 42;
+ viennacl::vandermonde_matrix vand_mat(s, s);
+\end{lstlisting}
+The \lstinline|vandermonde_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|, but restrictions apply. For
+example, the addition or subtraction of two Vandermonde matrices does not yield another Vandermonde matrix. Note that writing to a single element of
+the matrix is structure-preserving, e.g.~changing \lstinline|vand_mat(1,2)| in the example above will automatically update \lstinline|vand_mat(1,3)|,
+\lstinline|vand_mat(1,4)|, etc.
+
diff --git a/doc/manual/tuning.tex b/doc/manual/tuning.tex
index bc49ffe..9b1af0f 100644
--- a/doc/manual/tuning.tex
+++ b/doc/manual/tuning.tex
@@ -1,6 +1,6 @@
-\chapter{Kernel Parameter Tuning} \label{chap:tuning}
+\chapter{OpenCL Kernel Parameter Tuning} \label{chap:tuning}
 The choice of the global and local work sizes for {\OpenCL} kernels typically has a considerable impact on the obtained device performance.
-The default setting in {\ViennaCL} is -- with some exceptions -- to use the same global and local work sizes for each compute kernel.
+The default setting in {\ViennaCL} is -- with some exceptions -- to use the same number of work groups and work items per work group (128) for each compute kernel.
 To obtain highest performance, optimal work sizes have to be determined for each kernel in dependence of the underlying device.
 
 \section{Start Tuning Runs}
diff --git a/doc/manual/types.tex b/doc/manual/types.tex
index eff1245..ecb31a9 100644
--- a/doc/manual/types.tex
+++ b/doc/manual/types.tex
@@ -1,23 +1,21 @@
-\chapter{Basic Types}
+\chapter{Basic Types} \label{chap:basic-types}
 This chapter provides a brief overview of the basic interfaces and usage of the
-provided data types. The term \textit{GPU} refers here and in the following to
-both GPUs and multi-core CPUs accessed via {\OpenCL} and managed by
-{\ViennaCL}. Operations on the various types are explained in
+provided data types. Operations on the various types are explained in
 Chapter \ref{chap:operations}. For full details, refer to the reference pages
-in the folder
-\texttt{doc/doxygen}.
+in the folder \texttt{doc/doxygen}.
 
 \section {Scalar Type}
 The scalar type \lstinline|scalar<T>| with template parameter T
-denoting the underlying CPU scalar type (float and double, if supported - see Tab.~\ref{tab:double-precision-GPUs}) represents a
-single scalar value on the GPU. \lstinline|scalar<T>| is designed to behave much
-like a scalar type on the CPU, but library users have to keep in mind that
-every operation on \lstinline|scalar<T>| requires to launch the appropriate
-compute kernel on the GPU and is thus much slower then the CPU equivalent.
+denoting the underlying CPU scalar type (\lstinline|char|, \lstinline|short|, \lstinline|int|, \lstinline|long|, \lstinline|float| and \lstinline|double|, if supported - see Tab.~\ref{tab:double-precision-GPUs}) represents a
+single scalar value on the computing device. \lstinline|scalar<T>| is designed to behave much
+like a scalar type on conventional host-based CPU processing, but library users have to keep in mind that
+every operation on \lstinline|scalar<T>| may require the launch of an appropriate
+compute kernel on the GPU, thus making the operation much slower than the conventional CPU equivalent.
+Even if the host-based computing backend of {\ViennaCL} is used, some (small) overheads occur.
 
 \NOTE{Be aware that operations between objects of type \lstinline|scalar<T>|
-(e.g.~additions. comparisons) have large overhead. For every operation, a
-separate compute kernel launch is required.}
+(e.g.~additions, comparisons) have large overhead on GPU backends. A
+separate compute kernel launch is required for every operation in such case.}
 
 \subsection{Example Usage}
 The scalar type of {\ViennaCL} can be used just like the built-in
@@ -28,11 +26,11 @@ types, as the following snippet shows:
   viennacl::scalar<float>  gpu_float(3.1415f);
   viennacl::scalar<double> gpu_double = 2.71828;
 
-  //conversions and t
-  cpu_float = gpu_float;  
+  //conversions
+  cpu_float = gpu_float;
   gpu_float = cpu_double;  //automatic transfer and conversion
 
-  cpu_float = gpu_float * 2.0f; 
+  cpu_float = gpu_float * 2.0f;
   cpu_double = gpu_float - cpu_float;
 \end{lstlisting}
 Mixing built-in types with the {\ViennaCL} scalar is usually not a
@@ -41,6 +39,9 @@ arithmetics should be used sparsingly.
 
 \NOTE{In the present version of {\ViennaCL}, it is not possible to assign a \lstinline|scalar<float>| to a \lstinline|scalar<double>| directly.}
 
+\NOTE{Mixing operations between objects of different scalar types is not supported. Convert the data manually on the host if needed.}
+
+
 \subsection{Members}
 Apart from suitably overloaded operators that mimic the behavior of the
 respective CPU counterparts, only a single public member function
@@ -51,7 +52,7 @@ respective CPU counterparts, only a single public member function
 \begin{tabular}{p{6.5cm}|p{8.5cm}}
 Interface & Comment\\
 \hline
-\texttt{v.handle()}   & The GPU handle \\
+\texttt{v.handle()}   & The memory handle (CPU, {\CUDA}, or {\OpenCL}) \\
 \end{tabular}
 \caption{Interface of \texttt{scalar$<$T$>$} in \ViennaCL. Destructors and
 operator overloads for BLAS are not listed.}
@@ -64,20 +65,15 @@ operator overloads for BLAS are not listed.}
 \section{Vector Type}
 The main vector type in {\ViennaCL} is \texttt{vector$<$T, alignment$>$},
 representing a chunk of memory on the compute device. \texttt{T} is the
-underlying scalar type (either \texttt{float} or \texttt{double} if supported, cf.~Tab.~\ref{tab:double-precision-GPUs}, complex types
-are not supported in \ViennaCLversion) and the optional argument \texttt{alignment} denotes the memory
-the vector is aligned to (in multiples of \texttt{sizeof(T)}). For example, a
-vector with a size of 55 entries and an alignment of 16 will reside in a
-block of memory equal to 64 entries. Memory alignment is fully
-transparent, so from the end-user's point of view, \texttt{alignment} allows to
-tune {\ViennaCL} for maximum speed on the available compute device.
+underlying scalar type (\lstinline|char|, \lstinline|short|, \lstinline|int|, \lstinline|long|, \lstinline|float|, or \lstinline|double| if supported, cf.~Tab.~\ref{tab:double-precision-GPUs}, complex types
+are not supported in \ViennaCLversion). The second template argument \texttt{alignment} is deprecated and should not be provided by the library user.
 
 At construction, \texttt{vector$<$T, alignment$>$} is initialized to have the
 supplied length, but the memory is not initialized to zero. Another difference
 to CPU implementations is that accessing single vector elements is very costly,
 because every time an element is
 accessed, it has to be transferred from the CPU to the compute device or vice
-versa. 
+versa.
 \subsection{Example Usage}
 The following code snippet shows the typical use of the vector type provided by
 {\ViennaCL}. The overloaded function \texttt{copy()} function, which is used similar to
@@ -87,7 +83,7 @@ std::vector<ScalarType>      stl_vec(10);
 viennacl::vector<ScalarType> vcl_vec(10);
 
 //fill the STL vector:
-for (unsigned int i=0; i<vector_size; ++i)
+for (size_t i=0; i<stl_vec.size(); ++i)
   stl_vec[i] = i;
 
 //copy content to GPU vector (recommended initialization)
@@ -120,7 +116,7 @@ the memory can be initialized with zero values using the member function
 Tab.~\ref{tab:vector-interface} for other member functions.
 
 \NOTE{Accessing single elements of a vector using operator() or operator[] is
-very slow! Use with care!}
+very slow for GPUs due to PCI-Express latency! Use with care!}
 
 \begin{table}[tb]
 \begin{center}
@@ -128,18 +124,18 @@ very slow! Use with care!}
 Interface & Comment\\
 \hline
 \texttt{CTOR(n)}    & Constructor with number of entries \\
-\texttt{v(i)}    & Access to the $i$-th element of v (slow!) \\
-\texttt{v[i]}    & Access to the $i$-th element of v (slow!) \\
-\texttt{v.clear()}    & Initialize v with zeros \\
+\texttt{v(i)}       & Access to the $i$-th element of v (slow for GPUs!) \\
+\texttt{v[i]}       & Access to the $i$-th element of v (slow for GPUs!) \\
+\texttt{v.clear()}  & Initialize v with zeros \\
 \texttt{v.resize(n, bool preserve)}    & Resize v to length n. Preserves old values if bool is true. \\
-\texttt{v.begin()}   & Iterator to the begin of the matrix \\
-\texttt{v.end()}   & Iterator to the end of the matrix \\
-\texttt{v.size()}    & Length of the vector \\
-\texttt{v.swap(v2)}   & Swap the content of v with v2 \\
+\texttt{v.begin()}  & Iterator to the beginning of the vector \\
+\texttt{v.end()}    & Iterator to the end of the vector \\
+\texttt{v.size()}   & Length of the vector \\
+\texttt{v.swap(v2)} & Swap the content of v with v2 \\
 \texttt{v.internal\_size()} & Returns the number of entries allocated on the GPU (taking alignment into account) \\
 \texttt{v.empty()}   & Shorthand notation for \texttt{v.size() == 0} \\
 \texttt{v.clear()}   & Sets all entries in v to zero \\
-\texttt{v.handle()}  & Returns the GPU handle (needed for custom kernels, see Chap.~\ref{chap:custom})
+\texttt{v.handle()}  & Returns the memory handle (needed for custom kernels, see Chap.~\ref{chap:custom})
 \end{tabular}
 \caption{Interface of \texttt{vector$<$T$>$} in \ViennaCL. Destructors and
 operator overloads for BLAS are not listed.}
@@ -156,10 +152,10 @@ magnitude. For example:
 for (size_t i=0; i<cpu_vector.size(); ++i)
     cpu_vector(i) = 1e-3f;
 
-// fill a ViennaCL vector - VERY SLOW!!
+// fill a ViennaCL vector - VERY SLOW with GPU backends!!
 for (size_t i=0; i<gpu_vector.size(); ++i)
     vcl_vector(i) = 1e-3f;
-\end{lstlisting} 
+\end{lstlisting}
 The difference in execution speed is typically several orders of magnitude,
 therefore direct vector element access should be used only if a very small
 number of entries is accessed in this way. A much faster initialization is as
@@ -173,14 +169,14 @@ for (long i=0; i<cpu_vector.size(); ++i)
 // fill a vector on GPU with data from CPU - faster versions:
 copy(cpu_vector, vcl_vector);                                   //option 1
 copy(cpu_vector.begin(), cpu_vector.end(), vcl_vector.begin()); //option 2
-\end{lstlisting} 
+\end{lstlisting}
 In this way, setup costs for the CPU vector and the {\ViennaCL} vector are comparable.
 
 \section{Dense Matrix Type}
 \texttt{matrix$<$T, F, alignment$>$} represents a dense matrix with interface listed in
 Tab.~\ref{tab:matrix-interface}. The second optional template argument \texttt{F}
-specifies the storage layout and defaults to \texttt{row\_major}. Since {\ViennaCL} 1.1.0, also \lstinline|column_major| memory layout can be used.
-The third template argument \texttt{alignment} denotes an alignment for the rows and columns for row-major and column-major memory layout (cf.~\texttt{alignment} for the \texttt{vector} type).
+specifies the storage layout and defaults to \texttt{row\_major}. As an alternative, a \lstinline|column_major| memory layout can be used.
+The third template argument \texttt{alignment} denotes an alignment for the rows and columns for row-major and column-major memory layout and should no longer be specified by the user (cf.~\texttt{alignment} for the \texttt{vector} type).
 
 \subsection{Example Usage}
 The use of \texttt{matrix$<$T, F$>$} is similar to that of the counterpart in {\ublas}. The operators are overloaded similarly.
@@ -190,13 +186,13 @@ The use of \texttt{matrix$<$T, F$>$} is similar to that of the counterpart in {\
  viennacl::matrix<float>  vcl_matrix(4, 5);
 
  //fill it up:
- vcl_matrix(0,2) = 1.0; 
- vcl_matrix(1,2) = -1.5; 
- vcl_matrix(2,0) = 4.2; 
- vcl_matrix(3,4) = 3.1415; 
-\end{lstlisting} 
+ vcl_matrix(0,2) = 1.0;
+ vcl_matrix(1,2) = -1.5;
+ vcl_matrix(2,0) = 4.2;
+ vcl_matrix(3,4) = 3.1415;
+\end{lstlisting}
 
-\NOTE{Accessing single elements of a matrix using \texttt{operator()} is very slow! Use with care!}
+\NOTE{Accessing single elements of a matrix using \texttt{operator()} is very slow on GPU backends! Use with care!}
 
 A much better way is to initialize a dense matrix using the provided \texttt{copy()} function:
 \begin{lstlisting}
@@ -205,10 +201,12 @@ copy(cpu_matrix, gpu_matrix);
 
 //copy content from GPU matrix to CPU matrix
 copy(gpu_matrix, cpu_matrix);
-\end{lstlisting} 
+\end{lstlisting}
 The type requirement on the \texttt{cpu\_matrix} is that \texttt{operator()} can be used for accessing entries, that a member function \texttt{size1()} returns the number of rows and that \texttt{size2()} returns the number of columns.
 Please refer to Chap.~\ref{chap:other-libs} for an overview of other libraries for which an overload of \texttt{copy()} is provided.
 
+\NOTE{The internal memory buffer of a \lstinline|matrix<>| is by default padded with zeros so that the internal matrix size is a multiple of e.g.~a power of two.}
+
 \subsection{Members}
 
 The members are listed in Tab.~\ref{tab:matrix-interface}. The usual operator overloads are not listed explicitly
@@ -230,7 +228,7 @@ Interface & Comment\\
 \texttt{mat.size2()}            & Number of columns in mat \\
 \texttt{mat.internal\_size2()}   & Internal number of columns in mat \\
 \texttt{mat.clear()}   & Sets all entries in v to zero \\
-\texttt{mat.handle()}  & Returns the GPU handle (needed for custom kernels, see Chap.~\ref{chap:custom})
+\texttt{mat.handle()}  & Returns the memory handle (needed for custom kernels, see Chap.~\ref{chap:custom})
 \end{tabular}
 \caption{Interface of the dense matrix type \texttt{matrix$<$T, F$>$} in
 \ViennaCL. Constructors, Destructors and operator overloads for BLAS are not
@@ -239,110 +237,10 @@ listed.}
 \end{center}
 \end{table}
 
-\section{Structured Matrix Types}
-
-\NOTE{Structured matrix types are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may be included in
-future releases!}
-
-There are a number of structured dense matrices for which some algorithms such as matrix-vector products can be computed with much lower computational effort
-than for the general dense matrix case. In the following, four structured dense matrix types included in {\ViennaCL} are discussed. 
-Example code can be found in \lstinline|examples/tutorial/structured-matrices.cpp|.
-
-\subsection{Circulant Matrix}
-A circulant matrix is a matrix of the form
-\begin{align*}
- \left( \begin{array}{ccccc}
-         c_0 & c_{n-1} & \ldots & c_2 & c_1 \\
-         c_1 & c_0 & c_{n-1} & & c_2 \\
-         \vdots & c_1 & c_0 & \ddots & \vdots \\
-         c_{n-2} & & \ddots & \ddots & c_{n-1} \\
-         c_{n-1} & c_{n-2} & \hdots & c_1 & c_0 \\
-        \end{array} \right)
-\end{align*}
-and available in {\ViennaCL} via
-\begin{lstlisting}
- #include "viennacl/circulant_matrix.hpp"
-
- std::size_t s = 42;
- viennacl::circulant_matrix circ_mat(s, s);
-\end{lstlisting}
-The \lstinline|circulant_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|. Note that writing to a single element of
-the matrix is structure-preserving, e.g.~changing \lstinline|circ_mat(1,2)| will automatically update \lstinline|circ_mat(0,1)|, \lstinline|circ_mat(2,3)| and
-so on.
-
-
-\subsection{Hankel Matrix}
-A Hankel matrix is a matrix of the form
-\begin{align*}
- \left( \begin{array}{cccc}
-         a & b & c & d \\
-         b & c & d & e \\
-         c & d & e & f \\
-         d & e & f & g \\
-        \end{array} \right)
-\end{align*}
-and available in {\ViennaCL} via
-\begin{lstlisting}
- #include "viennacl/hankel_matrix.hpp"
-
- std::size_t s = 42;
- viennacl::hankel_matrix hank_mat(s, s);
-\end{lstlisting}
-The \lstinline|hankel_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|. Note that writing to a single element of
-the matrix is structure-preserving, e.g.~changing \lstinline|hank_mat(1,2)| in the example above will also update \lstinline|hank_mat(0,3)|,
-\lstinline|hank_mat(2,1)| and
-\lstinline|hank_mat(3,0)|.
-
-\subsection{Toeplitz Matrix}
-A Toeplitz matrix is a matrix of the form
-\begin{align*}
- \left( \begin{array}{cccc}
-         a & b & c & d \\
-         e & a & b & c \\
-         f & e & a & b \\
-         g & f & e & a \\
-        \end{array} \right)
-\end{align*}
-and available in {\ViennaCL} via
-\begin{lstlisting}
- #include "viennacl/toeplitz_matrix.hpp"
-
- std::size_t s = 42;
- viennacl::toeplitz_matrix toep_mat(s, s);
-\end{lstlisting}
-The \lstinline|toeplitz_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|. Note that writing to a single element of
-the matrix is structure-preserving, e.g.~changing \lstinline|toep_mat(1,2)| in the example above will also update \lstinline|toep_mat(0,1)| and
-\lstinline|toep_mat(2,3)|.
-
-
-\subsection{Vandermonde Matrix}
-A Vandermonde matrix is a matrix of the form
-\begin{align*}
- \left( \begin{array}{ccccc}
-         1 & \alpha_1 & \alpha_1^2 & \ldots & \alpha_1^{n-1} \\
-         1 & \alpha_2 & \alpha_2^2 & \ldots & \alpha_2^{n-1} \\
-         1 & \vdots & \vdots & \vdots \\
-         1 & \alpha_m & \alpha_m^2 & \ldots & \alpha_m^{n-1} \\
-        \end{array} \right)
-\end{align*}
-and available in {\ViennaCL} via
-\begin{lstlisting}
- #include "viennacl/vandermonde_matrix.hpp"
-
- std::size_t s = 42;
- viennacl::vandermonde_matrix vand_mat(s, s);
-\end{lstlisting}
-The \lstinline|vandermonde_matrix| type can be manipulated in the same way as the dense matrix type \lstinline|matrix|, but restrictions apply. For
-example, the addition or subtraction of two Vandermonde matrices does not yield another Vandermonde matrix. Note that writing to a single element of
-the matrix is structure-preserving, e.g.~changing \lstinline|vand_mat(1,2)| in the example above will automatically update \lstinline|vand_mat(1,3)|,
-\lstinline|vand_mat(1,4)|, etc.
-
 
 \section{Sparse Matrix Types}
 
-There are two different sparse matrix types provided in {\ViennaCL}, \texttt{compressed\_matrix} and \texttt{coordinate\_matrix}.
-
-\TIP{In {\ViennaCLversion}, the use of \texttt{compressed\_matrix} is encouraged over \texttt{coordinate\_matrix}}
+There are five different sparse matrix types provided in {\ViennaCL}, \lstinline|compressed_matrix|, \lstinline|coordinate_matrix|, \lstinline|ell_matrix|, \lstinline|hyb_matrix|, and \lstinline|compressed_compressed_matrix|.
 
 \subsection{Compressed Matrix}
 \texttt{compressed\_matrix$<$T, alignment$>$} represents a sparse
@@ -368,9 +266,9 @@ Interface & Comment\\
 \texttt{mat.nnz()}		& Number of nonzeroes in mat \\
 \parbox{6cm}{\texttt{mat.resize(m, n, \\
            \hphantom{mat.resize(}bool preserve)}}    & Resize mat to m rows and n columns. Currently, the boolean flag is ignored and entries always discarded. \\
-\texttt{mat.handle1()}  & Returns the GPU handle holding the row indices (needed for custom kernels, see Chap.~\ref{chap:custom}) \\
-\texttt{mat.handle2()}  & Returns the GPU handle holding the column indices  (needed for custom kernels, see Chap.~\ref{chap:custom}) \\
-\texttt{mat.handle()}  & Returns the GPU handle holding the entries (needed for custom kernels, see Chap.~\ref{chap:custom})
+\texttt{mat.handle1()}  & Returns the memory handle holding the row indices (needed for custom kernels, see Chap.~\ref{chap:custom}) \\
+\texttt{mat.handle2()}  & Returns the memory handle holding the column indices  (needed for custom kernels, see Chap.~\ref{chap:custom}) \\
+\texttt{mat.handle()}  & Returns the memory handle holding the entries (needed for custom kernels, see Chap.~\ref{chap:custom})
 \end{tabular}
 \caption{Interface of the sparse matrix type \texttt{compressed\_matrix$<$T, F$>$} in \ViennaCL. Destructors and operator overloads for BLAS are not listed.}
 \label{tab:compressed-matrix-interface}
@@ -385,9 +283,9 @@ There is a direct interfacing with the standard implementation using a vector of
  std::vector< std::map< unsigned int, float> > cpu_sparse_matrix(4);
 
  //fill it up:
- cpu_sparse_matrix[0][2] = 1.0; 
- cpu_sparse_matrix[1][2] = -1.5; 
- cpu_sparse_matrix[3][0] = 4.2; 
+ cpu_sparse_matrix[0][2] =  1.0;
+ cpu_sparse_matrix[1][2] = -1.5;
+ cpu_sparse_matrix[3][0] =  4.2;
 
  //set up a sparse ViennaCL matrix:
  viennacl::compressed_matrix<float>  vcl_sparse_matrix(4, 5);
@@ -417,11 +315,11 @@ For the sparse matrix types in {\ublas}, these requirements are all fulfilled. P
 of other libraries for which an overload of \texttt{copy()} is provided.
 
 \subsubsection{Members}
-The interface is described in Tab.~\ref{tab:compressed-matrix-interface}. 
+The interface is described in Tab.~\ref{tab:compressed-matrix-interface}.
 
 \subsection{Coordinate Matrix}
-In the second sparse matrix type, \texttt{coordinate\_matrix$<$T, alignment$>$}, 
-entries are stored as triplets \texttt{(i,j,val)}, where \texttt{i} is the row index, \texttt{j} is the column index and \texttt{val} is the entry. 
+In the second sparse matrix type, \texttt{coordinate\_matrix$<$T, alignment$>$},
+entries are stored as triplets \texttt{(i,j,val)}, where \texttt{i} is the row index, \texttt{j} is the column index and \texttt{val} is the entry.
 Again, \texttt{T} is the floating point type. The optional \texttt{alignment} defaults to \texttt{1} at present.
 In general, sparse matrices should be set up on the
 CPU and then be pushed to the compute device using \texttt{copy()}, because dynamic memory management of sparse matrices is not provided on {\OpenCL} compute devices such as GPUs.
@@ -435,7 +333,7 @@ Interface & Comment\\
 %\texttt{mat(i,j) const}    & Read-only access to the element in the $i$-th row and the $j$-th column of mat \\
 %\texttt{mat.readFrom(PointerType prows, PointerType pcols, PointerType pentries)} & Fill mat with the values from the supplied piece of memory. \\
 %\texttt{mat.writeTo(PointerType prows, PointerType pcols, PointerType pentries)}  & Fill the supplied piece of memory with values from mat. \\ \\
-\texttt{mat.reserve(num)}    & Reserve memory for up to \texttt{num} nonzero entries \\
+\texttt{mat.reserve(num)}    & Reserve memory for \texttt{num} nonzero entries \\
 \texttt{mat.size1()}            & Number of rows in mat \\
 \texttt{mat.size2()}            & Number of columns in mat \\
 \texttt{mat.nnz()}		& Number of nonzeroes in mat \\
@@ -443,8 +341,8 @@ Interface & Comment\\
            \hphantom{mat.resize(}bool preserve)}}    & Resize mat to m rows and n columns. Currently, the boolean flag is ignored and entries always discarded. \\
 %\texttt{mat.clear()}    & Initialize mat with zeros \\
 \texttt{mat.resize(m, n)}    & Resize mat to m rows and n columns. Does not preserve old values. \\
-\texttt{mat.handle12()}  & Returns the GPU handle holding the row and column indices (needed for custom kernels, see Chap.~\ref{chap:custom}) \\
-\texttt{mat.handle()}  & Returns the GPU handle holding the entries (needed for custom kernels, see Chap.~\ref{chap:custom})
+\texttt{mat.handle12()}  & Returns the memory handle holding the row and column indices (needed for custom kernels, see Chap.~\ref{chap:custom}) \\
+\texttt{mat.handle()}  & Returns the memory handle holding the entries (needed for custom kernels, see Chap.~\ref{chap:custom})
 \end{tabular}
 \caption{Interface of the sparse matrix type \texttt{coordinate\_matrix$<$T, A$>$} in \ViennaCL. Destructors and operator overloads for BLAS are not listed.}
 \label{tab:coordinate-matrix-interface}
@@ -452,22 +350,47 @@ Interface & Comment\\
 \end{table}
 
 \subsubsection{Example Usage}
-The use of \texttt{coordinate\_matrix$<$T, alignment$>$} is similar to that of the first sparse matrix type 
+The use of \texttt{coordinate\_matrix$<$T, alignment$>$} is similar to that of the first sparse matrix type
 \texttt{compressed\_matrix$<$T, alignment$>$}, thus we refer to Sec.~\ref{sec:compressed-matrix-example}
 
 
 \subsubsection{Members}
-The interface is described in Tab.~\ref{tab:coordinate-matrix-interface}. 
+The interface is described in Tab.~\ref{tab:coordinate-matrix-interface}.
 
-\TIP{In {\ViennaCLversion} the use of \lstinline|compressed\_matrix| over \lstinline|coordinate\_matrix| is encouraged due to better performance!}
+%\TIP{In {\ViennaCLversion} the use of \lstinline|compressed\_matrix| over \lstinline|coordinate\_matrix| is encouraged due to better performance!}
+\NOTE{Note that only a few preconditioners work with \lstinline|coordinate_matrix| so far, cf.~Sec.~\ref{sec:preconditioner}.}
 
 
+\subsection{ELL Matrix}
+A sparse matrix in ELL format of type \lstinline|ell_matrix| is stored in a block of memory of size $N \times n_{\max}$, where $N$ is the number of rows of the matrix and $n_{\max}$ is the maximum number of nonzeros per row.
+Rows with less than $n_{\max}$ entries are padded with zeros. In a second memory block, the respective column indices are stored.
 
-\section{Proxies}
-\NOTE{Matrix and vector proxies are experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
-be included in future releases!}
+The ELL format is well suited for matrices where most rows have approximately the same number of nonzeros.
+This is often the case for matrices arising from the discretization of partial differential equations using e.g.~the finite element method.
+On the other hand, the ELL format introduces substantial overhead if the number of nonzeros per row varies a lot.
+
+For an example use of an \lstinline|ell_matrix|, have a look at \lstinline|examples/benchmarks/sparse.cpp|.
+
+\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|ell_matrix| yet.}
+
+\subsection{Hybrid Matrix}
+The higher performance of the ELL format for matrices with approximately the same number of entries per row
+and the higher flexibility of the CSR format are combined in the \lstinline|hyb_matrix| type, where the main part of the system matrix is stored in ELL format and excess entries are stored in CSR format.
+
+For an example use of a \lstinline|hyb_matrix|, have a look at \lstinline|examples/benchmarks/sparse.cpp|.
+
+\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|hyb_matrix| yet.}
+
+\subsection{Compressed Compressed Matrix}
+If only a few rows of a sparse matrix are populated, then the previous sparse matrix formats are fairly expensive in terms of memory consumption.
+This is addressed by the \lstinline|compressed_compressed_matrix<>| format, which is similar to the standard CSR format, but only stores the rows containing nonzero elements.
+An additional array is used to store the global row index $r$ of the $i$-th nonzero row in the sparse matrix $A$.
+
+\NOTE{Note that preconditioners in Sec.~\ref{sec:preconditioner} do not work with \lstinline|compressed_compressed_matrix| yet.}
 
-Similar to {\ublas}, {\ViennaCL} provides \lstinline|range| objects in order to conveniently manipulate dense submatrices and vectors. The functionality is
+
+\section{Proxies}
+Similar to {\ublas}, {\ViennaCL} provides \lstinline|range| and \lstinline|slice| objects in order to conveniently manipulate dense submatrices and vectors. The functionality is
 provided in the headers \lstinline|viennacl/vector_proxy.hpp| and \lstinline|viennacl/matrix_proxy.hpp| respectively.
 A range refers to a contiguous integer interval and is set up as
 \begin{lstlisting}
@@ -475,6 +398,15 @@ A range refers to a contiguous integer interval and is set up as
  std::size_t upper_bound = 7;
  viennacl::range r(lower_bound, upper_bound);
 \end{lstlisting}
+A slice is similar to a range and allows in addition for arbitrary increments (\emph{stride}).
+For example, to create a slice consisting of the indices $2, 5, 8, 11, 14$, the following code can be used:
+\begin{lstlisting}
+ std::size_t start  = 2;
+ std::size_t stride = 3;
+ std::size_t size   = 5;
+ viennacl::slice s(start, stride, size);
+\end{lstlisting}
+
 In order to address a subvector of a vector \lstinline|v| and a submatrix of a matrix \lstinline|M|, the proxy objects \lstinline|v_sub| and \lstinline|M_sub|
 are created as follows:
 \begin{lstlisting}
@@ -484,12 +416,20 @@ are created as follows:
  viennacl::vector_range<VCLVectorType> v_sub(v, r);
  viennacl::matrix_range<VCLMatrixType> M_sub(M, r, r);
 \end{lstlisting}
+As a shortcut, one may use the free function \lstinline|project()| in order to avoid having to write the type explicitly:
+\begin{lstlisting}
+ project(v, r);    //returns a vector_range as above
+ project(M, r, r); //returns a matrix_range as above
+\end{lstlisting}
+In the same way \lstinline|vector_slice|s and \lstinline|matrix_slice|s are set up.
+
 The proxy objects can now be manipulated in the same way as vectors and dense matrices. In particular, operations such as vector proxy additions and matrix
 additions work as usual, e.g.
 \begin{lstlisting}
- vcl_sub += vcl_sub;
- M_sub += M_sub;
+ vcl_sub += vcl_sub; //or project(v, r) += project(v, r);
+ M_sub   += M_sub;   //or project(M, r, r) += project(M, r, r);
 \end{lstlisting}
  Submatrix-Submatrix products are computed in the same manner and are handy for many block-based linear algebra algorithms.
 
-\TIP{Example code can be found in \lstinline|examples/tutorial/vector-range.cpp| and \lstinline|examples/tutorial/matrix-range.cpp|}
\ No newline at end of file
+\TIP{Example code can be found in \lstinline|examples/tutorial/vector-range.cpp| and \lstinline|examples/tutorial/matrix-range.cpp|}
+
diff --git a/doc/manual/versioning.tex b/doc/manual/versioning.tex
index 8c1f314..93d9acc 100644
--- a/doc/manual/versioning.tex
+++ b/doc/manual/versioning.tex
@@ -1,5 +1,5 @@
 
-\chapter*{Versioning}  \addcontentsline{toc}{chapter}{Versioning}
+\chapter{Versioning}%  \addcontentsline{toc}{chapter}{Versioning}
 
 Each release of {\ViennaCL} carries a three-fold version number, given by\\
 \begin{center}
diff --git a/doc/manual/viennacl.bib b/doc/manual/viennacl.bib
index 6206b13..bdda2c3 100644
--- a/doc/manual/viennacl.bib
+++ b/doc/manual/viennacl.bib
@@ -24,6 +24,11 @@
    URL = {http://www.nvidia.com/object/cuda_opencl_new.html}
 }
 % -----------------------------------------------
+@misc{nvidiacuda,
+   title = {{NVIDIA CUDA}},
+   URL = {http://www.nvidia.com/object/cuda_home_new.html}
+}
+% -----------------------------------------------
 @misc{xcode,
    title = {{Xcode Developer Tools}},
    URL = {http://developer.apple.com/technologies/tools/xcode.html}
@@ -70,13 +75,13 @@
 }
 % -----------------------------------------------
 @book{saad-iterative-solution,
-	author = {Saad, Y.  },
-	howpublished = {Paperback},
-	isbn = {0898715342},
-	month = {April},
-	publisher = {{Society for Industrial and Applied Mathematics}},
-	title = {Iterative Methods for Sparse Linear Systems, Second Edition},
-	year = {2003}
+  author = {Saad, Y.  },
+  howpublished = {Paperback},
+  isbn = {0898715342},
+  month = {April},
+  publisher = {{Society for Industrial and Applied Mathematics}},
+  title = {Iterative Methods for Sparse Linear Systems, Second Edition},
+  year = {2003}
 }
 % -----------------------------------------------
 @misc{pugixml,
@@ -130,7 +135,7 @@
  year = {1969},
  pages = {157--172},
  publisher = {ACM},
-} 
+}
 
 @article{lewis:gps-algorithm,
  author = {Lewis, J.~G.},
@@ -141,7 +146,7 @@
  year = {1982},
  pages = {190--194},
  publisher = {ACM},
-} 
+}
 
 @book{golub:matrix-computations,
  author={Golub, G.~H. and Van Loan, C.~F.},
@@ -150,3 +155,21 @@
  year = {1996}
 }
 
+@article{simon:lanczos-pro,
+ author = {Simon, Horst~D.},
+ title = {The Lanczos Algorithm With Partial Reorthogonalization},
+ journal = {Mathematics of Computation},
+ volume = {42},
+ issue = {165},
+ year = {1984},
+ pages = {115-142},
+ publisher = {American Mathematical Society}
+}
+
+@inproceedings{lee:nmf,
+ author = {Lee, D.~D. and Seung, S.~H.},
+ title = {{Algorithms for Non-negative Matrix Factorization}},
+ booktitle = {Advances in Neural Information Processing Systems 13},
+ pages = {556--562},
+ year = {2000},
+}
diff --git a/doc/manual/viennacl.tex b/doc/manual/viennacl.tex
index 4833a62..85c60f8 100644
--- a/doc/manual/viennacl.tex
+++ b/doc/manual/viennacl.tex
@@ -3,7 +3,7 @@
 
 \usepackage{amsmath,amssymb}
 \usepackage{newcent}
-\usepackage{pstricks} 
+\usepackage{pstricks}
 \usepackage{fancyhdr}
 \usepackage[dvips]{graphicx}
 \usepackage{makeidx}
@@ -16,9 +16,13 @@
 \usepackage{epsfig}
 %\usepackage{subfig}
 \usepackage{subfigure}
+\usepackage[toc,page]{appendix}
 
 
-\usepackage[colorlinks=true,linktocpage=true]{hyperref}
+\usepackage[pdfauthor={Karl Rupp et al.},
+            pdftitle={ViennaCL 1.5.1 Manual},
+            colorlinks=true,
+            linktocpage=true]{hyperref}
 
 %% Listings package START
 \usepackage{color}
@@ -55,11 +59,12 @@
 \newcommand{\CMake} {\texttt{CMake}}
 \newcommand{\OpenMP} {\texttt{OpenMP}}
 \newcommand{\OpenCL} {\texttt{OpenCL}}
+\newcommand{\CUDA} {\texttt{CUDA}}
 \newcommand{\ViennaCL} {\texttt{ViennaCL}}
-\newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.2.1}}
-\newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.2.x}}
+\newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.5.1}}
+\newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.5.x}}
 \newcommand{\Boost} {\texttt{Boost}}
-\newcommand{\ublas} {\texttt{ublas}}
+\newcommand{\ublas} {\texttt{uBLAS}}
 \newcommand{\Eigen} {\texttt{Eigen}}
 \newcommand{\MTL} {\texttt{MTL 4}}
 \newcommand{\GCC} {\texttt{GCC}}
@@ -68,6 +73,18 @@
 \include{keywords} % [KR] This is a mess, we should delete unused Minimos-stuff
 \include{setup}
 
+\makeatletter
+\renewcommand\@endpart{\vfil
+              \if@twoside
+                \null
+                \thispagestyle{empty}%
+                \newpage
+              \fi
+              \if@tempswa
+                \twocolumn
+              \fi}
+\makeatother
+
 \begin{document}
 \pagenumbering{roman}
 
@@ -76,7 +93,7 @@
 
 \clearpage
 
-\addtocontents{toc}{\protect\setcounter{tocdepth}{1}} 
+\addtocontents{toc}{\protect\setcounter{tocdepth}{1}}
 \tableofcontents
 %\label{s:ipl:content} \index{IPL!content}
 
@@ -84,23 +101,64 @@
 \pagenumbering{arabic}
 \include{introduction}
 \include{installation}
+
+%%%%%%%%%%%%%%%% Core Functionality %%%%%%%%%%%%%%%%
+
+\part{Core Functionality}
+The {\ViennaCL} core consists of operations and algorithms which are available on all three computing backends ({\CUDA}, host-based, {\OpenCL}).
+These features are considered stable and full support is provided. However, note that performance-characteristics may differ considerably on the different computing backends.
+In particular, the use of GPUs will not pay off if the data is too small, in which case PCI-Express latency is dominant.
+
+
 \include{types}
 \include{operations}
 \include{algorithms}
+\include{other-libs}
+\include{memory-model}
+\include{shared-lib}
+
+
+%%%%%%%%%%%%%%% Addon Functionality %%%%%%%%%%%%%%%%
+
+
+\part{Addon Functionality}
+With the introduction of host-based, {\CUDA}- and {\OpenCL}-enabled computing backends in {\ViennaCL} 1.4.0, certain functionality is not available for all three backends and is listed in the following.
+For example, the {\OpenCL} kernel generator makes sense in the {\OpenCL} computing backend, thus this functionality is moved out of the set of core functionality.
+
+Also, certain functionality is still in experimental stage and might experience interface changes.
+Although all functionality flagged as experimental and listed in this section passes a respective set of tests,
+library users are advised to use them with extra care and be prepared for interface changes when upgrading to a newer version of {\ViennaCL}.
+
+\include{additional-algorithms}
 \include{multi-device}
 \include{custom-kernels}
 \include{custom-contexts}
-\include{tuning}
-\include{other-libs}
-\include{benchmarks}
+%\include{kernel-generation}
+%\include{tuning}
+\include{structured-matrices}
+
+
+%%%%%%%%%%%%%%% Miscellaneous %%%%%%%%%%%%%%%%
+
+\part{Miscellaneous}
 \include{design}
+
+% Appendix
+%\appendix
+%\appendixpage
+%\addappheadtotoc
+
+\begin{appendices}
 \include{versioning}
 \include{changelogs}
 \include{license}
 
+\end{appendices}
+
+
 %\section{Bibliography}
 \bibliographystyle{IEEEtran_v1.13}
-\addcontentsline{toc}{chapter}{Bibliography} 
+\addcontentsline{toc}{chapter}{Bibliography}
 \bibliography{viennacl}
 
 %\cleardoublepage
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 3690af1..e9511cb 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,10 +3,10 @@ set(EXAMPLES_TESTDATA
    testdata/README
    testdata/result65025.txt
    testdata/rhs65025.txt)
-foreach(f IN LISTS EXAMPLES_TESTDATA)
+foreach(f ${EXAMPLES_TESTDATA})
    configure_file(${f} "${CMAKE_CURRENT_BINARY_DIR}/${f}" COPYONLY)
 endforeach()
 
 add_subdirectory(tutorial)
+add_subdirectory(autotuner)
 add_subdirectory(benchmarks)
-add_subdirectory(parameters)
diff --git a/examples/autotuner/CMakeLists.txt b/examples/autotuner/CMakeLists.txt
new file mode 100644
index 0000000..14d7a42
--- /dev/null
+++ b/examples/autotuner/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(${PROJECT_SOURCE_DIR}/external)
+
+if (ENABLE_OPENCL)
+  include_directories(${PROJECT_SOURCE_DIR}/external/)
+
+  foreach(proc vector-axpy_autotuning dot_autotuning gemv_autotuning gemm_autotuning dump_default_kernels)
+    add_executable(${proc} ${proc}.cpp)
+    target_link_libraries(${proc} ${OPENCL_LIBRARIES})
+    set_target_properties(${proc} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+  endforeach()
+
+endif (ENABLE_OPENCL)
diff --git a/examples/autotuner/command-line-utils.hpp b/examples/autotuner/command-line-utils.hpp
new file mode 100644
index 0000000..397ba15
--- /dev/null
+++ b/examples/autotuner/command-line-utils.hpp
@@ -0,0 +1,55 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#ifndef _COMMAND_LINE_UTILS_HPP_
+#define _COMMAND_LINE_UTILS_HPP_
+
+#include <tclap/CmdLine.h>
+
+std::vector<unsigned int> get_values_in_commas(std::string const & s){
+    std::vector<unsigned int> res;
+    std::size_t old_comma_pos = 0, new_comma_pos;
+    while((new_comma_pos = s.find(',',old_comma_pos))!= std::string::npos){
+        res.push_back(atoi(s.substr(old_comma_pos,new_comma_pos).c_str()));
+        old_comma_pos = new_comma_pos+1;
+    }
+    res.push_back(atoi(s.substr(old_comma_pos,s.length()).c_str()));
+    return res;
+}
+
+class pow_2_interval_constraint : public TCLAP::Constraint<std::string>{
+    static bool is_pow_of_two(const unsigned int x){ return ((x != 0) && !(x & (x - 1))); }
+public:
+    bool check(std::string const & s) const{
+        std::vector<unsigned int> vals = get_values_in_commas(s);
+        return vals.size()==2 && is_pow_of_two(vals[0]) && is_pow_of_two(vals[1]);
+    }
+    std::string shortID() const { return "min,max"; }
+    std::string description() const { return "Must be a power of two"; }
+};
+
+class min_max_inc_constraint : public TCLAP::Constraint<std::string>{
+public:
+    bool check(std::string const & s) const{
+        std::vector<unsigned int> vals = get_values_in_commas(s);
+        return vals.size()==3;
+    }
+    std::string shortID() const { return "min,max,inc"; }
+    std::string description() const { return "Must contain minimum value, maximum value and increment"; }
+};
+
+#endif
diff --git a/examples/autotuner/dot_autotuning.cpp b/examples/autotuner/dot_autotuning.cpp
new file mode 100644
index 0000000..5d2b1e1
--- /dev/null
+++ b/examples/autotuner/dot_autotuning.cpp
@@ -0,0 +1,270 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+
+#include <iostream>
+#include <algorithm>
+#include <string>
+
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/generator/autotune.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include "command-line-utils.hpp"
+
+static const unsigned int n_runs = 10;
+
+using namespace viennacl::generator;
+
+typedef std::vector< viennacl::ocl::platform > platforms_type;
+typedef std::vector<viennacl::ocl::device> devices_type;
+typedef std::vector<cl_device_id> cl_devices_type;
+
+
+struct autotuner_options{
+    unsigned int tuning_size;
+
+    std::string scalartype;
+    std::string output_name;
+
+    unsigned int requested_device;
+
+    std::string vector_interval;
+
+    std::string local_size_interval;
+    std::string num_groups_interval;
+
+    std::string decomposition;
+};
+
+autotuner_options get_options(int argc, char* argv[]){
+    try{
+        autotuner_options options;
+
+        TCLAP::CmdLine cmd("GEMM Autotuner", ' ', "0.1");
+
+
+        pow_2_interval_constraint pow_2_interval_cstrt;
+        min_max_inc_constraint min_max_inc_cstrt;
+
+        TCLAP::ValueArg<unsigned int> tuning_size_arg("","tuning-size","Size to use for the autotuning procedure",false,1024*1024,"unsigned int",cmd);
+
+        //Scalartype
+        std::vector<std::string> allowed_scalartypes;
+        allowed_scalartypes.push_back("float");
+        allowed_scalartypes.push_back("double");
+        TCLAP::ValuesConstraint<std::string> allowed_scalartypes_constraint( allowed_scalartypes);
+        TCLAP::ValueArg<std::string> scalartype_arg("s","scalartype","Scalartype to tune the hardware for",true,"float",&allowed_scalartypes_constraint,cmd);
+
+        //Output data file
+        TCLAP::ValueArg<std::string> output_name_arg("o","output","Name of the output data file",true,"gemm_autotuning.dat","string",cmd);
+
+        //Device id
+        TCLAP::ValueArg<unsigned int> requested_device_arg("d","device","ID of the device to use for the autotuning procedure",false,0,"unsigned int",cmd);
+
+        //Vector
+        TCLAP::ValueArg<std::string> vector_interval_arg("","vector","Vector type used in the kernel",false,"1,1",&pow_2_interval_cstrt,cmd);
+
+        //Large blocks
+        TCLAP::ValueArg<std::string> local_size_interval_arg("","local-size","Number of work-item in each work-group. Specify min,max both power of two.",false,"16,1024",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> num_groups_interval_arg("","num-groups","Number of work groups required.",false,"16,1024,16",&min_max_inc_cstrt,cmd);
+
+        //Decomposition
+        std::vector<std::string> allowed_decomposition_method;
+        allowed_decomposition_method.push_back("local");
+        allowed_decomposition_method.push_back("global");
+        allowed_decomposition_method.push_back("all");
+        TCLAP::ValuesConstraint<std::string> allowed_decomposition_method_constraint(allowed_decomposition_method);
+        TCLAP::ValueArg<std::string> decomposition_method_arg("","decomposition","Work decomposition method. If set to \"local\" , the work items within a work group will access contiguous data.",false,"all",&allowed_decomposition_method_constraint,cmd);
+
+        cmd.parse(argc,argv);
+        options.tuning_size = tuning_size_arg.getValue();
+        options.scalartype = scalartype_arg.getValue();
+        options.output_name = output_name_arg.getValue();
+        options.requested_device = requested_device_arg.getValue();
+        options.vector_interval = vector_interval_arg.getValue();
+        options.local_size_interval = local_size_interval_arg.getValue();
+        options.num_groups_interval = num_groups_interval_arg.getValue();
+        options.decomposition = decomposition_method_arg.getValue();
+        return options;
+    }
+    catch (TCLAP::ArgException &e){
+        std::cerr << "error: " << "\"" << e.error() << "\"" << " [for arg " << e.argId() << "]" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+template<class ScalarType>
+struct config{
+    typedef scalar_reduction profile_type;
+    static profile_type create_profile(std::map<std::string, autotune::tuning_param> const & params){
+      return profile_type(viennacl::generator::at(params, std::string("vector")).current(),
+                          viennacl::generator::at(params, std::string("local_size")).current(),
+                          viennacl::generator::at(params, std::string("num_groups")).current(),
+                          viennacl::generator::at(params, std::string("decomposition")).current());
+    }
+    static bool is_invalid(viennacl::ocl::device const & dev, std::map<std::string, autotune::tuning_param> const & params){
+        profile_type prof = create_profile(params);
+        return prof.is_invalid(dev, sizeof(ScalarType));
+    }
+};
+
+template<class ScalarType>
+code_generator::forced_profile_key_type make_key(autotuner_options /*options*/){
+    return code_generator::forced_profile_key_type(SCALAR_REDUCE_TYPE,sizeof(ScalarType));
+}
+
+template<class ScalarType>
+viennacl::scheduler::statement make_statement(autotuner_options /*options*/, viennacl::scalar<ScalarType> const & s, viennacl::vector<ScalarType> const & x, viennacl::vector<ScalarType> const & y){
+    return viennacl::scheduler::statement(s, viennacl::op_assign(), viennacl::linalg::inner_prod(x, y));
+}
+
+template<typename ScalarType>
+double run_benchmark(size_t size, autotuner_options options, typename config<ScalarType>::profile_type const & profile)
+{
+    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+    viennacl::vector<ScalarType> y(size);
+    viennacl::vector<ScalarType> x(size);
+    viennacl::scalar<ScalarType> s = 0;
+    viennacl::scheduler::statement statement = make_statement(options,s,x,y);
+    viennacl::generator::code_generator gen;
+    gen.add(statement,statement.array()[0]);
+    gen.force_profile(make_key<ScalarType>(options), profile);
+    viennacl::generator::enqueue(gen);
+    viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    viennacl::tools::timer timer;
+    timer.start();
+    static const unsigned int n_runs = 1;
+    for(unsigned int r = 0 ; r < n_runs; ++r)
+      viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    double time = timer.get()/(double)n_runs;
+    return 1e-9*2.0*static_cast<double>(size*sizeof(ScalarType))/time;
+}
+
+template<class ScalarType>
+void run_autotune(autotuner_options const & options){
+    typedef config<ScalarType> config_type;
+    typedef typename config_type::profile_type profile_type;
+
+    viennacl::ocl::device const &  device = viennacl::ocl::current_device();
+
+    viennacl::vector<ScalarType> v1(options.tuning_size), v2(options.tuning_size), v3(options.tuning_size), v4(options.tuning_size);
+    viennacl::backend::finish();
+    autotune::tuning_config<config<ScalarType> > conf;
+    std::map<double, typename config<ScalarType>::profile_type> timings;
+    viennacl::scalar<ScalarType> s = 0;
+
+    std::vector<unsigned int> tmp;
+    tmp = get_values_in_commas(options.local_size_interval); std::vector<int> local_size; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) local_size.push_back(i);
+    tmp = get_values_in_commas(options.num_groups_interval); std::vector<int> num_groups; for(unsigned int i=tmp[0] ; i<=tmp[1]; i+=tmp[2]) { num_groups.push_back(i); }
+    tmp = get_values_in_commas(options.vector_interval); std::vector<int> vector; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) vector.push_back(i);
+    std::vector<int> decomposition;
+    if(options.decomposition=="global")
+        decomposition.push_back(0);
+    else if(options.decomposition=="local")
+        decomposition.push_back(1);
+    else{
+        decomposition.push_back(0);
+        decomposition.push_back(1);
+    }
+
+    conf.add_tuning_param("vector",vector);
+    conf.add_tuning_param("local_size",local_size);
+    conf.add_tuning_param("num_groups",num_groups);
+    conf.add_tuning_param("decomposition", decomposition);
+    std::ofstream stream(options.output_name.c_str());
+
+
+    stream << "# ---- DOT AUTOTUNING ----" << std::endl;
+    stream << "#" << "Scalartype : " << options.scalartype << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << device.full_info(1,'#');
+    stream << "#----------------------" << std::endl;
+    stream << "#tuning for size : " << options.tuning_size << std::endl;
+
+    code_generator::forced_profile_key_type key(SCALAR_REDUCE_TYPE, sizeof(ScalarType));
+    viennacl::scheduler::statement statement(s, viennacl::op_assign(), viennacl::linalg::inner_prod(v1, v2));
+    autotune::benchmark(&timings,statement,key,conf,n_runs,&stream);
+
+    //Recompiles for the best profile
+    profile_type best_profile = timings.begin()->second;
+    viennacl::generator::code_generator dummy;
+    dummy.add(statement,statement.array()[0]);
+    dummy.force_profile(key, best_profile);
+    viennacl::generator::enqueue(dummy,true);
+    viennacl::backend::finish();
+
+    stream << "#Benchmarking " << timings.begin()->second << "..." << std::endl;
+    stream << "##Size\tGB/s" << std::endl;
+    for(unsigned int size = 1024 ; size <= 5e7 ; size *=2){
+        double percent = (double)size/1e7*100;
+        std::cout << '\r' << "Benchmarking..." << "[" << std::setprecision(2) << std::setfill (' ') << std::setw(6) << std::fixed  << percent << "%" << "]" << std::flush;
+        stream << "#" << size << "\t" << run_benchmark<ScalarType>(size,options,best_profile) << std::endl;
+    }
+    std::cout << '\r' << "Benchmarking...[100.00%]" << std::endl;
+}
+
+int main(int argc, char* argv[]){
+  typedef std::vector< viennacl::ocl::platform > platforms_type;
+  typedef std::vector<viennacl::ocl::device> devices_type;
+  autotuner_options options = get_options(argc,argv);
+  std::size_t device_counter = 0;
+  platforms_type platforms = viennacl::ocl::get_platforms();
+  for (platforms_type::iterator platform_iter  = platforms.begin();
+       platform_iter != platforms.end();
+       ++platform_iter)
+  {
+    devices_type devices = platform_iter->devices(CL_DEVICE_TYPE_ALL);
+    for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+    {
+      if(device_counter++==options.requested_device){
+        viennacl::ocl::setup_context(options.requested_device,*iter);
+        viennacl::ocl::switch_context(options.requested_device);
+        viennacl::ocl::device const & device = viennacl::ocl::current_device();
+        std::string device_name = device.name();
+        std::transform(device_name.begin(), device_name.end(), device_name.begin(), ::tolower);
+        std::replace(device_name.begin(), device_name.end(),' ', '_');
+        std::cout << "-------------------" << std::endl;
+        std::cout << device.info() << std::endl;
+        std::cout << "Operation : DOT" << std::endl;
+        std::cout << "-------------------" << std::endl;
+        std::cout << "scalatype : " << options.scalartype << std::endl;
+        std::cout << "vector : [" << options.vector_interval << "]" << std::endl;
+        std::cout << "local size : [" << options.local_size_interval << "]" << std::endl;
+        std::cout << "number of groups : [" << options.num_groups_interval << "]" << std::endl;
+        std::cout << "decomposition : [" << options.decomposition << "]" << std::endl;
+        std::cout << "tuning size : " << options.tuning_size << std::endl;
+        std::cout << "-------------------" << std::endl;
+        if(options.scalartype=="float")
+            run_autotune<float>(options);
+        else if(options.scalartype=="double")
+            run_autotune<double>(options);
+      }
+    }
+  }
+  std::cout << "Autotuning complete! Check \"" << options.output_name << "\" for results." << std::endl;
+}
diff --git a/examples/autotuner/dump_default_kernels.cpp b/examples/autotuner/dump_default_kernels.cpp
new file mode 100644
index 0000000..c04c639
--- /dev/null
+++ b/examples/autotuner/dump_default_kernels.cpp
@@ -0,0 +1,81 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <fstream>
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/generator/utils.hpp"
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+void dump_string_to_file(std::string const & filename, std::string const & str){
+  std::ofstream ofs(filename.c_str());
+  ofs << str << std::endl;
+}
+
+template<typename ScalarType>
+void dump_gemm_kernel(std::string const & device_name)
+{
+    std::string scalartype_name = viennacl::generator::utils::type_to_string<ScalarType>::value();
+    viennacl::matrix<ScalarType> A;
+    viennacl::matrix<ScalarType> B;
+    viennacl::matrix<ScalarType> C;
+    ScalarType alpha = ScalarType(1.0);
+    ScalarType beta  = ScalarType(1.0);
+
+    viennacl::scheduler::statement saa(C, viennacl::op_assign(), alpha*viennacl::linalg::prod(A,B) + beta*C);
+    viennacl::scheduler::statement sta(C, viennacl::op_assign(), alpha*viennacl::linalg::prod(trans(A),B) + beta*C);
+    viennacl::scheduler::statement sat(C, viennacl::op_assign(), alpha*viennacl::linalg::prod(A,trans(B)) + beta*C);
+    viennacl::scheduler::statement stt(C, viennacl::op_assign(), alpha*viennacl::linalg::prod(trans(A),trans(B)) + beta*C);
+
+    //OpenCL
+    dump_string_to_file("gemm_aa_" + scalartype_name + "_" + device_name + ".cl", viennacl::generator::get_opencl_program_string(saa));
+//    dump_string_to_file("gemm_ta_" + scalartype_name + "_" + device_name + ".cl", viennacl::generator::get_opencl_program_string(sta));
+//    dump_string_to_file("gemm_at_" + scalartype_name + "_" + device_name + ".cl", viennacl::generator::get_opencl_program_string(sat));
+//    dump_string_to_file("gemm_tt_" + scalartype_name + "_" + device_name + ".cl", viennacl::generator::get_opencl_program_string(stt));
+
+    //CUDA
+    dump_string_to_file("gemm_aa_" + scalartype_name + "_" + device_name + ".cu", viennacl::generator::get_cuda_device_code(saa));
+//    dump_string_to_file("gemm_ta_" + scalartype_name + "_" + device_name + ".cu", viennacl::generator::get_cuda_program_string(sta));
+//    dump_string_to_file("gemm_at_" + scalartype_name + "_" + device_name + ".cu", viennacl::generator::get_cuda_program_string(sat));
+//    dump_string_to_file("gemm_tt_" + scalartype_name + "_" + device_name + ".cu", viennacl::generator::get_cuda_program_string(stt));
+}
+
+int main(){
+  typedef std::vector< viennacl::ocl::platform > platforms_type;
+  unsigned int counter = 0;
+  platforms_type platforms = viennacl::ocl::get_platforms();
+  for (platforms_type::iterator platform_iter  = platforms.begin();
+       platform_iter != platforms.end();
+       ++platform_iter)
+  {
+    typedef std::vector<viennacl::ocl::device> devices_type;
+    devices_type devices = platform_iter->devices(CL_DEVICE_TYPE_ALL);
+    for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+    {
+      unsigned int current_device = counter++;
+      viennacl::ocl::setup_context(current_device,*iter);
+      viennacl::ocl::switch_context(current_device);
+      viennacl::ocl::device const & device = viennacl::ocl::current_device();
+      std::string device_name = device.name();
+      std::transform(device_name.begin(), device_name.end(), device_name.begin(), ::tolower);
+      std::replace(device_name.begin(), device_name.end(),' ', '_');
+      dump_gemm_kernel<float>(device_name);
+    }
+  }
+}
diff --git a/examples/autotuner/gemm_autotuning.cpp b/examples/autotuner/gemm_autotuning.cpp
new file mode 100644
index 0000000..50486f7
--- /dev/null
+++ b/examples/autotuner/gemm_autotuning.cpp
@@ -0,0 +1,382 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define VIENNACL_DEBUG_BUILD
+//#define VIENNACL_DEBUG_ALL
+
+#include <algorithm>
+#include <string>
+#include <iostream>
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/device.hpp"
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/generator/autotune.hpp"
+
+#include "viennacl/tools/timer.hpp"
+
+#include "command-line-utils.hpp"
+
+using namespace viennacl::generator;
+
+static const unsigned int n_runs = 1;
+
+struct autotuner_options{
+
+    std::string layout;
+    std::string scalartype;
+    std::string output_name;
+
+    unsigned int requested_device;
+
+    std::string ms_interval;
+    std::string ks_interval;
+    std::string ns_interval;
+
+    std::string local_size_1_interval;
+    std::string cache_width_interval;
+    std::string local_size_2_interval;
+
+    std::string vector_interval;
+
+    std::string lhs_fetch_method;
+    std::string rhs_fetch_method;
+
+};
+
+autotuner_options get_options(int argc, char* argv[]){
+    try{
+        autotuner_options options;
+
+        TCLAP::CmdLine cmd("GEMM Autotuner", ' ', "0.1");
+
+
+        pow_2_interval_constraint pow_2_interval_cstrt;
+
+        //Layouts
+        std::vector<std::string> allowed_layouts;
+        allowed_layouts.push_back("NN");
+        allowed_layouts.push_back("TN");
+        allowed_layouts.push_back("NT");
+        allowed_layouts.push_back("TT");
+        TCLAP::ValuesConstraint<std::string> allowed_layouts_constraint( allowed_layouts);
+        TCLAP::ValueArg<std::string> layout_arg("l","layout","Layout to tune the hardware for",true,"NN",&allowed_layouts_constraint,cmd);
+
+        //Scalartype
+        std::vector<std::string> allowed_scalartypes;
+        allowed_scalartypes.push_back("float");
+        allowed_scalartypes.push_back("double");
+        TCLAP::ValuesConstraint<std::string> allowed_scalartypes_constraint( allowed_scalartypes);
+        TCLAP::ValueArg<std::string> scalartype_arg("s","scalartype","Scalartype to tune the hardware for",true,"float",&allowed_scalartypes_constraint,cmd);
+
+        //Output data file
+        TCLAP::ValueArg<std::string> output_name_arg("o","output","Name of the output data file",true,"gemm_autotuning.dat","string",cmd);
+
+        //Device id
+        TCLAP::ValueArg<unsigned int> requested_device_arg("d","device","ID of the device to use for the autotuning procedure",false,0,"unsigned int",cmd);
+
+        //Small blocks
+        TCLAP::ValueArg<std::string> ms_interval_arg("","ms","Number of row in each block processed by each work-item. Specify min,max both power of two.",false,"1,8",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> ks_interval_arg("","ks","Increment size for each small block calculation. Specify min,max both power of two.",false,"1,8",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> ns_interval_arg("","ns","Number of column in each block processed by each work-item. Specify min,max both power of two.",false,"1,8",&pow_2_interval_cstrt,cmd);
+
+
+        //Large blocks
+        TCLAP::ValueArg<std::string> local_size_1_interval_arg("","local-size-1","Number of work-item rows in each work-group. Specify min,max both power of two.",false,"2,64",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> cache_width_interval_arg("","cache-width","Increment size for each Large block calculation. Specify min,max both power of two.",false,"16,128",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> local_size_2_interval_arg("","local-size-2","Number of work-item columns in each work-group. Specify min,max both power of two.",false,"2,64",&pow_2_interval_cstrt,cmd);
+
+
+        //Vector
+        TCLAP::ValueArg<std::string> vector_interval_arg("","vector","Vector type used in the kernel",false,"1,4",&pow_2_interval_cstrt,cmd);
+
+        //Storage Type
+        std::vector<std::string> allowed_fetch_method;
+        allowed_fetch_method.push_back("local");
+        allowed_fetch_method.push_back("global");
+        allowed_fetch_method.push_back("all");
+        TCLAP::ValuesConstraint<std::string> allowed_fetch_method_constraint(allowed_fetch_method);
+        TCLAP::ValueArg<std::string> lhs_fetch_method_arg("","lhs-fetch","Method to fetch the LHS.",false,"all",&allowed_fetch_method_constraint,cmd);
+        TCLAP::ValueArg<std::string> rhs_fetch_method_arg("","rhs-fetch","Method to fetch the RHS.",false,"all",&allowed_fetch_method_constraint,cmd);
+
+
+        cmd.parse(argc,argv);
+        options.layout = layout_arg.getValue();
+        options.scalartype = scalartype_arg.getValue();
+        options.output_name = output_name_arg.getValue();
+        options.requested_device = requested_device_arg.getValue();
+        options.ms_interval = ms_interval_arg.getValue();
+        options.ks_interval = ks_interval_arg.getValue();
+        options.ns_interval = ns_interval_arg.getValue();
+        options.local_size_1_interval = local_size_1_interval_arg.getValue();
+        options.cache_width_interval = cache_width_interval_arg.getValue();
+        options.local_size_2_interval = local_size_2_interval_arg.getValue();
+        options.vector_interval = vector_interval_arg.getValue();
+        options.lhs_fetch_method = lhs_fetch_method_arg.getValue();
+        options.rhs_fetch_method = rhs_fetch_method_arg.getValue();
+
+        return options;
+    }
+    catch (TCLAP::ArgException &e){
+        std::cerr << "error: " << "\"" << e.error() << "\"" << " [for arg " << e.argId() << "]" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+template<class ScalarType>
+struct config{
+    typedef matrix_product profile_type;
+    static profile_type create_profile(std::map<std::string, autotune::tuning_param> const & params){
+       profile_type res(  viennacl::generator::at(params, std::string("vector")).current()
+                        , viennacl::generator::at(params, std::string("local_size1")).current()
+                        , viennacl::generator::at(params, std::string("cache_width")).current()
+                        , viennacl::generator::at(params, std::string("local_size2")).current()
+                        , viennacl::generator::at(params, std::string("ms")).current()
+                        , viennacl::generator::at(params, std::string("ks")).current()
+                        , viennacl::generator::at(params, std::string("ns")).current()
+                        , static_cast<bool>(viennacl::generator::at(params, std::string("lhs_storage")).current() > 0)
+                        , static_cast<bool>(viennacl::generator::at(params, std::string("rhs_storage")).current() > 0));
+       return res;
+    }
+    static bool is_invalid(viennacl::ocl::device const & dev, std::map<std::string, autotune::tuning_param> const & params){
+        profile_type prof = create_profile(params);
+        return prof.is_invalid(dev, sizeof(ScalarType));
+    }
+};
+
+viennacl::generator::code_generator::forced_profile_key_type make_key(std::string const & layout, std::size_t scalartype_size){
+    if(layout=="TT")
+        return code_generator::forced_profile_key_type(MATRIX_PRODUCT_TT_TYPE, scalartype_size);
+    else if(layout=="TN")
+        return code_generator::forced_profile_key_type(MATRIX_PRODUCT_TN_TYPE, scalartype_size);
+    else if(layout=="NT")
+        return code_generator::forced_profile_key_type(MATRIX_PRODUCT_NT_TYPE, scalartype_size);
+    else
+        return code_generator::forced_profile_key_type(MATRIX_PRODUCT_NN_TYPE, scalartype_size);
+}
+
+template<class MatA, class MatB, class MatC>
+viennacl::scheduler::statement make_statement(std::string const & layout, MatA const & A, MatB const & B, MatC const & C){
+    if(layout=="TT")
+        return viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(trans(A),trans(B)));
+    else if(layout=="TN")
+        return viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(trans(A),B));
+    else if(layout=="NT")
+        return viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(A,trans(B)));
+    else
+       return  viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(A,B));
+}
+
+template<typename ScalarType>
+unsigned int run_benchmark(size_t size, std::string layout, std::size_t scalartype_size, typename config<ScalarType>::profile_type const & profile)
+{
+    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+    viennacl::matrix<ScalarType> A(size, size);
+    viennacl::matrix<ScalarType> B(size, size);
+    viennacl::matrix<ScalarType> C(size, size);
+    viennacl::scheduler::statement statement = make_statement(layout,A,B,C);
+    viennacl::generator::code_generator gen;
+    gen.add(statement,statement.array()[0]);
+    gen.force_profile(make_key(layout,scalartype_size), profile);
+    viennacl::generator::enqueue(gen);
+    viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    viennacl::tools::timer timer;
+    timer.start();
+    static const unsigned int n_runs = 1;
+    for(unsigned int r = 0 ; r < n_runs; ++r)
+      viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    double time = timer.get()/n_runs;
+    return static_cast<unsigned int>(2*pow(size/static_cast<double>(1000.0),3)/time);
+}
+
+template<class ScalarType>
+void run_autotune(autotuner_options options){
+    typedef std::map<double, matrix_product> timings_t;
+    typedef viennacl::matrix<ScalarType> MatrixT;
+    typedef config<ScalarType> config_type;
+    typedef typename config_type::profile_type profile_type;
+
+    viennacl::ocl::device const &  device = viennacl::ocl::current_device();
+
+    autotune::tuning_config<config_type> conf;
+    timings_t timings;
+    std::list<matrix_product> fastest_firsts;
+    std::ofstream stream(options.output_name.c_str());
+
+    std::list<std::pair<unsigned int, unsigned int> > rounds_config;
+    rounds_config.push_back(std::make_pair(1280,100));
+    rounds_config.push_back(std::make_pair(2560,100));
+    rounds_config.push_back(std::make_pair(2688,100));
+    rounds_config.push_back(std::make_pair(2816,100));
+    rounds_config.push_back(std::make_pair(2944,100));
+    rounds_config.push_back(std::make_pair(3072,100));
+
+    std::vector<unsigned int> tmp;
+    tmp = get_values_in_commas(options.local_size_1_interval); std::vector<int> local_size_1; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) local_size_1.push_back(i);
+    tmp = get_values_in_commas(options.cache_width_interval); std::vector<int> cache_width; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) cache_width.push_back(i);
+    tmp = get_values_in_commas(options.local_size_2_interval); std::vector<int> local_size_2; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) local_size_2.push_back(i);
+    tmp = get_values_in_commas(options.ms_interval); std::vector<int> ms; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) ms.push_back(i);
+    tmp = get_values_in_commas(options.ks_interval); std::vector<int> ks; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) ks.push_back(i);
+    tmp = get_values_in_commas(options.ns_interval); std::vector<int> ns; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) ns.push_back(i);
+    tmp = get_values_in_commas(options.vector_interval); std::vector<int> vector; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) vector.push_back(i);
+    std::vector<int> lhs_storage;
+    if(options.lhs_fetch_method=="global")
+        lhs_storage.push_back(0);
+    else if(options.lhs_fetch_method=="local")
+        lhs_storage.push_back(1);
+    else{
+        lhs_storage.push_back(0);
+        lhs_storage.push_back(1);
+    }
+    std::vector<int> rhs_storage;
+    if(options.rhs_fetch_method=="global")
+        rhs_storage.push_back(0);
+    else if(options.rhs_fetch_method=="local")
+        rhs_storage.push_back(1);
+    else{
+        rhs_storage.push_back(0);
+        rhs_storage.push_back(1);
+    }
+
+    conf.add_tuning_param("local_size1",local_size_1);
+    conf.add_tuning_param("cache_width",cache_width);
+    conf.add_tuning_param("local_size2",local_size_2);
+    conf.add_tuning_param("ms",ms);
+    conf.add_tuning_param("ks",ks);
+    conf.add_tuning_param("ns",ns);
+    conf.add_tuning_param("vector",vector);
+    conf.add_tuning_param("lhs_storage",lhs_storage);
+    conf.add_tuning_param("rhs_storage",rhs_storage);
+
+
+    stream << "# ---- GEMM ----" << std::endl;
+    stream << "#" << options.layout << " | Scalartype : " << options.scalartype << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << device.full_info(1,'#');
+    stream << "#----------------------" << std::endl;
+    stream << "#tuning for size : " << rounds_config.front().first << std::endl;
+
+    code_generator::forced_profile_key_type key = make_key(options.layout,sizeof(ScalarType));
+    for(std::list<std::pair<unsigned int, unsigned int> >::iterator it = rounds_config.begin() ; it!= rounds_config.end(); ++it){
+        timings.clear();
+        unsigned int k = static_cast<unsigned int>(std::distance(rounds_config.begin(),it));
+        unsigned int size=it->first;
+        unsigned int n_keep=it->second;
+        MatrixT A(size,size);
+        MatrixT B(size,size);
+        MatrixT C(size,size);
+        viennacl::backend::finish();
+        viennacl::scheduler::statement statement = make_statement(options.layout,A,B,C);
+        stream << "#time" << "," << profile_type::csv_format() << std::endl;
+        if(k==0){
+          autotune::benchmark(&timings,statement,key,conf,n_runs,&stream);
+        }
+        else{
+          unsigned int n=0;
+          for(typename std::list<profile_type>::const_iterator it = fastest_firsts.begin(); it!=fastest_firsts.end(); ++it){
+            double percent = (double)n++*100/fastest_firsts.size();
+            std::cout << '\r' << "Determining best profile for size " << size << "..." << "[" << std::setprecision(2) << std::setfill (' ') << std::setw(6) << std::fixed  << percent << "%" << "]" << std::flush;
+            double exec_time = autotune::benchmark_impl(statement,key,*it,n_runs);
+            timings.insert(std::make_pair(exec_time, *it));
+            stream << std::setprecision(3) << std::scientific << exec_time << "," << it->csv_representation() << std::endl;
+          }
+          std::cout << std::endl;
+        }
+        fastest_firsts.clear();
+        viennacl::backend::finish();
+        for(timings_t::iterator itt = timings.begin(); itt!=timings.end() ; ++itt){
+            unsigned int n = static_cast<unsigned int>(std::distance(timings.begin(),itt));
+            if(n>n_keep) break;
+            fastest_firsts.push_back(itt->second);
+        }
+        stream << "# " << " Size : " << size << " | Best : " << 2*std::pow((double)size/1000,3)/timings.begin()->first << " GFlops : " << timings.begin()->second << std::endl;
+
+        //Recompiles for the best profile
+        profile_type best_profile = timings.begin()->second;
+        viennacl::generator::code_generator dummy;
+        dummy.add(statement,statement.array()[0]);
+        dummy.force_profile(key, best_profile);
+        viennacl::generator::enqueue(dummy,true);
+        viennacl::backend::finish();
+    }
+
+    stream << "#Benchmarking " << timings.begin()->second << "..." << std::endl;
+    stream << "##Size\tGFLOP/s" << std::endl;
+    for(unsigned int size = 128 ; size <= 3072 ; size += 128){
+        double percent = (double)size/3072*100;
+        std::cout << '\r' << "Benchmarking..." << "[" << std::setprecision(2) << std::setfill (' ') << std::setw(6) << std::fixed  << percent << "%" << "]" << std::flush;
+        stream << "#" << size << "\t" << run_benchmark<ScalarType>(size,options.layout,sizeof(ScalarType),timings.begin()->second) << std::endl;
+    }
+    std::cout << '\r' << "Benchmarking...[100.00%]" << std::endl;
+}
+
+
+
+int main(int argc, char* argv[]){
+  typedef std::vector< viennacl::ocl::platform > platforms_type;
+  typedef std::vector<viennacl::ocl::device> devices_type;
+  autotuner_options options = get_options(argc,argv);
+  unsigned int counter=0;
+  platforms_type platforms = viennacl::ocl::get_platforms();
+  for (platforms_type::iterator platform_iter  = platforms.begin();
+       platform_iter != platforms.end();
+       ++platform_iter)
+  {
+    devices_type devices = platform_iter->devices(CL_DEVICE_TYPE_ALL);
+    for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+    {
+      if(counter++==options.requested_device){
+        viennacl::ocl::setup_context(counter,*iter);
+        viennacl::ocl::switch_context(counter);
+        viennacl::ocl::device const & device = viennacl::ocl::current_device();
+        std::string device_name = device.name();
+        std::transform(device_name.begin(), device_name.end(), device_name.begin(), ::tolower);
+        std::replace(device_name.begin(), device_name.end(),' ', '_');
+        std::cout << "-------------------" << std::endl;
+        std::cout << device.info() << std::endl;
+        std::cout << "Operation : GEMM" << std::endl;
+        std::cout << "-------------------" << std::endl;
+        std::cout << "layout : " << options.layout << std::endl;
+        std::cout << "scalartype : " << options.scalartype << std::endl;
+        std::cout << "ms : [" << options.ms_interval << "]" << std::endl;
+        std::cout << "ks : [" << options.ks_interval << "]" <<  std::endl;
+        std::cout << "ns : [" << options.ns_interval << "]" <<  std::endl;
+        std::cout << "local size 1 : [" << options.local_size_1_interval << "]" << std::endl;
+        std::cout << "cache width : [" << options.cache_width_interval << "]" << std::endl;
+        std::cout << "local size 2 : [" << options.local_size_2_interval << "]" << std::endl;
+        std::cout << "vector : [" << options.vector_interval << "]" << std::endl;
+        std::cout << "lhs fetch method : [" << options.lhs_fetch_method << "]" << std::endl;
+        std::cout << "rhs fetch method : [" << options.rhs_fetch_method << "]" << std::endl;
+        std::cout << "-------------------" << std::endl;
+        if(options.scalartype=="float")
+            run_autotune<float>(options);
+        else if(options.scalartype=="double")
+            run_autotune<double>(options);
+      }
+    }
+  }
+  std::cout << "Autotuning Complete!" << std::endl;
+  return EXIT_SUCCESS;
+}
diff --git a/examples/autotuner/gemv_autotuning.cpp b/examples/autotuner/gemv_autotuning.cpp
new file mode 100644
index 0000000..60ad04c
--- /dev/null
+++ b/examples/autotuner/gemv_autotuning.cpp
@@ -0,0 +1,266 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define VIENNACL_DEBUG_BUILD
+//#define VIENNACL_DEBUG_ALL
+
+#include <iostream>
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/generator/autotune.hpp"
+#include "viennacl/tools/timer.hpp"
+#include "command-line-utils.hpp"
+
+using namespace viennacl::generator;
+
+typedef std::vector< viennacl::ocl::platform > platforms_type;
+typedef std::vector<viennacl::ocl::device> devices_type;
+typedef std::vector<cl_device_id> cl_devices_type;
+
+static const unsigned int n_runs = 10;
+
+struct autotuner_options{
+    unsigned int tuning_size;
+
+    std::string layout;
+    std::string scalartype;
+    std::string output_name;
+
+    unsigned int requested_device;
+
+    std::string vector_interval;
+
+    std::string local_size_1_interval;
+    std::string local_size_2_interval;
+    std::string num_groups_interval;
+};
+
+autotuner_options get_options(int argc, char* argv[]){
+    try{
+        autotuner_options options;
+
+        TCLAP::CmdLine cmd("GEMM Autotuner", ' ', "0.1");
+
+
+        pow_2_interval_constraint pow_2_interval_cstrt;
+        min_max_inc_constraint min_max_inc_cstrt;
+
+        //Tuning size
+        TCLAP::ValueArg<unsigned int> tuning_size_arg("","tuning-size","Size to use for the autotuning procedure",false,3072,"unsigned int",cmd);
+
+        //Layouts
+        std::vector<std::string> allowed_layouts;
+        allowed_layouts.push_back("Nx");
+        allowed_layouts.push_back("Tx");
+        TCLAP::ValuesConstraint<std::string> allowed_layouts_constraint( allowed_layouts);
+        TCLAP::ValueArg<std::string> layout_arg("l","layout","Layout to tune the hardware for",true,"Nx",&allowed_layouts_constraint,cmd);
+
+        //Scalartype
+        std::vector<std::string> allowed_scalartypes;
+        allowed_scalartypes.push_back("float");
+        allowed_scalartypes.push_back("double");
+        TCLAP::ValuesConstraint<std::string> allowed_scalartypes_constraint( allowed_scalartypes);
+        TCLAP::ValueArg<std::string> scalartype_arg("s","scalartype","Scalartype to tune the hardware for",true,"float",&allowed_scalartypes_constraint,cmd);
+
+        //Output data file
+        TCLAP::ValueArg<std::string> output_name_arg("o","output","Name of the output data file",true,"gemm_autotuning.dat","string",cmd);
+
+        //Device id
+        TCLAP::ValueArg<unsigned int> requested_device_arg("d","device","ID of the device to use for the autotuning procedure",false,0,"unsigned int",cmd);
+
+        //Vector
+        TCLAP::ValueArg<std::string> vector_interval_arg("","vector","Vector type used in the kernel",false,"1,1",&pow_2_interval_cstrt,cmd);
+
+        //Large blocks
+        TCLAP::ValueArg<std::string> local_size_1_interval_arg("","local-size-1","Number of work-item rows in each work-group. Specify min,max both power of two.",false,"1,64",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> local_size_2_interval_arg("","local-size-2","Number of work-item columns in each work-group. Specify min,max both power of two.",false,"1,64",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> num_groups_interval_arg("","num-groups","Number of work groups required.",false,"16,1024,16",&min_max_inc_cstrt,cmd);
+
+
+        cmd.parse(argc,argv);
+        options.tuning_size = tuning_size_arg.getValue();
+        options.layout = layout_arg.getValue();
+        options.scalartype = scalartype_arg.getValue();
+        options.output_name = output_name_arg.getValue();
+        options.requested_device = requested_device_arg.getValue();
+        options.vector_interval = vector_interval_arg.getValue();
+        options.local_size_1_interval = local_size_1_interval_arg.getValue();
+        options.local_size_2_interval = local_size_2_interval_arg.getValue();
+        options.num_groups_interval = num_groups_interval_arg.getValue();
+        return options;
+    }
+    catch (TCLAP::ArgException &e){
+        std::cerr << "error: " << "\"" << e.error() << "\"" << " [for arg " << e.argId() << "]" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+template<class ScalarType>
+struct config{
+    typedef vector_reduction profile_type;
+    static profile_type create_profile(std::map<std::string, autotune::tuning_param> const & params){
+      return profile_type(viennacl::generator::at(params, std::string("vector")).current(),
+                          viennacl::generator::at(params, std::string("local_size1")).current(),
+                          viennacl::generator::at(params, std::string("local_size2")).current(),
+                          viennacl::generator::at(params, std::string("num_groups")).current());
+    }
+    static bool is_invalid(viennacl::ocl::device const & dev, std::map<std::string, autotune::tuning_param> const & params){
+        profile_type prof = create_profile(params);
+        return prof.is_invalid(dev, sizeof(ScalarType));
+    }
+};
+
+template<class ScalarType>
+code_generator::forced_profile_key_type make_key(autotuner_options options){
+    if(options.layout=="Nx") return code_generator::forced_profile_key_type(VECTOR_REDUCE_Nx_TYPE,sizeof(ScalarType));
+    else return code_generator::forced_profile_key_type(VECTOR_REDUCE_Tx_TYPE,sizeof(ScalarType));;
+}
+
+template<class ScalarType>
+viennacl::scheduler::statement make_statement(autotuner_options options, viennacl::vector<ScalarType> const & y, viennacl::matrix<ScalarType> const & A, viennacl::vector<ScalarType> const & x){
+    if(options.layout =="Nx") return viennacl::scheduler::statement(y,viennacl::op_assign(), viennacl::linalg::prod(A, x));
+    else return viennacl::scheduler::statement(y,viennacl::op_assign(), viennacl::linalg::prod(viennacl::trans(A), x));
+}
+
+template<typename ScalarType>
+double run_benchmark(size_t size, autotuner_options options, typename config<ScalarType>::profile_type const & profile)
+{
+    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+    viennacl::matrix<ScalarType> A(size, size);
+    viennacl::vector<ScalarType> y(size);
+    viennacl::vector<ScalarType> x(size);
+    viennacl::scheduler::statement statement = make_statement(options,y,A,x);
+    viennacl::generator::code_generator gen;
+    gen.add(statement,statement.array()[0]);
+    gen.force_profile(make_key<ScalarType>(options), profile);
+    viennacl::generator::enqueue(gen);
+    viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    viennacl::tools::timer timer;
+    timer.start();
+    static const unsigned int n_runs = 1;
+    for(unsigned int r = 0 ; r < n_runs; ++r)
+      viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    double time = timer.get()/(double)n_runs;
+    return time;
+}
+
+template<class ScalarType>
+void run_autotune(autotuner_options const & options){
+    typedef config<ScalarType> config_type;
+    typedef typename config_type::profile_type profile_type;
+
+    viennacl::ocl::device const &  device = viennacl::ocl::current_device();
+
+    viennacl::vector<ScalarType> y(options.tuning_size), x(options.tuning_size);
+    viennacl::matrix<ScalarType> A(options.tuning_size, options.tuning_size);
+    std::map<double,profile_type> timings;
+    autotune::tuning_config<config_type> conf;
+
+    std::vector<unsigned int> tmp;
+    tmp = get_values_in_commas(options.vector_interval); std::vector<int> vector; for(unsigned int i = tmp[0] ; i <= tmp[1] ; i*=2) vector.push_back(i);
+    tmp = get_values_in_commas(options.local_size_1_interval); std::vector<int> local_size_1; for(unsigned int i = tmp[0] ; i <= tmp[1] ; i*=2) local_size_1.push_back(i);
+    tmp = get_values_in_commas(options.local_size_2_interval); std::vector<int> local_size_2; for(unsigned int i = tmp[0] ; i <= tmp[1] ; i*=2) local_size_2.push_back(i);
+    tmp = get_values_in_commas(options.num_groups_interval); std::vector<int> num_groups; for(unsigned int i = tmp[0] ; i <= tmp[1] ; i+=tmp[2]) num_groups.push_back(i);
+
+    conf.add_tuning_param("vector",vector);
+    conf.add_tuning_param("local_size1",local_size_1);
+    conf.add_tuning_param("local_size2",local_size_2);
+    conf.add_tuning_param("num_groups",num_groups);
+    std::ofstream stream(options.output_name.c_str());
+    code_generator::forced_profile_key_type key = make_key<ScalarType>(options);
+    viennacl::scheduler::statement statement = make_statement(options,y,A,x);
+    stream << "#--------------------------" << std::endl;
+    stream << "# ---- GEMV AUTOTUNING ----" << std::endl;
+    stream << "#" << options.layout << " | Scalartype : " << options.scalartype << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << device.full_info(1,'#');
+    stream << "#----------------------" << std::endl;
+    stream << "#tuning for size : " << options.tuning_size << std::endl;
+
+    autotune::benchmark(&timings,statement,key,conf,n_runs,&stream);
+
+    //Recompiles for the best profile
+    profile_type best_profile = timings.begin()->second;
+    viennacl::generator::code_generator dummy;
+    dummy.add(statement,statement.array()[0]);
+    dummy.force_profile(key, best_profile);
+    viennacl::generator::enqueue(dummy,true);
+    viennacl::backend::finish();
+
+    stream << "#Benchmarking " << timings.begin()->second << "..." << std::endl;
+    stream << "##Size\tBandwidth(GB/s)\tThroughput(GFLOP/s)" << std::endl;
+    for(unsigned int size = 128 ; size <= 3072 ; size += 128){
+        double percent = (double)size/3072*100;
+        unsigned int n_bytes_transfered = (size*size+2*size)*sizeof(ScalarType);
+        unsigned int n_flops = size*(2*size-1);
+        std::cout << '\r' << "Benchmarking..." << "[" << std::setprecision(2) << std::setfill (' ') << std::setw(6) << std::fixed  << percent << "%" << "]" << std::flush;
+        double time = run_benchmark<ScalarType>(size,options,best_profile);
+        stream << "#" << size << "\t" <<  static_cast<unsigned int>(1e-9*n_bytes_transfered/time) << "\t" << static_cast<unsigned int>(1e-9*n_flops/time) << std::endl;
+    }
+    std::cout << '\r' << "Benchmarking...[100.00%]" << std::endl;
+}
+
+
+int main(int argc, char* argv[]){
+  typedef std::vector< viennacl::ocl::platform > platforms_type;
+  typedef std::vector<viennacl::ocl::device> devices_type;
+  unsigned int counter = 0;
+  autotuner_options options = get_options(argc,argv);
+  platforms_type platforms = viennacl::ocl::get_platforms();
+  for (platforms_type::iterator platform_iter  = platforms.begin();
+       platform_iter != platforms.end();
+       ++platform_iter)
+  {
+    devices_type devices = platform_iter->devices(CL_DEVICE_TYPE_ALL);
+    for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+    {
+      if(counter++==options.requested_device){
+        viennacl::ocl::setup_context(counter,*iter);
+        viennacl::ocl::switch_context(counter);
+        viennacl::ocl::device const & device = viennacl::ocl::current_device();
+        std::string device_name = device.name();
+        std::transform(device_name.begin(), device_name.end(), device_name.begin(), ::tolower);
+        std::replace(device_name.begin(), device_name.end(),' ', '_');
+        std::cout << "-------------------" << std::endl;
+        std::cout << device.info() << std::endl;
+        std::cout << "Operation : GEMV" << std::endl;
+        std::cout << "-------------------" << std::endl;
+        std::cout << "layout : " << options.layout << std::endl;
+        std::cout << "scalatype : " << options.scalartype << std::endl;
+        std::cout << "vector : [" << options.vector_interval << "]" << std::endl;
+        std::cout << "local size 1 : [" << options.local_size_1_interval << "]" << std::endl;
+        std::cout << "local size 2 : [" << options.local_size_2_interval << "]" << std::endl;
+        std::cout << "number of groups : [" << options.num_groups_interval << "]" << std::endl;
+        std::cout << "-------------------" << std::endl;
+        if(options.scalartype=="float")
+            run_autotune<float>(options);
+        else if(options.scalartype=="double")
+            run_autotune<double>(options);
+      }
+    }
+  }
+  std::cout << "Autotuning complete!" << std::endl;
+}
diff --git a/examples/autotuner/vector-axpy_autotuning.cpp b/examples/autotuner/vector-axpy_autotuning.cpp
new file mode 100644
index 0000000..aabc955
--- /dev/null
+++ b/examples/autotuner/vector-axpy_autotuning.cpp
@@ -0,0 +1,270 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+
+#include <iostream>
+#include <algorithm>
+#include <string>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/generator/autotune.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include "command-line-utils.hpp"
+
+static const unsigned int n_runs = 10;
+
+using namespace viennacl::generator;
+
+typedef std::vector< viennacl::ocl::platform > platforms_type;
+typedef std::vector<viennacl::ocl::device> devices_type;
+typedef std::vector<cl_device_id> cl_devices_type;
+
+
+struct autotuner_options{
+    unsigned int tuning_size;
+
+    std::string scalartype;
+    std::string output_name;
+
+    unsigned int requested_device;
+
+    std::string vector_interval;
+
+    std::string local_size_interval;
+    std::string num_groups_interval;
+
+    std::string decomposition;
+};
+
+autotuner_options get_options(int argc, char* argv[]){
+    try{
+        autotuner_options options;
+
+        TCLAP::CmdLine cmd("Vector AXPY Autotuner", ' ', "0.1");
+
+
+        pow_2_interval_constraint pow_2_interval_cstrt;
+        min_max_inc_constraint min_max_inc_cstrt;
+
+        TCLAP::ValueArg<unsigned int> tuning_size_arg("","tuning-size","Size to use for the autotuning procedure",false,1024*1024,"unsigned int",cmd);
+
+        //Scalartype
+        std::vector<std::string> allowed_scalartypes;
+        allowed_scalartypes.push_back("float");
+        allowed_scalartypes.push_back("double");
+        TCLAP::ValuesConstraint<std::string> allowed_scalartypes_constraint( allowed_scalartypes);
+        TCLAP::ValueArg<std::string> scalartype_arg("s","scalartype","Scalartype to tune the hardware for",true,"float",&allowed_scalartypes_constraint,cmd);
+
+        //Output data file
+        TCLAP::ValueArg<std::string> output_name_arg("o","output","Name of the output data file",true,"gemm_autotuning.dat","string",cmd);
+
+        //Device id
+        TCLAP::ValueArg<unsigned int> requested_device_arg("d","device","ID of the device to use for the autotuning procedure",false,0,"unsigned int",cmd);
+
+        //Vector
+        TCLAP::ValueArg<std::string> vector_interval_arg("","vector","Vector type used in the kernel",false,"1,1",&pow_2_interval_cstrt,cmd);
+
+        //Large blocks
+        TCLAP::ValueArg<std::string> local_size_interval_arg("","local-size","Number of work-item in each work-group. Specify min,max both power of two.",false,"16,1024",&pow_2_interval_cstrt,cmd);
+        TCLAP::ValueArg<std::string> num_groups_interval_arg("","num-groups","Number of work groups required.",false,"16,1024,16",&min_max_inc_cstrt,cmd);
+
+        //Decomposition
+        std::vector<std::string> allowed_decomposition_method;
+        allowed_decomposition_method.push_back("local");
+        allowed_decomposition_method.push_back("global");
+        allowed_decomposition_method.push_back("all");
+        TCLAP::ValuesConstraint<std::string> allowed_decomposition_method_constraint(allowed_decomposition_method);
+        TCLAP::ValueArg<std::string> decomposition_method_arg("","decomposition","Work decomposition method. If set to \"local\" , the work items within a work group will access contiguous data.",false,"all",&allowed_decomposition_method_constraint,cmd);
+
+        cmd.parse(argc,argv);
+        options.tuning_size = tuning_size_arg.getValue();
+        options.scalartype = scalartype_arg.getValue();
+        options.output_name = output_name_arg.getValue();
+        options.requested_device = requested_device_arg.getValue();
+        options.vector_interval = vector_interval_arg.getValue();
+        options.local_size_interval = local_size_interval_arg.getValue();
+        options.num_groups_interval = num_groups_interval_arg.getValue();
+        options.decomposition = decomposition_method_arg.getValue();
+        return options;
+    }
+    catch (TCLAP::ArgException &e){
+        std::cerr << "error: " << "\"" << e.error() << "\"" << " [for arg " << e.argId() << "]" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+template<class ScalarType>
+struct config{
+    typedef vector_saxpy profile_type;
+    static profile_type create_profile(std::map<std::string, autotune::tuning_param> const & params){
+      return profile_type(viennacl::generator::at(params, std::string("vector")).current(),
+                          viennacl::generator::at(params, std::string("local_size")).current(),
+                          viennacl::generator::at(params, std::string("num_groups")).current(),
+                          viennacl::generator::at(params, std::string("decomposition")).current());
+    }
+    static bool is_invalid(viennacl::ocl::device const & dev, std::map<std::string, autotune::tuning_param> const & params){
+        profile_type prof = create_profile(params);
+        return prof.is_invalid(dev, sizeof(ScalarType));
+    }
+};
+
+template<class ScalarType>
+code_generator::forced_profile_key_type make_key(autotuner_options /*options*/){
+    return code_generator::forced_profile_key_type(VECTOR_SAXPY_TYPE,sizeof(ScalarType));
+}
+
+template<class ScalarType>
+viennacl::scheduler::statement make_statement(autotuner_options /*options*/, viennacl::vector<ScalarType> const & z, viennacl::vector<ScalarType> const & x, viennacl::vector<ScalarType> const & y){
+    return viennacl::scheduler::statement(z, viennacl::op_assign(), x + y);
+}
+
+template<typename ScalarType>
+double run_benchmark(size_t size, autotuner_options options, typename config<ScalarType>::profile_type const & profile)
+{
+    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+    viennacl::vector<ScalarType> y(size);
+    viennacl::vector<ScalarType> x(size);
+    viennacl::vector<ScalarType> z(size);
+    viennacl::scheduler::statement statement = make_statement(options,z,x,y);
+    viennacl::generator::code_generator gen;
+    gen.add(statement,statement.array()[0]);
+    gen.force_profile(make_key<ScalarType>(options), profile);
+    viennacl::generator::enqueue(gen);
+    viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    viennacl::tools::timer timer;
+    timer.start();
+    static const unsigned int n_runs = 1;
+    for(unsigned int r = 0 ; r < n_runs; ++r)
+      viennacl::generator::enqueue(gen);
+    viennacl::backend::finish();
+    double time = timer.get()/(double)n_runs;
+    return time;
+}
+
+template<class ScalarType>
+void run_autotune(autotuner_options const & options){
+    typedef config<ScalarType> config_type;
+    typedef typename config_type::profile_type profile_type;
+
+    viennacl::ocl::device const &  device = viennacl::ocl::current_device();
+
+    viennacl::vector<ScalarType> x(options.tuning_size), y(options.tuning_size), z(options.tuning_size);
+    viennacl::backend::finish();
+    autotune::tuning_config<config<ScalarType> > conf;
+    std::map<double, typename config<ScalarType>::profile_type> timings;
+
+    std::vector<unsigned int> tmp;
+    tmp = get_values_in_commas(options.local_size_interval); std::vector<int> local_size; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) local_size.push_back(i);
+    tmp = get_values_in_commas(options.num_groups_interval); std::vector<int> num_groups; for(unsigned int i=tmp[0] ; i<=tmp[1]; i+=tmp[2]) { num_groups.push_back(i); }
+    tmp = get_values_in_commas(options.vector_interval); std::vector<int> vector; for(unsigned int i=tmp[0] ; i<=tmp[1]; i*=2) vector.push_back(i);
+    std::vector<int> decomposition;
+    if(options.decomposition=="global")
+        decomposition.push_back(0);
+    else if(options.decomposition=="local")
+        decomposition.push_back(1);
+    else{
+        decomposition.push_back(0);
+        decomposition.push_back(1);
+    }
+
+    conf.add_tuning_param("vector",vector);
+    conf.add_tuning_param("local_size",local_size);
+    conf.add_tuning_param("num_groups",num_groups);
+    conf.add_tuning_param("decomposition", decomposition);
+    std::ofstream stream(options.output_name.c_str());
+
+
+    stream << "# ---- VECTOR AXPY AUTOTUNING ----" << std::endl;
+    stream << "#" << "scalartype : " << options.scalartype << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << "#----------------------" << std::endl;
+    stream << device.full_info(1,'#');
+    stream << "#----------------------" << std::endl;
+    stream << "#tuning for size : " << options.tuning_size << std::endl;
+
+    code_generator::forced_profile_key_type key(VECTOR_SAXPY_TYPE, sizeof(ScalarType));
+    viennacl::scheduler::statement statement = make_statement(options,z,x,y);
+    autotune::benchmark(&timings,statement,key,conf,n_runs,&stream);
+
+    //Recompiles for the best profile
+    profile_type best_profile = timings.begin()->second;
+    viennacl::generator::code_generator dummy;
+    dummy.add(statement,statement.array()[0]);
+    dummy.force_profile(key, best_profile);
+    viennacl::generator::enqueue(dummy,true);
+    viennacl::backend::finish();
+
+    stream << "#Benchmarking " << timings.begin()->second << "..." << std::endl;
+    stream << "##Size\tBandwidth(GB/s)" << std::endl;
+    for(unsigned int size = 1024 ; size <= 5e7 ; size *=2){
+        double percent = (double)size/1e7*100;
+        std::cout << '\r' << "Benchmarking..." << "[" << std::setprecision(2) << std::setfill (' ') << std::setw(6) << std::fixed  << percent << "%" << "]" << std::flush;
+        unsigned int n_bytes_transfered = 3*size*sizeof(ScalarType);
+        double time = run_benchmark<ScalarType>(size,options,best_profile);
+        stream << "#" << size << "\t" << static_cast<unsigned int>(1e-9*n_bytes_transfered/time) << std::endl;
+    }
+    std::cout << '\r' << "Benchmarking...[100.00%]" << std::endl;
+}
+
+int main(int argc, char* argv[]){
+  typedef std::vector< viennacl::ocl::platform > platforms_type;
+  typedef std::vector<viennacl::ocl::device> devices_type;
+  autotuner_options options = get_options(argc,argv);
+  std::size_t device_counter = 0;
+  platforms_type platforms = viennacl::ocl::get_platforms();
+  for (platforms_type::iterator platform_iter  = platforms.begin();
+       platform_iter != platforms.end();
+       ++platform_iter)
+  {
+    devices_type devices = platform_iter->devices(CL_DEVICE_TYPE_ALL);
+    for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+    {
+      if(device_counter++==options.requested_device){
+        viennacl::ocl::setup_context(options.requested_device,*iter);
+        viennacl::ocl::switch_context(options.requested_device);
+        viennacl::ocl::device const & device = viennacl::ocl::current_device();
+        std::string device_name = device.name();
+        std::transform(device_name.begin(), device_name.end(), device_name.begin(), ::tolower);
+        std::replace(device_name.begin(), device_name.end(),' ', '_');
+        std::cout << "-------------------" << std::endl;
+        std::cout << device.info() << std::endl;
+        std::cout << "Operation : VECTOR AXPY" << std::endl;
+        std::cout << "-------------------" << std::endl;
+        std::cout << "scalatype : " << options.scalartype << std::endl;
+        std::cout << "vector : [" << options.vector_interval << "]" << std::endl;
+        std::cout << "local size : [" << options.local_size_interval << "]" << std::endl;
+        std::cout << "number of groups : [" << options.num_groups_interval << "]" << std::endl;
+        std::cout << "decomposition : [" << options.decomposition << "]" << std::endl;
+        std::cout << "tuning size : " << options.tuning_size << std::endl;
+        std::cout << "-------------------" << std::endl;
+        if(options.scalartype=="float")
+            run_autotune<float>(options);
+        else if(options.scalartype=="double")
+            run_autotune<double>(options);
+      }
+    }
+  }
+  std::cout << "Autotuning complete! Check \"" << options.output_name << "\" for results." << std::endl;
+}
diff --git a/examples/benchmarks/CMakeLists.txt b/examples/benchmarks/CMakeLists.txt
index 067d289..0e880c7 100644
--- a/examples/benchmarks/CMakeLists.txt
+++ b/examples/benchmarks/CMakeLists.txt
@@ -1,17 +1,58 @@
-foreach(bench blas3 opencl vector)
-   add_executable(${bench}bench ${bench}.cpp)
-   target_link_libraries(${bench}bench ${OPENCL_LIBRARIES})
+# Targets using CPU-based execution
+foreach(bench blas3 copy scheduler vector)
+   add_executable(${bench}bench-cpu ${bench}.cpp)
 endforeach()
 
-if(ENABLE_UBLAS)
-   include_directories(${Boost_INCLUDE_DIRS})
-   foreach(bench sparse solver iccs_qr)
-      add_executable(${bench}bench ${bench}.cpp)
-      target_link_libraries(${bench}bench ${OPENCL_LIBRARIES})
-   endforeach()
-endif()
+if (ENABLE_UBLAS)
+    include_directories(${Boost_INCLUDE_DIRS})
+    foreach(bench sparse solver)
+      add_executable(${bench}bench-cpu ${bench}.cpp)
+      target_link_libraries(${bench}bench-cpu ${Boost_LIBRARIES})
+    endforeach()
+endif (ENABLE_UBLAS)
 
-IF(CMAKE_COMPILER_IS_GNUCXX)
+
+# Targets using OpenCL
+if (ENABLE_OPENCL)
+
+  foreach(bench blas3 copy
+          generator_blas1 generator_blas2 generator_blas3
+          opencl vector)
+    add_executable(${bench}bench-opencl ${bench}.cpp)
+    target_link_libraries(${bench}bench-opencl ${OPENCL_LIBRARIES})
+    set_target_properties(${bench}bench-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+  endforeach()
+
+  if (ENABLE_UBLAS)
+     include_directories(${Boost_INCLUDE_DIRS})
+     foreach(bench sparse solver)
+       add_executable(${bench}bench-opencl ${bench}.cpp)
+       target_link_libraries(${bench}bench-opencl ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+       set_target_properties(${bench}bench-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+     endforeach()
+  endif (ENABLE_UBLAS)
+
+endif (ENABLE_OPENCL)
+
+# Targets using CUDA
+if (ENABLE_CUDA)
+
+  foreach(bench blas3 copy vector)
+     cuda_add_executable(${bench}bench-cuda ${bench}.cu)
+  endforeach()
+
+  if (ENABLE_UBLAS)
+     include_directories(${Boost_INCLUDE_DIRS})
+     foreach(bench sparse solver)
+       cuda_add_executable(${bench}bench-cuda ${bench}.cu)
+       target_link_libraries(${bench}bench-cuda ${Boost_LIBRARIES})
+     endforeach()
+  endif (ENABLE_UBLAS)
+
+endif (ENABLE_CUDA)
+
+
+# IF(CMAKE_COMPILER_IS_GNUCXX)
    #ADD_DEFINITIONS(-Wall -pedantic -O0 -g)
-   ADD_DEFINITIONS(-Wall -pedantic -O3)
-ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+#   ADD_DEFINITIONS(-Wall -pedantic -O3)
+# ENDIF(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/examples/benchmarks/benchmark-utils.hpp b/examples/benchmarks/benchmark-utils.hpp
index 25db8c1..3289e1d 100644
--- a/examples/benchmarks/benchmark-utils.hpp
+++ b/examples/benchmarks/benchmark-utils.hpp
@@ -1,98 +1,99 @@
-#ifndef _BENCHMARK_UTILS_HPP_
-#define _BENCHMARK_UTILS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#include <iostream>
-
-void printOps(double num_ops, double exec_time)
-{
-  std::cout << "GFLOPS: " << num_ops / (1000000 * exec_time * 1000) << std::endl;
-}
-
-
-
-
-#ifdef _WIN32
-
-#define WINDOWS_LEAN_AND_MEAN
-#include <windows.h>
-#undef min
-#undef max
-
-class Timer
-{
-public:
-
-	Timer()
-	{
-		QueryPerformanceFrequency(&freq);
-	}
-
-	void start()
-	{
-		QueryPerformanceCounter((LARGE_INTEGER*) &start_time);
-	}
-
-	double get() const
-	{
-		LARGE_INTEGER  end_time;
-		QueryPerformanceCounter((LARGE_INTEGER*) &end_time);
-		return (static_cast<double>(end_time.QuadPart) - static_cast<double>(start_time.QuadPart)) / static_cast<double>(freq.QuadPart);
-	}
-
-
-private:
-	LARGE_INTEGER freq;
-    LARGE_INTEGER start_time;
-};
-
-#else
-
-#include <sys/time.h>
-
-class Timer
-{
-public:
-
-	Timer() : ts(0)
-	{}
-
-	void start()
-	{
-		struct timeval tval;
-		gettimeofday(&tval, NULL);
-		ts = tval.tv_sec * 1000000 + tval.tv_usec;
-	}
-
-	double get() const
-	{
-		struct timeval tval;
-		gettimeofday(&tval, NULL);
-		int64_t end_time = tval.tv_sec * 1000000 + tval.tv_usec;
-
-		return static_cast<double>(end_time-ts) / 1000000.0;
-	}
-
-private:
-	int64_t ts;
-};
-
-
-#endif
-
-#endif
+#ifndef _BENCHMARK_UTILS_HPP_
+#define _BENCHMARK_UTILS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <iostream>
+
+void printOps(double num_ops, double exec_time)
+{
+  std::cout << "GFLOPs: " << num_ops / (1000000 * exec_time * 1000) << std::endl;
+}
+
+
+
+
+#ifdef _WIN32
+
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+class Timer
+{
+public:
+
+  Timer()
+  {
+    QueryPerformanceFrequency(&freq);
+  }
+
+  void start()
+  {
+    QueryPerformanceCounter((LARGE_INTEGER*) &start_time);
+  }
+
+  double get() const
+  {
+    LARGE_INTEGER  end_time;
+    QueryPerformanceCounter((LARGE_INTEGER*) &end_time);
+    return (static_cast<double>(end_time.QuadPart) - static_cast<double>(start_time.QuadPart)) / static_cast<double>(freq.QuadPart);
+  }
+
+
+private:
+  LARGE_INTEGER freq;
+    LARGE_INTEGER start_time;
+};
+
+#else
+
+#include <sys/time.h>
+
+class Timer
+{
+public:
+
+  Timer() : ts(0)
+  {}
+
+  void start()
+  {
+    struct timeval tval;
+    gettimeofday(&tval, NULL);
+    ts = static_cast<double>(tval.tv_sec * 1000000 + tval.tv_usec);
+  }
+
+  double get() const
+  {
+    struct timeval tval;
+    gettimeofday(&tval, NULL);
+    double end_time = static_cast<double>(tval.tv_sec * 1000000 + tval.tv_usec);
+
+    return static_cast<double>(end_time-ts) / 1000000.0;
+  }
+
+private:
+  double ts;
+};
+
+
+#endif
+
+#endif
diff --git a/examples/benchmarks/blas3.cpp b/examples/benchmarks/blas3.cpp
index 583570e..128c8b8 100644
--- a/examples/benchmarks/blas3.cpp
+++ b/examples/benchmarks/blas3.cpp
@@ -1,142 +1,246 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//disable debug mechanisms to have a fair comparison with ublas:
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-
-// Some helper functions for this tutorial:
-#include "../tutorial/Random.hpp"
-
-
-#include "benchmark-utils.hpp"
-
-/*
-*   Tutorial: BLAS level 3 functionality
-*   
-*/
-
-#define BLAS3_MATRIX_SIZE   1500
-
-template<typename ScalarType>
-int run_benchmark()
-{
-  Timer timer;
-  double exec_time;
-
-  //
-  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
-  //
-  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
-    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
-      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();
-
-  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
-    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
-      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();
-
-  //
-  // Set up some ViennaCL objects
-  //
-  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
-  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix-matrix products /////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
-  //
-  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
-    std::cout << std::endl;
-  }
-  
-  return EXIT_SUCCESS;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Dense Matrix-Matrix product " << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark: BLAS level 3 functionality for dense matrices (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//disable debug mechanisms to have a fair benchmark environment
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+// Some helper functions for this tutorial:
+#include "../tutorial/Random.hpp"
+
+
+#include "benchmark-utils.hpp"
+
+#define BLAS3_MATRIX_SIZE   1920
+
+template<typename ScalarType>
+int run_benchmark()
+{
+  Timer timer;
+  double exec_time;
+
+  //
+  // Set up some ViennaCL objects
+  //
+#ifdef VIENNACL_WITH_OPENCL
+  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
+#endif
+
+  //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+  //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  //
+  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
+  //
+  std::vector<ScalarType> stl_A(vcl_A.internal_size());
+  std::vector<ScalarType> stl_B(vcl_A.internal_size());
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
+    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
+      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();
+
+  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
+    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
+      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();
+
+
+  /////////////////////////////////////////////////
+  //////////// Matrix-matrix products /////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+  //
+
+  std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+#else
+  std::vector<long> devices(1);
+#endif
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    timer.start();
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+  std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;
+
+  viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
+    viennacl::backend::finish();
+    timer.start();
+    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+  std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;
+
+  viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
+    viennacl::backend::finish();
+    timer.start();
+    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+
+  std::cout << " ------ Benchmark 4: LU factorization ------ " << std::endl;
+
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::linalg::lu_factorize(vcl_A);
+    viennacl::backend::finish();
+    timer.start();
+    viennacl::linalg::lu_factorize(vcl_A);
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_A.size2() / 1000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+  return EXIT_SUCCESS;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Dense Matrix-Matrix product " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
diff --git a/examples/benchmarks/blas3.cu b/examples/benchmarks/blas3.cu
new file mode 100644
index 0000000..128c8b8
--- /dev/null
+++ b/examples/benchmarks/blas3.cu
@@ -0,0 +1,246 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark: BLAS level 3 functionality for dense matrices (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//disable debug mechanisms to have a fair benchmark environment
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+// Some helper functions for this tutorial:
+#include "../tutorial/Random.hpp"
+
+
+#include "benchmark-utils.hpp"
+
+#define BLAS3_MATRIX_SIZE   1920
+
+template<typename ScalarType>
+int run_benchmark()
+{
+  Timer timer;
+  double exec_time;
+
+  //
+  // Set up some ViennaCL objects
+  //
+#ifdef VIENNACL_WITH_OPENCL
+  viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());
+#endif
+
+  //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+  //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  //
+  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
+  //
+  std::vector<ScalarType> stl_A(vcl_A.internal_size());
+  std::vector<ScalarType> stl_B(vcl_A.internal_size());
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
+    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
+      stl_A[i*BLAS3_MATRIX_SIZE + j] = random<ScalarType>();
+
+  for (unsigned int i = 0; i < BLAS3_MATRIX_SIZE; ++i)
+    for (unsigned int j = 0; j < BLAS3_MATRIX_SIZE; ++j)
+      stl_B[i + j*BLAS3_MATRIX_SIZE] = random<ScalarType>();
+
+
+  /////////////////////////////////////////////////
+  //////////// Matrix-matrix products /////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+  //
+
+  std::cout << " ------ Benchmark 1: Matrix-Matrix product ------ " << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+#else
+  std::vector<long> devices(1);
+#endif
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    timer.start();
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+  std::cout << " ------ Benchmark 2: Matrix-Matrix product using ranges ------ " << std::endl;
+
+  viennacl::range r(BLAS3_MATRIX_SIZE/4, 3 * BLAS3_MATRIX_SIZE/4);
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
+    viennacl::backend::finish();
+    timer.start();
+    viennacl::project(vcl_C, r, r) = viennacl::linalg::prod(viennacl::project(vcl_A, r, r), viennacl::project(vcl_B, r, r));
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+  std::cout << " ------ Benchmark 3: Matrix-Matrix product using slices ------ " << std::endl;
+
+  viennacl::slice s(0, 2, BLAS3_MATRIX_SIZE/2);
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
+    viennacl::backend::finish();
+    timer.start();
+    viennacl::project(vcl_C, s, s) = viennacl::linalg::prod(viennacl::project(vcl_A, s, s), viennacl::project(vcl_B, s, s));
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 2000.0) * (vcl_A.size2() / 2000.0) * (vcl_B.size2() / 2000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+
+  std::cout << " ------ Benchmark 4: LU factorization ------ " << std::endl;
+
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::linalg::lu_factorize(vcl_A);
+    viennacl::backend::finish();
+    timer.start();
+    viennacl::linalg::lu_factorize(vcl_A);
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as separate operations): " << 2.0 * (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_A.size2() / 1000.0) / exec_time << std::endl;
+    std::cout << std::endl;
+  }
+
+  return EXIT_SUCCESS;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Dense Matrix-Matrix product " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
diff --git a/examples/benchmarks/copy.cpp b/examples/benchmarks/copy.cpp
new file mode 100644
index 0000000..2809731
--- /dev/null
+++ b/examples/benchmarks/copy.cpp
@@ -0,0 +1,189 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:   Performance of viennacl::copy(), viennacl::fast_copy(), and viennacl::async_copy()
+*
+*/
+
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+
+using std::cout;
+using std::cin;
+using std::endl;
+
+
+#define BENCHMARK_VECTOR_SIZE   10000000
+#define BENCHMARK_RUNS          10
+
+
+template<typename ScalarType>
+void run_benchmark()
+{
+
+  Timer timer;
+  double exec_time_return = 0;
+  double exec_time_complete = 0;
+
+  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
+
+
+  ///////////// Vector operations /////////////////
+
+  std_vec1[0] = 1.0;
+  std_vec2[0] = 1.0;
+  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
+  {
+    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
+    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
+  }
+
+  // warmup:
+  viennacl::copy(std_vec1, vcl_vec1);
+  viennacl::fast_copy(std_vec2, vcl_vec2);
+  viennacl::async_copy(std_vec2, vcl_vec1);
+  viennacl::backend::finish();
+
+  //
+  // Benchmark copy operation:
+  //
+  timer.start();
+  viennacl::copy(std_vec1, vcl_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::copy(), host to device ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  timer.start();
+  viennacl::copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::copy(), device to host ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+
+  //
+  // Benchmark fast_copy operation:
+  //
+  timer.start();
+  viennacl::fast_copy(std_vec1, vcl_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::fast_copy(), host to device ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  timer.start();
+  viennacl::fast_copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::fast_copy(), device to host ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  //
+  // Benchmark async_copy operation:
+  //
+  timer.start();
+  viennacl::async_copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::async_copy(), host to device ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  timer.start();
+  viennacl::async_copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::async_copy(), device to host ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+
+#ifndef NDEBUG
+  std::cout << std::endl;
+  std::cout << " ******************************************************************" << std::endl;
+  std::cout << " **** WARNING: This is not a release build." << std::endl;
+  std::cout << " ****          Performance numbers are therefore lower than normal. " << std::endl;
+  std::cout << " ******************************************************************" << std::endl;
+  std::cout << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/benchmarks/copy.cu b/examples/benchmarks/copy.cu
new file mode 100644
index 0000000..2809731
--- /dev/null
+++ b/examples/benchmarks/copy.cu
@@ -0,0 +1,189 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:   Performance of viennacl::copy(), viennacl::fast_copy(), and viennacl::async_copy()
+*
+*/
+
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+
+using std::cout;
+using std::cin;
+using std::endl;
+
+
+#define BENCHMARK_VECTOR_SIZE   10000000
+#define BENCHMARK_RUNS          10
+
+
+template<typename ScalarType>
+void run_benchmark()
+{
+
+  Timer timer;
+  double exec_time_return = 0;
+  double exec_time_complete = 0;
+
+  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
+
+
+  ///////////// Vector operations /////////////////
+
+  std_vec1[0] = 1.0;
+  std_vec2[0] = 1.0;
+  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
+  {
+    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
+    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
+  }
+
+  // warmup:
+  viennacl::copy(std_vec1, vcl_vec1);
+  viennacl::fast_copy(std_vec2, vcl_vec2);
+  viennacl::async_copy(std_vec2, vcl_vec1);
+  viennacl::backend::finish();
+
+  //
+  // Benchmark copy operation:
+  //
+  timer.start();
+  viennacl::copy(std_vec1, vcl_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::copy(), host to device ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  timer.start();
+  viennacl::copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::copy(), device to host ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+
+  //
+  // Benchmark fast_copy operation:
+  //
+  timer.start();
+  viennacl::fast_copy(std_vec1, vcl_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::fast_copy(), host to device ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  timer.start();
+  viennacl::fast_copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::fast_copy(), device to host ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  //
+  // Benchmark async_copy operation:
+  //
+  timer.start();
+  viennacl::async_copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::async_copy(), host to device ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+  timer.start();
+  viennacl::async_copy(vcl_vec1, std_vec1);
+  exec_time_return = timer.get();
+  viennacl::backend::finish();
+  exec_time_complete = timer.get();
+  std::cout << " *** viennacl::async_copy(), device to host ***" << std::endl;
+  std::cout << "  - Time to function return: " << exec_time_return << std::endl;
+  std::cout << "  - Time to completion: " << exec_time_complete << std::endl;
+  std::cout << "  - Estimated effective bandwidth: " << BENCHMARK_VECTOR_SIZE * sizeof(ScalarType) / exec_time_complete / 1e9 << " GB/sec" << std::endl;
+
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+
+#ifndef NDEBUG
+  std::cout << std::endl;
+  std::cout << " ******************************************************************" << std::endl;
+  std::cout << " **** WARNING: This is not a release build." << std::endl;
+  std::cout << " ****          Performance numbers are therefore lower than normal. " << std::endl;
+  std::cout << " ******************************************************************" << std::endl;
+  std::cout << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/benchmarks/generator_blas1.cpp b/examples/benchmarks/generator_blas1.cpp
new file mode 100644
index 0000000..d855b27
--- /dev/null
+++ b/examples/benchmarks/generator_blas1.cpp
@@ -0,0 +1,135 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark: BLAS level 3 functionality for dense matrices (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//disable debug mechanisms to have a fair benchmark environment
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+//#define VIENNACL_DEBUG_BUILD
+
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+
+#include "viennacl/generator/generate.hpp"
+
+// Some helper functions for this tutorial:
+#include "../tutorial/Random.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "benchmark-utils.hpp"
+
+#define N_RUNS 100
+#define MAX_SIZE 1e8
+
+enum operation_type{
+  dot,
+  assign
+};
+
+template<typename ScalarType>
+ScalarType run_benchmark(size_t size, operation_type type)
+{
+    std::size_t n_bytes = size*sizeof(ScalarType);
+    std::size_t n_transfers = 2;
+    if(type==dot)
+      n_transfers = 2;
+    else if(type==assign)
+      n_transfers = 2;
+    viennacl::vector<ScalarType> vcl_A = viennacl::scalar_vector<ScalarType>(size,1);
+    viennacl::vector<ScalarType> vcl_B = viennacl::scalar_vector<ScalarType>(size,1);
+
+    viennacl::scalar<ScalarType> s(0);
+
+    viennacl::scheduler::statement * statement = NULL;
+
+    if(type==dot)
+      statement = new viennacl::scheduler::statement(s, viennacl::op_assign(), viennacl::linalg::inner_prod(vcl_A, vcl_B));
+    else //if(type==assign)
+      statement = new viennacl::scheduler::statement(vcl_A, viennacl::op_assign(), vcl_B);
+
+    viennacl::generator::generate_enqueue_statement(*statement, statement->array()[0]);
+    viennacl::backend::finish();
+
+    Timer timer;
+    timer.start();
+    for(unsigned int r = 0 ; r < N_RUNS ; ++r){
+      viennacl::generator::generate_enqueue_statement(*statement, statement->array()[0]);
+    }
+    viennacl::backend::finish();
+
+    double time = timer.get()/(double)N_RUNS;
+    delete statement;
+
+    return ScalarType((n_bytes*n_transfers) / time / 1e9);
+}
+
+
+int main(){
+    typedef std::vector< viennacl::ocl::platform > platforms_type;
+    typedef std::vector<viennacl::ocl::device> devices_type;
+
+    platforms_type platforms = viennacl::ocl::get_platforms();
+    size_t num_platforms = platforms.size();
+
+    for(unsigned int k=0 ; k < num_platforms ; ++k)
+    {
+        viennacl::ocl::platform pf(k);
+        viennacl::ocl::set_context_device_type(k,CL_DEVICE_TYPE_ALL);
+        viennacl::ocl::set_context_platform_index(k,k);
+        viennacl::ocl::switch_context(k);
+        devices_type dev = viennacl::ocl::current_context().devices();
+        for(devices_type::iterator it = dev.begin() ; it != dev.end() ; ++it){
+
+                viennacl::ocl::switch_device(*it);
+                std::cout << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << "               Device Info" << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << viennacl::ocl::current_device().info() << std::endl;
+                std::cout << "float:" << std::endl;
+                std::cout << "#N\tAssign(GB/s)\tDot(GB/s)" << std::endl;
+                for(unsigned int size = 1024 ; size <= MAX_SIZE ; size *= 2){
+                  std::cout << std::setprecision(2) << (float)size << "\t" << (int)run_benchmark<float>(size, assign) << "\t" << (int)run_benchmark<float>(size, dot) << std::endl;
+                }
+                std::cout << std::endl;
+                std::cout << "double:" << std::endl;
+                std::cout << "#N\tAssign(GB/s)\tDot(GB/s)" << std::endl;
+                for(unsigned int size = 1024 ; size <= MAX_SIZE ; size *= 2){
+                  std::cout << std::setprecision(2) << (double)size << "\t" << (int)run_benchmark<double>(size, assign) << "\t" << (int)run_benchmark<double>(size, dot) << std::endl;
+                }
+        }
+    }
+    return 0;
+}
diff --git a/examples/benchmarks/generator_blas2.cpp b/examples/benchmarks/generator_blas2.cpp
new file mode 100644
index 0000000..ce6f3a5
--- /dev/null
+++ b/examples/benchmarks/generator_blas2.cpp
@@ -0,0 +1,127 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark: BLAS level 3 functionality for dense matrices (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//disable debug mechanisms to have a fair benchmark environment
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// ViennaCL includes
+//
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include "viennacl/generator/generate.hpp"
+
+// Some helper functions for this tutorial:
+#include "../tutorial/Random.hpp"
+
+
+#include "benchmark-utils.hpp"
+
+#define N_RUNS 100
+#define SIZE_INC 256
+#define MAX_SIZE 7936
+
+template<typename ScalarType, class FB>
+double run_benchmark(size_t size, bool is_trans)
+{
+    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+    viennacl::vector<ScalarType> y = viennacl::scalar_vector<ScalarType>(size,1);
+    viennacl::matrix<ScalarType,FB> A = viennacl::scalar_matrix<ScalarType>(size, size,1);
+    viennacl::vector<ScalarType> x = viennacl::scalar_vector<ScalarType>(size,1);
+
+    viennacl::scheduler::statement * statement;
+
+    if(is_trans)
+      statement = new viennacl::scheduler::statement(y, viennacl::op_assign(), viennacl::linalg::prod(trans(A),x));
+    else
+      statement = new viennacl::scheduler::statement(y, viennacl::op_assign(), viennacl::linalg::prod(A,x));
+
+    viennacl::generator::generate_enqueue_statement(*statement, statement->array()[0]);
+    viennacl::backend::finish();
+
+    Timer timer;
+    timer.start();
+    for(unsigned int r = 0 ; r < N_RUNS ; ++r){
+      viennacl::generator::generate_enqueue_statement(*statement, statement->array()[0]);
+    }
+    viennacl::backend::finish();
+
+    double time = timer.get()/(double)N_RUNS;
+    delete statement;
+    return 1e-9*size*(2*size-1)/time;
+}
+
+int main()
+{
+    typedef std::vector< viennacl::ocl::platform > platforms_type;
+    typedef std::vector<viennacl::ocl::device> devices_type;
+
+    platforms_type platforms = viennacl::ocl::get_platforms();
+    size_t num_platforms = platforms.size();
+
+    std::cout << "Running GEMV..." << std::endl;
+    for(unsigned int k=0 ; k < num_platforms ; ++k)
+    {
+        viennacl::ocl::platform pf(k);
+        viennacl::ocl::set_context_device_type(k,CL_DEVICE_TYPE_ALL);
+        viennacl::ocl::set_context_platform_index(k,k);
+        viennacl::ocl::switch_context(k);
+        devices_type dev = viennacl::ocl::current_context().devices();
+        for(devices_type::iterator it = dev.begin() ; it != dev.end() ; ++it){
+                viennacl::ocl::switch_device(*it);
+                std::cout << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << "               Device Info" << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << viennacl::ocl::current_device().info() << std::endl;
+                std::cout << std::endl;
+                std::cout << "float:" << std::endl;
+                std::cout << "#N\tAv(GFLOP/s)\tTv(GFLOP/s)" << std::endl;
+                for(unsigned int size = SIZE_INC ; size <= MAX_SIZE ; size += SIZE_INC){
+                    std::cout << size << "\t" << std::setprecision(3) << run_benchmark<float,viennacl::row_major>(size,false) << "\t" << run_benchmark<float,viennacl::row_major>(size,true) << std::endl;
+                }
+                std::cout << std::endl;
+                std::cout << "double:" << std::endl;
+                std::cout << "#N\tAv(GFLOP/s)\tTv(GFLOP/s)" << std::endl;
+                for(unsigned int size = SIZE_INC ; size <= MAX_SIZE ; size += SIZE_INC){
+                  std::cout << size << "\t" << std::setprecision(3) << run_benchmark<double,viennacl::row_major>(size,false) << "\t" << run_benchmark<double,viennacl::row_major>(size,true) << std::endl;
+                }
+        }
+    }
+    return 0;
+}
diff --git a/examples/benchmarks/generator_blas3.cpp b/examples/benchmarks/generator_blas3.cpp
new file mode 100644
index 0000000..408b633
--- /dev/null
+++ b/examples/benchmarks/generator_blas3.cpp
@@ -0,0 +1,129 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark: BLAS level 3 functionality for dense matrices (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ViennaCL includes
+//
+//#define VIENNACL_DEBUG_BUILD
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+// Some helper functions for this tutorial:
+#include "../tutorial/Random.hpp"
+
+
+#include "benchmark-utils.hpp"
+
+#define N_RUNS 2
+#define SIZE_INC 128
+#define MAX_SIZE 1536
+
+template<class MatA, class MatB, class MatC>
+viennacl::scheduler::statement * allocate_statement(bool is_lhs_trans, bool is_rhs_trans, MatA const & A, MatB const & B, MatC const & C){
+    if(is_lhs_trans)
+      if(is_rhs_trans)
+          return new viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(trans(A),trans(B)));
+      else
+          return new viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(trans(A),B));
+    else
+      if(is_rhs_trans)
+          return new viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(A,trans(B)));
+      else
+          return new viennacl::scheduler::statement(C, viennacl::op_assign(), viennacl::linalg::prod(A,B));
+
+}
+
+template<typename ScalarType>
+unsigned int run_benchmark(size_t size, bool is_lhs_trans, bool is_rhs_trans)
+{    //viennacl::ocl::current_context().build_options("-cl-mad-enable -cl-fast-relaxed-math");   //uncomment for additional optimizations
+    //viennacl::ocl::current_context().build_options("-cl-opt-disable");                        //uncomment to get poor performance
+    viennacl::matrix<ScalarType> A(size, size);
+    viennacl::matrix<ScalarType> B(size, size);
+    viennacl::matrix<ScalarType> C(size, size);
+    viennacl::scheduler::statement * statement = allocate_statement(is_lhs_trans, is_rhs_trans,A,B,C);
+    viennacl::generator::generate_enqueue_statement(*statement, statement->array()[0]);
+    viennacl::backend::finish();
+    Timer timer;
+    timer.start();
+    for(unsigned int r = 0 ; r < N_RUNS ; ++r){
+      viennacl::generator::generate_enqueue_statement(*statement, statement->array()[0]);
+    }
+    viennacl::backend::finish();
+    double time = timer.get()/(double)N_RUNS;
+    delete statement;
+    return static_cast<unsigned int>(2*pow(size/static_cast<double>(1000.0),3)/time);
+}
+
+int main(){
+    typedef std::vector< viennacl::ocl::platform > platforms_type;
+    typedef std::vector<viennacl::ocl::device> devices_type;
+
+    platforms_type platforms = viennacl::ocl::get_platforms();
+    size_t num_platforms = platforms.size();
+
+
+    for(unsigned int k=0 ; k < num_platforms ; ++k)
+    {
+        viennacl::ocl::platform pf(k);
+        viennacl::ocl::set_context_platform_index(k,k);
+        viennacl::ocl::switch_context(k);
+        devices_type dev = viennacl::ocl::current_context().devices();
+        for(devices_type::iterator it = dev.begin() ; it != dev.end() ; ++it){
+          if(it->type()==CL_DEVICE_TYPE_GPU){
+                viennacl::ocl::switch_device(*it);
+                std::cout << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << "               Device Info" << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << viennacl::ocl::current_device().info() << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+
+                std::cout << "float : " << std::endl;
+                std::cout << "#Size\tAA\tTA\tAT\tTT" << std::endl;
+                for(unsigned int size = SIZE_INC ; size <= MAX_SIZE ; size += SIZE_INC){
+                    std::cout << size << "\t" << run_benchmark<float>(size,false,false) << "\t" << run_benchmark<float>(size,true,false) << "\t" << run_benchmark<float>(size,false,true) << "\t" << run_benchmark<float>(size,true,true) << std::endl;
+                }
+
+                std::cout << "double : " << std::endl;
+                std::cout << "#Size\tAA\tTA\tAT\tTT" << std::endl;
+                for(unsigned int size = SIZE_INC ; size <= MAX_SIZE ; size += SIZE_INC){
+                    std::cout << size << "\t" << run_benchmark<double>(size,false,false) << "\t" << run_benchmark<double>(size,true,false) << "\t" << run_benchmark<double>(size,false,true) << "\t" << run_benchmark<double>(size,true,true) << std::endl;
+                }
+          }
+        }
+    }
+    return 0;
+}
diff --git a/examples/benchmarks/iccs_qr.cpp b/examples/benchmarks/iccs_qr.cpp
deleted file mode 100644
index c97dfcb..0000000
--- a/examples/benchmarks/iccs_qr.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <sys/time.h>
-#include <time.h>
-#include "benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/qr.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-           
-//typedef viennacl::compressed_matrix<float> SparseMatrix;
-using namespace boost::numeric::ublas;
-//using namespace viennacl::linalg;
-
-
-void run(size_t rows, size_t cols, std::size_t block_size, double with_cpu = true)
-{
-  typedef float               ScalarType;
-  typedef boost::numeric::ublas::matrix<ScalarType, boost::numeric::ublas::column_major>        MatrixType;
-  typedef boost::numeric::ublas::vector<ScalarType>                   VectorType;
-  
-  typedef viennacl::matrix<ScalarType, viennacl::column_major>   VCLMatrixType;
-  typedef viennacl::vector<ScalarType>   VCLVectorType;
-  
-  Timer timer;
-  double elapsed = 0;
-  
-  MatrixType A(rows, cols);
-
-  for (size_t i=0; i<rows; ++i)
-  {
-    for (size_t j=0; j<cols; ++j)
-    {
-      A(i,j) = 1.0 + (i + 1)*(j+1);
-    }
-  }
-  
-  VCLVectorType dummy(10);
-  VCLMatrixType vcl_A(rows, cols);
-    
-  viennacl::copy(A, vcl_A);
-
-  std::cout << "Benchmark size: " << rows << " x " << cols << std::endl;
-
-  
-  //
-  //  CPU:
-  //
-  
-  if (with_cpu)
-  {
-    timer.start();
-    std::vector<ScalarType> betas_cpu = viennacl::linalg::inplace_qr_ublas(A, block_size);
-    elapsed = timer.get();
-    std::cout << "Time for QR on CPU: " << elapsed << std::endl;
-  }
-  
-  //
-  //  GPU:
-  //
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  std::vector<ScalarType> betas_gpu = viennacl::linalg::inplace_qr_viennacl(vcl_A, block_size);
-  viennacl::ocl::get_queue().finish();
-  elapsed = timer.get();
-  std::cout << "Time for QR on GPU: " << elapsed << std::endl;
-    
-  //
-  //  Hybrid:
-  //
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  std::vector<ScalarType> betas_hybrid = viennacl::linalg::inplace_qr_hybrid(vcl_A, block_size);
-  viennacl::ocl::get_queue().finish();
-  elapsed = timer.get();
-  std::cout << "Time for QR on CPU/GPU: " << elapsed << std::endl;
-  
-}
-
-
-
-
-int main (int argc, const char * argv[])
-{
-   run(200, 200, 20);    
-   
-   std::size_t max_size = 3200;
- 
-   std::cout << "---- block size: 20 -----" << std::endl;
-   for (std::size_t i=100; i<=max_size; i*=2)
-     run(i, i, 20);    
-
-   std::cout << "---- block size: 50 -----" << std::endl;
-   for (std::size_t i=100; i<=max_size; i*=2)
-     run(i, i, 50);    
-    
-   std::cout << "---- block size: 100 -----" << std::endl;
-   for (std::size_t i=100; i<=max_size; i*=2)
-     run(i, i, 100);    
-    
-   
-   return EXIT_SUCCESS;
-}
-
diff --git a/examples/benchmarks/io.hpp b/examples/benchmarks/io.hpp
index 2f83637..be28286 100644
--- a/examples/benchmarks/io.hpp
+++ b/examples/benchmarks/io.hpp
@@ -1,114 +1,115 @@
-#ifndef VECTOR_IO_HPP_
-#define VECTOR_IO_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#include <boost/numeric/ublas/vector.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-
-#include <iostream>
-#include <fstream>
-
-template <class TYPE>
-bool readVectorFromFile(const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec)
-{
-	std::ifstream file(filename.c_str());
-
-	if (!file) return false;
-
-	unsigned int size;
-	file >> size;
-	vec.resize(size);
-
-	for (unsigned int i = 0; i < size; ++i)
-	{
-		TYPE element;
-		file >> element;
-		vec[i] = element;
-	}
-
-	return true;
-}
-
-template<class TYPE>
-bool readVectorFromBinaryFile(const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec)
-{
-	std::ifstream file(filename.c_str(), std::ios_base::binary);
-	if (!file) return false;
-
-	unsigned int size;
-	file.read((char*)&size, sizeof(unsigned int));
-	vec.resize(size);
-	file.read((char*)&vec[0], sizeof(TYPE)*size);
-
-	return true;
-}
-
-template<class TYPE>
-bool saveVectorToBinaryFile(const std::string & filename, const boost::numeric::ublas::vector<TYPE> & vec)
-{
-	std::ofstream file(filename.c_str(), std::ios_base::binary);
-	if (!file) return false;
-
-	unsigned int size = vec.size();
-	file.write((char*)&size, sizeof(unsigned int));
-	file.write((char*)&vec[0], sizeof(TYPE)*size);
-
-	return true;
-}
-
-template <class TYPE>
-bool readMatrixFromFile(const std::string & filename, boost::numeric::ublas::compressed_matrix<TYPE> & matrix)
-{
-  std::cout << "Reading ublas matrix" << std::endl;
-  
-  std::ifstream file(filename.c_str());
-
-  if (!file) return false;
-
-  std::string id;
-  file >> id;
-  if (id != "Matrix") return false;
-
-  unsigned int num_rows, num_columns;
-  file >> num_rows >> num_columns;
-  if (num_rows != num_columns) return false;
-  
-  matrix.resize(num_rows, num_rows, false);
-
-  for (unsigned int row = 0; row < num_rows; ++row)
-  {
-    int num_entries;
-    file >> num_entries;
-    for (int j = 0; j < num_entries; ++j)
-    {
-      unsigned int column;
-      TYPE element;
-      file >> column >> element;
-
-      //matrix.insert_element(row, column, element);
-      matrix(row, column) = element;
-    }
-  }
-
-  return true;
-}
-
-
-
-
-#endif
+#ifndef VECTOR_IO_HPP_
+#define VECTOR_IO_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+
+#include <iostream>
+#include <fstream>
+
+template <class TYPE>
+bool readVectorFromFile(const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec)
+{
+  std::ifstream file(filename.c_str());
+
+  if (!file) return false;
+
+  unsigned int size;
+  file >> size;
+  vec.resize(size);
+
+  for (unsigned int i = 0; i < size; ++i)
+  {
+    TYPE element;
+    file >> element;
+    vec[i] = element;
+  }
+
+  return true;
+}
+
+template<class TYPE>
+bool readVectorFromBinaryFile(const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec)
+{
+  std::ifstream file(filename.c_str(), std::ios_base::binary);
+  if (!file) return false;
+
+  unsigned int size;
+  file.read((char*)&size, sizeof(unsigned int));
+  vec.resize(size);
+  file.read((char*)&vec[0], sizeof(TYPE)*size);
+
+  return true;
+}
+
+template<class TYPE>
+bool saveVectorToBinaryFile(const std::string & filename, const boost::numeric::ublas::vector<TYPE> & vec)
+{
+  std::ofstream file(filename.c_str(), std::ios_base::binary);
+  if (!file) return false;
+
+  unsigned int size = vec.size();
+  file.write((char*)&size, sizeof(unsigned int));
+  file.write((char*)&vec[0], sizeof(TYPE)*size);
+
+  return true;
+}
+
+template <class TYPE>
+bool readMatrixFromFile(const std::string & filename, boost::numeric::ublas::compressed_matrix<TYPE> & matrix)
+{
+  std::cout << "Reading ublas matrix" << std::endl;
+
+  std::ifstream file(filename.c_str());
+
+  if (!file) return false;
+
+  std::string id;
+  file >> id;
+  if (id != "Matrix") return false;
+
+  unsigned int num_rows, num_columns;
+  file >> num_rows >> num_columns;
+  if (num_rows != num_columns) return false;
+
+  matrix.resize(num_rows, num_rows, false);
+
+  for (unsigned int row = 0; row < num_rows; ++row)
+  {
+    int num_entries;
+    file >> num_entries;
+    for (int j = 0; j < num_entries; ++j)
+    {
+      unsigned int column;
+      TYPE element;
+      file >> column >> element;
+
+      //matrix.insert_element(row, column, element);
+      matrix(row, column) = element;
+    }
+  }
+
+  return true;
+}
+
+
+
+
+#endif
diff --git a/examples/benchmarks/opencl.cpp b/examples/benchmarks/opencl.cpp
index ca0b791..5a7460d 100644
--- a/examples/benchmarks/opencl.cpp
+++ b/examples/benchmarks/opencl.cpp
@@ -1,141 +1,147 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-
-using std::cout;
-using std::cin;
-using std::endl;
-
-
-/*
-*   Benchmark 4:
-*   Profiling performance of current OpenCL implementation
-*   
-*/
-
-#define BENCHMARK_VECTOR_SIZE   100000
-
-
-template<typename ScalarType>
-int run_benchmark()
-{
-   
-   Timer timer;
-   double exec_time;
-   
-  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
-  
-  
-  viennacl::ocl::get_queue().finish();
-  
-  timer.start();
-  viennacl::scalar<ScalarType> vcl_s1;
-  exec_time = timer.get();
-  std::cout << "Time for building scalar kernels: " << exec_time << std::endl;
-  
-  timer.start();
-  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
-  exec_time = timer.get();
-  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
-  std::cout << "Time for building vector kernels: " << exec_time << std::endl;
-  
-  timer.start();
-  viennacl::matrix<ScalarType> vcl_matrix(BENCHMARK_VECTOR_SIZE/100, BENCHMARK_VECTOR_SIZE/100);
-  exec_time = timer.get();
-  std::cout << "Time for building matrix kernels: " << exec_time << std::endl;
-  
-  timer.start();
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(BENCHMARK_VECTOR_SIZE, BENCHMARK_VECTOR_SIZE);
-  exec_time = timer.get();
-  std::cout << "Time for building compressed_matrix kernels: " << exec_time << std::endl;
-  
-
-  
-  ///////////// Vector operations /////////////////
-  
-  std_vec1[0] = 1.0;
-  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
-    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
-
-  viennacl::copy(std_vec1, vcl_vec1);
-  
-  double std_accumulate = 0;
-  double vcl_accumulate = 0;
-
-  timer.start();
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    std_accumulate += std_vec1[i];
-  exec_time = timer.get();
-  std::cout << "Time for " << BENCHMARK_VECTOR_SIZE << " entry accesses on host: " << exec_time << std::endl;
-  std::cout << "Time per entry: " << exec_time / BENCHMARK_VECTOR_SIZE << std::endl;
-  std::cout << "Result of operation on host: " << std_accumulate << std::endl;
-
-  vcl_accumulate = vcl_vec1[0];
-  viennacl::ocl::get_queue().finish();
-  vcl_accumulate = 0;
-  timer.start();
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    vcl_accumulate += vcl_vec1[i];
-  exec_time = timer.get();
-  std::cout << "Time for " << BENCHMARK_VECTOR_SIZE << " entry accesses via OpenCL: " << exec_time << std::endl;
-  std::cout << "Time per entry: " << exec_time / BENCHMARK_VECTOR_SIZE << std::endl;
-  std::cout << "Result of operation via OpenCL: " << vcl_accumulate << std::endl;
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: OpenCL performance" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:  Profiling performance of current OpenCL implementation
+*
+*/
+
+
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#ifndef VIENNACL_WITH_OPENCL
+  #define VIENNACL_WITH_OPENCL
+#endif
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+
+using std::cout;
+using std::cin;
+using std::endl;
+
+
+#define BENCHMARK_VECTOR_SIZE   100000
+
+
+template<typename ScalarType>
+int run_benchmark()
+{
+
+   Timer timer;
+   double exec_time;
+
+  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
+
+
+  viennacl::ocl::get_queue().finish();
+
+  timer.start();
+  viennacl::scalar<ScalarType> vcl_s1;
+  exec_time = timer.get();
+  std::cout << "Time for building scalar kernels: " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
+  exec_time = timer.get();
+  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
+  std::cout << "Time for building vector kernels: " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::matrix<ScalarType> vcl_matrix(BENCHMARK_VECTOR_SIZE/100, BENCHMARK_VECTOR_SIZE/100);
+  exec_time = timer.get();
+  std::cout << "Time for building matrix kernels: " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(BENCHMARK_VECTOR_SIZE, BENCHMARK_VECTOR_SIZE);
+  exec_time = timer.get();
+  std::cout << "Time for building compressed_matrix kernels: " << exec_time << std::endl;
+
+
+
+  ///////////// Vector operations /////////////////
+
+  std_vec1[0] = 1.0;
+  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
+    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
+
+  viennacl::copy(std_vec1, vcl_vec1);
+
+  double std_accumulate = 0;
+  double vcl_accumulate = 0;
+
+  timer.start();
+  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    std_accumulate += std_vec1[i];
+  exec_time = timer.get();
+  std::cout << "Time for " << BENCHMARK_VECTOR_SIZE << " entry accesses on host: " << exec_time << std::endl;
+  std::cout << "Time per entry: " << exec_time / BENCHMARK_VECTOR_SIZE << std::endl;
+  std::cout << "Result of operation on host: " << std_accumulate << std::endl;
+
+  vcl_accumulate = vcl_vec1[0];
+  viennacl::ocl::get_queue().finish();
+  vcl_accumulate = 0;
+  timer.start();
+  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    vcl_accumulate += vcl_vec1[i];
+  exec_time = timer.get();
+  std::cout << "Time for " << BENCHMARK_VECTOR_SIZE << " entry accesses via OpenCL: " << exec_time << std::endl;
+  std::cout << "Time per entry: " << exec_time / BENCHMARK_VECTOR_SIZE << std::endl;
+  std::cout << "Result of operation via OpenCL: " << vcl_accumulate << std::endl;
+
+  return 0;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: OpenCL performance" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/qr.cpp b/examples/benchmarks/qr.cpp
index b0b96ca..9f11a66 100644
--- a/examples/benchmarks/qr.cpp
+++ b/examples/benchmarks/qr.cpp
@@ -1,7 +1,8 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
@@ -14,7 +15,7 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-#define VIENNACL_HAVE_UBLAS
+#define VIENNACL_WITH_UBLAS
 #ifndef NDEBUG
  #define NDEBUG
 #endif
@@ -57,8 +58,8 @@ int main (int argc, const char * argv[])
     Timer timer;
     double elapsed;
 
-    size_t rows = 1800;
-    size_t cols = 1800;
+    std::size_t rows = 1800;
+    std::size_t cols = 1800;
     double num_ops_qr = 2.0 * cols * cols * (rows - cols/3.0);
     double num_ops_recovery = 4.0 * (rows*rows*cols - rows*cols*cols + cols*cols*cols);
     
@@ -66,14 +67,14 @@ int main (int argc, const char * argv[])
     MatrixType Q(rows, rows);
     MatrixType R(rows, cols);
     
-    for (size_t i=0; i<rows; ++i)
+    for (std::size_t i=0; i<rows; ++i)
     {
-      for (size_t j=0; j<cols; ++j)
+      for (std::size_t j=0; j<cols; ++j)
       {
         A(i,j) = 1.0 + (i + 1)*(j+1);
         R(i,j) = 0.0;
       }
-      for (size_t j=0; j<rows; ++j)
+      for (std::size_t j=0; j<rows; ++j)
       {
         Q(i,j) = 0.0;
       }
@@ -85,7 +86,7 @@ int main (int argc, const char * argv[])
     //std::vector<ScalarType> betas = viennacl::linalg::qr(A);
     elapsed = timer.get();
     std::cout << "Time for QR on CPU: " << elapsed << std::endl;
-    std::cout << "Estimated GFLOPS: " << 1e-9 * num_ops_qr/ elapsed << std::endl;
+    std::cout << "Estimated GFLOPs: " << 2e-9 * num_ops_qr/ elapsed << std::endl;
     
     
     //std::cout << "Inplace QR-factored A: " << A << std::endl;
@@ -94,7 +95,7 @@ int main (int argc, const char * argv[])
     viennacl::linalg::recoverQ(A, betas, Q, R); 
     elapsed = timer.get();
     std::cout << "Time for Q-recovery on CPU: " << elapsed << std::endl;
-    std::cout << "Estimated GFLOPS: " << 1e-9 * num_ops_recovery / elapsed << std::endl;
+    std::cout << "Estimated GFLOPs: " << 2e-9 * num_ops_recovery / elapsed << std::endl;
 
     /*std::cout << "R after recovery: " << R << std::endl;
     std::cout << "Q after recovery: " << Q << std::endl;
diff --git a/examples/benchmarks/qr.cpp b/examples/benchmarks/qr.cu
similarity index 86%
copy from examples/benchmarks/qr.cpp
copy to examples/benchmarks/qr.cu
index b0b96ca..9f11a66 100644
--- a/examples/benchmarks/qr.cpp
+++ b/examples/benchmarks/qr.cu
@@ -1,7 +1,8 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
@@ -14,7 +15,7 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-#define VIENNACL_HAVE_UBLAS
+#define VIENNACL_WITH_UBLAS
 #ifndef NDEBUG
  #define NDEBUG
 #endif
@@ -57,8 +58,8 @@ int main (int argc, const char * argv[])
     Timer timer;
     double elapsed;
 
-    size_t rows = 1800;
-    size_t cols = 1800;
+    std::size_t rows = 1800;
+    std::size_t cols = 1800;
     double num_ops_qr = 2.0 * cols * cols * (rows - cols/3.0);
     double num_ops_recovery = 4.0 * (rows*rows*cols - rows*cols*cols + cols*cols*cols);
     
@@ -66,14 +67,14 @@ int main (int argc, const char * argv[])
     MatrixType Q(rows, rows);
     MatrixType R(rows, cols);
     
-    for (size_t i=0; i<rows; ++i)
+    for (std::size_t i=0; i<rows; ++i)
     {
-      for (size_t j=0; j<cols; ++j)
+      for (std::size_t j=0; j<cols; ++j)
       {
         A(i,j) = 1.0 + (i + 1)*(j+1);
         R(i,j) = 0.0;
       }
-      for (size_t j=0; j<rows; ++j)
+      for (std::size_t j=0; j<rows; ++j)
       {
         Q(i,j) = 0.0;
       }
@@ -85,7 +86,7 @@ int main (int argc, const char * argv[])
     //std::vector<ScalarType> betas = viennacl::linalg::qr(A);
     elapsed = timer.get();
     std::cout << "Time for QR on CPU: " << elapsed << std::endl;
-    std::cout << "Estimated GFLOPS: " << 1e-9 * num_ops_qr/ elapsed << std::endl;
+    std::cout << "Estimated GFLOPs: " << 2e-9 * num_ops_qr/ elapsed << std::endl;
     
     
     //std::cout << "Inplace QR-factored A: " << A << std::endl;
@@ -94,7 +95,7 @@ int main (int argc, const char * argv[])
     viennacl::linalg::recoverQ(A, betas, Q, R); 
     elapsed = timer.get();
     std::cout << "Time for Q-recovery on CPU: " << elapsed << std::endl;
-    std::cout << "Estimated GFLOPS: " << 1e-9 * num_ops_recovery / elapsed << std::endl;
+    std::cout << "Estimated GFLOPs: " << 2e-9 * num_ops_recovery / elapsed << std::endl;
 
     /*std::cout << "R after recovery: " << R << std::endl;
     std::cout << "Q after recovery: " << Q << std::endl;
diff --git a/examples/benchmarks/opencl.cpp b/examples/benchmarks/scheduler.cpp
similarity index 52%
copy from examples/benchmarks/opencl.cpp
copy to examples/benchmarks/scheduler.cpp
index ca0b791..a3ae497 100644
--- a/examples/benchmarks/opencl.cpp
+++ b/examples/benchmarks/scheduler.cpp
@@ -1,141 +1,152 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-
-using std::cout;
-using std::cin;
-using std::endl;
-
-
-/*
-*   Benchmark 4:
-*   Profiling performance of current OpenCL implementation
-*   
-*/
-
-#define BENCHMARK_VECTOR_SIZE   100000
-
-
-template<typename ScalarType>
-int run_benchmark()
-{
-   
-   Timer timer;
-   double exec_time;
-   
-  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
-  
-  
-  viennacl::ocl::get_queue().finish();
-  
-  timer.start();
-  viennacl::scalar<ScalarType> vcl_s1;
-  exec_time = timer.get();
-  std::cout << "Time for building scalar kernels: " << exec_time << std::endl;
-  
-  timer.start();
-  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
-  exec_time = timer.get();
-  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
-  std::cout << "Time for building vector kernels: " << exec_time << std::endl;
-  
-  timer.start();
-  viennacl::matrix<ScalarType> vcl_matrix(BENCHMARK_VECTOR_SIZE/100, BENCHMARK_VECTOR_SIZE/100);
-  exec_time = timer.get();
-  std::cout << "Time for building matrix kernels: " << exec_time << std::endl;
-  
-  timer.start();
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(BENCHMARK_VECTOR_SIZE, BENCHMARK_VECTOR_SIZE);
-  exec_time = timer.get();
-  std::cout << "Time for building compressed_matrix kernels: " << exec_time << std::endl;
-  
-
-  
-  ///////////// Vector operations /////////////////
-  
-  std_vec1[0] = 1.0;
-  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
-    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
-
-  viennacl::copy(std_vec1, vcl_vec1);
-  
-  double std_accumulate = 0;
-  double vcl_accumulate = 0;
-
-  timer.start();
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    std_accumulate += std_vec1[i];
-  exec_time = timer.get();
-  std::cout << "Time for " << BENCHMARK_VECTOR_SIZE << " entry accesses on host: " << exec_time << std::endl;
-  std::cout << "Time per entry: " << exec_time / BENCHMARK_VECTOR_SIZE << std::endl;
-  std::cout << "Result of operation on host: " << std_accumulate << std::endl;
-
-  vcl_accumulate = vcl_vec1[0];
-  viennacl::ocl::get_queue().finish();
-  vcl_accumulate = 0;
-  timer.start();
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    vcl_accumulate += vcl_vec1[i];
-  exec_time = timer.get();
-  std::cout << "Time for " << BENCHMARK_VECTOR_SIZE << " entry accesses via OpenCL: " << exec_time << std::endl;
-  std::cout << "Time per entry: " << exec_time / BENCHMARK_VECTOR_SIZE << std::endl;
-  std::cout << "Result of operation via OpenCL: " << vcl_accumulate << std::endl;
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: OpenCL performance" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:   Vector operations (vector.cpp and vector.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+//#define VIENNACL_DEBUG_ALL
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/scheduler/execute.hpp"
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+
+using std::cout;
+using std::cin;
+using std::endl;
+
+
+#define BENCHMARK_VECTOR_SIZE   2
+#define BENCHMARK_RUNS          1000
+
+
+template<typename ScalarType>
+int run_benchmark()
+{
+
+  Timer timer;
+  double exec_time;
+
+  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
+  ScalarType alpha = ScalarType(1.1415);
+  ScalarType beta  = ScalarType(0.97172);
+
+
+  ///////////// Vector operations /////////////////
+
+  std_vec1[0] = 1.0;
+  std_vec2[0] = 1.0;
+  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
+  {
+    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
+    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
+  }
+
+  viennacl::copy(std_vec1, vcl_vec1);
+  viennacl::fast_copy(std_vec1, vcl_vec1);
+  viennacl::copy(std_vec2, vcl_vec2);
+
+  viennacl::backend::finish();
+  vcl_vec2 = alpha * vcl_vec1 + beta * vcl_vec2;
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec2 = alpha * vcl_vec1 + beta * vcl_vec2;
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "Execution time per operation, no scheduler: " << exec_time / BENCHMARK_RUNS << " sec" << std::endl;
+  std::cout << "Result: " << vcl_vec2[0] << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    viennacl::scheduler::statement   my_statement(vcl_vec2, viennacl::op_assign(), alpha * vcl_vec1 + beta * vcl_vec2); // same as vcl_v1 = alpha * vcl_vec1 + beta * vcl_vec2;
+    viennacl::scheduler::execute(my_statement);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "Execution time per operation, with scheduler including statement generation: " << exec_time / BENCHMARK_RUNS << " sec" << std::endl;
+  std::cout << "Result: " << vcl_vec2[0] << std::endl;
+
+  viennacl::scheduler::statement   my_statement(vcl_vec2, viennacl::op_assign(), alpha * vcl_vec1 + beta * vcl_vec2); // same as vcl_v1 = alpha * vcl_vec1 + beta * vcl_vec2;
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    viennacl::scheduler::execute(my_statement);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "Execution time per operation, only execution: " << exec_time / BENCHMARK_RUNS << " sec" << std::endl;
+  std::cout << "Result: " << vcl_vec2[0] << std::endl;
+
+  return 0;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/solver.cpp b/examples/benchmarks/solver.cpp
index f676dfc..5a27a3c 100644
--- a/examples/benchmarks/solver.cpp
+++ b/examples/benchmarks/solver.cpp
@@ -1,401 +1,645 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-
-#define VIENNACL_HAVE_UBLAS 1
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/row_scaling.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-#include "io.hpp"
-
-
-using namespace boost::numeric;
-
-/*
-*   Benchmark:
-*   Iterative solver tests
-*   
-*/
-
-#define BENCHMARK_RUNS          1
-
-
-template <typename ScalarType>
-ScalarType diff_inf(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   for (unsigned int i=0;i<v1.size(); ++i)
-   {
-      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
-         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
-      else
-         v2_cpu[i] = 0.0;
-   }
-
-   return norm_inf(v2_cpu);
-}
-
-template <typename ScalarType>
-ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   return norm_2(v1 - v2_cpu) / norm_2(v1);
-}
-
-
-template <typename MatrixType, typename VectorType, typename SolverTag, typename PrecondTag>
-void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType const & ref_result, SolverTag const & solver, PrecondTag const & precond, long ops)
-{
-  Timer timer;
-  VectorType result(rhs);
-  VectorType residual(rhs);
-  viennacl::ocl::get_queue().finish();
-  
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    result = viennacl::linalg::solve(matrix, rhs, solver, precond);
-  }
-  viennacl::ocl::get_queue().finish();
-  double exec_time = timer.get();
-  std::cout << "Exec. time: " << exec_time << std::endl;
-  std::cout << "Est. "; printOps(ops, exec_time / BENCHMARK_RUNS);
-  residual -= viennacl::linalg::prod(matrix, result);
-  std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
-  std::cout << "Estimated rel. residual: " << solver.error() << std::endl;
-  std::cout << "Iterations: " << solver.iters() << std::endl;
-  result -= ref_result;
-  std::cout << "Relative deviation from result: " << viennacl::linalg::norm_2(result) / viennacl::linalg::norm_2(ref_result) << std::endl;
-}
-
-
-template<typename ScalarType>
-int run_benchmark()
-{
-  
-  Timer timer;
-  double exec_time;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  ublas::vector<ScalarType> ublas_vec1;
-  ublas::vector<ScalarType> ublas_vec2;
-  ublas::vector<ScalarType> ublas_result;
-  unsigned int solver_iters = 20;
-  unsigned int solver_krylov_dim = 20;
-  double solver_tolerance = 1e-6;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/rhs65025.txt", ublas_vec1))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/rhs65025.txt", ublas_vec1))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading rhs" << std::endl;
-  ublas_vec2 = ublas_vec1;
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_result))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_result))
-  #endif
-  {
-    std::cout << "Error reading result file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading result" << std::endl;
-  
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec1.size(), ublas_vec1.size());
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix(ublas_vec1.size(), ublas_vec1.size());
-
-  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
-  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
-  viennacl::vector<ScalarType> vcl_result(ublas_vec1.size()); 
-  
-
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return EXIT_FAILURE;
-  }
-  //unsigned int cg_mat_size = cg_mat.size(); 
-  std::cout << "done reading matrix" << std::endl;
-  
-  //cpu to gpu:
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
-  viennacl::copy(ublas_vec1, vcl_vec1);
-  viennacl::copy(ublas_vec2, vcl_vec2);
-  viennacl::copy(ublas_result, vcl_result);
-  
-  
-  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
-  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
-  
-  viennacl::linalg::row_scaling< ublas::compressed_matrix<ScalarType> >    ublas_row_scaling(ublas_matrix, viennacl::linalg::row_scaling_tag(1));
-  viennacl::linalg::row_scaling< viennacl::compressed_matrix<ScalarType> > vcl_row_scaling(vcl_compressed_matrix, viennacl::linalg::row_scaling_tag(1));
-  
-  ///////////////////////////////////////////////////////////////////////////////
-  //////////////////////           ILUT preconditioner         //////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  std::cout << "------- ILUT on CPU (ublas) ----------" << std::endl;
-
-  timer.start();
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  exec_time = timer.get();
-  std::cout << "Setup time: " << exec_time << std::endl;
-  
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    ublas_ilut.apply(ublas_vec1);
-  }
-  exec_time = timer.get();
-  std::cout << "ublas time: " << exec_time << std::endl;
-  
-  std::cout << "------- ILUT with ViennaCL ----------" << std::endl;
-
-  timer.start();
-  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
-  exec_time = timer.get();
-  std::cout << "Setup time: " << exec_time << std::endl;
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_ilut.apply(vcl_vec1);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "ViennaCL time: " << exec_time << std::endl;
-  
-  ///////////////////////////////////////////////////////////////////////////////
-  //////////////////////              CG solver                //////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  long cg_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + 6 * ublas_vec2.size()));
-  
-  viennacl::linalg::cg_tag cg_solver(solver_tolerance, solver_iters);
-  
-  std::cout << "------- CG solver (no preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
-  
-  std::cout << "------- CG solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
- 
-//  std::cout << "------- CG solver (no preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, cg_ops);
-
-
-
-  std::cout << "------- CG solver (ILUT preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilut, cg_ops);
-  
-  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
-  
-//  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
-  
-  
-  std::cout << "------- CG solver (Jacobi preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_jacobi, cg_ops);
-  
-  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi, cg_ops);
-  
-//  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi, cg_ops);
-  
-  
-  std::cout << "------- CG solver (row scaling preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_row_scaling, cg_ops);
-  
-  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling, cg_ops);
-  
-//  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling, cg_ops);
-  
-  ///////////////////////////////////////////////////////////////////////////////
-  //////////////////////           BiCGStab solver             //////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  
-  long bicgstab_ops = static_cast<long>(solver_iters * (2 * ublas_matrix.nnz() + 13 * ublas_vec2.size()));
-  
-  viennacl::linalg::bicgstab_tag bicgstab_solver(solver_tolerance, solver_iters);
-                                                                             
-  std::cout << "------- BiCGStab solver (no preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
-  
-//  std::cout << "------- BiCGStab solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, bicgstab_ops);
-
-  
-  std::cout << "------- BiCGStab solver (ILUT preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_ilut, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
-  
-//  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (Jacobi preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_jacobi, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi, bicgstab_ops);
-  
-//  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (row scaling preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_row_scaling, bicgstab_ops);
-  
-  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling, bicgstab_ops);
-  
-//  std::cout << "------- CG solver row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling, bicgstab_ops);
-
-  ///////////////////////////////////////////////////////////////////////////////
-  ///////////////////////            GMRES solver             ///////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  
-  long gmres_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + (solver_iters * 2 + 7) * ublas_vec2.size()));
-  
-  viennacl::linalg::gmres_tag gmres_solver(solver_tolerance, solver_iters, solver_krylov_dim);
-  
-  std::cout << "------- GMRES solver (no preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
-  
-  std::cout << "------- GMRES solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
-  
-//  std::cout << "------- GMRES solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, bicgstab_ops);
-
-  
-  std::cout << "------- GMRES solver (ILUT preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_ilut, gmres_ops);
-  
-  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
-  
-//  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
-
-
-  std::cout << "------- GMRES solver (Jacobi preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_jacobi, gmres_ops);
-  
-  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi, gmres_ops);
-  
-//  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi, gmres_ops);
-  
-  
-  std::cout << "------- GMRES solver (row scaling preconditioner) using ublas ----------" << std::endl;
-  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_row_scaling, gmres_ops);
-  
-  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
-  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling, gmres_ops);
-  
-//  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
-//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling, gmres_ops);
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << "---------------------------------------------------------------------------" << std::endl;
-  std::cout << "---------------------------------------------------------------------------" << std::endl;
-  std::cout << " Benchmark for Execution Times of Iterative Solvers provided with ViennaCL " << std::endl;
-  std::cout << "---------------------------------------------------------------------------" << std::endl;
-  std::cout << " Note that the purpose of this benchmark is not to run solvers until" << std::endl;
-  std::cout << " convergence. Instead, only the execution times of a few iterations are" << std::endl;
-  std::cout << " recorded. Residual errors are only printed for information." << std::endl << std::endl;
-   
-
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Solver" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs with Stream SDK v2.1." << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:  Iterative solver tests (solver.cpp and solver.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+
+#define VIENNACL_WITH_UBLAS 1
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/context.hpp"
+
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/bicgstab.hpp"
+#include "viennacl/linalg/gmres.hpp"
+
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/ichol.hpp"
+#include "viennacl/linalg/jacobi_precond.hpp"
+#include "viennacl/linalg/row_scaling.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/mixed_precision_cg.hpp"
+#endif
+
+#include "viennacl/io/matrix_market.hpp"
+
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+#include "io.hpp"
+
+
+using namespace boost::numeric;
+
+#define BENCHMARK_RUNS          1
+
+
+template <typename ScalarType>
+ScalarType diff_inf(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+template <typename ScalarType>
+ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   return norm_2(v1 - v2_cpu) / norm_2(v1);
+}
+
+
+template <typename MatrixType, typename VectorType, typename SolverTag, typename PrecondTag>
+void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType const & ref_result, SolverTag const & solver, PrecondTag const & precond, long ops)
+{
+  Timer timer;
+  VectorType result(rhs);
+  VectorType residual(rhs);
+  viennacl::backend::finish();
+
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    result = viennacl::linalg::solve(matrix, rhs, solver, precond);
+  }
+  viennacl::backend::finish();
+  double exec_time = timer.get();
+  std::cout << "Exec. time: " << exec_time << std::endl;
+  std::cout << "Est. "; printOps(static_cast<double>(ops), exec_time / BENCHMARK_RUNS);
+  residual -= viennacl::linalg::prod(matrix, result);
+  std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
+  std::cout << "Estimated rel. residual: " << solver.error() << std::endl;
+  std::cout << "Iterations: " << solver.iters() << std::endl;
+  result -= ref_result;
+  std::cout << "Relative deviation from result: " << viennacl::linalg::norm_2(result) / viennacl::linalg::norm_2(ref_result) << std::endl;
+}
+
+
+template<typename ScalarType>
+int run_benchmark(viennacl::context ctx)
+{
+  Timer timer;
+  double exec_time;
+
+  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
+  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
+  viennacl::scalar<ScalarType> vcl_factor1(std_factor1, ctx);
+  viennacl::scalar<ScalarType> vcl_factor2(std_factor2, ctx);
+
+  ublas::vector<ScalarType> ublas_vec1;
+  ublas::vector<ScalarType> ublas_vec2;
+  ublas::vector<ScalarType> ublas_result;
+  unsigned int solver_iters = 100;
+  unsigned int solver_krylov_dim = 20;
+  double solver_tolerance = 1e-6;
+
+  if (!readVectorFromFile<ScalarType>("../examples/testdata/rhs65025.txt", ublas_vec1))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  std::cout << "done reading rhs" << std::endl;
+  ublas_vec2 = ublas_vec1;
+  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_result))
+  {
+    std::cout << "Error reading result file" << std::endl;
+    return 0;
+  }
+  std::cout << "done reading result" << std::endl;
+
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec1.size(), ublas_vec1.size(), ctx);
+  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix(ublas_vec1.size(), ublas_vec1.size(), ctx);
+  viennacl::ell_matrix<ScalarType> vcl_ell_matrix(ctx);
+  viennacl::hyb_matrix<ScalarType> vcl_hyb_matrix(ctx);
+
+  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size(), ctx);
+  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size(), ctx);
+  viennacl::vector<ScalarType> vcl_result(ublas_vec1.size(), ctx);
+
+
+  ublas::compressed_matrix<ScalarType> ublas_matrix;
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+  //cpu to gpu:
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+  viennacl::copy(ublas_matrix, vcl_ell_matrix);
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  viennacl::copy(ublas_vec2, vcl_vec2);
+  viennacl::copy(ublas_result, vcl_result);
+
+
+  std::cout << "------- Jacobi preconditioner ----------" << std::endl;
+  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
+  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi_csr(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
+  viennacl::linalg::jacobi_precond< viennacl::coordinate_matrix<ScalarType> > vcl_jacobi_coo(vcl_coordinate_matrix, viennacl::linalg::jacobi_tag());
+
+  std::cout << "------- Row-Scaling preconditioner ----------" << std::endl;
+  viennacl::linalg::row_scaling< ublas::compressed_matrix<ScalarType> >    ublas_row_scaling(ublas_matrix, viennacl::linalg::row_scaling_tag(1));
+  viennacl::linalg::row_scaling< viennacl::compressed_matrix<ScalarType> > vcl_row_scaling_csr(vcl_compressed_matrix, viennacl::linalg::row_scaling_tag(1));
+  viennacl::linalg::row_scaling< viennacl::coordinate_matrix<ScalarType> > vcl_row_scaling_coo(vcl_coordinate_matrix, viennacl::linalg::row_scaling_tag(1));
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////  Incomplete Cholesky preconditioner   //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  std::cout << "------- ICHOL0 on CPU (ublas) ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ichol0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ichol0(ublas_matrix, viennacl::linalg::ichol0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_ichol0.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas time: " << exec_time << std::endl;
+
+  std::cout << "------- ICHOL0 with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ichol0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ichol0(vcl_compressed_matrix, viennacl::linalg::ichol0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ichol0.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL time: " << exec_time << std::endl;
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////           ILU preconditioner         //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  std::cout << "------- ILU0 on with ublas ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_ilu0.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas ILU0 substitution time (no level scheduling): " << exec_time << std::endl;
+
+
+  std::cout << "------- ILU0 with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilu0.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILU0 substitution time (no level scheduling): " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilu0_tag ilu0_with_level_scheduling; ilu0_with_level_scheduling.use_level_scheduling(true);
+  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0_level_scheduling(vcl_compressed_matrix, ilu0_with_level_scheduling);
+  exec_time = timer.get();
+  std::cout << "Setup time (with level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilu0_level_scheduling.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILU0 substitution time (with level scheduling): " << exec_time << std::endl;
+
+
+
+  ////////////////////////////////////////////
+
+  std::cout << "------- Block-ILU0 with ublas ----------" << std::endl;
+
+  ublas_vec1 = ublas_vec2;
+  viennacl::copy(ublas_vec1, vcl_vec1);
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_block_ilu0.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas time: " << exec_time << std::endl;
+
+  std::cout << "------- Block-ILU0 with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  //vcl_block_ilu0.apply(vcl_vec1);  //warm-up
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_block_ilu0.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL time: " << exec_time << std::endl;
+
+  ////////////////////////////////////////////
+
+  std::cout << "------- ILUT with ublas ----------" << std::endl;
+
+  ublas_vec1 = ublas_vec2;
+  viennacl::copy(ublas_vec1, vcl_vec1);
+
+  timer.start();
+  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_ilut.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas ILUT substitution time (no level scheduling): " << exec_time << std::endl;
+
+
+  std::cout << "------- ILUT with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilut.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILUT substitution time (no level scheduling): " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilut_tag ilut_with_level_scheduling; ilut_with_level_scheduling.use_level_scheduling(true);
+  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut_level_scheduling(vcl_compressed_matrix, ilut_with_level_scheduling);
+  exec_time = timer.get();
+  std::cout << "Setup time (with level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilut_level_scheduling.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILUT substitution time (with level scheduling): " << exec_time << std::endl;
+
+
+  ////////////////////////////////////////////
+
+  std::cout << "------- Block-ILUT with ublas ----------" << std::endl;
+
+  ublas_vec1 = ublas_vec2;
+  viennacl::copy(ublas_vec1, vcl_vec1);
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilut_tag>          ublas_block_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  //ublas_block_ilut.apply(ublas_vec1);
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_block_ilut.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas time: " << exec_time << std::endl;
+
+  std::cout << "------- Block-ILUT with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilut_tag>          vcl_block_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  //vcl_block_ilut.apply(vcl_vec1);  //warm-up
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_block_ilut.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL time: " << exec_time << std::endl;
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////              CG solver                //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  long cg_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + 6 * ublas_vec2.size()));
+
+  viennacl::linalg::cg_tag cg_solver(solver_tolerance, solver_iters);
+
+  std::cout << "------- CG solver (no preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+#ifdef VIENNACL_WITH_OPENCL
+  if (sizeof(ScalarType) == sizeof(double))
+  {
+    std::cout << "------- CG solver, mixed precision (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+    viennacl::linalg::mixed_precision_cg_tag mixed_precision_cg_solver(solver_tolerance, solver_iters);
+
+    run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, mixed_precision_cg_solver, viennacl::linalg::no_precond(), cg_ops);
+    run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, mixed_precision_cg_solver, viennacl::linalg::no_precond(), cg_ops);
+  }
+#endif
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, ell_matrix ----------" << std::endl;
+  run_solver(vcl_ell_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, hyb_matrix ----------" << std::endl;
+  run_solver(vcl_hyb_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (ICHOL0 preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ichol0, cg_ops);
+
+  std::cout << "------- CG solver (ICHOL0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ichol0, cg_ops);
+
+
+  std::cout << "------- CG solver (ILU0 preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilu0, cg_ops);
+
+  std::cout << "------- CG solver (ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilu0, cg_ops);
+
+
+  std::cout << "------- CG solver (Block-ILU0 preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_block_ilu0, cg_ops);
+
+  std::cout << "------- CG solver (Block-ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_block_ilu0, cg_ops);
+
+  std::cout << "------- CG solver (ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilut, cg_ops);
+
+  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
+
+  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
+
+  std::cout << "------- CG solver (Block-ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_block_ilut, cg_ops);
+
+  std::cout << "------- CG solver (Block-ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_block_ilut, cg_ops);
+
+  std::cout << "------- CG solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_jacobi, cg_ops);
+
+  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi_csr, cg_ops);
+
+  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi_coo, cg_ops);
+
+
+  std::cout << "------- CG solver (row scaling preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_row_scaling, cg_ops);
+
+  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling_csr, cg_ops);
+
+  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling_coo, cg_ops);
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////           BiCGStab solver             //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+
+  long bicgstab_ops = static_cast<long>(solver_iters * (2 * ublas_matrix.nnz() + 13 * ublas_vec2.size()));
+
+  viennacl::linalg::bicgstab_tag bicgstab_solver(solver_tolerance, solver_iters);
+
+  std::cout << "------- BiCGStab solver (no preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+
+
+  std::cout << "------- BiCGStab solver (ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_ilut, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Block-ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_block_ilut, bicgstab_ops);
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << "------- BiCGStab solver (Block-ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_block_ilut, bicgstab_ops);
+#endif
+
+//  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_jacobi, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi_csr, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi_coo, bicgstab_ops);
+
+
+  std::cout << "------- BiCGStab solver (row scaling preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_row_scaling, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling_csr, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling_coo, bicgstab_ops);
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////            GMRES solver             ///////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+
+  long gmres_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + (solver_iters * 2 + 7) * ublas_vec2.size()));
+
+  viennacl::linalg::gmres_tag gmres_solver(solver_tolerance, solver_iters, solver_krylov_dim);
+
+  std::cout << "------- GMRES solver (no preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+
+  std::cout << "------- GMRES solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+
+  std::cout << "------- GMRES solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+
+
+  std::cout << "------- GMRES solver (ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_ilut, gmres_ops);
+
+  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
+
+  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
+
+
+  std::cout << "------- GMRES solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_jacobi, gmres_ops);
+
+  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi_csr, gmres_ops);
+
+  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi_coo, gmres_ops);
+
+
+  std::cout << "------- GMRES solver (row scaling preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_row_scaling, gmres_ops);
+
+  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling_csr, gmres_ops);
+
+  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling_coo, gmres_ops);
+
+  return EXIT_SUCCESS;
+}
+
+/** Entry point: sets up the compute context (two OpenCL contexts when
+  * available, falling back to a default host context otherwise) and runs the
+  * iterative-solver benchmark in single precision, then in double precision
+  * if the device supports it. */
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  // Query the first OpenCL platform and its devices.
+  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+  std::vector<viennacl::ocl::device> const & devices = pf.devices();
+
+  // Set first device to first context:
+  viennacl::ocl::setup_context(0, devices[0]);
+
+  // Set second device for second context (use the same device for the second context if only one device available):
+  if (devices.size() > 1)
+    viennacl::ocl::setup_context(1, devices[1]);
+  else
+    viennacl::ocl::setup_context(1, devices[0]);
+
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+  // The benchmark runs on context 1 (the second context set up above).
+  viennacl::context ctx(viennacl::ocl::get_context(1));
+#else
+  // No OpenCL: use the default (host/CUDA) context.
+  viennacl::context ctx;
+#endif
+
+  std::cout << "---------------------------------------------------------------------------" << std::endl;
+  std::cout << "---------------------------------------------------------------------------" << std::endl;
+  std::cout << " Benchmark for Execution Times of Iterative Solvers provided with ViennaCL " << std::endl;
+  std::cout << "---------------------------------------------------------------------------" << std::endl;
+  std::cout << " Note that the purpose of this benchmark is not to run solvers until" << std::endl;
+  std::cout << " convergence. Instead, only the execution times of a few iterations are" << std::endl;
+  std::cout << " recorded. Residual errors are only printed for information." << std::endl << std::endl;
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Solver" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>(ctx);
+  // Guard the double-precision run on device capability only when OpenCL is
+  // enabled; without OpenCL the #ifdef removes the condition and the braced
+  // block below always executes.
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>(ctx);
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/solver.cu b/examples/benchmarks/solver.cu
new file mode 100644
index 0000000..5a27a3c
--- /dev/null
+++ b/examples/benchmarks/solver.cu
@@ -0,0 +1,645 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:  Iterative solver tests (solver.cpp and solver.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+
+#define VIENNACL_WITH_UBLAS 1
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/context.hpp"
+
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/bicgstab.hpp"
+#include "viennacl/linalg/gmres.hpp"
+
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/ichol.hpp"
+#include "viennacl/linalg/jacobi_precond.hpp"
+#include "viennacl/linalg/row_scaling.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/mixed_precision_cg.hpp"
+#endif
+
+#include "viennacl/io/matrix_market.hpp"
+
+
+#include <cmath>      // std::fabs
+#include <cstddef>    // std::size_t
+#include <iostream>
+#include <vector>
+
+#include "benchmark-utils.hpp"
+#include "io.hpp"
+
+
+using namespace boost::numeric;
+
+#define BENCHMARK_RUNS          1
+
+
+/** Returns the relative element-wise maximum (infinity-norm) deviation
+  * between a host uBLAS vector @p v1 and a device ViennaCL vector @p v2.
+  * Elements where both entries are zero contribute zero. */
+template <typename ScalarType>
+ScalarType diff_inf(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   // Copy the device vector to the host so both operands can be compared.
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   // std::size_t avoids truncation for large vectors; std::fabs picks the
+   // ScalarType overload (no silent float->double promotion, unlike ::fabs).
+   for (std::size_t i = 0; i < v1.size(); ++i)
+   {
+      ScalarType scale = std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );  // hoisted: computed once per element
+      if ( scale > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / scale;
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+/** Returns the relative Euclidean (2-norm) distance between a host uBLAS
+  * vector @p v1 and a device ViennaCL vector @p v2, scaled by norm_2(v1). */
+template <typename ScalarType>
+ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   // Bring the ViennaCL vector back to the host for comparison.
+   ublas::vector<ScalarType> host_copy(v2.size());
+   viennacl::copy(v2.begin(), v2.end(), host_copy.begin());
+
+   ublas::vector<ScalarType> difference = v1 - host_copy;
+   return norm_2(difference) / norm_2(v1);
+}
+
+
+/** Benchmarks one (matrix, solver, preconditioner) combination: times
+  * BENCHMARK_RUNS calls of viennacl::linalg::solve(), then prints the
+  * execution time, estimated op throughput, the true relative residual,
+  * the solver's own error estimate, the iteration count, and the relative
+  * deviation from @p ref_result.
+  * @param ops  estimated floating-point operations per solve, used only for
+  *             the throughput printout. */
+template <typename MatrixType, typename VectorType, typename SolverTag, typename PrecondTag>
+void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType const & ref_result, SolverTag const & solver, PrecondTag const & precond, long ops)
+{
+  Timer timer;
+  VectorType result(rhs);
+  VectorType residual(rhs);
+  // Drain any pending asynchronous device work so it is not charged to the timer.
+  viennacl::backend::finish();
+
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    result = viennacl::linalg::solve(matrix, rhs, solver, precond);
+  }
+  // Fence again: stop the clock only after all device work has completed.
+  viennacl::backend::finish();
+  double exec_time = timer.get();
+  std::cout << "Exec. time: " << exec_time << std::endl;
+  std::cout << "Est. "; printOps(static_cast<double>(ops), exec_time / BENCHMARK_RUNS);
+  // residual was initialized to rhs, so this computes rhs - A * result.
+  residual -= viennacl::linalg::prod(matrix, result);
+  std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
+  std::cout << "Estimated rel. residual: " << solver.error() << std::endl;
+  std::cout << "Iterations: " << solver.iters() << std::endl;
+  result -= ref_result;
+  std::cout << "Relative deviation from result: " << viennacl::linalg::norm_2(result) / viennacl::linalg::norm_2(ref_result) << std::endl;
+}
+
+
+template<typename ScalarType>
+int run_benchmark(viennacl::context ctx)
+{
+  Timer timer;
+  double exec_time;
+
+  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
+  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
+  viennacl::scalar<ScalarType> vcl_factor1(std_factor1, ctx);
+  viennacl::scalar<ScalarType> vcl_factor2(std_factor2, ctx);
+
+  ublas::vector<ScalarType> ublas_vec1;
+  ublas::vector<ScalarType> ublas_vec2;
+  ublas::vector<ScalarType> ublas_result;
+  unsigned int solver_iters = 100;
+  unsigned int solver_krylov_dim = 20;
+  double solver_tolerance = 1e-6;
+
+  if (!readVectorFromFile<ScalarType>("../examples/testdata/rhs65025.txt", ublas_vec1))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  std::cout << "done reading rhs" << std::endl;
+  ublas_vec2 = ublas_vec1;
+  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_result))
+  {
+    std::cout << "Error reading result file" << std::endl;
+    return 0;
+  }
+  std::cout << "done reading result" << std::endl;
+
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec1.size(), ublas_vec1.size(), ctx);
+  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix(ublas_vec1.size(), ublas_vec1.size(), ctx);
+  viennacl::ell_matrix<ScalarType> vcl_ell_matrix(ctx);
+  viennacl::hyb_matrix<ScalarType> vcl_hyb_matrix(ctx);
+
+  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size(), ctx);
+  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size(), ctx);
+  viennacl::vector<ScalarType> vcl_result(ublas_vec1.size(), ctx);
+
+
+  ublas::compressed_matrix<ScalarType> ublas_matrix;
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+  //cpu to gpu:
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+  viennacl::copy(ublas_matrix, vcl_ell_matrix);
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  viennacl::copy(ublas_vec2, vcl_vec2);
+  viennacl::copy(ublas_result, vcl_result);
+
+
+  std::cout << "------- Jacobi preconditioner ----------" << std::endl;
+  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
+  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi_csr(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
+  viennacl::linalg::jacobi_precond< viennacl::coordinate_matrix<ScalarType> > vcl_jacobi_coo(vcl_coordinate_matrix, viennacl::linalg::jacobi_tag());
+
+  std::cout << "------- Row-Scaling preconditioner ----------" << std::endl;
+  viennacl::linalg::row_scaling< ublas::compressed_matrix<ScalarType> >    ublas_row_scaling(ublas_matrix, viennacl::linalg::row_scaling_tag(1));
+  viennacl::linalg::row_scaling< viennacl::compressed_matrix<ScalarType> > vcl_row_scaling_csr(vcl_compressed_matrix, viennacl::linalg::row_scaling_tag(1));
+  viennacl::linalg::row_scaling< viennacl::coordinate_matrix<ScalarType> > vcl_row_scaling_coo(vcl_coordinate_matrix, viennacl::linalg::row_scaling_tag(1));
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////  Incomplete Cholesky preconditioner   //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  std::cout << "------- ICHOL0 on CPU (ublas) ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ichol0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ichol0(ublas_matrix, viennacl::linalg::ichol0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_ichol0.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas time: " << exec_time << std::endl;
+
+  std::cout << "------- ICHOL0 with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ichol0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ichol0(vcl_compressed_matrix, viennacl::linalg::ichol0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ichol0.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL time: " << exec_time << std::endl;
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////           ILU preconditioner         //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  std::cout << "------- ILU0 on with ublas ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_ilu0.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas ILU0 substitution time (no level scheduling): " << exec_time << std::endl;
+
+
+  std::cout << "------- ILU0 with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilu0.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILU0 substitution time (no level scheduling): " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilu0_tag ilu0_with_level_scheduling; ilu0_with_level_scheduling.use_level_scheduling(true);
+  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0_level_scheduling(vcl_compressed_matrix, ilu0_with_level_scheduling);
+  exec_time = timer.get();
+  std::cout << "Setup time (with level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilu0_level_scheduling.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILU0 substitution time (with level scheduling): " << exec_time << std::endl;
+
+
+
+  ////////////////////////////////////////////
+
+  std::cout << "------- Block-ILU0 with ublas ----------" << std::endl;
+
+  ublas_vec1 = ublas_vec2;
+  viennacl::copy(ublas_vec1, vcl_vec1);
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_block_ilu0.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas time: " << exec_time << std::endl;
+
+  std::cout << "------- Block-ILU0 with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  //vcl_block_ilu0.apply(vcl_vec1);  //warm-up
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_block_ilu0.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL time: " << exec_time << std::endl;
+
+  ////////////////////////////////////////////
+
+  std::cout << "------- ILUT with ublas ----------" << std::endl;
+
+  ublas_vec1 = ublas_vec2;
+  viennacl::copy(ublas_vec1, vcl_vec1);
+
+  timer.start();
+  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_ilut.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas ILUT substitution time (no level scheduling): " << exec_time << std::endl;
+
+
+  std::cout << "------- ILUT with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time (no level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilut.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILUT substitution time (no level scheduling): " << exec_time << std::endl;
+
+  timer.start();
+  viennacl::linalg::ilut_tag ilut_with_level_scheduling; ilut_with_level_scheduling.use_level_scheduling(true);
+  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut_level_scheduling(vcl_compressed_matrix, ilut_with_level_scheduling);
+  exec_time = timer.get();
+  std::cout << "Setup time (with level scheduling): " << exec_time << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_ilut_level_scheduling.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL ILUT substitution time (with level scheduling): " << exec_time << std::endl;
+
+
+  ////////////////////////////////////////////
+
+  std::cout << "------- Block-ILUT with ublas ----------" << std::endl;
+
+  ublas_vec1 = ublas_vec2;
+  viennacl::copy(ublas_vec1, vcl_vec1);
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilut_tag>          ublas_block_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  //ublas_block_ilut.apply(ublas_vec1);
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    ublas_block_ilut.apply(ublas_vec1);
+  exec_time = timer.get();
+  std::cout << "ublas time: " << exec_time << std::endl;
+
+  std::cout << "------- Block-ILUT with ViennaCL ----------" << std::endl;
+
+  timer.start();
+  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilut_tag>          vcl_block_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+  exec_time = timer.get();
+  std::cout << "Setup time: " << exec_time << std::endl;
+
+  //vcl_block_ilut.apply(vcl_vec1);  //warm-up
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+    vcl_block_ilut.apply(vcl_vec1);
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "ViennaCL time: " << exec_time << std::endl;
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////              CG solver                //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  long cg_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + 6 * ublas_vec2.size()));
+
+  viennacl::linalg::cg_tag cg_solver(solver_tolerance, solver_iters);
+
+  std::cout << "------- CG solver (no preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+#ifdef VIENNACL_WITH_OPENCL
+  if (sizeof(ScalarType) == sizeof(double))
+  {
+    std::cout << "------- CG solver, mixed precision (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+    viennacl::linalg::mixed_precision_cg_tag mixed_precision_cg_solver(solver_tolerance, solver_iters);
+
+    run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, mixed_precision_cg_solver, viennacl::linalg::no_precond(), cg_ops);
+    run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, mixed_precision_cg_solver, viennacl::linalg::no_precond(), cg_ops);
+  }
+#endif
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, ell_matrix ----------" << std::endl;
+  run_solver(vcl_ell_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (no preconditioner) via ViennaCL, hyb_matrix ----------" << std::endl;
+  run_solver(vcl_hyb_matrix, vcl_vec2, vcl_result, cg_solver, viennacl::linalg::no_precond(), cg_ops);
+
+  std::cout << "------- CG solver (ICHOL0 preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ichol0, cg_ops);
+
+  std::cout << "------- CG solver (ICHOL0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ichol0, cg_ops);
+
+
+  std::cout << "------- CG solver (ILU0 preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilu0, cg_ops);
+
+  std::cout << "------- CG solver (ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilu0, cg_ops);
+
+
+  std::cout << "------- CG solver (Block-ILU0 preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_block_ilu0, cg_ops);
+
+  std::cout << "------- CG solver (Block-ILU0 preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_block_ilu0, cg_ops);
+
+  std::cout << "------- CG solver (ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_ilut, cg_ops);
+
+  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
+
+  std::cout << "------- CG solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_ilut, cg_ops);
+
+  std::cout << "------- CG solver (Block-ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_block_ilut, cg_ops);
+
+  std::cout << "------- CG solver (Block-ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_block_ilut, cg_ops);
+
+  std::cout << "------- CG solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_jacobi, cg_ops);
+
+  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi_csr, cg_ops);
+
+  std::cout << "------- CG solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_jacobi_coo, cg_ops);
+
+
+  std::cout << "------- CG solver (row scaling preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, cg_solver, ublas_row_scaling, cg_ops);
+
+  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling_csr, cg_ops);
+
+  std::cout << "------- CG solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, cg_solver, vcl_row_scaling_coo, cg_ops);
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////           BiCGStab solver             //////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+
+  long bicgstab_ops = static_cast<long>(solver_iters * (2 * ublas_matrix.nnz() + 13 * ublas_vec2.size()));
+
+  viennacl::linalg::bicgstab_tag bicgstab_solver(solver_tolerance, solver_iters);
+
+  std::cout << "------- BiCGStab solver (no preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, viennacl::linalg::no_precond(), bicgstab_ops);
+
+
+  std::cout << "------- BiCGStab solver (ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_ilut, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Block-ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_block_ilut, bicgstab_ops);
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << "------- BiCGStab solver (Block-ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_block_ilut, bicgstab_ops);
+#endif
+
+//  std::cout << "------- BiCGStab solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+//  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_ilut, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_jacobi, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi_csr, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_jacobi_coo, bicgstab_ops);
+
+
+  std::cout << "------- BiCGStab solver (row scaling preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, bicgstab_solver, ublas_row_scaling, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling_csr, bicgstab_ops);
+
+  std::cout << "------- BiCGStab solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, bicgstab_solver, vcl_row_scaling_coo, bicgstab_ops);
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////            GMRES solver             ///////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+
+  long gmres_ops = static_cast<long>(solver_iters * (ublas_matrix.nnz() + (solver_iters * 2 + 7) * ublas_vec2.size()));
+
+  viennacl::linalg::gmres_tag gmres_solver(solver_tolerance, solver_iters, solver_krylov_dim);
+
+  std::cout << "------- GMRES solver (no preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+
+  std::cout << "------- GMRES solver (no preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+
+  std::cout << "------- GMRES solver (no preconditioner) on GPU, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, viennacl::linalg::no_precond(), gmres_ops);
+
+
+  std::cout << "------- GMRES solver (ILUT preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_ilut, gmres_ops);
+
+  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
+
+  std::cout << "------- GMRES solver (ILUT preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_ilut, gmres_ops);
+
+
+  std::cout << "------- GMRES solver (Jacobi preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_jacobi, gmres_ops);
+
+  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi_csr, gmres_ops);
+
+  std::cout << "------- GMRES solver (Jacobi preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_jacobi_coo, gmres_ops);
+
+
+  std::cout << "------- GMRES solver (row scaling preconditioner) using ublas ----------" << std::endl;
+  run_solver(ublas_matrix, ublas_vec2, ublas_result, gmres_solver, ublas_row_scaling, gmres_ops);
+
+  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, compressed_matrix ----------" << std::endl;
+  run_solver(vcl_compressed_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling_csr, gmres_ops);
+
+  std::cout << "------- GMRES solver (row scaling preconditioner) via ViennaCL, coordinate_matrix ----------" << std::endl;
+  run_solver(vcl_coordinate_matrix, vcl_vec2, vcl_result, gmres_solver, vcl_row_scaling_coo, gmres_ops);
+
+  return EXIT_SUCCESS;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+  std::vector<viennacl::ocl::device> const & devices = pf.devices();
+
+  // Set first device to first context:
+  viennacl::ocl::setup_context(0, devices[0]);
+
+  // Set second device for second context (use the same device for the second context if only one device available):
+  if (devices.size() > 1)
+    viennacl::ocl::setup_context(1, devices[1]);
+  else
+    viennacl::ocl::setup_context(1, devices[0]);
+
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+  viennacl::context ctx(viennacl::ocl::get_context(1));
+#else
+  viennacl::context ctx;
+#endif
+
+  std::cout << "---------------------------------------------------------------------------" << std::endl;
+  std::cout << "---------------------------------------------------------------------------" << std::endl;
+  std::cout << " Benchmark for Execution Times of Iterative Solvers provided with ViennaCL " << std::endl;
+  std::cout << "---------------------------------------------------------------------------" << std::endl;
+  std::cout << " Note that the purpose of this benchmark is not to run solvers until" << std::endl;
+  std::cout << " convergence. Instead, only the execution times of a few iterations are" << std::endl;
+  std::cout << " recorded. Residual errors are only printed for information." << std::endl << std::endl;
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Solver" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>(ctx);
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>(ctx);
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/sparse.cpp b/examples/benchmarks/sparse.cpp
index 471a06f..ce03884 100644
--- a/examples/benchmarks/sparse.cpp
+++ b/examples/benchmarks/sparse.cpp
@@ -1,235 +1,322 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//#define VIENNACL_BUILD_INFO
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#define VIENNACL_HAVE_UBLAS 1
-
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-#include "io.hpp"
-
-
-/*
-*   Benchmark 1:
-*   Vector tests
-*   
-*/
-
-#define BENCHMARK_RUNS          10
-
-
-template<typename ScalarType>
-int run_benchmark()
-{   
-   Timer timer;
-   double exec_time;
-   
-   ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = ScalarType(3.1415);
-  ScalarType std_factor2 = ScalarType(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
-  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_vec1))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading rhs" << std::endl;
-  ublas_vec2 = ublas_vec1;
-  
-  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
-  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
-  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
-  
-  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //unsigned int cg_mat_size = cg_mat.size(); 
-  std::cout << "done reading matrix" << std::endl;
-  
-  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
-  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
-  viennacl::vector<ScalarType> vcl_vec3(ublas_vec1.size()); 
-  
-  //cpu to gpu:
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
-  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
-  #endif
-  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
-  viennacl::copy(ublas_vec1, vcl_vec1);
-  viennacl::copy(ublas_vec2, vcl_vec2);
-
-  
-  ///////////// Matrix operations /////////////////
-  
-  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    ublas_vec1 = prod(ublas_matrix, ublas_vec2);
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << ublas_vec1[0] << std::endl;
-  
-  
-  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
-  
-  
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
-  std_result = 0.0;
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align1: " << exec_time << std::endl;
-  std::cout << "GPU align1 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align4: " << exec_time << std::endl;
-  std::cout << "GPU align4 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align8: " << exec_time << std::endl;
-  std::cout << "GPU align8 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-  
-  // vector addition
-  
-  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
-  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  
-  viennacl::copy(vcl_vec1, ublas_vec2);  
-  long err_cnt = 0;
-  for (size_t i=0; i<ublas_vec1.size(); ++i)
-  {
-    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
-    {
-      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
-      ++err_cnt;
-      if (err_cnt > 5)
-        break;
-    }
-  }
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-  
-   return 0;
-}
-
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Sparse" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*   Benchmark:  Sparse matrix operations, i.e. matrix-vector products (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//#define VIENNACL_BUILD_INFO
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#define VIENNACL_WITH_UBLAS 1
+
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/linalg/ilu.hpp"
+
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+#include "io.hpp"
+
+
+#define BENCHMARK_RUNS          10
+
+
+template<typename ScalarType>
+int run_benchmark()
+{
+   Timer timer;
+   double exec_time;
+
+   //ScalarType std_result = 0;
+
+  ScalarType std_factor1 = ScalarType(3.1415);
+  ScalarType std_factor2 = ScalarType(42.0);
+  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
+  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
+
+  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
+  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
+
+  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  std::cout << "done reading rhs" << std::endl;
+  ublas_vec2 = ublas_vec1;
+
+  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
+  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
+  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
+
+  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
+
+  viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
+  viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
+
+  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
+  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
+
+  //cpu to gpu:
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
+  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
+  #endif
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
+  viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  viennacl::copy(ublas_vec2, vcl_vec2);
+
+
+  ///////////// Matrix operations /////////////////
+
+  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
+    boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << ublas_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
+
+
+  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
+  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
+  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
+  //std_result = 0.0;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time align1: " << exec_time << std::endl;
+  std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+  std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
+
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  std::cout << "ublas..." << std::endl;
+  timer.start();
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
+  std::cout << "Time elapsed: " << timer.get() << std::endl;
+  std::cout << "ViennaCL..." << std::endl;
+  viennacl::backend::finish();
+  timer.start();
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
+  viennacl::backend::finish();
+  std::cout << "Time elapsed: " << timer.get() << std::endl;
+
+  ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time align4: " << exec_time << std::endl;
+  std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time align8: " << exec_time << std::endl;
+  std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
+  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+
+  viennacl::copy(vcl_vec1, ublas_vec2);
+  long err_cnt = 0;
+  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+  {
+    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+    {
+      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+      ++err_cnt;
+      if (err_cnt > 5)
+        break;
+    }
+  }
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
+  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+
+  viennacl::copy(vcl_vec1, ublas_vec2);
+  err_cnt = 0;
+  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+  {
+    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+    {
+      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+      ++err_cnt;
+      if (err_cnt > 5)
+        break;
+    }
+  }
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
+  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+
+  viennacl::copy(vcl_vec1, ublas_vec2);
+  err_cnt = 0;
+  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+  {
+    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+    {
+      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+      ++err_cnt;
+      if (err_cnt > 5)
+        break;
+    }
+  }
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Sparse" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/sparse.cpp b/examples/benchmarks/sparse.cu
similarity index 54%
copy from examples/benchmarks/sparse.cpp
copy to examples/benchmarks/sparse.cu
index 471a06f..ce03884 100644
--- a/examples/benchmarks/sparse.cpp
+++ b/examples/benchmarks/sparse.cu
@@ -1,235 +1,322 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//#define VIENNACL_BUILD_INFO
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#define VIENNACL_HAVE_UBLAS 1
-
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-#include "io.hpp"
-
-
-/*
-*   Benchmark 1:
-*   Vector tests
-*   
-*/
-
-#define BENCHMARK_RUNS          10
-
-
-template<typename ScalarType>
-int run_benchmark()
-{   
-   Timer timer;
-   double exec_time;
-   
-   ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = ScalarType(3.1415);
-  ScalarType std_factor2 = ScalarType(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
-  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile<ScalarType>("../../examples/testdata/result65025.txt", ublas_vec1))
-  #else
-  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  std::cout << "done reading rhs" << std::endl;
-  ublas_vec2 = ublas_vec1;
-  
-  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
-  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
-  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
-  
-  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //unsigned int cg_mat_size = cg_mat.size(); 
-  std::cout << "done reading matrix" << std::endl;
-  
-  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
-  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size()); 
-  viennacl::vector<ScalarType> vcl_vec3(ublas_vec1.size()); 
-  
-  //cpu to gpu:
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
-  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
-  #endif
-  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
-  viennacl::copy(ublas_vec1, vcl_vec1);
-  viennacl::copy(ublas_vec2, vcl_vec2);
-
-  
-  ///////////// Matrix operations /////////////////
-  
-  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    ublas_vec1 = prod(ublas_matrix, ublas_vec2);
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << ublas_vec1[0] << std::endl;
-  
-  
-  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
-  
-  
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
-  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
-  std_result = 0.0;
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align1: " << exec_time << std::endl;
-  std::cout << "GPU align1 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align4: " << exec_time << std::endl;
-  std::cout << "GPU align4 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time align8: " << exec_time << std::endl;
-  std::cout << "GPU align8 "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-  
-  // vector addition
-  
-  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
-  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  
-  viennacl::copy(vcl_vec1, ublas_vec2);  
-  long err_cnt = 0;
-  for (size_t i=0; i<ublas_vec1.size(); ++i)
-  {
-    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
-    {
-      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
-      ++err_cnt;
-      if (err_cnt > 5)
-        break;
-    }
-  }
-  
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << vcl_vec1[0] << std::endl;
-  
-   return 0;
-}
-
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Sparse" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*   Benchmark:  Sparse matrix operations, i.e. matrix-vector products (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//#define VIENNACL_BUILD_INFO
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#define VIENNACL_WITH_UBLAS 1
+
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/linalg/ilu.hpp"
+
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+#include "io.hpp"
+
+
+#define BENCHMARK_RUNS          10
+
+
+template<typename ScalarType>
+int run_benchmark()
+{
+   Timer timer;
+   double exec_time;
+
+   //ScalarType std_result = 0;
+
+  ScalarType std_factor1 = ScalarType(3.1415);
+  ScalarType std_factor2 = ScalarType(42.0);
+  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
+  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
+
+  boost::numeric::ublas::vector<ScalarType> ublas_vec1;
+  boost::numeric::ublas::vector<ScalarType> ublas_vec2;
+
+  if (!readVectorFromFile<ScalarType>("../examples/testdata/result65025.txt", ublas_vec1))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  std::cout << "done reading rhs" << std::endl;
+  ublas_vec2 = ublas_vec1;
+
+  viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
+  viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
+  viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
+
+  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
+
+  viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
+  viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
+
+  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+  viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
+  viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
+
+  //cpu to gpu:
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
+  #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
+  #endif
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
+  viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  viennacl::copy(ublas_vec2, vcl_vec2);
+
+
+  ///////////// Matrix operations /////////////////
+
+  std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
+    boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << ublas_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
+
+
+  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
+  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
+  vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
+  //std_result = 0.0;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time align1: " << exec_time << std::endl;
+  std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+  std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
+
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
+  viennacl::copy(ublas_vec1, vcl_vec1);
+  std::cout << "ublas..." << std::endl;
+  timer.start();
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
+  std::cout << "Time elapsed: " << timer.get() << std::endl;
+  std::cout << "ViennaCL..." << std::endl;
+  viennacl::backend::finish();
+  timer.start();
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
+  viennacl::backend::finish();
+  std::cout << "Time elapsed: " << timer.get() << std::endl;
+
+  ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time align4: " << exec_time << std::endl;
+  std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time align8: " << exec_time << std::endl;
+  std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
+  vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+
+  viennacl::copy(vcl_vec1, ublas_vec2);
+  long err_cnt = 0;
+  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+  {
+    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+    {
+      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+      ++err_cnt;
+      if (err_cnt > 5)
+        break;
+    }
+  }
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
+  vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+
+  viennacl::copy(vcl_vec1, ublas_vec2);
+  err_cnt = 0;
+  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+  {
+    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+    {
+      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+      ++err_cnt;
+      if (err_cnt > 5)
+        break;
+    }
+  }
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
+  vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+
+  viennacl::copy(vcl_vec1, ublas_vec2);
+  err_cnt = 0;
+  for (std::size_t i=0; i<ublas_vec1.size(); ++i)
+  {
+    if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
+    {
+      std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
+      ++err_cnt;
+      if (err_cnt > 5)
+        break;
+    }
+  }
+
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << vcl_vec1[0] << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Sparse" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/vector.cpp b/examples/benchmarks/vector.cpp
index 45766e1..b13c022 100644
--- a/examples/benchmarks/vector.cpp
+++ b/examples/benchmarks/vector.cpp
@@ -1,241 +1,281 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//#define VIENNACL_BUILD_INFO
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-
-using std::cout;
-using std::cin;
-using std::endl;
-
-
-/*
-*   Benchmark 1:
-*   Vector tests
-*   
-*/
-
-#define BENCHMARK_VECTOR_SIZE   3000000
-#define BENCHMARK_RUNS          10
-
-
-template<typename ScalarType>
-int run_benchmark()
-{
-   
-   Timer timer;
-   double exec_time;
-   
-  ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
-  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
-  std::vector<ScalarType> std_vec3(BENCHMARK_VECTOR_SIZE);
-  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
-  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE); 
-  viennacl::vector<ScalarType> vcl_vec3(BENCHMARK_VECTOR_SIZE); 
-
-  
-  ///////////// Vector operations /////////////////
-  
-  std_vec1[0] = 1.0;
-  std_vec2[0] = 1.0;
-  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
-  {
-    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
-    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
-  }
-
-  viennacl::copy(std_vec1, vcl_vec1);
-  viennacl::fast_copy(std_vec1, vcl_vec1);
-  viennacl::copy(std_vec2, vcl_vec2);
-  
-  viennacl::swap(vcl_vec1, vcl_vec2);
-  //check that vcl_vec1 is now equal to std_vec2:
-  viennacl::fast_copy(vcl_vec1, std_vec3);
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    if (std_vec3[i] != std_vec2[i])
-      std::cout << "ERROR in swap(): Failed at entry " << i << std::endl;
-  
-  viennacl::fast_swap(vcl_vec1, vcl_vec2);
-  //check that vcl_vec1 is now equal to std_vec1 again:
-  viennacl::copy(vcl_vec1, std_vec3);
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    if (std_vec3[i] != std_vec1[i])
-      std::cout << "ERROR in fast_swap(): Failed at entry " << i << std::endl;
-  
-  
-  // inner product
-  viennacl::ocl::get_queue().finish();
-  std::cout << "------- Vector inner products ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    std_result = 0;
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_result += std_vec1[i] * std_vec2[i];
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << "Result:" << std_result << std::endl;
-  
-  
-  std_result = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2); //startup calculation
-  std_result = 0.0;
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_factor2 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << "Result: " << vcl_factor2 << std::endl;
-  
-  // vector addition
-  
-  std::cout << "------- Vector addition ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_vec3[i] = std_vec1[i] + std_vec2[i];
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  
-  vcl_vec3 = vcl_vec1 + vcl_vec2; //startup calculation
-  viennacl::ocl::get_queue().finish();
-  std_result = 0.0;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec3 = vcl_vec1 + vcl_vec2;
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  
-  
-  
-  // multiply add:
-  std::cout << "------- Vector multiply add ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_vec1[i] += std_factor1 * std_vec2[i];
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  
-  vcl_vec1 += vcl_factor1 * vcl_vec2; //startup calculation
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 += vcl_factor1 * vcl_vec2;
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
- 
-  
-  
-  //complicated vector addition:
-  std::cout << "------- Vector complicated expression ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_vec3[i] += std_vec2[i] / std_factor1 + std_factor2 * (std_vec1[i] - std_factor1 * std_vec2[i]);
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(3 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(3 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Vector" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:   Vector operations (vector.cpp and vector.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+//#define VIENNACL_DEBUG_ALL
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+
+using std::cout;
+using std::cin;
+using std::endl;
+
+
+#define BENCHMARK_VECTOR_SIZE   3000000
+#define BENCHMARK_RUNS          10
+
+
+template<typename ScalarType>
+int run_benchmark()
+{
+
+   Timer timer;
+   double exec_time;
+
+  ScalarType std_result = 0;
+
+  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
+  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
+  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
+  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
+
+  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec3(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec3(BENCHMARK_VECTOR_SIZE);
+
+
+  ///////////// Vector operations /////////////////
+
+  std_vec1[0] = 1.0;
+  std_vec2[0] = 1.0;
+  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
+  {
+    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
+    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
+  }
+
+  viennacl::copy(std_vec1, vcl_vec1);
+  viennacl::fast_copy(std_vec1, vcl_vec1);
+  viennacl::copy(std_vec2, vcl_vec2);
+
+  viennacl::swap(vcl_vec1, vcl_vec2);
+  //check that vcl_vec1 is now equal to std_vec2:
+  viennacl::fast_copy(vcl_vec1, std_vec3);
+  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    if (std_vec3[i] != std_vec2[i])
+      std::cout << "ERROR in swap(): Failed at entry " << i << std::endl;
+
+  viennacl::fast_swap(vcl_vec1, vcl_vec2);
+  //check that vcl_vec1 is now equal to std_vec1 again:
+  viennacl::copy(vcl_vec1, std_vec3);
+  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    if (std_vec3[i] != std_vec1[i])
+      std::cout << "ERROR in fast_swap(): Failed at entry " << i << std::endl;
+
+
+  // inner product
+  viennacl::backend::finish();
+  std::cout << "------- Vector inner products ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    std_result = 0;
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_result += std_vec1[i] * std_vec2[i];
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result:" << std_result << std::endl;
+
+
+  std_result = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2); //startup calculation
+  std_result = 0.0;
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_factor2 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result: " << vcl_factor2 << std::endl;
+
+  // inner product
+  viennacl::backend::finish();
+  std::cout << "------- Vector norm_2 ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    std_result = 0;
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    {
+      ScalarType entry = std_vec1[i];
+      std_result += entry * entry;
+    }
+  }
+  std_result = std::sqrt(std_result);
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result:" << std_result << std::endl;
+
+
+  std_result = viennacl::linalg::norm_2(vcl_vec1); //startup calculation
+  std_result = 0.0;
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_factor2 = viennacl::linalg::norm_2(vcl_vec1);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result: " << vcl_factor2 << std::endl;
+
+  // vector addition
+
+  std::cout << "------- Vector addition ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_vec3[i] = std_vec1[i] + std_vec2[i];
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+  vcl_vec3 = vcl_vec1 + vcl_vec2; //startup calculation
+  viennacl::backend::finish();
+  std_result = 0.0;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec3 = vcl_vec1 + vcl_vec2;
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+
+
+  // multiply add:
+  std::cout << "------- Vector multiply add ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_vec1[i] += std_factor1 * std_vec2[i];
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+  vcl_vec1 += vcl_factor1 * vcl_vec2; //startup calculation
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 += vcl_factor1 * vcl_vec2;
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+
+
+  //complicated vector addition:
+  std::cout << "------- Vector complicated expression ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_vec3[i] += std_vec2[i] / std_factor1 + std_factor2 * (std_vec1[i] - std_factor1 * std_vec2[i]);
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(6.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+  vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(6.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+  return 0;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
+
diff --git a/examples/benchmarks/vector.cpp b/examples/benchmarks/vector.cu
similarity index 66%
copy from examples/benchmarks/vector.cpp
copy to examples/benchmarks/vector.cu
index 45766e1..b13c022 100644
--- a/examples/benchmarks/vector.cpp
+++ b/examples/benchmarks/vector.cu
@@ -1,241 +1,281 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//#define VIENNACL_BUILD_INFO
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-
-#include <iostream>
-#include <vector>
-#include "benchmark-utils.hpp"
-
-using std::cout;
-using std::cin;
-using std::endl;
-
-
-/*
-*   Benchmark 1:
-*   Vector tests
-*   
-*/
-
-#define BENCHMARK_VECTOR_SIZE   3000000
-#define BENCHMARK_RUNS          10
-
-
-template<typename ScalarType>
-int run_benchmark()
-{
-   
-   Timer timer;
-   double exec_time;
-   
-  ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
-  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
-  std::vector<ScalarType> std_vec3(BENCHMARK_VECTOR_SIZE);
-  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
-  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE); 
-  viennacl::vector<ScalarType> vcl_vec3(BENCHMARK_VECTOR_SIZE); 
-
-  
-  ///////////// Vector operations /////////////////
-  
-  std_vec1[0] = 1.0;
-  std_vec2[0] = 1.0;
-  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
-  {
-    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
-    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
-  }
-
-  viennacl::copy(std_vec1, vcl_vec1);
-  viennacl::fast_copy(std_vec1, vcl_vec1);
-  viennacl::copy(std_vec2, vcl_vec2);
-  
-  viennacl::swap(vcl_vec1, vcl_vec2);
-  //check that vcl_vec1 is now equal to std_vec2:
-  viennacl::fast_copy(vcl_vec1, std_vec3);
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    if (std_vec3[i] != std_vec2[i])
-      std::cout << "ERROR in swap(): Failed at entry " << i << std::endl;
-  
-  viennacl::fast_swap(vcl_vec1, vcl_vec2);
-  //check that vcl_vec1 is now equal to std_vec1 again:
-  viennacl::copy(vcl_vec1, std_vec3);
-  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-    if (std_vec3[i] != std_vec1[i])
-      std::cout << "ERROR in fast_swap(): Failed at entry " << i << std::endl;
-  
-  
-  // inner product
-  viennacl::ocl::get_queue().finish();
-  std::cout << "------- Vector inner products ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    std_result = 0;
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_result += std_vec1[i] * std_vec2[i];
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << "Result:" << std_result << std::endl;
-  
-  
-  std_result = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2); //startup calculation
-  std_result = 0.0;
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_factor2 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  std::cout << "Result: " << vcl_factor2 << std::endl;
-  
-  // vector addition
-  
-  std::cout << "------- Vector addition ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_vec3[i] = std_vec1[i] + std_vec2[i];
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  
-  vcl_vec3 = vcl_vec1 + vcl_vec2; //startup calculation
-  viennacl::ocl::get_queue().finish();
-  std_result = 0.0;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec3 = vcl_vec1 + vcl_vec2;
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  
-  
-  
-  // multiply add:
-  std::cout << "------- Vector multiply add ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_vec1[i] += std_factor1 * std_vec2[i];
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  
-  vcl_vec1 += vcl_factor1 * vcl_vec2; //startup calculation
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec1 += vcl_factor1 * vcl_vec2;
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
- 
-  
-  
-  //complicated vector addition:
-  std::cout << "------- Vector complicated expression ----------" << std::endl;
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
-      std_vec3[i] += std_vec2[i] / std_factor1 + std_factor2 * (std_vec1[i] - std_factor1 * std_vec2[i]);
-  }
-  exec_time = timer.get();
-  std::cout << "CPU time: " << exec_time << std::endl;
-  std::cout << "CPU "; printOps(3 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2); //startup calculation
-  viennacl::ocl::get_queue().finish();
-  timer.start();
-  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
-  {
-    vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2);
-  }
-  viennacl::ocl::get_queue().finish();
-  exec_time = timer.get();
-  std::cout << "GPU time: " << exec_time << std::endl;
-  std::cout << "GPU "; printOps(3 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Vector" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  run_benchmark<float>();
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    run_benchmark<double>();
-  }
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Benchmark:   Vector operations (vector.cpp and vector.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+//#define VIENNACL_DEBUG_ALL
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+#include <iostream>
+#include <vector>
+#include "benchmark-utils.hpp"
+
+using std::cout;
+using std::cin;
+using std::endl;
+
+
+#define BENCHMARK_VECTOR_SIZE   3000000
+#define BENCHMARK_RUNS          10
+
+
+template<typename ScalarType>
+int run_benchmark()
+{
+
+   Timer timer;
+   double exec_time;
+
+  ScalarType std_result = 0;
+
+  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
+  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
+  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
+  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
+
+  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec2(BENCHMARK_VECTOR_SIZE);
+  std::vector<ScalarType> std_vec3(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec1(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec2(BENCHMARK_VECTOR_SIZE);
+  viennacl::vector<ScalarType> vcl_vec3(BENCHMARK_VECTOR_SIZE);
+
+
+  ///////////// Vector operations /////////////////
+
+  std_vec1[0] = 1.0;
+  std_vec2[0] = 1.0;
+  for (int i=1; i<BENCHMARK_VECTOR_SIZE; ++i)
+  {
+    std_vec1[i] = std_vec1[i-1] * ScalarType(1.000001);
+    std_vec2[i] = std_vec1[i-1] * ScalarType(0.999999);
+  }
+
+  viennacl::copy(std_vec1, vcl_vec1);
+  viennacl::fast_copy(std_vec1, vcl_vec1);
+  viennacl::copy(std_vec2, vcl_vec2);
+
+  viennacl::swap(vcl_vec1, vcl_vec2);
+  //check that vcl_vec1 is now equal to std_vec2:
+  viennacl::fast_copy(vcl_vec1, std_vec3);
+  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    if (std_vec3[i] != std_vec2[i])
+      std::cout << "ERROR in swap(): Failed at entry " << i << std::endl;
+
+  viennacl::fast_swap(vcl_vec1, vcl_vec2);
+  //check that vcl_vec1 is now equal to std_vec1 again:
+  viennacl::copy(vcl_vec1, std_vec3);
+  for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    if (std_vec3[i] != std_vec1[i])
+      std::cout << "ERROR in fast_swap(): Failed at entry " << i << std::endl;
+
+
+  // inner product
+  viennacl::backend::finish();
+  std::cout << "------- Vector inner products ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    std_result = 0;
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_result += std_vec1[i] * std_vec2[i];
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result:" << std_result << std::endl;
+
+
+  std_result = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2); //startup calculation
+  std_result = 0.0;
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_factor2 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result: " << vcl_factor2 << std::endl;
+
+  // inner product
+  viennacl::backend::finish();
+  std::cout << "------- Vector norm_2 ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    std_result = 0;
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+    {
+      ScalarType entry = std_vec1[i];
+      std_result += entry * entry;
+    }
+  }
+  std_result = std::sqrt(std_result);
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result:" << std_result << std::endl;
+
+
+  std_result = viennacl::linalg::norm_2(vcl_vec1); //startup calculation
+  std_result = 0.0;
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_factor2 = viennacl::linalg::norm_2(vcl_vec1);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+  std::cout << "Result: " << vcl_factor2 << std::endl;
+
+  // vector addition
+
+  std::cout << "------- Vector addition ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_vec3[i] = std_vec1[i] + std_vec2[i];
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+  vcl_vec3 = vcl_vec1 + vcl_vec2; //startup calculation
+  viennacl::backend::finish();
+  std_result = 0.0;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec3 = vcl_vec1 + vcl_vec2;
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+
+
+  // multiply add:
+  std::cout << "------- Vector multiply add ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_vec1[i] += std_factor1 * std_vec2[i];
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+  vcl_vec1 += vcl_factor1 * vcl_vec2; //startup calculation
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec1 += vcl_factor1 * vcl_vec2;
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(2.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+
+
+
+  //complicated vector addition:
+  std::cout << "------- Vector complicated expression ----------" << std::endl;
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    for (int i=0; i<BENCHMARK_VECTOR_SIZE; ++i)
+      std_vec3[i] += std_vec2[i] / std_factor1 + std_factor2 * (std_vec1[i] - std_factor1 * std_vec2[i]);
+  }
+  exec_time = timer.get();
+  std::cout << "CPU time: " << exec_time << std::endl;
+  std::cout << "CPU "; printOps(6.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+  vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2); //startup calculation
+  viennacl::backend::finish();
+  timer.start();
+  for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
+  {
+    vcl_vec3 = vcl_vec2 / vcl_factor1 + vcl_factor2 * (vcl_vec1 - vcl_factor1*vcl_vec2);
+  }
+  viennacl::backend::finish();
+  exec_time = timer.get();
+  std::cout << "GPU time: " << exec_time << std::endl;
+  std::cout << "GPU "; printOps(6.0 * static_cast<double>(std_vec1.size()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
+
+  return 0;
+}
+
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "               Device Info" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+#endif
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Benchmark :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  std::cout << "   # benchmarking single-precision" << std::endl;
+  std::cout << "   -------------------------------" << std::endl;
+  run_benchmark<float>();
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    std::cout << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    std::cout << "   # benchmarking double-precision" << std::endl;
+    std::cout << "   -------------------------------" << std::endl;
+    run_benchmark<double>();
+  }
+  return 0;
+}
+
diff --git a/examples/parameters/CMakeLists.txt b/examples/parameters/CMakeLists.txt
deleted file mode 100644
index 823439e..0000000
--- a/examples/parameters/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-include_directories(${PROJECT_SOURCE_DIR}/external)
-
-foreach(par vector matrix sparse parameter_reader)
-   set(n ${par}params)
-   if(par STREQUAL parameter_reader)
-      set(n ${par})
-   endif()
-   add_executable(${n} ${par}.cpp
-      ${PROJECT_SOURCE_DIR}/external/pugixml/src/pugixml.cpp)
-   target_link_libraries(${n} ${OPENCL_LIBRARIES})
-endforeach()
-
-if(ENABLE_VIENNAPROFILER)
-   include_directories(${VIENNAPROFILER_INCLUDE_DIRS})
-   foreach(par vector matrix sparse)
-      add_executable(${par}params_vprof ${par}.cpp
-         ${PROJECT_SOURCE_DIR}/external/pugixml/src/pugixml.cpp)
-      target_link_libraries(${par}params_vprof
-         ${OPENCL_LIBRARIES} ${VIENNAPROFILER_LIBRARIES})
-   endforeach()
-
-   set_target_properties(vectorparams_vprof matrixparams_vprof
-      sparseparams_vprof
-      PROPERTIES COMPILE_DEFINITIONS ENABLE_VIENNAPROFILER)
-endif()
diff --git a/examples/parameters/benchmark-utils.hpp b/examples/parameters/benchmark-utils.hpp
deleted file mode 100644
index 25db8c1..0000000
--- a/examples/parameters/benchmark-utils.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef _BENCHMARK_UTILS_HPP_
-#define _BENCHMARK_UTILS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#include <iostream>
-
-void printOps(double num_ops, double exec_time)
-{
-  std::cout << "GFLOPS: " << num_ops / (1000000 * exec_time * 1000) << std::endl;
-}
-
-
-
-
-#ifdef _WIN32
-
-#define WINDOWS_LEAN_AND_MEAN
-#include <windows.h>
-#undef min
-#undef max
-
-class Timer
-{
-public:
-
-	Timer()
-	{
-		QueryPerformanceFrequency(&freq);
-	}
-
-	void start()
-	{
-		QueryPerformanceCounter((LARGE_INTEGER*) &start_time);
-	}
-
-	double get() const
-	{
-		LARGE_INTEGER  end_time;
-		QueryPerformanceCounter((LARGE_INTEGER*) &end_time);
-		return (static_cast<double>(end_time.QuadPart) - static_cast<double>(start_time.QuadPart)) / static_cast<double>(freq.QuadPart);
-	}
-
-
-private:
-	LARGE_INTEGER freq;
-    LARGE_INTEGER start_time;
-};
-
-#else
-
-#include <sys/time.h>
-
-class Timer
-{
-public:
-
-	Timer() : ts(0)
-	{}
-
-	void start()
-	{
-		struct timeval tval;
-		gettimeofday(&tval, NULL);
-		ts = tval.tv_sec * 1000000 + tval.tv_usec;
-	}
-
-	double get() const
-	{
-		struct timeval tval;
-		gettimeofday(&tval, NULL);
-		int64_t end_time = tval.tv_sec * 1000000 + tval.tv_usec;
-
-		return static_cast<double>(end_time-ts) / 1000000.0;
-	}
-
-private:
-	int64_t ts;
-};
-
-
-#endif
-
-#endif
diff --git a/examples/parameters/common.hpp b/examples/parameters/common.hpp
deleted file mode 100644
index ad55fe3..0000000
--- a/examples/parameters/common.hpp
+++ /dev/null
@@ -1,251 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-////////////////////// some functions that aid testing to follow /////////////////////////////////
-
-#include "benchmark-utils.hpp"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/io/kernel_parameters.hpp"
-
-#ifndef BENCHMARK_RUNS
- #define BENCHMARK_RUNS          10
-#endif
-
-
-void set_kernel_params(std::string program_name,
-                       std::string kernel_name,
-                       unsigned int work_groups, //total no. of threads
-                       unsigned int loc)  //threads per work group
-{
-  //get kernel from pool and set work sizes:
-  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(program_name, kernel_name);
-  k.global_work_size(0, work_groups * loc);
-  k.local_work_size(0, loc);
-}
-
-bool validate_result(std::string program_name,
-                     std::string kernel_name,
-                     unsigned int work_groups,
-                     unsigned int local_workers)
-{
-  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(program_name, kernel_name);
-  bool ret = (k.global_work_size() == work_groups * local_workers)
-           && (k.local_work_size() == local_workers);
-  if (!ret)
-  {
-    std::cout << "Failed: " << k.global_work_size() << " vs. " << work_groups * local_workers << " and " << k.local_work_size() << " vs. " << local_workers << std::endl;
-  }
-  return ret;
-}
-
-
-
-template <typename T, typename TestData>
-double execute(T functor, TestData & data)
-{
-  Timer t;
-  functor(data); //one startup calculation
-  viennacl::ocl::get_queue().finish();
-  t.start();
-  for (int i=0; i<BENCHMARK_RUNS; ++i)
-    functor(data);
-  viennacl::ocl::get_queue().finish();
-  return t.get();
-}
-
-
-template <typename TimingType, typename F, typename TestConfig, typename TestData>
-void record_full_timings(TimingType & timings,
-                         F functor,
-                         TestConfig & config,
-                         TestData & data)
-{
-  typedef typename TestData::value_type  ScalarType;
-  
-  double result = 0;
-  functor(data); //startup run (ensures kernel compilation)
-  for (unsigned int work_groups = config.min_work_groups(); work_groups <= config.max_work_groups(); work_groups *= 2)           //iterate over number of work groups (compute units)
-  {
-    for (unsigned int local_workers = config.min_local_size(); local_workers <= config.max_local_size(); local_workers *= 2)   //iterate over local thread number
-    {
-      //set parameter:
-      set_kernel_params(config.program_name(), config.kernel_name(), work_groups, local_workers);
-      
-      //std::cout << "Benchmarking kernel " << config.kernel_name() << std::endl;
-      result = execute(functor, data);
-      
-      //check for valid result: (kernels have an automatic fallback to smaller values included)
-      if (!validate_result(config.program_name(), config.kernel_name(), work_groups, local_workers))
-      {
-      std::cout << "Kernel start failed for kernel " << config.kernel_name() << " [" << work_groups << " groups, " << local_workers << " per group]" << std::endl;
-        break;
-      }
-      else
-        timings[result] = std::make_pair(work_groups * local_workers, local_workers);
-    }
-  }
-}
-
-template <typename TimingType, typename F, typename TestConfig, typename TestData>
-void record_restricted_timings(TimingType & timings,
-                               F functor,
-                               TestConfig & config,
-                               TestData & data)
-{
-  typedef typename TestData::value_type  ScalarType;
-  
-  double result = 0;
-  functor(data); //startup run (ensures kernel compilation)
-  for (unsigned int local_workers = config.min_local_size(); local_workers <= config.max_local_size(); local_workers *= 2)   //iterate over local thread number, up to 512
-  {
-    //set parameter:
-    set_kernel_params(config.program_name(), config.kernel_name(), 1, local_workers);
-    
-    result = execute(functor, data);
-    
-    //check for valid result: (kernels have an automatic fallback to smaller values included)
-    if (!validate_result(config.program_name(), config.kernel_name(), 1, local_workers))
-    {
-      std::cout << "Kernel start failed for kernel " << config.kernel_name() << " [1 group, " << local_workers << " per group]" << std::endl;
-      //break;
-    }
-    else
-      timings[result] = std::make_pair(local_workers, local_workers);
-  }
-}
-
-template <typename TimingType>
-void print_best(TimingType const & timings, std::string kernel_name)
-{
-  //give some feedback to stdout:
-  std::cout << "Best parameter set for " << kernel_name << ": [" << timings.begin()->second.first << " global workers, " << timings.begin()->second.second << " local workers] " << timings.begin()->first << std::endl;
-  
-}
-
-template <typename TimingType>
-void print_default(TimingType const & timings, std::string kernel_name)
-{
-  bool found = false;
-  std::cout << "Default parameter set: [16384 global workers, 128 local workers] ";
-  for (typename TimingType::const_iterator it = timings.begin(); it != timings.end(); ++it)
-  {
-    if (it->second.first == 128*128 && it->second.second == 128)
-    {
-      std::cout << it->first << std::endl;
-      found = true;
-    }
-  }
-  if (!found)
-    std::cout << "n.a." << std::endl;
-}
-
-template <typename TimingType>
-void print_default_restricted(TimingType const & timings, std::string kernel_name)
-{
-  bool found = false;
-  std::cout << "Default parameter set: [128 global workers, 128 local workers] ";
-  for (typename TimingType::const_iterator it = timings.begin(); it != timings.end(); ++it)
-  {
-    if (it->second.first == 128 && it->second.second == 128)
-    {
-      std::cout << it->first << std::endl;
-      found = true;
-    }
-  }
-  if (!found)
-    std::cout << "n.a." << std::endl;
-}
-
-
-class test_config
-{
-  public:
-    test_config() {}
-    test_config(std::string const & prog_name) : prog_(prog_name) {}
-    test_config(std::string const & prog_name, std::string const & kernel_name) : prog_(prog_name), kernel_(kernel_name) {}
-    
-    std::string const & program_name() const { return prog_; }
-    void program_name(std::string const & name) { prog_ = name; }
-    std::string const & kernel_name() const { return kernel_; }
-    void kernel_name(std::string const & name) { kernel_ = name; }
-    
-    unsigned int min_work_groups() const { return min_work_groups_; }
-    void min_work_groups(unsigned int i) { min_work_groups_ = i; }
-    unsigned int max_work_groups() const { return max_work_groups_; }
-    void max_work_groups(unsigned int i) { max_work_groups_ = i; }
-    
-    
-    unsigned int min_local_size() const { return min_local_size_; }
-    void min_local_size(unsigned int i) { min_local_size_ = i; }
-    unsigned int max_local_size() const { return max_local_size_; }
-    void max_local_size(unsigned int i) { max_local_size_ = i; }
-    
-  private:
-    std::string prog_;
-    std::string kernel_;
-    unsigned int min_work_groups_;
-    unsigned int max_work_groups_;
-    unsigned int min_local_size_;
-    unsigned int max_local_size_;
-};
-
-template <typename TimingType>
-void record_kernel_parameters(viennacl::io::parameter_database& paras, std::string kernel, TimingType& timings)
-{
-   paras.add_kernel();  
-   paras.add_data_node(viennacl::io::tag::name, kernel);   
-   paras.add_parameter();  
-   paras.add_data_node(viennacl::io::tag::name, viennacl::io::val::globsize);
-   paras.add_data_node(viennacl::io::tag::value, timings.begin()->second.first);         
-   paras.add_parameter();     
-   paras.add_data_node(viennacl::io::tag::name, viennacl::io::val::locsize);
-   paras.add_data_node(viennacl::io::tag::value, timings.begin()->second.second);            
-}
-
-
-
-
-template <typename TimingType, typename F, typename TestConfig, typename TestData>
-void optimize_full(viennacl::io::parameter_database & paras,
-                   TimingType & timings,
-                   F functor,
-                   TestConfig & config,
-                   TestData & data)
-{
-  record_full_timings(timings, functor, config, data);
-  record_kernel_parameters(paras, config.kernel_name(), timings);
-#ifdef ENABLE_VIENNAPROFILER
-  write_viennaprofiler(timings, config.program_name(), config.kernel_name());
-#endif
-  print_best(timings, config.kernel_name());
-  print_default(timings, config.kernel_name());
-}
-
-template <typename TimingType, typename F, typename TestConfig, typename TestData>
-void optimize_restricted(viennacl::io::parameter_database & paras,
-                         TimingType & timings,
-                         F functor,
-                         TestConfig & config,
-                         TestData & data)
-{
-  record_restricted_timings(timings, functor, config, data);
-  record_kernel_parameters(paras, config.kernel_name(), timings);
-#ifdef ENABLE_VIENNAPROFILER
-  write_viennaprofiler(timings, config.program_name(), config.kernel_name());
-#endif
-  print_best(timings, config.kernel_name());
-  print_default_restricted(timings, config.kernel_name());
-}
diff --git a/examples/parameters/common_vprof.hpp b/examples/parameters/common_vprof.hpp
deleted file mode 100644
index c39aad3..0000000
--- a/examples/parameters/common_vprof.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#include "viennaprofiler/mysqldb.hpp"
-#include "viennaprofiler/timer/precisetimer.hpp"
-#include "viennaprofiler/host.hpp"
-#include "viennaprofiler/profiler.hpp"
-
-
-template <typename TimingType>
-void write_viennaprofiler(TimingType & timings, std::string function_prefix, std::string kernel_name)
-{
-  ViennaProfiler::MySQLDB dbConn("myhost.example.com", "database", "user", "password");
-  ViennaProfiler::PreciseTimer timer; // choose a timer for measuring the execution time
-  
-  //ViennaProfiler::Host host = dbConn.getHost("pcrupp"); // create a host
-  ViennaProfiler::Profiler<ViennaProfiler::MySQLDB, ViennaProfiler::PreciseTimer> myTest(dbConn, timer, "my_machine_name"); // create a Profiler
-  myTest.setCollection("ViennaCL");
-  myTest.setFunction(function_prefix + " " + kernel_name);
-  myTest.setImplementation("default");
-  myTest.setSourceCode("not available");
-  myTest.setOperations(0);
-  
-  //do a dummy start (otherwise, date is not written properly)
-  myTest.start();
-  myTest.stop();
-
-  for (typename TimingType::iterator it = timings.begin();
-       it != timings.end(); ++it)
-  {
-    myTest.addParameter("work groups", it->second.first);
-    myTest.addParameter("work group size", it->second.second);
-    myTest.setExternalTiming(it->first, BENCHMARK_RUNS);
-    myTest.send();
-  }
-  
-  std::cout << "Optimization for " << kernel_name << " written to ViennaProfiler." << std::endl;
-}
-
-
diff --git a/examples/parameters/matrix.cpp b/examples/parameters/matrix.cpp
deleted file mode 100644
index 0ea7be0..0000000
--- a/examples/parameters/matrix.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "viennacl/linalg/prod.hpp"
-
-#include <iostream>
-#include <vector>
-#include <stdlib.h>
-#include "benchmark-utils.hpp"
-
-template <typename ScalarType, typename VectorType, typename MatrixType>
-class test_data;
-
-#include "common.hpp"
-#ifdef ENABLE_VIENNAPROFILER
- #include "common_vprof.hpp"
-#endif
-#include "matrix_functors.hpp"
-
-/*
-*   Auto-Tuning for dense matrix kernels
-*/
-
-#define BENCHMARK_MATRIX_SIZE   256
-
-//a helper container that holds the objects used during benchmarking
-template <typename ScalarType, typename VectorType, typename MatrixType>
-class test_data
-{
-  public:
-    typedef typename VectorType::value_type::value_type   value_type;
-    
-    test_data(ScalarType & s1_,
-              VectorType & v1_,
-              VectorType & v2_,
-              MatrixType & mat_) : s1(s1_), v1(v1_), v2(v2_), mat(mat_)  {}
-    
-    ScalarType & s1;
-    VectorType & v1;
-    VectorType & v2;
-    MatrixType & mat;
-};
-
-
-
-
-////////////////////// some functions that aid testing to follow /////////////////////////////////
-
-
-template<typename ScalarType>
-int run_matrix_benchmark(test_config & config, viennacl::io::parameter_database& paras)
-{
-  typedef viennacl::scalar<ScalarType>   VCLScalar;
-  typedef viennacl::vector<ScalarType>   VCLVector;
-  typedef viennacl::matrix<ScalarType>   VCLMatrix;
-   
-  ////////////////////////////////////////////////////////////////////
-  //set up a little bit of data to play with:
-  //ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  std::vector<ScalarType> std_vec1(BENCHMARK_MATRIX_SIZE);  //used to set all values to zero
-  std::vector< std::vector<ScalarType> > stl_mat(BENCHMARK_MATRIX_SIZE);  //store identity matrix here
-  VCLVector vcl_vec1(BENCHMARK_MATRIX_SIZE);
-  VCLVector vcl_vec2(BENCHMARK_MATRIX_SIZE);
-  VCLMatrix vcl_mat(BENCHMARK_MATRIX_SIZE, BENCHMARK_MATRIX_SIZE);
-  
-  for (int i=0; i<BENCHMARK_MATRIX_SIZE; ++i)
-  {
-    stl_mat[i].resize(BENCHMARK_MATRIX_SIZE);
-    stl_mat[i][i] = 1.0;
-  }
-
-  copy(std_vec1, vcl_vec1); //initialize vectors with all zeros (no need to worry about overflows then)
-  copy(std_vec1, vcl_vec2); //initialize vectors with all zeros (no need to worry about overflows then)
-  copy(stl_mat, vcl_mat);
-  
-  typedef test_data<VCLScalar, VCLVector, VCLMatrix>   TestDataType;
-  test_data<VCLScalar, VCLVector, VCLMatrix> data(vcl_factor1, vcl_vec1, vcl_vec2, vcl_mat);
-
-  //////////////////////////////////////////////////////////
-  ///////////// Start parameter recording  /////////////////
-  //////////////////////////////////////////////////////////
-  
-  typedef std::map< double, std::pair<unsigned int, unsigned int> >   TimingType;
-  std::map< std::string, TimingType > all_timings;
-  
-  std::cout << "------- Related to direct solution algorithms ----------" << std::endl;
-  config.kernel_name("trans_lower_triangular_substitute_inplace");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_trans_lower_triangular_substitute_inplace<TestDataType>, config, data);
-
-  config.kernel_name("lower_triangular_substitute_inplace");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_lower_triangular_substitute_inplace<TestDataType>, config, data);
-
-  config.kernel_name("unit_lower_triangular_substitute_inplace");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_unit_lower_triangular_substitute_inplace<TestDataType>, config, data);
-
-  config.kernel_name("upper_triangular_substitute_inplace");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_upper_triangular_substitute_inplace<TestDataType>, config, data);
-
-  config.kernel_name("trans_upper_triangular_substitute_inplace");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_trans_upper_triangular_substitute_inplace<TestDataType>, config, data);
-
-  config.kernel_name("unit_upper_triangular_substitute_inplace");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_unit_upper_triangular_substitute_inplace<TestDataType>, config, data);
-
-  config.kernel_name("lu_factorize");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_lu_factorize<TestDataType>, config, data);
-
-  //other kernels:
-  std::cout << "------- Related to other operations ----------" << std::endl;
-  config.kernel_name("rank1_update");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_rank1_update<TestDataType>, config, data);
-
-  config.kernel_name("scaled_rank1_update");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_scaled_rank1_update<TestDataType>, config, data);
-
-  config.kernel_name("vec_mul");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_vec_mul<TestDataType>, config, data);
-
-  config.kernel_name("trans_vec_mul");
-  optimize_restricted(paras, all_timings[config.kernel_name()],
-                      matrix_trans_vec_mul<TestDataType>, config, data);
-
-  return 0;
-}
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  viennacl::ocl::device dev(viennacl::ocl::current_device());
-  
-  std::cout << dev.info() << std::endl;
-  
-  // -----------------------------------------
-  viennacl::io::parameter_database  paras;
-  // -----------------------------------------  
-
-  std::string devname   = dev.name();
-  std::string driver    = dev.driver_version();
-  cl_uint compunits = dev.max_compute_units();      
-  size_t  wgsize    = dev.max_workgroup_size();        
-  
-  // -----------------------------------------
-   paras.add_device();
-   paras.add_data_node(viennacl::io::tag::name, devname);
-   paras.add_data_node(viennacl::io::tag::driver, driver);
-   paras.add_data_node(viennacl::io::tag::compun, compunits);         
-   paras.add_data_node(viennacl::io::tag::workgrp, wgsize);            
-  // -----------------------------------------
-  
-  //set up test config:
-  test_config conf;
-  conf.max_local_size(dev.max_work_group_size());
-  
-  // GPU specific test setup:
-  if (dev.type() == CL_DEVICE_TYPE_GPU)
-  {
-    unsigned int units = 1;
-    while (2 * units < dev.compute_units())
-      units *= 2;
-    conf.min_work_groups(units);
-    conf.max_work_groups(512); //reasonable upper limit on current GPUs
-    
-    conf.min_local_size(16); //less than 16 threads per work group is unlikely to have any impact
-    //conf.min_local_size(dev.max_work_group_size()); //testing only
-  } 
-  else if (dev.type() == CL_DEVICE_TYPE_CPU)// CPU specific test setup
-  {
-    conf.min_work_groups(1);
-    conf.max_work_groups(2*dev.compute_units()); //reasonable upper limit on current CPUs - more experience needed here!
-    
-    conf.min_local_size(1);
-  }
-  else
-  {
-    std::cerr << "Unknown device type (neither CPU nor GPU)! Aborting..." << std::endl;
-    exit(0);
-  }
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Matrix" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-
-  // -----------------------------------------  
-   paras.add_test();    
-   paras.add_data_node(viennacl::io::tag::name,    viennacl::io::val::matrix);      
-   paras.add_data_node(viennacl::io::tag::numeric, viennacl::io::val::fl);   
-   paras.add_data_node(viennacl::io::tag::alignment, "1");   
-  // -----------------------------------------  
-
-  //set up test config:
-  conf.program_name(viennacl::linalg::kernels::matrix_row<float, 1>::program_name());
-
-  run_matrix_benchmark<float>(conf, paras);
-
-  if( viennacl::ocl::device(viennacl::ocl::current_device()).double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-  // -----------------------------------------  
-   paras.add_test();    
-   paras.add_data_node(viennacl::io::tag::name,    viennacl::io::val::matrix);      
-   paras.add_data_node(viennacl::io::tag::numeric, viennacl::io::val::dbl);   
-   paras.add_data_node(viennacl::io::tag::alignment, "1");   
-
-    conf.program_name(viennacl::linalg::kernels::matrix_row<double, 1>::program_name());
-   // -----------------------------------------      
-    run_matrix_benchmark<double>(conf, paras);
-  }
-  // -----------------------------------------    
-  //paras.dump(); // dump to terminal
-  paras.dump("matrix_parameters.xml"); // dump to outputfile
-  //std::ofstream stream; paras.dump(stream);   // dump to stream
-  // -----------------------------------------    
-  
-  std::cout << std::endl;
-  std::cout << "//////////////////////////////////////////////////////////////////////" << std::endl;
-  std::cout << "// Parameter evaluation for viennacl::matrix finished successfully! //" << std::endl;
-  std::cout << "//////////////////////////////////////////////////////////////////////" << std::endl;
-  return 0;
-}
-
diff --git a/examples/parameters/matrix_functors.hpp b/examples/parameters/matrix_functors.hpp
deleted file mode 100644
index bfde5ae..0000000
--- a/examples/parameters/matrix_functors.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-
-/////////// direct solver kernels ////////////////
-// lower:
-template <typename TestData>
-void matrix_trans_lower_triangular_substitute_inplace(TestData & data)
-{
-  viennacl::linalg::inplace_solve(trans(data.mat), data.v1, viennacl::linalg::lower_tag());
-}
-
-template <typename TestData>
-void matrix_lower_triangular_substitute_inplace(TestData & data)
-{
-  viennacl::linalg::inplace_solve(data.mat, data.v1, viennacl::linalg::lower_tag());
-}
-
-template <typename TestData>
-void matrix_unit_lower_triangular_substitute_inplace(TestData & data)
-{
-  viennacl::linalg::inplace_solve(data.mat, data.v1, viennacl::linalg::unit_lower_tag());
-}
-
-// upper:
-template <typename TestData>
-void matrix_upper_triangular_substitute_inplace(TestData & data)
-{
-  viennacl::linalg::inplace_solve(data.mat, data.v1, viennacl::linalg::upper_tag());
-}
-
-template <typename TestData>
-void matrix_trans_upper_triangular_substitute_inplace(TestData & data)
-{
-  viennacl::linalg::inplace_solve(trans(data.mat), data.v1, viennacl::linalg::upper_tag());
-}
-
-template <typename TestData>
-void matrix_unit_upper_triangular_substitute_inplace(TestData & data)
-{
-  viennacl::linalg::inplace_solve(data.mat, data.v1, viennacl::linalg::unit_upper_tag());
-}
-
-
-template <typename TestData>
-void matrix_lu_factorize(TestData & data)
-{
-  viennacl::linalg::lu_factorize(data.mat);
-}
-
-
-
-
-//////////// other matrix operations: //////////////////
-template <typename TestData>
-void matrix_rank1_update(TestData & data)
-{
-  data.mat += viennacl::linalg::outer_prod(data.v1, data.v2);
-}
-
-template <typename TestData>
-void matrix_scaled_rank1_update(TestData & data)
-{
-  typedef typename TestData::value_type   NumericT;
-  data.mat += NumericT(2.0) * viennacl::linalg::outer_prod(data.v1, data.v2);
-}
-
-template <typename TestData>
-void matrix_vec_mul(TestData & data)
-{
-  data.v2 = viennacl::linalg::prod(data.mat, data.v1);
-}
-
-template <typename TestData>
-void matrix_trans_vec_mul(TestData & data)
-{
-  data.v2 = viennacl::linalg::prod(trans(data.mat), data.v1);
-}
diff --git a/examples/parameters/parameter_reader.cpp b/examples/parameters/parameter_reader.cpp
deleted file mode 100644
index 52a92af..0000000
--- a/examples/parameters/parameter_reader.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//#define VIENNACL_DEBUG_ALL
-//#define NDEBUG
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/io/kernel_parameters.hpp"
-
-
-#include <iostream>
-#include <vector>
-
-
-
-
-
-int main(int argc, char *argv[])
-{
-  // -----------------------------------------
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  std::cout << viennacl::ocl::current_device().info() << std::endl;
-
-  viennacl::io::read_kernel_parameters< viennacl::vector<float> >("vector_parameters.xml");
-  viennacl::io::read_kernel_parameters< viennacl::matrix<float> >("matrix_parameters.xml");
-  viennacl::io::read_kernel_parameters< viennacl::compressed_matrix<float> >("sparse_parameters.xml");
-  // -----------------------------------------  
-
-  //check:
-  std::cout << "vector add:" << std::endl;
-  std::cout << viennacl::ocl::get_kernel("f_vector_1", "add").local_work_size() << std::endl;
-  std::cout << viennacl::ocl::get_kernel("f_vector_1", "add").global_work_size() << std::endl;
-
-  std::cout << "matrix vec_mul:" << std::endl;
-  std::cout << viennacl::ocl::get_kernel("f_matrix_row_1", "vec_mul").local_work_size() << std::endl;
-  std::cout << viennacl::ocl::get_kernel("f_matrix_row_1", "vec_mul").global_work_size() << std::endl;
-  
-  std::cout << "compressed_matrix vec_mul:" << std::endl;
-  std::cout << viennacl::ocl::get_kernel("f_compressed_matrix_1", "vec_mul").local_work_size() << std::endl;
-  std::cout << viennacl::ocl::get_kernel("f_compressed_matrix_1", "vec_mul").global_work_size() << std::endl;
-
- 
-  return 0;
-}
-
diff --git a/examples/parameters/sparse.cpp b/examples/parameters/sparse.cpp
deleted file mode 100644
index 1ebead4..0000000
--- a/examples/parameters/sparse.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-
-#include <iostream>
-#include <vector>
-#include <stdlib.h>
-#include "benchmark-utils.hpp"
-
-template <typename ScalarType, typename VectorType, typename MatrixType>
-class test_data;
-
-#include "common.hpp"
-#ifdef ENABLE_VIENNAPROFILER
- #include "common_vprof.hpp"
-#endif
-
-template <typename TestData>
-void matrix_vec_mul(TestData & data)
-{
-  data.v2 = viennacl::linalg::prod(data.mat, data.v1);
-}
-
-/*
-*   Auto-Tuning for dense matrix kernels
-*/
-
-#define BENCHMARK_MATRIX_SIZE   32768
-
-//a helper container that holds the objects used during benchmarking
-template <typename ScalarType, typename VectorType, typename MatrixType>
-class test_data
-{
-  public:
-    typedef typename VectorType::value_type::value_type   value_type;
-    
-    test_data(ScalarType & s1_,
-              VectorType & v1_,
-              VectorType & v2_,
-              MatrixType & mat_) : s1(s1_), v1(v1_), v2(v2_), mat(mat_)  {}
-    
-    ScalarType & s1;
-    VectorType & v1;
-    VectorType & v2;
-    MatrixType & mat;
-};
-
-
-
-
-////////////////////// some functions that aid testing to follow /////////////////////////////////
-
-
-template<typename ScalarType>
-int run_matrix_benchmark(test_config & config, viennacl::io::parameter_database& paras)
-{
-  typedef viennacl::scalar<ScalarType>   VCLScalar;
-  typedef viennacl::vector<ScalarType>   VCLVector;
-  typedef viennacl::compressed_matrix<ScalarType>   VCLMatrix;
-   
-  ////////////////////////////////////////////////////////////////////
-  //set up a little bit of data to play with:
-  //ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  std::vector<ScalarType> std_vec1(BENCHMARK_MATRIX_SIZE);  //used to set all values to zero
-  std::vector< std::map< unsigned int, ScalarType> > stl_mat(BENCHMARK_MATRIX_SIZE);  //store identity matrix here
-  VCLVector vcl_vec1(BENCHMARK_MATRIX_SIZE);
-  VCLVector vcl_vec2(BENCHMARK_MATRIX_SIZE);
-  VCLMatrix vcl_mat(BENCHMARK_MATRIX_SIZE, BENCHMARK_MATRIX_SIZE);
-  
-  for (int i=0; i<BENCHMARK_MATRIX_SIZE; ++i)
-  {
-      if (i > 10)
-      {
-          stl_mat[i][i - 10] = 1.0;
-          stl_mat[i][i - 7] = 1.0;
-          stl_mat[i][i - 4] = 1.0;
-          stl_mat[i][i - 2] = 1.0;
-      }
-      stl_mat[i][i] = 1.0;
-      if (i + 10 < BENCHMARK_MATRIX_SIZE)
-      {
-          stl_mat[i][i + 5] = 1.0;
-          stl_mat[i][i + 7] = 1.0;
-          stl_mat[i][i + 9] = 1.0;
-          stl_mat[i][i + 10] = 1.0;
-      }
-  }
-
-  viennacl::copy(std_vec1, vcl_vec1); //initialize vectors with all zeros (no need to worry about overflows then)
-  viennacl::copy(std_vec1, vcl_vec2); //initialize vectors with all zeros (no need to worry about overflows then)
-  viennacl::copy(stl_mat, vcl_mat);
-  
-  typedef test_data<VCLScalar, VCLVector, VCLMatrix>   TestDataType;
-  test_data<VCLScalar, VCLVector, VCLMatrix> data(vcl_factor1, vcl_vec1, vcl_vec2, vcl_mat);
-
-  //////////////////////////////////////////////////////////
-  ///////////// Start parameter recording  /////////////////
-  //////////////////////////////////////////////////////////
-  
-  typedef std::map< double, std::pair<unsigned int, unsigned int> >   TimingType;
-  std::map< std::string, TimingType > all_timings;
-  
-
-  //other kernels:
-  std::cout << "------- Related to other operations ----------" << std::endl;
-
-  config.kernel_name("vec_mul");
-  optimize_full(paras, all_timings[config.kernel_name()],
-                      matrix_vec_mul<TestDataType>, config, data);
-
-
-  return 0;
-}
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  viennacl::ocl::device dev = viennacl::ocl::current_device();
-  
-  std::cout << dev.info() << std::endl;
-  
-  // -----------------------------------------
-  viennacl::io::parameter_database  paras;
-  // -----------------------------------------  
-
-  std::string devname   = dev.name();
-  std::string driver    = dev.driver_version();
-  cl_uint compunits = dev.max_compute_units();      
-  size_t wgsize    = dev.max_workgroup_size();        
-  
-  // -----------------------------------------
-   paras.add_device();
-   paras.add_data_node(viennacl::io::tag::name, devname);
-   paras.add_data_node(viennacl::io::tag::driver, driver);
-   paras.add_data_node(viennacl::io::tag::compun, compunits);         
-   paras.add_data_node(viennacl::io::tag::workgrp, wgsize);            
-  // -----------------------------------------
-  
-  //set up test config:
-  test_config conf;
-  conf.max_local_size(dev.max_work_group_size());
-  
-  // GPU specific test setup:
-  if (dev.type() == CL_DEVICE_TYPE_GPU)
-  {
-    unsigned int units = 1;
-    while (2 * units < dev.compute_units())
-      units *= 2;
-    conf.min_work_groups(units);
-    conf.max_work_groups(512); //reasonable upper limit on current GPUs
-    
-    conf.min_local_size(16); //less than 16 threads per work group is unlikely to have any impact
-    //conf.min_local_size(dev.max_work_group_size()); //testing only
-  } 
-  else if (dev.type() == CL_DEVICE_TYPE_CPU)// CPU specific test setup
-  {
-    conf.min_work_groups(1);
-    conf.max_work_groups(2*dev.compute_units()); //reasonable upper limit on current CPUs - more experience needed here!
-    
-    conf.min_local_size(1);
-  }
-  else
-  {
-    std::cerr << "Unknown device type (neither CPU nor GPU)! Aborting..." << std::endl;
-    exit(0);
-  }
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Matrix" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-
-  // -----------------------------------------  
-   paras.add_test();    
-   paras.add_data_node(viennacl::io::tag::name,    viennacl::io::val::compmat);      
-   paras.add_data_node(viennacl::io::tag::numeric, viennacl::io::val::fl);   
-   paras.add_data_node(viennacl::io::tag::alignment, "1");   
-  // -----------------------------------------  
-
-  //set up test config:
-  conf.program_name(viennacl::linalg::kernels::compressed_matrix<float, 1>::program_name());
-
-  run_matrix_benchmark<float>(conf, paras);
-
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-  // -----------------------------------------  
-   paras.add_test();    
-   paras.add_data_node(viennacl::io::tag::name,    viennacl::io::val::compmat);      
-   paras.add_data_node(viennacl::io::tag::numeric, viennacl::io::val::dbl);   
-   paras.add_data_node(viennacl::io::tag::alignment, "1");   
-
-    conf.program_name(viennacl::linalg::kernels::compressed_matrix<double, 1>::program_name());
-   // -----------------------------------------      
-    run_matrix_benchmark<double>(conf, paras);
-  }
-  // -----------------------------------------    
-  //paras.dump(); // dump to terminal
-  paras.dump("sparse_parameters.xml"); // dump to outputfile
-  //std::ofstream stream; paras.dump(stream);   // dump to stream
-  // -----------------------------------------    
-  
-  std::cout << std::endl;
-  std::cout << "/////////////////////////////////////////////////////////////////////////////////" << std::endl;
-  std::cout << "// Parameter evaluation for viennacl::compressed_matrix finished successfully! //" << std::endl;
-  std::cout << "/////////////////////////////////////////////////////////////////////////////////" << std::endl;
-  return 0;
-}
-
diff --git a/examples/parameters/vector.cpp b/examples/parameters/vector.cpp
deleted file mode 100644
index 31953a9..0000000
--- a/examples/parameters/vector.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-
-#include <iostream>
-#include <vector>
-#include <stdlib.h>
-#include "benchmark-utils.hpp"
-
-template <typename ScalarType, typename VectorType>
-class test_data;
-
-#include "common.hpp"
-#ifdef ENABLE_VIENNAPROFILER
- #include "common_vprof.hpp"
-#endif
-#include "vector_functors.hpp"
-
-/*
-*   Auto-Tuning for vectors
-*/
-
-#define BENCHMARK_VECTOR_SIZE   1000000
-
-//a helper container that holds the objects used during benchmarking
-template <typename ScalarType, typename VectorType>
-class test_data
-{
-  public:
-    typedef typename VectorType::value_type::value_type   value_type;
-    
-    test_data(ScalarType & s1_,
-              VectorType & v1_,
-              VectorType & v2_,
-              VectorType & v3_) : s1(s1_), v1(v1_), v2(v2_), v3(v3_)  {}
-    
-    ScalarType & s1;
-    VectorType & v1;
-    VectorType & v2;
-    VectorType & v3;
-};
-
-
-
-////////////////////// some functions that aid testing to follow /////////////////////////////////
-
-
-
-template<typename ScalarType>
-int run_vector_benchmark(test_config & config, viennacl::io::parameter_database& paras)
-{
-  typedef viennacl::scalar<ScalarType>   VCLScalar;
-  typedef viennacl::vector<ScalarType>   VCLVector;
-   
-  ////////////////////////////////////////////////////////////////////
-  //set up a little bit of data to play with:
-  //ScalarType std_result = 0;
-   
-  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
-  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
-  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
-  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
-  
-  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);  //used to set all values to zero
-  VCLVector vcl_vec1(BENCHMARK_VECTOR_SIZE);
-  VCLVector vcl_vec2(BENCHMARK_VECTOR_SIZE); 
-  VCLVector vcl_vec3(BENCHMARK_VECTOR_SIZE); 
-
-  viennacl::copy(std_vec1, vcl_vec1); //initialize vectors with all zeros (no need to worry about overflows then)
-  viennacl::copy(std_vec1, vcl_vec2); //initialize vectors with all zeros (no need to worry about overflows then)
-  
-  typedef test_data<VCLScalar, VCLVector>   TestDataType;
-  test_data<VCLScalar, VCLVector> data(vcl_factor1, vcl_vec1, vcl_vec2, vcl_vec3);
-
-  //////////////////////////////////////////////////////////
-  ///////////// Start parameter recording  /////////////////
-  //////////////////////////////////////////////////////////
-  
-  typedef std::map< double, std::pair<unsigned int, unsigned int> >   TimingType;
-  std::map< std::string, TimingType > all_timings;
-  
-  // vector addition  
-  std::cout << "------- Related to vector addition ----------" << std::endl;
-  config.kernel_name("add");                    optimize_full(paras, all_timings[config.kernel_name()], vector_add<TestDataType>, config, data);
-  config.kernel_name("inplace_add");            optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_add<TestDataType>, config, data);
-  config.kernel_name("mul_add");                optimize_full(paras, all_timings[config.kernel_name()], vector_mul_add<TestDataType>, config, data);
-  config.kernel_name("cpu_mul_add");            optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_mul_add<TestDataType>, config, data);
-  config.kernel_name("inplace_mul_add");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_mul_add<TestDataType>, config, data);
-  config.kernel_name("cpu_inplace_mul_add");    optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_inplace_mul_add<TestDataType>, config, data);
-  config.kernel_name("inplace_div_add");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_div_add<TestDataType>, config, data);
-
-  std::cout << "------- Related to vector subtraction ----------" << std::endl;
-  config.kernel_name("sub");                    optimize_full(paras, all_timings[config.kernel_name()], vector_sub<TestDataType>, config, data);
-  config.kernel_name("inplace_sub");            optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_sub<TestDataType>, config, data);
-  config.kernel_name("mul_sub");                optimize_full(paras, all_timings[config.kernel_name()], vector_mul_sub<TestDataType>, config, data);
-  config.kernel_name("inplace_mul_sub");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_mul_sub<TestDataType>, config, data);
-  config.kernel_name("inplace_div_sub");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_div_sub<TestDataType>, config, data);
-
-  std::cout << "------- Related to vector scaling (mult/div) ----------" << std::endl;
-  config.kernel_name("mult");                   optimize_full(paras, all_timings[config.kernel_name()], vector_mult<TestDataType>, config, data);
-  config.kernel_name("inplace_mult");           optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_mult<TestDataType>, config, data);
-  config.kernel_name("cpu_mult");               optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_mult<TestDataType>, config, data);
-  config.kernel_name("cpu_inplace_mult");       optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_inplace_mult<TestDataType>, config, data);
-  config.kernel_name("divide");                 optimize_full(paras, all_timings[config.kernel_name()], vector_divide<TestDataType>, config, data);
-  config.kernel_name("inplace_divide");         optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_divide<TestDataType>, config, data);
- 
-  std::cout << "------- Others ----------" << std::endl;
-  config.kernel_name("inner_prod");             optimize_full(paras, all_timings[config.kernel_name()], vector_inner_prod<TestDataType>, config, data);
-  config.kernel_name("swap");                   optimize_full(paras, all_timings[config.kernel_name()], vector_swap<TestDataType>, config, data);
-  config.kernel_name("clear");                  optimize_full(paras, all_timings[config.kernel_name()], vector_clear<TestDataType>, config, data);
-  config.kernel_name("plane_rotation");         optimize_full(paras, all_timings[config.kernel_name()], vector_plane_rotation<TestDataType>, config, data);
-  
-  //config.max_work_groups(32); //otherwise failures on 8500 GT
-  config.kernel_name("norm_1");                 optimize_restricted(paras, all_timings[config.kernel_name()], vector_norm_1<TestDataType>, config, data);
-  config.kernel_name("norm_2");                 optimize_restricted(paras, all_timings[config.kernel_name()], vector_norm_2<TestDataType>, config, data);
-  config.kernel_name("norm_inf");               optimize_restricted(paras, all_timings[config.kernel_name()], vector_norm_inf<TestDataType>, config, data);
-
-  
-  //restricted optimizations:
-  config.kernel_name("index_norm_inf");         optimize_restricted(paras, all_timings[config.kernel_name()], vector_index_norm_inf<TestDataType>, config, data);
-  
-  
-  return 0;
-}
-
-int main()
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "               Device Info" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  
-  viennacl::ocl::device dev = viennacl::ocl::current_device();
-  
-  std::cout << dev.info() << std::endl;
-  
-  // -----------------------------------------
-  viennacl::io::parameter_database  paras;
-  // -----------------------------------------  
-
-  std::string devname   = dev.name();
-  std::string driver    = dev.driver_version();
-  cl_uint compunits = dev.max_compute_units();      
-  size_t wgsize    = dev.max_workgroup_size();        
-  
-  // -----------------------------------------
-   paras.add_device();
-   paras.add_data_node(viennacl::io::tag::name, devname);
-   paras.add_data_node(viennacl::io::tag::driver, driver);
-   paras.add_data_node(viennacl::io::tag::compun, compunits);         
-   paras.add_data_node(viennacl::io::tag::workgrp, wgsize);            
-  // -----------------------------------------
-  
-  //set up test config:
-  test_config conf;
-  conf.max_local_size(dev.max_work_group_size());
-  
-  // GPU specific test setup:
-  if (dev.type() == CL_DEVICE_TYPE_GPU)
-  {
-    unsigned int units = 1;
-    while (2 * units < dev.compute_units())
-      units *= 2;
-    conf.min_work_groups(units);
-    conf.max_work_groups(512); //reasonable upper limit on current GPUs
-    
-    conf.min_local_size(16); //less than 16 threads per work group is unlikely to have any impact
-    //conf.min_local_size(dev.max_work_group_size()); //testing only
-  } 
-  else if (dev.type() == CL_DEVICE_TYPE_CPU)// CPU specific test setup
-  {
-    conf.min_work_groups(1);
-    conf.max_work_groups(2*dev.compute_units()); //reasonable upper limit on current CPUs - more experience needed here!
-    
-    conf.min_local_size(1);
-  }
-  else
-  {
-    std::cerr << "Unknown device type (neither CPU nor GPU)! Aborting..." << std::endl;
-    exit(0);
-  }
-  
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Benchmark :: Vector" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-
-  std::cout << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-  std::cout << "   # benchmarking single-precision" << std::endl;
-  std::cout << "   -------------------------------" << std::endl;
-
-  // -----------------------------------------  
-   paras.add_test();    
-   paras.add_data_node(viennacl::io::tag::name,    viennacl::io::val::vec);      
-   paras.add_data_node(viennacl::io::tag::numeric, viennacl::io::val::fl);   
-   paras.add_data_node(viennacl::io::tag::alignment, "1");   
-  // -----------------------------------------  
-
-  //set up test config:
-  conf.program_name(viennacl::linalg::kernels::vector<float, 1>::program_name());
-
-  run_vector_benchmark<float>(conf, paras);
-
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-    std::cout << "   # benchmarking double-precision" << std::endl;
-    std::cout << "   -------------------------------" << std::endl;
-  // -----------------------------------------  
-   paras.add_test();    
-   paras.add_data_node(viennacl::io::tag::name,    viennacl::io::val::vec);      
-   paras.add_data_node(viennacl::io::tag::numeric, viennacl::io::val::dbl);   
-   paras.add_data_node(viennacl::io::tag::alignment, "1");   
-
-    conf.program_name(viennacl::linalg::kernels::vector<double, 1>::program_name());
-   // -----------------------------------------      
-    run_vector_benchmark<double>(conf, paras);
-  }
-  // -----------------------------------------    
-  //paras.dump(); // dump to terminal
-  paras.dump("vector_parameters.xml"); // dump to outputfile
-  //std::ofstream stream; paras.dump(stream);   // dump to stream
-  // -----------------------------------------    
-  
-  std::cout << std::endl;
-  std::cout << "//////////////////////////////////////////////////////////////////////" << std::endl;
-  std::cout << "// Parameter evaluation for viennacl::vector finished successfully! //" << std::endl;
-  std::cout << "//////////////////////////////////////////////////////////////////////" << std::endl;
-  return 0;
-}
-
diff --git a/examples/parameters/vector_functors.hpp b/examples/parameters/vector_functors.hpp
deleted file mode 100644
index d3285e0..0000000
--- a/examples/parameters/vector_functors.hpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-
-
-/////////////////// full optimization functors /////////////////////
-
-// add kernels:
-// struct vector_add
-// {
-//   template <typename TestData>
-//   void operator()(TestData & data)
-//   {
-//     std::cerr << "add now!" << std::endl;
-//     data.v3 = data.v1 + data.v2;
-//   }
-// };
-
-template <typename TestData>
-void vector_add(TestData & data)
-{
-  data.v3 = data.v1 + data.v2;
-}
-
-template <typename TestData>
-void vector_inplace_add(TestData & data)
-{
-  data.v3 += data.v1;
-}
-
-template <typename TestData>
-void vector_mul_add(TestData & data)
-{
-  data.v3 = data.v1 + data.s1 * data.v2;
-}
-
-template <typename TestData>
-void vector_cpu_mul_add(TestData & data)
-{
-  typedef typename TestData::value_type   NumericT;
-  data.v3 = data.v1 + NumericT(2.0) * data.v2;
-}
-
-template <typename TestData>
-void vector_inplace_mul_add(TestData & data)
-{
-  data.v3 += data.s1 * data.v2;
-}
-
-template <typename TestData>
-void vector_cpu_inplace_mul_add(TestData & data)
-{
-  typedef typename TestData::value_type   NumericT;
-  data.v3 += NumericT(2.0) * data.v2;
-}
-
-
-template <typename TestData>
-void vector_inplace_div_add(TestData & data)
-{
-  data.v3 += data.v2 / data.s1;
-}
-
-
-// sub kernels:
-template <typename TestData>
-void vector_sub(TestData & data)
-{
-  data.v3 = data.v1 - data.v2; //a plain vector subtraction
-}
-
-template <typename TestData>
-void vector_inplace_sub(TestData & data)
-{
-  data.v3 -= data.v1; //a plain vector subtraction
-}
-
-template <typename TestData>
-void vector_mul_sub(TestData & data)
-{
-  data.v3 = data.v1 - data.s1 * data.v2;
-}
-
-template <typename TestData>
-void vector_inplace_mul_sub(TestData & data)
-{
-  data.v3 -= data.s1 * data.v2;
-}
-
-template <typename TestData>
-void vector_inplace_div_sub(TestData & data)
-{
-  data.v3 -= data.v2 / data.s1;
-}
-
-
-// mult kernels:
-template <typename TestData>
-void vector_mult(TestData & data)
-{
-  data.v3 = data.s1 * data.v2;
-}
-
-template <typename TestData>
-void vector_cpu_mult(TestData & data)
-{
-  typedef typename TestData::value_type   NumericT;
-  data.v3 = NumericT(2.0) * data.v2;
-}
-
-template <typename TestData>
-void vector_inplace_mult(TestData & data)
-{
-  data.v3 *= data.s1;
-}
-
-template <typename TestData>
-void vector_cpu_inplace_mult(TestData & data)
-{
-  typedef typename TestData::value_type   NumericT;
-  data.v3 *= NumericT(2.0);
-}
-
-
-// div kernels:
-template <typename TestData>
-void vector_divide(TestData & data)
-{
-  data.v3 = data.v2 / data.s1;
-}
-
-template <typename TestData>
-void vector_inplace_divide(TestData & data)
-{
-  data.v3 /= data.s1;
-}
-
-
-// other kernels:
-template <typename TestData>
-void vector_inner_prod(TestData & data)
-{
-  data.s1 = viennacl::linalg::inner_prod(data.v1, data.v2);
-}
-
-template <typename TestData>
-void vector_swap(TestData & data)
-{
-  swap(data.v2, data.v3);
-}
-
-template <typename TestData>
-void vector_clear(TestData & data)
-{
-  data.v3.clear();
-}
-
-template <typename TestData>
-void vector_plane_rotation(TestData & data)
-{
-  typedef typename TestData::value_type   NumericT;
-  viennacl::linalg::plane_rotation(data.v1, data.v2, NumericT(1.0), NumericT(2.0)); //a plain vector addition
-}
-
-template <typename TestData>
-void vector_norm_1(TestData & data)
-{
-  data.s1 = viennacl::linalg::norm_1(data.v3);
-}
-
-template <typename TestData>
-void vector_norm_2(TestData & data)
-{
-  data.s1 = viennacl::linalg::norm_2(data.v3);
-}
-
-template <typename TestData>
-void vector_norm_inf(TestData & data)
-{
-  data.s1 = viennacl::linalg::norm_inf(data.v3);
-}
-
-/////////////////// restricted optimization functors /////////////////////
-
-template <typename TestData>
-void vector_index_norm_inf(TestData & data)
-{
-  viennacl::linalg::index_norm_inf(data.v3);
-}
-
-
diff --git a/examples/testdata/eigen/nsm1.example b/examples/testdata/eigen/nsm1.example
new file mode 100644
index 0000000..373ba91
--- /dev/null
+++ b/examples/testdata/eigen/nsm1.example
@@ -0,0 +1,6 @@
+4
+0.066079 0.105255 0.579228 0.648683
+0.776179 0.206674 0.916046 0.692245
+0.568858 0.286236 0.319619 0.885122
+0.967071 0.844799 0.030907 0.152086
+ 1.9491317 -0.4693242 -0.4693242 -0.2660253 0 0.3951408 -0.3951408 0
\ No newline at end of file
diff --git a/examples/testdata/eigen/nsm2.example b/examples/testdata/eigen/nsm2.example
new file mode 100644
index 0000000..6ec5367
--- /dev/null
+++ b/examples/testdata/eigen/nsm2.example
@@ -0,0 +1,12 @@
+10
+0.350777 0.342985 0.656019 0.886465 0.082892 0.439252 0.057091 0.604342 0.934282 0.29226
+0.805981 0.969854 0.332081 0.376548 0.463725 0.558717 0.88037 0.808632 0.625312 0.201817
+0.689934 0.334555 0.052971 0.57211 0.376726 0.06413 0.083496 0.223991 0.01876 0.229517
+0.845132 0.193865 0.982433 0.282181 0.155989 0.150508 0.822947 0.068145 0.781244 0.350917
+0.873082 0.886554 0.793183 0.097007 0.233064 0.669249 0.340846 0.243651 0.869799 0.155046
+0.550637 0.609919 0.46136 0.455629 0.875599 0.235022 0.25192 0.085902 0.171065 0.55162
+0.257691 0.915708 0.585844 0.519667 0.344668 0.618611 0.803099 0.881155 0.58492 0.081828
+0.180549 0.439281 0.974838 0.09279 0.922669 0.050456 0.173874 0.258211 0.779692 0.128164
+0.936339 0.5986 0.513279 0.230474 0.981686 0.262627 0.035318 0.375311 0.416272 0.777941
+0.772113 0.668724 0.106412 0.52638 0.001447 0.205683 0.577281 0.98261 0.111703 0.79517
+ 4.6850737 -0.9401973 -0.5836800 -0.5836800 0.7553005 0.4587716 0.4587716 0.4781890 -0.1659641 -0.1659641 0 0 0.4858211 -0.4858211 0 0.4344098 -0.4344098 0 0.2427032 -0.2427032
\ No newline at end of file
diff --git a/examples/testdata/eigen/nsm3.example b/examples/testdata/eigen/nsm3.example
new file mode 100644
index 0000000..c3fa9f6
--- /dev/null
+++ b/examples/testdata/eigen/nsm3.example
@@ -0,0 +1,273 @@
+271
+0.499058 0.63 0.928904 0.222949 0.659771 0.189218 0.010811 0.16401 0.258427 0.233394 0.269585 0.134039 0.022617 0.284598 0.802732 0.954461 0.343103 0.698718 0.861414 0.522165 0.224366 0.073529 0.867285 0.564235 0.274874 0.658295 0.661164 0.807943 0.073302 0.488852 0.945407 0.063781 0.326843 0.6213 0.580844 0.3946 0.410522 0.336632 0.486422 0.150092 0.115609 0.788946 0.626505 0.097246 0.958659 0.405854 0.818489 0.556441 0.472942 0.150001 0.727597 0.820523 0.219362 0.115344 0.003342 0.1084 [...]
+0.819857 0.202836 0.070657 0.925519 0.972664 0.805338 0.573279 0.571844 0.217435 0.833388 0.816564 0.112783 0.251405 0.919248 0.03879 0.499985 0.046349 0.019053 0.126262 0.693021 0.084512 0.232331 0.445365 0.266492 0.736082 0.094072 0.340987 0.042536 0.408444 0.890268 0.214687 0.348603 0.40949 0.841264 0.999145 0.354305 0.012217 0.815799 0.326009 0.810109 0.946676 0.669175 0.947208 0.610434 0.895018 0.415739 0.65488 0.866514 0.954058 0.312661 0.458395 0.907958 0.047431 0.00464 0.781556 0 [...]
+0.325718 0.860937 0.906584 0.013084 0.177011 0.370394 0.908106 0.395735 0.663405 0.470688 0.471386 0.498568 0.914519 0.221734 0.446436 0.688416 0.033908 0.427195 0.298436 0.762346 0.957145 0.055994 0.131978 0.850448 0.223217 0.786709 0.45156 0.295412 0.809936 0.013186 0.571362 0.358512 0.845368 0.420216 0.017098 0.097602 0.860854 0.183975 0.618219 0.56953 0.288986 0.781082 0.413259 0.912502 0.613829 0.153175 0.979906 0.481657 0.32193 0.345487 0.718212 0.764087 0.488103 0.796863 0.888904  [...]
+0.651158 0.728828 0.396043 0.713393 0.867069 0.293066 0.987101 0.109228 0.869248 0.273324 0.087029 0.635987 0.836719 0.465307 0.05547 0.074739 0.191503 0.345563 0.786914 0.985658 0.609165 0.074548 0.33709 0.279704 0.042712 0.675539 0.373624 0.97992 0.230729 0.002066 0.155114 0.605079 0.669284 0.771651 0.968085 0.645861 0.979113 0.583282 0.403349 0.672301 0.86245 0.040828 0.891169 0.597193 0.282279 0.988271 0.277889 0.893363 0.47077 0.828766 0.284499 0.738118 0.239865 0.261984 0.003749 0. [...]
+0.396549 0.514607 0.491013 0.547011 0.063403 0.307602 0.966421 0.887068 0.893416 0.487729 0.704195 0.75644 0.044167 0.036946 0.569934 0.083177 0.010567 0.063708 0.638796 0.831624 0.434046 0.707992 0.345709 0.00284 0.3064 0.209484 0.327654 0.219575 0.565044 0.097563 0.470752 0.19948 0.00843 0.702135 0.244093 0.950522 0.916797 0.245366 0.831912 0.856208 0.559181 0.252963 0.690113 0.02063 0.187732 0.57036 0.874987 0.341944 0.236835 0.8833 0.472274 0.917359 0.655237 0.250631 0.614693 0.11743 [...]
+0.622804 0.872926 0.442744 0.256531 0.700796 0.270678 0.949425 0.884564 0.221857 0.491484 0.865777 0.235855 0.126244 0.701084 0.171723 0.60388 0.867684 0.377016 0.932944 0.275827 0.96598 0.94661 0.242816 0.763853 0.431777 0.911427 0.119585 0.634805 0.324369 0.646483 0.287975 0.875219 0.863756 0.694223 0.810148 0.571619 0.334029 0.326164 0.748069 0.08442 0.849841 0.928353 0.468963 0.219423 0.576049 0.737998 0.411323 0.539556 0.224514 0.97411 0.421689 0.888072 0.081822 0.736225 0.945825 0. [...]
+0.511573 0.756738 0.643308 0.209063 0.2973 0.979755 0.56271 0.774616 0.752878 0.475495 0.427031 0.944048 0.700985 0.831636 0.059731 0.990163 0.471828 0.534923 0.458122 0.627245 0.464994 0.274885 0.455537 0.410816 0.07813 0.540944 0.821084 0.287015 0.561322 0.218699 0.157849 0.324157 0.082785 0.310223 0.399638 0.098173 0.51598 0.317801 0.571555 0.72421 0.116483 0.574949 0.090889 0.513109 0.332848 0.710003 0.846559 0.809696 0.907152 0.459693 0.881024 0.694129 0.231019 0.779018 0.067219 0.6 [...]
+0.193757 0.426314 0.476058 0.244297 0.904238 0.781046 0.924122 0.786158 0.083239 0.723913 0.037748 0.482684 0.848139 0.611556 0.384603 0.447447 0.704689 0.733049 0.813274 0.065145 0.525819 0.921948 0.926715 0.493712 0.588904 0.224237 0.128751 0.444182 0.526309 0.864419 0.472932 0.370719 0.840241 0.110738 0.013152 0.327813 0.735031 0.060351 0.01788 0.458017 0.320012 0.425811 0.479132 0.841773 0.883098 0.1173 0.042533 0.373008 0.517525 0.73338 0.272423 0.542394 0.690156 0.884742 0.108256 0 [...]
+0.313118 0.3508 0.550876 0.994549 0.772979 0.010574 0.912063 0.548968 0.648289 0.065392 0.713775 0.965273 0.113659 0.621245 0.964376 0.617176 0.980062 0.783359 0.008465 0.313782 0.055565 0.992069 0.669143 0.724826 0.247403 0.213406 0.426713 0.626048 0.79161 0.220753 0.829224 0.639411 0.103728 0.456541 0.898585 0.819905 0.242953 0.891541 0.06795 0.536681 0.070561 0.639252 0.017815 0.374588 0.921248 0.17609 0.565702 0.127241 0.30659 0.13377 0.728621 0.109848 0.313031 0.794524 0.249888 0.32 [...]
+0.854085 0.471544 0.475875 0.583704 0.630908 0.274452 0.07057 0.151351 0.055608 0.635328 0.034446 0.018533 0.330316 0.335032 0.934969 0.777812 0.796966 0.056847 0.733679 0.602195 0.639597 0.139841 0.473873 0.136231 0.443543 0.315208 0.70012 0.283469 0.110536 0.666232 0.733865 0.894343 0.743984 0.790569 0.628912 0.841354 0.672488 0.756068 0.534459 0.929839 0.584125 0.96181 0.081677 0.263187 0.79676 0.207512 0.342749 0.965146 0.527892 0.833913 0.310941 0.704649 0.117685 0.716954 0.035233 0 [...]
+0.11568 0.51929 0.935732 0.440146 0.078717 0.543135 0.830731 0.677499 0.788528 0.554628 0.683452 0.950707 0.530025 0.008314 0.100604 0.772471 0.828466 0.120707 0.4343 0.286755 0.414474 0.187439 0.900418 0.019144 0.910977 0.071907 0.135357 0.262296 0.045348 0.00345 0.506114 0.407513 0.105951 0.571525 0.963952 0.568301 0.800145 0.172279 0.689837 0.166224 0.473441 0.070418 0.425238 0.616866 0.882245 0.990872 0.022277 0.05837 0.864322 0.402924 0.1119 0.001798 0.294416 0.122834 0.91611 0.7642 [...]
+0.634699 0.747317 0.065262 0.103142 0.111929 0.055658 0.431405 0.522946 0.541892 0.065322 0.562464 0.536761 0.183032 0.259062 0.640489 0.049956 0.991043 0.250975 0.441682 0.36834 0.146125 0.572497 0.312687 0.924968 0.686069 0.737882 0.346699 0.931565 0.682012 0.429318 0.997772 0.784675 0.508175 0.941212 0.105605 0.400955 0.921716 0.049774 0.086502 0.35599 0.713938 0.756183 0.821475 0.210211 0.135883 0.920973 0.498708 0.301417 0.882018 0.630151 0.537979 0.099237 0.501559 0.775952 0.409425 [...]
+0.312379 0.697094 0.801251 0.856951 0.8333 0.877512 0.889795 0.119576 0.849483 0.693638 0.002992 0.220895 0.036697 0.042276 0.90165 0.040383 0.434841 0.689545 0.995023 0.604602 0.305478 0.25156 0.378673 0.53238 0.701613 0.593068 0.127635 0.656988 0.082403 0.857777 0.836405 0.011812 0.058001 0.310644 0.73213 0.720738 0.667592 0.643302 0.146552 0.668583 0.293975 0.467099 0.433963 0.965388 0.994404 0.455761 0.256424 0.397465 0.925421 0.192622 0.649277 0.904064 0.283247 0.666505 0.708083 0.1 [...]
+0.260544 0.453079 0.129666 0.423862 0.59758 0.976181 0.680839 0.424061 0.802302 0.745923 0.00606 0.439605 0.274126 0.177479 0.813974 0.260365 0.938849 0.111729 0.733574 0.226221 0.957278 0.639918 0.073662 0.007301 0.652799 0.481116 0.12998 0.771591 0.380683 0.708046 0.957976 0.097174 0.37084 0.515201 0.247537 0.775058 0.643398 0.116186 0.063179 0.315634 0.472253 0.771804 0.522853 0.185561 0.502487 0.074489 0.800226 0.642341 0.789219 0.894175 0.956486 0.957499 0.792236 0.526895 0.114674 0 [...]
+0.555785 0.16099 0.802359 0.658603 0.055291 0.509101 0.802685 0.198541 0.124447 0.981765 0.035937 0.013272 0.435536 0.095835 0.036126 0.700572 0.550476 0.144312 0.420758 0.778554 0.194248 0.801392 0.70942 0.347708 0.449699 0.890768 0.593995 0.374071 0.46594 0.274425 0.689909 0.371462 0.164833 0.466312 0.38455 0.769556 0.020185 0.054515 0.552651 0.057901 0.905665 0.021715 0.910062 0.967166 0.649144 0.664579 0.731787 0.17631 0.852754 0.171776 0.320522 0.071894 0.423421 0.887719 0.270009 0. [...]
+0.630066 0.162747 0.14603 0.892067 0.188363 0.833325 0.083297 0.657405 0.408347 0.084173 0.071049 0.747915 0.813924 0.849706 0.54906 0.190781 0.952575 0.021534 0.900797 0.246732 0.450278 0.799251 0.703685 0.79492 0.242307 0.836513 0.981348 0.317082 0.378564 0.387377 0.108937 0.736729 0.38682 0.358863 0.094304 0.232597 0.525656 0.202774 0.076044 0.275896 0.459472 0.195311 0.397214 0.763105 0.427104 0.047219 0.111705 0.617788 0.626333 0.652538 0.351385 0.444838 0.633148 0.063044 0.835782 0 [...]
+0.141716 0.573218 0.716942 0.233796 0.782187 0.861713 0.868719 0.308553 0.991278 0.015938 0.488724 0.376668 0.381953 0.26922 0.523315 0.575477 0.221493 0.273194 0.272878 0.73937 0.691262 0.833944 0.356333 0.709668 0.953453 0.771395 0.30375 0.399276 0.807999 0.882616 0.597626 0.810583 0.719458 0.722211 0.356617 0.073674 0.884185 0.537201 0.476799 0.092691 0.29313 0.640153 0.567414 0.053697 0.582101 0.632335 0.295352 0.984336 0.647559 0.973138 0.658783 0.455573 0.541563 0.811632 0.580897 0 [...]
+0.834142 0.442772 0.973222 0.982403 0.361471 0.380257 0.940054 0.106048 0.706342 0.154688 0.010082 0.482118 0.665062 0.532421 0.971998 0.28766 0.38547 0.815442 0.097771 0.783509 0.957718 0.820146 0.238714 0.577339 0.780496 0.969125 0.873702 0.804067 0.363178 0.291313 0.767523 0.362725 0.516919 0.987139 0.052031 0.298244 0.476876 0.643099 0.227844 0.337816 0.754718 0.873822 0.357579 0.513951 0.935609 0.984173 0.348404 0.257814 0.382574 0.567533 0.412302 0.0184 0.900509 0.385498 0.79217 0. [...]
+0.626172 0.912329 0.966206 0.413007 0.410778 0.05264 0.104682 0.138348 0.130123 0.126271 0.653295 0.672888 0.364256 0.319194 0.03441 0.951422 0.185862 0.263492 0.603426 0.195765 0.492498 0.981509 0.394293 0.958166 0.159124 0.189121 0.072954 0.285339 0.601757 0.909665 0.674026 0.059973 0.225326 0.53955 0.847241 0.900475 0.356281 0.74449 0.277309 0.852508 0.394147 0.127167 0.81837 0.746732 0.436199 0.184707 0.988464 0.323016 0.829349 0.500536 0.833221 0.510481 0.970965 0.112596 0.234448 0. [...]
+0.719888 0.40735 0.772379 0.094384 0.407664 0.811937 0.020213 0.052337 0.604342 0.099227 0.768882 0.784949 0.880828 0.269704 0.182545 0.333436 0.019543 0.723257 0.354019 0.887628 0.276905 0.174193 0.240625 0.919915 0.937259 0.781553 0.187179 0.672893 0.007232 0.092208 0.958183 0.941361 0.845611 0.086285 0.43137 0.641088 0.077229 0.692812 0.286904 0.891237 0.400608 0.691102 0.054875 0.518026 0.292211 0.285257 0.429013 0.65333 0.497941 0.829471 0.686908 0.237206 0.630933 0.833806 0.891751  [...]
+0.245208 0.68186 0.080426 0.473814 0.42218 0.744757 0.683915 0.952618 0.463716 0.20931 0.217235 0.670657 0.748937 0.160202 0.488208 0.170652 0.157756 0.293983 0.694597 0.477935 0.037873 0.144044 0.126429 0.769701 0.602201 0.882186 0.765759 0.622835 0.835285 0.02007 0.212033 0.817515 0.665815 0.323314 0.293452 0.825965 0.85691 0.12682 0.301334 0.606521 0.357579 0.620392 0.155955 0.428957 0.843207 0.074201 0.940847 0.27016 0.556228 0.819203 0.732377 0.440003 0.795228 0.355884 0.297731 0.13 [...]
+0.433645 0.541491 0.09212 0.399 0.905361 0.484534 0.796023 0.420785 0.293414 0.899689 0.154397 0.496041 0.357051 0.844719 0.748155 0.329344 0.188962 0.951221 0.159894 0.47797 0.976258 0.333572 0.467517 0.862156 0.594561 0.276782 0.662861 0.586645 0.598125 0.130519 0.62252 0.190788 0.953991 0.52881 0.418726 0.286184 0.936781 0.585258 0.035846 0.379477 0.792406 0.859156 0.983145 0.903955 0.512256 0.419872 0.608389 0.18134 0.841586 0.130577 0.134221 0.697469 0.704719 0.37096 0.130955 0.8798 [...]
+0.954117 0.826096 0.28566 0.466297 0.836839 0.993032 0.825027 0.216522 0.32004 0.555259 0.784746 0.970589 0.392105 0.931993 0.401564 0.489209 0.465164 0.455616 0.094163 0.925644 0.920453 0.684751 0.216497 0.792676 0.060988 0.28599 0.205801 0.85949 0.255514 0.213022 0.808735 0.882565 0.014316 0.924152 0.411746 0.352966 0.235552 0.219199 0.195425 0.921328 0.235016 0.891118 0.897918 0.400539 0.065056 0.345899 0.021184 0.368015 0.436664 0.676326 0.358351 0.676476 0.698298 0.899898 0.506124 0 [...]
+0.97475 0.15154 0.307117 0.219006 0.999142 0.515442 0.903015 0.011692 0.225772 0.96793 0.503875 0.708234 0.532585 0.53548 0.377117 0.094317 0.566141 0.39864 0.037563 0.817142 0.887117 0.265381 0.763179 0.504505 0.778437 0.236191 0.211969 0.614343 0.287081 0.949618 0.74582 0.26965 0.30427 0.079819 0.688862 0.891675 0.081994 0.418015 0.819415 0.085032 0.339761 0.349961 0.830917 0.867045 0.278715 0.969835 0.827002 0.478732 0.152037 0.992419 0.880518 0.232347 0.924335 0.567316 0.719043 0.417 [...]
+0.064541 0.027356 0.483257 0.120621 0.359414 0.842416 0.619035 0.781333 0.531715 0.526405 0.92801 0.98265 0.495118 0.307789 0.273283 0.623679 0.919574 0.16949 0.543396 0.811274 0.529677 0.043252 0.08217 0.65711 0.223345 0.104017 0.885307 0.900517 0.199447 0.715751 0.368012 0.019427 0.009561 0.772079 0.420522 0.991523 0.751159 0.193918 0.893426 0.797808 0.633322 0.672939 0.083743 0.779041 0.128423 0.32074 0.064808 0.292432 0.300703 0.842075 0.494961 0.7353 0.927043 0.32588 0.277982 0.5123 [...]
+0.539256 0.854934 0.808868 0.159166 0.027593 0.017791 0.066921 0.072405 0.559174 0.715298 0.333788 0.56069 0.791012 0.477806 0.818716 0.768251 0.380263 0.522602 0.139333 0.132055 0.991433 0.979074 0.664987 0.591247 0.493216 0.669179 0.136093 0.007911 0.017091 0.955143 0.583417 0.03968 0.379088 0.915562 0.430777 0.093918 0.238694 0.530793 0.931113 0.739276 0.686061 0.217817 0.234376 0.245065 0.864392 0.443016 0.886249 0.089792 0.536855 0.974961 0.630706 0.547611 0.389674 0.863653 0.655189 [...]
+0.023917 0.194202 0.357836 0.367272 0.653877 0.330529 0.703618 0.813051 0.895735 0.268967 0.998854 0.348728 0.167986 0.492729 0.860102 0.71315 0.509425 0.635555 0.027759 0.880081 0.972365 0.254832 0.631315 0.868485 0.704224 0.945429 0.756514 0.789629 0.836931 0.851013 0.036853 0.695244 0.409788 0.027888 0.428857 0.845416 0.274374 0.011477 0.250926 0.532294 0.912867 0.592819 0.39603 0.676733 0.658028 0.282974 0.106606 0.904685 0.885634 0.481032 0.155468 0.18424 0.227648 0.134468 0.30382 0 [...]
+0.521675 0.002293 0.629686 0.420898 0.367804 0.860325 0.59045 0.081564 0.511514 0.206902 0.886642 0.841291 0.878041 0.91567 0.001819 0.928359 0.235792 0.774307 0.508984 0.387453 0.712864 0.612544 0.574245 0.282307 0.285349 0.46258 0.250417 0.99711 0.696808 0.111542 0.33721 0.541459 0.974416 0.902522 0.106085 0.312057 0.698054 0.925158 0.792458 0.94111 0.753397 0.94358 0.288293 0.037636 0.822117 0.347185 0.786735 0.942292 0.81178 0.208127 0.360236 0.168202 0.887661 0.327669 0.128979 0.678 [...]
+0.516023 0.328213 0.124517 0.050193 0.332112 0.423036 0.925047 0.200721 0.571867 0.517514 0.007899 0.751576 0.760398 0.540039 0.782996 0.128974 0.257556 0.162338 0.118699 0.610894 0.823905 0.733408 0.827446 0.080859 0.45983 0.567993 0.089838 0.093118 0.961549 0.024277 0.138364 0.565692 0.827094 0.444597 0.576192 0.992468 0.381543 0.298623 0.151668 0.03171 0.533453 0.177165 0.225842 0.809443 0.67789 0.203623 0.065453 0.141227 0.772476 0.697463 0.570358 0.823727 0.072501 0.721475 0.549294  [...]
+0.148644 0.460415 0.390016 0.342889 0.720623 0.721581 0.683987 0.870947 0.59763 0.632117 0.965353 0.60978 0.523821 0.671704 0.166178 0.492037 0.996509 0.949102 0.447012 0.719219 0.289902 0.406734 0.271224 0.564964 0.986114 0.112821 0.031211 0.857489 0.31334 0.74667 0.244146 0.821176 0.332753 0.241625 0.303987 0.713391 0.157916 0.701954 0.006308 0.892403 0.471122 0.316999 0.615654 0.559237 0.561171 0.950209 0.966355 0.346969 0.782061 0.891579 0.893645 0.324365 0.265139 0.144773 0.798437 0 [...]
+0.830887 0.232547 0.087312 0.135856 0.197466 0.732217 0.399677 0.677201 0.067044 0.151802 0.13371 0.735239 0.005585 0.501624 0.720357 0.682326 0.882283 0.060916 0.929695 0.902308 0.48576 0.021008 0.236206 0.781562 0.939102 0.918805 0.595563 0.471567 0.015419 0.928235 0.84978 0.045828 0.15488 0.168198 0.17247 0.645893 0.996661 0.743639 0.378584 0.103109 0.447528 0.704023 0.961791 0.96829 0.119416 0.223151 0.123043 0.909079 0.326974 0.845417 0.536151 0.100821 0.971014 0.464917 0.650695 0.0 [...]
+0.042486 0.933841 0.576141 0.195787 0.188966 0.871216 0.275817 0.452882 0.973077 0.534663 0.953884 0.950673 0.108374 0.384869 0.397701 0.621367 0.870427 0.931435 0.527093 0.679899 0.635785 0.438263 0.314408 0.555274 0.200794 0.344199 0.016985 0.961547 0.083065 0.547182 0.051221 0.263074 0.668087 0.544225 0.559533 0.554176 0.929511 0.296502 0.348974 0.805742 0.416136 0.995284 0.886954 0.683241 0.718835 0.531939 0.431675 0.629647 0.274254 0.391648 0.528786 0.190552 0.32214 0.207508 0.87939 [...]
+0.769954 0.626983 0.729446 0.20159 0.967669 0.979212 0.500841 0.833588 0.284378 0.061561 0.689708 0.425171 0.963458 0.089076 0.9585 0.870753 0.025376 0.810587 0.467488 0.5341 0.782672 0.105273 0.866599 0.445785 0.752111 0.371846 0.597406 0.658022 0.103544 0.964941 0.43202 0.835446 0.89208 0.775242 0.226172 0.611109 0.608417 0.23409 0.973106 0.352432 0.770067 0.68186 0.624371 0.237998 0.445959 0.753214 0.680891 0.318773 0.049225 0.142083 0.895477 0.18123 0.531757 0.413052 0.851632 0.87512 [...]
+0.148257 0.908412 0.50648 0.88969 0.010427 0.739673 0.276514 0.729777 0.219409 0.768558 0.035272 0.848414 0.854396 0.190001 0.316443 0.143801 0.394341 0.514165 0.801195 0.661388 0.15766 0.072702 0.881435 0.845037 0.408345 0.352542 0.400902 0.430158 0.151883 0.455331 0.296644 0.832187 0.72942 0.281108 0.091328 0.308841 0.568309 0.182671 0.613689 0.172996 0.048039 0.641232 0.066495 0.376319 0.240335 0.070067 0.773071 0.479341 0.201932 0.125795 0.574436 0.417234 0.143379 0.807451 0.432601 0 [...]
+0.509606 0.666883 0.561292 0.21728 0.304371 0.041927 0.831128 0.695406 0.428502 0.771388 0.644552 0.306617 0.973926 0.739509 0.232465 0.316272 0.290728 0.26294 0.549598 0.930458 0.573285 0.253783 0.769364 0.511208 0.885609 0.365927 0.752567 0.69445 0.584896 0.592906 0.580158 0.9669 0.062995 0.691228 0.594156 0.509933 0.439214 0.250439 0.47132 0.21488 0.342225 0.23945 0.1834 0.926073 0.442712 0.306334 0.659511 0.429217 0.415967 0.405575 0.841015 0.915282 0.271755 0.253122 0.896588 0.25849 [...]
+0.093407 0.130143 0.013915 0.281555 0.901806 0.811376 0.08067 0.973526 0.914312 0.715223 0.027971 0.396958 0.671409 0.820398 0.671725 0.813496 0.531281 0.259717 0.336229 0.665234 0.028793 0.090829 0.075838 0.584375 0.382209 0.483183 0.087649 0.807554 0.243685 0.077188 0.824491 4.6e-05 0.72303 0.089348 0.954837 0.932709 0.573782 0.175672 0.523809 0.996085 0.0121 0.811968 0.790563 0.364596 0.617706 0.95948 0.343899 0.284845 0.845303 0.915914 0.771349 0.448269 0.209937 0.762946 0.960424 0.2 [...]
+0.550504 0.512018 0.687151 0.1462 0.148982 0.046154 0.204612 0.705459 0.154663 0.309177 0.246338 0.934164 0.980993 0.345935 0.906747 0.623246 0.129792 0.12628 0.689308 0.696747 0.224092 0.069052 0.156075 0.884592 0.529889 0.355869 0.771547 0.979015 0.727201 0.451122 0.927096 0.032297 0.340959 0.707338 0.550423 0.743946 0.398934 0.020125 0.972365 0.127516 0.750004 0.209018 0.588081 0.581401 0.291186 0.193153 0.611378 0.140318 0.909211 0.137127 0.809743 0.381987 0.017639 0.457133 0.000992  [...]
+0.171022 0.971858 0.686661 0.010444 0.722454 0.647088 0.270743 0.095114 0.658513 0.437337 0.535493 0.747793 0.054763 0.532208 0.711088 0.786815 0.163561 0.843069 0.937547 0.312918 0.223137 0.682164 0.917872 0.905948 0.906483 0.027025 0.727036 0.313604 0.253985 0.863422 0.252607 0.76557 0.809652 0.807299 0.316932 0.331826 0.038323 0.725457 0.49828 0.647478 0.135413 0.615726 0.166741 0.210406 0.226626 0.765682 0.405455 0.632886 0.292517 0.301935 0.845469 0.136404 0.369519 0.521226 0.868706 [...]
+0.954751 0.581221 0.882941 0.52443 0.786613 0.940366 0.858148 0.432898 0.87732 0.970276 0.674629 0.71436 0.313146 0.127699 0.184135 0.905095 0.052313 0.787631 0.260647 0.440706 0.42253 0.770088 0.68022 0.931622 0.432804 0.156139 0.69857 0.734543 0.431802 0.491673 0.504212 0.732241 0.873748 0.1035 0.72388 0.178071 0.548983 0.659702 0.553147 0.250902 0.346252 0.119926 0.70074 0.636998 0.753361 0.373168 0.34546 0.004163 0.845027 0.512042 0.711486 0.501954 0.639463 0.726464 0.185201 0.390338 [...]
+0.676558 0.92741 0.338185 0.596788 0.062222 0.294024 0.878368 0.197959 0.374679 0.811203 0.176078 0.395125 0.628597 0.338342 0.861921 0.160193 0.968549 0.047149 0.723708 0.092378 0.339484 0.314994 0.386153 0.192075 0.862605 0.028115 0.192867 0.568344 0.617866 0.770262 0.393087 0.870165 0.899242 0.521144 0.804197 0.06039 0.826663 0.664597 0.802155 0.486622 0.139829 0.6521 0.492339 0.317567 0.194634 0.012802 0.569762 0.391674 0.099346 0.831183 0.40582 0.877384 0.020182 0.243079 0.335899 0. [...]
+0.220333 0.735683 0.289723 0.010385 0.28587 0.572034 0.699344 0.993471 0.904654 0.87195 0.893098 0.911316 0.902195 0.464346 0.158292 0.100338 0.512528 0.531848 0.283595 0.507896 0.406378 0.976094 0.852076 0.488813 0.49092 0.841509 0.168671 0.827665 0.555722 0.848815 0.117333 0.252764 0.686944 0.587851 0.138097 0.85491 0.531775 0.336078 0.130931 0.877737 0.887755 0.675877 0.640575 0.177832 0.80567 0.630829 0.658684 0.566858 0.234097 0.361991 0.868898 0.228239 0.327375 0.074786 0.895228 0. [...]
+0.903312 0.694193 0.33029 0.151086 0.312334 0.277103 0.51377 0.323267 0.128844 0.973342 0.42262 0.347701 0.311183 0.515083 0.673527 0.968735 0.258441 0.50442 0.243636 0.461955 0.854854 0.790651 0.33765 0.948988 0.770445 0.733719 0.913654 0.984822 0.512326 0.13715 0.022987 0.929138 0.938823 0.985227 0.669409 0.320889 0.361989 0.836019 0.599571 0.878631 0.347622 0.445849 0.672178 0.471046 0.195447 0.680927 0.278109 0.624655 0.972577 0.234856 0.971022 0.999239 0.103036 0.021133 0.574442 0.3 [...]
+0.574679 0.184019 0.436939 0.142437 0.660317 0.821624 0.676609 0.390495 0.816254 0.914652 0.049506 0.386082 0.654348 0.422745 0.884893 0.67412 0.049488 0.928609 0.904522 0.864573 0.435124 0.744101 0.405864 0.071131 0.436005 0.859502 0.854433 0.418608 0.991754 0.796787 0.183038 0.55512 0.195831 0.733941 0.857924 0.083666 0.905614 0.242285 0.117667 0.575896 0.018572 0.389131 0.947891 0.468703 0.725658 0.886569 0.396946 0.006684 0.063889 0.499129 0.929914 0.734352 0.491916 0.574367 0.187376 [...]
+0.784854 0.605616 0.016909 0.97875 0.141669 0.539308 0.941772 0.044563 0.712587 0.643685 0.632704 0.342104 0.515871 0.587278 0.5939 0.294273 0.559389 0.250659 0.242017 0.31141 0.955228 0.043218 0.568853 0.23613 0.181869 0.078409 0.890902 0.953165 0.593754 0.860418 0.989931 0.420523 0.109658 0.545399 0.3582 0.650577 0.561114 0.538835 0.569017 0.442034 0.723522 0.669616 0.908663 0.35982 0.920556 0.225321 0.438806 0.957428 0.76598 0.120911 0.564887 0.18708 0.468164 0.770717 0.218616 0.24480 [...]
+0.322164 0.645898 0.303039 0.835919 0.633769 0.354576 0.789275 0.882117 0.284745 0.158689 0.163507 0.584189 0.564463 0.027151 0.33896 0.745099 0.49145 0.157641 0.71634 0.236689 0.605873 0.262817 0.947689 0.309881 0.456653 0.646092 0.450419 0.639545 0.690191 0.492214 0.156643 0.411439 0.194176 0.083604 0.991858 0.167047 0.393634 0.009452 0.95585 0.120347 0.379084 0.59956 0.079936 0.514238 0.191031 0.883448 0.444019 0.525658 0.966024 0.694719 0.016552 0.219734 0.50198 0.041866 0.670567 0.1 [...]
+0.68566 0.127531 0.549707 0.268816 0.608195 0.427877 0.18374 0.69392 0.276768 0.120812 0.389211 0.729448 0.416926 0.17535 0.440206 0.350878 0.544439 0.494292 0.277162 0.736696 0.32339 0.423524 0.732991 0.644621 0.19839 0.685888 0.353821 0.506012 0.384175 0.528348 0.627539 0.852387 0.514304 0.86794 0.816915 0.491502 0.90775 0.698056 0.771298 0.564814 0.551003 0.997463 0.426011 0.639222 0.025378 0.376906 0.817104 0.650429 0.369952 0.4749 0.819931 0.086324 0.019032 0.583729 0.956846 0.33168 [...]
+0.474953 0.356191 0.779869 0.775673 0.611232 0.505435 0.422034 0.007897 0.983781 0.250274 0.249752 0.623697 0.823327 0.713949 0.670066 0.161351 0.65697 0.836753 0.638859 0.441488 0.233215 0.328347 0.621826 0.429067 0.21246 0.434499 0.593938 0.498959 0.312512 0.81984 0.901903 0.647551 0.566456 0.585951 0.643601 0.581089 0.994057 0.881193 0.295271 0.501651 0.042016 0.800891 0.817784 0.659985 0.933114 0.219723 0.834749 0.965837 0.945416 0.332408 0.034051 0.482048 0.817815 0.89877 0.689684 0 [...]
+0.837908 0.430293 0.824871 0.733223 0.746128 0.080289 0.634123 0.22538 0.502135 0.029795 0.844872 0.97253 0.345435 0.195978 0.163713 0.31294 0.368044 0.14549 0.140629 0.346187 0.210036 0.824059 0.607568 0.798334 0.685508 0.840974 0.545906 0.334057 0.768471 0.336438 0.988371 0.489247 0.168715 0.459409 0.464989 0.844858 0.88542 0.714194 0.501149 0.121318 0.871016 0.841207 0.165915 0.051956 0.669099 0.860849 0.619176 0.128793 0.829945 0.160402 0.74727 0.568881 0.417327 0.438782 0.168969 0.7 [...]
+0.199969 0.161917 0.113892 0.989629 0.160943 0.686202 0.201382 0.690581 0.075127 0.594509 0.906761 0.914246 0.755046 0.401588 0.345791 0.062337 0.138314 0.330443 0.346558 0.695516 0.581685 0.092616 0.309683 0.586681 0.33177 0.386122 0.654161 0.763062 0.910468 0.399893 0.084109 0.673972 0.679873 0.834424 0.133276 0.715211 0.015825 0.994764 0.017476 0.060075 0.746102 0.098541 0.078751 0.289859 0.93 0.799577 0.003075 0.857414 0.850153 0.171179 0.597226 0.925419 0.578612 0.751394 0.213117 0. [...]
+0.015699 0.003034 0.06112 0.903375 0.255631 0.418456 0.382174 0.524904 0.144008 0.988331 0.082205 0.93351 0.021802 0.085417 0.917205 0.709898 0.986849 0.908022 0.361066 0.47227 0.727327 0.313884 0.907063 0.069412 0.873779 0.241562 0.757379 0.139931 0.656815 0.84442 0.97663 0.293946 0.2293 0.97092 0.224604 0.195156 0.619477 0.088832 0.358197 0.641338 0.969567 0.924468 0.684861 0.359023 0.334433 0.322113 0.295415 0.551209 0.902763 0.182718 0.511126 0.620239 0.605726 0.462392 0.443272 0.330 [...]
+0.227039 0.442576 0.30215 0.097132 0.369056 0.695052 0.102632 0.5507 0.028531 0.032676 0.294188 0.537332 0.780941 0.739555 0.167645 0.914913 0.424844 0.236647 0.175665 0.291019 0.199017 0.79167 0.260813 0.550114 0.880813 0.558662 0.622812 0.312659 0.098985 0.870681 0.954895 0.153307 0.737298 0.892628 0.005017 0.511401 0.39808 0.768213 0.094838 0.046469 0.463762 0.628404 0.432692 0.077758 0.45952 0.217538 0.897627 0.081111 0.885759 0.843389 0.302743 0.577428 0.905273 0.201576 0.885592 0.7 [...]
+0.438931 0.396302 0.675453 0.685164 0.190012 0.048938 0.001692 0.526974 0.32369 0.426071 0.734422 0.158973 0.055827 0.464322 0.650997 0.29638 0.9723 0.221708 0.302671 0.341966 0.007214 0.607127 0.061955 0.435955 0.67587 0.149988 0.408522 0.567516 0.785244 0.912783 0.80196 0.288194 0.191108 0.674578 0.016596 0.209925 0.491821 0.336756 0.051955 0.259954 0.148529 0.037989 0.652246 0.912368 0.424078 0.742279 0.704745 0.796994 0.259822 0.47024 0.281094 0.466672 0.457373 0.470768 0.72252 0.792 [...]
+0.108562 0.587183 0.266926 0.530744 0.32526 0.849287 0.539408 0.072776 0.069471 0.997377 0.853761 0.059558 0.119431 0.10983 0.224617 0.141718 0.308235 0.890873 0.440874 0.213044 0.809615 0.494212 0.966505 0.434407 0.87779 0.888376 0.87009 0.890483 0.092743 0.975564 0.372042 0.877784 0.004636 0.402104 0.224978 0.800144 0.735812 0.296544 0.576633 0.280087 0.395386 0.738213 0.130554 0.847925 0.670329 0.102917 0.59519 0.052101 0.114737 0.099671 0.294403 0.743188 0.829063 0.132701 0.683242 0. [...]
+0.41828 0.73689 0.006629 0.414806 0.055141 0.654374 0.660008 0.524334 0.366997 0.176103 0.95402 0.528947 0.424218 0.761964 0.382511 0.55799 0.917538 0.226733 0.031676 0.044621 0.574774 0.567365 0.916435 0.623294 0.826907 0.177994 0.178323 0.922009 0.095114 0.661785 0.602065 0.566007 0.072598 0.60581 0.261287 0.73118 0.345476 0.447662 0.956249 0.801431 0.317896 0.480927 0.854602 0.253937 0.931168 0.942351 0.698357 0.991275 0.058295 0.608196 0.313632 0.852526 0.851281 0.788458 0.975035 0.4 [...]
+0.548501 0.852123 0.266154 0.874548 0.82604 0.19557 0.321651 0.511201 0.828228 0.384769 0.13459 0.577461 0.865242 0.370817 0.169062 0.366973 0.633864 0.949083 0.3968 0.948704 0.017268 0.028836 0.493937 0.16924 0.897513 0.666911 0.56449 0.373693 0.975903 0.431024 0.172874 0.940049 0.388446 0.652356 0.22866 0.860754 0.130022 0.802537 0.129331 0.318447 0.440368 0.638293 0.664231 0.594048 0.589303 0.612848 0.569264 0.165268 0.264145 0.595651 0.563406 0.176673 0.466437 0.152203 0.183583 0.262 [...]
+0.63358 0.620159 0.609206 0.399461 0.123453 0.740761 0.109853 0.58986 0.074138 0.010727 0.742828 0.701955 0.165752 0.204447 0.947567 0.659215 0.378481 0.133662 0.002406 0.364683 0.371577 0.655499 0.617735 0.747712 0.43956 0.125025 0.178419 0.061884 0.211151 0.154278 0.044996 0.163548 0.253845 0.591926 0.97036 0.981884 0.045193 0.170312 0.27224 0.708035 0.520419 0.111756 0.20663 0.927813 0.687319 0.377394 0.3328 0.215945 0.630389 0.924129 0.825927 0.352381 0.443969 0.535538 0.279526 0.055 [...]
+0.209955 0.009167 0.72642 0.102833 0.256466 0.836265 0.502645 0.40207 0.286137 0.337002 0.411212 0.350277 0.188927 0.128102 0.828306 0.148225 0.60844 0.245441 0.33855 0.12042 0.962051 0.714489 0.920376 0.48284 0.18476 0.964912 0.698496 0.42245 0.290563 0.932647 0.235755 0.376169 0.317362 0.110736 0.962066 0.18486 0.292505 0.563138 0.913779 0.391509 0.457688 0.024375 0.451356 0.06218 0.339235 0.096386 0.242117 0.661896 0.710899 0.90912 0.011923 0.485322 0.581624 0.721737 0.461852 0.012888 [...]
+0.531649 0.483912 0.074335 0.229242 0.722392 0.444332 0.161109 0.743529 0.944813 0.442217 0.901359 0.65324 0.021054 0.821414 0.084795 0.493488 0.639847 0.657903 0.206765 0.797757 0.741058 0.908396 0.663817 0.078891 0.800151 0.440068 0.209833 0.394222 0.805924 0.891195 0.616312 0.176244 0.877264 0.145693 0.211155 0.874363 0.022382 0.468388 0.016516 0.810566 0.592446 0.08161 0.821304 0.662114 0.23439 0.751556 0.689389 0.963292 0.205075 0.700177 0.199079 0.177937 0.544763 0.36001 0.992919 0 [...]
+0.646458 0.011917 0.712799 0.155677 0.071395 0.860197 0.576117 0.279303 0.749743 0.624193 0.353814 0.332062 0.111381 0.75687 0.367192 0.380786 0.953072 0.747492 0.879541 0.80851 0.651124 0.173781 0.851883 0.65982 0.038033 0.557223 0.107346 0.367806 0.885561 0.495269 0.905014 0.066549 0.940758 0.600096 0.092716 0.292551 0.461032 0.791372 0.106794 0.130465 0.605467 0.337023 0.441005 0.270373 0.972969 0.449697 0.29695 0.106709 0.809747 0.433719 0.293872 0.541405 0.473666 0.708609 0.828313 0 [...]
+0.23555 0.272485 0.692594 0.429853 0.450918 0.747391 0.511161 0.106076 0.104823 0.022154 0.898557 0.293888 0.666955 0.583881 0.839875 0.383783 0.122339 0.013112 0.815868 0.674991 0.027606 0.751759 0.940714 0.211663 0.360827 0.779471 0.510521 0.923986 0.892777 0.130217 0.0666 0.544045 0.234361 0.39376 0.669942 0.553043 0.305453 0.029009 0.418281 0.851228 0.007334 0.331435 0.440359 0.683062 0.604881 0.956598 0.005266 0.773832 0.280833 0.587533 0.39815 0.881959 0.992267 0.48935 0.075243 0.7 [...]
+0.689123 0.594651 0.119639 0.304872 0.953189 0.257157 0.425518 0.086424 0.601459 0.951351 0.892104 0.724595 0.145039 0.005196 0.310689 0.748122 0.367946 0.391008 0.359011 0.942363 0.575107 0.658895 0.655917 0.866257 0.341343 0.892381 0.435577 0.280674 0.143086 0.163389 0.522811 0.48663 0.854984 0.399745 0.285334 0.19431 0.476374 0.15792 0.845499 0.61366 0.646175 0.701363 0.226486 0.968318 0.317813 0.213584 0.218889 0.811861 0.516153 0.064888 0.609695 0.477865 0.016289 0.151438 0.931944 0 [...]
+0.625932 0.453531 0.492107 0.218139 0.206374 0.396366 0.435055 0.325193 0.030948 0.871479 0.907263 0.727808 0.660908 0.463132 0.458167 0.751752 0.417665 0.098156 0.44679 0.857493 0.623975 0.345097 0.603148 0.018477 0.593858 0.152043 0.779861 0.192197 0.645173 0.061048 0.793525 0.25171 0.733912 0.054425 0.635101 0.076409 0.708899 0.71532 0.708295 0.326955 0.947483 0.56809 0.844374 0.527148 0.593965 0.895607 0.060364 0.943058 0.072886 0.397973 0.918017 0.935896 0.688235 0.16639 0.45309 0.9 [...]
+0.56292 0.431453 0.657826 0.431136 0.604354 0.634039 0.400109 0.810949 0.841481 0.853475 0.727299 0.593375 0.982789 0.557698 0.136151 0.711186 0.069665 0.164777 0.388019 0.106537 0.174646 0.181427 0.785573 0.616662 0.420258 0.574589 0.251117 0.337332 0.158691 0.999272 0.744548 0.088948 0.829139 0.919459 0.228055 0.258781 0.857647 0.327887 0.306285 0.629177 0.369549 0.451413 0.430021 0.439849 0.308964 0.594177 0.434486 0.993033 0.986355 0.58943 0.670943 0.882549 0.395651 0.302895 0.050174 [...]
+0.603796 0.575774 0.441048 0.928372 0.770049 0.335383 0.549188 0.468785 0.33335 0.845022 0.920344 0.41502 0.318676 0.674525 0.57539 0.772463 0.551127 0.682805 0.560677 0.253967 0.466726 0.152928 0.999853 0.715681 0.754654 0.385539 0.595208 0.573883 0.351213 0.960945 0.353524 0.390834 0.504859 0.709293 0.202202 0.076436 0.484894 0.457516 0.083096 0.634976 0.135644 0.335905 0.149217 0.791845 0.047857 0.280326 0.910078 0.049838 0.08907 0.227628 0.367018 0.445155 0.188975 0.52659 0.376639 0. [...]
+0.095909 0.742601 0.271603 0.652683 0.028163 0.673817 0.785723 0.382913 0.111416 0.059417 0.172623 0.829913 0.232663 0.00366 0.801048 0.496383 0.798739 0.420266 0.74378 0.736585 0.584271 0.511955 0.267972 0.865093 0.033195 0.884249 0.625701 0.720425 0.208305 0.40726 0.316881 0.271457 0.015622 0.500586 0.773415 0.097513 0.518268 0.420945 0.92873 0.371864 0.958475 0.275522 0.580585 0.119142 0.793848 0.297529 0.219324 0.875617 0.352838 0.236521 0.038409 0.913841 0.119637 0.529686 0.602144 0 [...]
+0.891387 0.107318 0.983719 0.925889 0.634191 0.350108 0.647571 0.889949 0.303462 0.5615 0.22093 0.033192 0.689995 0.248703 0.542257 0.863423 0.470579 0.939128 0.353382 0.98638 0.098893 0.673348 0.066246 0.807942 0.957328 0.753657 0.967439 0.544049 0.798616 0.240806 0.601639 0.306356 0.375216 0.895977 0.924196 0.797769 0.384014 0.223942 0.310736 0.062249 0.207453 0.944808 0.421393 0.690699 0.690221 0.83722 0.500846 0.169167 0.190393 0.517125 0.860814 0.270322 0.15332 0.603163 0.912165 0.9 [...]
+0.746452 0.66285 0.140361 0.695915 0.475069 0.880751 0.209292 0.098564 0.497513 0.302974 0.127912 0.135788 0.106153 0.098718 0.534812 0.425278 0.489239 0.830615 0.5093 0.817427 0.749826 0.922794 0.617509 0.543353 0.658356 0.069648 0.731992 0.506103 0.704026 0.240424 0.142282 0.112653 0.007741 0.647303 0.914936 0.535796 0.101346 0.158585 0.633687 0.572362 0.690449 0.210569 0.383458 0.970291 0.947482 0.561654 0.701764 0.511438 0.933432 0.759138 0.802386 0.747712 0.275174 0.688553 0.90673 0 [...]
+0.313849 0.547473 0.353224 0.755803 0.610434 0.63785 0.198313 0.912768 0.952501 0.39029 0.941417 0.886067 0.912731 0.864613 0.834607 0.068024 0.977121 0.951638 0.681506 0.375123 0.626646 0.430852 0.492612 0.577335 0.819918 0.676518 0.631175 0.83539 0.679624 0.986463 0.543004 0.899303 0.940149 0.627054 0.924931 0.274788 0.101264 0.367918 0.847294 0.038718 0.445001 0.088005 0.92146 0.712189 0.995049 0.408159 0.32365 0.302337 0.525182 0.156383 0.650813 0.334747 0.561689 0.194965 0.468677 0. [...]
+0.06408 0.58825 0.50922 0.236164 0.003682 0.990266 0.581167 0.132959 0.13957 0.635102 0.005309 0.962559 0.508967 0.576957 0.585711 0.569841 0.373126 0.662267 0.306181 0.417255 0.910968 0.653668 0.282455 0.297113 0.775134 0.551415 0.496978 0.724439 0.062882 0.806218 0.021504 0.419243 0.106113 0.12107 0.356905 0.705125 0.069121 0.215053 0.968838 0.530482 0.247671 0.320668 0.863349 0.821577 0.541265 0.37843 0.212525 0.648502 0.523576 0.362463 0.19965 0.682159 0.506387 0.483115 0.909609 0.60 [...]
+0.552217 0.862572 0.9575 0.305578 0.904215 0.385384 0.533162 0.255507 0.603056 0.522767 0.242378 0.27944 0.475633 0.988517 0.625569 0.458274 0.38995 0.501285 0.802629 0.914872 0.808956 0.416484 0.301189 0.356587 0.89169 0.023417 0.019937 0.277413 0.539994 0.277793 0.381792 0.36296 0.309206 0.750315 0.278936 0.310963 0.464936 0.837025 0.76547 0.605443 0.433309 0.889108 0.336401 0.159534 0.577458 0.429416 0.756597 0.33582 0.66871 0.15624 0.374152 0.293123 0.784162 0.066173 0.545605 0.21416 [...]
+0.759724 0.621266 0.971847 0.878872 0.973743 0.685295 0.131657 0.971669 0.411197 0.944911 0.796778 0.726134 0.993352 0.256989 0.162198 0.180809 0.603532 0.913428 0.210674 0.218585 0.834082 0.873548 0.452373 0.181048 0.568919 0.263454 0.429853 0.34232 0.191254 0.445566 0.021327 0.714547 0.71627 0.73199 0.077732 0.17438 0.778017 0.532337 0.219444 0.450823 0.854051 0.257853 0.19822 0.498585 0.377041 0.685835 0.64278 0.705205 0.76761 0.656666 0.352714 0.96338 0.395045 0.468607 0.938107 0.028 [...]
+0.756246 0.839253 0.075702 0.437163 0.347983 0.208725 0.886939 0.632202 0.561745 0.769882 0.414046 0.271751 0.825734 0.236111 0.248783 0.010765 0.814173 0.423821 0.307835 0.000452 0.406335 0.979619 0.107089 0.444713 0.48299 0.622456 0.378342 0.516051 0.309811 0.255347 0.811797 0.94681 0.298092 0.942352 0.349274 0.472688 0.161469 0.707816 0.884139 0.53722 0.565801 0.84172 0.182623 0.898444 0.408572 0.507291 0.176547 0.861682 0.107627 0.27043 0.316637 0.067283 0.236665 0.860838 0.904916 0. [...]
+0.975416 0.136288 0.982123 0.624014 0.104491 0.949768 0.188195 0.107601 0.239832 0.224191 0.986928 0.271268 0.276088 0.918647 0.843436 0.025471 0.900133 0.822652 0.123654 0.030336 0.680902 0.818529 0.715927 0.27711 0.984656 0.133665 0.353076 0.223122 0.782901 0.791379 0.39026 0.510191 0.354866 0.292996 0.061164 0.361383 0.085236 0.541279 0.114778 0.795298 0.587535 0.270577 0.873315 0.398962 0.967309 0.846376 0.167477 0.131139 0.899142 0.033397 0.423009 0.845379 0.503177 0.40828 0.011015  [...]
+0.807723 0.771474 0.848379 0.33237 0.813811 0.358349 0.698567 0.687387 0.505699 0.903646 0.580509 0.024469 0.665423 0.011748 0.60847 0.494808 0.736231 0.648018 0.809928 0.563438 0.144316 0.317092 0.381729 0.696413 0.046602 0.145501 0.508248 0.409143 0.880441 0.390832 0.331258 0.957446 0.142335 0.880874 0.30694 0.406558 0.639645 0.314546 0.282876 0.595534 0.587236 0.68573 0.650922 0.168811 0.486819 0.75888 0.143689 0.0152 0.892705 0.891133 0.015939 0.168529 0.334407 0.280888 0.332664 0.39 [...]
+0.209692 0.465276 0.818997 0.465288 0.250082 0.12433 0.676554 0.772253 0.140667 0.934471 0.820497 0.675867 0.554677 0.89141 0.597883 0.853422 0.685168 0.983303 0.487659 0.084329 0.835702 0.776728 0.948335 0.779748 0.110164 0.184495 0.170978 0.478184 0.147556 0.652625 0.84332 0.10458 0.123132 0.243563 0.005774 0.747148 0.157258 0.268623 0.489 0.501504 0.411858 0.254574 0.340652 0.016425 0.286009 0.198045 0.855551 0.571549 0.529959 0.211615 0.517012 0.484694 0.379003 0.538331 0.087142 0.73 [...]
+0.201714 0.401425 0.245904 0.366254 0.48123 0.649435 0.772508 0.801894 0.508558 0.096596 0.41016 0.905712 0.936575 0.850533 0.683897 0.148776 0.096997 0.648068 0.445243 0.466201 0.469271 0.458863 0.945266 0.876697 0.367323 0.648203 0.279052 0.98011 0.251147 0.55343 0.088438 0.909035 0.952158 0.190643 0.738742 0.043063 0.985232 0.270825 0.566462 0.631982 0.797368 0.869521 0.260348 0.242987 0.865289 0.043426 0.142343 0.448532 0.34995 0.42001 0.442944 0.449877 0.140058 0.358161 0.393223 0.5 [...]
+0.934609 0.869786 0.352991 0.689723 0.171513 0.872568 0.408423 0.878503 0.775353 0.330061 0.468006 0.602366 0.86822 0.945238 0.201836 0.617739 0.214235 0.285679 0.439622 0.516286 0.844247 0.057006 0.089926 0.9772 0.492571 0.93983 0.104561 0.021346 0.681977 0.059087 0.252527 0.581455 0.028032 0.221555 0.211849 0.850029 0.436121 0.739946 0.726909 0.895019 0.82689 0.553359 0.096888 0.275162 0.761052 0.902028 0.368558 0.115273 0.277889 0.432287 0.899797 0.485578 0.237787 0.80887 0.300864 0.9 [...]
+0.359463 0.992508 0.995122 0.757966 0.769245 0.661695 0.694866 0.232742 0.900488 0.507314 0.71076 0.488237 0.926184 0.936178 0.042765 0.166462 0.690866 0.39565 0.193151 0.059981 0.265554 0.502763 0.294099 0.496391 0.846652 0.068746 0.390882 0.05002 0.95223 0.824464 0.503955 0.578167 0.536096 0.557974 0.240527 0.51942 0.629868 0.770153 0.520283 0.337664 0.327758 0.413637 0.372991 0.351364 0.237977 0.994064 0.199542 0.529874 0.68502 0.576181 0.605954 0.36371 0.041404 0.827565 0.603818 0.78 [...]
+0.529587 0.128395 0.681822 0.243175 0.418849 0.13231 0.335902 0.557711 0.482271 0.889577 0.213803 0.839221 0.705594 0.109193 0.280083 0.297752 0.843511 0.038922 0.872143 0.303392 0.437889 0.784438 0.593905 0.998953 0.490217 0.117989 0.942974 0.669184 0.069406 0.199535 0.151752 0.690211 0.389232 0.416649 0.555928 0.215763 0.712545 0.515283 0.439344 0.816451 0.784975 0.400446 0.060133 0.24533 0.101428 0.753518 0.407546 0.813817 0.874489 0.222091 0.255509 0.078932 0.600693 0.154184 0.023631 [...]
+0.081017 0.553809 0.613073 0.456573 0.876959 0.401786 0.480961 0.728456 0.296426 0.903643 0.001721 0.221744 0.000816 0.47068 0.318703 0.644465 0.32811 0.900611 0.351004 0.325461 0.254102 0.032797 0.671657 0.16677 0.623288 0.698881 0.729248 0.67908 0.634578 0.961028 0.11182 0.423573 0.538267 0.576662 0.79283 0.755479 0.003358 0.827846 0.40948 0.593207 0.914793 0.430463 0.921272 0.601112 0.770585 0.459861 0.870449 0.191108 0.016 0.208093 0.103633 0.971986 0.480837 0.814582 0.912136 0.16008 [...]
+0.434048 0.317557 0.153822 0.388469 0.304152 0.920632 0.214835 0.981187 0.678031 0.467277 0.302305 0.371989 0.664559 0.446731 0.832708 0.987927 0.127015 0.0846 0.486823 0.782339 0.83937 0.844535 0.725493 0.462942 0.562283 0.557878 0.690761 0.516661 0.622871 0.633624 0.512632 0.60358 0.759982 0.067463 0.973281 0.257427 0.306723 0.446646 0.891407 0.448656 0.773764 0.920823 0.195334 0.533724 0.764439 0.845207 0.273682 0.848947 0.681989 0.531079 0.330412 0.542141 0.777117 0.364642 0.68528 0. [...]
+0.758178 0.536901 0.720292 0.496426 0.601849 0.40794 0.38285 0.313028 0.296157 0.994049 0.716747 0.01228 0.035889 0.112673 0.763251 0.680901 0.080776 0.859749 0.337635 0.844713 0.121609 0.654516 0.253668 0.521553 0.112578 0.325845 0.295513 0.999725 0.264995 0.971646 0.558523 0.382442 0.239029 0.349717 0.780417 0.101356 0.733017 0.061309 0.928265 0.981022 0.041403 0.355773 0.750319 0.146838 0.498047 0.494135 0.121035 0.35236 0.574368 0.441117 0.882949 0.065676 0.076715 0.721707 0.606123 0 [...]
+0.825838 0.439719 0.477759 0.421907 0.653923 0.161078 0.45044 0.751022 0.255876 0.862857 0.63368 0.843404 0.06789 0.913827 0.838111 0.741119 0.533757 0.638917 0.728577 0.782995 0.455167 0.498755 0.337823 0.141931 0.073419 0.134108 0.511489 0.442536 0.397207 0.732166 0.480275 0.805339 0.94875 0.706182 0.705634 0.966909 0.90471 0.688379 0.079557 0.991641 0.252062 0.070159 0.308484 0.53827 0.142969 0.711898 0.370036 0.242947 0.718715 0.818617 0.861958 0.15975 0.272485 0.467671 0.173548 0.68 [...]
+0.419255 0.173982 0.353404 0.988233 0.094213 0.255004 0.526886 0.518261 0.336082 0.810523 0.211869 0.422859 0.875782 0.451954 0.670586 0.72332 0.603336 0.121156 0.315446 0.431675 0.408972 0.339174 0.832593 0.813727 0.743648 0.491024 0.397477 0.143062 0.69503 0.070891 0.938413 0.042092 0.01232 0.472832 0.144296 0.569667 0.858948 0.736563 0.800082 0.694073 0.009856 0.16059 0.675241 0.019493 0.314395 0.173468 0.196448 0.549433 0.52125 0.66501 0.233644 0.080965 0.440528 0.238899 0.422394 0.2 [...]
+0.705313 0.351586 0.076791 0.985626 0.31931 0.437081 0.464506 0.409759 0.364583 0.993886 0.3809 0.140789 0.188799 0.616168 0.01332 0.452115 0.628445 0.420063 0.435074 0.042334 0.108007 0.946634 0.610736 0.802811 0.337252 0.194241 0.169522 0.144998 0.962938 0.837645 0.856734 0.158483 0.224687 0.73446 0.947124 0.985809 0.883861 0.626207 0.609329 0.548754 0.911774 0.703316 0.017129 0.329511 0.830372 0.27757 0.418522 0.771148 0.29272 0.500909 0.357198 0.860524 0.404545 0.190298 0.625213 0.08 [...]
+0.24311 0.19178 0.832456 0.265757 0.274721 0.472748 0.329239 0.179292 0.264878 0.95364 0.216682 0.719731 0.181643 0.520097 0.540929 0.772186 0.752848 0.455871 0.850745 0.472347 0.019686 0.997841 0.1217 0.380277 0.887033 0.44387 0.639511 0.661771 0.275466 0.51365 0.636579 0.992011 0.47692 0.265344 0.889892 0.215178 0.858187 0.876308 0.924206 0.587005 0.641662 0.307205 0.290916 0.916859 0.494469 0.17114 0.021242 0.005852 0.007474 0.582111 0.862288 0.205967 0.659353 0.175225 0.099663 0.9229 [...]
+0.603023 0.56738 0.386987 0.459184 0.443711 0.511775 0.66071 0.32911 0.00888 0.957876 0.412244 0.662795 0.711105 0.969193 0.983125 0.928258 0.235961 0.408608 0.229106 0.998457 0.363581 0.651942 0.027975 0.346571 0.331481 0.876434 0.992098 0.083782 0.88228 0.111902 0.774248 0.579509 0.601146 0.440434 0.803094 0.49312 0.027472 0.990713 0.979805 0.20704 0.258799 0.396756 0.393179 0.044275 0.529745 0.494258 0.936806 0.628672 0.929221 0.860279 0.475585 0.747696 0.956008 0.73361 0.917454 0.527 [...]
+0.989025 0.262096 0.075501 0.906867 0.691412 0.694887 0.526787 0.37473 0.985547 0.18498 0.336879 0.490646 0.551474 0.070344 0.574752 0.435298 0.255622 0.709178 0.683948 0.229307 0.417082 0.331915 0.568153 0.36355 0.024127 0.654554 0.73718 0.874693 0.881631 0.737255 0.86779 0.895564 0.759936 0.917071 0.82144 0.323607 0.564741 0.473931 0.815871 0.089949 0.316261 0.502985 0.430755 0.855113 0.094914 0.468428 0.615677 0.150013 0.338821 0.365457 0.066691 0.67909 0.162236 0.899807 0.122539 0.67 [...]
+0.507878 0.172247 0.996681 0.435484 0.335081 0.326063 0.091952 0.880152 0.430218 0.061551 0.693593 0.792099 0.186445 0.966774 0.658888 0.520237 0.024593 0.465439 0.709876 0.939072 0.740304 0.001864 0.425974 0.480438 0.740823 0.577292 0.085785 0.466356 0.3812 0.053999 0.659101 0.554571 0.293455 0.861625 0.915375 0.968789 0.336868 0.400585 0.808625 0.713884 0.604957 0.345613 0.661151 0.985215 0.076933 0.145665 0.484919 0.254648 0.301112 0.347004 0.780121 0.832354 0.127485 0.465693 0.83476  [...]
+0.054233 0.553896 0.124159 0.508513 0.804652 0.190713 0.657607 0.428321 0.349595 0.373058 0.856321 0.505112 0.547025 0.642594 0.324485 0.490108 0.146657 0.11215 0.806767 0.661759 0.774388 0.926677 0.001324 0.996567 0.140322 0.83573 0.404515 0.290933 0.747684 0.54582 0.808379 0.267001 0.951257 0.99553 0.795245 0.553494 0.918983 0.723499 0.704599 0.682027 0.598332 0.468183 0.004064 0.630444 0.01448 0.641898 0.444978 0.178284 0.547156 0.722333 0.733554 0.421701 0.262526 0.877432 0.559499 0. [...]
+0.13272 0.570508 0.467087 0.994269 0.851748 0.277689 0.794122 0.551916 0.964506 0.180748 0.742432 0.930095 0.285876 0.188476 0.958538 0.171722 0.116313 0.680139 0.640412 0.244352 0.922831 0.522206 0.245997 0.54642 0.343013 0.584589 0.136095 0.231145 0.669753 0.21229 0.435455 0.40525 0.842022 0.600143 0.811841 0.923316 0.12887 0.096888 0.12053 0.851788 0.100825 0.153018 0.587821 0.635289 0.1817 0.652211 0.251583 0.574934 0.603574 0.90748 0.745024 0.191191 0.551355 0.261846 0.32077 0.48134 [...]
+0.565867 0.203517 0.071201 0.935444 0.413134 0.304972 0.776851 0.503314 0.412403 0.231666 0.287218 0.169613 0.394697 0.236689 0.34652 0.324195 0.587625 0.640534 0.499545 0.469716 0.237702 0.047151 0.324498 0.635276 0.488978 0.931153 0.913812 0.977626 0.043384 0.251822 0.393154 0.486794 0.574511 0.296537 0.279742 0.813853 0.884477 0.845401 0.84019 0.742959 0.187246 0.755551 0.105051 0.286129 0.948638 0.802542 0.843162 0.757273 0.529423 0.561981 0.641873 0.090943 0.065504 0.09994 0.509486  [...]
+0.612896 0.47072 0.862642 0.771202 0.122323 0.972648 0.566798 0.761456 0.285805 0.885222 0.523587 0.219035 0.896042 0.551844 0.518404 0.549638 0.339374 0.005897 0.25361 0.130652 0.82444 0.408354 0.892174 0.199201 0.698452 0.569561 0.058369 0.932077 0.125102 0.472363 0.461766 0.698473 0.918503 0.229603 0.520494 0.465675 0.813758 0.113347 0.685775 0.790604 0.646723 0.142418 0.439301 0.810817 0.012307 0.091177 0.411106 0.191446 0.732345 0.749442 0.726854 0.763153 0.306431 0.451196 0.076374  [...]
+0.592549 0.440178 0.548182 0.384326 0.450279 0.220169 0.108414 0.415243 0.432124 0.193829 0.333271 0.813262 0.117548 0.744233 0.098323 0.910377 0.147544 0.790776 0.066201 0.678669 0.899766 0.894522 0.348839 0.191035 0.491134 0.080406 0.441327 0.729128 0.150675 0.43329 0.879419 0.329418 0.133016 0.171435 0.876545 0.21706 0.855883 0.278641 0.5578 0.153716 0.764684 0.597716 0.250763 0.70288 0.990425 0.363486 0.695686 0.569583 0.512328 0.788785 0.87522 0.424803 0.215024 0.578313 0.529337 0.5 [...]
+0.550585 0.469469 0.107621 0.472604 0.547099 0.918913 0.614984 0.115566 0.091171 0.326396 0.877197 0.460319 0.010216 0.474991 0.838615 0.952648 0.950873 0.905351 0.099318 0.89363 0.497474 0.119909 0.980872 0.251671 0.008661 0.710273 0.357353 0.793717 0.931721 0.949716 0.994449 0.538321 0.538106 0.609986 0.103036 0.943535 0.74664 0.629575 0.567214 0.012012 0.264143 0.580006 0.477114 0.956294 0.911966 0.104022 0.823368 0.363088 0.690951 0.798144 0.912119 0.443153 0.480661 0.550394 0.805425 [...]
+0.472358 0.062352 0.056492 0.685264 0.434794 0.562057 0.114224 0.554281 0.956835 0.475045 0.12093 0.339766 0.212964 0.932229 0.355462 0.126841 0.716748 0.841884 0.040282 0.694421 0.412664 0.663624 0.113806 0.611452 0.237354 0.043928 0.483023 0.335237 0.792342 0.674192 0.267028 0.204691 0.395114 0.545365 0.907868 0.434799 0.209162 0.749438 0.881306 0.105718 0.294052 0.102049 0.265597 0.273378 0.987897 0.145348 0.917171 0.439794 0.076561 0.996529 0.301064 0.461577 0.812895 0.742371 0.41560 [...]
+0.237284 0.95561 0.339748 0.729602 0.521223 0.993144 0.184668 0.900347 0.815844 0.129619 0.169271 0.07649 0.652338 0.340631 0.679837 0.672414 0.777171 0.950409 0.311122 0.162659 0.340751 0.638368 0.634411 0.764081 0.696229 0.794176 0.657001 0.341351 0.576374 0.402389 0.056338 0.833866 0.193132 0.020584 0.479944 0.086854 0.552227 0.998156 0.894856 0.44918 0.667928 0.846498 0.80688 0.697354 0.985041 0.840929 0.257881 0.143077 0.553905 0.961793 0.672388 0.518714 0.562225 0.899394 0.326827 0 [...]
+0.384848 0.302111 0.36591 0.056583 0.5612 0.503213 0.288521 0.715384 0.543008 0.832287 0.168924 0.537802 0.780214 0.914648 0.336707 0.130982 0.159958 0.677795 0.029003 0.987673 0.972082 0.276794 0.996736 0.173004 0.168545 0.632422 0.798248 0.174254 0.818492 0.388183 0.576352 0.529823 0.636631 0.672421 0.266454 0.531781 0.166426 0.092417 0.217906 0.116532 0.324302 0.174742 0.848064 0.634648 0.966813 0.887019 0.21071 0.95088 0.380813 0.070096 0.319507 0.727959 0.974837 0.225638 0.651716 0. [...]
+0.071953 0.054965 0.845772 0.211227 0.519434 0.66705 0.69028 0.771435 0.4204 0.200923 0.819972 0.404759 0.23578 0.639046 0.09476 0.927589 0.418049 0.217635 0.28353 0.38521 0.00085 0.576605 0.78473 0.001126 0.00232 0.671561 0.60535 0.695574 0.238678 0.045858 0.275909 0.911884 0.837238 0.517744 0.463708 0.504797 0.938003 0.159143 0.379805 0.319629 0.316791 0.388206 0.42895 0.306397 0.469008 0.887545 0.820856 0.572523 0.350971 0.731482 0.315875 0.469029 0.367 0.321644 0.433885 0.308614 0.80 [...]
+0.444177 0.803573 0.946439 0.974629 0.593732 0.113293 0.229398 0.526036 0.811131 0.690301 0.17979 0.34282 0.788777 0.327879 0.710678 0.994897 0.814987 0.426108 0.564383 0.305398 0.701658 0.698684 0.64046 0.082084 0.534553 0.417039 0.04827 0.637248 0.734726 0.003874 0.198825 0.012076 0.678806 0.195383 0.708098 0.35952 0.625782 0.240239 0.525125 0.070714 0.539688 0.272759 0.410225 0.387522 0.997758 0.0721 0.892433 0.083437 0.095273 0.731121 0.804177 0.326328 0.531504 0.043001 0.395599 0.70 [...]
+0.946693 0.560185 0.521764 0.618208 0.436786 0.14627 0.277962 0.26016 0.459045 0.283965 0.158867 0.255221 0.732828 0.644638 0.92266 0.6393 0.28341 0.0007 0.639361 0.754327 0.049352 0.141331 0.861786 0.810704 0.07181 0.261815 0.670962 0.474247 0.837878 0.588459 0.083533 0.090531 0.404572 0.758878 0.419074 0.548336 0.278457 0.622112 0.22501 0.394856 0.590288 0.876948 0.589497 0.768429 0.725145 0.522804 0.255459 0.695595 0.915408 0.001606 0.722191 0.878617 0.406161 0.737416 0.930472 0.44533 [...]
+0.981132 0.197763 0.554822 0.832173 0.086676 0.734236 0.134321 0.469925 0.182673 0.888065 0.716421 0.061335 0.173382 0.112845 0.616121 0.259187 0.837382 0.64684 0.556597 0.918511 0.392296 0.910929 0.815979 0.086085 0.016242 0.894756 0.365033 0.798317 0.298612 0.371707 0.39783 0.089064 0.905676 0.759304 0.139615 0.003998 0.328387 0.621419 0.077 0.519825 0.600943 0.556517 0.374868 0.937965 0.86342 0.659898 0.26249 0.087223 0.064704 0.226708 0.579677 0.021699 0.024112 0.942046 0.795723 0.24 [...]
+0.536466 0.844062 0.749041 0.973733 0.493869 0.765257 0.650412 0.886509 0.851001 0.783706 0.930624 0.237882 0.954747 0.671156 0.760149 0.684699 0.730214 0.223912 0.357978 0.74129 0.057287 0.116348 0.453881 0.635509 0.100731 0.613545 0.514118 0.978163 0.066714 0.160792 0.660288 0.542569 0.260999 0.723454 0.020405 0.959851 0.484586 0.47533 0.424024 0.750078 0.389438 0.344571 0.68757 0.389064 0.78088 0.928188 0.412721 0.226321 0.972532 0.877902 0.289827 0.142349 0.108136 0.03405 0.905254 0. [...]
+0.272586 0.306189 0.199542 0.329667 0.760019 0.175068 0.39227 0.456474 0.429721 0.872012 0.494618 0.495741 0.556097 0.249139 0.955756 0.784795 0.203742 0.610219 0.603182 0.020242 0.031049 0.667736 0.400684 0.457374 0.8227 0.971725 0.743926 0.990813 0.515703 0.801097 0.225172 0.167918 0.342503 0.824799 0.51646 0.354156 0.094864 0.467213 0.866403 0.309784 0.558762 0.905316 0.806202 0.029482 0.895557 0.463093 0.640456 0.124374 0.935069 0.815299 0.234909 0.441227 0.175168 0.985911 0.128583 0 [...]
+0.618408 0.433953 0.568233 0.337606 0.649469 0.051596 0.23094 0.704249 0.030699 0.78072 0.182617 0.443112 0.442689 0.232066 0.406228 0.798453 0.735461 0.051036 0.579492 0.268545 0.138074 0.398191 0.423063 0.865806 0.262142 0.317864 0.315872 0.709821 0.945861 0.821592 0.812171 0.839574 0.758051 0.20154 0.597963 0.023066 0.228914 0.299167 0.892791 0.822962 0.342812 0.126126 0.941879 0.680966 0.239801 0.048979 0.326501 0.603785 0.139167 0.954322 0.142952 0.82936 0.943374 0.956898 0.081301 0 [...]
+0.744737 0.132349 0.364563 0.395306 0.58884 0.568208 0.176947 0.094931 0.646432 0.818534 0.066786 0.147599 0.275055 0.89458 0.999608 0.158425 0.338039 0.455698 0.255083 0.74453 0.182798 0.419598 0.622201 0.067818 0.460577 0.948838 0.481114 0.561553 0.505345 0.000469 0.298197 0.679599 0.288988 0.433111 0.818765 0.697939 0.065907 0.450961 0.854749 0.493808 0.917055 0.958227 0.787533 0.682877 0.996714 0.831134 0.046749 0.251575 0.669225 0.58965 0.400679 0.921652 0.326141 0.579635 0.901877 0 [...]
+0.472911 0.495566 0.774781 0.286195 0.928322 0.979599 0.588583 0.143344 0.50505 0.499987 0.794515 0.333909 0.287087 0.396552 0.405745 0.826343 0.310262 0.798905 0.248438 0.931396 0.037117 0.614508 0.343193 0.147837 0.348926 0.34021 0.160232 0.690252 0.318067 0.205077 0.004053 0.545495 0.245484 0.060279 0.155264 0.584349 0.187445 0.889944 0.268315 0.868101 0.209608 0.208723 0.602386 0.94852 0.924603 0.915742 0.156349 0.818709 0.130105 0.831304 0.069698 0.391736 0.301527 0.239694 0.638697  [...]
+0.326147 0.486711 0.637085 0.79843 0.507257 0.849931 0.673994 0.270743 0.8377 0.142917 0.678421 0.125839 0.005643 0.722244 0.809287 0.162728 0.634177 0.705531 0.622565 0.978887 0.019358 0.908265 0.420379 0.404263 0.193312 0.204423 0.835205 0.397889 0.754846 0.01464 0.435741 0.61424 0.027838 0.014942 0.667004 0.314305 0.298667 0.952935 0.179679 0.013928 0.289862 0.892883 0.084992 0.899191 0.925453 0.845577 0.11826 0.433547 0.948271 0.073788 0.3661 0.697054 0.969565 0.87039 0.340752 0.3800 [...]
+0.248855 0.515935 0.824821 0.016784 0.986734 0.880548 0.248837 0.96983 0.249804 0.441579 0.059111 0.685302 0.61681 0.734042 0.746389 0.650122 0.102343 0.083757 0.121411 0.357472 0.605712 0.752049 0.554581 0.026955 0.143378 0.004963 0.932808 0.499584 0.851345 0.17768 0.564497 0.407628 0.958647 0.782188 0.280185 0.768642 0.333677 0.418029 0.19483 0.225915 0.5029 0.161819 0.999698 0.127621 0.642035 0.923532 0.698084 0.905323 0.848359 0.454047 0.429532 0.843514 0.647748 0.995562 0.579779 0.4 [...]
+0.427445 0.674933 0.490238 0.843184 0.126819 0.986119 0.280225 0.150596 0.718225 0.695346 0.420126 0.829202 0.071692 0.826338 0.059479 0.771382 0.93345 0.949426 0.364786 0.84304 0.738519 0.107233 0.913265 0.476082 0.839717 0.220319 0.478632 0.758795 0.807689 0.327253 0.157017 0.122824 0.317937 0.266501 0.924426 0.080438 0.428185 0.890871 0.875301 0.625373 0.779338 0.062411 0.387229 0.536083 0.518106 0.038755 0.353314 0.560904 0.672545 0.930766 0.293502 0.046584 0.608449 0.016271 0.955669 [...]
+0.797129 0.256817 0.055401 0.265261 0.664171 0.423986 0.95763 0.371914 0.444912 0.392771 0.722117 0.583114 0.437619 0.406643 0.09388 0.011212 0.980074 0.883164 0.480928 0.297955 0.195028 0.056118 0.017365 0.717959 0.257964 0.486793 0.240979 0.651871 0.80495 0.871632 0.20768 0.262845 0.514089 0.403344 0.073073 0.518362 0.176389 0.877492 0.157418 0.322937 0.893819 0.440869 0.151569 0.711906 0.399203 0.917793 0.433433 0.871746 0.312741 0.728656 0.996542 0.840707 0.999708 0.30805 0.651757 0. [...]
+0.995422 0.95874 0.076447 0.667674 0.756541 0.102304 0.290488 0.0225 0.356756 0.051623 0.217513 0.02673 0.951068 0.933127 0.402278 0.454615 0.071567 0.324878 0.444624 0.723269 0.527336 0.054212 0.601392 0.944318 0.208075 0.454756 0.931227 0.529326 0.146388 0.495099 0.229147 0.538811 0.018614 0.373096 0.205545 0.945201 0.527107 0.273136 0.381884 0.494588 0.2778 0.912001 0.241583 0.137695 0.291743 0.221644 0.143844 0.809353 0.882552 0.219211 0.718817 0.124814 0.799308 0.174165 0.878125 0.0 [...]
+0.242199 0.603098 0.266911 0.80033 0.11865 0.551017 0.196818 0.152172 0.08175 0.610925 0.134386 0.123307 0.892582 0.094424 0.747917 0.427125 0.049927 0.004412 0.228793 0.417311 0.149645 0.234745 0.600987 0.445457 0.644013 0.190317 0.362775 0.84196 0.864512 0.163627 0.914361 0.449458 0.780983 0.327375 0.747973 0.616364 0.488014 0.098315 0.38958 0.524317 0.777302 0.868903 0.55549 0.52535 0.26248 0.013304 0.882985 0.974416 0.56064 0.938473 0.617014 0.12759 0.74133 0.628241 0.284066 0.379038 [...]
+0.274715 0.644251 0.141295 0.457323 0.812388 0.872547 0.068905 0.708094 0.357659 0.943379 0.622871 0.904151 0.851685 0.256621 0.74472 0.893482 0.537831 0.771114 0.995708 0.46909 0.231225 0.268128 0.13686 0.073333 0.411506 0.717568 0.220042 0.654702 0.625303 0.611118 0.659339 0.635585 0.595163 0.786822 0.988547 0.720106 0.428436 0.384025 0.360645 0.253493 0.389445 0.92222 0.97671 0.004358 0.651135 0.692632 0.520061 0.101896 0.828127 0.782201 0.119174 0.451535 0.056594 0.427575 0.923611 0. [...]
+0.730256 0.544448 0.464306 0.529856 0.128098 0.099898 0.255753 0.312484 0.876467 0.940652 0.136282 0.844597 0.600436 0.475926 0.366505 0.142437 0.317098 0.597403 0.040893 0.376875 0.452197 0.756462 0.278117 0.036632 0.762131 0.083958 0.36881 0.28716 0.441117 0.071417 0.284395 0.342582 0.693733 0.016903 0.366499 0.752568 0.597891 0.524097 0.117172 0.221989 0.587177 0.996817 0.915283 0.123426 0.285856 0.415878 0.370954 0.17416 0.321857 0.55769 0.622383 0.95217 0.186308 0.176392 0.73754 0.6 [...]
+0.752845 0.038827 0.319562 0.683522 0.557854 0.586748 0.176394 0.545859 0.950552 0.34376 0.72292 0.425315 0.055144 0.953098 0.077231 0.846118 0.698435 0.922392 0.621855 0.304641 0.047129 0.982576 0.820208 0.867871 0.954592 0.753712 0.871798 0.088569 0.446578 0.801522 0.404578 0.455941 0.547048 0.870941 0.801403 0.563866 0.475794 0.259168 0.068472 0.724091 0.027316 0.215704 0.455899 0.811646 0.625007 0.030578 0.032789 0.616237 0.143263 0.784854 0.112429 0.085719 0.417267 0.827696 0.482566 [...]
+0.583318 0.633604 0.87757 0.560288 0.439608 0.825434 0.368957 0.55327 0.722016 0.217584 0.085251 0.655625 0.41826 0.119785 0.143266 0.187551 0.736975 0.109862 0.897621 0.10136 0.754753 0.010175 0.568081 0.955717 0.465439 0.670647 0.640448 0.465287 0.871296 0.949121 0.518629 0.170516 0.537543 0.468309 0.519004 0.550314 0.028115 0.836 0.137184 0.135315 0.075463 0.754218 0.167981 0.216052 0.629524 0.580007 0.50134 0.845356 0.865212 0.228916 0.101048 0.558532 0.062787 0.041612 0.420282 0.848 [...]
+0.727241 0.99718 0.777781 0.998292 0.908347 0.383957 0.506882 0.281999 0.191803 0.532399 0.721506 0.735701 0.271089 0.584464 0.909235 0.138049 0.264376 0.287893 0.748099 0.005141 0.01809 0.132175 0.087883 0.959726 0.010518 0.075238 0.082546 0.450602 0.426366 0.431713 0.477993 0.444456 0.324985 0.244431 0.651673 0.261099 0.02213 0.948644 0.17525 0.696088 0.701295 0.146152 0.949961 0.874904 0.775266 0.554985 0.303551 0.072496 0.765072 0.58301 0.822709 0.163111 0.895226 0.486822 0.429319 0. [...]
+0.673808 0.553703 0.858313 0.212834 0.913304 0.556719 0.565993 0.962859 0.316054 0.573365 0.594535 0.228823 0.322254 0.381297 0.719087 0.778988 0.356587 0.194906 0.611317 0.182402 0.555096 0.481508 0.144075 0.535928 0.19934 0.453345 0.942301 0.367806 0.789382 0.622284 0.916493 0.441735 0.457712 0.945244 0.358752 0.313081 0.0413 0.043234 0.287798 0.492034 0.782374 0.62913 0.305896 0.007035 0.972798 0.912115 0.326084 0.747056 0.285202 0.431626 0.20126 0.901877 0.873887 0.609388 0.343081 0. [...]
+0.837497 0.71965 0.209654 0.984177 0.759482 0.956158 0.649136 0.409899 0.378739 0.246911 0.51956 0.117397 0.730544 0.392415 0.082561 0.186964 0.34562 0.510553 0.42692 0.55301 0.877712 0.933792 0.432559 0.208552 0.213802 0.004365 0.463603 0.245867 0.96331 0.292 0.74781 0.202667 0.444823 0.958947 0.547376 0.049298 0.40235 0.049056 0.19798 0.813576 0.038763 0.422519 0.732496 0.986863 0.379861 0.395769 0.987465 0.108103 0.005338 0.768306 0.705808 0.353858 0.45798 0.967848 0.620439 0.735099 0 [...]
+0.167505 0.208319 0.618322 0.721687 0.310255 0.838787 0.960832 0.417114 0.282198 0.412355 0.077018 0.958701 0.50957 0.18749 0.041567 0.213027 0.651193 0.526809 0.773583 0.771812 0.269322 0.115449 0.288252 0.127739 0.855753 0.686861 0.589856 0.900662 0.414982 0.685171 0.918255 0.867539 0.265364 0.888353 0.823784 0.182109 0.935521 0.19314 0.285399 0.596019 0.039874 0.927326 0.923216 0.293534 0.469156 0.588499 0.47712 0.640845 0.853708 0.135707 0.615582 0.834069 0.761397 0.69761 0.277602 0. [...]
+0.412178 0.353048 0.146529 0.901034 0.392082 0.00638 0.24001 0.956136 0.72238 0.696909 0.728768 0.117301 0.213136 0.376598 0.866912 0.611379 0.13926 0.330482 0.360734 0.081684 0.684239 0.235026 0.815528 0.444385 0.852775 0.545656 0.251263 0.149387 0.545804 0.257005 0.263046 0.662116 0.319749 0.145287 0.100949 0.031145 0.147168 0.461735 0.044115 0.273082 0.007174 0.71774 0.416389 0.424694 0.60848 0.711559 0.134749 0.849379 0.780092 0.159241 0.277448 0.388271 0.322737 0.859659 0.551617 0.2 [...]
+0.576187 0.482904 0.001055 0.718041 0.339021 0.965886 0.891344 0.695451 0.347686 0.166151 0.20609 0.40753 0.067434 0.174492 0.316292 0.111835 0.968876 0.715552 0.720306 0.116674 0.656945 0.683929 0.642366 0.400666 0.079363 0.109635 0.547507 0.980422 0.55221 0.567374 0.686876 0.881999 0.800188 0.594487 0.341578 0.334262 0.756841 0.567481 0.214772 0.088895 0.848689 0.020446 0.079882 0.181425 0.620151 0.73224 0.999777 0.877337 0.929249 0.578687 0.146728 0.318614 0.441618 0.758898 0.016184 0 [...]
+0.449559 0.68567 0.047887 0.646424 0.244096 0.053303 0.542009 0.350877 0.206618 0.008759 0.082881 0.419566 0.57807 0.846458 0.777381 0.799632 0.064266 0.661459 0.975256 0.820186 0.497118 0.296403 0.644928 0.901714 0.616435 0.078665 0.884251 0.293608 0.373879 0.765534 0.430089 0.213381 0.196119 0.654542 0.20128 0.986775 0.505678 0.915335 0.737619 0.96453 0.905097 0.473839 0.053284 0.141881 0.739001 0.834225 0.619278 0.935257 0.189346 0.218245 0.536261 0.202363 0.379305 0.991068 0.457747 0 [...]
+0.142169 0.328677 0.961683 0.881792 0.21017 0.491897 0.007472 0.423921 0.212666 0.502882 0.615793 0.588095 0.902075 0.392659 0.141599 0.686135 0.33651 0.152648 0.342545 0.965895 0.785838 0.15186 0.969061 0.591848 0.667934 0.948071 0.213753 0.622442 0.200692 0.895954 0.383021 0.244318 0.269051 0.289859 0.34625 0.685909 0.315618 0.570345 0.91746 0.135709 0.255307 0.306014 0.264684 0.491927 0.277344 0.48845 0.554306 0.635801 0.930425 0.636368 0.103163 0.99029 0.863433 0.539775 0.666617 0.65 [...]
+0.4053 0.72471 0.612952 0.673027 0.812052 0.505411 0.230155 0.788985 0.314602 0.757801 0.616805 0.119304 0.721663 0.698719 0.719961 0.803432 0.738866 0.858799 0.503839 0.767374 0.379787 0.972431 0.540295 0.429226 0.433301 0.608409 0.206599 0.838589 0.064351 0.138713 0.132128 0.441397 0.455761 0.586819 0.740403 0.721493 0.483153 0.917576 0.469448 0.139612 0.11818 0.124963 0.557701 0.354358 0.350953 0.903548 0.517467 0.696034 0.753738 0.194044 0.424413 0.661766 0.178129 0.67495 0.568308 0. [...]
+0.95463 0.263147 0.923738 0.80507 0.348318 0.831307 0.767884 0.713365 0.081987 0.668585 0.585415 0.209239 0.871259 0.594467 0.153898 0.72263 0.51634 0.346065 0.344002 0.799074 0.36085 0.079705 0.33199 0.178834 0.32345 0.502947 0.453131 0.655446 0.150524 0.270576 0.453531 0.343629 0.781034 0.436282 0.47827 0.510527 0.144131 0.81395 0.758266 0.633388 0.166184 0.902063 0.16093 0.454622 0.537314 0.547842 0.90589 0.388069 0.492336 0.99514 0.259047 0.564587 0.864294 0.407784 0.555407 0.943775  [...]
+0.014477 0.187814 0.971873 0.011177 0.281578 0.656105 0.859068 0.134542 0.452085 0.364779 0.880976 0.543676 0.761892 0.746252 0.338702 0.478331 0.19864 0.679163 0.996273 0.486492 0.525918 9.2e-05 0.140931 0.797943 0.823943 0.84014 0.287899 0.598261 0.508745 0.821821 0.885528 0.074087 0.803931 0.725879 0.564089 0.288613 0.197722 0.629431 0.26933 0.563513 0.529765 0.242329 0.098083 0.904471 0.370614 0.821557 0.045119 0.674587 0.952274 0.95165 0.075468 0.687228 0.662157 0.069168 0.027073 0. [...]
+0.928355 0.170947 0.188753 0.435012 0.175487 0.46832 0.28697 0.942649 0.31008 0.477803 0.182788 0.200583 0.807479 0.225911 0.273462 0.524732 0.384824 0.528678 0.132157 0.727176 0.89047 0.005959 0.428302 0.940607 0.156368 0.949645 0.130163 0.893876 0.319843 0.505071 0.153202 0.973532 0.786355 0.323453 0.417278 0.231942 0.030279 0.522388 0.477638 0.082574 0.566067 0.175443 0.569255 0.959456 0.001012 0.475457 0.843638 0.592412 0.102121 0.475382 0.665377 0.966819 0.746692 0.408385 0.131376 0 [...]
+0.709614 0.255774 0.272705 0.649295 0.842114 0.522143 0.590108 0.727521 0.170223 0.837897 0.058013 0.106138 0.369263 0.457057 0.927778 0.479429 0.834412 0.904565 0.024454 0.462034 0.732717 0.896716 0.71136 0.916811 0.732724 0.400848 0.131167 0.998939 0.365961 0.438743 0.065208 0.506354 0.087537 0.008615 0.828921 0.960194 0.451357 0.025745 0.710948 0.229608 0.324915 0.151203 0.701932 0.95234 0.50915 0.510697 0.755194 0.358639 0.240276 0.11002 0.959361 0.313728 0.136947 0.057868 0.950297 0 [...]
+0.78771 0.681682 0.696489 0.569123 0.754864 0.468516 0.495207 0.308399 0.414549 0.029969 0.964657 0.140046 0.594072 0.717888 0.459495 0.47963 0.656101 0.682506 0.335772 0.746329 0.642242 0.845225 0.873303 0.349774 0.349748 0.320306 0.269155 0.529198 0.09575 0.528516 0.50988 0.12429 0.567543 0.300199 0.481246 0.48552 0.73985 0.962867 0.190943 0.734507 0.747744 0.448869 0.486372 0.210896 0.150448 0.302334 0.101195 0.543585 0.645742 0.980038 0.577132 0.734843 0.827023 0.512441 0.158589 0.14 [...]
+0.804008 0.030255 0.185943 0.653997 0.661408 0.650101 0.873824 0.105157 0.884749 0.057304 0.416132 0.231176 0.003509 0.8687 0.237343 0.441744 0.88563 0.639054 0.670872 0.738284 0.897683 0.457147 0.306162 0.175947 0.517871 0.53287 0.944382 0.565308 0.722108 0.062512 0.058056 0.389421 0.206683 0.82346 0.853218 0.17487 0.109849 0.80162 0.484012 0.265449 0.370823 0.60056 0.538621 0.977202 0.617134 0.691434 0.043696 0.741163 0.853354 0.675406 0.453534 0.895911 0.416742 0.256291 0.881106 0.912 [...]
+0.690069 0.669658 0.82663 0.506086 0.452911 0.583687 0.261533 0.992178 0.920097 0.352304 0.951801 0.210568 0.350296 0.515786 0.010239 0.584482 0.774018 0.725026 0.115589 0.941726 0.780361 0.030951 0.060662 0.255357 0.947448 0.121062 0.218029 0.213999 0.004145 0.505294 0.888786 0.618386 0.998207 0.123308 0.32152 0.064357 0.495356 0.80323 0.711126 0.394927 0.605784 0.505679 0.856595 0.651197 0.258397 0.240458 0.028883 0.06051 0.920587 0.90389 0.08627 0.958578 0.953839 0.51418 0.369915 0.47 [...]
+0.531906 0.493446 0.73796 0.432886 0.847724 0.940374 0.10978 0.059922 0.073238 0.320055 0.950942 0.989658 0.953889 0.986899 0.511219 0.0174 0.992183 0.864354 0.118743 0.072426 0.396829 0.627257 0.397648 0.234629 0.687047 0.634146 0.086852 0.857261 0.785202 0.565349 0.39315 0.013433 0.175633 0.729605 0.298537 0.017203 0.157587 0.449053 0.541168 0.504335 0.401847 0.47439 0.07065 0.634182 0.123439 0.161673 0.372602 0.739338 0.3866 0.879682 0.755199 0.594255 0.737493 0.921298 0.912934 0.6437 [...]
+0.232721 0.11148 0.725241 0.428186 0.678522 0.394016 0.529986 0.005487 0.40355 0.88901 0.892819 0.00223 0.216813 0.437913 0.59088 0.823891 0.277322 0.972325 0.69935 0.19345 0.240189 0.138744 0.043061 0.695189 0.194545 0.898451 0.283749 0.433724 0.053604 0.920157 0.681154 0.708582 0.540723 0.893506 0.183233 0.578705 0.280255 0.375028 0.073397 0.105703 0.653684 0.573727 0.25825 0.991703 0.133521 0.785503 0.832018 0.647441 0.427815 0.16489 0.841265 0.839016 0.129358 0.782276 0.112366 0.9120 [...]
+0.519309 0.849978 0.667763 0.967321 0.983867 0.664581 0.518353 0.875868 0.962011 0.298081 0.168396 0.308856 0.617628 0.912606 0.181014 0.125732 0.652373 0.043805 0.850152 0.005202 0.552415 0.011332 0.889328 0.487956 0.787877 0.148261 0.564255 0.940402 0.252719 0.527209 0.741238 0.524546 0.502173 0.941605 0.986101 0.639469 0.330791 0.507837 0.240051 0.817583 0.034051 0.185921 0.287896 0.789848 0.898004 0.979032 0.454719 0.917409 0.985267 0.96287 0.86617 0.200317 0.473683 0.028901 0.354751 [...]
+0.957493 0.888071 0.246599 0.538869 0.511823 0.024379 0.948924 0.046728 0.886697 0.042197 0.987753 0.359944 0.760477 0.779989 0.274919 0.274255 0.145896 0.322002 0.650822 0.649689 0.370953 0.510262 0.707381 0.558249 0.529336 0.93773 0.742741 0.543744 0.894949 0.50758 0.254283 0.434123 0.580354 0.027395 0.361018 0.883957 0.017263 0.476244 0.659818 0.713478 0.46377 0.042606 0.835226 0.603171 0.315024 0.208341 0.793059 0.013812 0.489923 0.656322 0.288144 0.001503 0.029567 0.60515 0.452124 0 [...]
+0.093052 0.506088 0.500385 0.190177 0.391024 0.787283 0.614975 0.872326 0.940875 0.256671 0.322183 0.205842 0.629126 0.529175 0.54189 0.107841 0.323709 0.810891 0.086752 0.286167 0.290444 0.827879 0.24008 0.739323 0.540852 0.539829 0.350538 0.17136 0.984396 0.258086 0.638509 0.368842 0.440668 0.449734 0.269804 0.715131 0.33574 0.791729 0.031382 0.45912 0.350382 0.202965 0.448453 0.259997 0.224075 0.8334 0.704924 0.940418 0.12369 0.192798 0.821851 0.454826 0.785858 0.466689 0.933656 0.837 [...]
+0.346715 0.062059 0.569388 0.659803 0.188892 0.626588 0.573322 0.118664 0.271548 0.045173 0.15999 0.123158 0.702573 0.126523 0.183908 0.227638 0.975538 0.90045 0.842582 0.58723 0.119364 0.727307 0.505033 0.201786 0.648798 0.088513 0.372459 0.647394 0.944953 0.339423 0.358323 0.345673 0.333158 0.647718 0.52005 0.403811 0.104209 0.387847 0.079667 0.719875 0.763049 0.465395 0.600173 0.587803 0.937657 0.098086 0.913219 0.666598 0.044767 0.120409 0.663799 0.123194 0.227167 0.104427 0.35133 0. [...]
+0.111047 0.967977 0.458189 0.840487 0.435654 0.259988 0.52861 0.118755 0.295725 0.663243 0.210238 0.684288 0.441565 0.619395 0.863933 0.361186 0.425419 0.428674 0.310773 0.439507 0.839001 0.329318 0.298748 0.96903 0.580141 0.925019 0.446 0.751339 0.18956 0.944467 0.288472 0.37255 0.857564 0.733087 0.61644 0.152659 0.280124 0.48614 0.034978 0.194588 0.896871 0.518346 0.152295 0.100883 0.779426 0.340943 0.057876 0.375578 0.360361 0.413883 0.708691 0.787862 0.082903 0.776618 0.615929 0.7549 [...]
+0.353137 0.267985 0.677671 0.34277 0.249151 0.574855 0.972817 0.994077 0.058553 0.084033 0.665711 0.199287 0.690442 0.309486 0.404446 0.888042 0.50083 0.808779 0.963357 0.345877 0.087195 0.90411 0.258051 0.501021 0.363722 0.183219 0.284456 0.072921 0.216048 0.885631 0.531379 0.244539 0.251327 0.978721 0.311308 0.662302 0.898492 0.401498 0.527503 0.750214 0.237033 0.052902 0.954335 0.013536 0.445082 0.417867 0.817627 0.156331 0.668239 0.162602 0.262961 0.044791 0.257656 0.225632 0.20776 0 [...]
+0.905786 0.253184 0.335304 0.29305 0.998667 0.352428 0.050255 0.142989 0.264574 0.324781 0.192121 0.838029 0.513164 0.054416 0.829383 0.340195 0.454443 0.741107 0.416704 0.45808 0.098375 0.372832 0.271589 0.916937 0.672078 0.624835 0.548037 0.491146 0.219503 0.245217 0.655044 0.285572 0.316718 0.998151 0.949076 0.300204 0.487452 0.663199 0.852566 0.413422 0.590217 0.005208 0.813233 0.450358 0.27119 0.684016 0.724913 0.508777 0.387133 0.377255 0.560371 0.439067 0.269433 0.920976 0.980087  [...]
+0.254972 0.352556 0.593804 0.573782 0.367673 0.89936 0.194405 0.863623 0.896428 0.308793 0.202172 0.404959 0.775744 0.470633 0.772009 0.780377 0.107296 0.375993 0.7603 0.709931 0.960223 0.656883 0.156134 0.450328 0.677816 0.685429 0.011311 0.533366 0.744167 0.245708 0.785029 0.833401 0.120052 0.507609 0.168083 0.51716 0.083044 0.200994 0.183511 0.894746 0.486352 0.319113 0.340887 0.87708 0.56592 0.946772 0.724787 0.717116 0.814895 0.922375 0.239485 0.637354 0.801089 0.912502 0.245919 0.1 [...]
+0.44608 0.162534 0.313634 0.80359 0.677763 0.962822 0.483224 0.107267 0.295972 0.498828 0.223856 0.911316 0.604204 0.99475 0.234276 0.542512 0.42994 0.492422 0.227624 0.563513 0.44265 0.852953 0.801416 0.747059 0.602534 0.304691 0.316164 0.789654 0.679966 0.613878 0.492561 0.000297 0.634582 0.565241 0.167242 0.855362 0.906977 0.280876 0.575381 0.510485 0.447688 0.51224 0.809268 0.571771 0.91624 0.711088 0.528847 0.410272 0.778606 0.02522 0.087071 0.467308 0.905291 0.006601 0.112073 0.548 [...]
+0.628327 0.545357 0.314968 0.837102 0.217326 0.488232 0.136464 0.274951 0.249876 0.754727 0.596649 0.677197 0.009273 0.317388 0.040359 0.729536 0.090528 0.46125 0.139525 0.66409 0.316622 0.91246 0.918308 0.290083 0.617729 0.466942 0.615117 0.703824 0.553398 0.339 0.07521 0.027811 0.353388 0.312733 0.508337 0.601126 0.751587 0.259558 0.668292 0.094576 0.614907 0.307946 0.470899 0.495148 0.149518 0.367757 0.08537 0.045836 0.991978 0.375649 0.393485 0.958134 0.56491 0.327191 0.763849 0.6970 [...]
+0.990947 0.956328 0.735677 0.193696 0.722472 0.162782 0.134897 0.929279 0.24213 0.05768 0.09962 0.651436 0.705354 0.183732 0.127921 0.912399 0.75576 0.47223 0.417243 0.729334 0.221184 0.74674 0.443228 0.678703 0.279932 0.033634 0.850436 0.811503 0.066204 0.164071 0.655832 0.316077 0.680302 0.224714 0.606247 0.278468 0.392383 0.749334 0.67216 0.214672 0.966407 0.455123 0.02 0.085976 0.505688 0.032963 0.319553 0.285746 0.91526 0.678151 0.668003 0.810744 0.643485 0.305779 0.458471 0.737705  [...]
+0.468863 0.527182 0.70604 0.211442 0.020655 0.6016 0.293398 0.224551 0.009007 0.059974 0.441385 0.028609 0.888357 0.226104 0.682191 0.040047 0.878426 0.047257 0.86757 0.447629 0.703056 0.746401 0.443931 0.310661 0.53253 0.053343 0.602598 0.075687 0.975583 0.781391 0.391897 0.574957 0.949143 0.416617 0.85445 0.411615 0.911518 0.145824 0.300242 0.601062 0.242189 0.575663 0.723214 0.360795 0.949277 0.940518 0.915668 0.335601 0.657651 0.104853 0.318381 0.17553 0.632345 0.297522 0.114994 0.81 [...]
+0.131197 0.603358 0.201393 0.727007 0.211504 0.658027 0.436104 0.253697 0.676186 0.163429 0.420469 0.662731 0.73041 0.702475 0.906895 0.045898 0.296611 0.034652 0.954036 0.918165 0.524393 0.439473 0.165497 0.941342 0.0769 0.841555 0.220376 0.954301 0.870752 0.393386 0.89621 0.953123 0.331554 0.445563 0.947748 0.222969 0.741491 0.418783 0.003659 0.298539 0.759033 0.542858 0.502048 0.940455 0.708254 0.566907 0.32755 0.922036 0.542548 0.676552 0.020101 0.704966 0.636852 0.900311 0.195554 0. [...]
+0.8943 0.483113 0.411232 0.290947 0.632137 0.852311 0.407985 0.49706 0.952363 0.244364 0.852308 0.963335 0.899998 0.678478 0.097023 0.380547 0.249884 0.771259 0.692445 0.54009 0.913138 0.781912 0.087504 0.79779 0.37542 0.756544 0.533341 0.340627 0.949017 0.632862 0.67654 0.337208 0.50174 0.304865 0.811762 0.82293 0.279751 0.379389 0.457797 0.468665 0.165374 0.248063 0.08385 0.155603 0.870954 0.363535 0.312363 0.848335 0.549346 0.978987 0.708026 0.570307 0.119112 0.848294 0.598418 0.83708 [...]
+0.488213 0.4454 0.852295 0.205826 0.127239 0.057564 0.624949 0.680603 0.024486 0.642894 0.661166 0.003872 0.363137 0.596486 0.983143 0.076968 0.492799 0.928214 0.450929 0.244633 0.556352 0.20576 0.699575 0.247255 0.56135 0.749708 0.708172 0.120959 0.767137 0.419541 0.806118 0.629876 0.300718 0.619978 0.781072 0.349688 0.203479 0.601978 0.020713 0.905223 0.802018 0.432535 0.046826 0.696435 0.536268 0.97198 0.576692 0.984545 0.192505 0.614968 0.682653 0.100212 0.887512 0.168519 0.376438 0. [...]
+0.552441 0.060104 0.618748 0.729781 0.558912 0.49359 0.027932 0.523553 0.342423 0.915698 0.523905 0.374107 0.485519 0.341565 0.683487 0.662235 0.151496 0.937136 0.515788 0.506946 0.74397 0.575614 0.829551 0.26433 0.533645 0.077139 0.307599 0.766246 0.500197 0.163242 0.010308 0.643917 0.387967 0.377085 0.849554 0.731674 0.385265 0.063254 0.134046 0.963203 0.745706 0.424869 0.226765 0.179199 0.549459 0.560841 0.23285 0.006689 0.421931 0.324304 0.599824 0.609797 0.496005 0.628589 0.069125 0 [...]
+0.449956 0.080601 0.028633 0.159533 0.437685 0.340389 0.864936 0.458952 0.995689 0.073276 0.121579 0.977976 0.285642 0.790359 0.165284 0.4296 0.756175 0.66697 0.168206 0.471932 0.856208 0.599769 0.643753 0.666364 0.135671 0.261381 0.17863 0.291702 0.186154 0.896969 0.678452 0.752922 0.508398 0.553736 0.725397 0.317305 0.291872 0.99947 0.809905 0.57796 0.454905 0.514988 0.703374 0.616613 0.519851 0.013551 0.55879 0.31607 0.491468 0.628876 0.210755 0.614048 0.838394 0.418001 0.324248 0.096 [...]
+0.597031 0.600887 0.089968 0.846873 0.668983 0.361112 0.829419 0.592159 0.826998 0.56854 0.510339 0.870099 0.089821 0.227967 0.456452 0.284112 0.102099 0.227282 0.621035 0.209238 0.83491 0.563909 0.559233 0.649798 0.554639 0.973231 0.492336 0.463321 0.104519 0.839573 0.435181 0.438654 0.555385 0.16911 0.455653 0.286385 0.712654 0.097959 0.789006 0.5724 0.537428 0.599733 0.392074 0.219059 0.056754 0.320642 0.081464 0.495339 0.597453 0.847571 0.654469 0.682793 0.688226 0.061736 0.099887 0. [...]
+0.656547 0.509267 0.093277 0.359172 0.357993 0.885877 0.181804 0.122784 0.52933 0.234751 0.192255 0.298613 0.352071 0.184364 0.242593 0.491358 0.91691 0.873467 0.442721 0.649325 0.178047 0.873483 0.224065 0.538656 0.56488 0.652044 0.816885 0.554083 0.422247 0.448876 0.077628 0.262849 0.236516 0.904313 0.672797 0.937189 0.836116 0.78322 0.505014 0.119893 0.317505 0.908548 0.645303 0.889216 0.037932 0.667559 0.020371 0.540696 0.779368 0.262433 0.034061 0.526605 0.524469 0.430901 0.838626 0 [...]
+0.921169 0.500594 0.289031 0.769973 0.108452 0.056538 0.07727 0.522802 0.444205 0.767304 0.013856 0.609428 0.598171 0.760448 0.410059 0.434926 0.30594 0.650101 0.170979 0.848043 0.899513 0.928644 0.520356 0.572113 0.759126 0.347347 0.444619 0.761932 0.440862 0.649523 0.275555 0.412326 0.360721 0.289574 0.317889 0.164298 0.757311 0.847741 0.448829 0.338884 0.141224 0.038926 0.786929 0.245107 0.929885 0.027582 0.210943 0.504555 0.797781 0.793804 0.849814 0.036923 0.443515 0.805564 0.826262 [...]
+0.354267 0.091994 0.474065 0.313335 0.566257 0.67044 0.712598 0.965934 0.819536 0.667836 0.353421 0.677784 0.172152 0.443575 0.418553 0.557703 0.736613 0.256622 0.763309 0.249637 0.730851 0.16071 0.500737 0.171358 0.229316 0.35506 0.077023 0.551966 0.993328 0.921404 0.277522 0.504097 0.774003 0.100956 0.121864 0.045044 0.635588 0.034362 0.3251 0.429805 0.130774 0.335225 0.858175 0.998826 0.911955 0.890903 0.587558 0.584859 0.62898 0.415318 0.092674 0.331955 0.201125 0.833154 0.130221 0.9 [...]
+0.34292 0.229276 0.374982 0.881564 0.31048 0.723803 0.398008 0.28975 0.40069 0.149458 0.597972 0.599532 0.898726 0.192718 0.55661 0.693369 0.4013 0.149357 0.976953 0.48905 0.027257 0.393056 0.055255 0.556839 0.971114 0.922553 0.035496 0.112861 0.384214 0.513598 0.400279 0.760037 0.77327 0.078905 0.887359 0.083496 0.678134 0.298651 0.542964 0.779287 0.550576 0.489642 0.314914 0.473911 0.042809 0.159905 0.413528 0.078649 0.151734 0.87958 0.542412 0.549649 0.267143 0.354618 0.425831 0.43864 [...]
+0.610651 0.423033 0.823624 0.732211 0.10508 0.220473 0.103636 0.665366 0.423305 0.981569 0.53469 0.366849 0.663895 0.481336 0.808233 0.074272 0.371139 0.424406 0.55466 0.417767 0.442394 0.426597 0.744009 0.201406 0.748136 0.690498 0.679135 0.068839 0.264785 0.425813 0.129959 0.24467 0.469326 0.202043 0.878253 0.826831 0.454466 0.454437 0.322049 0.898307 0.988652 0.604907 0.639823 0.658635 0.644059 0.333952 0.918377 0.72828 0.044506 0.317199 0.66945 0.636022 0.862243 0.908776 0.510826 0.4 [...]
+0.536475 0.279625 0.590729 0.553218 0.744014 0.166978 0.465193 0.806484 0.261775 0.387859 0.084451 0.335599 0.269593 0.569972 0.874759 1.1e-05 0.74909 0.050954 0.456009 0.252231 0.993741 0.107206 0.68732 0.232489 0.816426 0.886349 0.85097 0.205059 0.183579 0.432805 0.29797 0.182828 0.967004 0.837485 0.661436 0.392584 0.042174 0.222986 0.611357 0.876297 0.002418 0.933982 0.592914 0.924334 0.947941 0.838098 0.062273 0.931337 0.951273 0.037063 0.709442 0.264834 0.081921 0.759575 0.52763 0.3 [...]
+0.07958 0.97706 0.971596 0.927777 0.290182 0.947121 0.286455 0.044769 0.145233 0.817785 0.054813 0.118195 0.053413 0.183254 0.473966 0.91465 0.622365 0.510303 0.678412 0.682331 0.173763 0.650413 0.164653 0.403406 0.132127 0.638352 0.94158 0.274885 0.438885 0.315634 0.761881 0.817239 0.840094 0.57309 0.747813 0.493832 0.059166 0.365249 0.095724 0.029245 0.195883 0.215784 0.630168 0.785372 0.370986 0.132829 0.524304 0.112747 0.762867 0.366958 0.641888 0.924023 0.08368 0.495278 0.094434 0.2 [...]
+0.2044 0.145195 0.724869 0.488649 0.560828 0.748766 0.90472 0.260416 0.175952 0.329497 0.895204 0.138724 0.494702 0.797988 0.595374 0.796324 0.305026 0.980742 0.878451 0.571496 0.719354 0.874954 0.814678 0.639346 0.826311 0.821715 0.181561 0.307663 0.016414 0.955297 0.345794 0.043688 0.845135 0.370523 0.41817 0.296532 0.210264 0.041937 0.316708 0.173314 0.10677 0.562641 0.113326 0.471039 0.05296 0.669228 0.593715 0.025529 0.799439 0.01636 0.965149 0.877136 0.14483 0.76178 0.182185 0.7299 [...]
+0.062738 0.562316 0.347134 0.449571 0.206338 0.627265 0.890406 0.646463 0.244436 0.706704 0.253067 0.594883 0.144138 0.748354 0.803029 0.193099 0.540037 0.931312 0.688561 0.599588 0.540499 0.294611 0.941148 0.623329 0.865283 0.659781 0.204256 0.157768 0.51747 0.594907 0.768473 0.037165 0.627496 0.809924 0.778337 0.116413 0.592237 0.836268 0.702663 0.71404 0.246589 0.013737 0.018998 0.05582 0.751519 0.652883 0.078497 0.457776 0.211793 0.761208 0.617494 0.570911 0.745653 0.485722 0.112582  [...]
+0.828678 0.892203 0.917497 0.968964 0.391242 0.808636 0.091485 0.212103 0.185792 0.264552 0.738003 0.582973 0.901946 0.497218 0.032808 0.986663 0.609126 0.878202 0.16733 0.892977 0.413758 0.561092 0.543095 0.373769 0.938409 0.050798 0.570236 0.707311 0.66354 0.340321 0.413003 0.173133 0.492859 0.729025 0.70409 0.121801 0.212724 0.204219 0.647657 0.713511 0.390383 0.855308 0.147074 0.274272 0.196094 0.08125 0.435053 0.154286 0.871848 0.219287 0.956864 0.016692 0.195823 0.975328 0.712495 0 [...]
+0.254182 0.039142 0.605875 0.593842 0.742276 0.258897 0.371186 0.249047 0.572051 0.79688 0.664517 0.258461 0.864235 0.096522 0.367683 0.91926 0.30846 0.294824 0.529803 0.934945 0.837008 0.092941 0.312367 0.883965 0.725578 0.291783 0.192795 0.440464 0.114186 0.140571 0.792983 0.668016 0.030178 0.106959 0.753296 0.423906 0.975502 0.840397 0.683553 0.970077 0.953872 0.316019 0.723437 0.93574 0.311427 0.43089 0.33658 0.519837 0.599995 0.829974 0.813151 0.765643 0.252164 0.924667 0.805316 0.4 [...]
+0.936 0.609836 0.821127 0.784395 0.328335 0.42554 0.40245 0.134286 0.606247 0.417759 0.175605 0.203657 0.056884 0.567465 0.173286 0.754257 0.618813 0.819926 0.613708 0.831106 0.718099 0.517262 0.529211 0.483537 0.627915 0.874797 0.790469 0.024081 0.266776 0.834899 0.565047 0.445954 0.726544 0.81789 0.044355 0.680484 0.166997 0.456647 0.930838 0.554552 0.861876 0.960738 0.273524 0.101808 0.969575 0.447603 0.28744 0.050897 0.878812 0.95326 0.351369 0.837807 0.312777 0.342538 0.656959 0.565 [...]
+0.970639 0.449384 0.055832 0.941803 0.90128 0.647131 0.665961 0.739815 0.543905 0.767098 0.847732 0.888038 0.230144 0.089544 0.29416 0.328735 0.658234 0.664246 0.3582 0.587915 0.476143 0.793185 0.881754 0.991312 0.596067 0.305487 0.674989 0.019752 0.136425 0.468774 0.498878 0.513737 0.391944 0.351232 0.651214 0.558632 0.658659 0.494251 0.255009 0.669075 0.852205 0.749153 0.656267 0.730414 0.540662 0.567136 0.897583 0.301982 0.540421 0.415072 0.703719 0.17954 0.127095 0.782013 0.501149 0. [...]
+0.423911 0.406763 0.296451 0.98517 0.366256 0.912331 0.571944 0.310223 0.513114 0.658343 0.776663 0.993054 0.924204 0.008964 0.370868 0.142529 0.019187 0.266132 0.682369 0.485147 0.182295 0.179787 0.113124 0.240237 0.641065 0.846339 0.612332 0.675593 0.599712 0.458283 0.05079 0.639009 0.569587 0.098593 0.188092 0.030955 0.946112 0.318081 0.871918 0.945315 0.063778 0.038385 0.822233 0.665678 0.241328 0.00607 0.696639 0.726451 0.89436 0.557254 0.089509 0.937239 0.044404 0.048217 0.572187 0 [...]
+0.11455 0.386064 0.0585 0.641004 0.942817 0.136726 0.781769 0.394061 0.838383 0.308816 0.32379 0.220964 0.302128 0.132503 0.36471 0.352262 0.668522 0.282964 0.682197 0.304514 0.054683 0.557623 0.142164 0.987246 0.391848 0.480226 0.562638 0.058847 0.323237 0.912349 0.332625 0.199388 0.757342 0.225536 0.921047 0.360485 0.601648 0.226373 0.520959 0.666096 0.06301 0.7637 0.616604 0.932355 0.098193 0.476589 0.195849 0.614803 0.256381 0.192809 0.525708 0.664984 0.088861 0.690194 0.654408 0.239 [...]
+0.412074 0.780803 0.792573 0.55845 0.307215 0.62484 0.558842 0.529358 0.285662 0.890806 0.741431 0.059833 0.750669 0.959157 0.842516 0.445171 0.841728 0.320707 0.706927 0.842008 0.819254 0.934855 0.192817 0.474507 0.339072 0.281768 0.833772 0.617518 0.691666 0.915777 0.641495 0.468672 0.76405 0.965064 0.372214 0.27873 0.539759 0.108658 0.218113 0.877522 0.694921 0.109401 0.510111 0.235176 0.6223 0.842707 0.147059 0.365126 0.549504 0.541998 0.350899 0.258369 0.830768 0.351374 0.337708 0.9 [...]
+0.59764 0.239232 0.588263 0.273702 0.835403 0.440613 0.688381 0.537263 0.853331 0.605019 0.532378 0.378971 0.854662 0.947638 0.897267 0.507395 0.178539 0.741763 0.125621 0.141023 0.158001 0.325387 0.16822 0.114311 0.491789 0.54465 0.970551 0.825913 0.16728 0.115043 0.012898 0.366687 0.012032 0.196852 0.319575 0.625666 0.720525 0.125212 0.827451 0.201317 0.546945 0.6032 0.908876 0.506302 0.470259 0.437369 0.320147 0.171062 0.168805 0.48864 0.959791 0.976166 0.764649 0.136409 0.10966 0.088 [...]
+0.192436 0.839689 0.139768 0.744139 0.550592 0.433484 0.583392 0.604075 0.306273 0.707697 0.554938 0.893942 0.195333 0.447088 0.262604 0.002924 0.645454 0.287455 0.22614 0.476267 0.695877 0.683603 0.479021 0.151284 0.514437 0.993649 0.727907 0.887673 0.208873 0.878985 0.708948 0.887637 0.655907 0.087582 0.926314 0.64114 0.515234 0.945028 0.547119 0.90537 0.861104 0.370264 0.308584 0.375035 0.709074 0.423296 0.302704 0.42434 0.756012 0.348304 0.114222 0.097675 0.685208 0.922406 0.285301 0 [...]
+0.874463 0.980176 0.323956 0.398333 0.203987 0.449821 0.718848 0.069116 0.219287 0.742932 0.188919 0.765229 0.828892 0.791137 0.305498 0.922993 0.153724 0.706729 0.035341 0.045242 0.08418 0.23218 0.726798 0.194183 0.54405 0.161728 0.185008 0.035286 0.961902 0.620519 0.088172 0.772811 0.662871 0.521453 0.573453 0.543582 0.351402 0.088673 0.118964 0.439305 0.319569 0.169701 0.330955 0.035214 0.426424 0.299816 0.47462 0.515568 0.284995 0.647645 0.456247 0.273558 0.407573 0.074829 0.819313 0 [...]
+0.422027 0.673314 0.188893 0.081842 0.879899 0.696838 0.097647 0.973263 0.271267 0.169368 0.342559 0.430598 0.16147 0.127614 0.953537 0.517675 0.57353 0.510192 0.343796 0.482598 0.831485 0.504291 0.592302 0.087383 0.775527 0.126651 0.397833 0.05688 0.706617 0.262424 0.447059 0.683654 0.994676 0.269632 0.794165 0.024619 0.136711 0.040881 0.624046 0.437497 0.106308 0.803087 0.064619 0.919411 0.653977 0.56601 0.728239 0.249788 0.715596 0.066896 0.317593 0.282402 0.209235 0.002909 0.46302 0. [...]
+0.143628 0.077382 0.923727 0.771703 0.786939 0.035727 0.452109 0.400857 0.546135 0.623354 0.681309 0.228783 0.77941 0.128164 0.513436 0.595581 0.072579 0.541868 0.739456 0.040997 0.4393 0.608214 0.256303 0.919529 0.237422 0.781985 0.445394 0.260278 0.068491 0.196789 0.652736 0.252707 0.252566 0.832114 0.91197 0.256789 0.015737 0.602429 0.421317 0.293474 0.655919 0.389767 0.632098 0.162819 0.371077 0.916743 0.69211 0.650458 0.462742 0.968991 0.699059 0.741683 0.894626 0.195834 0.632131 0. [...]
+0.43259 0.216629 0.738556 0.09262 0.356517 0.232439 0.758392 0.046027 0.101309 0.911068 0.865093 0.941479 0.4657 0.130527 0.473278 0.832563 0.034193 0.153372 0.497843 0.88882 0.490093 0.014835 0.360404 0.786166 0.090325 0.157003 0.332699 0.862757 0.079452 0.353728 0.929616 0.322295 0.343196 0.8972 0.757964 0.126619 0.059645 0.317593 0.788296 0.975657 0.859707 0.57679 0.294696 0.597681 0.378553 0.240844 0.051489 0.994608 0.760101 0.718179 0.737111 0.329957 0.926674 0.497826 0.76868 0.7771 [...]
+0.06528 0.350808 0.014074 0.663233 0.943933 0.281328 0.249863 0.013392 0.232354 0.424793 0.332306 0.723252 0.316829 0.891905 0.384109 0.057825 0.057223 0.017301 0.749505 0.860815 0.12194 0.976322 0.434566 0.280055 0.46127 0.973382 0.521296 0.662396 0.663206 0.61831 0.743856 0.066752 0.844554 0.705861 0.082095 0.335667 0.245566 0.265983 0.696469 0.991222 0.196048 0.163123 0.121881 0.931312 0.484355 0.148075 0.985573 0.129095 0.895635 0.794734 0.73452 0.665624 0.689562 0.256403 0.433958 0. [...]
+0.830382 0.153742 0.942964 0.274539 0.083147 0.314933 0.634311 0.69786 0.343287 0.847438 0.836143 0.66365 0.175317 0.216921 0.732469 0.293006 0.91398 0.479765 0.698292 0.983161 0.787094 0.846986 0.226437 0.96402 0.644579 0.545232 0.153352 0.755813 0.941625 0.344578 0.158942 0.567337 0.669801 0.239081 0.085044 0.535514 0.238042 0.262594 0.204699 0.509212 0.376801 0.406761 0.878226 0.558078 0.36579 0.518162 0.540346 0.253644 0.591977 0.432791 0.115563 0.180795 0.050109 0.348206 0.033896 0. [...]
+0.205011 0.012954 0.18901 0.729686 0.422512 0.054848 0.012795 0.416573 0.304631 0.469342 0.724068 0.25035 0.013478 0.271128 0.127865 0.032402 0.454992 0.92455 0.462145 0.902414 0.321961 0.49496 0.358922 0.757925 0.113796 0.024786 0.776588 0.465112 0.249787 0.424302 0.524111 0.374232 0.841787 0.340144 0.148551 0.541418 0.626272 0.490288 0.586454 0.973935 0.404635 0.876264 0.118369 0.052184 0.342622 0.131163 0.748046 0.763283 0.420667 0.275867 0.703003 0.072736 0.344695 0.301891 0.157589 0 [...]
+0.171298 0.374853 0.168762 0.247536 0.181515 0.450166 0.813955 0.789219 0.690085 0.186337 0.516032 0.773275 0.593086 0.610419 0.470429 0.301259 0.13515 0.154861 0.524172 0.941685 0.970859 0.376414 0.268451 0.187785 0.755488 0.347638 0.315326 0.726244 0.372565 0.77575 0.634647 0.987363 0.427728 0.342832 0.176451 0.977481 0.698764 0.891105 0.370885 0.163061 0.462934 0.255908 0.40516 0.782142 0.477952 0.616292 0.04244 0.495581 0.300986 0.833125 0.685673 0.584414 0.427946 0.583219 0.472108 0 [...]
+0.057767 0.080629 0.141563 0.081797 0.698964 0.102146 0.686237 0.907908 0.34741 0.976147 0.849276 0.278772 0.316109 0.684363 0.389111 0.368109 0.046856 0.605909 0.907669 0.833528 0.656691 0.930991 0.178073 0.88446 0.193344 0.688338 0.453116 0.266268 0.484579 0.588465 0.242925 0.878846 0.83013 0.320743 0.556854 0.149059 0.766749 0.362295 0.71897 0.451466 0.911823 0.013946 0.567346 0.946589 0.504441 0.511881 0.082107 0.806295 0.102011 0.541858 0.904994 0.467616 0.859729 0.047981 0.425299 0 [...]
+0.860675 0.555242 0.212962 0.587523 0.403832 0.914198 0.732372 0.39772 0.233836 0.312478 0.895046 0.34842 0.740608 0.089005 0.140031 0.31261 0.350247 0.576365 0.391282 0.658897 0.342308 0.244372 0.993063 0.993812 0.490873 0.285449 0.077695 0.888356 0.58933 0.281575 0.869244 0.974228 0.018173 0.63164 0.819371 0.065956 0.789979 0.821903 0.896512 0.489376 0.023156 0.550789 0.414328 0.277726 0.184886 0.26401 0.35915 0.834813 0.194528 0.690668 0.412746 0.402285 0.84836 0.331547 0.154883 0.381 [...]
+0.958521 0.711385 0.813882 0.166534 0.422696 0.047746 0.608019 0.127838 0.32972 0.150115 0.72482 0.14159 0.239272 0.664601 0.838202 0.565404 0.332173 0.924859 0.88341 0.91373 0.092786 0.707868 0.641218 0.383343 0.56042 0.004713 0.610901 0.698821 0.438609 0.041487 0.541344 0.320198 0.135798 0.620724 0.271444 0.284963 0.163663 0.008844 0.487456 0.834012 0.01578 0.594505 0.312481 0.079651 0.283508 0.582714 0.166489 0.608357 0.825224 0.954278 0.723423 0.459176 0.751925 0.652593 0.456614 0.02 [...]
+0.74791 0.139074 0.992249 0.477475 0.673679 0.070801 0.315705 0.977265 0.054305 0.213488 0.971512 0.647533 0.254252 0.370314 0.407338 0.203092 0.626318 0.355959 0.422874 0.593651 0.942585 0.944608 0.155594 0.437705 0.995758 0.105146 0.629929 0.75879 0.325088 0.728142 0.607039 0.378223 0.273388 0.5047 0.244328 0.446462 0.746962 0.690841 0.473285 0.601904 0.84177 0.633127 0.318988 0.141164 0.846933 0.728976 0.431251 0.831085 0.218229 0.555392 0.648652 0.368814 0.758431 0.075185 0.786291 0. [...]
+0.654196 0.304106 0.12138 0.054053 0.606171 0.41633 0.305994 0.41457 0.209665 0.510452 0.900571 0.437976 0.951456 0.034733 0.826333 0.112292 0.597448 0.194476 0.235773 0.251045 0.560874 0.714234 0.544018 0.895221 0.092517 0.707455 0.345284 0.312963 0.759156 0.276547 0.908836 0.917406 0.008772 0.627372 0.4345 0.060368 0.432009 0.621935 0.034953 0.592575 0.210456 0.585275 0.94508 0.008544 0.07241 0.090342 0.61963 0.669257 0.50172 0.347311 0.650454 0.850746 0.006458 0.014984 0.39418 0.93837 [...]
+0.206111 0.242786 0.803047 0.133281 0.540907 0.667797 0.751695 0.157988 0.605936 0.105063 0.217655 0.285734 0.193141 0.665507 0.534663 0.681041 0.09325 0.290204 0.200195 0.811169 0.768027 0.821265 0.575774 0.392858 0.153438 0.266004 0.242015 0.149863 0.614037 0.104525 0.762639 0.804558 0.630076 0.610204 0.08902 0.402846 0.874735 0.323448 0.320199 0.206749 0.673896 0.702932 0.5575 0.04542 0.492553 0.13538 0.101641 0.444449 0.704723 0.905433 0.385929 0.078992 0.116364 0.163346 0.931936 0.4 [...]
+0.353963 0.265384 0.189995 0.389915 0.57553 0.645183 0.544349 0.360687 0.276482 0.54211 0.985564 0.767483 0.764397 0.089794 0.415266 0.657386 0.813025 0.242706 0.391953 0.927427 0.474267 0.149824 0.704335 0.937167 0.734298 0.240302 0.780904 0.364454 0.420318 0.559721 0.66115 0.841117 0.246482 0.75691 0.839515 0.530366 0.850764 0.027463 0.758009 0.819318 0.135621 0.54651 0.167357 0.583056 0.842706 0.525872 0.663265 0.034098 0.217586 0.22755 0.085364 0.002665 0.164678 0.789171 0.306546 0.0 [...]
+0.283601 0.538861 0.987102 0.825719 0.628174 0.081708 0.540328 0.201638 0.54025 0.798655 0.847452 0.703178 0.073991 0.790068 0.560193 0.237338 0.97944 0.664756 0.032539 0.064419 0.855748 0.951053 0.313585 0.720925 0.225268 0.89863 0.257601 0.690713 0.444788 0.382598 0.109093 0.683135 0.558227 0.793169 0.061515 0.256628 0.942931 0.460607 0.346454 0.00414 0.395495 0.321404 0.494864 0.892831 0.859307 0.366971 0.182847 0.206313 0.361852 0.895404 0.806669 0.498444 0.097501 0.018025 0.37434 0. [...]
+0.590381 0.089431 0.354558 0.251987 0.16741 0.840379 0.033271 0.664839 0.761674 0.445803 0.549235 0.112814 0.692373 0.492934 0.942047 0.156492 0.888891 0.991603 0.883944 0.864092 0.641504 0.345942 0.130414 0.874311 0.3247 0.439407 0.38978 0.56142 0.48782 0.309346 0.418499 0.707584 0.922926 0.441713 0.347942 0.928349 0.755606 0.154187 0.023472 0.95822 0.622266 0.805049 0.945796 0.132755 0.966029 0.723877 0.657356 0.325753 0.753919 0.711799 0.261403 0.271259 0.212012 0.750806 0.415693 0.48 [...]
+0.1857 0.276039 0.867972 0.535711 0.232082 0.936583 0.995146 0.206442 0.424643 0.308582 0.408616 0.928516 0.201133 0.764702 0.780367 0.364335 0.17167 0.281891 0.199159 0.450556 0.627906 0.949953 0.136395 0.725143 0.739094 0.214808 0.312926 0.67527 0.649472 0.866843 0.582668 0.313886 0.601931 0.040897 0.287095 0.522762 0.602222 0.896066 0.913496 0.572434 0.084174 0.004514 0.677651 0.976035 0.18917 0.298604 0.723969 0.450596 0.352747 0.461259 0.175303 0.76632 0.824089 0.962631 0.030141 0.7 [...]
+0.546373 0.408488 0.066679 0.946906 0.813594 0.838625 0.762037 0.339981 0.926851 0.303304 0.693267 0.287885 0.680826 0.440826 0.545163 0.08206 0.456135 0.95903 0.404001 0.264001 0.00239 0.504715 0.532407 0.207624 0.944239 0.991328 0.111111 0.616652 0.211063 0.415166 0.152048 0.606153 0.83411 0.402572 0.0387 0.596695 0.217216 0.464634 0.538056 0.761906 0.247657 0.531829 0.698065 0.486679 0.515837 0.170475 0.99924 0.524719 0.300572 0.672104 0.029861 0.872414 0.848209 0.902925 0.623658 0.57 [...]
+0.431507 0.579665 0.762736 0.311992 0.284733 0.943153 0.271288 0.262008 0.249622 0.29794 0.383262 0.55253 0.202448 0.36035 0.530446 0.517334 0.651353 0.60996 0.810649 0.755827 0.134802 0.151625 0.60764 0.246394 0.571568 0.743077 0.41205 0.183926 0.730861 0.916731 0.395652 0.472869 0.429568 0.545086 0.754705 0.739297 0.489425 0.36408 0.073704 0.804456 0.263524 0.139058 0.047687 0.729539 0.721418 0.371314 0.956153 0.037111 0.165211 0.322853 0.270712 0.504574 0.002199 0.086256 0.718551 0.87 [...]
+0.775212 0.964064 0.915525 0.125999 0.084131 0.898226 0.056459 0.869717 0.526686 0.903658 0.295737 0.008736 0.65885 0.303282 0.549561 0.203082 0.388986 0.482393 0.923208 0.546703 0.835397 0.715996 0.809259 0.607937 0.196902 0.292124 0.519109 0.76161 0.685243 0.072839 0.511256 0.682934 0.022485 0.932772 0.668219 0.405846 0.393403 0.449227 0.664343 0.661242 0.209975 0.051201 0.89662 0.459725 0.660348 0.498899 0.91477 0.396204 0.284578 0.666001 0.523065 0.538195 0.4156 0.393213 0.107175 0.2 [...]
+0.173982 0.741764 0.91123 0.34764 0.607187 0.955714 0.843407 0.751944 0.577062 0.428625 0.108035 0.898013 0.22456 0.990821 0.147499 0.264409 0.115903 0.722498 0.490409 0.521393 0.447322 0.746029 0.556534 0.279927 0.542211 0.245901 0.39384 0.635279 0.319071 0.834671 0.300759 0.135291 0.009128 0.259489 0.333318 0.981784 0.795604 0.204954 0.396206 0.001949 0.627863 0.117178 0.762602 0.958896 0.757209 0.949056 0.643041 0.694848 0.54958 0.346004 0.03231 0.767969 0.850922 0.390767 0.451656 0.0 [...]
+0.428214 0.967442 0.489785 0.923498 0.058183 0.478555 0.620592 0.674506 0.480333 0.460207 0.937789 0.024288 0.980141 0.106341 0.280577 0.158453 0.044912 0.457276 0.104881 0.508394 0.622431 0.3706 0.133495 0.846925 0.983943 0.458829 0.456424 0.396334 0.81886 0.894446 0.042545 0.651745 0.770311 0.442309 0.337689 0.900953 0.370697 0.314673 0.846245 0.19291 0.523048 0.322385 0.71107 0.126539 0.548307 0.54388 0.655936 0.898425 0.282582 0.181076 0.468727 0.237212 0.814413 0.641105 0.398811 0.6 [...]
+0.248721 0.122385 0.396869 0.798048 0.693963 0.810002 0.221352 0.258486 0.038034 0.868472 0.671631 0.131602 0.492302 0.0168 0.544149 0.278536 0.746892 0.13278 0.50631 0.166822 0.874581 0.341854 0.680137 0.543473 0.591698 0.267267 0.593765 0.506831 0.793282 0.904469 0.622873 0.611949 0.837275 0.472411 0.229542 0.030071 0.902882 0.14685 0.186127 0.450303 0.181232 0.877844 0.500267 0.317114 0.4128 0.117684 0.194169 0.201612 0.041957 0.520762 0.116142 0.135243 0.190501 0.612302 0.825549 0.93 [...]
+0.892341 0.001801 0.188376 0.104779 0.728269 0.697019 0.571842 0.662851 0.580007 0.793228 0.880707 0.771082 0.634383 0.749809 0.012321 0.355029 0.746514 0.115614 0.152276 0.493245 0.691257 0.305257 0.225192 0.735759 0.498426 0.99025 0.345618 0.134149 0.932063 0.545418 0.899591 0.782609 0.466946 0.629457 0.422218 0.519898 0.427304 0.902394 0.085548 0.775814 0.976795 0.439531 0.714118 0.559032 0.057556 0.960627 0.720363 0.81876 0.768916 0.167486 0.302001 0.45908 0.187213 0.218971 0.762714  [...]
+0.272448 0.503908 0.705211 0.092536 0.820785 0.158771 0.049041 0.573604 0.320458 0.626613 0.417507 0.709345 0.029601 0.18053 0.172169 0.355335 0.035952 0.110581 0.454673 0.70728 0.149397 0.383691 0.835173 0.786383 0.330166 0.468426 0.88813 0.599022 0.812031 0.099796 0.503466 0.264783 0.373187 0.287138 0.880617 0.39817 0.468992 0.985792 0.976223 0.377805 0.135959 0.34955 0.303663 0.558834 0.808437 0.857326 0.878745 0.374072 0.548819 0.866471 0.402566 0.302175 0.478852 0.194796 0.357737 0. [...]
+0.608252 0.820685 0.023658 0.89476 0.20849 0.025373 0.337144 0.685544 0.783544 0.710882 0.141948 0.127516 0.068048 0.655344 0.670716 0.423261 0.786645 0.339477 0.74836 0.39696 0.914121 0.332906 0.315787 0.848876 0.424162 0.582823 0.580831 0.476031 0.044067 0.485417 0.886682 0.907653 0.007359 0.926249 0.358439 0.836127 0.873914 0.619998 0.277822 0.705799 0.036051 0.659015 0.43671 0.760983 0.764041 0.402911 0.351866 0.804907 0.170903 0.511077 0.705446 0.177501 0.080543 0.676708 0.487224 0. [...]
+0.819526 0.781564 0.317117 0.444576 0.094225 0.731131 0.389845 0.572879 0.268789 0.548159 0.97843 0.589933 0.842445 0.463073 0.05908 0.119107 0.123432 0.433458 0.975907 0.300164 0.992572 0.53837 0.729209 0.266827 0.459084 0.700384 0.278524 0.268952 0.029303 0.740328 0.740721 0.790535 0.864275 0.150633 0.194489 0.988201 0.264787 0.579996 0.017809 0.576555 0.597414 0.342315 0.140441 0.760585 0.50861 0.659375 0.027013 0.795235 0.845629 0.78997 0.61145 0.321779 0.674301 0.678298 0.358931 0.6 [...]
+0.456455 0.614402 0.95055 0.837965 0.813442 0.799242 0.955987 0.354286 0.74265 0.814274 0.442778 0.894312 0.931556 0.253987 0.510712 0.897683 0.89473 0.475402 0.25681 0.350751 0.118557 0.772225 0.842871 0.217541 0.528584 0.056912 0.014433 0.692144 0.734155 0.927756 0.659248 0.289195 0.30179 0.28255 0.280596 0.405867 0.578513 0.583006 0.518245 0.504099 0.423025 0.083192 0.783141 0.818163 0.579055 0.377748 0.880487 0.5772 0.244146 0.683097 0.74313 0.389524 0.293954 0.83159 0.409472 0.98388 [...]
+0.664452 0.95455 0.298663 0.164012 0.505839 0.668555 0.858083 0.541517 0.782314 0.554 0.455829 0.532878 0.67299 0.323542 0.387525 0.983521 0.618348 0.843135 0.28759 0.708204 0.60816 0.848313 0.809769 0.189487 0.097913 0.668365 0.912705 0.11231 0.213185 0.654529 0.781052 0.616244 0.878264 0.503181 0.900032 0.832548 0.053187 0.580392 0.443706 0.841082 0.08132 0.825646 0.894544 0.734314 0.326169 0.908284 0.96621 0.843979 0.15874 0.713713 0.262858 0.999384 0.468239 0.424264 0.919634 0.443646 [...]
+0.811798 0.466198 0.084148 0.169795 0.250636 0.789707 0.496545 0.265289 0.352015 0.887652 0.036864 0.43214 0.359548 0.494383 0.438141 0.324557 0.767915 0.468132 0.172503 0.266355 0.615526 0.216994 0.457549 0.313962 0.904978 0.027085 0.773914 0.011861 0.007719 0.95104 0.519842 0.223211 0.044752 0.706737 0.326853 0.354222 0.760967 0.938942 0.203842 0.788218 0.531218 0.460343 0.881803 0.325957 0.515806 0.606213 0.981649 0.918649 0.161535 0.90264 0.151804 0.229315 0.40108 0.942803 0.327847 0 [...]
+0.247961 0.35183 0.977735 0.672486 0.935348 0.600333 0.748957 0.428919 0.876205 0.783627 0.218857 0.771169 0.147671 0.200526 0.397813 0.896887 0.509694 0.367276 0.84833 0.88014 0.343107 0.077804 0.912202 0.569898 0.996085 0.036444 0.651687 0.895912 0.192771 0.237165 0.084873 0.16997 0.887331 0.727561 0.065176 0.693435 0.461356 0.133191 0.915935 0.391339 0.987827 0.612561 0.771393 0.748092 0.591373 0.977991 0.917878 0.591272 0.87842 0.460506 0.099902 0.500838 0.079283 0.190368 0.80792 0.9 [...]
+0.598408 0.455444 0.673238 0.663439 0.569272 0.085233 0.079091 0.338855 0.267507 0.198059 0.954283 0.513837 0.713278 0.875284 0.101341 0.888012 0.482233 0.592059 0.778886 0.837383 0.210403 0.694994 0.737937 0.009803 0.399022 0.639371 0.651163 0.946106 0.157933 0.519908 0.763733 0.971186 0.893079 0.975671 0.766645 0.740012 0.992502 0.173296 0.380072 0.478541 0.985483 0.338559 0.720034 0.672694 0.810149 0.72945 0.430427 0.310712 0.390163 0.974765 0.978174 0.89405 0.97457 0.13051 0.752438 0 [...]
+0.268524 0.566483 0.351789 0.144064 0.203412 0.644352 0.590025 0.394176 0.467909 0.744649 0.579308 0.439266 0.879447 0.931317 0.202645 0.802963 0.097924 0.256355 0.313555 0.346711 0.189437 0.036679 0.199682 0.752925 0.908731 0.024429 0.737643 0.881663 0.464735 0.861663 0.089898 0.360055 0.176047 0.23973 0.315141 0.028709 0.916785 0.940444 0.247959 0.897989 0.48008 0.155881 0.298865 0.062672 0.08592 0.09794 0.831414 0.768315 0.59496 0.87908 0.509716 0.312366 0.529496 0.222175 0.47508 0.50 [...]
+0.425598 0.391167 0.309921 0.334336 0.1487 0.71048 0.022614 0.705585 0.695318 0.297365 0.692991 0.635768 0.442309 0.963165 0.514329 0.217727 0.182067 0.7471 0.306516 0.799316 0.866959 0.753805 0.072011 0.61608 0.464107 0.482556 0.270639 0.535819 0.123515 0.80642 0.531313 0.750924 0.567528 0.020035 0.932032 0.549247 0.270567 0.205554 0.057486 0.201052 0.268232 0.988459 0.27266 0.909971 0.35099 0.485994 0.925273 0.212696 0.565845 0.488115 0.428846 0.9089 0.884739 0.322068 0.405172 0.056038 [...]
+0.932608 0.407351 0.883468 0.596448 0.85819 0.912979 0.527608 0.225373 0.238075 0.384093 0.119752 0.174807 0.956646 0.614049 0.338197 0.78469 0.953859 0.885736 0.203213 0.997859 0.606847 0.214541 0.674828 0.64687 0.854669 0.62338 0.064761 0.315143 0.057681 0.146682 0.348984 0.093978 0.593636 0.122784 0.359613 0.188914 0.405255 0.219869 0.112638 0.81062 0.169243 0.808377 0.685194 0.694659 0.420897 0.126363 0.514827 0.983654 0.692679 0.419413 0.680303 0.969736 0.557193 0.280656 0.527294 0. [...]
+0.133317 0.879545 0.803899 0.500814 0.506987 0.256183 0.869908 0.610426 0.648361 0.592455 0.967563 0.500221 0.901151 0.041537 0.504468 0.853177 0.62687 0.504289 0.233659 0.959671 0.739745 0.679569 0.543877 0.990549 0.080757 0.779725 0.139703 0.853046 0.304005 0.394986 0.067326 0.252131 0.939153 0.228817 0.819412 0.687935 0.152653 0.619721 0.817991 0.073573 0.384365 0.771816 0.535867 0.009716 0.993399 0.495662 0.002773 0.841266 0.399175 0.600868 0.046235 0.041459 0.523871 0.373548 0.29201 [...]
+0.878063 0.951076 0.010878 0.277501 0.389874 0.545006 0.940443 0.393956 0.046643 0.236846 0.451114 0.607541 0.618147 0.041544 0.034048 0.44363 0.638234 0.095855 0.735769 0.26519 0.175726 0.712683 0.669494 0.15303 0.111401 0.902878 0.28455 0.101524 0.974565 0.563453 0.649616 0.868238 0.481797 0.590246 0.246377 0.097918 0.037939 0.072482 0.032311 0.706682 0.701213 0.239366 0.578316 0.794332 0.021535 0.728558 0.744879 0.591745 0.419309 0.135008 0.344301 0.083286 0.745983 0.503481 0.610024 0 [...]
+0.282242 0.27883 0.047891 0.027766 0.757328 0.186018 0.999737 0.181767 0.514506 0.620146 0.660922 0.039769 0.74206 0.741128 0.523964 0.033236 0.572504 0.372828 0.778952 0.729532 0.027319 0.473377 0.445137 0.108272 0.310007 0.658639 0.707829 0.582365 0.755196 0.037994 0.813907 0.854272 0.88522 0.058246 0.697373 0.754797 0.637121 0.389662 0.661133 0.2477 0.017708 0.434304 0.900269 0.739562 0.435409 0.267271 0.663149 0.77974 0.592965 0.888214 0.481445 0.147342 0.267261 0.259471 0.923556 0.8 [...]
+0.921879 0.146282 0.243021 0.069572 0.012508 0.064561 0.382413 0.273617 0.764948 0.324809 0.626395 0.180905 0.329943 0.042088 0.949973 0.200038 0.048587 0.21857 0.243622 0.44021 0.715471 0.150678 0.021488 0.157399 0.926657 0.490726 0.569229 0.301829 0.528115 0.267397 0.718161 0.564573 0.024692 0.832384 0.936749 0.56283 0.16191 0.294554 0.172723 0.332957 0.843447 0.371262 0.360566 0.728153 0.111606 0.34985 0.808221 0.358635 0.154568 0.772309 0.956515 0.886159 0.572695 0.704169 0.508217 0. [...]
+0.094929 0.379475 0.159936 0.961612 0.606141 0.029459 0.371453 0.147861 0.135001 0.384143 0.323522 0.023602 0.51022 0.195694 0.132513 0.888634 0.690007 0.507386 0.247248 0.54802 0.605458 0.41957 0.482081 0.970578 0.616098 0.815976 0.377128 0.697051 0.875456 0.45434 0.55464 0.881175 0.341924 0.719295 0.471274 0.269063 0.85225 0.38566 0.815408 0.854945 0.354448 0.451809 0.56128 0.226881 0.45219 0.567243 0.400478 0.269316 0.592147 0.217435 0.058328 0.168833 0.783632 0.73791 0.284695 0.11674 [...]
+0.749684 0.06319 0.732292 0.748151 0.457034 0.416521 0.027159 0.709961 0.830416 0.143375 0.436745 0.956436 0.799568 0.805793 0.885507 0.191877 0.796296 0.833155 0.682569 0.767783 0.129466 0.62282 0.514061 0.679261 0.400028 0.614393 0.203544 0.276526 0.334383 0.992897 0.069647 0.334981 0.488252 0.112185 0.13003 0.804995 0.704458 0.89236 0.549392 0.68193 0.590783 0.421123 0.714426 0.014271 0.197268 0.756973 0.05179 0.170737 0.118696 0.77699 0.199417 0.0511 0.57553 0.496539 0.676177 0.13880 [...]
+0.155647 0.73726 0.066402 0.039801 0.149314 0.827044 0.440464 0.406331 0.884638 0.723852 0.451851 0.585687 0.2583 0.30191 0.998534 0.288373 0.264038 0.237171 0.787089 0.302536 0.377959 0.896235 0.759096 0.666571 0.376666 0.315277 0.980753 0.043679 0.010481 0.272133 0.678355 0.104769 0.879618 0.261566 0.470954 0.884192 0.640076 0.809617 0.519234 0.370748 0.573091 0.151993 0.655097 0.449992 0.153178 0.273207 0.085248 0.309816 0.391798 0.152004 0.015914 0.339962 0.609716 0.724431 0.282551 0 [...]
+0.665602 0.853811 0.028893 0.559631 0.739848 0.791963 0.56067 0.135742 0.464967 0.993598 0.569183 0.363055 0.236771 0.036652 0.169134 0.798186 0.937302 0.199688 0.961465 0.009807 0.669907 0.81357 0.134199 0.356453 0.325171 0.612599 0.090314 0.043317 0.626533 0.351659 0.300512 0.30669 0.992872 0.436214 0.762002 0.014127 0.783205 0.915504 0.149532 0.768057 0.657785 0.239619 0.913143 0.856624 0.402222 0.419478 0.932709 0.913398 0.306881 0.262628 0.574005 0.754463 0.702899 0.806333 0.744255  [...]
+0.888373 0.306063 0.95884 0.309922 0.449284 0.14651 0.946904 0.19957 0.407078 0.250397 0.471602 0.689809 0.841467 0.358661 0.263709 0.909288 0.487227 0.521751 0.988464 0.70723 0.311606 0.469245 0.716434 0.939827 0.953313 0.415065 0.855796 0.56422 0.539437 0.867792 0.25991 0.940431 0.98752 0.078498 0.26436 0.781762 0.866556 0.820482 0.452616 0.272547 0.274109 0.770219 0.510422 0.554154 0.019872 0.083199 0.250895 0.405482 0.754898 0.560189 0.741192 0.128743 0.878688 0.128488 0.606647 0.529 [...]
+0.944342 0.207222 0.692206 0.16266 0.219257 0.16811 0.427201 0.925426 0.532082 0.734852 0.09553 0.045802 0.777739 0.463457 0.808235 0.72852 0.393949 0.621555 0.866015 0.09656 0.318504 0.075828 0.756251 0.798541 0.376947 0.924717 0.073829 0.571993 0.076189 0.399346 0.251355 0.867231 0.507608 0.786432 0.836016 0.163263 0.074525 0.498295 0.163185 0.408319 0.250267 0.251022 0.22039 0.861622 0.955361 0.977269 0.041236 0.098883 0.293354 0.898847 0.542293 0.873558 0.781068 0.565639 0.925816 0.9 [...]
+0.04514 0.181396 0.563283 0.669337 0.083864 0.372333 0.040133 0.628739 0.643507 0.230016 0.644651 0.479228 0.400188 0.534909 0.885837 0.172312 0.901588 0.272546 0.024243 0.415814 0.908606 0.459349 0.398201 0.432276 0.647165 0.829488 0.903594 0.821362 0.342515 0.339593 0.183321 0.426361 0.90733 0.550557 0.26816 0.21635 0.576839 0.192841 0.811252 0.399482 0.624717 0.385722 0.333323 0.736203 0.586024 0.337697 0.012814 0.699429 0.995268 0.837771 0.436605 0.324691 0.856501 0.773953 0.940374 0 [...]
+0.72213 0.934483 0.275827 0.124153 0.887264 0.07061 0.733978 0.916258 0.228163 0.395456 0.156413 0.140229 0.53197 0.386263 0.146853 0.939097 0.739675 0.23749 0.364762 0.826805 0.528178 0.876327 0.181645 0.012441 0.017291 0.990505 0.888765 0.76088 0.271227 0.958504 0.654034 0.648129 0.793874 0.57408 0.141403 0.882897 0.237195 0.808956 0.831576 0.083077 0.988986 0.164287 0.158649 0.064589 0.149471 0.433069 0.961003 0.321804 0.212587 0.593066 0.74095 0.805724 0.303597 0.665098 0.641959 0.34 [...]
+0.526544 0.483175 0.857281 0.270834 0.944687 0.688406 0.902585 0.898062 0.866287 0.505862 0.398195 0.977176 0.034304 0.34709 0.273774 0.629913 0.32746 0.111277 0.600553 0.282131 0.125907 0.476856 0.555035 0.239596 0.989312 0.184687 0.313448 0.935498 0.606073 0.183312 0.661888 0.887327 0.329757 0.209216 0.224931 0.689531 0.679419 0.903793 0.14696 0.160986 0.872464 0.561578 0.980222 0.164386 0.647483 0.935693 0.502729 0.819358 0.015443 0.929667 0.668433 0.667628 0.36643 0.514135 0.098329 0 [...]
+0.671233 0.827119 0.147977 0.226453 0.50026 0.002436 0.985614 0.239622 0.600566 0.574784 0.16785 0.159206 0.62668 0.809342 0.979085 0.207211 0.815742 0.457008 0.592615 0.175098 0.6669 0.159551 0.964096 0.378601 0.480496 0.290161 0.84994 0.885309 0.774052 0.017304 0.13274 0.349679 0.287988 0.830099 0.510389 0.452028 0.651398 0.682138 0.087857 0.812432 0.235489 0.752694 0.907558 0.535063 0.770008 0.112986 0.337918 0.468571 0.685485 0.682425 0.143839 0.01592 0.766321 0.290166 0.31416 0.3988 [...]
+0.619995 0.298998 0.575773 0.039661 0.577069 0.737412 0.503238 0.5573 0.63277 0.596696 0.166676 0.071859 0.290513 0.562677 0.085293 0.997493 0.445747 0.145145 0.36662 0.199315 0.364564 0.148342 0.956695 0.828856 0.220514 0.664164 0.457429 0.793129 0.428413 0.739356 0.031758 0.142567 0.932056 0.866331 0.440039 0.509095 0.120387 0.439634 0.38559 0.292896 0.345866 0.257518 0.576123 0.084292 0.560921 0.265373 0.625833 0.084964 0.910577 0.300007 0.732553 0.443653 0.276021 0.223909 0.042624 0. [...]
+0.7992 0.590984 0.139515 0.187211 0.218036 0.767361 0.639327 0.437959 0.533271 0.043329 0.401043 0.620832 0.562434 0.731922 0.99714 0.330043 0.072537 0.599853 0.974141 0.585105 0.453564 0.417149 0.929957 0.744564 0.794428 0.67207 0.139281 0.60664 0.794036 0.54196 0.102961 0.789038 0.48233 0.996688 0.962852 0.993184 0.753794 0.615358 0.945161 0.221463 0.157276 0.663637 0.1701 0.66811 0.647115 0.838072 0.643842 0.631209 0.565317 0.168586 0.853562 0.035252 0.781623 0.989306 0.978973 0.39483 [...]
+0.881253 0.870811 0.456131 0.190155 0.5417 0.175095 0.443199 0.737311 0.575581 0.445843 0.84454 0.923258 0.483672 0.733849 0.846662 0.31624 0.648664 0.724445 0.297704 0.315307 0.770861 0.679057 0.474995 0.727944 0.752898 0.670546 0.296481 0.361889 0.008313 0.570887 0.858079 0.364174 0.360412 0.337633 0.841746 0.400464 0.460594 0.285687 0.421052 0.218189 0.308088 0.540851 0.805141 0.397559 0.401374 0.938482 0.238515 0.533928 0.667742 0.567885 0.339831 0.840065 0.624377 0.645294 0.427809 0 [...]
+0.368234 0.233014 0.631967 0.06632 0.210481 0.796129 0.266646 0.337607 0.862034 0.907435 0.996073 0.817139 0.541497 0.950306 0.458982 0.988548 0.821319 0.324558 0.345804 0.356325 0.051019 0.863648 0.612393 0.665924 0.83513 0.908178 0.647846 0.71024 0.310356 0.101376 0.931071 0.429706 0.993806 0.836241 0.253242 0.657144 0.344292 0.784847 0.44527 0.358216 0.057864 0.231913 0.71742 0.884325 0.285758 0.836788 0.630732 0.59223 0.521521 0.462414 0.450454 0.390257 0.593894 0.139181 0.61783 0.63 [...]
+0.673656 0.98588 0.217936 0.789718 0.134736 0.051795 0.667715 0.449806 0.41608 0.046352 0.05492 0.461256 0.42796 0.961951 0.792091 0.086714 0.397402 0.751712 0.841016 0.355683 0.024128 0.757321 0.445874 0.210194 0.897991 0.479995 0.174316 0.627613 0.175117 0.175254 0.839379 0.643068 0.63364 0.340908 0.444514 0.987998 0.296235 0.364995 0.268182 0.503013 0.384597 0.712488 0.421398 0.54862 0.093014 0.298668 0.122469 0.198855 0.149593 0.92399 0.175042 0.254899 0.790849 0.227978 0.408747 0.91 [...]
+0.979487 0.778548 0.535407 0.360927 0.727897 0.500719 0.341825 0.603243 0.252127 0.619665 0.668762 0.254793 0.722105 0.356814 0.524025 0.609535 0.94129 0.198874 0.807001 0.860396 0.723599 0.192798 0.088657 0.165616 0.112597 0.359739 0.640828 0.308691 0.218247 0.160829 0.555322 0.466681 0.262274 0.507838 0.310078 0.868569 0.283841 0.808397 0.211478 0.788827 0.931424 0.711867 0.042016 0.660937 0.296578 0.410882 0.772355 0.646179 0.854025 0.697383 0.510298 0.065337 0.512446 0.089352 0.72513 [...]
+0.751075 0.404417 0.286955 0.044609 0.519428 0.213549 0.688898 0.544912 0.576709 0.013426 0.898788 0.935648 0.732786 0.105049 0.509616 0.415178 0.892703 0.01587 0.224154 0.238838 0.582465 0.145398 0.143351 0.248946 0.368258 0.654171 0.522757 0.658925 0.906479 0.84579 0.490367 0.892619 0.873629 0.351045 0.480543 0.833152 0.008864 0.400675 0.196695 0.275062 0.817988 0.730319 0.588998 0.75812 0.823935 0.252539 0.864708 0.557111 0.224605 0.726218 0.581468 0.805704 0.112059 0.471474 0.361163  [...]
+0.720576 0.491205 0.640619 0.373806 0.545987 0.335806 0.345548 0.877355 0.797511 0.863806 0.055897 0.549522 0.373359 0.514608 0.342623 0.880554 0.101669 0.948212 0.043081 0.334809 0.339977 0.769249 0.817142 0.109134 0.993115 0.423211 0.072593 0.086545 0.924901 0.761158 0.931268 0.968271 0.512924 0.098133 0.717692 0.072667 0.111326 0.470025 0.215128 0.954832 0.345528 0.218324 0.424805 0.724014 0.25084 0.161909 0.271211 0.531464 0.796563 0.984208 0.494489 0.255275 0.139284 0.598068 0.33188 [...]
+0.142615 0.029739 0.055188 0.035097 0.354357 0.284133 0.887076 0.35333 0.111187 0.757212 0.210086 0.904693 0.363194 0.036906 0.305125 0.580983 0.378937 0.848336 0.932456 0.24636 0.864335 0.428263 0.898588 0.320447 0.943159 0.052072 0.565568 0.200367 0.547928 0.432746 0.336899 0.303478 0.604795 0.2648 0.494908 0.083565 0.330719 0.14686 0.882305 0.200113 0.019554 0.51794 0.29329 0.214616 0.556469 0.74511 0.421221 0.981034 0.504828 0.3913 0.444648 0.991189 0.801847 0.331272 0.455823 0.83111 [...]
+0.221181 0.77856 0.39302 0.901186 0.379485 0.789453 0.579089 0.391591 0.397657 0.811224 0.033926 0.256691 0.406382 0.225851 0.585709 0.693647 0.524815 0.826772 0.250193 0.02428 0.62863 0.130228 0.677837 0.215963 0.918532 0.804849 0.737474 0.090008 0.822876 0.83259 0.977992 0.409299 0.704384 0.593429 0.296432 0.617649 0.795201 0.635346 0.556821 0.619126 0.874707 0.230885 0.958495 0.065468 0.573671 0.994174 0.757781 0.315309 0.126236 0.906434 0.623464 0.393191 0.347761 0.538385 0.515051 0. [...]
+0.588378 0.448169 0.961353 0.772228 0.242312 0.41291 0.584246 0.618633 0.55864 0.409519 0.535903 0.831088 0.384948 0.601998 0.224338 0.184524 0.631785 0.198545 0.558585 0.020465 0.271426 0.574508 0.069393 0.875084 0.549683 0.589069 0.5622 0.434209 0.566492 0.576761 0.060388 0.081169 0.896866 0.771208 0.918866 0.063782 0.065751 0.332342 0.987745 0.002547 0.126978 0.70419 0.642226 0.833194 0.208274 0.822897 0.764944 0.246753 0.558383 0.368304 0.018263 0.460691 0.583275 0.35953 0.305244 0.7 [...]
+0.193906 0.313024 0.729722 0.864279 0.576551 0.116574 0.173697 0.805101 0.802307 0.22874 0.18395 0.032409 0.429895 0.108459 0.832713 0.763332 0.201966 0.267499 0.329399 0.803369 0.663805 0.73756 0.400355 0.555872 0.665169 0.43964 0.35106 0.672959 0.06347 0.385883 0.288023 0.209707 0.365384 0.77359 0.535467 0.096415 0.7442 0.938986 0.632324 0.038821 0.004729 0.398202 0.518026 0.275467 0.204073 0.694571 0.161402 0.208486 0.822951 0.390761 0.440263 0.126082 0.662811 0.95279 0.005063 0.54646 [...]
+0.243771 0.309231 0.097096 0.578304 0.332697 0.418808 0.792257 0.740596 0.469639 0.512707 0.683733 0.187718 0.156923 0.697854 0.829007 0.645709 0.509776 0.770414 0.539827 0.722533 0.545024 0.804156 0.379982 0.958502 0.831327 0.244988 0.074707 0.137549 0.88568 0.410755 0.577984 0.088239 0.624299 0.131079 0.249482 0.067653 0.389015 0.272783 0.464238 0.505344 0.94302 0.922045 0.928471 0.229047 0.045766 0.814859 0.803403 0.928024 0.72221 0.899912 0.986881 0.196198 0.147605 0.454633 0.462021  [...]
+0.273565 0.454075 0.525005 0.772328 0.914666 0.47923 0.327746 0.731955 0.433798 0.949634 0.103625 0.431214 0.557352 0.956065 0.07898 0.365689 0.863168 0.649787 0.440546 0.307929 0.444785 0.377749 0.735085 0.833988 0.245215 0.336637 0.406725 0.089333 0.72095 0.341308 0.798532 0.979349 0.407204 0.170115 0.839637 0.881266 0.321087 0.81847 0.39962 0.205893 0.625969 0.671819 0.607361 0.214768 0.742438 0.624309 0.274052 0.153342 0.917743 0.93528 0.942882 0.210749 0.784051 0.080765 0.55179 0.56 [...]
+0.739801 0.855722 0.48482 0.726097 0.164891 0.954732 0.876457 0.686662 0.425803 0.059794 0.953795 0.271282 0.514757 0.184916 0.614476 0.990622 0.232845 0.706743 0.299229 0.72661 0.100914 0.947974 0.449893 0.234515 0.797959 0.466953 0.619982 0.141304 0.862558 0.967361 0.144442 0.16084 0.471633 0.42009 0.65208 0.313634 0.902748 0.162582 0.059847 0.619093 0.429738 0.501757 0.982631 0.208194 0.08914 0.601478 0.963182 0.714627 0.878533 0.648642 0.354185 0.31807 0.603203 0.843426 0.721757 0.61 [...]
+0.138291 0.869367 0.195266 0.800494 0.467467 0.841602 0.584592 0.11639 0.096077 0.675284 0.977906 0.336546 0.571782 0.741414 0.16784 0.416498 0.112295 0.689053 0.750523 0.149283 0.195049 0.181169 0.805615 0.941943 0.46388 0.391402 0.114675 0.240434 0.143781 0.241014 0.08261 0.239314 0.742495 0.477785 0.114372 0.72311 0.185254 0.778385 0.272516 0.057458 0.697652 0.855955 0.843275 0.199172 0.476733 0.861729 0.64878 0.355433 0.439636 0.639242 0.746016 0.866091 0.02723 0.789373 0.62706 0.199 [...]
+0.063691 0.852933 0.778807 0.153337 0.640138 0.643859 0.664109 0.220131 0.409258 0.833886 0.01053 0.42376 0.789582 0.42865 0.671346 0.491233 0.196287 0.172993 0.484794 0.832566 0.829549 0.538998 0.523621 0.932443 0.786586 0.608889 0.764497 0.217265 0.553675 0.87203 0.36468 0.643887 0.343186 0.947415 0.811763 0.935761 0.877033 0.161645 0.135003 0.51882 0.601686 0.627709 0.63671 0.482158 0.913983 0.481251 0.741159 0.365178 0.37868 0.641327 0.333907 0.829578 0.35319 0.135705 0.871279 0.0983 [...]
+0.025083 0.307168 0.331121 0.576642 0.903958 0.110849 0.011218 0.267306 0.852867 0.747958 0.835523 0.024304 0.444659 0.266821 0.27315 0.472955 0.194634 0.407013 0.062316 0.986432 0.913149 0.449257 0.71381 0.942951 0.691897 0.255251 0.967186 0.786991 0.845476 0.827209 0.01579 0.904103 0.263215 0.128476 0.9828 0.623599 0.094441 0.723516 0.094442 0.656956 0.789053 0.151312 0.395526 0.35159 0.074809 0.268018 0.722326 0.564514 0.508712 0.279591 0.68613 0.305036 0.848294 0.745696 0.781762 0.26 [...]
+0.916954 0.222943 0.241991 0.406654 0.339029 0.105055 0.742029 0.673559 0.140901 0.774232 0.271138 0.190254 0.351861 0.268208 0.578788 0.474586 0.19773 0.754821 0.829097 0.31163 0.246265 0.511689 0.717102 0.20421 0.359801 0.086746 0.599269 0.76543 0.487471 0.140232 0.367264 0.93581 0.911119 0.145508 0.586975 0.485839 0.312033 0.122899 0.499793 0.506402 0.595611 0.464775 0.14966 0.942211 0.801613 0.326612 0.867927 0.046421 0.03701 0.767532 0.63082 0.100863 0.450937 0.441543 0.9859 0.79656 [...]
+0.762579 0.338752 0.253015 0.281214 0.196586 0.280363 0.501388 0.023923 0.583502 0.991781 0.863886 0.298962 0.19491 0.472914 0.286101 0.880252 0.463876 0.731844 0.194407 0.56135 0.793317 0.19307 0.187413 0.036175 0.714229 0.500474 0.223076 0.154402 0.342738 0.079538 0.090608 0.241614 0.431409 0.008351 0.401183 0.191659 0.477208 0.770993 0.74825 0.922215 0.769824 0.955221 0.183329 0.48734 0.434138 0.20821 0.140265 0.696036 0.386461 0.176866 0.353667 0.691741 0.025937 0.253286 0.17179 0.45 [...]
+0.035799 0.143902 0.534658 0.575637 0.393868 0.592546 0.151619 0.246298 0.426601 0.520557 0.806284 0.895842 0.40477 0.495261 0.056029 0.686767 0.174771 0.990034 0.26179 0.08159 0.117233 0.726363 0.177128 0.208569 0.393701 0.025945 0.992615 0.859948 0.311642 0.212956 0.714332 0.852488 0.95653 0.676857 0.089912 0.685777 0.904993 0.571338 0.189639 0.93955 0.672032 0.96518 0.553767 0.887901 0.05773 0.032454 0.333008 0.227122 0.518506 0.364905 0.33975 0.811823 0.757623 0.200333 0.354218 0.947 [...]
+0.421259 0.466643 0.756099 0.809662 0.65974 0.240698 0.985984 0.342177 0.299267 0.154623 0.577228 0.667728 0.997508 0.025993 0.359965 0.222154 0.701034 0.109252 0.487527 0.946769 0.920539 0.4055 0.734616 0.178629 0.871003 0.487291 0.031532 0.609109 0.81818 0.158708 0.813182 0.475941 0.244597 0.900625 0.649664 0.855056 0.057084 0.921737 0.415118 0.560157 0.118285 0.955234 0.848311 0.603739 0.349523 0.49029 0.831162 0.668722 0.601033 0.703484 0.507867 0.659816 0.002894 0.886623 0.310963 0. [...]
+0.579907 0.423425 0.022808 0.782932 0.209491 0.946586 0.666801 0.445213 0.663497 0.593071 0.256665 0.033358 0.346291 0.946362 0.163939 0.848295 0.004493 0.985766 0.576968 0.285867 0.927102 0.729055 0.04971 0.46016 0.521282 0.595563 0.389311 0.326445 0.700339 0.620657 0.526305 0.727717 0.648239 0.524871 0.786953 0.274411 0.406581 0.498958 0.577712 0.152963 0.663038 0.751163 0.334384 0.314652 0.224507 0.099561 0.609033 0.193376 0.689505 0.801153 0.999961 0.442394 0.911503 0.882542 0.786778 [...]
+0.025748 0.718774 0.236847 0.205 0.065024 0.853026 0.664718 0.305192 0.92794 0.006253 0.499834 0.759903 0.048216 0.064793 0.984787 0.857426 0.0714 0.59295 0.12512 0.180531 0.658147 0.493445 0.227686 0.194162 0.0055 0.735645 0.665803 0.562432 0.337977 0.927841 0.650178 0.118488 0.329979 0.37684 0.369985 0.812436 0.06136 0.98813 0.112897 0.581349 0.424868 0.7887 0.596382 0.984634 0.533075 0.784999 0.258596 0.056138 0.348476 0.641005 0.085052 0.991683 0.109299 0.261281 0.162714 0.466242 0.1 [...]
+0.122482 0.991832 0.856392 0.298996 0.372157 0.021396 0.811962 0.075383 0.743499 0.286257 0.447201 0.013916 0.605449 0.235516 0.222883 0.607866 0.613223 0.002601 0.384135 0.337937 0.63277 0.370282 0.648853 0.92705 0.886175 0.475048 0.037721 0.023657 0.304059 0.747247 0.004133 0.786435 0.286689 0.790259 0.481034 0.835261 0.057179 0.988381 0.951231 0.360289 0.354805 0.271781 0.220601 0.227422 0.637638 0.186151 0.221584 0.138214 0.757001 0.097121 0.644034 0.252135 0.335817 0.088775 0.428031 [...]
+0.626965 0.08368 0.114094 0.016854 0.285957 0.914491 0.105711 0.030762 0.463105 0.198112 0.314946 0.96545 0.441041 0.798386 0.632267 0.250223 0.38439 0.599449 0.329674 0.25765 0.65927 0.159418 0.806736 0.395999 0.000969 0.893579 0.499365 0.543822 0.729589 0.970573 0.865939 0.788444 0.713768 0.333596 0.410277 0.106189 0.529833 0.58537 0.707107 0.114006 0.65765 0.994557 0.370257 0.882868 0.722518 0.774376 0.262985 0.897531 0.117716 0.708723 0.664208 0.240489 0.654423 0.890291 0.289677 0.09 [...]
+0.818621 0.816117 0.950782 0.219762 0.246234 0.291392 0.263958 0.637757 0.637504 0.330997 0.698428 0.151235 0.942643 0.626593 0.131571 0.504951 0.030905 0.319263 0.018731 0.233807 0.629556 0.279673 0.719263 0.03058 0.081586 0.45791 0.477717 0.584921 0.164821 0.001378 0.705576 0.066408 0.754769 0.423887 0.826934 0.29132 0.222552 0.535073 0.92395 0.225594 0.412883 0.211434 0.55289 0.458795 0.946802 0.347274 0.370618 0.452346 0.80362 0.679939 0.274995 0.397347 0.371725 0.386701 0.03846 0.00 [...]
+0.357256 0.251995 0.207768 0.346347 0.450605 0.215921 0.022699 0.894981 0.824262 0.146101 0.564362 0.337907 0.88133 0.994616 0.950887 0.772569 0.008569 0.399159 0.299422 0.745315 0.526346 0.433624 0.747421 0.801985 0.929015 0.335479 0.396093 0.948733 0.733609 0.641437 0.265413 0.65288 0.632133 0.874138 0.615072 0.015492 0.420174 0.163192 0.301035 0.422064 0.966756 0.51339 0.586131 0.13062 0.155429 0.771722 0.864951 0.378424 0.181371 0.650623 0.434244 0.027413 0.615404 0.11754 0.015236 0. [...]
+0.053593 0.667917 0.240363 0.838869 0.939633 0.120441 0.598869 0.830049 0.355471 0.875986 0.911169 0.743002 0.079494 0.96461 0.490399 0.78439 0.702337 0.970446 0.929995 0.413695 0.289562 0.433666 0.555853 0.357219 0.363572 0.142332 0.745415 0.838604 0.984514 0.656764 0.983633 0.639386 0.867695 0.386601 0.295955 0.687686 0.169226 0.450093 0.90506 0.615148 0.294004 0.749001 0.281709 0.529777 0.894867 0.914321 0.826294 0.148622 0.680109 0.060991 0.423315 0.914082 0.170501 0.111522 0.419209  [...]
+0.985442 0.436226 0.174163 0.632223 0.763115 0.512685 0.751129 0.14627 0.58669 0.350569 0.004652 0.519736 0.369799 0.85583 0.867117 0.395668 0.730937 0.356956 0.797611 0.729917 0.12485 0.1236 0.236063 0.576723 0.500233 0.467335 0.337654 0.134382 0.148564 0.693911 0.8014 0.972721 0.373557 0.255091 0.318546 0.365115 0.690505 0.361367 0.376659 0.341954 0.736042 0.568732 0.466667 0.280354 0.867727 0.963378 0.347486 0.375274 0.63429 0.863431 0.093398 0.668389 0.842977 0.613148 0.321249 0.2476 [...]
+0.767854 0.469758 0.752129 0.597399 0.911758 0.091484 0.288718 0.696245 0.21651 0.25393 0.017249 0.983524 0.045059 0.408371 0.512687 0.787579 0.810851 0.661619 0.811623 0.555121 0.720623 0.452036 0.364067 0.576298 0.952591 0.143497 0.353277 0.246408 0.727057 0.037478 0.251738 0.65926 0.420414 0.527895 0.298718 0.77442 0.633102 0.602093 0.58471 0.999258 0.741292 0.189042 0.206651 0.512346 0.177688 0.951607 0.209917 0.962084 0.267315 0.312282 0.40887 0.096824 0.414019 0.533752 0.5658 0.156 [...]
+0.463208 0.800346 0.855369 0.124253 0.249381 0.214067 0.775832 0.427023 0.994268 0.814359 0.858667 0.482618 0.638798 0.169145 0.602422 0.216116 0.931014 0.341975 0.706605 0.892975 0.175227 0.479249 0.948766 0.333286 0.613922 0.814543 0.903542 0.236211 0.952947 0.602834 0.652879 0.618122 0.295859 0.944245 0.830106 0.631333 0.364829 0.203414 0.896593 0.026373 0.088463 0.347034 0.216603 0.473807 0.995801 0.523706 0.363741 0.252689 0.339373 0.909703 0.18215 0.233924 0.426748 0.167602 0.14911 [...]
+0.169172 0.527831 0.927591 0.124577 0.92592 0.883798 0.347366 0.089802 0.348778 0.126874 0.703921 0.890547 0.150337 0.444568 0.38149 0.586821 0.013034 0.689845 0.404034 0.445879 0.059126 0.989342 0.688944 0.507094 0.145522 0.154988 0.140151 0.421048 0.62768 0.404805 0.197949 0.579526 0.085576 0.06235 0.615662 0.684243 0.572678 0.741038 0.984846 0.711258 0.1738 0.87854 0.581898 0.2404 0.720516 0.809308 0.704555 0.587178 0.346755 0.766729 0.47234 0.2372 0.475456 0.35761 0.511139 0.877728 0 [...]
+0.596123 0.198168 0.597183 0.391776 0.16412 0.564349 0.050316 0.037137 0.657979 0.363993 0.285617 0.774681 0.96699 0.344169 0.013482 0.245519 0.999005 0.894443 0.511016 0.105621 0.666143 0.095165 0.753684 0.879014 0.196196 0.455956 0.388811 0.578058 0.558651 0.786653 0.517098 0.804254 0.644569 0.942736 0.859067 0.511747 0.639037 0.866143 0.045806 0.845977 0.963903 0.945002 0.853172 0.392395 0.852408 0.858237 0.185878 0.131137 0.331196 0.371252 0.562064 0.29553 0.303256 0.578287 0.96734 0 [...]
+0.897126 0.25605 0.777102 0.351986 0.10871 0.778816 0.173317 0.074854 0.167458 0.417163 0.626498 0.196912 0.325158 0.244608 0.917874 0.931985 0.618892 0.728838 0.729109 0.960109 0.608416 0.724669 0.265223 0.386425 0.475024 0.239521 0.665893 0.919949 0.815811 0.164213 0.526835 0.62849 0.643661 0.550791 0.374388 0.112766 0.512853 0.552818 0.965509 0.022738 0.628981 0.960773 0.559484 0.814735 0.173246 0.663748 0.585411 0.251602 0.881789 0.284205 0.082172 0.035839 0.437097 0.12624 0.252725 0 [...]
+0.701717 0.617952 0.193175 0.686311 0.24985 0.379473 0.399312 0.594044 0.929719 0.175114 0.075017 0.075891 0.127798 0.745674 0.929354 0.189935 0.020021 0.528006 0.987515 0.468018 0.544023 0.147499 0.880468 0.826908 0.654596 0.623526 0.603757 0.398294 0.283373 0.222874 0.397097 0.134879 0.746396 0.492744 0.118937 0.038356 0.609848 0.799333 0.817953 0.521885 0.790897 0.318661 0.38888 0.096176 0.081502 0.74291 0.091736 0.066336 0.827883 0.797389 0.870921 0.226903 0.087307 0.13584 0.678443 0 [...]
+0.583045 0.646582 0.812994 0.784174 0.49684 0.022888 0.024047 0.880416 0.372025 0.32553 0.252999 0.763413 0.499038 0.480311 0.266278 0.488802 0.279542 0.271002 0.925686 0.970242 0.087818 0.069471 0.241821 0.461147 0.596455 0.781217 0.325206 0.743461 0.630496 0.683556 0.933555 0.414146 0.351258 0.336779 0.848658 0.471393 0.750506 0.755028 0.159269 0.709565 0.133746 0.960959 0.584678 0.068599 0.768473 0.680076 0.067386 0.37791 0.166609 0.255835 0.877414 0.989139 0.177862 0.070276 0.362229  [...]
+0.424015 0.42102 0.772183 0.653838 0.783144 0.311818 0.5492 0.410235 0.912852 0.191004 0.198155 0.04631 0.085453 0.116677 0.829518 0.333149 0.492748 0.239071 0.821493 0.835494 0.623968 0.942357 0.595581 0.121163 0.037881 0.33161 0.756284 0.330872 0.705409 0.361407 0.661616 0.033997 0.969749 0.054337 0.547317 0.265318 0.727656 0.842876 0.234741 0.656052 0.971839 0.32543 0.159364 0.360194 0.846922 0.778326 0.802863 0.709567 0.98867 0.896629 0.626889 0.880147 0.31616 0.769561 0.281661 0.164 [...]
+0.591499 0.778779 0.204812 0.873003 0.982579 0.843962 0.430504 0.16395 0.432741 0.943831 0.47738 0.851931 0.725502 0.988702 0.285627 0.924498 0.567998 0.44189 0.108879 0.888896 0.254346 0.780575 0.204528 0.632757 0.137763 0.896671 0.408662 0.612939 0.243023 0.861321 0.877596 0.0564 0.987223 0.043173 0.717131 0.616261 0.046631 0.675707 0.089466 0.146621 0.818663 0.551706 0.784374 0.470999 0.525957 0.149695 0.074022 0.07536 0.498553 0.507014 0.558298 0.486539 0.379739 0.388392 0.137806 0.3 [...]
+0.249535 0.521373 0.436223 0.609519 0.319955 0.571603 0.347171 0.597312 0.599968 0.601262 0.163934 0.86045 0.67752 0.554682 0.044519 0.491564 0.427056 0.877678 0.72211 0.879787 0.906987 0.252097 0.620765 0.02699 0.144013 0.467275 0.00277 0.383646 0.285695 0.097082 0.740029 0.995507 0.108015 0.063546 0.900867 0.641839 0.611615 0.204255 0.848282 0.163947 0.212612 0.096219 0.19287 0.782175 0.858875 0.954385 0.179193 0.926528 0.359731 0.927469 0.080315 0.80968 0.31055 0.260225 0.534402 0.912 [...]
+0.854783 0.30666 0.993936 0.807476 0.56345 0.513806 0.994766 0.603838 0.286348 0.792321 0.479925 0.891113 0.38326 0.81874 0.702321 0.918619 0.896513 0.637565 0.595119 0.775903 0.933042 0.623415 0.800089 0.593898 0.944647 0.116413 0.588402 0.037918 0.261905 0.091388 0.092567 0.851152 0.991605 0.889122 0.378226 0.878863 0.307388 0.149345 0.58853 0.253506 0.840234 0.221728 0.588551 0.035083 0.830497 0.493147 0.929809 0.917683 0.546128 0.10002 0.795484 0.513285 0.323449 0.370239 0.174869 0.9 [...]
+0.150859 0.287027 0.918066 0.246038 0.249364 0.406232 0.003069 0.507127 0.588097 0.654465 0.590884 0.489383 0.594853 0.762016 0.069027 0.668004 0.531292 0.169378 0.532149 0.938017 0.318032 0.713376 0.611122 0.002976 0.464315 0.797528 0.819196 0.559558 0.619959 0.158257 0.360243 0.631642 0.239212 0.224413 0.592951 0.258366 0.348661 0.643568 0.642713 0.428149 0.003061 0.80713 0.64637 0.058353 0.281522 0.102871 0.450883 0.957942 0.6816 0.575924 0.721756 0.517737 0.986906 0.04413 0.054838 0. [...]
+0.640329 0.216172 0.979821 0.941531 0.41832 0.407387 0.654779 0.604466 0.960616 0.416491 0.082343 0.216522 0.829281 0.785951 0.511787 0.089121 0.066498 0.884023 0.075891 0.277057 0.201463 0.214278 0.68845 0.860141 0.56346 0.212524 0.599601 0.075099 0.230427 0.684443 0.773868 0.630327 0.335255 0.251865 0.590706 0.509029 0.013597 0.414678 0.254246 0.967786 0.006432 0.278988 0.856819 0.101691 0.404226 0.505077 0.634888 0.693323 0.368028 0.537849 0.741692 0.877072 0.205433 0.751437 0.235341  [...]
+0.832708 0.004092 0.005613 0.437447 0.049042 0.046994 0.644898 0.26979 0.826539 0.275861 0.267918 0.59058 0.416758 0.277623 0.235696 0.601212 0.487294 0.101018 0.553527 0.981609 0.47516 0.603959 0.608999 0.331665 0.944465 0.150899 0.791479 0.846707 0.645724 0.310986 0.866945 0.705174 0.995304 0.427135 0.500535 0.525734 0.10582 0.880982 0.501891 0.213588 0.025471 0.782387 0.004799 0.678404 0.741147 0.420552 0.056203 0.278706 0.754975 0.196102 0.098263 0.620278 0.756293 0.537689 0.941908 0 [...]
+0.264508 0.014153 0.147755 0.398736 0.551478 0.820188 0.121889 0.195094 0.36395 0.911506 0.132852 0.644292 0.699087 0.466604 0.748575 0.514487 0.978655 0.252655 0.722548 0.234182 0.584281 0.29583 0.7256 0.025758 0.061928 0.972279 0.407287 0.65956 0.300351 0.871681 0.917284 0.29636 0.777992 0.014195 0.68141 0.064783 0.604836 0.588923 0.317515 0.645854 0.180727 0.503154 0.710692 0.75635 0.115493 0.310359 0.08705 0.968913 0.404119 0.677766 0.563386 0.714593 0.052658 0.960984 0.539871 0.3208 [...]
+0.523757 0.373075 0.51652 0.401481 0.001818 0.006106 0.933919 0.130667 0.076574 0.97153 0.585854 0.113999 0.137158 0.813837 0.811081 0.690737 0.550361 0.094298 0.903208 0.371118 0.254389 0.587287 0.462328 0.57207 0.475893 0.066089 0.615617 0.225044 0.727823 0.348247 0.919797 0.819339 0.317746 0.896069 0.972821 0.373414 0.162105 0.051024 0.522999 0.678029 0.815725 0.174848 0.129773 0.373894 0.047613 0.530757 0.917855 0.985178 0.423463 0.591531 0.029253 0.01164 0.965839 0.59602 0.978112 0. [...]
+0.186458 0.390297 0.674374 0.236258 0.087823 0.165291 0.702708 0.651274 0.655984 0.893411 0.077554 0.187087 0.188179 0.412494 0.759054 0.056659 0.738153 0.763844 0.142413 0.588491 0.430136 0.262719 0.446074 0.785512 0.728413 0.247974 0.536153 0.69995 0.41136 0.92272 0.546239 0.836966 0.190864 0.259763 0.671772 0.95447 0.626195 0.690409 0.449299 0.257363 0.949791 0.861696 0.237745 0.668127 0.266476 0.999013 0.004182 0.76373 0.775765 0.797052 0.30071 0.000263 0.609707 0.132783 0.57477 0.48 [...]
+0.568056 0.544992 0.318464 0.610813 0.272952 0.891987 0.298696 0.770695 0.94229 0.725232 0.798159 0.971443 0.147717 0.446184 0.539715 0.593266 0.475314 0.625619 0.794794 0.906413 0.237245 0.8038 0.343223 0.769548 0.409218 0.770882 0.46748 0.607905 0.127881 0.256142 0.582445 0.096172 0.455242 0.734399 0.302528 0.350088 0.710577 0.041587 0.234491 0.948844 0.151792 0.699723 0.097469 0.271391 0.80524 0.748195 0.126583 0.135256 0.879579 0.980236 0.374265 0.626303 0.347529 0.037414 0.577154 0. [...]
+0.179796 0.824196 0.95134 0.845976 0.187563 0.59648 0.803502 0.845555 0.972914 0.559435 0.719658 0.214292 0.784027 0.956371 0.698737 0.089556 0.967017 0.618128 0.911046 0.165624 0.152684 0.059946 0.047143 0.061722 0.744876 0.543608 0.228292 0.347612 0.085662 0.096387 0.416566 0.678751 0.125278 0.329996 0.172075 0.693931 0.305928 0.579832 0.99497 0.140174 0.767813 0.323419 0.407898 0.767448 0.395847 0.737906 0.197526 0.653317 0.200673 0.581719 0.667661 0.263957 0.376984 0.913425 0.392907  [...]
+0.21633 0.546081 0.095452 0.974955 0.673724 0.869403 0.371736 0.340965 0.821245 0.027144 0.182378 0.491368 0.853494 0.859198 0.786833 0.907608 0.072188 0.120326 0.812083 0.872558 0.400382 0.959626 0.144512 0.039875 0.813061 0.759425 0.120973 0.737334 0.553219 0.470133 0.117767 0.884121 0.358733 0.433841 0.963701 0.014866 0.542576 0.88207 0.625647 0.755188 0.527369 0.221402 0.315928 0.971101 0.127505 0.854176 0.504704 0.19062 0.248171 0.089402 0.192091 0.166515 0.086865 0.473752 0.568339  [...]
+ 135.41733 -5.0114646 -5.0114646 2.8999824 2.8999824 4.0417488 4.0417488 -3.4973277 -3.4973277 4.673054 0.0045018 0.0045018 4.4483207 4.4483207 -3.9744448 -3.9744448 4.4989297 4.4989297 -1.4003453 -1.4003453 -3.8072204 -3.8072204 -0.4716280 -0.4716280 -4.3610482 -4.3610482 4.0932997 4.0932997 2.2959163 2.2959163 0.9528068 0.9528068 -4.3777224 -4.3777224 -3.7439733 -3.7439733 1.3032737 1.3032737 -1.7645325 -1.7645325 -0.7013300 -0.7013300 -1.528704 -1.528704 4.1708685 4.1708685 1.5076731  [...]
\ No newline at end of file
diff --git a/examples/testdata/eigen/nsm4.example b/examples/testdata/eigen/nsm4.example
new file mode 100644
index 0000000..051c935
--- /dev/null
+++ b/examples/testdata/eigen/nsm4.example
@@ -0,0 +1,921 @@
+919
+0.582473 0.279987 0.389754 0.197739 0.376918 0.813123 0.881299 0.187965 0.301015 0.212762 0.514417 0.303011 0.592527 0.375399 0.863367 0.273778 0.463579 0.463142 0.521883 0.905445 0.257884 0.674382 0.326806 0.863865 0.275244 0.308631 0.161993 0.590463 0.935553 0.790866 0.774275 0.40895 0.180891 0.193896 0.568601 0.931427 0.401512 0.898016 0.474437 0.954783 0.192345 0.194747 0.133219 0.010108 0.83757 0.79583 0.428134 0.38428 0.170618 0.802937 0.937814 0.74051 0.798177 0.63753 0.07852 0.81 [...]
+0.637201 0.511044 0.311861 0.816453 0.183037 0.832052 0.51991 0.108964 0.094618 0.926057 0.376905 0.66243 0.83926 0.379789 0.105859 0.604285 0.226771 0.25082 0.314259 0.918865 0.341359 0.812924 0.723728 0.345883 0.303908 0.563648 0.53901 0.803594 0.109703 0.717505 0.106093 0.198315 0.803089 0.737009 0.61253 0.414204 0.862708 0.160011 0.924318 0.868382 0.057008 0.735801 0.369355 0.074025 0.57138 0.406251 0.153414 0.047447 0.877531 0.973486 0.669488 0.439014 0.745456 0.280017 0.19455 0.911 [...]
+0.05316 0.520885 0.293046 0.182679 0.102265 0.227716 0.866405 0.569957 0.077663 0.342869 0.442072 0.490805 0.742674 0.787247 0.270447 0.551709 0.869551 0.636758 0.00381 0.555453 0.035536 0.071769 0.333812 0.455909 0.389076 0.857778 0.189364 0.101403 0.32903 0.424436 0.044868 0.179997 0.533041 0.389798 0.555277 0.004337 0.0554 0.665258 0.410369 0.598537 0.625169 0.549471 0.904277 0.439276 0.197926 0.447979 0.40349 0.645168 0.560294 0.605337 0.275722 0.549735 0.406647 0.798998 0.483095 0.0 [...]
+0.743744 0.810476 0.943806 0.048024 0.729994 0.040918 0.072655 0.918474 0.270362 0.020665 0.207868 0.768932 0.186755 0.668508 0.883242 0.257169 0.441963 0.894748 0.535727 0.21885 0.252121 0.102929 0.292824 0.919863 0.334856 0.227765 0.351654 0.491401 0.957965 0.708338 0.557 0.72828 0.754188 0.539953 0.728344 0.183991 0.328851 0.538396 0.492395 0.183788 0.926947 0.879128 0.595283 0.99609 0.728813 0.822941 0.636929 0.760859 0.24462 0.98614 0.845432 0.888074 0.83466 0.315497 0.954198 0.7300 [...]
+0.181449 0.40967 0.708054 0.987424 0.707372 0.56501 0.088395 0.103633 0.588444 0.311209 0.852345 0.446636 0.491885 0.986551 0.042249 0.614455 0.132086 0.333584 0.853962 0.842181 0.582036 0.143363 0.119677 0.347158 0.025179 0.532587 0.219933 0.321847 0.424973 0.945384 0.941054 0.703471 0.65091 0.395211 0.880232 0.843353 0.173488 0.89099 0.236028 0.826642 0.821293 0.357619 0.350442 0.822194 0.924311 0.180367 0.815231 0.726218 0.663032 0.11347 0.363676 0.820576 0.048446 0.848979 0.23955 0.0 [...]
+0.348167 0.565164 0.102368 0.745636 0.510129 0.546044 0.813252 0.842433 0.174811 0.892059 0.648513 0.151794 0.496404 0.717396 0.859773 0.018175 0.333313 0.317446 0.106252 0.655482 0.333688 0.152603 0.239902 0.076824 0.344064 0.45999 0.832716 0.564524 0.703148 0.37294 0.230055 0.83711 0.255879 0.459512 0.397951 0.643511 0.085167 0.766221 0.715996 0.6219 0.22721 0.794474 0.271116 0.786892 0.925791 0.188521 0.748976 0.843618 0.792099 0.471089 0.948528 0.025548 0.504643 0.487004 0.476834 0.5 [...]
+0.429401 0.452659 0.154138 0.800818 0.300696 0.245483 0.309383 0.101929 0.021632 0.112567 0.882861 0.450372 0.595244 0.557376 0.474117 0.246851 0.637802 0.419188 0.257496 0.361794 0.829368 0.638591 0.937004 0.173859 0.811338 0.239439 0.902478 0.45243 0.490752 0.453857 0.061106 0.651014 0.80925 0.777204 0.208548 0.075766 0.661537 0.468465 0.611333 0.561056 0.089571 0.594897 0.34356 0.665574 0.45237 0.360819 0.875203 0.595564 0.259066 0.272957 0.209378 0.056638 0.681093 0.617905 0.138245 0 [...]
+0.12564 0.37471 0.424854 0.708077 0.690023 0.365531 0.34158 0.775594 0.696909 0.680492 0.524621 0.941561 0.278333 0.839112 0.10299 0.175053 0.886121 0.300639 0.346899 0.69495 0.385588 0.692534 0.360705 0.564433 0.444214 0.112585 0.690166 0.178574 0.766939 0.92878 0.516485 0.828048 0.041288 0.94554 0.604313 0.981676 0.638568 0.124565 0.866313 0.765606 0.10537 0.810319 0.115362 0.210853 0.196735 0.868247 0.861562 0.069447 0.100539 0.526808 0.10397 0.242031 0.224594 0.878392 0.821874 0.5961 [...]
+0.510819 0.567473 0.189485 0.617801 0.939298 0.241531 0.483766 0.319737 0.262294 0.804523 0.373338 0.304777 0.582714 0.874763 0.02171 0.02513 0.565376 0.043568 0.409195 0.675622 0.779498 0.551111 0.200869 0.230274 0.723172 0.762944 0.317822 0.629799 0.641901 0.731012 0.169599 0.75043 0.209312 0.540351 0.601769 0.907132 0.481103 0.300583 0.838605 0.770329 0.014353 0.211144 0.399828 0.616779 0.322407 0.942459 0.722165 0.900892 0.538992 0.138832 0.104083 0.741845 0.39781 0.113001 0.159727 0 [...]
+0.386206 0.182071 0.18251 0.86637 0.704452 0.971595 0.384566 0.766661 0.747137 0.503399 0.691374 0.421202 0.699926 0.237778 0.579225 0.37468 0.657972 0.139347 0.56767 0.487557 0.205055 0.590219 0.321255 0.187058 0.682162 0.156991 0.048172 0.744395 0.686038 0.457767 0.012569 0.295604 0.882801 0.757056 0.627063 0.127954 0.732345 0.253063 0.712682 0.269707 0.734739 0.876586 0.277897 0.300097 0.16508 0.450343 0.996462 0.89115 0.06068 0.091319 0.541422 0.391416 0.110479 0.954325 0.173956 0.84 [...]
+0.603132 0.734518 0.730917 0.247553 0.073191 0.406383 0.234757 0.736216 0.318269 0.903091 0.698696 0.402941 0.969751 0.60766 0.830196 0.950519 0.562435 0.228694 0.663377 0.953293 0.83859 0.247538 0.741831 0.974135 0.392282 0.222898 0.652578 0.182264 0.063298 0.484152 0.749012 0.333739 0.693549 0.989865 0.547218 0.867642 0.938552 0.681656 0.87573 0.871624 0.992226 0.743025 0.236786 0.335568 0.891947 0.870042 0.480806 0.909246 0.381044 0.728635 0.895841 0.315575 0.470935 0.950925 0.812614  [...]
+0.544311 0.652801 0.584636 0.384426 0.085127 0.840745 0.096373 0.617174 0.069722 0.943739 0.167843 0.458113 0.78438 0.028328 0.922717 0.835835 0.494838 0.673932 0.654435 0.60087 0.8376 0.473876 0.894003 0.029785 0.393758 0.660377 0.692266 0.047044 0.040614 0.754393 0.761876 0.171412 0.022462 0.156408 0.968456 0.44036 0.236442 0.728051 0.07165 0.891743 0.886306 0.051003 0.900896 0.879807 0.214026 0.009916 0.221877 0.79519 0.110909 0.807697 0.947557 0.991625 0.247275 0.350711 0.160497 0.58 [...]
+0.572258 0.329709 0.841746 0.119582 0.85397 0.892925 0.364232 0.870892 0.220858 0.563258 0.379748 0.373241 0.039022 0.807801 0.416969 0.025137 0.207415 0.825595 0.874134 0.246927 0.337817 0.071859 0.401382 0.1455 0.658571 0.157332 0.422521 0.839911 0.098723 0.44785 0.68316 0.377775 0.402524 0.655129 0.692013 0.801595 0.497183 0.083424 0.436081 0.732412 0.956798 0.159587 0.29158 0.86369 0.638674 0.395068 0.610724 0.752416 0.360782 0.550782 0.011028 0.475813 0.058964 0.256129 0.218783 0.20 [...]
+0.174324 0.099468 0.880152 0.079863 0.627692 0.5485 0.571095 0.418993 0.436569 0.899611 0.422187 0.922744 0.39255 0.534135 0.778061 0.015381 0.469173 0.068491 0.689254 0.632752 0.155911 0.555936 0.121841 0.664274 0.879007 0.480557 0.251667 0.060968 0.241161 0.157849 0.697658 0.619047 0.610634 0.45662 0.474593 0.206203 0.37597 0.182956 0.077455 0.592146 0.362877 0.841786 0.573496 0.85871 0.470501 0.468864 0.834662 0.943672 0.824571 0.40253 0.198661 0.224552 0.119526 0.631226 0.023667 0.53 [...]
+0.598103 0.655944 0.304511 0.582938 0.132213 0.241726 0.806641 0.158292 0.336212 0.586202 0.0692 0.415707 0.135374 0.32393 0.393697 0.256427 0.420567 0.236859 0.609877 0.119616 0.093931 0.232585 0.493156 0.425432 0.882716 0.175206 0.895308 0.761131 0.982707 0.740451 0.092839 0.084464 0.387365 0.029506 0.318368 0.525081 0.801459 0.348279 0.397141 0.229342 0.12744 0.761726 0.687906 0.850111 0.807347 0.940408 0.757134 0.122973 0.268732 0.186492 0.590498 0.201868 0.082729 0.089719 0.01925 0. [...]
+0.291908 0.417175 0.600715 0.502826 0.578602 0.021172 0.033382 0.856529 0.756563 0.986854 0.241014 0.464926 0.34515 0.897271 0.538538 0.244211 0.183699 0.613657 0.053933 0.600122 0.073876 0.000462 0.973055 0.26368 0.409635 0.15468 0.818038 0.442221 0.887202 0.35893 0.400363 0.061521 0.456151 0.60584 0.227136 0.394189 0.671578 0.108371 0.415286 0.060088 0.38276 0.736782 0.323676 0.316789 0.688572 0.970708 0.623838 0.504593 0.072635 0.072765 0.460208 0.5784 0.920009 0.283249 0.934983 0.024 [...]
+0.400706 0.398339 0.938971 0.241964 0.278732 0.988354 0.021388 0.479642 0.848523 0.740251 0.321412 0.917576 0.175239 0.50433 0.525229 0.426912 0.487692 0.223387 0.455037 0.02389 0.08032 0.500476 0.894779 0.845019 0.602925 0.905848 0.140861 0.544047 0.530745 0.848174 0.654725 0.361794 0.648346 0.827702 0.891646 0.877588 0.772134 0.058379 0.188758 0.751146 0.030905 0.701943 0.996233 0.110281 0.289082 0.23806 0.421621 0.874763 0.941735 0.680618 0.640298 0.339684 0.535209 0.398262 0.746654 0 [...]
+0.146305 0.880249 0.57585 0.608252 0.068116 0.065575 0.992283 0.44417 0.9791 0.808013 0.420502 0.074131 0.848645 0.631858 0.390751 0.042872 0.362264 0.578241 0.726148 0.403222 0.105954 0.710933 0.681763 0.920276 0.908509 0.054902 0.492973 0.99805 0.877847 0.635885 0.659435 0.054296 0.647905 0.830812 0.076338 0.607111 0.928025 0.593702 0.87327 0.44328 0.237662 0.672536 0.769203 0.048656 0.716074 0.834214 0.319576 0.051291 0.816193 0.472634 0.058618 0.49846 0.837406 0.685712 0.397491 0.128 [...]
+0.823378 0.921833 0.225499 0.133176 0.73902 0.981314 0.692303 0.509291 0.730701 0.085901 0.670417 0.738916 0.973463 0.980205 0.578581 0.884973 0.699802 0.46668 0.762464 0.064172 0.83322 0.134373 0.996415 0.967218 0.375962 0.846857 0.511886 0.281463 0.926912 0.350817 0.625706 0.796943 0.003833 0.506427 0.402175 0.130718 0.236222 0.173872 0.095798 0.91629 0.352793 0.489098 0.167119 0.606947 0.311652 0.935109 0.82733 0.976895 0.032346 0.499764 0.17953 0.487334 0.426253 0.831371 0.502977 0.7 [...]
+0.207088 0.354742 0.613022 0.310966 0.762021 0.763526 0.225582 0.848917 0.071293 0.682847 0.365107 0.336551 0.089432 0.953233 0.635256 0.712233 0.712539 0.633234 0.962892 0.527707 0.981202 0.243294 0.781984 0.353456 0.268162 0.330023 0.316076 0.154426 0.693485 0.082179 0.200154 0.613365 0.091344 0.743459 0.25418 0.267908 0.763625 0.300878 0.49022 0.222759 0.701556 0.910484 0.711251 0.932967 0.928391 0.095418 0.83136 0.321773 0.173586 0.922079 0.665906 0.529406 0.761809 0.804352 0.026329  [...]
+0.304466 0.874528 0.815134 0.483309 0.829812 0.808596 0.322743 0.79473 0.117389 0.522522 0.416547 0.038168 0.351565 0.123542 0.504286 0.543629 0.183375 0.637136 0.869498 0.123734 0.489496 0.298488 0.31834 0.452616 0.337228 0.920463 0.004404 0.54901 0.397404 0.820709 0.790787 0.628022 0.471807 0.122209 0.735846 0.147017 0.763065 0.69773 0.827849 0.072489 0.444908 0.974556 0.666427 0.341388 0.76142 0.860981 0.360578 0.463069 0.455863 0.369001 0.692641 0.551132 0.763899 0.451915 0.050646 0. [...]
+0.448149 0.946005 0.498274 0.924973 0.253952 0.414089 0.100911 0.18753 0.313243 0.774703 0.526786 0.450678 0.955075 0.839045 0.026903 0.098261 0.02784 0.148832 0.991156 0.925753 0.956139 0.878942 0.109365 0.938325 0.297354 0.972156 0.053607 0.415976 0.993949 0.104887 0.867506 0.789312 0.008376 0.750921 0.105976 0.524702 0.114614 0.778407 0.816907 0.051032 0.96982 0.347962 0.837426 0.769625 0.153087 0.076318 0.16142 0.047789 0.441477 0.293924 0.650508 0.90144 0.707474 0.649552 0.175891 0. [...]
+0.555773 0.989962 0.226129 0.89669 0.872654 0.458195 0.314364 0.79225 0.569139 0.846227 0.551928 0.851887 0.356597 0.837289 0.787892 0.22453 0.98191 0.173541 0.763417 0.214391 0.830774 0.11358 0.677064 0.724657 0.783244 0.611272 0.021194 0.989272 0.796373 0.305311 0.652667 0.940108 0.909918 0.289374 0.320816 0.900796 0.910438 0.023541 0.965769 0.093075 0.111714 0.140627 0.408973 0.392104 0.71495 0.205121 0.057051 0.012796 0.9495 0.809144 0.648253 0.007571 0.509007 0.2169 0.454656 0.65570 [...]
+0.24367 0.978935 0.748805 0.889992 0.28586 0.084759 0.814811 0.825697 0.495503 0.375786 0.893392 0.022702 0.866464 0.597904 0.755356 0.749904 0.637299 0.35113 0.897762 0.33363 0.036156 0.377076 0.338108 0.972633 0.569894 0.446128 0.67339 0.486248 0.52111 0.477588 0.822607 0.389956 0.615611 0.738189 0.331631 0.530778 0.248845 0.318241 0.063798 0.232994 0.69699 0.613524 0.749774 0.168211 0.80466 0.586998 0.649559 0.456774 0.784094 0.432657 0.103043 0.162405 0.171561 0.488004 0.966718 0.346 [...]
+0.8001 0.419822 0.644527 0.657599 0.890832 0.533095 0.522643 0.45557 0.990846 0.38513 0.682382 0.735122 0.554912 0.861821 0.05716 0.564122 0.407051 0.624672 0.907345 0.372894 0.205295 0.54223 0.796255 0.123286 0.672227 0.795643 0.116182 0.711673 0.106296 0.081705 0.762555 0.702246 0.093619 0.490525 0.250361 0.487455 0.836545 0.010019 0.408205 0.446784 0.824167 0.04098 0.147372 0.382913 0.115962 0.059751 0.005956 0.187587 0.65373 0.271074 0.360327 0.395995 0.891293 0.447382 0.972358 0.158 [...]
+0.408017 0.785215 0.817069 0.253179 0.981484 0.012239 0.06833 0.836061 0.525058 0.725008 0.239696 0.695225 0.930184 0.518909 0.651249 0.41337 0.628633 0.391839 0.51146 0.585441 0.993777 0.829453 0.786092 0.778746 0.65849 0.739129 0.219872 0.187301 0.834055 0.361249 0.076481 0.943944 0.604351 0.009887 0.621202 0.131891 0.454527 0.119946 0.00429 0.87214 0.376925 0.49441 0.679275 0.677735 0.585599 0.964085 0.824743 0.610517 0.877808 0.925827 0.189717 0.388524 0.536444 0.526939 0.947969 0.85 [...]
+0.464881 0.841231 0.523726 0.47395 0.876967 0.907419 0.965403 0.39967 0.177848 0.738521 0.89677 0.907001 0.156735 0.848527 0.484701 0.959136 0.317768 0.711879 0.820522 0.77512 0.415214 0.339341 0.437808 0.784112 0.882567 0.662356 0.817959 0.783779 0.559734 0.681943 0.64543 0.345316 0.495644 0.968001 0.620449 0.793415 0.838883 0.162336 0.646638 0.915347 0.836223 0.055796 0.963505 0.192011 0.029496 0.687492 0.497784 0.641754 0.340229 0.904488 0.548585 0.506868 0.035102 0.119195 0.272814 0. [...]
+0.737244 0.789915 0.112321 0.862977 0.71378 0.32592 0.052523 0.108986 0.766681 0.633287 0.106603 0.504294 0.571536 0.716436 0.474416 0.792189 0.947534 0.655774 0.886328 0.593761 0.148604 0.209357 0.941282 0.314228 0.336112 0.869064 0.471197 0.297532 0.105676 0.218907 0.227397 0.657844 0.420728 0.290458 0.593599 0.710945 0.459415 0.712555 0.156261 0.836654 0.681701 0.317925 0.513746 0.272674 0.236495 0.890329 0.313348 0.148933 0.61039 0.278533 0.568934 0.232681 0.633823 0.280813 0.406172  [...]
+0.720062 0.173606 0.303178 0.48588 0.82303 0.10828 0.195264 0.684206 0.624749 0.938771 0.622377 0.074348 0.165582 0.911978 0.733429 0.022668 0.643566 0.578245 0.029964 0.54157 0.66503 0.970866 0.617934 0.184564 0.2747 0.229288 0.394886 0.837097 0.819678 0.902039 0.533494 0.650573 0.726398 0.443297 0.797863 0.436138 0.37627 0.858238 0.365495 0.961104 0.546781 0.167446 0.956861 0.791065 0.881508 0.89964 0.959144 0.113233 0.386215 0.194142 0.705736 0.10288 0.799484 0.238264 0.438594 0.69451 [...]
+0.154524 0.643368 0.357968 0.478598 0.770752 0.245416 0.246639 0.865576 0.279412 0.101096 0.794548 0.43555 0.909379 0.839767 0.047616 0.668903 0.500031 0.490285 0.521141 0.470845 0.797867 0.859613 0.045236 0.191718 0.847399 0.217553 0.068769 0.782249 0.847478 0.364961 0.614594 0.110909 0.696552 0.546976 0.921563 0.314926 0.186197 0.002525 0.573507 0.7606 0.335246 0.390298 0.032494 0.431743 0.353351 0.466723 0.206731 0.650114 0.109547 0.079724 0.086214 0.916046 0.233197 0.998108 0.760691  [...]
+0.325195 0.425012 0.534902 0.910084 0.788227 0.453399 0.789613 0.900837 0.353543 0.857813 0.424996 0.448532 0.06175 0.268366 0.33375 0.500908 0.820838 0.004068 0.890322 0.973295 0.97929 0.551435 0.204773 0.083294 0.309559 0.002644 0.809798 0.661724 0.803073 0.72899 0.758251 0.25057 0.893064 0.419373 0.398785 0.291712 0.04157 0.548113 0.888854 0.532907 0.314466 0.728885 0.320819 0.999349 0.965827 0.433847 0.783788 0.192032 0.447263 0.989723 0.768116 0.538007 0.366263 0.885496 0.469042 0.3 [...]
+0.792827 0.943372 0.135593 0.988051 0.746208 0.696619 0.332616 0.401814 0.938167 0.705085 0.220018 0.653189 0.1094 0.020194 0.970428 0.779661 0.474107 0.8875 0.475143 0.080514 0.286343 0.885477 0.785267 0.609958 0.466856 0.147819 0.32507 0.580074 0.314839 0.870226 0.020242 0.5009 0.208091 0.85823 0.998552 0.593344 0.280532 0.644999 0.111168 0.146695 0.728783 0.768279 0.035935 0.16505 0.090336 0.88683 0.6345 0.75994 0.017425 0.304894 0.435768 0.241573 0.502908 0.819916 0.227917 0.204491 0 [...]
+0.199448 0.815031 0.011192 0.432335 0.766808 0.194277 0.298744 0.49966 0.387798 0.034298 0.077537 0.203997 0.64889 0.371452 0.059182 0.90743 0.194256 0.294631 0.733809 0.778485 0.58397 0.906503 0.634692 0.613399 0.582839 0.555641 0.931038 0.103924 0.977882 0.94181 0.830527 0.442255 0.625563 0.41008 0.449428 0.682404 0.660338 0.568651 0.274101 0.487106 0.495119 0.378414 0.365551 0.804097 0.967749 0.739141 0.730823 0.224517 0.651412 0.293122 0.563866 0.501675 0.125892 0.984172 0.718668 0.6 [...]
+0.749857 0.141345 0.367096 0.845754 0.997245 0.623271 0.885582 0.645908 0.252475 0.790687 0.702288 0.44916 0.095733 0.862028 0.851591 0.963864 0.92766 0.299932 0.287672 0.331732 0.62 0.999721 0.962247 0.360412 0.708235 0.518839 0.040283 0.931983 0.959404 0.071301 0.678599 0.593987 0.387752 0.625359 0.577555 0.589863 0.075236 0.283856 0.192612 0.163811 0.682136 0.379766 0.682802 0.015224 0.970304 0.193369 0.478401 0.625403 0.010314 0.592398 0.902651 0.695046 0.598281 0.203135 0.911304 0.6 [...]
+0.424061 0.026365 0.600914 0.478821 0.403323 0.046145 0.81026 0.816479 0.035523 0.572769 0.28945 0.898376 0.704951 0.489287 0.31474 0.088946 0.194712 0.436033 0.688087 0.415781 0.672742 0.399904 0.893477 0.180037 0.720908 0.73701 0.812114 0.92489 0.90664 0.667171 0.796844 0.227487 0.817285 0.979891 0.127778 0.54417 0.335188 0.432737 0.369621 0.825262 0.511083 0.565694 0.355377 0.177249 0.908188 0.225886 0.015122 0.957492 0.996677 0.026835 0.272728 0.239822 0.488864 0.964796 0.930636 0.82 [...]
+0.190931 0.418984 0.638739 0.392463 0.256727 0.126393 0.165107 0.16043 0.686169 0.65217 0.246716 0.023549 0.348819 0.489975 0.651997 0.224267 0.575012 0.717435 0.904619 0.164655 0.711678 0.284727 0.044452 0.593655 0.589415 0.659396 0.848484 0.641124 0.697308 0.938072 0.318936 0.591979 0.27331 0.837281 0.569783 0.151584 0.304519 0.509604 0.185839 0.188841 0.55885 0.285534 0.43987 0.664663 0.712683 0.151457 0.086918 0.253812 0.741628 0.793984 0.160974 0.604016 0.115595 0.87568 0.226546 0.9 [...]
+0.575748 0.242621 0.655037 0.766048 0.954012 0.114163 0.802169 0.887399 0.41122 0.120418 0.959758 0.593015 0.368197 0.484172 0.690322 0.095295 0.196382 0.544581 0.203385 0.647533 0.235262 0.075969 0.634422 0.568083 0.457781 0.134443 0.05913 0.938163 0.209092 0.207685 0.248146 0.777568 0.320735 0.087111 0.543021 0.53243 0.685641 0.694426 0.902458 0.236648 0.730345 0.835309 0.411789 0.811963 0.430335 0.059967 0.108927 0.739125 0.989498 0.880825 0.319381 0.393294 0.032845 0.951374 0.410304  [...]
+0.83493 0.311066 0.425324 0.842476 0.708421 0.027834 0.878948 0.233703 0.9981 0.975662 0.318109 0.711118 0.711578 0.83921 0.149343 0.179189 0.908747 0.576068 0.533773 0.507481 0.351029 0.771012 0.986729 0.607603 0.635161 0.885848 0.609183 0.848144 0.287486 0.557915 0.187366 0.071871 0.887794 0.061446 0.272867 0.197447 0.35675 0.31171 0.228196 0.539011 0.61653 0.559536 0.053651 0.155944 0.694058 0.300688 0.148897 0.967609 0.839722 0.449905 0.3332 0.961355 0.968924 0.309028 0.503714 0.1663 [...]
+0.162733 0.204557 0.520727 0.149982 0.756261 0.056546 0.491984 0.739731 0.188034 0.212034 0.492763 0.155171 0.646989 0.793974 0.216008 0.866431 0.993188 0.810078 0.680864 0.452824 0.289325 0.875819 0.986108 0.317277 0.130085 0.127862 0.425788 0.697857 0.349068 0.994756 0.621379 0.799189 0.941472 0.719883 0.333005 0.120523 0.570867 0.816903 0.831763 0.551734 0.128428 0.320135 0.652623 0.742124 0.862753 0.073819 0.816435 0.251056 0.273261 0.772666 0.135823 0.560418 0.449133 0.987086 0.3444 [...]
+0.250896 0.445553 0.387328 0.134763 0.312609 0.020321 0.311652 0.537458 0.480839 0.28749 0.077738 0.250967 0.400708 0.960045 0.822704 0.32511 0.605701 0.87562 0.680801 0.660598 0.802796 0.916831 0.37971 0.803385 0.520724 0.12829 0.191758 0.416699 0.231161 0.750217 0.663961 0.016924 0.706174 0.826478 0.263216 0.509999 0.735809 0.516752 0.775227 0.961254 0.702284 0.770441 0.697847 0.650134 0.901557 0.634092 0.37485 0.388768 0.921684 0.821883 0.124517 0.395006 0.43145 0.478643 0.983334 0.35 [...]
+0.051296 0.668445 0.331695 0.047438 0.499741 0.916391 0.629476 0.820878 0.808589 0.689724 0.25355 0.89291 0.290781 0.189597 0.398853 0.330716 0.217182 0.436656 0.317677 0.948786 0.456903 0.244442 0.002602 0.812057 0.695648 0.033778 0.504684 0.448482 0.543119 0.749752 0.410792 0.11074 0.324735 0.378659 0.01444 0.657583 0.550303 0.363045 0.593304 0.491023 0.809103 0.836465 0.073328 0.892838 0.289088 0.817379 0.826512 0.881094 0.047997 0.933098 0.601176 0.266306 0.172345 0.984515 0.061855 0 [...]
+0.83811 0.794149 0.696565 0.75606 0.821296 0.197259 0.738074 0.745478 0.52591 0.124652 0.642604 0.70745 0.539166 0.938283 0.250852 0.1111 0.328587 0.546848 0.497688 0.049792 0.983849 0.539472 0.583043 0.135906 0.852793 0.709068 0.121845 0.454916 0.850621 0.35178 0.220929 0.282438 0.629813 0.004536 0.916751 0.713243 0.322349 0.706953 0.513599 0.270085 0.774895 0.124987 0.625052 0.819559 0.099923 0.909519 0.422215 0.892134 0.138741 0.432541 0.106025 0.369412 0.024408 0.370699 0.968295 0.54 [...]
+0.209377 0.575945 0.362334 0.562252 0.143284 0.966654 0.323248 0.127039 0.714449 0.799391 0.740234 0.878127 0.111519 0.999303 0.132059 0.896015 0.395295 0.924776 0.10131 0.545994 0.264592 0.139192 0.806718 0.422414 0.998704 0.465136 0.717363 0.782359 0.151289 0.601424 0.238674 0.443597 0.529519 0.003219 0.943271 0.878922 0.005234 0.876035 0.646159 0.710949 0.096444 0.486827 0.971343 0.233503 0.022972 0.37645 0.301444 0.99202 0.871718 0.880843 0.152054 0.347385 0.192217 0.315754 0.916984  [...]
+0.268297 0.665606 0.76867 0.449693 0.027135 0.921639 0.899706 0.522128 0.938847 0.174381 0.245048 0.203801 0.433155 0.62303 0.584108 0.929269 0.34677 0.285762 0.032518 0.255238 0.174281 0.81241 0.902463 0.697846 0.028246 0.209741 0.818332 0.693044 0.142843 0.155858 0.9946 0.904743 0.902331 0.486992 0.637261 0.539154 0.130217 0.575571 0.160924 0.296023 0.385509 0.258006 0.896938 0.446215 0.782687 0.290273 0.626507 0.329566 0.284035 0.670645 0.238656 0.193054 0.19731 0.322192 0.709734 0.59 [...]
+0.347772 0.772754 0.460164 0.421196 0.900409 0.151629 0.16084 0.274815 0.345191 0.952167 0.273346 0.491513 0.36306 0.459321 0.049241 0.422741 0.788593 0.487728 0.797274 0.211616 0.38899 0.710666 0.174031 0.4034 0.692371 0.338513 0.694776 0.438941 0.020885 0.551182 0.058585 0.432601 0.002242 0.724792 0.312063 0.414635 0.217133 0.305546 0.1887 0.223054 0.104196 0.176745 0.170281 0.307636 0.623691 0.300227 0.891954 0.908186 0.995312 0.171958 0.733909 0.935292 0.678086 0.705338 0.876082 0.89 [...]
+0.22042 0.954033 0.999036 0.034698 0.084938 0.0925 0.827874 0.735804 0.382258 0.81671 0.833292 0.073072 0.715269 0.04799 0.172015 0.050399 0.647545 0.429667 0.064462 0.555364 0.7066 0.3055 0.828232 0.301092 0.372017 0.411939 0.352902 0.306694 0.923619 0.10057 0.102096 0.694971 0.732209 0.8511 0.138652 0.877993 0.328153 0.842025 0.126189 0.339113 0.762132 0.511771 0.500773 0.09529 0.256891 0.533994 0.097623 0.875984 0.805133 0.1526 0.512924 0.175017 0.771149 0.958887 0.826294 0.099662 0.5 [...]
+0.216982 0.215674 0.697271 0.935084 0.842899 0.990872 0.708799 0.469442 0.712883 0.79567 0.613094 0.01726 0.150497 0.112069 0.419334 0.545859 0.198459 0.382498 0.453952 0.716215 0.673277 0.850822 0.906594 0.172758 0.870499 0.515409 0.215869 0.658502 0.185989 0.534081 0.704932 0.873027 0.350745 0.840625 0.883751 0.569359 0.154407 0.452436 0.60276 0.794979 0.394945 0.213661 0.824366 0.882699 0.697775 0.583075 0.924842 0.335448 0.165301 0.142589 0.394649 0.327152 0.111429 0.893263 0.213439  [...]
+0.386723 0.488972 0.011285 0.774658 0.411927 0.26717 0.695691 0.530449 0.983583 0.22835 0.652262 0.609846 0.650044 0.113822 0.89961 0.648459 0.051454 0.788211 0.068527 0.162844 0.222735 0.505791 0.432743 0.189838 0.318805 0.256126 0.507644 0.423254 0.161697 0.406971 0.045127 0.941564 0.066736 0.243451 0.045585 0.043225 0.48183 0.739593 0.86484 0.542389 0.089844 0.786361 0.78271 0.485191 0.024876 0.441849 0.439922 0.896564 0.534873 0.018229 0.126872 0.603043 0.660045 0.768675 0.651067 0.4 [...]
+0.890503 0.212901 0.364213 0.557693 0.972001 0.03892 0.643505 0.043706 0.433152 0.660164 0.948118 0.664875 0.72433 0.823177 0.248804 0.424792 0.757747 0.702273 0.338762 0.482468 0.367213 0.210241 0.934426 0.791071 0.23272 0.591426 0.870957 0.369075 0.813818 0.269942 0.46125 0.893072 0.901188 0.331776 0.690124 0.075608 0.820841 0.059778 0.44189 0.896797 0.798542 0.59412 0.9124 0.666317 0.103825 0.050333 0.98743 0.872207 0.646541 0.321333 0.343234 0.385447 0.920677 0.137058 0.120822 0.4796 [...]
+0.748591 0.99137 0.477245 0.642015 0.879181 0.704964 0.338892 0.038062 0.208857 0.758205 0.391035 0.519206 0.682433 0.945649 0.014665 0.168569 0.744314 0.0595 0.787138 0.326245 0.88013 0.652134 0.461609 0.270191 0.124281 0.79703 0.306015 0.368106 0.22147 0.301708 0.517915 0.398973 0.717987 0.868835 0.636058 0.375576 0.909601 0.216165 0.446986 0.339073 0.097796 0.224015 0.774976 0.96028 0.738442 0.858092 0.600741 0.271975 0.45439 0.491449 0.469138 0.716317 0.990209 0.910954 0.270238 0.329 [...]
+0.373858 0.318878 0.860913 0.08842 0.244101 0.277649 0.579525 0.414085 0.99022 0.095336 0.809036 0.648124 0.291108 0.272123 0.026828 0.540052 0.481741 0.132925 0.897496 0.497838 0.533374 0.342934 0.584691 0.522682 0.342708 0.538386 0.892707 0.729885 0.35504 0.712075 0.542162 0.192412 0.907257 0.88114 0.784839 0.913463 0.105158 0.607678 0.771026 0.314031 0.755519 0.425353 0.875566 0.450413 0.429661 0.533408 0.913893 0.649597 0.93905 0.286393 0.595117 0.07255 0.73705 0.18421 0.039692 0.773 [...]
+0.757412 0.308314 0.233688 0.007606 0.071407 0.395199 0.342464 0.621141 0.493405 0.028742 0.963753 0.037461 0.452788 0.921117 0.370146 0.785754 0.230857 0.671941 0.48757 0.102244 0.277862 0.521356 0.467537 0.425679 0.418758 0.104188 0.58944 0.607094 0.211858 0.224285 0.932267 0.935905 0.993985 0.180647 0.222287 0.275786 0.467413 0.722662 0.75923 0.367784 0.822239 0.372823 0.205201 0.585791 0.444048 0.805182 0.710099 0.546941 0.760192 0.076936 0.966906 0.726824 0.070791 0.315826 0.822414  [...]
+0.728725 0.013666 0.856268 0.822041 0.854587 0.485635 0.543918 0.862039 0.040761 0.880948 0.361082 0.048292 0.653136 0.110321 0.506732 0.447015 0.947452 0.705334 0.865835 0.682335 0.876722 0.304861 0.618708 0.771372 0.550518 0.945013 0.597682 0.949165 0.233166 0.275436 0.173295 0.90129 0.380777 0.337016 0.569506 0.114397 0.58144 0.68895 0.412207 0.81261 0.469351 0.620641 0.728802 0.089273 0.381092 0.844085 0.000136 0.283879 0.437766 0.347305 0.26368 0.697066 0.454864 0.687452 0.539619 0. [...]
+0.806856 0.965048 0.316999 0.769753 0.779118 0.697068 0.580539 0.857268 0.881471 0.818419 0.290232 0.415741 0.71718 0.923199 0.728657 0.597206 0.803902 0.538959 0.856331 0.402562 0.151277 0.510734 0.685388 0.97844 0.835307 0.725976 0.125351 0.615933 0.380187 0.36615 0.793436 0.822883 0.453626 0.111196 0.299057 0.241464 0.514496 0.855626 0.731405 0.765179 0.878793 0.077216 0.072417 0.567858 0.033762 0.712211 0.570077 0.021531 0.678136 0.870165 0.496839 0.200996 0.387846 0.247945 0.173179  [...]
+0.376023 0.017453 0.999153 0.967035 0.2443 0.632137 0.421018 0.056897 0.768563 0.536589 0.14442 0.472981 0.150384 0.497221 0.910343 0.161062 0.499828 0.348842 0.383226 0.996016 0.493288 0.748643 0.22543 0.403175 0.105568 0.961822 0.12676 0.476186 0.727411 0.023299 0.954347 0.923523 0.206194 0.173287 0.484355 0.965053 0.988577 0.455054 0.505706 0.711831 0.071865 0.158017 0.880297 0.042512 0.204667 0.496785 0.936768 0.046601 0.949248 0.548902 0.57026 0.65067 0.402747 0.946216 0.449087 0.89 [...]
+0.5067 0.854723 0.149099 0.782219 0.041711 0.659502 0.353997 0.718257 0.770026 0.668924 0.114955 0.160151 0.061057 0.299544 0.597363 0.114148 0.279495 0.203357 0.955843 0.437368 0.306224 0.942813 0.319289 0.581282 0.251105 0.137939 0.035775 0.806188 0.539621 0.113757 0.979191 0.16858 0.116553 0.558317 0.164806 0.600586 0.357379 0.85113 0.156844 0.478205 0.108781 0.231702 0.781982 0.207478 0.898682 0.255902 0.107225 0.130348 0.158278 0.72552 0.352818 0.141221 0.476699 0.338054 0.615441 0. [...]
+0.647581 0.277664 0.649666 0.872046 0.641778 0.709249 0.633422 0.849425 0.078712 0.797419 0.153644 0.263388 0.350491 0.854435 0.173591 0.953839 0.457501 0.673175 0.532666 0.752506 0.006496 0.265434 0.145303 0.305064 0.890405 0.660954 0.726664 0.9006 0.37263 0.58838 0.331845 0.048207 0.085074 0.05014 0.656586 0.88225 0.941837 0.928441 0.985824 0.483516 0.19756 0.503056 0.169706 0.511039 0.528203 0.373876 0.13973 0.48924 0.843724 0.893509 0.941214 0.691427 0.572061 0.664173 0.216709 0.5852 [...]
+0.378707 0.057495 0.271587 0.078181 0.729009 0.977954 0.652556 0.522676 0.995952 0.906124 0.770011 0.860871 0.277914 0.84249 0.769857 0.30027 0.105497 0.898234 0.135721 0.410108 0.397078 0.234188 0.933366 0.320293 0.183263 0.889538 0.021363 0.472001 0.285184 0.388751 0.956713 0.773072 0.845005 0.273906 0.743956 0.564507 0.930497 0.64591 0.630758 0.803952 0.578832 0.830523 0.076334 0.672247 0.380388 0.905819 0.471578 0.952507 0.126456 0.43906 0.323557 0.736733 0.354163 0.41412 0.643761 0. [...]
+0.939468 0.736586 0.332572 0.751373 0.020722 0.712249 0.226339 0.024163 0.158136 0.428884 0.455259 0.0564 0.116949 0.649295 0.649978 0.669368 0.459935 0.072998 0.772721 0.414057 0.219387 0.363285 0.193584 0.332568 0.481023 0.314709 0.265908 0.896114 0.501548 0.664686 0.146842 0.523897 0.066793 0.935607 0.407003 0.448397 0.75683 0.598115 0.758382 0.048737 0.17742 0.338163 0.298839 0.866454 0.633928 0.701663 0.555038 0.938939 0.048779 0.826673 0.975804 0.112308 0.523927 0.632951 0.665426 0 [...]
+0.777085 0.848682 0.30723 0.570323 0.287751 0.006045 0.849636 0.52803 0.114205 0.080021 0.779435 0.522976 0.220942 0.23954 0.472366 0.879729 0.518388 0.98513 0.728921 0.61219 0.824391 0.54755 0.676823 0.99704 0.062652 0.531617 0.036553 0.739945 0.491845 0.479742 0.272815 0.292368 0.492012 0.433964 0.876538 0.261459 0.233626 0.027216 0.062162 0.936331 0.757622 0.893692 0.450979 0.672232 0.271772 0.983049 0.610533 0.438423 0.887024 0.253301 0.799327 0.384781 0.968942 0.624085 0.131212 0.52 [...]
+0.060651 0.010129 0.76097 0.083662 0.095549 0.303889 0.802508 0.007043 0.707377 0.880995 0.326515 0.864809 0.980759 0.925175 0.697725 0.413939 0.310817 0.454402 0.323416 0.838789 0.287194 0.731767 0.382574 0.53424 0.379549 0.330335 0.823269 0.95431 0.320705 0.916962 0.851307 0.872555 0.457254 0.113012 0.292351 0.944569 0.555531 0.345892 0.027636 0.048252 0.372973 0.351306 0.975812 0.438741 0.37387 0.141293 0.124343 0.286149 0.638211 0.873175 0.635177 0.327818 0.233321 0.569695 0.300219 0 [...]
+0.144739 0.55167 0.948435 0.822489 0.118983 0.12829 0.270748 0.962425 0.411124 0.727111 0.068463 0.251278 0.278973 0.825651 0.818157 0.880659 0.681471 0.656749 0.669723 0.585618 0.937694 0.210102 0.016619 0.85638 0.263432 0.428197 0.318015 0.42159 0.759527 0.577264 0.328058 0.068929 0.308566 0.135325 0.461829 0.545979 0.325186 0.510967 0.100904 0.506221 0.105314 0.221987 0.711203 0.715589 0.275541 0.123847 0.071567 0.878699 0.567489 0.18272 0.137973 0.259963 0.395572 0.374439 0.848574 0. [...]
+0.191991 0.165527 0.176284 0.493459 0.78857 0.412084 0.664049 0.905487 0.988193 0.952141 0.024155 0.883631 0.82626 0.135628 0.93101 0.248884 0.486927 0.417289 0.498944 0.112084 0.45877 0.151161 0.643379 0.353616 0.133153 0.533507 0.034615 0.903163 0.713908 0.958555 0.52895 0.563607 0.426413 0.607314 0.759614 0.433683 0.360922 0.560837 0.641023 0.300835 0.9355 0.302544 0.261438 0.973597 0.504107 0.459789 0.796228 0.848611 0.046085 0.413468 0.605056 0.334757 0.783293 0.369985 0.69421 0.742 [...]
+0.656858 0.541251 0.32474 0.236529 0.41944 0.017213 0.809154 0.121686 0.932316 0.129859 0.849403 0.23165 0.133485 0.92083 0.627636 0.691317 0.778999 0.587067 0.360129 0.261174 0.177792 0.090657 0.983003 0.895957 0.739106 0.363324 0.96514 0.442581 0.248984 0.970444 0.849694 0.661583 0.424889 0.196307 0.910269 0.409017 0.946354 0.170289 0.143288 0.167709 0.489431 0.901144 0.009163 0.208382 0.293667 0.478682 0.421278 0.029921 0.132712 0.410418 0.702238 0.132768 0.620878 0.476541 0.068908 0. [...]
+0.918775 0.616849 0.790933 0.494855 0.833869 0.890381 0.665232 0.334584 0.122374 0.182574 0.345453 0.090583 0.585112 0.593993 0.358438 0.396489 0.131483 0.875212 0.740714 0.340423 0.458859 0.758903 0.042665 0.192214 0.165161 0.464929 0.493943 0.698992 0.385223 0.493521 0.367254 0.9778 0.820705 0.357598 0.232942 0.953929 0.857772 0.618513 0.831336 0.922554 0.154805 0.429258 0.814241 0.952424 0.395339 0.667988 0.226626 0.633986 0.303813 0.318117 0.22536 0.759761 0.991287 0.547794 0.467635  [...]
+0.58927 0.623793 0.179259 0.18814 0.215894 0.199117 0.309507 0.207582 0.427173 0.518073 0.77993 0.218813 0.187964 0.309865 0.804344 0.254468 0.936647 0.75385 0.44799 0.055344 0.782353 0.016566 0.14426 0.374151 0.885884 0.193926 0.770081 0.577355 0.94806 0.232189 0.084488 0.601886 0.664412 0.746057 0.493314 0.596168 0.67441 0.296793 0.229697 0.733469 0.930726 0.369058 0.716864 0.07222 0.580854 0.071213 0.635215 0.494025 0.701939 0.642334 0.998112 0.133977 0.027443 0.002735 0.208323 0.3257 [...]
+0.683446 0.46255 0.244808 0.472556 0.385621 0.717352 0.501278 0.652096 0.634642 0.184822 0.177892 0.561467 0.760949 0.767292 0.581943 0.169912 0.648608 0.527352 0.809946 0.240285 0.779878 0.540349 0.531662 0.306172 0.850821 0.309075 0.695895 0.558896 0.341617 0.142816 0.244985 0.041809 0.111634 0.685836 0.016961 0.622303 0.198412 0.267328 0.684214 0.160348 0.743537 0.282787 0.246082 0.627499 0.736711 0.694321 0.656559 0.272195 0.81397 0.96525 0.543262 0.988304 0.243508 0.33578 0.964883 0 [...]
+0.531128 0.976675 0.74721 0.994887 0.621498 0.312755 0.555218 0.763275 0.791941 0.956384 0.326914 0.478449 0.969299 0.575728 0.470187 0.957171 0.097892 0.497081 0.997317 0.852306 0.995037 0.38903 0.762554 0.861816 0.654043 0.786525 0.202615 0.961729 0.143758 0.278247 0.822969 0.451929 0.349441 0.529998 0.505649 0.521136 0.713312 0.498405 0.403633 0.86753 0.849047 0.654573 0.22016 0.684821 0.823381 0.743573 0.358101 0.96396 0.374602 0.395041 0.31157 0.021532 0.050222 0.076615 0.230049 0.0 [...]
+0.711237 0.871658 0.621153 0.384558 0.535803 0.292812 0.225218 0.922684 0.262539 0.708585 0.890271 0.741117 0.075002 0.624674 0.895254 0.368381 0.672856 0.49329 0.434908 0.702175 0.378218 0.684713 0.565943 0.182577 0.076345 0.760193 0.764551 0.608399 0.119713 0.265232 0.099454 0.192054 0.206078 0.229528 0.597443 0.476756 0.383285 0.652971 0.506867 0.294169 0.783798 0.903392 0.70052 0.169302 0.34506 0.03365 0.372343 0.105505 0.661844 0.589533 0.556928 0.344139 0.94067 0.027014 0.746241 0. [...]
+0.310431 0.464753 0.527603 0.088479 0.784103 0.548817 0.201836 0.452848 0.502927 0.099865 0.490854 0.90026 0.130442 0.124783 0.813068 0.542967 0.131974 0.30963 0.258001 0.488003 0.947551 0.613429 0.600595 0.91201 0.408854 0.750293 0.714833 0.14016 0.674756 0.630483 0.401934 0.154195 0.688486 0.406507 0.567225 0.932702 0.21487 0.825155 0.954895 0.66673 0.593866 0.35816 0.033063 0.588113 0.986087 0.682629 0.608206 0.899105 0.881125 0.475815 0.678913 0.669637 0.055962 0.892949 0.634748 0.62 [...]
+0.003289 0.308059 0.311061 0.657478 0.210296 0.175238 0.033285 0.451603 0.28336 0.078979 0.238386 0.845017 0.332089 0.836985 0.324176 0.343163 0.351679 0.573869 0.417086 0.661514 0.62933 0.996416 0.458082 0.404796 0.395687 0.869539 0.423756 0.66129 0.943777 0.216567 0.840598 0.005556 0.081572 0.320152 0.965757 0.486603 0.110424 0.05034 0.685669 0.377621 0.635785 0.506537 0.275142 0.780144 0.139871 0.538524 0.494348 0.021642 0.366381 0.185617 0.063863 0.642359 0.18939 0.125313 0.534174 0. [...]
+0.057576 0.073074 0.05532 0.921781 0.276827 0.545076 0.398956 0.581239 0.551487 0.317896 0.08429 0.82427 0.724456 0.145339 0.613643 0.52381 0.49924 0.150637 0.017778 0.367177 0.152806 0.644495 0.590243 0.823823 0.106012 0.549429 0.899998 0.409104 0.135028 0.365652 0.981439 0.610494 0.005324 0.216373 0.514816 0.053574 0.12116 0.674513 0.889564 0.947427 0.337746 0.95804 0.951193 0.345742 0.32116 0.278767 0.730573 0.361257 0.96675 0.548108 0.418562 0.805144 0.130664 0.143554 0.104008 0.1298 [...]
+0.419442 0.093535 0.431472 0.228289 0.084772 0.795689 0.491892 0.827946 0.45261 0.630596 0.859329 0.378137 0.949902 0.284922 0.822209 0.568147 0.718367 0.146012 0.285837 0.262771 0.820692 0.793251 0.944887 0.412971 0.029861 0.82283 0.149076 0.844957 0.319191 0.739533 0.211442 0.816969 0.412118 0.84673 0.642008 0.520959 0.283914 0.499173 0.341431 0.531257 0.615883 0.197048 0.851617 0.554534 0.520931 0.331838 0.308329 0.915109 0.277858 0.447251 0.940667 0.278966 0.360135 0.862948 0.779388  [...]
+0.460161 0.178498 0.044585 0.710179 0.921795 0.444137 0.557818 0.654165 0.442409 0.029934 0.251778 0.615929 0.268222 0.536917 0.899353 0.637692 0.52938 0.691925 0.048134 0.975916 0.529992 0.295367 0.410494 0.15968 0.241491 0.683622 0.829461 0.188783 0.70675 0.577297 0.11089 0.933077 0.116855 0.947617 0.665602 0.449622 0.885973 0.909535 0.732872 0.866341 0.998972 0.919414 0.693096 0.184322 0.941003 0.414359 0.190883 0.796253 0.659075 0.672 0.128027 0.122054 0.813635 0.429098 0.584082 0.85 [...]
+0.527534 0.522569 0.26634 0.168021 0.396374 0.14102 0.387881 0.153389 0.902409 0.379896 0.183137 0.244151 0.701992 0.402289 0.035721 0.531138 0.510102 0.808134 0.377849 0.282315 0.41832 0.808172 0.990415 0.731135 0.60772 0.641357 0.958614 0.316677 0.487606 0.673654 0.870459 0.239485 0.974728 0.645094 0.420538 0.127458 0.378174 0.199417 0.68398 0.528774 0.537851 0.927774 0.747601 0.497328 0.321833 0.451285 0.901601 0.194027 0.169305 0.891261 0.225421 0.905104 0.423056 0.978863 0.618984 0. [...]
+0.155626 0.670185 0.676836 0.303088 0.565461 0.987565 0.514683 0.765195 0.753042 0.308346 0.538718 0.562305 0.238573 0.873087 0.502697 0.127802 0.804192 0.160694 0.062204 0.912987 0.404514 0.830035 0.234782 0.886099 0.634449 0.173916 0.797087 0.48467 0.572459 0.983647 0.516544 0.060829 0.896252 0.631785 0.753718 0.943271 0.557159 0.533828 0.762233 0.19831 0.163153 0.538457 0.774254 0.538216 0.74119 0.023419 0.862612 0.69988 0.585563 0.510472 0.108598 0.506844 0.403271 0.379503 0.305836 0 [...]
+0.396257 0.731289 0.979623 0.194535 0.786132 0.58816 0.400429 0.145401 0.399072 0.819721 0.647345 0.186164 0.744572 0.67475 0.452666 0.007705 0.994513 0.984327 0.346747 0.271352 0.086791 0.468719 0.616066 0.651688 0.327942 0.03774 0.252553 0.099463 0.919373 0.389842 0.627637 0.325877 0.495987 0.934867 0.430292 0.417548 0.336377 0.228286 0.597247 0.227932 0.174149 0.929167 0.391238 0.312082 0.51605 0.952346 0.274696 0.983406 0.315655 0.889267 0.083707 0.30743 0.756262 0.71293 0.921924 0.3 [...]
+0.306984 0.10497 0.945849 0.162554 0.501141 0.74365 0.123865 0.037061 0.905082 0.289777 0.128299 0.619909 0.284297 0.649374 0.927659 0.711961 0.130464 0.867638 0.654355 0.979525 0.070869 0.56511 0.673476 0.997616 0.950905 0.01027 0.07882 0.126615 0.048547 0.503951 0.752211 0.41283 0.26708 0.954223 0.947674 0.900665 0.217529 0.333673 0.826113 0.085668 0.336046 0.646561 0.790856 0.353012 0.582002 0.163353 0.170268 0.121111 0.693183 0.285844 0.585486 0.06819 0.296377 0.295387 0.661927 0.090 [...]
+0.746937 0.029913 0.348305 0.369773 0.514564 0.128394 0.704604 0.205301 0.066486 0.361998 0.798643 0.913878 0.851205 0.648083 0.222422 0.103451 0.238498 0.577052 0.281001 0.642825 0.274841 0.159639 0.676996 0.107329 0.281961 0.769959 0.567984 0.602774 0.196856 0.595788 0.732764 0.05707 0.439362 0.400198 0.450935 0.104601 0.426208 0.770805 0.697013 0.175508 0.79519 0.688946 0.586961 0.682879 0.684623 0.338108 0.115002 0.032166 0.112734 0.828905 0.669169 0.107693 0.709091 0.892043 0.637983 [...]
+0.736799 0.017504 0.522509 0.346989 0.690792 0.74168 0.431773 0.883467 0.11123 0.242557 0.611252 0.759136 0.159897 0.73525 0.920019 0.350371 0.508063 0.693128 0.113978 0.748768 0.417465 0.215585 0.150575 0.615344 0.734565 0.255445 0.1183 0.722914 0.665354 0.247275 0.855871 0.367406 0.102392 0.218573 0.300682 0.013425 0.62207 0.880783 0.191757 0.620556 0.064132 0.82489 0.310217 0.529953 0.902414 0.557554 0.542568 0.68301 0.196862 0.173708 0.388271 0.355863 0.247344 0.845806 0.018287 0.924 [...]
+0.757862 0.460806 0.27629 0.60934 0.681947 0.116025 0.07129 0.383292 0.05346 0.614758 0.010437 0.37653 0.376861 0.598829 0.584403 0.16609 0.863143 0.903998 0.764322 0.045693 0.092371 0.221204 0.163139 0.888412 0.679031 0.898017 0.136938 0.46335 0.923892 0.260779 0.054473 0.278737 0.731515 0.595595 0.781778 0.678666 0.530339 0.11989 0.672397 0.396779 0.29556 0.819494 0.880223 0.3887 0.502512 0.022309 0.732472 0.172612 0.038753 0.247035 0.312277 0.28152 0.373938 0.069377 0.834133 0.102769  [...]
+0.698558 0.374683 0.64071 0.879914 0.243628 0.903076 0.497533 0.049649 0.604341 0.768608 0.941672 0.720405 0.080118 0.718049 0.815129 0.967091 0.624706 0.926495 0.59493 0.831593 0.140743 0.081417 0.126833 0.524578 0.331933 0.581021 0.199631 0.75313 0.932573 0.304323 0.333943 0.159631 0.597098 0.266812 0.140856 0.581225 0.718931 0.822744 0.906899 0.859408 0.673546 0.648624 0.759939 0.778528 0.266025 0.798149 0.276877 0.154015 0.812559 0.600208 0.883821 0.636289 0.799035 0.5041 0.825176 0. [...]
+0.453317 0.374521 0.403108 0.494564 0.681278 0.352889 0.169834 0.694528 0.407217 0.85758 0.642276 0.26127 0.671518 0.986826 0.579984 0.185074 0.380661 0.902441 0.26602 0.492374 0.55307 0.810682 0.667879 0.061849 0.359356 0.198214 0.654873 0.521069 0.444698 0.240449 0.089264 0.085774 0.662611 0.539751 0.716732 0.51314 0.747282 0.003839 0.595036 0.982814 0.079338 0.810873 0.346943 0.806258 0.554385 0.201911 0.179273 0.929354 0.696152 0.555798 0.326333 0.144677 0.196207 0.25709 0.087098 0.0 [...]
+0.460853 0.916855 0.783362 0.228713 0.138443 0.600125 0.746291 0.781843 0.97914 0.206991 0.425491 0.876464 0.447093 0.784281 0.596709 0.460219 0.988545 0.065837 0.973181 0.391033 0.152163 0.935952 0.461638 0.80455 0.984576 0.521948 0.951256 0.061107 0.461325 0.057834 0.950038 0.787489 0.604029 0.363822 0.885172 0.571915 0.324222 0.943976 0.288861 0.619929 0.321765 0.52182 0.728411 0.248587 0.49873 0.398213 0.97296 0.283524 0.358178 0.87588 0.401112 0.906221 0.367473 0.617321 0.14402 0.97 [...]
+0.870198 0.831027 0.119379 0.430796 0.656013 0.413316 0.280458 0.135203 0.814342 0.48531 0.69175 0.811687 0.157503 0.940974 0.736727 0.246675 0.539275 0.600691 0.216458 0.372946 0.663401 0.816126 0.292579 0.814874 0.025892 0.489572 0.478401 0.005191 0.301026 0.181313 0.485547 0.014266 0.640894 0.777135 0.687792 0.104749 0.853786 0.606829 0.025194 0.07919 0.211483 0.91123 0.904218 0.963869 0.174047 0.201726 0.838702 0.865994 0.72214 0.962123 0.849581 0.930851 0.075474 0.393959 0.59482 0.3 [...]
+0.851568 0.321937 0.353769 0.136782 0.437764 0.589891 0.932601 0.083223 0.401534 0.602709 0.018152 0.933384 0.677982 0.747649 0.254812 0.055716 0.574669 0.212201 0.334299 0.277178 0.831613 0.777661 0.204931 0.737958 0.712942 0.922347 0.564758 0.770683 0.317557 0.899487 0.533095 0.804286 0.529401 0.084984 0.188225 0.100843 0.217278 0.738235 0.908281 0.741966 0.384737 0.595824 0.217519 0.202374 0.47647 0.104854 0.950099 0.101471 0.266718 0.175821 0.22271 0.224134 0.758289 0.122146 0.720834 [...]
+0.579469 0.13011 0.75 0.298501 0.922644 0.520296 0.089729 0.882034 0.385867 0.281808 0.747036 0.433187 0.303153 0.406184 0.294584 0.408566 0.116307 0.990038 0.93263 0.316253 0.752414 0.916455 0.803026 0.088392 0.081614 0.610766 0.862497 0.232321 0.803335 0.769881 0.246943 0.487293 0.821958 0.221062 0.589843 0.517344 0.190729 0.829756 0.381728 0.724519 0.906792 0.727398 0.775275 0.27289 0.400103 0.918921 0.19108 0.283038 0.092194 0.61848 0.624298 0.164672 0.791337 0.280484 0.030798 0.7669 [...]
+0.667351 0.556209 0.555141 0.448527 0.077353 0.712499 0.457488 0.054352 0.538917 0.059955 0.34541 0.971366 0.9897 0.190204 0.413174 0.02252 0.583021 0.297758 0.169049 0.636392 0.912146 0.462531 0.770467 0.015542 0.328119 0.421316 0.544994 0.673562 0.482743 0.137912 0.76925 0.138031 0.665621 0.548892 0.485483 0.588657 0.841552 0.775645 0.705382 0.279896 0.113015 0.878221 0.462334 0.041506 0.858097 0.204328 0.809443 0.380545 0.848859 0.775218 0.443358 0.45626 0.070041 0.146173 0.103573 0.7 [...]
+0.294114 0.912295 0.329158 0.04044 0.345015 0.444355 0.760752 0.904657 0.668793 0.767571 0.449788 0.161325 0.475294 0.135076 0.342163 0.2982 0.06971 0.976377 0.000875 0.153043 0.780533 0.812237 0.314908 0.283662 0.094192 0.054064 0.262278 0.21059 0.291567 0.865839 0.750004 0.231184 0.307845 0.172304 0.677536 0.535478 0.300488 0.96183 0.141675 0.995357 0.845875 0.713067 0.137187 0.292075 0.852968 0.489926 0.213982 0.284273 0.663803 0.735841 0.336529 0.110867 0.897095 0.575919 0.494078 0.4 [...]
+0.310618 0.588206 0.883746 0.45258 0.386005 0.500504 0.376857 0.70995 0.107946 0.900595 0.943097 0.117732 0.970494 0.498613 0.315809 0.482959 0.662847 0.02691 0.583221 0.897833 0.25151 0.383845 0.495813 0.433945 0.951037 0.177372 0.800189 0.579617 0.672632 0.570363 0.842509 0.23336 0.553241 0.634292 0.070978 0.388698 0.116295 0.545894 0.034161 0.589273 0.258792 0.634164 0.266055 0.229732 0.682207 0.735564 0.074903 0.930888 0.902183 0.436128 0.846669 0.107619 0.753651 0.620136 0.931705 0. [...]
+0.971779 0.999375 0.166341 0.293933 0.651617 0.528583 0.252125 0.999131 0.626402 0.80738 0.791535 0.895743 0.204409 0.990412 0.302954 0.667256 0.949983 0.655841 0.140452 0.6359 0.926089 0.117177 0.251492 0.705166 0.934028 0.397282 0.09467 0.064883 0.450302 0.324157 0.957949 0.426732 0.915422 0.037409 0.222534 0.588564 0.353339 0.994069 0.517988 0.162251 0.673108 0.424786 0.757658 0.780258 0.663085 0.836858 0.358963 0.376507 0.744831 0.92257 0.736632 0.734878 0.049669 0.493006 0.691011 0. [...]
+0.765914 0.93586 0.55873 0.957758 0.798704 0.515344 0.046747 0.857418 0.354342 0.460362 0.788436 0.117729 0.971227 0.145879 0.409373 0.629212 0.367744 0.337627 0.719925 0.117498 0.660092 0.816284 0.077084 0.010235 0.18432 0.904187 0.356842 0.002265 0.415026 0.915024 0.058001 0.809191 0.834897 0.226154 0.159087 0.624896 0.534981 0.46905 0.484102 0.866234 0.168674 0.674015 0.703547 0.156306 0.008453 0.183097 0.423695 0.431346 0.173773 0.854789 0.488645 0.410975 0.940331 0.637133 0.199184 0 [...]
+0.812121 0.878029 0.526476 0.944155 0.174497 0.800267 0.06082 0.816185 0.542883 0.528444 0.972981 0.833267 0.281428 0.245938 0.037681 0.615399 0.274806 0.129579 0.757153 0.69696 0.311581 0.201573 0.13846 0.598823 0.17152 0.322651 0.063855 0.276955 0.482646 0.640288 0.153089 0.344658 0.341254 0.23999 0.201076 0.681604 0.061987 0.372327 0.017607 0.123758 0.229611 0.929474 0.335231 0.903289 0.794051 0.564762 0.171728 0.001507 0.545424 0.599573 0.979424 0.08094 0.806717 0.257119 0.185495 0.6 [...]
+0.466358 0.482365 0.104857 0.931537 0.962149 0.168067 0.393242 0.58392 0.077358 0.523364 0.492562 0.374284 0.210442 0.444283 0.942804 0.242172 0.231721 0.027837 0.211246 0.082339 0.367922 0.051619 0.295979 0.96687 0.05492 0.740709 0.796795 0.402696 0.489007 0.888154 0.829249 0.865877 0.902414 0.840014 0.47735 0.972567 0.292666 0.542817 0.718312 0.867814 0.588756 0.702553 0.917645 0.041903 0.505653 0.492824 0.210777 0.56423 0.197717 0.170687 0.276649 0.575727 0.296934 0.514501 0.205937 0. [...]
+0.651704 0.257837 0.045585 0.504684 0.04846 0.212859 0.134922 0.456909 0.12773 0.312024 0.113162 0.1268 0.969117 0.683166 0.577921 0.533392 0.62537 0.307663 0.649294 0.071121 0.191434 0.07465 0.666244 0.550617 0.986232 0.233429 0.730121 0.417068 0.295018 0.423047 0.118676 0.279356 0.479392 0.191651 0.804543 0.38503 0.277283 0.627904 0.050366 0.818868 0.102638 0.210843 0.676683 0.930027 0.601547 0.488028 0.652097 0.10778 0.206739 0.857051 0.553732 0.745375 0.935847 0.293685 0.540813 0.841 [...]
+0.501863 0.103948 0.962713 0.738856 0.012048 0.800919 0.614714 0.180427 0.06992 0.850086 0.955552 0.959041 0.723697 0.002763 0.82305 0.913661 0.18016 0.729674 0.699211 0.941785 0.858669 0.625223 0.34797 0.925304 0.518347 0.292179 0.557381 0.13153 0.865426 0.001394 0.705387 0.577572 0.441807 0.752767 0.164513 0.099879 0.814901 0.013878 0.864142 0.151958 0.301297 0.412714 0.858139 0.185861 0.252584 0.14814 0.747992 0.626954 0.172326 0.084785 0.568696 0.504564 0.356714 0.90589 0.714518 0.37 [...]
+0.156699 0.561384 0.346445 0.136802 0.756125 0.317107 0.575171 0.563925 0.016803 0.959244 0.11028 0.052525 0.342293 0.757756 0.514809 0.088315 0.351206 0.225944 0.368474 0.539007 0.683012 0.311224 0.414806 0.975442 0.00503 0.703203 0.515917 0.341829 0.691235 0.417195 0.998716 0.705202 0.360285 0.467437 0.257259 0.376043 0.160998 0.729833 0.851818 0.369947 0.243096 0.713985 0.076519 0.578515 0.960292 0.419602 0.930479 0.262129 0.348747 0.539753 0.399424 0.406525 0.259775 0.351955 0.214681 [...]
+0.527111 0.218538 0.575393 0.891983 0.846704 0.554055 0.780403 0.221965 0.668238 0.560119 0.51608 0.122547 0.86828 0.914535 0.312497 0.323472 0.544119 0.207614 0.81864 0.916965 0.070507 0.959994 0.024257 0.505139 0.763977 0.534072 0.84538 0.299121 0.319652 0.717804 0.854127 0.831631 0.043588 0.183765 0.692869 0.913619 0.375944 0.303962 0.59289 0.673877 0.716082 0.795512 0.213625 0.067488 0.88408 0.905774 0.892813 0.614633 0.552747 0.545345 0.481445 0.25235 0.43712 0.68135 0.470754 0.9517 [...]
+0.433273 0.238855 0.475806 0.932323 0.641109 0.958007 0.122408 0.550916 0.063099 0.403358 0.193474 0.720933 0.89429 0.043781 0.472071 0.123724 0.041969 0.208593 0.91128 0.905586 0.722888 0.621271 0.762604 0.792863 0.290771 0.204242 0.239571 0.034496 0.866477 0.969146 0.980669 0.537769 0.103726 0.162916 0.786799 0.717739 0.156121 0.874101 0.385882 0.692408 0.352094 0.419602 0.969387 0.136015 0.097075 0.048347 0.271851 0.323308 0.846616 0.434864 0.075338 0.619635 0.535936 0.206526 0.284788 [...]
+0.621234 0.120088 0.834509 0.328374 0.532733 0.973265 0.46578 0.083625 0.984335 0.186396 0.87582 0.21127 0.799197 0.244656 0.328117 0.244566 0.121355 0.657888 0.625881 0.800759 0.656473 0.456827 0.170289 0.675646 0.696044 0.960613 0.237834 0.94106 0.74551 0.069766 0.131766 0.273969 0.434538 0.901165 0.122685 0.290169 0.554472 0.706014 0.118998 0.474192 0.669948 0.392602 0.410171 0.506322 0.870437 0.427049 0.474214 0.961874 0.909305 0.899288 0.811014 0.59933 0.127612 0.223519 0.645652 0.9 [...]
+0.522876 0.273202 0.462544 0.562745 0.924177 0.113702 0.76394 0.13516 0.303196 0.287651 0.741634 0.205446 0.286497 0.230639 0.301145 0.504437 0.721104 0.009592 0.672995 0.948049 0.248686 0.243497 0.760612 0.318736 0.480569 0.771224 0.8256 0.853789 0.9994 0.262866 0.437668 0.47544 0.903578 0.766847 0.405902 0.469234 0.279352 0.232625 0.631974 0.404076 0.353719 0.722101 0.939266 0.55276 0.805175 0.725721 0.120799 0.707896 0.545053 0.930339 0.216296 0.331933 0.593969 0.053742 0.928883 0.080 [...]
+0.978198 0.853446 0.322863 0.215762 0.130632 0.888631 0.028153 0.590165 0.973695 0.813555 0.799991 0.614762 0.406086 0.787559 0.010696 0.747544 0.115013 0.320719 0.847407 0.108183 0.827224 0.106215 0.90634 0.794021 0.438211 0.6873 0.675509 0.657781 0.148777 0.883298 0.164088 0.651896 0.645335 0.887635 0.728102 0.896217 0.647557 0.357189 0.706957 0.832503 0.572601 0.517458 0.810488 0.918863 0.73925 0.023639 0.511124 0.240232 0.965198 0.932437 0.604929 0.877415 0.966535 0.941967 0.199624 0 [...]
+0.116246 0.417102 0.064781 0.357155 0.536984 0.957098 0.164159 0.902938 0.098632 0.465274 0.093431 0.02826 0.81183 0.462672 0.854775 0.346668 0.378682 0.504301 0.069414 0.2247 0.90144 0.342693 0.251336 0.191124 0.673693 0.143508 0.027257 0.058169 0.821581 0.458388 0.60729 0.640862 0.747108 0.133132 0.897857 0.807855 0.532013 0.131113 0.845549 0.934378 0.984576 0.424463 0.753391 0.878802 0.88033 0.828697 0.322967 0.514105 0.504209 0.933364 0.647013 0.379655 0.106963 0.111792 0.796814 0.64 [...]
+0.797393 0.300599 0.495555 0.974184 0.186662 0.109604 0.524813 0.879173 0.745795 0.216256 0.810259 0.212389 0.807703 0.646551 0.613585 0.174565 0.847073 0.564532 0.673281 0.59989 0.193223 0.658441 0.955283 0.99084 0.087867 0.502736 0.353662 0.78367 0.965916 0.863245 0.228632 0.542009 0.348954 0.98246 0.887303 0.666547 0.226816 0.814642 0.176428 0.493216 0.486246 0.174955 0.454503 0.023007 0.325184 0.755728 0.956145 0.249415 0.135793 0.68887 0.25315 0.872671 0.171842 0.825697 0.412038 0.1 [...]
+0.337028 0.435408 0.933221 0.810914 0.153619 0.148106 0.49974 0.479797 0.257714 0.034137 0.758914 0.472958 0.013702 0.997588 0.317549 0.048515 0.896435 0.694338 0.186227 0.063064 0.707446 0.794633 0.207489 0.916276 0.589943 0.901999 0.240874 0.009112 0.997509 0.883807 0.708939 0.707538 0.180424 0.277485 0.685434 0.306252 0.390721 0.404977 0.546472 0.934378 0.970472 0.464046 0.872324 0.260907 0.285524 0.855732 0.337772 0.909243 0.915976 0.354223 0.027898 0.679496 0.857992 0.147082 0.65982 [...]
+0.237387 0.871343 0.258548 0.421311 0.954316 0.715542 0.232703 0.328339 0.166937 0.887513 0.61725 0.801192 0.385619 0.853256 0.695778 0.34627 0.615535 0.508268 0.778391 0.707609 0.270584 0.036643 0.14678 0.940411 0.451426 0.135702 0.620524 0.567009 0.518045 0.800659 0.658541 0.339376 0.418887 0.871928 0.420293 0.130103 0.664535 0.34499 0.562388 0.991184 0.010997 0.536072 0.953608 0.528397 0.553569 0.397728 0.298479 0.580188 0.968292 0.454633 0.283077 0.511791 0.303949 0.883701 0.393764 0 [...]
+0.265935 0.45014 0.798241 0.295358 0.93494 0.228635 0.891649 0.60432 0.912295 0.047324 0.608128 0.943941 0.029384 0.098023 0.996917 0.021218 0.521089 0.928112 0.319581 0.596039 0.903604 0.313812 0.028818 0.201091 0.201642 0.149639 0.924714 0.165557 0.174494 0.712928 0.558835 0.043915 0.53743 0.760289 0.399453 0.92078 0.602583 0.133067 0.384872 0.290772 0.445385 0.11794 0.058963 0.651823 0.27505 0.627512 0.251516 0.360077 0.974647 0.091243 0.625878 0.900172 0.498813 0.0157 0.604894 0.2076 [...]
+0.137788 0.360917 0.435128 0.168719 0.655019 0.265594 0.695422 0.964481 0.463453 0.68401 0.15865 0.456992 0.494972 0.104499 0.05236 0.821761 0.27507 0.207098 0.146681 0.783261 0.813176 0.462153 0.194924 0.420668 0.075269 0.810102 0.174409 0.126826 0.295204 0.684729 0.563172 0.152734 0.890102 0.342292 0.347006 0.333783 0.007008 0.600486 0.699178 0.158225 0.157165 0.541625 0.876235 0.530167 0.071787 0.94768 0.243975 0.16518 0.043621 0.942307 0.731047 0.98865 0.76327 0.855952 0.398221 0.458 [...]
+0.754477 0.975612 0.506336 0.099713 0.106191 0.231155 0.891641 0.635337 0.537457 0.493361 0.515371 0.10944 0.062016 0.728741 0.607706 0.733013 0.571712 0.408782 0.843457 0.147738 0.959808 0.620571 0.780544 0.170199 0.789887 0.23766 0.173402 0.145904 0.421494 0.083304 0.246529 0.133662 0.136332 0.565354 0.186131 0.556521 0.807523 0.007573 0.911798 0.850439 0.45931 0.856455 0.12375 0.359061 0.676835 0.266608 0.023556 0.071304 0.350314 0.820442 0.661848 0.698055 0.455677 0.911058 0.532656 0 [...]
+0.578382 0.664173 0.21006 0.976015 0.25632 0.721868 0.440718 0.190752 0.107262 0.537834 0.351853 0.631592 0.995776 0.240446 0.056078 0.914274 0.135182 0.419652 0.735034 0.841972 0.020061 0.254574 0.354366 0.920068 0.380184 0.315863 0.39024 0.663021 0.637624 0.150026 0.595433 0.242275 0.743598 0.937841 0.035415 0.052226 0.731799 0.792451 0.214822 0.55699 0.039642 0.53718 0.834773 0.508018 0.942785 0.248523 0.938048 0.064855 0.621756 0.181284 0.357065 0.078559 0.942685 0.174751 0.953114 0. [...]
+0.249539 0.808195 0.186893 0.382585 0.594039 0.040978 0.250516 0.720596 0.133662 0.087696 0.507857 0.037299 0.372488 0.205855 0.067642 0.724456 0.342302 0.728773 0.273397 0.588742 0.631602 0.733553 0.63404 0.625821 0.418796 0.909535 0.588962 0.675301 0.745696 0.006794 0.574548 0.058887 0.951351 0.591399 0.543272 0.837434 0.217884 0.933091 0.105947 0.552542 0.792515 0.015211 0.632011 0.52551 0.22183 0.963226 0.104395 0.568942 0.209056 0.791662 0.869508 0.208608 0.351073 0.563441 0.441394  [...]
+0.465906 0.710447 0.9192 0.770327 0.171446 0.314212 0.063752 0.772483 0.627359 0.405108 0.300053 0.45161 0.405545 0.213086 0.991614 0.311928 0.068569 0.665127 0.085553 0.297623 0.366201 0.896501 0.87816 0.884117 0.347853 0.755194 0.232766 0.843592 0.968717 0.514181 0.02051 0.192603 0.643509 0.436882 0.812813 0.174614 0.748256 0.296844 0.186669 0.363789 0.090263 0.919349 0.329294 0.148022 0.852722 0.443386 0.838388 0.980103 0.218737 0.682678 0.775626 0.689244 0.478199 0.245177 0.043465 0. [...]
+0.221996 0.248157 0.938942 0.231402 0.294776 0.855737 0.358819 0.444505 0.30282 0.985026 0.890407 0.289146 0.201054 0.76217 0.418923 0.300295 0.468045 0.249926 0.774747 0.719589 0.351588 0.25352 0.826949 0.334943 0.5293 0.279205 0.703155 0.146392 0.8225 0.59293 0.668171 0.307238 0.083186 0.335904 0.523589 0.134464 0.390507 0.428562 0.945885 0.765928 0.189849 0.05696 0.554822 0.206691 0.553052 0.485328 0.425877 0.525027 0.627404 0.403856 0.924001 0.812366 0.520453 0.783443 0.14706 0.03328 [...]
+0.275302 0.612886 0.26572 0.381499 0.632639 0.472101 0.592234 0.209623 0.236329 0.589075 0.372628 0.68498 0.643305 0.034501 0.6113 0.329865 0.835846 0.721874 0.720132 0.200161 0.459288 0.525049 0.703934 0.231125 0.996506 0.838521 0.359398 0.311414 0.103076 0.717833 0.233984 0.146409 0.930047 0.460059 0.392866 0.168318 0.712536 0.858414 0.971985 0.485843 0.564841 0.122662 0.779536 0.935527 0.981279 0.485154 0.597806 0.361797 0.97732 0.061443 0.466061 0.731923 0.820696 0.574733 0.99382 0.9 [...]
+0.918327 0.498257 0.116266 0.3362 0.458148 0.851875 0.101415 0.1978 0.358474 0.928198 0.245052 0.949285 0.960024 0.421559 0.850587 0.456778 0.391981 0.088795 0.830242 0.29206 0.704151 0.115965 0.938304 0.763247 0.576732 0.828024 0.612714 0.194582 0.745222 0.515422 0.22995 0.468536 0.601844 0.278526 0.197841 0.585653 0.740645 0.754454 0.779832 0.908748 0.250566 0.390197 0.374039 0.776722 0.710077 0.501915 0.022717 0.190905 0.927155 0.139345 0.730799 0.188671 0.047688 0.350188 0.688561 0.5 [...]
+0.545155 0.157776 0.029553 0.679237 0.342688 0.256783 0.780784 0.606747 0.989606 0.565588 0.532695 0.791624 0.314744 0.347696 0.552582 0.37783 0.6232 0.218369 0.233878 0.027454 0.535192 0.22781 0.756332 0.370828 0.843849 0.577671 0.274489 0.455072 0.141059 0.025757 0.981297 0.808638 0.671195 0.652807 0.846663 0.433563 0.536221 0.081104 0.71685 0.503653 0.67373 0.303783 0.816342 0.860655 0.233732 0.303507 0.572656 0.308961 0.036312 0.718438 0.875112 0.039053 0.791364 0.988607 0.835225 0.6 [...]
+0.83646 0.04122 0.508737 0.254844 0.682735 0.758785 0.126232 0.233323 0.870445 0.028634 0.119743 0.578149 0.476161 0.156607 0.490182 0.098986 0.795944 0.85518 0.68133 0.632501 0.70771 0.169584 0.353793 0.734249 0.102548 0.162795 0.36756 0.320492 0.011234 0.524459 0.361078 0.976848 0.967749 0.929433 0.632779 0.859722 0.004982 0.038674 0.026386 0.027345 0.995662 0.967893 0.081596 0.050919 0.632361 0.59204 0.051439 0.617239 0.17298 0.259383 0.147867 0.485147 0.848748 0.636724 0.339836 0.484 [...]
+0.220987 0.382687 0.692833 0.305931 0.946194 0.740731 0.946352 0.407443 0.337682 0.187445 0.295671 0.323239 0.455428 0.37028 0.047463 0.004688 0.799766 0.003819 0.889393 0.547226 0.895301 0.184625 0.549257 0.728094 0.303783 0.369506 0.396387 0.626289 0.185117 0.89789 0.686944 0.597683 0.145685 0.580253 0.616407 0.215755 0.107049 0.532406 0.376462 0.468615 0.984022 0.274021 0.805703 0.933989 0.209667 0.473767 0.757904 0.540902 0.784591 0.458866 0.620136 0.40203 0.534769 0.034061 0.858457  [...]
+0.782666 0.33619 0.542521 0.484258 0.179189 0.331685 0.791245 0.629044 0.945559 0.492608 0.770724 0.565536 0.266778 0.165665 0.174893 0.602001 0.832206 0.869161 0.096471 0.86134 0.653384 0.601178 0.112693 0.573164 0.615624 0.54419 0.232647 0.929361 0.815436 0.646695 0.417595 0.922099 0.812524 0.595062 0.813804 0.442429 0.529133 0.838413 0.788443 0.48071 0.420897 0.549563 0.337203 0.59063 0.315091 0.640016 0.294274 0.555893 0.36277 0.770117 0.792449 0.375226 0.295995 0.710802 0.064591 0.6 [...]
+0.222466 0.818698 0.681862 0.82913 0.548304 0.175599 0.042877 0.140923 0.419591 0.47027 0.434751 0.368184 0.097963 0.445627 0.628489 0.989646 0.751799 0.870931 0.805491 0.013053 0.967583 0.390965 0.707953 0.118992 0.547154 0.960074 0.847594 0.311376 0.12219 0.620345 0.993395 0.51935 0.698935 0.724283 0.212242 0.23445 0.604377 0.642698 0.508605 0.584029 0.219936 0.105259 0.794141 0.486312 0.81156 0.485478 0.335489 0.451461 0.867991 0.671828 0.766709 0.046402 0.933651 0.147233 0.637541 0.7 [...]
+0.117055 0.467946 0.311684 0.680477 0.531389 0.49277 0.522179 0.494949 0.961424 0.199186 0.650145 0.708597 0.383362 0.177226 0.908987 0.495045 0.650932 0.778446 0.290285 0.369519 0.715824 0.008652 0.503568 0.110636 0.05699 0.433758 0.129445 0.258177 0.323611 0.208517 0.572373 0.474708 0.471096 0.984811 0.856136 0.075964 0.514653 0.478522 0.131438 0.451016 0.761448 0.383866 0.877629 0.411139 0.757627 0.943146 0.090997 0.285796 0.793355 0.812625 0.246914 0.596576 0.640318 0.794247 0.522552 [...]
+0.824579 0.521245 0.414433 0.783796 0.270558 0.114013 0.518816 0.152494 0.91406 0.648623 0.418365 0.655012 0.45785 0.707424 0.771526 0.77239 0.021263 0.69436 0.478138 0.323005 0.49426 0.484743 0.843966 0.486637 0.300085 0.621785 0.520839 0.108701 0.233241 0.500867 0.731263 0.07544 0.347788 0.904632 0.05409 0.107506 0.326897 0.445873 0.426774 0.211589 0.858989 0.905895 0.679588 0.004941 0.012149 0.788322 0.855643 0.035644 0.435737 0.339018 0.599462 0.745005 0.472287 0.817279 0.731794 0.99 [...]
+0.623794 0.527409 0.61021 0.935128 0.133694 0.868046 0.544483 0.986761 0.87087 0.118186 0.102337 0.003344 0.738557 0.996208 0.736568 0.076281 0.620935 0.775076 0.16643 0.802118 0.193684 0.669299 0.634161 0.061863 0.971743 0.76819 0.373388 0.875731 0.66535 0.024064 0.302379 0.512838 0.387555 0.55684 0.615543 0.665952 0.281761 0.570235 0.035779 0.089214 0.178117 0.417163 0.817728 0.411784 0.654764 0.250001 0.814306 0.512418 0.039355 0.676217 0.370693 0.364399 0.610961 0.537268 0.788722 0.0 [...]
+0.460434 0.550925 0.000526 0.175701 0.187979 0.601262 0.740624 0.36412 0.630918 0.92451 0.195974 0.699813 0.450192 0.406492 0.325578 0.94047 0.486295 0.63346 0.309559 0.794678 0.018997 0.071672 0.846959 0.807625 0.685432 0.954959 0.611975 0.457031 0.997646 0.961482 0.121852 0.205306 0.407083 0.630552 0.755791 0.276836 0.984729 0.677905 0.782667 0.354726 0.260248 0.29349 0.933515 0.345487 0.843855 0.785905 0.114126 0.141843 0.967045 0.405427 0.048413 0.827552 0.628266 0.854506 0.54601 0.6 [...]
+0.754879 0.090338 0.778912 0.881401 0.376515 0.316882 0.119345 0.796755 0.469947 0.897861 0.840137 0.016461 0.797268 0.412367 0.209885 0.720951 0.511109 0.817011 0.427292 0.450812 0.169484 0.161804 0.617154 0.53605 0.10026 0.785245 0.320641 0.481648 0.833832 0.089007 0.308467 0.385408 0.472818 0.441884 0.970842 0.012473 0.13166 0.648032 0.572685 0.63447 0.231074 0.148836 0.885971 0.769588 0.057162 0.719575 0.32425 0.104014 0.782588 0.929392 0.294983 0.478946 0.591694 0.774436 0.896473 0. [...]
+0.525812 0.203065 0.021693 0.439221 0.513443 0.8683 0.042393 0.43072 0.69479 0.097907 0.596272 0.733029 0.26519 0.286346 0.600007 0.930806 0.58432 0.109796 0.973104 0.639873 0.450564 0.040825 0.468187 0.698851 0.791863 0.976322 0.515283 0.710507 0.039809 0.301799 0.403072 0.554425 0.890966 0.211765 0.815276 0.95327 0.228906 0.307834 0.116038 0.475876 0.965068 0.080631 0.352342 0.252927 0.250844 0.562775 0.359664 0.258013 0.973098 0.886813 0.703044 0.727036 0.76892 0.312098 0.397564 0.751 [...]
+0.718524 0.461851 0.70288 0.167885 0.440081 0.194853 0.901916 0.676125 0.110838 0.176451 0.518957 0.166722 0.562771 0.871496 0.99731 0.019085 0.273236 0.672797 0.609762 0.454232 0.192283 0.601457 0.069373 0.75135 0.391702 0.481137 0.719082 0.187892 0.504195 0.934853 0.797896 0.47359 0.579261 0.036142 0.656543 0.301927 0.620342 0.281276 0.349871 0.082448 0.506076 0.87552 0.908199 0.215943 0.212846 0.827835 0.114335 0.874519 0.352006 0.239505 0.579857 0.590214 0.206638 0.995518 0.684905 0. [...]
+0.693955 0.275903 0.884465 0.766039 0.900751 0.031099 0.26026 0.858729 0.958926 0.098942 0.548076 0.709081 0.127508 0.332099 0.000452 0.415666 0.474444 0.920903 0.402589 0.405709 0.322585 0.24867 0.323051 0.007557 0.154527 0.48618 0.207951 0.41999 0.159355 0.369486 0.202366 0.202564 0.530708 0.954763 0.057846 0.072667 0.292644 0.478979 0.165808 0.341594 0.285493 0.489458 0.154954 0.7389 0.090307 0.500011 0.907478 0.127863 0.518405 0.372651 0.449175 0.255421 0.185941 0.214592 0.613884 0.9 [...]
+0.288459 0.929717 0.046968 0.603631 0.040122 0.467497 0.977093 0.923086 0.209232 0.969575 0.742651 0.537574 0.425497 0.30122 0.430088 0.261161 0.296823 0.216343 0.782721 0.513012 0.937997 0.479857 0.702272 0.987411 0.832371 0.373351 0.40706 0.040898 0.357184 0.562312 0.152856 0.896414 0.975083 0.23742 0.997679 0.286437 0.773496 0.513142 0.716325 0.016235 0.627382 0.284913 0.91757 0.509909 0.97083 0.53237 0.715232 0.842649 0.793752 0.720057 0.259985 0.400711 0.654737 0.302768 0.763212 0.3 [...]
+0.489627 0.763146 0.910034 0.715215 0.121759 0.440046 0.327244 0.959457 0.461097 0.132251 0.385013 0.575213 0.349559 0.614461 0.146611 0.875776 0.46183 0.322588 0.603295 0.713465 0.695709 0.019661 0.240802 0.303755 0.307717 0.900915 0.112632 0.540288 0.578847 0.859247 0.326965 0.202679 0.274827 0.549224 0.922641 0.352591 0.740555 0.479519 0.074928 0.75795 0.586723 0.780036 0.392366 0.330756 0.537713 0.91418 0.836488 0.472527 0.412066 0.981732 0.26608 0.573027 0.02564 0.682611 0.330804 0. [...]
+0.179846 0.25932 0.206547 0.368244 0.738618 0.94947 0.767225 0.95052 0.539626 0.444127 0.19095 0.597223 0.29349 0.375551 0.894463 0.231749 0.872739 0.640106 0.792834 0.685119 0.527713 0.725503 0.336632 0.802365 0.653761 0.39777 0.72872 0.156554 0.279675 0.009799 0.310838 0.032399 0.334546 0.281778 0.559536 0.050759 0.429067 0.06586 0.98051 0.118633 0.962151 0.007475 0.500007 0.323786 0.59188 0.192265 0.719997 0.115745 0.672794 0.350256 0.451452 0.093438 0.714988 0.147694 0.933903 0.92429 [...]
+0.458319 0.167822 0.391 0.27377 0.457092 0.799469 0.748057 0.6056 0.723139 0.917854 0.9878 0.329996 0.500214 0.959388 0.488017 0.06623 0.355064 0.56112 0.802409 0.703699 0.183147 0.062673 0.703585 0.617079 0.842328 0.834547 0.865857 0.602861 0.761765 0.047671 0.932635 0.219554 0.987047 0.500193 0.328701 0.093238 0.312027 0.045673 0.325419 0.054193 0.429591 0.324039 0.337728 0.398928 0.075061 0.415553 0.862499 0.927134 0.914538 0.494642 0.931287 0.158823 0.907947 0.910083 0.901116 0.90859 [...]
+0.840031 0.505119 0.493545 0.841701 0.070027 0.120412 0.382509 0.929204 0.752745 0.40066 0.873487 0.279754 0.471075 0.920125 0.234843 0.015079 0.917174 0.300899 0.44587 0.830211 0.993245 0.358776 0.098394 0.105981 0.465289 0.85444 0.802038 0.118368 0.222628 0.333594 0.898069 0.624025 0.402464 0.435738 0.734775 0.959381 0.728464 0.71779 0.086351 0.422706 0.859491 0.149503 0.238066 0.721287 0.238972 0.503566 0.941326 0.294992 0.64249 0.627585 0.720856 0.819752 0.437691 0.18472 0.834385 0.3 [...]
+0.897076 0.535743 0.119016 0.116619 0.229496 0.479527 0.995242 0.174237 0.982806 0.787346 0.10945 0.634553 0.607458 0.11142 0.155426 0.792917 0.587338 0.799758 0.141284 0.770221 0.849185 0.888522 0.446817 0.745444 0.566506 0.231869 0.396645 0.062491 0.794197 0.863566 0.448878 0.672047 0.375934 0.955744 0.451426 0.905582 0.46271 0.756721 0.178302 0.323892 0.636158 0.260621 0.850859 0.058804 0.120696 0.614825 0.404391 0.599014 0.216104 0.396444 0.272807 0.78481 0.873741 0.160109 0.058042 0 [...]
+0.190048 0.509239 0.885129 0.336491 0.504589 0.163321 0.577612 0.596727 0.919021 0.46258 0.819225 0.232367 0.292746 0.692378 0.715407 0.137179 0.600998 0.22549 0.122745 0.279246 0.840605 0.574223 0.810282 0.923567 0.855515 0.167898 0.715717 0.658579 0.57911 0.857118 0.395085 0.862765 0.389327 0.14185 0.536842 0.393202 0.450869 0.983672 0.314739 0.410007 0.241014 0.099683 0.769767 0.933596 0.358487 0.274059 0.661208 0.263186 0.173442 0.948525 0.684025 0.132855 0.012216 0.795707 0.554882 0 [...]
+0.429256 0.295101 0.215269 0.088342 0.115437 0.030504 0.952694 0.501454 0.188336 0.447918 0.674111 0.297262 0.10046 0.736333 0.874845 0.936434 0.651553 0.592676 0.837618 0.411611 0.659222 0.59173 0.915208 0.790601 0.041023 0.24715 0.156923 0.804998 0.566352 0.760275 0.382796 0.629689 0.645933 0.352439 0.443253 0.046777 0.889492 0.440189 0.503338 0.587279 0.810605 0.492334 0.320773 0.804124 0.086828 0.960689 0.294583 0.222145 0.96617 0.828075 0.855553 0.753581 0.927028 0.59238 0.794039 0. [...]
+0.491877 0.973458 0.88195 0.59649 0.723995 0.331205 0.370808 0.464849 0.01792 0.952964 0.821295 0.194619 0.829108 0.503914 0.197578 0.909612 0.299459 0.563678 0.474055 0.964096 0.577706 0.164179 0.899504 0.902536 0.317561 0.052207 0.402282 0.303423 0.071083 0.586105 0.972675 0.92899 0.290675 0.136327 0.566268 0.720683 0.528162 0.441448 0.761481 0.90812 0.933839 0.641184 0.776599 0.289436 0.665375 0.299302 0.113691 0.276207 0.232024 0.316804 0.393391 0.475365 0.624373 0.007052 0.418658 0. [...]
+0.518938 0.614577 0.84128 0.488603 0.629218 0.848966 0.858634 0.779636 0.514573 0.061583 0.244236 0.385678 0.17691 0.05008 0.05363 0.253887 0.680009 0.342416 0.980672 0.39998 0.437542 0.355655 0.243982 0.096105 0.198376 0.227103 0.012441 0.966305 0.372555 0.503035 0.223685 0.017179 0.273522 0.652054 0.711694 0.520464 0.685649 0.876727 0.492849 0.088751 0.394879 0.277093 0.105552 0.232127 0.366065 0.450478 0.872112 0.596034 0.743804 0.357807 0.29837 0.105499 0.565666 0.143761 0.727753 0.7 [...]
+0.026151 0.757249 0.439002 0.438978 0.694338 0.976157 0.53867 0.48975 0.942639 0.61416 0.006106 0.201839 0.66234 0.192227 0.659442 0.364267 0.621498 0.628358 0.405132 0.22698 0.808684 0.300034 0.750004 0.02933 0.745576 0.49983 0.313467 0.364061 0.563189 0.895304 0.557467 0.423697 0.582847 0.772249 0.188968 0.426051 0.98758 0.534539 0.759511 0.510408 0.256359 0.075035 0.472788 0.701666 0.088889 0.155331 0.912768 0.11269 0.849334 0.703836 0.789951 0.985481 0.540285 0.465903 0.395523 0.0284 [...]
+0.132801 0.109602 0.6986 0.094727 0.028971 0.842977 0.799427 0.547457 0.958579 0.972085 0.996011 0.190591 0.033698 0.235851 0.744587 0.7544 0.532849 0.62847 0.193665 0.498408 0.225929 0.998032 0.226858 0.284597 0.809878 0.725581 0.788214 0.339003 0.92908 0.358868 0.219362 0.679649 0.818414 0.478676 0.957489 0.897243 0.296536 0.369384 0.997938 0.71738 0.709617 0.245521 0.335851 0.231395 0.368868 0.495699 0.882869 0.817213 0.037367 0.379995 0.347489 0.473742 0.949437 0.122042 0.609308 0.55 [...]
+0.624671 0.198698 0.814703 0.237854 0.55325 0.219334 0.64266 0.092307 0.576764 0.395736 0.584126 0.714582 0.788182 0.488839 0.241356 0.192948 0.373662 0.147848 0.062781 0.450563 0.296155 0.283269 0.839023 0.399213 0.975086 0.259544 0.652587 0.478174 0.230733 0.513256 0.799853 0.453496 0.61635 0.404515 0.812873 0.098122 0.827568 0.863598 0.50481 0.840676 0.667114 0.636477 0.384446 0.887386 0.485424 0.576216 0.42063 0.851595 0.409854 0.008889 0.356909 0.613753 0.878701 0.365424 0.419114 0. [...]
+0.160616 0.582602 0.504594 0.196397 0.388976 0.01203 0.841473 0.887189 0.484549 0.72676 0.500446 0.123494 0.350364 0.404781 0.543752 0.882498 0.814074 0.964324 0.560987 0.102724 0.340725 0.918478 0.733157 0.737138 0.327623 0.360678 0.057001 0.00552 0.191663 0.262388 0.088474 0.271625 0.138823 0.896803 0.660583 0.228882 0.344475 0.113997 0.557472 0.275879 0.306099 0.191586 0.974326 0.716761 0.875749 0.824557 0.627309 0.79267 0.491438 0.247698 0.443881 0.859675 0.554524 0.270155 0.646592 0 [...]
+0.830914 0.862091 0.938966 0.691413 0.380714 0.869621 0.286824 0.636642 0.196372 0.293891 0.788565 0.300804 0.096376 0.116651 0.562888 0.369023 0.555596 0.570206 0.76076 0.544424 0.148359 0.918963 0.348612 0.345833 0.627099 0.810092 0.963189 0.858171 0.042421 0.965916 0.813043 0.111605 0.555961 0.682273 0.724078 0.674408 0.897999 0.654124 0.396625 0.270837 0.025553 0.228897 0.327067 0.509764 0.81756 0.930057 0.85464 0.269053 0.342991 0.51842 0.614647 0.730534 0.974632 0.173271 0.751375 0 [...]
+0.975382 0.026891 0.761311 0.034268 0.669125 0.472596 0.539796 0.972007 0.217299 0.305077 0.45323 0.238769 0.153427 0.08417 0.593445 0.298115 0.604885 0.068394 0.948946 0.794091 0.115622 0.468733 0.693293 0.748815 0.952023 0.926951 0.86056 0.465219 0.962368 0.402543 0.334657 0.627344 0.131199 0.37967 0.427534 0.060749 0.039668 0.194332 0.338417 0.047206 0.876937 0.222555 0.661236 0.217554 0.543717 0.930287 0.993853 0.021844 0.350127 0.168992 0.990077 0.324812 0.018191 0.009941 0.03536 0. [...]
+0.71475 0.785043 0.979134 0.373985 0.659422 0.532379 0.01956 0.297978 0.813875 0.602798 0.338804 0.438555 0.785813 0.293359 0.851797 0.417939 0.285398 0.465291 0.925394 0.604053 0.436184 0.685628 0.143106 0.97839 0.941502 0.569808 0.118764 0.19666 0.182954 0.545593 0.027654 0.15442 0.587279 0.558612 0.020835 0.601221 0.455562 0.682649 0.274481 0.40024 0.154806 0.948234 0.726964 0.501159 0.367546 0.629774 0.001841 0.830846 0.319677 0.899258 0.885646 0.069931 0.283121 0.262073 0.829187 0.9 [...]
+0.748227 0.835085 0.983319 0.840431 0.561644 0.406856 0.626613 0.59811 0.989081 0.96995 0.401804 0.921295 0.438226 0.255897 0.963907 0.714838 0.645293 0.084966 0.529134 0.612911 0.48494 0.370654 0.158363 0.540394 0.12029 0.908348 0.448578 0.703939 0.124526 0.982228 0.356781 0.216663 0.655904 0.326692 0.546859 0.795178 0.683055 0.427856 0.990934 0.313188 0.404595 0.59175 0.052317 0.96229 0.611685 0.853108 0.595443 0.529534 0.292706 0.413511 0.102974 0.996847 0.420813 0.280508 0.029169 0.4 [...]
+0.338932 0.468551 0.492399 0.307819 0.026682 0.402053 0.583904 0.112297 0.701374 0.504796 0.445886 0.300925 0.827227 0.655796 0.141902 0.088881 0.140241 0.304448 0.057883 0.094017 0.968778 0.560413 0.523567 0.723092 0.999431 0.604815 0.178368 0.612281 0.804922 0.060043 0.12497 0.828954 0.675719 0.26974 0.460852 0.013076 0.929716 0.083778 0.836917 0.880815 0.115056 0.550701 0.237327 0.526304 0.85168 0.43009 0.310947 0.548755 0.880744 0.42339 0.145622 0.464306 0.772495 0.29696 0.56122 0.68 [...]
+0.554755 0.789827 0.887463 0.04741 0.044515 0.087547 0.97934 0.746711 0.937966 0.331738 0.742592 0.343819 0.335115 0.514766 0.859513 0.357834 0.161908 0.753164 0.850831 0.486617 0.568295 0.7454 0.795827 0.54195 0.181389 0.251741 0.884721 0.848719 0.656497 0.710015 0.525951 0.159062 0.936377 0.058094 0.38624 0.291811 0.568172 0.165523 0.869243 0.443338 0.518746 0.710361 0.253891 0.141307 0.900559 0.555941 0.883514 0.706593 0.825173 0.535185 0.595551 0.498403 0.759114 0.822795 0.691352 0.4 [...]
+0.523603 0.444942 0.549218 0.770547 0.558446 0.143393 0.133313 0.244999 0.2242 0.984175 0.380653 0.623411 0.353886 0.921153 0.591314 0.050191 0.01064 0.452036 0.387055 0.09583 0.897794 0.791981 0.835344 0.643042 0.874935 0.69758 0.907336 0.973057 0.438494 0.616345 0.348928 0.13314 0.087756 0.57878 0.155103 0.461418 0.109066 0.381124 0.925256 0.158879 0.187899 0.066791 0.9013 0.792316 0.415024 0.706232 0.199171 0.900478 0.185689 0.549284 0.027618 0.232193 0.30802 0.163286 0.362668 0.57632 [...]
+0.311412 0.832744 0.581309 0.390426 0.96597 0.881112 0.654956 0.09335 0.012874 0.141231 0.350244 0.698515 0.476428 0.525371 0.63568 0.737925 0.209158 0.034374 0.14898 0.359333 0.244857 0.340747 0.455858 0.634093 0.237994 0.026682 0.120579 0.349764 0.164597 0.693985 0.039117 0.636477 0.327008 0.200731 0.515488 0.042561 0.183213 0.43625 0.643841 0.433572 0.522293 0.216029 0.329787 0.070512 0.230463 0.55617 0.343324 0.184239 0.814239 0.777669 0.383356 0.852915 0.268646 0.667041 0.388664 0.6 [...]
+0.645545 0.680528 0.149922 0.138153 0.293536 0.687558 0.324845 0.386625 0.979603 0.725615 0.249622 0.1794 0.368071 0.694512 0.939015 0.221397 0.168341 0.741326 0.810772 0.803625 0.43075 0.261512 0.291052 0.109674 0.206278 0.16109 0.478284 0.385213 0.390358 0.509462 0.618346 0.018056 0.920382 0.654098 0.704575 0.488766 0.373394 0.71912 0.504862 0.863989 0.301181 0.385908 0.568358 0.92937 0.44152 0.510371 0.278746 0.039632 0.538283 0.065726 0.52452 0.750395 0.310887 0.943101 0.092051 0.627 [...]
+0.130118 0.432519 0.465971 0.88802 0.127628 0.382146 0.369899 0.13696 0.390469 0.914879 0.135248 0.461925 0.117639 0.185469 0.093662 0.813481 0.415166 0.223784 0.467042 0.930186 0.330905 0.311952 0.259423 0.398119 0.107227 0.836596 0.744703 0.126101 0.036302 0.119875 0.170331 0.815863 0.966046 0.553788 0.624842 0.280573 0.736146 0.884763 0.21643 0.655152 0.426359 0.706265 0.669878 0.647614 0.275478 0.863372 0.542732 0.636201 0.539063 0.860798 0.031382 0.389631 0.181042 0.193547 0.807589  [...]
+0.309932 0.393058 0.627555 0.199727 0.885089 0.229584 0.17659 0.590446 0.422946 0.527152 0.32847 0.754334 0.139377 0.594918 0.914586 0.593325 0.229594 0.634282 0.565217 0.947646 0.241174 0.18973 0.435634 0.286467 0.675112 0.662449 0.526638 0.215219 0.701517 0.250495 0.81896 0.802732 0.913486 0.51752 0.999106 0.742229 0.263088 0.978148 0.189566 0.072999 0.872193 0.551281 0.45087 0.793387 0.064496 0.096921 0.99826 0.162832 0.109324 0.781333 0.039896 0.569722 0.355335 0.859678 0.255252 0.78 [...]
+0.5558 0.745856 0.665416 0.230804 0.417304 0.638911 0.317412 0.769014 0.78709 0.818466 0.491565 0.451186 0.533712 0.592616 0.568544 0.367378 0.431762 0.956498 0.716505 0.118849 0.632314 0.577373 0.693273 0.300799 0.191509 0.592859 0.876365 0.696438 0.547071 0.741042 0.380128 0.946454 0.398378 0.336199 0.306762 0.357023 0.715492 0.138114 0.661376 0.43271 0.827783 0.926412 0.226954 0.533714 0.341337 0.574145 0.761954 0.718239 0.02597 0.274706 0.228323 0.180255 0.800846 0.930389 0.156821 0. [...]
+0.717618 0.508966 0.957372 0.652753 0.820406 0.825431 0.490622 0.602565 0.341522 0.932794 0.86674 0.543281 0.95001 0.967849 0.330156 0.607305 0.352259 0.874139 0.785611 0.273047 0.62257 0.279096 0.030304 0.954035 0.352888 0.7941 0.47131 0.104574 0.26753 0.777229 0.976214 0.325807 0.042935 0.297794 0.353892 0.201598 0.034818 0.463242 0.017466 0.928547 0.827076 0.261017 0.95178 0.827035 0.9798 0.45398 0.073626 0.982976 0.929209 0.886643 0.452037 0.301102 0.751112 0.250522 0.305935 0.635197 [...]
+0.433099 0.306836 0.545727 0.27719 0.117414 0.586567 0.665011 0.387264 0.91875 0.042437 0.87762 0.851662 0.707236 0.424428 0.149276 0.024158 0.680677 0.904259 0.249269 0.14213 0.292825 0.721707 0.711092 0.242801 0.073459 0.551828 0.765958 0.963462 0.546097 0.532469 0.198206 0.380389 0.610729 0.939558 0.648161 0.837376 0.779588 0.225483 0.448468 0.471079 0.212958 0.9445 0.772831 0.676211 0.094407 0.847238 0.452156 0.778882 0.580099 0.185224 0.489783 0.724038 0.744438 0.062853 0.793466 0.9 [...]
+0.687567 0.117875 0.192072 0.855181 0.164316 0.200971 0.94819 0.053833 0.101033 0.575279 0.006359 0.483748 0.616992 0.893331 0.824071 0.716484 0.670109 0.718414 0.701725 0.183196 0.827003 0.398686 0.617218 0.693431 0.647867 0.458991 0.168738 0.153926 0.109397 0.544471 0.128712 0.972715 0.450957 0.259284 0.878235 0.51684 0.935558 0.046354 0.390587 0.825083 0.717238 0.947823 0.481367 0.991973 0.039763 0.280503 0.156874 0.792557 0.183672 0.801073 0.356331 0.670947 0.945 0.511528 0.557635 0. [...]
+0.664886 0.400437 0.261177 0.081235 0.648836 0.981797 0.744611 0.454173 0.596011 0.411363 0.421224 0.248134 0.423704 0.346817 0.018138 0.727844 0.485875 0.409388 0.715192 0.266315 0.756816 0.050632 0.602271 0.898235 0.881804 0.257808 0.633231 0.229995 0.628764 0.473103 0.856485 0.311065 0.921558 0.29959 0.991863 0.802751 0.034258 0.066154 0.587971 0.206586 0.134848 0.56126 0.875318 0.995584 0.34908 0.865146 0.821131 0.776186 0.33983 0.6825 0.329865 0.24058 0.201668 0.969906 0.509569 0.48 [...]
+0.087204 0.429647 0.273415 0.861032 0.543132 0.542547 0.920311 0.507955 0.499701 0.510627 0.750044 0.747506 0.011717 0.88701 0.921663 0.923827 0.342058 0.455898 0.349152 0.209881 0.218205 0.309804 0.973134 0.426251 0.952054 0.095933 0.976676 0.027356 0.301171 0.179956 0.492336 0.52258 0.295277 0.454294 0.567679 0.674417 0.81723 0.486644 0.308503 0.453448 0.766125 0.954512 0.41594 0.663015 0.232992 0.783273 0.285943 0.280703 0.167655 0.348917 0.159535 0.343411 0.115538 0.9053 0.551991 0.6 [...]
+0.098909 0.695387 0.92946 0.562265 0.950688 0.868389 0.167725 0.203229 0.216037 0.29188 0.254458 0.635525 0.485717 0.364057 0.06721 0.248296 0.198176 0.535084 0.500532 0.593921 0.857055 0.116102 0.492611 0.111958 0.945708 0.775519 0.783634 0.329675 0.31576 0.492431 0.703078 0.505541 0.180323 0.793042 0.261779 0.222094 0.21062 0.011916 0.031038 0.223981 0.868595 0.648674 0.497345 0.122204 0.04064 0.542299 0.442271 0.994572 0.562009 0.145341 0.005532 0.421318 0.161612 0.471562 0.034367 0.7 [...]
+0.54884 0.192055 0.501407 0.115181 0.467964 0.950224 0.619405 0.063257 0.183821 0.113648 0.908332 0.756641 0.258525 0.48586 0.77766 0.967215 0.546694 0.453862 0.771206 0.856816 0.244155 0.602144 0.19483 0.505952 0.177667 0.39447 0.013882 0.289054 0.450447 0.645242 0.37271 0.676249 0.055238 0.613546 0.002416 0.23947 0.641972 0.47707 0.558757 0.805322 0.126068 0.53222 0.942684 0.339848 0.778759 0.566907 0.438303 0.083037 0.858593 0.387721 0.578161 0.36001 0.491799 0.535447 0.865009 0.05404 [...]
+0.16448 0.360681 0.969851 0.550837 0.557967 0.119837 0.017177 0.289016 0.214047 0.433889 0.706504 0.114396 0.523839 0.024228 0.717179 0.876033 0.637474 0.69016 0.176647 0.479554 0.678064 0.010226 0.610261 0.678809 0.630063 0.850517 0.223964 0.194672 0.215753 0.467915 0.592404 0.247252 0.240206 0.81144 0.542819 0.937626 0.390022 0.823252 0.979462 0.302192 0.599764 0.20659 0.999566 0.167883 0.403338 0.333525 0.578392 0.224158 0.72892 0.898947 0.836762 0.983252 0.368695 0.8017 0.3562 0.4202 [...]
+0.770639 0.265686 0.525601 0.325242 0.88486 0.876196 0.053423 0.243946 0.288665 0.530462 0.242519 0.225331 0.558836 0.13998 0.327305 0.294826 0.090141 0.819248 0.964626 0.471461 0.67809 0.842018 0.946251 0.74265 0.069962 0.459244 0.348644 0.101456 0.199217 0.984875 0.259573 0.363462 0.946519 0.223379 0.547755 0.27985 0.727992 0.036156 0.567145 0.931081 0.844492 0.101655 0.739163 0.361254 0.963587 0.206034 0.791608 0.519289 0.794175 0.378089 0.907842 0.438569 0.919498 0.032411 0.576682 0. [...]
+0.619854 0.575764 0.932847 0.043551 0.473771 0.013913 0.169274 0.390752 0.219756 0.841129 0.071933 0.311852 0.876894 0.833784 0.338528 0.969544 0.435978 0.576519 0.523183 0.92803 0.527831 0.345887 0.645746 0.901687 0.451355 0.230006 0.833442 0.491947 0.825858 0.844027 0.648516 0.386349 0.900142 0.047684 0.627459 0.758867 0.720689 0.771941 0.467957 0.459742 0.61922 0.823141 0.087023 0.025175 0.486527 0.924857 0.463911 0.374158 0.294856 0.280575 0.582012 0.847116 0.508961 0.937339 0.047173 [...]
+0.07381 0.669722 0.240288 0.455454 0.90717 0.624174 0.633795 0.306547 0.146966 0.524041 0.464001 0.986404 0.507562 0.182988 0.897836 0.725725 0.615683 0.619757 0.135579 0.842886 0.326041 0.246317 0.103976 0.413048 0.045998 0.847746 0.749635 0.938995 0.820904 0.591734 0.576008 0.957349 0.96567 0.048537 0.561969 0.454583 0.277998 0.5864 0.814605 0.180335 0.083318 0.003133 0.999569 0.470509 0.18547 0.040348 0.563447 0.770917 0.469655 0.977612 0.894789 0.573898 0.958642 0.238649 0.234055 0.4 [...]
+0.310878 0.051189 0.253924 0.6933 0.371084 0.061322 0.01895 0.79507 0.530116 0.575666 0.097475 0.73491 0.827962 0.124362 0.462183 0.058325 0.865587 0.012256 0.602352 0.471853 0.198401 0.84268 0.387992 0.248046 0.005479 0.663928 0.927643 0.913865 0.315058 0.542275 0.233982 0.810514 0.812526 0.654253 0.257846 0.105622 0.553829 0.671092 0.637863 0.500649 0.562085 0.007058 0.612082 0.171055 0.443583 0.556651 0.972225 0.020223 0.231845 0.670164 0.993084 0.258371 0.777803 0.616936 0.503326 0.1 [...]
+0.414273 0.049787 0.041473 0.612459 0.126497 0.598256 0.999142 0.638382 0.923774 0.520411 0.003129 0.152182 0.296946 0.165052 0.649357 0.955389 0.233927 0.550858 0.746717 0.490627 0.905728 0.006014 0.954738 0.411133 0.605851 0.673127 0.980231 0.614393 0.957916 0.82614 0.391681 0.10201 0.375833 0.533384 0.447919 0.172922 0.85867 0.99775 0.905695 0.947452 0.394213 0.196845 0.531692 0.6939 0.693983 0.494927 0.419283 0.654371 0.371534 0.640684 0.300214 0.04257 0.996647 0.84926 0.483087 0.364 [...]
+0.528874 0.481965 0.994599 0.467917 0.616286 0.49941 0.611518 0.359975 0.535403 0.864649 0.625091 0.362522 0.174472 0.464094 0.226147 0.245152 0.84192 0.668438 0.45396 0.013353 0.175435 0.117164 0.785047 0.511818 0.845563 0.48884 0.617662 0.312423 0.694219 0.330062 0.366131 0.44495 0.869972 0.235764 0.46114 0.89052 0.721838 0.51475 0.754413 0.851705 0.930633 0.050929 0.515916 0.240219 0.287599 0.943095 0.954299 0.455241 0.635787 0.535788 0.444429 0.101957 0.709167 0.426154 0.36134 0.0059 [...]
+0.666975 0.723218 0.839834 0.297288 0.311287 0.392607 0.320069 0.681973 0.454424 0.074448 0.519379 0.551267 0.000202 0.929196 0.085005 0.908438 0.332213 0.178585 0.925645 0.603923 0.251677 0.905507 0.252197 0.856893 0.531076 0.916814 0.289978 0.946669 0.334872 0.815894 0.429641 0.889931 0.948286 0.042904 0.967894 0.59434 0.6875 0.527887 0.057241 0.463619 0.890389 0.194124 0.756708 0.7932 0.707679 0.07013 0.926776 0.516358 0.20571 0.024239 0.061009 0.326047 0.841862 0.912282 0.061736 0.49 [...]
+0.017948 0.357316 0.400063 0.093614 0.679533 0.505641 0.8223 0.128211 0.092148 0.035661 0.757638 0.604141 0.759024 0.896096 0.627134 0.293612 0.987313 0.614562 0.381384 0.125285 0.14123 0.370291 0.002291 0.750412 0.143168 0.723731 0.532993 0.393092 0.988123 0.305187 0.445097 0.245798 0.37918 0.810361 0.348737 0.148508 0.006857 0.956035 0.064094 0.756869 0.068784 0.439436 0.378506 0.210949 0.455907 0.958318 0.113035 0.579069 0.350573 0.926419 0.448531 0.353945 0.410957 0.40438 0.858742 0. [...]
+0.422546 0.651171 0.179198 0.452078 0.697134 0.289203 0.605426 0.402877 0.657246 0.319407 0.599237 0.609291 0.251945 0.622694 0.152059 0.755346 0.44678 0.815153 0.626783 0.623976 0.944734 0.74495 0.212002 0.37409 0.437674 0.132085 0.167887 0.772969 0.906429 0.140921 0.8275 0.282775 0.949369 0.697025 0.530244 0.033385 0.126885 0.656696 0.317233 0.403925 0.900758 0.885632 0.872553 0.343183 0.843292 0.546002 0.516309 0.362935 0.633455 0.089388 0.969203 0.627386 0.932358 0.781083 0.746676 0. [...]
+0.893533 0.876029 0.641516 0.566998 0.231588 0.730093 0.427691 0.588164 0.355948 0.419466 0.806603 0.28569 0.081667 0.808026 0.467657 0.484207 0.058643 0.390124 0.665475 0.138705 0.359397 0.166052 0.798485 0.239474 0.812019 0.676287 0.809891 0.049734 0.513946 0.591623 0.225586 0.054301 0.596677 0.472555 0.612425 0.651826 0.076146 0.67871 0.046206 0.46299 0.588261 0.314628 0.518462 0.301521 0.17028 0.730112 0.575571 0.959138 0.755024 0.088872 0.418958 0.345328 0.511934 0.631233 0.919942 0 [...]
+0.816044 0.017908 0.576036 0.235644 0.732754 0.168395 0.41077 0.646205 0.109117 0.099762 0.195401 0.40052 0.975185 0.82284 0.676433 0.706184 0.45386 0.254825 0.8524 0.164628 0.884041 0.860701 0.257059 0.822678 0.355164 0.193545 0.769193 0.750582 0.602166 0.39475 0.669241 0.63708 0.497464 0.261089 0.482693 0.734529 0.219025 0.80998 0.117518 0.558408 0.160078 0.280986 0.265926 0.103339 0.469298 0.378223 0.689108 0.379227 0.88559 0.207214 0.072091 0.257638 0.765023 0.242972 0.102359 0.33315 [...]
+0.597593 0.04709 0.936193 0.110684 0.241018 0.111753 0.099534 0.597656 0.296422 0.615271 0.039374 0.660372 0.641352 0.635904 0.267044 0.617472 0.769563 0.861586 0.724921 0.209525 0.286849 0.571655 0.339335 0.026828 0.872734 0.246081 0.135898 0.719791 0.888573 0.849921 0.79778 0.920152 0.937423 0.412267 0.627577 0.874017 0.095011 0.467754 0.447118 0.113114 0.664245 0.455476 0.858027 0.544477 0.1664 0.356797 0.809493 0.53601 0.453297 0.747235 0.674162 0.895094 0.545011 0.072304 0.652562 0. [...]
+0.272362 0.494952 0.819122 0.401375 0.660843 0.581297 0.569257 0.961892 0.353959 0.576675 0.651059 0.248067 0.055114 0.364037 0.497203 0.00657 0.613406 0.173506 0.825523 0.421925 0.916881 0.066743 0.322694 0.671912 0.629098 0.175276 0.878636 0.600313 0.386267 0.336344 0.378512 0.894364 0.181493 0.948867 0.37517 0.554278 0.340395 0.855612 0.63627 0.89589 0.670824 0.108827 0.833194 0.312354 0.041758 0.901511 0.493945 0.312206 0.387026 0.321387 0.9798 0.296143 0.221798 0.345301 0.287756 0.7 [...]
+0.558093 0.968672 0.830347 0.252232 0.171217 0.278205 0.857579 0.774944 0.065805 0.067892 0.895457 0.787306 0.840526 0.21182 0.660239 0.291306 0.604523 0.054831 0.013827 0.289429 0.769448 0.029546 0.221936 0.190069 0.244906 0.576709 0.322504 0.856625 0.756492 0.331601 0.493153 0.406518 0.11872 0.175077 0.703925 0.509129 0.800482 0.675267 0.900956 0.672838 0.080418 0.598649 0.599237 0.678851 0.334661 0.697453 0.041396 0.96646 0.885759 0.21392 0.780351 0.745134 0.786036 0.862536 0.422861 0 [...]
+0.707876 0.18053 0.142217 0.055226 0.797801 0.291576 0.139252 0.021562 0.745268 0.848219 0.114797 0.769504 0.914593 0.230954 0.186514 0.565415 0.159332 0.770002 0.764808 0.468202 0.504624 0.594257 0.220729 0.112547 0.273531 0.619525 0.414879 0.120428 0.252427 0.510177 0.059478 0.337498 0.664744 0.546715 0.524915 0.725745 0.60556 0.861764 0.626459 0.009896 0.773745 0.121152 0.643729 0.835026 0.065004 0.697778 0.383433 0.261588 0.16996 0.475758 0.730161 0.467335 0.893099 0.631784 0.533835  [...]
+0.532827 0.31974 0.601884 0.627363 0.192525 0.747794 0.24927 0.734434 0.35297 0.616434 0.329153 0.992394 0.993101 0.017033 0.543082 0.022546 0.30042 0.098244 0.69078 0.018537 0.37591 0.300153 0.286622 0.421315 0.540671 0.894986 0.572516 0.169369 0.204429 0.433163 0.497048 0.94714 0.423267 0.78655 0.76804 0.010279 0.798381 0.891502 0.054534 0.258375 0.744536 0.430897 0.672627 0.377755 0.894435 0.556919 0.448086 0.158536 0.351751 0.648604 0.422499 0.498889 0.092974 0.390233 0.7472 0.176794 [...]
+0.068767 0.778487 0.399776 0.796073 0.856262 0.194409 0.052948 0.322331 0.252225 0.809676 0.649126 0.819411 0.074883 0.486617 0.599142 0.119949 0.354881 0.407015 0.786501 0.394516 0.610336 0.98521 0.742854 0.921355 0.417503 0.234195 0.550291 0.894907 0.817524 0.502915 0.463 0.147289 0.848536 0.679663 0.806992 0.353693 0.187184 0.876113 0.193129 0.02037 0.178718 0.788553 0.868365 0.644749 0.281933 0.74397 0.068157 0.552667 0.902485 0.308538 0.692287 0.26462 0.826757 0.335901 0.575108 0.17 [...]
+0.126176 0.735143 0.357719 0.307484 0.260921 0.024959 0.159432 0.092936 0.876584 0.044337 0.634408 0.874818 0.771579 0.010362 0.092437 0.497839 0.856911 0.173696 0.512762 0.027653 0.292669 0.732113 0.487679 0.727018 0.485371 0.07756 0.041393 0.585708 0.040647 0.105792 0.01329 0.497577 0.002263 0.682357 0.398147 0.414546 0.616635 0.005481 0.567409 0.437193 0.609188 0.561908 0.242368 0.957158 0.443441 0.178592 0.37232 0.535283 0.042128 0.660079 0.248846 0.552116 0.823718 0.460674 0.234928  [...]
+0.684931 0.18135 0.031017 0.774813 0.094894 0.822345 0.118058 0.581338 0.315122 0.597162 0.217371 0.496804 0.031041 0.387551 0.153705 0.345634 0.680779 0.246984 0.946623 0.620597 0.716721 0.204459 0.415373 0.063097 0.063045 0.194837 0.148903 0.043205 0.367892 0.7862 0.467107 0.117369 0.56072 0.010802 0.38764 0.063803 0.136914 0.469554 0.67362 0.088605 0.252141 0.508256 0.304008 0.465185 0.500524 0.044439 0.649128 0.474171 0.030018 0.328843 0.631998 0.96111 0.55197 0.683428 0.232795 0.803 [...]
+0.515902 0.841095 0.998201 0.018055 0.600794 0.614338 0.817499 0.023763 0.371316 0.739009 0.212806 0.20568 0.277439 0.045886 0.821381 0.946598 0.396331 0.361019 0.675364 0.564973 0.388621 0.630004 0.30062 0.960941 0.777801 0.944493 0.635867 0.905425 0.87671 0.021533 0.785019 0.184691 0.021432 0.748578 0.47348 0.012707 0.897519 0.897489 0.408069 0.141092 0.88791 0.156923 0.205819 0.986907 0.683981 0.493373 0.134349 0.96477 0.616301 0.703441 0.154252 0.885541 0.200971 0.284042 0.701119 0.6 [...]
+0.615978 0.18555 0.316482 0.72162 0.355131 0.674202 0.236572 0.670787 0.70062 0.240179 0.735072 0.288134 0.689762 0.903895 0.795214 0.386193 0.23611 0.763752 0.827824 0.658041 0.984204 0.684223 0.224732 0.62815 0.589152 0.209831 0.879819 0.594401 0.727103 0.856717 0.14044 0.023592 0.222554 0.411094 0.552184 0.485209 0.432649 0.072044 0.649959 0.240837 0.155394 0.892206 0.443235 0.516946 0.315735 0.794744 0.797612 0.980762 0.06852 0.665595 0.143775 0.395725 0.188715 0.388576 0.079234 0.97 [...]
+0.24859 0.704156 0.84526 0.260529 0.708761 0.70136 0.38422 0.871435 0.861264 0.281861 0.210965 0.854904 0.804699 0.731072 0.96418 0.623073 0.235633 0.927912 0.7985 0.184972 0.926635 0.99032 0.643815 0.981486 0.568446 0.487713 0.8215 0.180971 0.445618 0.192809 0.738103 0.916139 0.939565 0.56924 0.683855 0.726844 0.498288 0.603588 0.518845 0.168863 0.761209 0.420792 0.47316 0.052009 0.097012 0.422498 0.093459 0.274119 0.715517 0.609322 0.193503 0.199457 0.266409 0.856002 0.085215 0.48466 0 [...]
+0.341257 0.22106 0.31148 0.4888 0.75069 0.762444 0.942583 0.068948 0.877962 0.06652 0.912259 0.170478 0.495358 0.077987 0.274313 0.093845 0.941135 0.39666 0.569744 0.679405 0.695379 0.470249 0.185231 0.869751 0.458508 0.846482 0.218271 0.010509 0.14523 0.561293 0.644684 0.203382 0.463261 0.447353 0.497923 0.110245 0.566956 0.667487 0.988025 0.109425 0.039328 0.1578 0.098036 0.677814 0.30595 0.83579 0.724069 0.728335 0.169513 0.434894 0.70932 0.428071 0.400892 0.588442 0.643368 0.001899 0 [...]
+0.605979 0.406479 0.502935 0.246412 0.15594 0.382827 0.890574 0.926433 0.972209 0.660758 0.615117 0.549773 0.041804 0.537014 0.811094 0.581525 0.036743 0.077558 0.866294 0.16243 0.116834 0.581815 0.526516 0.337585 0.258802 0.143003 0.067439 0.477435 0.454072 0.569003 0.737893 0.644476 0.352346 0.739064 0.366611 0.277723 0.963873 0.417417 0.311849 0.916088 0.672009 0.256191 0.941503 0.769357 0.358295 0.225288 0.401089 0.500787 0.826191 0.624234 0.222373 0.583106 0.919949 0.719174 0.2928 0 [...]
+0.625176 0.091631 0.13473 0.898854 0.752088 0.441958 0.328551 0.971196 0.132654 0.100043 0.222711 0.981477 0.856341 0.404797 0.88136 0.921477 0.06043 0.472737 0.996831 0.613585 0.881327 0.095256 0.602296 0.706828 0.495645 0.777095 0.964754 0.588154 0.083401 0.86631 0.515249 0.924849 0.827711 0.922095 0.89117 0.441567 0.943056 0.825865 0.251988 0.423888 0.67677 0.09961 0.932237 0.347481 0.956186 0.287314 0.133847 0.449347 0.03667 0.724005 0.427058 0.773481 0.996473 0.218597 0.292052 0.294 [...]
+0.759661 0.697107 0.032155 0.205041 0.601567 0.308774 0.870204 0.749003 0.48789 0.93718 0.005179 0.948315 0.814186 0.962749 0.365328 0.461572 0.32211 0.545783 0.371183 0.161696 0.841066 0.349103 0.025016 0.422828 0.903726 0.391359 0.730592 0.734239 0.455041 0.014411 0.015206 0.360016 0.778698 0.098048 0.711867 0.088689 0.863225 0.028037 0.021793 0.011743 0.210773 0.80943 0.927015 0.722149 0.323049 0.996219 0.58982 0.172363 0.848018 0.602074 0.098603 0.035698 0.918469 0.772512 0.405289 0. [...]
+0.475215 0.928079 0.849401 0.312839 0.600694 0.312247 0.028811 0.679719 0.622497 0.382256 0.977229 0.124206 0.065555 0.610188 0.182297 0.204126 0.605451 0.521391 0.568155 0.234429 0.031878 0.701026 0.0573 0.935136 0.174467 0.272618 0.880701 0.7701 0.950882 0.950919 0.156463 0.743173 0.230678 0.446647 0.360334 0.577554 0.55313 0.247777 0.217207 0.139738 0.544842 0.206367 0.3558 0.607545 0.007542 0.952646 0.837677 0.74821 0.859648 0.178505 0.207111 0.702267 0.781898 0.097129 0.548919 0.993 [...]
+0.340111 0.840992 0.289146 0.622819 0.587078 0.204696 0.644889 0.222375 0.280705 0.83016 0.881494 0.015081 0.940132 0.019635 0.038635 0.941829 0.913345 0.614888 0.719773 0.239751 0.098876 0.042726 0.189713 0.438931 0.402539 0.69967 0.25717 0.02897 0.544434 0.288645 0.975733 0.175626 0.398475 0.755535 0.099652 0.024655 0.899378 0.979229 0.616708 0.985264 0.106358 0.332516 0.19589 0.210931 0.517753 0.794417 0.903381 0.059315 0.652242 0.646143 0.471072 0.834832 0.339445 0.217835 0.70233 0.2 [...]
+0.843478 0.033248 0.927481 0.900468 0.115549 0.282976 0.282278 0.060962 0.956695 0.266796 0.445001 0.098663 0.405871 0.546617 0.480053 0.864674 0.410603 0.206638 0.434476 0.320458 0.195289 0.314431 0.631905 0.531668 0.618518 0.501992 0.70257 0.256207 0.545324 0.48789 0.571956 0.113877 0.939882 0.36162 0.71907 0.575809 0.045095 0.931686 0.202203 0.374748 0.144324 0.356179 0.546203 0.714914 0.124173 0.508057 0.913774 0.866156 0.116981 0.478176 0.802246 0.766073 0.172038 0.565395 0.240641 0 [...]
+0.583394 0.394975 0.170254 0.154461 0.227821 0.103744 0.591448 0.747844 0.899588 0.291726 0.41225 0.808959 0.244502 0.344852 0.412285 0.378866 0.569766 0.581529 0.96632 0.283927 0.103814 0.398888 0.513043 0.45711 0.47263 0.283282 0.701844 0.630168 0.586957 0.345447 0.483205 0.115479 0.506974 0.184778 0.751232 0.472112 0.65085 0.325806 0.161517 0.819735 0.292539 0.030777 0.994044 0.248929 0.092792 0.086808 0.14448 0.506596 0.962912 0.700548 0.963844 0.858725 0.889565 0.23456 0.967936 0.38 [...]
+0.999574 0.389426 0.168575 0.619584 0.86675 0.913442 0.78371 0.584714 0.817347 0.525544 0.703192 0.730254 0.944882 0.057913 0.678239 0.381273 0.677998 0.523616 0.510196 0.979115 0.238687 0.583676 0.53925 0.613163 0.094335 0.074699 0.846379 0.71414 0.347531 0.48237 0.016188 0.71718 0.761679 0.193533 0.971993 0.708796 0.350642 0.195928 0.60047 0.554576 0.486061 0.170827 0.022333 0.489262 0.20874 0.396635 0.567108 0.910312 0.18906 0.1987 0.391526 0.528484 0.818406 0.692708 0.087189 0.070213 [...]
+0.92686 0.233745 0.787835 0.815072 0.750105 0.010151 0.378843 0.882563 0.63771 0.838919 0.627543 0.607267 0.266287 0.509702 0.921255 0.181974 0.95683 0.814783 0.747693 0.310428 0.108579 0.143507 0.922992 0.924365 0.099517 0.584882 0.856051 0.140978 0.927236 0.449765 0.860222 0.438106 0.559692 0.655639 0.087993 0.250957 0.378512 0.698663 0.710391 0.022131 0.810581 0.554918 0.433584 0.481035 0.659282 0.017658 0.063217 0.678482 0.500926 0.707701 0.891584 0.961559 0.187164 0.169248 0.984013  [...]
+0.071705 0.875735 0.935604 0.102862 0.226516 0.09565 0.668724 0.545525 0.575987 0.338066 0.052977 0.701103 0.779605 0.962011 0.9204 0.357527 0.30944 0.400587 0.359708 0.298355 0.123223 0.278276 0.495585 0.953346 0.238439 0.358538 0.036261 0.84279 0.054528 0.079622 0.950724 0.202726 0.84856 0.414241 0.184313 0.201187 0.273284 0.122765 0.387194 0.69935 0.836218 0.040746 0.666856 0.086818 0.69088 0.313796 0.890255 0.416196 0.167853 0.74908 0.306493 0.83289 0.434717 0.183665 0.33507 0.913561 [...]
+0.519559 0.34383 0.970858 0.84 0.928488 0.883943 0.204979 0.699708 0.793766 0.156074 0.526886 0.489934 0.790112 0.408258 0.575227 0.806446 0.749604 0.774249 0.164638 0.417823 0.36149 0.160655 0.025823 0.502517 0.226715 0.006615 0.461716 0.850383 0.515126 0.226918 0.463295 0.388842 0.422902 0.059627 0.066487 0.551636 0.431954 0.339419 0.099877 0.096821 0.204236 0.447354 0.864256 0.601576 0.821901 0.497924 0.53371 0.251191 0.931904 0.447846 0.594824 0.375749 0.526176 0.205618 0.275287 0.77 [...]
+0.315591 0.234088 0.518668 0.458887 0.521328 0.560385 0.658312 0.470221 0.68603 0.559485 0.491708 0.455383 0.605812 0.80001 0.706404 0.790325 0.802768 0.769743 0.336584 0.921238 0.701739 0.845583 0.286787 0.890368 0.573694 0.99191 0.882743 0.662735 0.696292 0.602904 0.715381 0.040387 0.718054 0.292919 0.240788 0.433826 0.29419 0.367777 0.639225 0.786313 0.066984 0.261768 0.572731 0.431218 0.954218 0.169373 0.348975 0.753942 0.873664 0.918474 0.95335 0.219118 0.420559 0.345893 0.507776 0. [...]
+0.560979 0.532876 0.334033 0.295291 0.312699 0.749007 0.956658 0.627574 0.582895 0.564204 0.44182 0.431159 0.581312 0.971876 0.306805 0.762677 0.884312 0.388572 0.963566 0.243402 0.892945 0.360803 0.405752 0.100585 0.169636 0.65098 0.798743 0.97392 0.338152 0.974475 0.82171 0.189892 0.000479 0.3779 0.196724 0.809486 0.221467 0.847745 0.073501 0.843337 0.093661 0.801354 0.874144 0.539003 0.043982 0.289048 0.683729 0.856157 0.390866 0.535129 0.130562 0.256644 0.073372 0.118058 0.066179 0.6 [...]
+0.63049 0.706913 0.086349 0.732893 0.622411 0.896568 0.075028 0.13549 0.458463 0.359499 0.463451 0.780348 0.373052 0.020389 0.773612 0.1676 0.581837 0.163615 0.814586 0.694866 0.469458 0.682462 0.072292 0.654585 0.043913 0.92621 0.931852 0.011295 0.64432 0.042848 0.089552 0.24113 0.738148 0.880836 0.579728 0.502221 0.567692 0.107572 0.723557 0.052054 0.457614 0.7104 0.01528 0.118292 0.105634 0.230002 0.862622 0.573456 0.88825 0.52988 0.230194 0.884192 0.368318 0.719738 0.617162 0.062582  [...]
+0.825153 0.228624 0.436658 0.032469 0.833063 0.011233 0.45319 0.004451 0.35193 0.759315 0.021707 0.731603 0.371066 0.790948 0.283395 0.78083 0.118249 0.128835 0.36248 0.398326 0.627246 0.533049 0.394145 0.930135 0.724853 0.341731 0.677522 0.022392 0.742858 0.950635 0.689454 0.898212 0.542318 0.619013 0.39417 0.735495 0.901396 0.613587 0.026253 0.975975 0.113384 0.295645 0.962294 0.375851 0.379724 0.977173 0.201262 0.638939 0.049739 0.406741 0.560812 0.740216 0.94051 0.308855 0.648759 0.2 [...]
+0.845118 0.115634 0.476927 0.821682 0.351027 0.351126 0.83297 0.414371 0.273434 0.334183 0.135158 0.332747 0.756441 0.296001 0.653138 0.551728 0.449983 0.647284 0.490743 0.657022 0.461546 0.242191 0.780011 0.163203 0.956827 0.92109 0.489204 0.328736 0.393164 0.109561 0.886137 0.516498 0.45689 0.702504 0.168163 0.911109 0.674742 0.608331 0.205672 0.719768 0.483721 0.890144 0.37976 0.057078 0.504418 0.57418 0.384259 0.477086 0.660515 0.444376 0.103768 0.929592 0.329597 0.37357 0.52927 0.26 [...]
+0.504824 0.250709 0.335564 0.367851 0.154821 0.319502 0.034482 0.86378 0.714758 0.665855 0.017716 0.439298 0.297438 0.567459 0.833753 0.489885 0.654726 0.901969 0.890597 0.202918 0.558801 0.578265 0.666358 0.790052 0.937144 0.017506 0.913346 0.489758 0.545724 0.457761 0.239755 0.040382 0.024917 0.260963 0.277385 0.692971 0.22105 0.000755 0.030318 0.278122 0.930092 0.357702 0.510885 0.056386 0.841425 0.627031 0.613036 0.031446 0.653051 0.017888 0.895551 0.348564 0.700822 0.328357 0.85431  [...]
+0.15245 0.064567 0.596636 0.937968 0.373922 0.779161 0.736635 0.503892 0.512854 0.588596 0.463105 0.345449 0.950843 0.244938 0.081881 0.327671 0.960946 0.056864 0.962063 0.102198 0.579443 0.722318 0.626301 0.536814 0.305828 0.79205 0.620707 0.146419 0.944628 0.79374 0.919123 0.652486 0.196758 0.435735 0.060464 0.578199 0.739135 0.720001 0.862946 0.369195 0.013356 0.039028 0.954339 0.068062 0.341289 0.220403 0.617423 0.179149 0.192575 0.38126 0.859868 0.242661 0.508896 0.933163 0.612933 0 [...]
+0.363147 0.848723 0.552731 0.684514 0.046311 0.08181 0.287342 0.836139 0.395166 0.558624 0.349595 0.204306 0.43109 0.662965 0.496038 0.06292 0.725866 0.908096 0.379711 0.390955 0.006759 0.638067 0.671779 0.53274 0.612058 0.393433 0.266176 0.194564 0.930235 0.765393 0.158327 0.136226 0.7754 0.676465 0.759339 0.439432 0.021329 0.521547 0.476806 0.203609 0.738734 0.715524 0.681867 0.601995 0.485053 0.548198 0.730262 0.438556 0.773664 0.487216 0.008699 0.880517 0.771487 0.687208 0.798404 0.0 [...]
+0.589631 0.267575 0.712065 0.752634 0.086103 0.682164 0.078724 0.091891 0.623622 0.690768 0.79009 0.487188 0.676469 0.591716 0.849763 0.343349 0.146104 0.201317 0.757364 0.936586 0.0818 0.072201 0.709018 0.792924 0.905598 0.924015 0.911528 0.679345 0.374066 0.725304 0.613681 0.676101 0.868796 0.196125 0.634788 0.645277 0.156564 0.523441 0.675681 0.836062 0.283173 0.954137 0.419663 0.514131 0.140653 0.966527 0.452864 0.583103 0.101321 0.467575 0.37076 0.240467 0.485024 0.163584 0.037837 0 [...]
+0.933349 0.524475 0.753928 0.39392 0.234975 0.07589 0.332668 0.066598 0.468052 0.226427 0.785623 0.767514 0.715966 0.453244 0.839567 0.326915 0.560271 0.762144 0.522727 0.627404 0.220879 0.54608 0.537966 0.931522 0.355197 0.378454 0.471055 0.151737 0.964442 0.741982 0.480264 0.305824 0.765089 0.243074 0.527246 0.797384 0.475254 0.054969 0.864766 0.55763 0.951153 0.098296 0.310624 0.227386 0.802034 0.316468 0.886519 0.8742 0.765676 0.482171 0.533673 0.931938 0.04782 0.274534 0.122581 0.98 [...]
+0.417237 0.462414 0.401518 0.664437 0.274731 0.221842 0.766629 0.029012 0.823839 0.388383 0.216485 0.686537 0.16855 0.101781 0.234907 0.848454 0.878067 0.135861 0.766381 0.868399 0.876063 0.047494 0.965219 0.526689 0.488737 0.343043 0.470188 0.341608 0.864545 0.944832 0.305185 0.791761 0.156767 0.145094 0.816246 0.951571 0.018294 0.582976 0.591127 0.184885 0.592387 0.104528 0.797548 0.671006 0.685292 0.795151 0.556066 0.606068 0.99268 0.19952 0.498469 0.584173 0.329529 0.492451 0.17504 0 [...]
+0.428246 0.421195 0.078132 0.58085 0.385983 0.250506 0.800514 0.481451 0.218137 0.034777 0.515284 0.022167 0.322016 0.877013 0.259716 0.060153 0.894484 0.732475 0.170173 0.130325 0.667407 0.475116 0.071037 0.20999 0.166113 0.852529 0.985854 0.066201 0.768775 0.594476 0.853166 0.876207 0.098287 0.961476 0.934204 0.501917 0.547609 0.154889 0.008992 0.98159 0.615805 0.904648 0.494312 0.537783 0.717178 0.744267 0.818307 0.377117 0.562435 0.028098 0.479928 0.74408 0.83137 0.33412 0.741835 0.8 [...]
+0.2413 0.762664 0.817317 0.498866 0.724804 0.13365 0.056152 0.735996 0.8765 0.78933 0.746282 0.680227 0.290136 0.093625 0.938764 0.856175 0.391423 0.648652 0.819301 0.582988 0.042916 0.907297 0.747832 0.22961 0.63121 0.256464 0.628637 0.402941 0.243308 0.57016 0.673505 0.111986 0.10268 0.498652 0.168939 0.371014 0.208743 0.452447 0.343794 0.065911 0.764957 0.05714 0.436676 0.395817 0.643627 0.822928 0.604432 0.355942 0.583173 0.84115 0.30118 0.985775 0.044287 0.794091 0.5059 0.292671 0.0 [...]
+0.716751 0.046536 0.049343 0.714189 0.03052 0.220591 0.060343 0.715781 0.294825 0.465456 0.185849 0.246285 0.217598 0.379568 0.562498 0.528316 0.2104 0.089093 0.685045 0.300591 0.47141 0.816823 0.690948 0.087358 0.737646 0.602798 0.571692 0.293868 0.474645 0.818592 0.617412 0.171112 0.079174 0.695881 0.202887 0.887591 0.498306 0.24313 0.398363 0.849838 0.585993 0.780718 0.862411 0.312924 0.363629 0.263993 0.948864 0.643715 0.244504 0.90092 0.557715 0.577703 0.257079 0.643496 0.134986 0.4 [...]
+0.953227 0.404706 0.606613 0.188389 0.424998 0.7524 0.446441 0.697153 0.702699 0.501143 0.061768 0.566684 0.723282 0.158436 0.715206 0.113263 0.691824 0.769415 0.75355 0.562006 0.82215 0.456953 0.950334 0.183957 0.782892 0.067223 0.698242 0.450596 0.917336 0.699499 0.379586 0.645101 0.742178 0.635024 0.815705 0.422463 0.350852 0.85541 0.447982 0.675915 0.362251 0.172566 0.365284 0.351185 0.961651 0.453708 0.793227 0.465121 0.455991 0.146984 0.587493 0.460487 0.55765 0.565541 0.654802 0.9 [...]
+0.008216 0.691705 0.157055 0.654717 0.297585 0.963872 0.367119 0.747471 0.80965 0.896975 0.008118 0.475777 0.640845 0.061015 0.484038 0.596332 0.294443 0.017713 0.006627 0.973949 0.999375 0.5407 0.320984 0.230773 0.05361 0.215447 0.512463 0.057789 0.182217 0.493321 0.056554 0.275431 0.407237 0.030019 0.541923 0.261674 0.500119 0.647027 0.901228 0.096612 0.502458 0.702143 0.489319 0.444657 0.007191 0.591462 0.638822 0.857399 0.456742 0.015503 0.511298 0.213785 0.40059 0.034561 0.47001 0.9 [...]
+0.989888 0.920991 0.184905 0.928916 0.086336 0.772078 0.820383 0.593238 0.588695 0.614805 0.979857 0.671672 0.546099 0.922286 0.237053 0.078028 0.984138 0.028331 0.768249 0.85887 0.202307 0.323099 0.97341 0.754242 0.441877 0.298604 0.27268 0.279372 0.539946 0.031722 0.931937 0.921248 0.476272 0.713522 0.096819 0.614914 0.819972 0.216844 0.394637 0.697715 0.997116 0.488613 0.559335 0.396223 0.195982 0.326624 0.003832 0.474717 0.213568 0.029525 0.167799 0.664541 0.617521 0.461293 0.238796  [...]
+0.006851 0.987573 0.078632 0.060824 0.169831 0.70506 0.819278 0.405959 0.319528 0.089436 0.301353 0.444567 0.644193 0.445338 0.588753 0.872213 0.444167 0.91575 0.098472 0.117949 0.42187 0.45565 0.760408 0.576038 0.849652 0.25888 0.0281 0.746662 0.333117 0.783708 0.011929 0.649417 0.530937 0.745435 0.591239 0.771566 0.106741 0.856054 0.023829 0.217876 0.28257 0.106546 0.254595 0.571279 0.415774 0.639831 0.244279 0.083948 0.202987 0.058478 0.215912 0.191004 0.520956 0.094938 0.198378 0.657 [...]
+0.540806 0.425547 0.080376 0.803766 0.069131 0.575971 0.556352 0.862663 0.16715 0.20501 0.460317 0.208072 0.185017 0.907791 0.755895 0.656627 0.021816 0.084141 0.222179 0.294774 0.959475 0.049598 0.676312 0.806027 0.439581 0.223928 0.414511 0.974359 0.467225 0.563725 0.942435 0.92545 0.431312 0.581131 0.315481 0.54076 0.159327 0.050907 0.797582 0.680721 0.093078 0.210129 0.278495 0.485555 0.759136 0.152356 0.607948 0.469096 0.315952 0.17067 0.926227 0.736973 0.677642 0.989236 0.029414 0. [...]
+0.546584 0.787778 0.099641 0.026525 0.698675 0.612944 0.604643 0.637776 0.575062 0.621973 0.797191 0.158383 0.547243 0.256775 0.550588 0.792632 0.020593 0.688785 0.185077 0.728994 0.432531 0.838563 0.42733 0.700293 0.105829 0.516931 0.513549 0.952622 0.552739 0.569446 0.035487 0.231696 0.235994 0.576243 0.308918 0.736253 0.773273 0.500447 0.629756 0.648951 0.856377 0.193941 0.544253 0.233474 0.576942 0.598129 0.312367 0.670649 0.380773 0.595577 0.034782 0.186524 0.290804 0.18597 0.662112 [...]
+0.806707 0.472801 0.358705 0.563645 0.011227 0.432935 0.672581 0.773438 0.147042 0.57297 0.878818 0.050677 0.682313 0.058797 0.410433 0.220453 0.94751 0.597705 0.342131 0.617661 0.70377 0.379349 0.337583 0.377276 0.521033 0.028659 0.387386 0.556245 0.922665 0.656258 0.916329 0.736988 0.06906 0.093389 0.610874 0.112685 0.471183 0.78603 0.214667 0.124985 0.075148 0.440196 0.635283 0.816758 0.431363 0.928905 0.239624 0.305266 0.474595 0.124859 0.942145 0.147808 0.096664 0.740262 0.197823 0. [...]
+0.297826 0.049072 0.38022 0.975493 0.602347 0.432372 0.433242 0.029599 0.917452 0.466229 0.649037 0.875801 0.038154 0.368956 0.190672 0.469268 0.577715 0.446423 0.481099 0.367161 0.29935 0.11608 0.269915 0.837725 0.429515 0.94402 0.509888 0.745081 0.448407 0.003843 0.303499 0.227321 0.147241 0.339407 0.889181 0.300313 0.67705 0.20975 0.393793 0.714937 0.553403 0.635003 0.164795 0.524959 0.128416 0.597792 0.003086 0.390927 0.604657 0.438862 0.697622 0.48601 0.324029 0.309914 0.173944 0.46 [...]
+0.077939 0.037952 0.415383 0.96324 0.794117 0.70833 0.513375 0.7387 0.672413 0.966111 0.37184 0.07989 0.161536 0.816483 0.328543 0.893857 0.587834 0.02175 0.045137 0.430748 0.535915 0.469846 0.942374 0.600754 0.760533 0.871981 0.870933 0.993802 0.202329 0.890893 0.397304 0.723113 0.196931 0.822384 0.762874 0.785134 0.822346 0.783578 0.354087 0.327548 0.818396 0.87827 0.389207 0.456449 0.733746 0.058939 0.665903 0.631647 0.05802 0.216498 0.79851 0.414208 0.627974 0.115679 0.050507 0.84967 [...]
+0.474552 0.950507 0.015118 0.675322 0.288933 0.582925 0.624076 0.940376 0.332691 0.280115 0.736673 0.381697 0.9525 0.915339 0.826484 0.056797 0.350793 0.391792 0.185351 0.641282 0.713412 0.536724 0.133337 0.760785 0.05687 0.994276 0.759946 0.363092 0.566532 0.746798 0.502246 0.149467 0.222706 0.929954 0.487747 0.196157 0.59323 0.40616 0.907712 0.280616 0.900319 0.173476 0.228373 0.061387 0.315646 0.680986 0.280535 0.537843 0.383453 0.293599 0.665811 0.291966 0.463109 0.532057 0.68533 0.1 [...]
+0.518177 0.665697 0.350514 0.165884 0.713659 0.409255 0.161732 0.6089 0.546815 0.480268 0.724658 0.854184 0.989368 0.443757 0.157579 0.079803 0.658935 0.821468 0.771295 0.017973 0.075914 0.777336 0.778873 0.640805 0.532516 0.043094 0.316037 0.235888 0.816716 0.10391 0.405642 0.586024 0.758566 0.039505 0.337483 0.684463 0.615363 0.688951 0.633164 0.11036 0.772407 0.568648 0.841873 0.970401 0.105403 0.669635 0.948379 0.491958 0.049498 0.223594 0.193341 0.499945 0.272313 0.730944 0.928998 0 [...]
+0.325396 0.722634 0.509715 0.5276 0.793005 0.962873 0.769932 0.415853 0.425647 0.84632 0.353242 0.618156 0.346516 0.417743 0.843392 0.996821 0.009897 0.034781 0.417945 0.432475 0.88884 0.802188 0.749963 0.974473 0.023204 0.642547 0.126669 0.781171 0.258737 0.251596 0.088655 0.153928 0.610181 0.672275 0.183014 0.887141 0.39739 0.44443 0.787195 0.89686 0.064481 0.747863 0.860341 0.214713 0.740067 0.358516 0.777067 0.301119 0.013191 0.69683 0.043018 0.705034 0.5598 0.445979 0.415658 0.84053 [...]
+0.395225 0.280509 0.635973 0.522125 0.717771 0.378577 0.869616 0.020168 0.666555 0.717417 0.749403 0.426202 0.236283 0.837797 0.892791 0.054351 0.379136 0.163578 0.877846 0.397187 0.736218 0.595739 0.249816 0.14169 0.066992 0.981737 0.190403 0.026621 0.546559 0.660395 0.136564 0.681546 0.315527 0.130543 0.277809 0.727103 0.897093 0.080583 0.463244 0.668594 0.210725 0.957305 0.207925 0.693127 0.809892 0.996288 0.717959 0.370235 0.970065 0.592626 0.106228 0.20156 0.238648 0.554562 0.68145  [...]
+0.817383 0.389924 0.981471 0.00356 0.753395 0.13946 0.310678 0.163325 0.657192 0.997132 0.356117 0.581304 0.279704 0.063704 0.225936 0.589951 0.57514 0.525155 0.625366 0.426511 0.160102 0.565624 0.596938 0.658757 0.697912 0.39398 0.914102 0.691676 0.807462 0.257308 0.904528 0.085926 0.750479 0.82719 0.007654 0.192808 0.321963 0.861333 0.181209 0.674331 0.978534 0.243993 0.403346 0.043163 0.245522 0.77483 0.235887 0.002009 0.190262 0.367419 0.595211 0.05963 0.69739 0.116824 0.527462 0.545 [...]
+0.393654 0.834411 0.48925 0.741828 0.006501 0.492814 0.363621 0.487936 0.538161 0.150719 0.897185 0.31521 0.487559 0.186299 0.319217 0.493128 0.905832 0.245441 0.342797 0.670707 0.528868 0.320242 0.249483 0.277842 0.264876 0.43641 0.796114 0.043389 0.357251 0.194478 0.357298 0.184124 0.703301 0.530236 0.352487 0.061944 0.13164 0.871753 0.905345 0.797385 0.410125 0.075359 0.836363 0.125958 0.205177 0.650582 0.833424 0.746603 0.400232 0.888268 0.60936 0.131263 0.583601 0.367339 0.072725 0. [...]
+0.700028 0.875988 0.185521 0.339546 0.818186 0.378172 0.933657 0.27995 0.726457 0.148958 0.610356 0.834532 0.988819 0.837756 0.596448 0.986475 0.29414 0.217821 0.811048 0.157444 0.288989 0.025573 0.884662 0.134086 0.916097 0.245587 0.486693 0.150489 0.295662 0.980089 0.285616 0.06031 0.194434 0.768361 0.527298 0.740693 0.737971 0.862242 0.351386 0.912861 0.478205 0.361608 0.990305 0.851912 0.008524 0.628037 0.670908 0.624532 0.956022 0.379543 0.703393 0.188656 0.044614 0.145136 0.152275  [...]
+0.540963 0.591185 0.398303 0.926092 0.474619 0.79062 0.142873 0.043239 0.35868 0.443972 0.963593 0.878948 0.91142 0.151617 0.14374 0.345521 0.934209 0.867506 0.224294 0.846875 0.767926 0.510795 0.424839 0.709379 0.412582 0.933219 0.133962 0.165917 0.297856 0.454868 0.245751 0.680153 0.850307 0.423761 0.600847 0.920762 0.60568 0.078361 0.04302 0.258233 0.588969 0.405287 0.867071 0.975764 0.579747 0.63656 0.501974 0.737446 0.945562 0.090756 0.938824 0.788679 0.200201 0.161968 0.612657 0.41 [...]
+0.503353 0.887545 0.204251 0.293 0.661213 0.32633 0.146572 0.976023 0.37375 0.953516 0.03668 0.75652 0.914967 0.532324 0.747052 0.456015 0.358585 0.77166 0.961985 0.025792 0.042454 0.800249 0.285856 0.255221 0.044143 0.229436 0.001655 0.209759 0.282428 0.16372 0.880638 0.787536 0.292843 0.321864 0.569805 0.868235 0.958124 0.728159 0.854169 0.873537 0.149537 0.891123 0.63217 0.509039 0.154399 0.090262 0.861434 0.095528 0.455811 0.368962 0.484411 0.885376 0.183511 0.854863 0.526908 0.75924 [...]
+0.670747 0.775427 0.33902 0.939003 0.245964 0.59626 0.298873 0.209093 0.657929 0.459231 0.267688 0.701992 0.933382 0.62064 0.334003 0.554554 0.16663 0.35362 0.542129 0.559327 0.715606 0.605674 0.340586 0.674885 0.87233 0.194839 0.795162 0.352823 0.799169 0.962085 0.845412 0.445905 0.499027 0.159636 0.251834 0.84005 0.652586 0.822405 0.944001 0.928943 0.710929 0.554805 0.196145 0.506396 0.467914 0.669078 0.892026 0.207066 0.826952 0.744274 0.046442 0.305411 0.282454 0.386008 0.522015 0.01 [...]
+0.925351 0.344307 0.175058 0.963248 0.912213 0.400518 0.836417 0.514894 0.962277 0.985199 0.104708 0.812365 0.828799 0.270624 0.753627 0.886553 0.910357 0.530947 0.847192 0.198037 0.109689 0.119325 0.334857 0.146569 0.673898 0.344701 0.776065 0.483618 0.666939 0.189346 0.227178 0.191464 0.065213 0.279929 0.078457 0.00874 0.8458 0.344352 0.164907 0.192132 0.555294 0.583276 0.933101 0.760643 0.744122 0.976508 0.133407 0.567849 0.257342 0.75362 0.866969 0.566902 0.113939 0.285428 0.134272 0 [...]
+0.833352 0.623349 0.285263 0.567897 0.978482 0.983995 0.955123 0.599116 0.250308 0.44113 0.707144 0.959483 0.887648 0.996066 0.528937 0.064739 0.695384 0.027214 0.71996 0.08248 0.126619 0.811546 0.482021 0.233916 0.312182 0.563099 0.941151 0.480296 0.330099 0.15167 0.260815 0.933877 0.467933 0.809371 0.210749 0.617934 0.901347 0.867745 0.508298 0.549873 0.495204 0.018228 0.842827 0.680957 0.90488 0.13398 0.496354 0.362589 0.544181 0.395729 0.550107 0.493875 0.57062 0.213451 0.513445 0.46 [...]
+0.752199 0.481975 0.364776 0.084928 0.369635 0.767211 0.717103 0.258258 0.273313 0.344386 0.323712 0.537799 0.620093 0.9628 0.274165 0.690734 0.46761 0.081376 0.207836 0.054663 0.679801 0.192983 0.907248 0.243119 0.224224 0.086883 0.436761 0.323345 0.527236 0.012235 0.486487 0.516055 0.644196 0.131488 0.305449 0.460859 0.079588 0.747246 0.691246 0.325881 0.028386 0.336421 0.605539 0.334809 0.194096 0.970918 0.269932 0.441799 0.783442 0.778752 0.79356 0.21017 0.039535 0.846991 0.65414 0.0 [...]
+0.356052 0.132485 0.670573 0.278791 0.518377 0.518103 0.733227 0.528529 0.698615 0.75434 0.540424 0.337959 0.168113 0.312272 0.724698 0.829449 0.408393 0.66953 0.562356 0.141908 0.029578 0.482426 0.857315 0.711241 0.260425 0.363469 0.544492 0.366826 0.35672 0.197384 0.629386 0.022734 0.937429 0.547964 0.980178 0.787312 0.991477 0.33323 0.016308 0.970726 0.14687 0.889742 0.338166 0.540144 0.168488 0.134874 0.721704 0.688532 0.548699 0.286255 0.624236 0.844588 0.938255 0.929037 0.47419 0.7 [...]
+0.313291 0.195551 0.998703 0.139681 0.58746 0.383747 0.006675 0.973593 0.179156 0.182089 0.028712 0.82423 0.802128 0.094509 0.091938 0.084422 0.775093 0.603849 0.156064 0.451624 0.810782 0.886182 0.463623 0.748603 0.271094 0.068932 0.892224 0.210697 0.223458 0.227657 0.290052 0.790624 0.261652 0.971447 0.225071 0.468776 0.860844 0.197893 0.162159 0.127453 0.14579 0.891155 0.707829 0.717286 0.592451 0.289958 0.98463 0.765698 0.531012 0.337973 0.800032 0.653678 0.908104 0.534164 0.734772 0 [...]
+0.085789 0.259602 0.061631 0.787721 0.967838 0.603781 0.937255 0.054866 0.628897 0.714266 0.380817 0.952032 0.595138 0.117922 0.085022 0.898566 0.133275 0.013474 0.633584 0.810674 0.396931 0.143161 0.396934 0.289622 0.333048 0.37038 0.425234 0.075143 0.861473 0.304568 0.95594 0.233911 0.176703 0.588887 0.536247 0.988264 0.604145 0.624272 0.776044 0.211173 0.387103 0.57544 0.969959 0.673199 0.732011 0.17485 0.831055 0.841257 0.249035 0.286011 0.14844 0.651309 0.492126 0.528493 0.310471 0. [...]
+0.700878 0.479311 0.143137 0.997308 0.669088 0.601884 0.735352 0.362585 0.391773 0.765885 0.694449 0.040686 0.367752 0.598451 0.823623 0.637905 0.201272 0.137394 0.133254 0.235045 0.57127 0.080134 0.078451 0.697637 0.686356 0.631094 0.120159 0.485052 0.726887 0.664567 0.723157 0.482658 0.187904 0.031317 0.74143 0.710952 0.646143 0.351932 0.278734 0.103918 0.180713 0.331306 0.571488 0.541024 0.500181 0.198791 0.057198 0.313621 0.109742 0.052637 0.711352 0.112634 0.808181 0.104338 0.386565 [...]
+0.255257 0.015734 0.413854 0.739685 0.271452 0.896471 0.333799 0.371385 0.518506 0.963819 0.394551 0.71017 0.350556 0.794552 0.423093 0.026287 0.597393 0.60524 0.975544 0.397875 0.892839 0.295111 0.108851 0.237706 0.112917 0.506856 0.302645 0.236486 0.591742 0.74579 0.548307 0.842262 0.904869 0.654086 0.089888 0.861112 0.288004 0.924075 0.532062 0.404068 0.066748 0.106355 0.123401 0.673653 0.576008 0.232515 0.259967 0.805085 0.736469 0.801777 0.429633 0.749703 0.435661 0.087674 0.538307  [...]
+0.449367 0.040362 0.615352 0.943266 0.994482 0.721521 0.420488 0.579099 0.481302 0.028879 0.1764 0.806199 0.399321 0.153659 0.113651 0.704011 0.646468 0.968277 0.389927 0.780022 0.736623 0.408618 0.065619 0.649783 0.097085 0.475413 0.023746 0.226025 0.298468 0.796788 0.513836 0.155779 0.29353 0.834828 0.941172 0.579452 0.247076 0.556357 0.453471 0.008487 0.868013 0.576241 0.210048 0.770704 0.896269 0.282851 0.297324 0.310997 0.866961 0.623088 0.701526 0.537176 0.439502 0.086072 0.18577 0 [...]
+0.481076 0.039741 0.087004 0.707035 0.853986 0.621537 0.329126 0.767999 0.977217 0.237409 0.086833 0.65482 0.280075 0.629211 0.281964 0.14889 0.387115 0.834595 0.545407 0.628 0.641787 0.022087 0.940056 0.407199 0.328945 0.935722 0.297363 0.798521 0.709363 0.017563 0.762145 0.505475 0.26834 0.205243 0.580537 0.918393 0.78293 0.129845 0.307488 0.646333 0.110483 0.538819 0.866344 0.002834 0.661572 0.957325 0.227873 0.169082 0.196536 0.12367 0.982261 0.94666 0.554639 0.08157 0.808535 0.72652 [...]
+0.109097 0.907698 0.361066 0.671531 0.343976 0.935847 0.56706 0.170232 0.98285 0.642613 0.463003 0.634038 0.080474 0.964183 0.018706 0.168087 0.529884 0.026558 0.75434 0.596601 0.118077 0.374084 0.655639 0.100531 0.652315 0.804723 0.118962 0.853259 0.723169 0.278423 0.050451 0.33056 0.761647 0.71638 0.654306 0.992156 0.640515 0.843056 0.750862 0.541752 0.956484 0.442448 0.380272 0.897197 0.079104 0.505619 0.051844 0.243475 0.171924 0.918624 0.376379 0.559965 0.548219 0.854675 0.462865 0. [...]
+0.897162 0.423202 0.667475 0.372493 0.500726 0.660315 0.337031 0.840898 0.650258 0.95794 0.656094 0.034482 0.255792 0.519091 0.436259 0.847411 0.60555 0.409754 0.065012 0.202866 0.397146 0.363784 0.681265 0.085573 0.991552 0.995416 0.425966 0.505993 0.728394 0.651286 0.324843 0.989466 0.512926 0.426432 0.568097 0.454417 0.301215 0.738917 0.427565 0.874358 0.150552 0.730305 0.274855 0.179335 0.71441 0.454785 0.255079 0.84519 0.982155 0.467787 0.519049 0.401056 0.756977 0.267028 0.107683 0 [...]
+0.829469 0.628927 0.297409 0.24181 0.544952 0.772707 0.761737 0.580129 0.271992 0.92122 0.040183 0.809989 0.446521 0.415746 0.075756 0.28636 0.929123 0.112506 0.302619 0.445876 0.686522 0.521224 0.077463 0.5828 0.336134 0.912847 0.490912 0.082031 0.999193 0.109488 0.867337 0.691928 0.277144 0.20327 0.976333 0.032689 0.445933 0.048326 0.5114 0.218494 0.9454 0.180193 0.896946 0.208562 0.959003 0.046532 0.664335 0.770198 0.44021 0.574058 0.975784 0.657402 0.615626 0.060621 0.889716 0.371271 [...]
+0.938866 0.504333 0.163726 0.872412 0.750725 0.325721 0.228335 0.822232 0.278032 0.227058 0.343611 0.691395 0.356409 0.419345 0.318777 0.158578 0.178508 0.421383 0.528736 0.070944 0.524745 0.512318 0.428421 0.04207 0.38622 0.449048 0.927988 0.189515 0.445135 0.773924 0.357279 0.236119 0.443591 0.541168 0.764251 0.054841 0.773788 0.740308 0.233198 0.8871 0.677055 0.336325 0.814495 0.995317 0.417044 0.244455 0.469353 0.082343 0.559567 0.076745 0.402258 0.530453 0.903705 0.974128 0.034718 0 [...]
+0.277971 0.069786 0.502023 0.738574 0.618549 0.364656 0.001835 0.992391 0.336753 0.755472 0.853061 0.292578 0.065625 0.52302 0.897545 0.140836 0.26599 0.237788 0.466818 0.06879 0.811081 0.57746 0.751338 0.561852 0.997282 0.177158 0.204564 0.185175 0.385237 0.070334 0.837711 0.030787 0.758341 0.039368 0.963794 0.766731 0.469881 0.519272 0.567932 0.061697 0.036342 0.443355 0.210858 0.529093 0.483563 0.850443 0.735718 0.666874 0.639723 0.814707 0.198201 0.51593 0.481352 0.644943 0.833115 0. [...]
+0.174623 0.295012 0.765776 0.469556 0.502371 0.041059 0.104002 0.605582 0.584423 0.129759 0.09814 0.037307 0.195227 0.873047 0.368091 0.808622 0.311477 0.545908 0.173647 0.053219 0.376019 0.174784 0.058938 0.798528 0.955353 0.318107 0.132722 0.677777 0.364012 0.334298 0.257406 0.546061 0.178782 0.296091 0.95843 0.002792 0.501007 0.59986 0.176077 0.450901 0.99534 0.704559 0.552015 0.585796 0.243605 0.152284 0.029005 0.006651 0.18663 1.1e-05 0.695998 0.957396 0.291824 0.601687 0.978003 0.5 [...]
+0.826187 0.988868 0.422372 0.435307 0.053693 0.599391 0.684008 0.069692 0.902118 0.763767 0.322385 0.145023 0.752638 0.331393 0.449217 0.117009 0.668035 0.437971 0.75752 0.559937 0.425131 0.902409 0.021016 0.020262 0.593338 0.193508 0.209904 0.895301 0.149669 0.426197 0.251457 0.882924 0.911113 0.360427 0.838719 0.185096 0.766438 0.534451 0.295207 0.427107 0.552059 0.225857 0.104533 0.12761 0.524238 0.805512 0.071652 0.602568 0.732157 0.944078 0.566268 0.429369 0.589345 0.125415 0.328059 [...]
+0.223432 0.482327 0.495177 0.208202 0.550822 0.804274 0.783139 0.129591 0.488154 0.994808 0.727186 0.493178 0.932511 0.184006 0.299607 0.407175 0.441004 0.506374 0.347907 0.722281 0.500634 0.858532 0.180399 0.012437 0.877288 0.777969 0.526654 0.262679 0.678808 0.641715 0.550837 0.122528 0.362172 0.291582 0.747117 0.329291 0.143827 0.604064 0.513616 0.230108 0.337565 0.923828 0.505941 0.73448 0.908872 0.934338 0.602351 0.918717 0.592019 0.798829 0.683773 0.152321 0.654151 0.70668 0.86041  [...]
+0.689481 0.716566 0.388619 0.719 0.891308 0.902487 0.379351 0.760699 0.290688 0.330078 0.048963 0.029679 0.611392 0.283374 0.013889 0.640132 0.816311 0.300281 0.072103 0.727768 0.790794 0.193004 0.997874 0.193056 0.924894 0.145423 0.43937 0.635823 0.500937 0.049083 0.826397 0.766879 0.723296 0.061999 0.191939 0.186933 0.215947 0.095614 0.546824 0.615131 0.320118 0.312508 0.216066 0.659441 0.133089 0.064155 0.922687 0.431766 0.775828 0.021905 0.895153 0.844526 0.177512 0.839166 0.190703 0 [...]
+0.92239 0.371897 0.012563 0.63 0.901429 0.093253 0.763966 0.21126 0.118664 0.743719 0.450423 0.294354 0.866432 0.305589 0.656481 0.847356 0.727183 0.726055 0.97256 0.376679 0.219852 0.29275 0.29037 0.413111 0.854844 0.661075 0.122703 0.411476 0.729914 0.247067 0.079404 0.844337 0.733034 0.416535 0.319331 0.237453 0.834299 0.64554 0.463884 0.841605 0.381576 0.264912 0.141265 0.728987 0.192208 0.463411 0.866944 0.300006 0.032073 0.65116 0.197265 0.226328 0.859108 0.37118 0.529995 0.021471  [...]
+0.69383 0.821825 0.061684 0.932288 0.924148 0.103476 0.971945 0.999167 0.038711 0.614101 0.148034 0.525191 0.538577 0.208724 0.910019 0.11284 0.205413 0.856339 0.234759 0.395241 0.152562 0.953308 0.190045 0.061584 0.287318 0.341002 0.846976 0.558259 0.230869 0.694551 0.791946 0.871818 0.932138 0.97499 0.923416 0.259623 0.554724 0.421533 0.065774 0.696084 0.996982 0.261589 0.181262 0.343376 0.218038 0.465899 0.548945 0.109384 0.997741 0.989102 0.215593 0.901802 0.5807 0.959718 0.503474 0. [...]
+0.560959 0.155757 0.060986 0.081328 0.761081 0.588473 0.654781 0.217042 0.875099 0.384719 0.364061 0.481565 0.309803 0.233415 0.484837 0.017887 0.810298 0.170325 0.892351 0.379748 0.853879 0.174903 0.540867 0.843754 0.477555 0.810204 0.352845 0.298149 0.045768 0.891587 0.810605 0.18207 0.645581 0.068809 0.898749 0.293894 0.581439 0.235934 0.112967 0.726656 0.842246 0.619702 0.686846 0.528389 0.775606 0.071161 0.27572 0.985754 0.419796 0.815993 0.10372 0.069318 0.303649 0.638445 0.102497  [...]
+0.344446 0.070904 0.821169 0.497739 0.672928 0.93952 0.473591 0.553994 0.111834 0.918893 0.849816 0.858411 0.261673 0.055726 0.977747 0.746135 0.933712 0.984533 0.280231 0.398713 0.386377 0.828726 0.278544 0.56354 0.083319 0.612491 0.37771 0.782117 0.477766 0.298768 0.209451 0.202914 0.888126 0.757434 0.870151 0.097793 0.57234 0.067559 0.828841 0.343441 0.116432 0.993286 0.821877 0.980699 0.639928 0.177602 0.074499 0.087473 0.323846 0.433868 0.331195 0.466494 0.62166 0.274682 0.433484 0. [...]
+0.028116 0.422472 0.498295 0.382184 0.406692 0.293193 0.8113 0.751071 0.12929 0.495246 0.520967 0.427857 0.936979 0.572029 0.779576 0.767211 0.158391 0.531238 0.684251 0.456752 0.677085 0.520485 0.384415 0.370529 0.042834 0.269375 0.601495 0.527113 0.447476 0.731629 0.7126 0.368832 0.52942 0.018856 0.656491 0.433567 0.601174 0.51855 0.271721 0.783199 0.782778 0.835414 0.574839 0.29822 0.557812 0.676079 0.159819 0.707468 0.197014 0.900481 0.946304 0.207752 0.281894 0.72346 0.350978 0.4444 [...]
+0.542175 0.436385 0.301173 0.024403 0.212296 0.200517 0.612586 0.328688 0.009636 0.169031 0.822447 0.651187 0.213123 0.642201 0.613404 0.231209 0.159729 0.586938 0.124197 0.782886 0.174979 0.161277 0.242174 0.9415 0.872101 0.859064 0.859203 0.942173 0.251238 0.891466 0.329128 0.627019 0.350573 0.671132 0.102272 0.32601 0.597465 0.178491 0.700714 0.995504 0.326644 0.258779 0.194548 0.010035 0.667128 0.667398 0.86889 0.322073 0.0028 0.751701 0.87209 0.727884 0.844609 0.143358 0.413556 0.04 [...]
+0.176452 0.181787 0.018422 0.797833 0.555872 0.272033 0.531556 0.825622 0.702541 0.376756 0.298304 0.862121 0.145536 0.808466 0.441789 0.78788 0.016038 0.898861 0.202526 0.54107 0.521352 0.543815 0.308659 0.411434 0.070927 0.198549 0.453045 0.05083 0.385197 0.249218 0.513941 0.566137 0.400961 0.496395 0.290726 0.034264 0.160222 0.273065 0.102828 0.097021 0.110449 0.499784 0.196573 0.129972 0.610774 0.759113 0.313129 0.441589 0.82366 0.89355 0.302073 0.870894 0.712138 0.287019 0.704113 0. [...]
+0.78457 0.595379 0.323963 0.400517 0.955404 0.180316 0.792848 0.470261 0.331701 0.691199 0.733976 0.910549 0.350626 0.77983 0.137301 0.204072 0.192679 0.345862 0.207512 0.917837 0.518632 0.300396 0.738815 0.669341 0.15686 0.196543 0.525856 0.165037 0.74018 0.196262 0.916205 0.149021 0.005117 0.582276 0.085256 0.334155 0.249762 0.412513 0.761219 0.859395 0.206146 0.596937 0.688735 0.3926 0.169411 0.647787 0.101925 0.471164 0.159668 0.285302 0.544302 0.497048 0.107857 0.46368 0.421608 0.19 [...]
+0.754372 0.762981 0.750065 0.897925 0.332838 0.87759 0.53066 0.594046 0.477909 0.291991 0.974156 0.686215 0.547175 0.659428 0.656477 0.237028 0.546563 0.499322 0.544718 0.764723 0.012748 0.850121 0.989114 0.539054 0.078217 0.203692 0.707901 0.316401 0.822777 0.495395 0.441027 0.574794 0.012834 0.447206 0.522098 0.199664 0.211789 0.428594 0.981012 0.165891 0.368988 0.923712 0.565573 0.174476 0.931415 0.768012 0.294019 0.501925 0.301865 0.731529 0.964581 0.092649 0.114089 0.963383 0.663808 [...]
+0.460845 0.857292 0.681028 0.772167 0.721674 0.690409 0.905623 0.149383 0.101708 0.186937 0.897513 0.131144 0.700229 0.874494 0.834082 0.82601 0.68197 0.706404 0.507183 0.140881 0.185386 0.839278 0.54319 0.766459 0.511366 0.035649 0.090044 0.959572 0.848008 0.67342 0.828603 0.901121 0.48408 0.32727 0.408659 0.635499 0.050257 0.299086 0.809434 0.329392 0.393297 0.020524 0.301424 0.401873 0.107527 0.134473 0.888697 0.862244 0.388148 0.510632 0.021844 0.211637 0.983548 0.750061 0.581691 0.8 [...]
+0.967053 0.96408 0.407472 0.243426 0.073553 0.131468 0.519069 0.204502 0.367595 0.893036 0.41986 0.885532 0.333391 0.062071 0.307215 0.504409 0.921116 0.811716 0.514667 0.39135 0.482074 0.198621 0.39355 0.699015 0.131646 0.560909 0.308413 0.752973 0.19751 0.084269 0.220348 0.290537 0.197098 0.658206 0.833947 0.93145 0.641008 0.377695 0.169938 0.502201 0.448664 0.374086 0.868125 0.626354 0.566178 0.975222 0.897526 0.22142 0.273502 0.382143 0.609449 0.140514 0.722929 0.025713 0.015829 0.60 [...]
+0.17217 0.522288 0.448252 0.792922 0.57748 0.15994 0.76323 0.717011 0.021914 0.979877 0.371292 0.369332 0.146532 0.065641 0.653626 0.931544 0.185127 0.498961 0.611265 0.114672 0.57379 0.701752 0.54215 0.35678 0.518371 0.216947 0.378347 0.682263 0.838049 0.843137 0.469257 0.43867 0.414311 0.691668 0.886021 0.727492 0.321331 0.66419 0.869113 0.543119 0.025167 0.554314 0.968929 0.706924 0.193605 0.620561 0.741636 0.619067 0.244791 0.743527 0.390783 0.360223 0.009532 0.215124 0.430105 0.7515 [...]
+0.538358 0.865107 0.726547 0.661466 0.782077 0.82159 0.73905 0.31976 0.948858 0.214494 0.413864 0.804598 0.360016 0.23899 0.673083 0.069026 0.797489 0.141701 0.405056 0.109022 0.090287 0.514017 0.704252 0.898536 0.285641 0.115066 0.77259 0.627634 0.610435 0.199945 0.210467 0.269621 0.024115 0.243692 0.970668 0.294825 0.186316 0.665115 0.205399 0.386198 0.06171 0.15499 0.194282 0.475273 0.567611 0.703099 0.461333 0.527289 0.028867 0.770825 0.050322 0.935558 0.184008 0.995704 0.249075 0.21 [...]
+0.346538 0.490698 0.91269 0.876374 0.781423 0.99446 0.35752 0.012544 0.347587 0.195322 0.302469 0.095211 0.32456 0.129426 0.74568 0.480149 0.985917 0.651494 0.090328 0.011083 0.163684 0.657176 0.427201 0.522316 0.049098 0.747292 0.57904 0.952696 0.060083 0.296276 0.294086 0.223553 0.348225 0.749702 0.114225 0.004181 0.909361 0.670722 0.272826 0.944806 0.393012 0.159655 0.163706 0.732583 0.762198 0.27498 0.369589 0.245146 0.993073 0.809923 0.203869 0.480732 0.215743 0.678243 0.526024 0.23 [...]
+0.727137 0.437489 0.04173 0.240434 0.166577 0.04503 0.115969 0.691529 0.994682 0.992431 0.273395 0.273716 0.970538 0.471386 0.986282 0.706638 0.424122 0.784802 0.043285 0.414109 0.922077 0.12049 0.458231 0.089765 0.365094 0.152155 0.003085 0.752181 0.842045 0.724554 0.26655 0.081645 0.852945 0.808106 0.516578 0.424991 0.34356 0.028839 0.510678 0.193436 0.68723 0.247603 0.504483 0.877274 0.944757 0.871763 0.928577 0.536351 0.45194 0.4244 0.446262 0.578725 0.383054 0.396047 0.448803 0.1190 [...]
+0.78955 0.393947 0.099808 0.818707 0.120025 0.134844 0.54637 0.575053 0.82295 0.405981 0.039572 0.2826 0.773736 0.657318 0.230521 0.386952 0.559639 0.679741 0.925257 0.251477 0.921771 0.891229 0.210772 0.626167 0.117462 0.38972 0.654507 0.247269 0.878728 0.534619 0.967979 0.005074 0.865198 0.031775 0.908502 0.127176 0.513986 0.395726 0.161246 0.844874 0.222242 0.361644 0.141333 0.902706 0.034916 0.238215 0.24777 0.335444 0.590435 0.473478 0.172334 0.91289 0.82097 0.677509 0.969279 0.1432 [...]
+0.874448 0.649641 0.180206 0.678331 0.048533 0.625966 0.960739 0.430559 0.627381 0.399188 0.12404 0.229252 0.013248 0.468137 0.623747 0.334772 0.902924 0.100622 0.439467 0.74283 0.177507 0.974876 0.497174 0.589574 0.451826 0.563924 0.115104 0.829409 0.187078 0.94516 0.20494 0.766095 0.022822 0.49408 0.971455 0.812357 0.374868 0.335037 0.418123 0.769478 0.612615 0.881925 0.88566 0.273578 0.424145 0.304358 0.120891 0.484747 0.06695 0.583019 0.487225 0.567869 0.434782 0.748801 0.868822 0.67 [...]
+0.011611 0.391285 0.827313 0.077559 0.396063 0.514793 0.745892 0.430368 0.82179 0.115024 0.278401 0.926994 0.151447 0.216768 0.434655 0.443862 0.317618 0.654919 0.234427 0.718832 0.558016 0.322885 0.92205 0.733055 0.642073 0.907756 0.270265 0.493318 0.384313 0.008426 0.517116 0.966067 0.179532 0.117529 0.066117 0.54447 0.528116 0.104658 0.955946 0.068932 0.871206 0.79003 0.567403 0.884511 0.05343 0.943323 0.473561 0.009365 0.782183 0.161822 0.271529 0.01515 0.045078 0.690849 0.57739 0.06 [...]
+0.867249 0.424016 0.911589 0.875562 0.375946 0.24626 0.435246 0.213972 0.252576 0.016141 0.655003 0.582923 0.500656 0.628922 0.776538 0.779124 0.581528 0.305136 0.156121 0.868306 0.607418 0.431463 0.677808 0.456941 0.394942 0.996861 0.542057 0.320373 0.169093 0.927855 0.811142 0.4319 0.738349 0.442744 0.158709 0.670308 0.43097 0.058025 0.044538 0.053254 0.983792 0.738069 0.636544 0.878236 0.674806 0.61002 0.648544 0.219629 0.500178 0.139983 0.99197 0.094763 0.486327 0.763815 0.932324 0.5 [...]
+0.88865 0.067558 0.392284 0.702569 0.558996 0.129976 0.888643 0.372822 0.193265 0.286023 0.473818 0.909439 0.624201 0.197743 0.42951 0.401051 0.523015 0.508303 0.285916 0.856018 0.591137 0.590858 0.360958 0.722367 0.307294 0.030601 0.471525 0.086928 0.237153 0.711465 0.753987 0.831433 0.675449 0.817271 0.56896 0.217357 0.703397 0.725467 0.865385 0.107722 0.725691 0.798347 0.250639 0.60542 0.337363 0.766128 0.212503 0.664603 0.945891 0.80347 0.027318 0.158545 0.096948 0.069072 0.764798 0. [...]
+0.138186 0.788999 0.49607 0.101206 0.442603 0.63756 0.445324 0.042782 0.007917 0.656261 0.027496 0.893033 0.853819 0.811826 0.651631 0.469763 0.24234 0.063661 0.576976 0.639952 0.628228 0.89943 0.56163 0.52757 0.792644 0.369207 0.281744 0.959792 0.013064 0.513088 0.041636 0.095391 0.342793 0.299659 0.708113 0.482953 0.089359 0.525492 0.750516 0.774865 0.244021 0.960539 0.524699 0.725419 0.722187 0.562258 0.261595 0.837999 0.707424 0.722329 0.956752 0.355273 0.644425 0.809991 0.337159 0.9 [...]
+0.005967 0.004183 0.170632 0.05805 0.545392 0.757319 0.295938 0.484721 0.455447 0.829427 0.827884 0.019932 0.428629 0.507746 0.059582 0.878028 0.895357 0.746444 0.47252 0.96983 0.18766 0.044311 0.497625 0.996518 0.305628 0.97641 0.601965 0.117627 0.239 0.378576 0.103727 0.541162 0.569701 0.661081 0.534141 0.661524 0.780739 0.348399 0.187821 0.990878 0.509581 0.168981 0.811605 0.566355 0.77751 0.08039 0.770052 0.081363 0.781142 0.058325 0.120747 0.890797 0.173704 0.195471 0.242082 0.93949 [...]
+0.132134 0.531242 0.578329 0.553824 0.442238 0.356381 0.321224 0.832012 0.83017 0.430322 0.394865 0.336991 0.337778 0.949727 0.112091 0.345276 0.870268 0.951828 0.921785 0.030743 0.990652 0.178762 0.789731 0.485675 0.84738 0.192641 0.612819 0.527382 0.794322 0.376598 0.032745 0.623988 0.175484 0.080787 0.372993 0.407451 0.443991 0.168816 0.953218 0.514594 0.478958 0.218312 0.699633 0.016494 0.233485 0.796853 0.946532 0.469034 0.642536 0.632048 0.419096 0.807881 0.525593 0.124297 0.690173 [...]
+0.339652 0.974668 0.509371 0.239155 0.098125 0.910488 0.192242 0.546423 0.072129 0.172604 0.533759 0.131891 0.429047 0.808845 0.600736 0.530442 0.421037 0.856709 0.798326 0.625842 0.374988 0.718172 0.27058 0.864729 0.741969 0.852645 0.684744 0.289436 0.7914 0.337687 0.375498 0.383232 0.965861 0.910692 0.824977 0.577345 0.547133 0.245255 0.602975 0.272795 0.953097 0.861772 0.353399 0.406795 0.450477 0.123793 0.865242 0.998814 0.783191 0.738063 0.480128 0.663414 0.612381 0.66606 0.377443 0 [...]
+0.689739 0.140874 0.937174 0.559507 0.91614 0.164238 0.269575 0.077217 0.592773 0.582601 0.480342 0.25056 0.151336 0.636245 0.827534 0.436938 0.186852 0.994231 0.172615 0.669318 0.476749 0.240727 0.888979 0.239211 0.967113 0.199509 0.145835 0.922445 0.248722 0.140982 0.289728 0.467513 0.555383 0.245415 0.843341 0.059081 0.313876 0.899947 0.308972 0.302986 0.859464 0.072601 0.299387 0.647484 0.041587 0.553663 0.108262 0.28529 0.613496 0.465554 0.249824 0.436443 0.750296 0.486259 0.974489  [...]
+0.202061 0.883906 0.602254 0.541456 0.480045 0.960024 0.089916 0.65218 0.438518 0.508124 0.016977 0.928754 0.336577 0.430424 0.847425 0.669945 0.592193 0.792623 0.162979 0.191164 0.368572 0.324902 0.061809 0.653493 0.381078 0.318308 0.216639 0.111625 0.691992 0.48444 0.582394 0.421155 0.470291 0.199572 0.512565 0.419582 0.43544 0.767702 0.367402 0.401642 0.37417 0.465272 0.280131 0.822023 0.038695 0.161861 0.451953 0.644076 0.771506 0.612652 0.880361 0.327587 0.751766 0.976158 0.2459 0.2 [...]
+0.422195 0.265029 0.678523 0.653483 0.68923 0.097047 0.346988 0.037998 0.809065 0.113227 0.653656 0.624368 0.142616 0.998755 0.098102 0.809862 0.361059 0.6951 0.41931 0.651482 0.880381 0.237559 0.908758 0.962242 0.605084 0.651854 0.677292 0.703352 0.358371 0.336546 0.044573 0.45189 0.258296 0.961881 0.577316 0.866471 0.285788 0.439465 0.915257 0.58977 0.90283 0.169771 0.793662 0.920275 0.849614 0.036954 0.555576 0.640686 0.825242 0.844049 0.715743 0.737937 0.615038 0.493033 0.163367 0.45 [...]
+0.78627 0.176774 0.145493 0.100441 0.519315 0.96784 0.09444 0.755731 0.614058 0.113625 0.62375 0.246468 0.463549 0.013262 0.88257 0.258078 0.156329 0.460709 0.382004 0.878223 0.059006 0.587661 0.252542 0.713387 0.916301 0.22691 0.696408 0.614673 0.631465 0.356925 0.724054 0.372195 0.126074 0.017775 0.905662 0.834595 0.485556 0.751063 0.32449 0.85863 0.336202 0.450579 0.124706 0.385393 0.01713 0.808194 0.997883 0.709859 0.075256 0.132782 0.332209 0.998872 0.408025 0.611626 0.523286 0.2716 [...]
+0.53451 0.694411 0.425494 0.436407 0.957625 0.210949 0.537054 0.24349 0.813474 0.061737 0.622866 0.623263 0.311607 0.895193 0.226678 0.957967 0.373801 0.808982 0.044975 0.153247 0.299124 0.666161 0.904191 0.019941 0.162617 0.616048 0.439853 0.674814 0.629578 0.892145 0.687221 0.711071 0.012254 0.373538 0.480597 0.821678 0.62435 0.863997 0.374753 0.016262 0.868387 0.632423 0.497286 0.066177 0.512124 0.923318 0.312805 0.499893 0.437976 0.229503 0.949692 0.635566 0.864792 0.190258 0.347204  [...]
+0.351145 0.085273 0.207066 0.673041 0.939047 0.453572 0.313227 0.726417 0.52908 0.879256 0.860057 0.330092 0.441316 0.182185 0.17833 0.078584 0.267011 0.010162 0.186641 0.631474 0.414364 0.049549 0.086563 0.938672 0.731891 0.363955 0.802373 0.217647 0.667067 0.657414 0.994672 0.31189 0.328359 0.439982 0.514317 0.64991 0.104971 0.369113 0.43214 0.102223 0.604508 0.008417 0.818234 0.677652 0.850371 0.917896 0.055575 0.571205 0.639802 0.323584 0.840041 0.502984 0.032113 0.88529 0.423222 0.4 [...]
+0.969767 0.919933 0.925472 0.396128 0.219448 0.730317 0.486078 0.817301 0.803817 0.880951 0.80761 0.221118 0.267697 0.631434 0.757974 0.960233 0.123843 0.165789 0.539006 0.829569 0.687936 0.125713 0.520845 0.967401 0.461819 0.95194 0.265214 0.969438 0.455914 0.29504 0.875358 0.775619 0.552692 0.209563 0.73857 0.985717 0.908225 0.585886 0.955629 0.080398 0.660505 0.097978 0.954146 0.87003 0.055328 0.379052 0.105868 0.454609 0.43244 0.100843 0.483342 0.920523 0.712837 0.088864 0.511418 0.1 [...]
+0.76043 0.355562 0.950166 0.166371 0.299253 0.068837 0.614611 0.891171 0.355515 0.911714 0.023262 0.838292 0.796498 0.618585 0.719141 0.782703 0.842851 0.271985 0.596856 0.242529 0.58939 0.952526 0.897465 0.482188 0.756357 0.391751 0.820206 0.930302 0.321023 0.951543 0.923012 0.006967 0.142639 0.944909 0.789669 0.485773 0.709781 0.66631 0.906498 0.615257 0.71882 0.072731 0.941655 0.633104 0.917214 0.796577 0.107874 0.336768 0.522996 0.087526 0.171637 0.808739 0.753994 0.614224 0.562433 0 [...]
+0.33984 0.707257 0.874762 0.84055 0.546236 0.428075 0.996137 0.541593 0.641672 0.825398 0.20568 0.84657 0.037201 0.605701 0.255697 0.918527 0.004295 0.321381 0.033491 0.468327 0.978256 0.854326 0.381528 0.161516 0.778202 0.897399 0.844742 0.41072 0.979797 0.558853 0.396061 0.821083 0.205163 0.322735 0.818298 0.651344 0.136068 0.47932 0.723029 0.928115 0.902767 0.594287 0.578214 0.445805 0.017476 0.811177 0.541569 0.320709 0.972051 0.279983 0.568216 0.655482 0.60416 0.509316 0.893304 0.33 [...]
+0.388968 0.004709 0.172846 0.948396 0.670717 0.532512 0.624544 0.013306 0.319131 0.53103 0.16984 0.230485 0.007421 0.664191 0.501681 0.736368 0.785462 0.374767 0.713174 0.493977 0.915015 0.794237 0.161126 0.539607 0.033059 0.04231 0.897337 0.378604 0.260946 0.892939 0.925967 0.667679 0.801691 0.891482 0.212656 0.692777 0.168485 0.121167 0.666292 0.337054 0.898491 0.982916 0.498936 0.198336 0.251362 0.727945 0.716235 0.545155 0.434845 0.523323 0.16013 0.984974 0.612643 0.623252 0.164887 0 [...]
+0.311517 0.033276 0.983088 0.303046 0.770796 0.463192 0.073198 0.581757 0.495159 0.134981 0.829926 0.079157 0.278784 0.047259 0.842365 0.837903 0.294659 0.794594 0.94714 0.754205 0.120643 0.580177 0.017723 0.353133 0.879453 0.535453 0.924142 0.771608 0.00584 0.485406 0.246329 0.85133 0.940992 0.951599 0.26338 0.238185 0.62959 0.389367 0.559935 0.586864 0.784843 0.602555 0.953793 0.133314 0.251759 0.575213 0.033649 0.805561 0.514582 0.370914 0.474429 0.81739 0.398124 0.600039 0.781879 0.4 [...]
+0.294496 0.311554 0.148052 0.187112 0.725584 0.352471 0.035209 0.184987 0.258954 0.008073 0.269444 0.47511 0.018099 0.945204 0.935058 0.186699 0.004513 0.883586 0.093524 0.011457 0.395109 0.870189 0.140317 0.998601 0.669292 0.0326 0.180915 0.830514 0.384531 0.886111 0.659172 0.072618 0.167762 0.589315 0.927704 0.599271 0.554444 0.405159 0.96164 0.38458 0.808479 0.508891 0.111421 0.730352 0.064305 0.39994 0.839721 0.075454 0.107262 0.57766 0.15751 0.936114 0.065994 0.491573 0.21547 0.2922 [...]
+0.584851 0.767403 0.033917 0.429007 0.529107 0.721374 0.674649 0.00183 0.389184 0.971537 0.870063 0.072963 0.734297 0.39972 0.437563 0.40605 0.300772 0.200917 0.211143 0.584027 0.328567 0.005734 0.854265 0.974934 0.754499 0.725236 0.236238 0.627563 0.106817 0.299299 0.774462 0.836862 0.447672 0.791472 0.551729 0.655116 0.673844 0.371177 0.738298 0.525423 0.746832 0.927286 0.341406 0.455024 0.588956 0.494953 0.780775 0.727732 0.930029 0.095243 0.887295 0.84064 0.803714 0.109665 0.975034 0 [...]
+0.347699 0.424696 0.357867 0.356113 0.69681 0.374881 0.808899 0.600324 0.380637 0.460077 0.916758 0.190553 0.727693 0.194119 0.055198 0.562069 0.70339 0.463943 0.066865 0.115993 0.34624 0.173785 0.739067 0.396776 0.126254 0.090238 0.872825 0.221535 0.953169 0.808464 0.022815 0.400563 0.093196 0.450091 0.539801 0.885263 0.898913 0.615664 0.304246 0.261178 0.418569 0.867294 0.929591 0.935318 0.518145 0.892322 0.481114 0.823654 0.868306 0.914804 0.488188 0.856684 0.955151 0.814129 0.498404  [...]
+0.227883 0.314809 0.700075 0.8479 0.67552 0.864702 0.224561 0.274382 0.778691 0.977772 0.349187 0.479331 0.189755 0.007766 0.231089 0.708462 0.393809 0.132358 0.287785 0.466177 0.914922 0.038754 0.039645 0.208398 0.747522 0.160767 0.53214 0.36998 0.060919 0.869462 0.271941 0.246738 0.033261 0.875543 0.262316 0.611583 0.731137 0.824218 0.548609 0.837312 0.951421 0.160604 0.317461 0.902226 0.49524 0.507322 0.207759 0.951882 0.97069 0.607918 0.478325 0.094064 0.391957 0.262144 0.122825 0.87 [...]
+0.599368 0.129622 0.662298 0.427212 0.545279 0.697028 0.849694 0.315152 0.478055 0.018635 0.846135 0.846911 0.375653 0.043094 0.006395 0.470983 0.61437 0.415819 0.254436 0.866074 0.808683 0.375477 0.772898 0.088198 0.209113 0.255484 0.733505 0.657496 0.079177 0.682459 0.171847 0.403233 0.160456 0.900693 0.490889 0.963763 0.079892 0.826654 0.835125 0.300829 0.854575 0.909535 0.150803 0.041527 0.791318 0.953055 0.711091 0.333584 0.632108 0.254701 0.279116 0.031776 0.106223 0.528843 0.91323 [...]
+0.848935 0.305229 0.10525 0.699856 0.976981 0.456638 0.496264 0.063658 0.089748 0.345075 0.561768 0.165223 0.227993 0.631185 0.502419 0.828301 0.984132 0.478379 0.998848 0.621811 0.971552 0.18991 0.999437 0.939327 0.812455 0.475535 0.529158 0.387314 0.608026 0.822389 0.294464 0.202239 0.028774 0.476157 0.690106 0.826705 0.063651 0.143944 0.21926 0.250402 0.36095 0.359138 0.528591 0.90835 0.015992 0.20923 0.497343 0.364545 0.927909 0.432063 0.24751 0.851015 0.938217 0.760644 0.345463 0.14 [...]
+0.107934 0.558143 0.312761 0.501102 0.180624 0.139339 0.276542 0.553974 0.817243 0.88308 0.316202 0.21002 0.414099 0.892173 0.67007 0.488442 0.247219 0.517238 0.421382 0.302009 0.796015 0.703377 0.556241 0.480169 0.399039 0.241605 0.727142 0.088555 0.596503 0.054864 0.926941 0.325096 0.859239 0.514423 0.319749 0.03497 0.833214 0.972339 0.825791 0.240658 0.481997 0.168958 0.205668 0.977789 0.05263 0.81304 0.212889 0.553546 0.880903 0.085184 0.202527 0.86416 0.182303 0.911163 0.591884 0.46 [...]
+0.256979 0.363466 0.44878 0.502463 0.308309 0.869052 0.189631 0.591156 0.264084 0.933868 0.914163 0.754948 0.275351 0.867811 0.568287 0.036872 0.722385 0.185492 0.081731 0.961209 0.891007 0.564269 0.634404 0.590355 0.758701 0.318827 0.553502 0.205565 0.520539 0.530145 0.4774 0.494784 0.319126 0.141013 0.134997 0.783833 0.747049 0.129908 0.182188 0.166302 0.254172 0.135893 0.226551 0.750817 0.978358 0.983481 0.267 0.55896 0.616091 0.154539 0.90334 0.966862 0.379022 0.954561 0.965461 0.371 [...]
+0.337505 0.36789 0.693498 0.338744 0.595692 0.736759 0.747167 0.30457 0.449039 0.013315 0.379978 0.591856 0.632586 0.086903 0.702628 0.45899 0.090309 0.307643 0.056248 0.628789 0.503815 0.608477 0.212597 0.608592 0.266401 0.29678 0.636645 0.228992 0.565503 0.660308 0.923606 0.764929 0.134081 0.477177 0.038521 0.525963 0.696708 0.665574 0.28671 0.427139 0.485552 0.778722 0.746978 0.318679 0.364775 0.588214 0.106403 0.892628 0.461303 0.460719 0.954044 0.56622 0.62936 0.119244 0.116023 0.15 [...]
+0.139234 0.236009 0.864227 0.42989 0.556655 0.037671 0.353798 0.38042 0.897229 0.285648 0.141389 0.646819 0.91506 0.901408 0.066298 0.899413 0.836872 0.372604 0.129624 0.437961 0.725516 0.958634 0.631358 0.316665 0.170675 0.829088 0.852348 0.327172 0.773224 0.446458 0.489798 0.548148 0.325248 0.999604 0.869895 0.995998 0.536277 0.886101 0.645543 0.309215 0.986799 0.9056 0.006181 0.85525 0.759342 0.11735 0.645984 0.082491 0.612675 0.772608 0.764036 0.906763 0.438198 0.25826 0.893151 0.495 [...]
+0.507774 0.683637 0.675543 0.503973 0.645972 0.566222 0.636804 0.345782 0.418208 0.538346 0.699059 0.668372 0.224322 0.252966 0.199248 0.091727 0.166586 0.855167 0.080131 0.300229 0.102282 0.113755 0.034161 0.895075 0.924206 0.925977 0.737465 0.074262 0.44904 0.251566 0.149843 0.941413 0.40419 0.818581 0.030301 0.436106 0.096254 0.128904 0.93434 0.400663 0.965375 0.686967 0.724528 0.24524 0.857718 0.007202 0.645259 0.343767 0.46591 0.100467 0.830395 0.455934 0.728579 0.617236 0.722025 0. [...]
+0.90139 0.703697 0.52855 0.452491 0.21136 0.73007 0.581932 0.468146 0.715842 0.497434 0.809644 0.54178 0.598507 0.816199 0.486036 0.998362 0.449609 0.636796 0.415133 0.333253 0.415656 0.418394 0.738738 0.122584 0.641541 0.711477 0.199909 0.551571 0.068212 0.381725 0.906648 0.622807 0.071203 0.129207 0.216857 0.079462 0.746881 0.253021 0.985233 0.065929 0.367506 0.302012 0.571257 0.391819 0.547755 0.640438 0.198316 0.589785 0.255685 0.741261 0.292649 0.406503 0.557368 0.102042 0.267552 0. [...]
+0.468229 0.219225 0.687288 0.819506 0.047553 0.440402 0.108897 0.375617 0.943797 0.739308 0.631719 0.186643 0.433757 0.814662 0.653547 0.457187 0.26557 0.672977 0.802655 0.961394 0.037861 0.903798 0.892493 0.310024 0.114025 0.969597 0.016182 0.494681 0.62097 0.879122 0.99341 0.995257 0.573946 0.643283 0.072221 0.040187 0.288264 0.938628 0.71867 0.675234 0.207414 0.656839 0.170334 0.013026 0.913421 0.670738 0.379485 0.30034 0.870481 0.416211 0.396652 0.036604 0.239849 0.102019 0.172279 0. [...]
+0.746897 0.491175 0.854074 0.465148 0.083426 0.050661 0.560072 0.505308 0.776511 0.478481 0.604227 0.592569 0.639491 0.035783 0.397712 0.93223 0.103317 0.984685 0.090148 0.0109 0.745575 0.406932 0.141589 0.878578 0.060378 0.2006 0.814596 0.469171 0.051297 0.699503 0.847674 0.77442 0.167722 0.159403 0.717731 0.939016 0.873298 0.695211 0.747768 0.396099 0.222586 0.187045 0.710213 0.318539 0.051202 0.562419 0.164842 0.556854 0.752772 0.397455 0.9132 0.320817 0.551292 0.415889 0.756211 0.626 [...]
+0.49387 0.86226 0.589447 0.343363 0.086848 0.66664 0.65036 0.151407 0.140692 0.69553 0.553996 0.08321 0.660444 0.003101 0.294265 0.80949 0.377428 0.088269 0.838769 0.333633 0.62601 0.453573 0.665504 0.277591 0.71925 0.005186 0.241362 0.717942 0.684672 0.496703 0.273378 0.843049 0.702572 0.131842 0.770957 0.368113 0.21762 0.018711 0.486509 0.065988 0.093371 0.816118 0.422473 0.660099 0.443332 0.259667 0.58411 0.417924 0.567525 0.268899 0.534183 0.533689 0.652651 0.437807 0.701697 0.765289 [...]
+0.799843 0.979549 0.929335 0.775802 0.496205 0.921086 0.298707 0.325858 0.083484 0.408312 0.637622 0.191499 0.37155 0.343455 0.976746 0.780107 0.853923 0.490946 0.899782 0.402409 0.276388 0.908725 0.27307 0.657987 0.25894 0.11954 0.099104 0.197354 0.803637 0.207637 0.959926 0.029085 0.26232 0.519414 0.449108 0.736275 0.098754 0.431624 0.840185 0.014538 0.349265 0.920426 0.111588 0.918006 0.923073 0.447186 0.608513 0.946288 0.113516 0.641615 0.918588 0.420055 0.357127 0.074816 0.834488 0. [...]
+0.124772 0.378599 0.401743 0.937627 0.571676 0.589452 0.979655 0.494112 0.000623 0.085201 0.851458 0.725517 0.938334 0.569545 0.463363 0.132561 0.882327 0.845462 0.988253 0.350426 0.133978 0.847415 0.287092 0.169538 0.06933 0.995179 0.229193 0.650386 0.900305 0.950618 0.355905 0.309316 0.506493 0.020518 0.478845 0.867772 0.340721 0.32704 0.598566 0.162104 0.783841 0.868756 0.493808 0.805331 0.950059 0.379782 0.298187 0.737555 0.635294 0.292148 0.149559 0.241805 0.022976 0.404383 0.769181 [...]
+0.64247 0.542288 0.194222 0.094675 0.110633 0.212547 0.815956 0.551073 0.725959 0.560304 0.013776 0.188871 0.123321 0.906769 0.799336 0.675454 0.174796 0.023679 0.761339 0.145249 0.71916 0.73123 0.05999 0.44803 0.265271 0.292644 0.336778 0.451217 0.063925 0.692364 0.139088 0.136839 0.28279 0.617651 0.214181 0.730484 0.772371 0.233684 0.230977 0.961413 0.784711 0.682879 0.815756 0.864141 0.187035 0.115851 0.570995 0.003351 0.572755 0.940747 0.124908 0.164723 0.37359 0.352537 0.809985 0.76 [...]
+0.216229 0.73495 0.565489 0.41249 0.282292 0.977339 0.549535 0.69499 0.269576 0.333948 0.969989 0.532344 0.975039 0.598139 0.014465 0.0495 0.359584 0.761272 0.000994 0.551439 0.11523 0.183262 0.898877 0.435956 0.706143 0.665569 0.584822 0.337613 0.164288 0.864709 0.89817 0.533857 0.911309 0.199453 0.297481 0.228052 0.416127 0.632379 0.97116 0.554305 0.732508 0.467579 0.687363 0.9296 0.013242 0.474711 0.311833 0.809161 0.655856 0.857307 0.297075 0.079454 0.412372 0.976285 0.859331 0.56914 [...]
+0.222345 0.576924 0.229783 0.900046 0.156992 0.111557 0.634425 0.563435 0.60376 0.267906 0.100001 0.366606 0.867623 0.314519 0.394746 0.259057 0.036652 0.427722 0.732028 0.435656 0.433152 0.015366 0.123076 0.774718 0.939727 0.458575 0.136121 0.91518 0.352918 0.234951 0.649232 0.180403 0.276664 0.65285 0.309198 0.489297 0.503128 0.294144 0.552271 0.209022 0.734572 0.409359 0.688692 0.079538 0.880007 0.857868 0.272811 0.935909 0.443979 0.930122 0.465763 0.183555 0.038171 0.654934 0.672952  [...]
+0.305468 0.062958 0.706472 0.941111 0.529805 0.067714 0.532909 0.956278 0.915872 0.591934 0.892338 0.827273 0.060404 0.953588 0.251825 0.352282 0.835899 0.094731 0.593712 0.54084 0.264844 0.558339 0.944784 0.985115 0.76582 0.881484 0.093386 0.863527 0.360112 0.30481 0.358007 0.248969 0.760764 0.388581 0.906914 0.172506 0.879989 0.707734 0.358637 0.19756 0.65654 0.508835 0.692789 0.8459 0.507027 0.894827 0.774261 0.178367 0.999901 0.195119 0.97074 0.88486 0.412322 0.724608 0.98469 0.07514 [...]
+0.603298 0.116184 0.949993 0.481664 0.616828 0.500903 0.898607 0.153517 0.578701 0.675213 0.404684 0.792588 0.689848 0.753278 0.385313 0.988148 0.642406 0.073269 0.421146 0.747717 0.862385 0.383267 0.209753 0.900284 0.925277 0.674125 0.980665 0.815788 0.923473 0.861247 0.167831 0.422874 0.301559 0.72047 0.023173 0.24048 0.33105 0.323419 0.462387 0.680959 0.507059 0.282535 0.664419 0.499805 0.250502 0.018964 0.015808 0.093207 0.695341 0.5562 0.053401 0.414984 0.711268 0.780839 0.547379 0. [...]
+0.459545 0.273476 0.847212 0.252037 0.980943 0.370712 0.285425 0.213374 0.29121 0.213613 0.851242 0.795218 0.140149 0.097116 0.181531 0.595368 0.939391 0.778607 0.986309 0.84654 0.388926 0.422076 0.717154 0.780933 0.140314 0.022575 0.442107 0.189006 0.206377 0.041223 0.430629 0.197206 0.624032 0.684301 0.041165 0.520114 0.206078 0.027918 0.478693 0.066316 0.115612 0.162902 0.646942 0.115866 0.490511 0.047917 0.751162 0.392638 0.805143 0.577775 0.539223 0.484931 0.310548 0.95109 0.65663 0 [...]
+0.199045 0.881723 0.955136 0.035487 0.022426 0.142035 0.534555 0.65771 0.749205 0.032128 0.872651 0.190363 0.726874 0.161177 0.185931 0.031315 0.432143 0.673485 0.451458 0.575593 0.689558 0.837251 0.465126 0.259817 0.527947 0.391145 0.794038 0.586946 0.673302 0.005208 0.048445 0.468621 0.923636 0.97009 0.41398 0.074189 0.074007 0.47792 0.025407 0.583495 0.383966 0.426169 0.603136 0.085768 0.193759 0.720807 0.818507 0.281924 0.294911 0.213776 0.138474 0.589472 0.255549 0.431196 0.635492 0 [...]
+0.914321 0.871789 0.663848 0.877689 0.479422 0.137445 0.347553 0.615376 0.387175 0.751806 0.82346 0.180775 0.407029 0.273765 0.092571 0.143508 0.149706 0.272078 0.515306 0.696599 0.357436 0.568842 0.06984 0.109966 0.534856 0.010393 0.03365 0.736318 0.637493 0.30514 0.302045 0.986961 0.468197 0.966018 0.355311 0.682906 0.733201 0.413107 0.668182 0.353956 0.48057 0.068903 0.467111 0.718487 0.886186 0.482055 0.129242 0.057889 0.185045 0.128885 0.827258 0.409508 0.905595 0.336718 0.675853 0. [...]
+0.730074 0.699205 0.434232 0.118069 0.806467 0.000562 0.867538 0.264935 0.431619 0.475952 0.625946 0.596694 0.223188 0.45119 0.732869 0.986568 0.503705 0.875159 0.477277 0.492662 0.06951 0.270318 0.890417 0.428474 0.702289 0.58248 0.008321 0.41694 0.0386 0.623381 0.933415 0.231507 0.879209 0.229005 0.636763 0.163678 0.819571 0.393609 0.999313 0.776906 0.806478 0.593853 0.286554 0.508313 0.358375 0.006959 0.46779 0.69476 0.977393 0.048141 0.252317 0.59075 0.213549 0.189618 0.274323 0.1130 [...]
+0.839572 0.355619 0.373344 0.605119 0.386151 0.59055 0.990736 0.897609 0.663535 0.725984 0.561545 0.561492 0.209156 0.011095 0.555355 0.582065 0.389792 0.870012 0.667795 0.458256 0.485184 0.297937 0.245414 0.921659 0.551183 0.52241 0.308068 0.521336 0.354292 0.333959 0.758398 0.35618 0.641509 0.531631 0.961604 0.509603 0.138423 0.08513 0.712499 0.766248 0.3025 0.377323 0.346831 0.436372 0.457623 0.391887 0.750782 0.546578 0.093686 0.724004 0.714707 0.639964 0.952399 0.356881 0.416493 0.1 [...]
+0.060953 0.911829 0.391738 0.470617 0.235977 0.818728 0.03828 0.509014 0.680152 0.025592 0.479937 0.672323 0.449205 0.70215 0.562917 0.084139 0.56511 0.236603 0.839425 0.749985 0.843154 0.967107 0.033135 0.191039 0.219397 0.865513 0.380277 0.282653 0.655181 0.053074 0.459212 0.252535 0.405379 0.569412 0.771496 0.895685 0.738792 0.5864 0.170938 0.212991 0.963306 0.539544 0.078607 0.286883 0.874004 0.019019 0.59948 0.823451 0.110677 0.802399 0.863968 0.866384 0.908685 0.514463 0.664323 0.3 [...]
+0.551144 0.908982 0.61657 0.117648 0.898815 0.831567 0.420121 0.662557 0.394441 0.321947 0.192531 0.036146 0.75948 0.270177 0.723736 0.877053 0.278447 0.652929 0.168833 0.403815 0.665779 0.977932 0.235084 0.618531 0.224653 0.795484 0.716521 0.492935 0.774142 0.631336 0.293134 0.952799 0.296078 0.998457 0.320919 0.014669 0.583007 0.14014 0.343261 0.347503 0.260845 0.664376 0.249676 0.49733 0.267324 0.654943 0.025942 0.896894 0.793701 0.090306 0.724993 0.832378 0.116761 0.005066 0.55488 0. [...]
+0.056224 0.647067 0.937609 0.696969 0.670884 0.76162 0.693109 0.288522 0.712506 0.915946 0.503806 0.64454 0.236059 0.784642 0.991947 0.066956 0.62045 0.589905 0.987042 0.17305 0.950198 0.124626 0.354002 0.600418 0.376356 0.037404 0.295293 0.473353 0.67759 0.568805 0.901989 0.45535 0.300791 0.116364 0.516858 0.320534 0.59956 0.983132 0.88149 0.522965 0.632351 0.422543 0.765567 0.693216 0.057957 0.726947 0.799523 0.625883 0.814155 0.652304 0.818954 0.422134 0.158187 0.390345 0.071513 0.733 [...]
+0.714538 0.73767 0.801317 0.919037 0.327205 0.366484 0.074097 0.431202 0.008444 0.856023 0.225292 0.989513 0.170412 0.563752 0.215102 0.640652 0.183581 0.414409 0.784369 0.648751 0.227231 0.876673 0.026859 0.152652 0.149222 0.440856 0.733592 0.294651 0.724181 0.153216 0.751041 0.703455 0.074819 0.019192 0.903355 0.259651 0.633956 0.415694 0.490041 0.146857 0.38492 0.998379 0.225083 0.761894 0.063538 0.933627 0.110503 0.201559 0.614688 0.732767 0.716536 0.746602 0.045386 0.09256 0.759378  [...]
+0.070956 0.234143 0.785302 0.369289 0.646769 0.929191 0.444987 0.775925 0.901422 0.235524 0.823259 0.953849 0.285094 0.005831 0.11153 0.927994 0.080941 0.650129 0.516614 0.616804 0.661758 0.718305 0.79719 0.916864 0.001168 0.775085 0.449597 0.62826 0.583343 0.262542 0.677483 0.943494 0.350728 0.261171 0.301393 0.46956 0.194684 0.93086 0.363474 0.913918 0.737437 0.673108 0.787894 0.763146 0.467509 0.689337 0.240845 0.452134 0.93402 0.372602 0.626933 0.814026 0.734655 0.656264 0.199278 0.8 [...]
+0.032303 0.566995 0.752617 0.997373 0.592955 0.138227 0.859297 0.012532 0.85194 0.428746 0.345167 0.074033 0.872087 0.638938 0.647834 0.362898 0.269974 0.121165 0.344797 0.410967 0.063056 0.354864 0.353798 0.215849 0.93796 0.697293 0.220506 0.041905 0.614758 0.324114 0.441152 0.806303 0.634939 0.493911 0.50687 0.677554 0.901359 0.247838 0.126997 0.301588 0.200307 0.494946 0.442639 0.57063 0.945999 0.840627 0.340165 0.928236 0.540422 0.821839 0.742787 0.276247 0.971778 0.515804 0.647017 0 [...]
+0.398187 0.80943 0.213286 0.145047 0.517213 0.178259 0.291979 0.939981 0.505579 0.873893 0.758465 0.259172 0.272688 0.17698 0.146046 0.89735 0.396689 0.074025 0.111892 0.394576 0.335009 0.418962 0.729069 0.076942 0.077416 0.63999 0.941956 0.733608 0.696595 0.851687 0.670873 0.220285 0.152324 0.512463 0.080295 0.575857 0.631135 0.868961 0.615061 0.350712 0.116729 0.293218 0.390723 0.905168 0.111049 0.256618 0.234891 0.622059 0.653904 0.611245 0.58898 0.518641 0.267402 0.859805 0.801597 0. [...]
+0.199663 0.323856 0.386131 0.611572 0.704118 0.261914 0.310946 0.376949 0.395325 0.871675 0.22666 0.071076 0.511061 0.917382 0.74902 0.453251 0.018802 0.89107 0.728105 0.084712 0.737218 0.039807 0.689461 0.506709 0.687926 0.304304 0.462656 0.258683 0.475102 0.106356 0.090783 0.026238 0.229716 0.407081 0.140549 0.542825 0.698853 0.545422 0.214423 0.477526 0.740026 0.174617 0.692477 0.792022 0.986304 0.504156 0.150755 0.371607 0.442618 0.388419 0.244615 0.596887 0.575848 0.34579 0.693027 0 [...]
+0.334124 0.45172 0.50827 0.36866 0.336947 0.39781 0.631052 0.903793 0.638932 0.674317 0.658701 0.331324 0.943407 0.27231 0.180192 0.825925 0.782683 0.728708 0.19712 0.332383 0.949339 0.240101 0.926821 0.93053 0.132731 0.374603 0.62969 0.116581 0.125776 0.44443 0.879405 0.28561 0.598524 0.09183 0.884103 0.770956 0.427653 0.67861 0.177947 0.695135 0.066311 0.036506 0.328895 0.637146 0.227113 0.319447 0.928479 0.91588 0.099845 0.51914 0.546821 0.692736 0.950597 0.5343 0.327996 0.89961 0.654 [...]
+0.562084 0.900677 0.760158 0.177478 0.400682 0.422745 0.890636 0.416445 0.991415 0.651356 0.139188 0.066914 0.714213 0.32918 0.906314 0.516117 0.887159 0.048335 0.493583 0.799833 0.653571 0.11382 0.229265 0.905049 0.05214 0.270043 0.725778 0.086503 0.043687 0.218098 0.010012 0.646518 0.122945 0.657707 0.725547 0.151167 0.991693 0.994641 0.467932 0.740872 0.869816 0.005384 0.989873 0.83084 0.558414 0.350625 0.367837 0.216568 0.574213 0.41187 0.530479 0.237672 0.889696 0.403065 0.925945 0. [...]
+0.187149 0.439222 0.162439 0.978331 0.015905 0.991594 0.086069 0.463412 0.091377 0.70197 0.783205 0.126988 0.588273 0.047893 0.354046 0.50604 0.23232 0.428713 0.292782 0.648359 0.307038 0.396642 0.81509 0.10542 0.479585 0.862564 0.502793 0.102485 0.04232 0.523356 0.711675 0.738591 0.212318 0.248616 0.855505 0.878775 0.964157 0.632878 0.577839 0.603403 0.117339 0.552685 0.472843 0.781913 0.550455 0.303929 0.833873 0.501995 0.592207 0.565911 0.484132 0.053756 0.550816 0.056743 0.311344 0.9 [...]
+0.080588 0.20582 0.123851 0.526344 0.173037 0.360111 0.421634 0.234978 0.656059 0.532918 0.4541 0.739299 0.127061 0.440985 0.507554 0.756052 0.577254 0.743139 0.941927 0.978166 0.042131 0.115198 0.672584 0.798604 0.689891 0.512412 0.124951 0.120344 0.987464 0.712505 0.46078 0.990645 0.720805 0.572322 0.777905 0.064545 0.142759 0.889201 0.201494 0.934093 0.551242 0.758201 0.776913 0.510792 0.087075 0.916588 0.050711 0.038636 0.051586 0.740774 0.638725 0.970696 0.570953 0.436707 0.246113 0 [...]
+0.691804 0.931943 0.325475 0.277093 0.130932 0.818705 0.005774 0.836809 0.623904 0.999867 0.425585 0.029241 0.456365 0.567834 0.273166 0.038163 0.183302 0.336555 0.757713 0.144021 0.400067 0.391896 0.395882 0.367421 0.712388 0.771702 0.842811 0.913103 0.294434 0.533652 0.814263 0.678724 0.082435 0.407166 0.869291 0.758546 0.313994 0.928943 0.660153 0.635331 0.298638 0.798019 0.519161 0.308621 0.94496 0.831009 0.200469 0.268133 0.62137 0.695899 0.221312 0.725356 0.030418 0.997284 0.566516 [...]
+0.814029 0.9523 0.183537 0.57202 0.789873 0.980521 0.869246 0.216447 0.95435 0.323266 0.872703 0.477145 0.197279 0.69986 0.052913 0.922179 0.393703 0.955699 0.377781 0.711827 0.32839 0.075817 0.8233 0.472272 0.37005 0.054133 0.024527 0.725015 0.802339 0.350642 0.335115 0.277899 0.150461 0.913298 0.337313 0.658567 0.30041 0.797282 0.532337 0.577683 0.349597 0.355946 0.176035 0.597392 0.39622 0.17022 0.273689 0.233975 0.922679 0.302277 0.681256 0.454924 0.913341 0.238563 0.393273 0.36644 0 [...]
+0.631308 0.109432 0.586774 0.375874 0.537588 0.057804 0.748359 0.91542 0.284739 0.759378 0.907267 0.059642 0.357239 0.056088 0.652673 0.543714 0.75468 0.993293 0.006712 0.390227 0.470582 0.821459 0.349082 0.891886 0.665301 0.29884 0.477227 0.189907 0.041465 0.291861 0.785708 0.749845 0.652754 0.826146 0.586762 0.396548 0.642987 0.785859 0.715323 0.724534 0.872476 0.149731 0.254393 0.703616 0.384061 0.81385 0.149872 0.832639 0.374795 0.149006 0.31592 0.89402 0.981473 0.194134 0.810662 0.9 [...]
+0.14057 0.305541 0.499625 0.007331 0.120862 0.712601 0.861523 0.27045 0.243376 0.046702 0.114305 0.994695 0.201659 0.564248 0.57784 0.364261 0.274838 0.138424 0.064416 0.594618 0.038149 0.79551 0.093764 0.737953 0.708537 0.738995 0.748439 0.231576 0.168454 0.216309 0.1017 0.284754 0.402155 0.203889 0.156059 0.699499 0.244711 0.621834 0.601215 0.863026 0.405535 0.018103 0.499384 0.950276 0.769908 0.38683 0.452667 0.981603 0.873557 0.173982 0.581327 0.696021 0.908792 0.483774 0.359298 0.55 [...]
+0.498586 0.804178 0.06834 0.603504 0.926706 0.40471 0.630109 0.79214 0.246288 0.202619 0.463491 0.44655 0.795908 0.090217 0.231207 0.514696 0.340973 0.87636 0.714966 0.433577 0.806434 0.205623 0.249301 0.875049 0.388447 0.639054 0.571354 0.997558 0.267336 0.502315 0.62308 0.933209 0.187698 0.461676 0.788823 0.967886 0.541633 0.541269 0.571426 0.880024 0.047052 0.763894 0.64044 0.45835 0.984556 0.354726 0.076322 0.63136 0.570329 0.752905 0.631186 0.586568 0.688365 0.231255 0.90713 0.81383 [...]
+0.430009 0.362179 0.732632 0.159073 0.532951 0.453425 0.073068 0.082342 0.352015 0.8813 0.135138 0.735135 0.050741 0.236584 0.90518 0.67301 0.990331 0.682171 0.801159 0.93857 0.102183 0.148989 0.491978 0.945782 0.143471 0.231969 0.909922 0.336592 0.587579 0.00868 0.615413 0.157598 0.066976 0.235236 0.131876 0.926382 0.085809 0.535755 0.654421 0.256738 0.469452 0.663747 0.656649 0.882748 0.190942 0.035497 0.720829 0.680674 0.666367 0.094199 0.362646 0.9264 0.483469 0.864122 0.298835 0.815 [...]
+0.986279 0.341614 0.681155 0.588139 0.0471 0.354341 0.249471 0.569744 0.735618 0.988932 0.116179 0.318114 0.115306 0.624815 0.269364 0.162899 0.245879 0.329852 0.550459 0.013321 0.93195 0.425982 0.241999 0.204569 0.887791 0.31923 0.851989 0.343247 0.780246 0.141882 0.567485 0.215808 0.330879 0.406143 0.178731 0.766276 0.207193 0.065748 0.1154 0.760305 0.614362 0.671519 0.079644 0.61929 0.074353 0.382406 0.155327 0.294756 0.10871 0.512018 0.281223 0.046704 0.73244 0.92325 0.21055 0.07608  [...]
+0.793802 0.086523 0.895224 0.908655 0.531523 0.753439 0.02208 0.096665 0.007822 0.551055 0.798341 0.812137 0.088146 0.512524 0.088616 0.443038 0.322287 0.947335 0.308278 0.184746 0.949712 0.589789 0.037634 0.270694 0.086723 0.641704 0.155787 0.027382 0.6616 0.737508 0.235716 0.460484 0.670125 0.329211 0.996713 0.107498 0.226421 0.22159 0.78785 0.71027 0.166609 0.762165 0.502119 0.768464 0.661544 0.685521 0.952113 0.657979 0.993615 0.543629 0.217521 0.524181 0.271006 0.509188 0.675768 0.0 [...]
+0.38938 0.821202 0.104716 0.136581 0.697331 0.54607 0.008021 0.552335 0.203597 0.501455 0.548489 0.57888 0.39456 0.156464 0.506664 0.652587 0.289131 0.830102 0.604029 0.463894 0.817945 0.221587 0.019538 0.367188 0.897448 0.856228 0.716783 0.461865 0.989911 0.235976 0.769301 0.253194 0.263717 0.570234 0.055358 0.085231 0.068237 0.24484 0.104647 0.38115 0.530631 0.5796 0.494505 0.28895 0.67841 0.747146 0.160221 0.703783 0.853238 0.83941 0.737796 0.408862 0.527114 0.676304 0.020097 0.226637 [...]
+0.086653 0.274789 0.495856 0.586158 0.041713 0.401607 0.279151 0.245282 0.322663 0.02255 0.528651 0.483078 0.254918 0.957928 0.120831 0.790425 0.282433 0.54998 0.009873 0.867008 0.113021 0.315295 0.54965 0.766045 0.247469 0.751775 0.505609 0.270238 0.102066 0.59701 0.216145 0.263961 0.676092 0.299828 0.399211 0.121153 0.064533 0.492242 0.204083 0.859002 0.936889 0.034241 0.096789 0.952377 0.71453 0.906429 0.566958 0.354384 0.946356 0.237477 0.385444 0.620814 0.605764 0.294884 0.814389 0. [...]
+0.153961 0.600862 0.281649 0.126052 0.871177 0.076225 0.727343 0.716236 0.844494 0.975636 0.237688 0.542265 0.550329 0.902496 0.65001 0.164476 0.539729 0.760098 0.238847 0.848517 0.69043 0.569728 0.315223 0.557951 0.787962 0.939298 0.479285 0.215328 0.920832 0.275059 0.449186 0.997913 0.276886 0.259033 0.566352 0.480832 0.695339 0.085931 0.72283 0.707829 0.615005 0.414872 0.51296 0.560977 0.804964 0.309322 0.234984 0.959097 0.545515 0.157735 0.504795 0.099335 0.690015 0.600394 0.805981 0 [...]
+0.055532 0.691953 0.186627 0.096625 0.312178 0.767029 0.968286 0.207876 0.788815 0.331349 0.086235 0.055475 0.387704 0.62624 0.447376 0.778014 0.776463 0.021004 0.982717 0.601005 0.831402 0.562842 0.291096 0.213259 0.152791 0.25263 0.290978 0.547305 0.546763 0.352106 0.507377 0.685486 0.061579 0.399978 0.314461 0.33245 0.516476 0.80645 0.402628 0.769925 0.765084 0.206974 0.977261 0.889677 0.266401 0.613735 0.988471 0.009775 0.337598 0.274346 0.560931 0.544582 0.553993 0.564746 0.877074 0 [...]
+0.62643 0.468612 0.022092 0.268797 0.145673 0.653982 0.807671 0.416842 0.966746 0.882404 0.201866 0.287769 0.15602 0.370226 0.879165 0.809148 0.293322 0.110909 0.411093 0.297589 0.397963 0.109201 0.912309 0.229212 0.23643 0.891313 0.078658 0.32693 0.284812 0.230327 0.579385 0.740831 0.763406 0.612569 0.9966 0.552534 0.769292 0.807876 0.157222 0.261004 0.46512 0.581342 0.415461 0.004551 0.847075 0.959823 0.938974 0.029753 0.483661 0.709208 0.31799 0.266946 0.656497 0.747098 0.863311 0.164 [...]
+0.945285 0.527338 0.438826 0.535332 0.650805 0.402812 0.932789 0.435762 0.414642 0.95072 0.357514 0.461826 0.908774 0.567335 0.37174 0.239362 0.672431 0.693788 0.473445 0.579283 0.117793 0.733743 0.444066 0.530259 0.538112 0.505367 0.248047 0.957779 0.590522 0.145848 0.798204 0.334781 0.08366 0.293377 0.156192 0.070388 0.835213 0.54283 0.874255 0.945078 0.606666 0.846733 0.074242 0.371072 0.423324 0.948127 0.409918 0.450601 0.606381 0.188114 0.69586 0.1026 0.463814 0.44619 0.498504 0.443 [...]
+0.723172 0.110912 0.580835 0.370124 0.888192 0.010536 0.081835 0.838785 0.570279 0.133693 0.20077 0.10683 0.739679 0.991992 0.776658 0.282837 0.678397 0.264183 0.752538 0.289286 0.084204 0.568889 0.743565 0.373656 0.666753 0.973558 0.612665 0.187843 0.191365 0.012654 0.787357 0.742454 0.380061 0.843524 0.99803 0.474887 0.902805 0.460286 0.007076 0.287307 0.368351 0.326845 0.942799 0.675373 0.549027 0.534937 0.798108 0.993516 0.89733 0.818802 0.398644 0.599156 0.765616 0.412364 0.203354 0 [...]
+0.091569 0.941394 0.812582 0.48046 0.965205 0.087457 0.818546 0.198344 0.455214 0.482857 0.71206 0.024211 0.382439 0.02954 0.184899 0.254648 0.245759 0.836554 0.286228 0.175734 0.396116 0.372082 0.715552 0.493821 0.825148 0.284274 0.394258 0.835305 0.823678 0.008003 0.803259 0.932251 0.703431 0.546426 0.073448 0.23824 0.935758 0.174404 0.118816 0.239214 0.436079 0.322935 0.259679 0.952902 0.586884 0.939133 0.233248 0.758351 0.133347 0.632168 0.422291 0.276363 0.02824 0.964648 0.995875 0. [...]
+0.957975 0.531619 0.738261 0.999263 0.219954 0.163649 0.389644 0.301905 0.268539 0.7096 0.08139 0.329055 0.966368 0.190084 0.130446 0.377664 0.586047 0.43877 0.897931 0.812174 0.837295 0.031768 0.826528 0.59075 0.166811 0.152153 0.01114 0.622621 0.678784 0.614502 0.952763 0.354367 0.161668 0.453679 0.72794 0.581068 0.471207 0.380271 0.980383 0.26006 0.867302 0.596574 0.076528 0.99119 0.006645 0.519818 0.616673 0.040682 0.610371 0.67667 0.066792 0.572436 0.051105 0.017515 0.899855 0.15235 [...]
+0.608272 0.284894 0.059148 0.546791 0.975914 0.025099 0.758178 0.223328 0.657937 0.980314 0.827818 0.05881 0.539848 0.628829 0.534395 0.74851 0.079063 0.168072 0.274924 0.744854 0.104028 0.559799 0.991653 0.589881 0.610313 0.981185 0.672646 0.128268 0.416917 0.381943 0.377251 0.237024 0.933158 0.04365 0.863051 0.728426 0.903851 0.288279 0.402706 0.445315 0.066979 0.678166 0.839691 0.045599 0.122461 0.857849 0.723537 0.168526 0.065625 0.012454 0.738691 0.45277 0.700906 0.158374 0.276062 0 [...]
+0.732494 0.218114 0.9579 0.080511 0.879387 0.888762 0.083185 0.165418 0.467581 0.343051 0.047297 0.176711 0.670296 0.450297 0.967405 0.930145 0.410656 0.884386 0.808921 0.097999 0.609513 0.930264 0.761948 0.116656 0.803484 0.926624 0.028985 0.780168 0.225173 0.57148 0.745254 0.733307 0.510948 0.354064 0.043087 0.367642 0.084795 0.886562 0.528686 0.095528 0.523311 0.768886 0.030021 0.636478 0.319877 0.446892 0.049933 0.714684 0.100633 0.39679 0.477622 0.730162 0.27621 0.050366 0.857441 0. [...]
+0.93396 0.725647 0.152883 0.392426 0.409266 0.327932 0.377835 0.898307 0.494246 0.594883 0.963 0.814262 0.60461 0.155395 0.522696 0.26793 0.28652 0.269169 0.740585 0.988679 0.936132 0.683789 0.443092 0.592339 0.540143 0.325802 0.989087 0.997668 0.36126 0.515916 0.583803 0.499245 0.153722 0.662864 0.220751 0.85125 0.821646 0.940452 0.410773 0.252603 0.979853 0.845822 0.606759 0.784447 0.920484 0.819709 0.327557 0.171604 0.753799 0.527609 0.704033 0.601529 0.08529 0.261734 0.54419 0.459462 [...]
+0.990326 0.468423 0.048241 0.654564 0.127847 0.507345 0.449327 0.842694 0.467371 0.670939 0.286936 0.13714 0.993947 0.978967 0.928098 0.523568 0.205462 0.338566 0.640586 0.545607 0.274161 0.258259 0.61458 0.059952 0.445522 0.675834 0.839092 0.296592 0.881864 0.782028 0.443998 0.109495 0.790443 0.522086 0.497502 0.623397 0.410595 0.288985 0.40457 0.190278 0.295122 0.927566 0.785923 0.270401 0.215452 0.705119 0.127947 0.361353 0.0276 0.259015 0.280435 0.957688 0.171515 0.622069 0.664094 0. [...]
+0.833242 0.899138 0.489137 0.466517 0.469704 0.172735 0.617334 0.796951 0.776004 0.657747 0.772715 0.981021 0.32816 0.268773 0.308382 0.004857 0.860784 0.835854 0.35998 0.166301 0.045782 0.784257 0.798503 0.235614 0.187522 0.901832 0.162519 0.141925 0.725931 0.54163 0.395594 0.678141 0.479876 0.950143 0.536331 0.73111 0.617026 0.324624 0.233515 0.920231 0.536637 0.388683 0.124189 0.937422 0.61283 0.557209 0.407299 0.519499 0.505987 0.482189 0.97876 0.39802 0.850678 0.904 0.899445 0.43219 [...]
+0.607357 0.353982 0.160008 0.700677 0.534549 0.956284 0.453608 0.582295 0.374006 0.725448 0.427533 0.574332 0.542366 0.96266 0.035272 0.290326 0.342399 0.326714 0.747463 0.497182 0.173834 0.850799 0.570455 0.817278 0.796666 0.03435 0.945578 0.45685 0.589593 0.669982 0.04144 0.884278 0.638248 0.680942 0.357062 0.2266 0.142197 0.561235 0.935898 0.482214 0.899468 0.970198 0.29093 0.218715 0.732879 0.293111 0.618923 0.699726 0.25639 0.066926 0.122935 0.526842 0.932871 0.446775 0.11298 0.1879 [...]
+0.890659 0.944225 0.38397 0.526143 0.525198 0.761561 0.609906 0.426725 0.633204 0.451985 0.990625 0.386432 0.463081 0.63227 0.458287 0.466741 0.096692 0.314403 0.195988 0.783131 0.625159 0.760691 0.520509 0.805063 0.896699 0.016216 0.172557 0.775111 0.880448 0.631139 0.668589 0.432619 0.57119 0.861626 0.313083 0.804991 0.064579 0.362624 0.127651 0.469792 0.353324 0.586085 0.148485 0.464693 0.082116 0.409831 0.130858 0.747033 0.459697 0.133857 0.659129 0.712425 0.780323 0.731923 0.879644  [...]
+0.084688 0.567361 0.172937 0.886083 0.773522 0.125691 0.979732 0.410331 0.478574 0.657678 0.593502 0.785339 0.879783 0.622429 0.940621 0.726593 0.867873 0.186175 0.464718 0.006477 0.413372 0.633648 0.732092 0.150303 0.565809 0.045007 0.315264 0.919707 0.764816 0.93222 0.097253 0.928629 0.050146 0.119243 0.521585 0.543109 0.5424 0.724744 0.561666 0.521385 0.272008 0.120902 0.332315 0.257098 0.64189 0.964861 0.805889 0.089337 0.657992 0.081269 0.384958 0.752613 0.036973 0.945318 0.883556 0 [...]
+0.724415 0.40218 0.154985 0.453768 0.009985 0.847904 0.039422 0.207196 0.919522 0.582524 0.958623 0.637173 0.149529 0.395977 0.491719 0.066518 0.2884 0.425209 0.561409 0.064218 0.377536 0.395776 0.708163 0.150226 0.125023 0.860558 0.725433 0.041665 0.333074 0.96477 0.191877 0.024131 0.053565 0.425577 0.042172 0.937862 0.949876 0.050426 0.070601 0.716712 0.308803 0.392183 0.113918 0.093596 0.189626 0.322552 0.531306 0.059159 0.679999 0.353726 0.736058 0.852479 0.349952 0.032356 0.302989 0 [...]
+0.710375 0.789461 0.375353 0.377448 0.423795 0.261912 0.815118 0.968449 0.704131 0.095198 0.611557 0.701713 0.494266 0.592295 0.3711 0.959582 0.774696 0.930635 0.337507 0.639727 0.714579 0.249141 0.082676 0.035312 0.054228 0.87026 0.069699 0.481741 0.581905 0.345329 0.933072 0.473552 0.486797 0.751648 0.578053 0.331026 0.517886 0.548849 0.036389 0.523101 0.33651 0.565393 0.927954 0.267575 0.373234 0.183518 0.893039 0.379971 0.983288 0.531578 0.103017 0.820616 0.925254 0.326316 0.527108 0 [...]
+0.268837 0.348363 0.679092 0.829131 0.395272 0.10363 0.020989 0.993402 0.630702 0.608823 0.409695 0.49367 0.138275 0.86746 0.461028 0.728143 0.199823 0.338752 0.343603 0.407316 0.875967 0.641958 0.696969 0.947874 0.83177 0.583799 0.45781 0.587466 0.622478 0.811345 0.707786 0.849966 0.777554 0.836486 0.525702 0.240771 0.938289 0.494979 0.089128 0.26274 0.13815 0.49029 0.443076 0.479359 0.448269 0.857475 0.087473 0.0836 0.312847 0.119225 0.297204 0.787615 0.982737 0.908828 0.278023 0.94083 [...]
+0.456529 0.272814 0.648408 0.089075 0.516879 0.882195 0.210454 0.233043 0.088414 0.192884 0.826786 0.614339 0.028238 0.863958 0.067401 0.057711 0.716262 0.351904 0.862443 0.558383 0.089664 0.065627 0.192511 0.838598 0.622028 0.273892 0.874205 0.396804 0.957355 0.83106 0.691171 0.422017 0.591028 0.693776 0.587752 0.908698 0.626304 0.679126 0.436918 0.070532 0.41502 0.175448 0.800003 0.488827 0.416438 0.693779 0.902558 0.632358 0.099995 0.532239 0.267498 0.789561 0.444828 0.435922 0.826243 [...]
+0.526451 0.143809 0.419848 0.91777 0.446984 0.9201 0.639027 0.637537 0.512963 0.90755 0.497767 0.019261 0.084417 0.239225 0.167101 0.076009 0.169975 0.856424 0.46034 0.336098 0.271978 0.090795 0.968753 0.240775 0.034664 0.572706 0.62832 0.173035 0.680941 0.586579 0.860367 0.37183 0.616753 0.88399 0.184956 0.945849 0.389345 0.954599 0.488447 0.955233 0.82411 0.943276 0.223028 0.479107 0.634733 0.421413 0.717857 0.622284 0.04748 0.267714 0.755314 0.675999 0.892016 0.559009 0.33046 0.020018 [...]
+0.33021 0.873452 0.614282 0.934552 0.74234 0.715455 0.833907 0.037381 0.426179 0.115231 0.907945 0.347763 0.7984 0.940431 0.777896 0.074128 0.054015 0.304884 0.516404 0.863551 0.039023 0.62403 0.52971 0.452455 0.21399 0.282479 0.095648 0.961487 0.551728 0.447798 0.328681 0.476721 0.734341 0.643386 0.332035 0.579271 0.370806 0.112406 0.930619 0.785739 0.297212 0.054453 0.120422 0.553207 0.982982 0.901479 0.001025 0.396703 0.817303 0.18391 0.851497 0.874721 0.848864 0.604101 0.684112 0.687 [...]
+0.797927 0.921121 0.638698 0.868207 0.298303 0.076503 0.558986 0.652961 0.134813 0.194985 0.966897 0.540403 0.514646 0.732363 0.941422 0.026175 0.408556 0.479372 0.665158 0.149016 0.693662 0.337619 0.399039 0.492059 0.863078 0.108693 0.992202 0.131981 0.639882 0.762602 0.550844 0.997441 0.662791 0.365604 0.183216 0.40064 0.661881 0.319014 0.338128 0.852462 0.535059 0.467518 0.994348 0.424072 0.013484 0.732309 0.122542 0.262379 0.903672 0.21475 0.786226 0.189706 0.213944 0.10313 0.973003  [...]
+0.588593 0.424842 0.335112 0.548505 0.572715 0.228634 0.846041 0.743246 0.780938 0.746153 0.868904 0.704055 0.108169 0.497082 0.558535 0.370115 0.035043 0.289252 0.625867 0.231318 0.737537 0.193355 0.534817 0.122586 0.413727 0.639304 0.629119 0.076716 0.995498 0.85476 0.196265 0.168129 0.852493 0.784058 0.676727 0.061468 0.743458 0.166721 0.178756 0.403331 0.235054 0.516638 0.67993 0.855378 0.775746 0.664661 0.864465 0.810195 0.144433 0.289013 0.815243 0.245953 0.767546 0.703836 0.516791 [...]
+0.995031 0.942634 0.290564 0.388729 0.363747 0.63736 0.684537 0.106458 0.422636 0.460681 0.077507 0.020574 0.481309 0.482453 0.456822 0.431119 0.811888 0.113355 0.771785 0.728352 0.492109 0.826941 0.59689 0.614078 0.182562 0.038957 0.65429 0.704955 0.404316 0.910575 0.068745 0.242439 0.182497 0.344695 0.186964 0.560783 0.084454 0.361326 0.229333 0.182996 0.767288 0.356028 0.004929 0.6965 0.7297 0.505475 0.798767 0.704276 0.202848 0.561628 0.749559 0.308561 0.183732 0.272358 0.341215 0.75 [...]
+0.814451 0.134765 0.565691 0.066314 0.820713 0.459361 0.575891 0.533804 0.462287 0.575175 0.647997 0.198964 0.718233 0.021405 0.087262 0.379063 0.836974 0.337772 0.602746 0.222695 0.537099 0.944944 0.869228 0.500278 0.61197 0.105283 0.864054 0.857784 0.775145 0.333941 0.152498 0.727668 0.144059 0.991021 0.823941 0.201707 0.299532 0.800137 0.866534 0.089147 0.52435 0.453813 0.855049 0.908578 0.796391 0.742572 0.001308 0.750372 0.614526 0.918268 0.577712 0.012042 0.471514 0.313345 0.977426 [...]
+0.880984 0.298912 0.022857 0.276167 0.629449 0.00505 0.743356 0.11976 0.31475 0.991225 0.425174 0.652725 0.752898 0.751888 0.429614 0.075237 0.886583 0.27597 0.948849 0.901443 0.226686 0.401532 0.550327 0.090076 0.815579 0.126154 0.445495 0.432548 0.548292 0.714548 0.672536 0.525741 0.996211 0.098107 0.285021 0.616166 0.605824 0.185207 0.966736 0.91539 0.780758 0.902951 0.862405 0.162173 0.631734 0.44348 0.899554 0.212178 0.169719 0.640492 0.877619 0.852194 0.803973 0.915385 0.399073 0.7 [...]
+0.858299 0.574161 0.805959 0.641718 0.48358 0.170217 0.74445 0.922353 0.115963 0.322614 0.590058 0.965668 0.629474 0.7615 0.907891 0.736037 0.849912 0.407554 0.415386 0.117402 0.681157 0.042934 0.622251 0.264258 0.076975 0.931208 0.772561 0.03121 0.20575 0.985824 0.408834 0.484135 0.79866 0.698495 0.809154 0.032912 0.678125 0.290232 0.590887 0.790323 0.683367 0.421661 0.461942 0.032024 0.678733 0.928798 0.699465 0.342129 0.053843 0.850944 0.059817 0.017827 0.947079 0.156348 0.355197 0.48 [...]
+0.86492 0.489523 0.641294 0.47499 0.556158 0.725554 0.251307 0.835152 0.136861 0.977663 0.80857 0.214782 0.959483 0.923327 0.289515 0.30357 0.181061 0.205113 0.121902 0.414932 0.909279 0.556131 0.357528 0.570626 0.833932 0.027381 0.679953 0.517651 0.866585 0.691377 0.702187 0.245146 0.143171 0.075659 0.088406 0.966948 0.967258 0.259583 0.50798 0.961208 0.279433 0.367572 0.791557 0.060833 0.012484 0.113368 0.000311 0.658884 0.35846 0.514198 0.03256 0.320478 0.538844 0.055123 0.214431 0.81 [...]
+0.31658 0.720642 0.470504 0.59717 0.420339 0.721746 0.500058 0.145739 0.323258 0.295084 0.727463 0.157001 0.225131 0.337727 0.451329 0.415952 0.312853 0.96644 0.369111 0.884253 0.208104 0.541321 0.17443 0.700183 0.820901 0.107292 0.750299 0.805918 0.8386 0.791349 0.868283 0.849397 0.013968 0.100293 0.264306 0.48599 0.775792 0.305256 0.527096 0.176387 0.153823 0.846138 0.963546 0.152608 0.906878 0.311141 0.102241 0.358161 0.410572 0.629755 0.045696 0.888423 0.733854 0.57922 0.518553 0.969 [...]
+0.051013 0.451125 0.762032 0.292686 0.046606 0.207247 0.765371 0.264366 0.832991 0.782895 0.132444 0.386638 0.771732 0.15074 0.127205 0.34573 0.277489 0.5514 0.796759 0.095539 0.627682 0.139198 0.11172 0.306061 0.701814 0.994635 0.690745 0.659411 0.388575 0.715136 0.155139 0.450592 0.683479 0.652544 0.605298 0.367771 0.305847 0.617273 0.193456 0.634293 0.739718 0.082276 0.259269 0.486643 0.27794 0.710348 0.396847 0.331189 0.709111 0.974771 0.855233 0.927105 0.669313 0.111165 0.057719 0.5 [...]
+0.124246 0.151575 0.23699 0.078915 0.626799 0.809273 0.79133 0.64092 0.745728 0.72902 0.375191 0.489602 0.911211 0.199627 0.377465 0.796704 0.520432 0.363784 0.094857 0.964138 0.952311 0.329137 0.586645 0.474706 0.048356 0.050216 0.814157 0.034354 0.220174 0.165354 0.253494 0.894798 0.746933 0.871172 0.670693 0.073841 0.547667 0.053575 0.933763 0.857259 0.420442 0.893895 0.201369 0.638704 0.501206 0.050482 0.229378 0.785405 0.424557 0.347664 0.609252 0.16102 0.215041 0.964882 0.440475 0. [...]
+0.554595 0.567216 0.968588 0.323714 0.20393 0.038826 0.108904 0.068833 0.138914 0.578578 0.699356 0.614973 0.960621 0.564342 0.915595 0.139828 0.814529 0.151135 0.475151 0.706953 0.046546 0.486606 0.708359 0.563004 0.629903 0.894162 0.489888 0.103544 0.980644 0.970514 0.776568 0.588969 0.0017 0.062096 0.254994 0.147239 0.245347 0.810065 0.438931 0.589408 0.65689 0.175705 0.259081 0.43698 0.098445 0.453623 0.578513 0.354478 0.392103 0.557675 0.208134 0.846116 0.027345 0.131948 0.761289 0. [...]
+0.412178 0.109205 0.06451 0.05392 0.261102 0.559033 0.969597 0.821299 0.943604 0.400851 0.955179 0.104642 0.033333 0.202548 0.453628 0.701107 0.393467 0.180599 0.007724 0.259854 0.57745 0.049621 0.93243 0.325871 0.340764 0.35591 0.564764 0.906016 0.900689 0.533115 0.131577 0.007625 0.071541 0.348376 0.913966 0.847017 0.835666 0.95299 0.430707 0.225011 0.857891 0.847342 0.193168 0.014138 0.329415 0.240873 0.546084 0.24369 0.306475 0.988295 0.570454 0.398665 0.339427 0.445535 0.137993 0.26 [...]
+0.231436 0.004541 0.290151 0.12416 0.348542 0.702935 0.406806 0.874842 0.634158 0.455399 0.208067 0.829691 0.326477 0.876771 0.256056 0.726266 0.72102 0.243276 0.844533 0.273393 0.480632 0.589377 0.031125 0.420067 0.718954 0.177954 0.477556 0.582933 0.498917 0.623047 0.877816 0.556525 0.747207 0.240492 0.984968 0.365803 0.04129 0.616849 0.777257 0.084406 0.991941 0.100851 0.261672 0.416119 0.432984 0.14777 0.84765 0.60967 0.153427 0.370913 0.47249 0.700378 0.595751 0.366504 0.42031 0.300 [...]
+0.516605 0.848908 0.02795 0.786699 0.778516 0.746177 0.07222 0.361415 0.792326 0.333535 0.66841 0.071192 0.906824 0.647947 0.039109 0.269272 0.008511 0.644703 0.862927 0.166903 0.149184 0.250941 0.792343 0.019872 0.004563 0.153378 0.979645 0.252123 0.617831 0.481908 0.391549 0.83916 0.629319 0.655778 0.11691 0.532947 0.863101 0.008009 0.907391 0.600545 0.408453 0.801862 0.607373 0.275287 0.846063 0.993717 0.710234 0.249494 0.603116 0.061518 0.724981 0.578065 0.988545 0.494638 0.490083 0. [...]
+0.746116 0.489619 0.707128 0.71082 0.477045 0.720346 0.30102 0.34014 0.764113 0.556501 0.936898 0.792299 0.851045 0.008423 0.191636 0.94395 0.567609 0.779993 0.817192 0.784011 0.54355 0.537857 0.843073 0.405909 0.667604 0.567005 0.454834 0.888346 0.004642 0.84721 0.185008 0.268253 0.55389 0.419543 0.857036 0.928688 0.295655 0.367946 0.967145 0.193294 0.474041 0.336729 0.499046 0.850043 0.765219 0.079923 0.860519 0.586122 0.427081 0.066924 0.51153 0.538485 0.743389 0.449837 0.430894 0.162 [...]
+0.937103 0.276726 0.575311 0.684779 0.725571 0.614638 0.923671 0.240161 0.510724 0.611694 0.847 0.682752 0.060682 0.305822 0.201802 0.480402 0.53879 0.124219 0.796934 0.22499 0.854473 0.123224 0.84555 0.398833 0.275101 0.436965 0.419841 0.26417 0.807396 0.229642 0.759908 0.528453 0.24795 0.860793 0.379822 0.238186 0.533292 0.526309 0.574663 0.504792 0.025649 0.457043 0.451423 0.644439 0.077238 0.651644 0.780513 0.289223 0.688571 0.530814 0.906687 0.071082 0.270987 0.968253 0.346946 0.031 [...]
+0.28008 0.431379 0.811688 0.540613 0.113025 0.467827 0.915978 0.819029 0.25961 0.743203 0.791495 0.156959 0.558956 0.441763 0.027929 0.096173 0.724284 0.69487 0.297466 0.591677 0.461254 0.511648 0.396369 0.530813 0.858054 0.769176 0.334916 0.956918 0.922255 0.384573 0.812801 0.277103 0.357774 0.311444 0.782401 0.366318 0.361443 0.294445 0.353268 0.32588 0.221699 0.121438 0.921492 0.507604 0.597436 0.446188 0.771296 0.941975 0.436279 0.048803 0.192784 0.958885 0.360019 0.499764 0.475006 0 [...]
+0.631681 0.64719 0.178149 0.170871 0.095385 0.129535 0.995852 0.083179 0.801166 0.33099 0.125987 0.855188 0.845268 0.596736 0.420141 0.56348 0.152189 0.837876 0.441034 0.242924 0.190473 0.562907 0.165858 0.06228 0.525926 0.670409 0.595026 0.697333 0.896159 0.597257 0.636216 0.074344 0.702556 0.678116 0.630528 0.128547 0.904045 0.389668 0.439128 0.391784 0.596845 0.660558 0.19484 0.369805 0.950143 0.169206 0.465603 0.383435 0.088677 0.828768 0.027745 0.162411 0.497013 0.498278 0.466362 0. [...]
+0.378511 0.891963 0.23928 0.801018 0.54593 0.027149 0.191271 0.291591 0.111541 0.195794 0.25329 0.951977 0.260516 0.424597 0.745472 0.671983 0.092386 0.242594 0.838586 0.095523 0.478798 0.631396 0.231136 0.095627 0.939838 0.481825 0.723907 0.368675 0.210306 0.409469 0.429376 0.581847 0.490447 0.332672 0.060583 0.566638 0.421333 0.71323 0.424626 0.155045 0.486559 0.657172 0.430677 0.342263 0.056962 0.697084 0.728248 0.081385 0.535887 0.962034 0.98603 0.646754 0.212427 0.423957 0.336867 0. [...]
+0.725158 0.47946 0.606441 0.637026 0.752115 0.78996 0.643043 0.897487 0.942094 0.889886 0.344859 0.254881 0.164926 0.801545 0.968995 0.799677 0.909684 0.765821 0.43206 0.587028 0.941619 0.846318 0.232969 0.726144 0.396957 0.45995 0.83404 0.067339 0.145005 0.729336 0.627934 0.074737 0.620583 0.190573 0.525698 0.011051 0.79738 0.161487 0.66329 0.699253 0.822326 0.591434 0.899474 0.15341 0.080094 0.730685 0.553731 0.43546 0.228117 0.901436 0.458178 0.001962 0.027663 0.393492 0.757167 0.9147 [...]
+0.669998 0.181746 0.249516 0.318517 0.169447 0.543751 0.252051 0.411013 0.503421 0.532794 0.892778 0.099719 0.340036 0.129875 0.356025 0.928655 0.391874 0.813142 0.571149 0.847773 0.507995 0.410263 0.599173 0.89558 0.512592 0.030933 0.686337 0.058011 0.354807 0.643772 0.662985 0.794039 0.116622 0.837245 0.408735 0.692789 0.469505 0.666626 0.854252 0.268386 0.043683 0.804728 0.972458 0.226434 0.713718 0.782161 0.903947 0.589407 0.662652 0.812295 0.027122 0.594152 0.192326 0.568387 0.49615 [...]
+0.396839 0.889835 0.4264 0.875582 0.796385 0.695701 0.853562 0.908144 0.497502 0.628229 0.698805 0.690662 0.548306 0.242069 0.802074 0.382819 0.696908 0.139963 0.375407 0.035655 0.922909 0.197488 0.518719 0.180551 0.499304 0.028378 0.026567 0.673287 0.28637 0.276852 0.752045 0.229125 0.471488 0.98746 0.7688 0.984728 0.62446 0.751754 0.715862 0.29816 0.59565 0.203511 0.960926 0.406252 0.849283 0.551974 0.048293 0.724893 0.595642 0.381425 0.922897 0.744503 0.254499 0.717661 0.053676 0.3374 [...]
+0.232365 0.389297 0.441478 0.043418 0.694463 0.052969 0.074748 0.075258 0.119806 0.705604 0.568595 0.566551 0.294515 0.00684 0.24314 0.227176 0.794821 0.390889 0.497694 0.417405 0.902787 0.590465 0.454438 0.331558 0.155991 0.547416 0.961072 0.947258 0.141022 0.235764 0.743212 0.994956 0.669061 0.557731 0.069839 0.800429 0.459281 0.169997 0.118639 0.901282 0.475255 0.701918 0.131871 0.119015 0.30254 0.8809 0.211106 0.429524 0.745879 0.348206 0.504557 0.521833 0.679868 0.935954 0.498869 0. [...]
+0.065695 0.032482 0.9388 0.409451 0.528603 0.080462 0.898557 0.301978 0.796893 0.220414 0.721342 0.567166 0.326844 0.891318 0.091225 0.91816 0.057274 0.749693 0.09235 0.854231 0.641406 0.993643 0.949985 0.415534 0.288033 0.035469 0.359801 0.084123 0.998627 0.181253 0.424885 0.136642 0.680972 0.963569 0.452042 0.433455 0.524857 0.168205 0.645477 0.141435 0.858192 0.147414 0.811042 0.352153 0.732269 0.465057 0.1226 0.917993 0.440981 0.047274 0.537917 0.083953 0.664015 0.207612 0.292495 0.7 [...]
+0.926536 0.743137 0.736498 0.251061 0.491663 0.197267 0.535355 0.758627 0.703454 0.611775 0.384485 0.745461 0.048545 0.982759 0.790609 0.283688 0.549244 0.172771 0.258345 0.435838 0.661099 0.26801 0.360607 0.72653 0.792293 0.265589 0.225352 0.927394 0.581072 0.432008 0.981238 0.077052 0.728957 0.171715 0.978163 0.249111 0.730784 0.090137 0.233431 0.056306 0.904574 0.873076 0.17715 0.116356 0.770069 0.709804 0.399588 0.108296 0.771268 0.424201 0.151704 0.890782 0.139634 0.748241 0.30144 0 [...]
+0.819509 0.657943 0.383668 0.654837 0.166259 0.212591 0.961534 0.648535 0.707208 0.401734 0.864686 0.354031 0.048401 0.254351 0.424619 0.37951 0.509315 0.55875 0.243296 0.310374 0.380636 0.417435 0.983597 0.568072 0.363153 0.896629 0.781092 0.746138 0.520653 0.675071 0.988756 0.260286 0.943392 0.756642 0.492897 0.652144 0.471333 0.343298 0.63298 0.912992 0.417584 0.016803 0.754523 0.797174 0.600207 0.130536 0.108598 0.513682 0.508473 0.917932 0.927028 0.732153 0.364395 0.192236 0.516465  [...]
+0.656286 0.649881 0.349375 0.055165 0.202958 0.652965 0.711366 0.077157 0.202628 0.545609 0.686954 0.005234 0.913131 0.760815 0.014384 0.444008 0.910152 0.194167 0.956822 0.589622 0.932023 0.700394 0.224267 0.722537 0.512187 0.695135 0.292706 0.770776 0.409723 0.151025 0.875878 0.85207 0.790444 0.633047 0.649532 0.540403 0.967095 0.655646 0.459551 0.604689 0.689265 0.06698 0.787936 0.144097 0.999103 0.135097 0.45445 0.618058 0.428612 0.685904 0.473144 0.303001 0.915585 0.91914 0.437149 0 [...]
+0.010056 0.511475 0.809426 0.467976 0.806123 0.79029 0.29739 0.767301 0.074604 0.916143 0.940549 0.080903 0.008982 0.585309 0.022517 0.403288 0.69335 0.620107 0.739468 0.399652 0.769867 0.702203 0.409738 0.642946 0.739398 0.416324 0.817095 0.977507 0.057349 0.697117 0.541519 0.136608 0.251101 0.122979 0.512558 0.530157 0.829264 0.934488 0.058053 0.400424 0.999535 0.560225 0.205723 0.688719 0.031421 0.119015 0.041486 0.447503 0.168574 0.557114 0.00933 0.928894 0.560457 0.786905 0.314877 0 [...]
+0.961765 0.279355 0.294211 0.642849 0.296273 0.211892 0.755634 0.516918 0.187193 0.031347 0.859989 0.32156 0.979219 0.750594 0.222064 0.768148 0.162355 0.13196 0.719331 0.79091 0.842943 0.256649 0.997274 0.140401 0.561475 0.05291 0.098098 0.827938 0.683996 0.680567 0.573611 0.700104 0.613707 0.814732 0.501829 0.702754 0.406352 0.944642 0.591365 0.938954 0.563363 0.499529 0.551594 0.129964 0.425329 0.968739 0.750934 0.250322 0.747358 0.852988 0.492185 0.561418 0.471997 0.995153 0.064632 0 [...]
+0.544663 0.034828 0.291884 0.227539 0.215237 0.530618 0.903595 0.196368 0.521232 0.97289 0.328225 0.748674 0.179857 0.908493 0.542782 0.591154 0.171406 0.389896 0.786052 0.598828 0.662533 0.561621 0.437261 0.879362 0.818107 0.612865 0.362848 0.60655 0.186623 0.784776 0.193124 0.366169 0.492728 0.85764 0.889102 0.148895 0.510568 0.388941 0.923619 0.510842 0.765969 0.178745 0.816573 0.043192 0.427956 0.487271 0.539963 0.261102 0.242089 0.542674 0.490147 0.514704 0.125798 0.033805 0.586198  [...]
+0.495999 0.456472 0.595518 0.085743 0.444982 0.514538 0.643265 0.674829 0.785131 0.354686 0.549476 0.271534 0.707072 0.651856 0.255787 0.830008 0.482323 0.982586 0.777966 0.535366 0.306959 0.900104 0.425065 0.830133 0.293455 0.355777 0.768915 0.712699 0.971432 0.036134 0.49733 0.797244 0.337174 0.411922 0.749434 0.27234 0.147719 0.137471 0.031104 0.566193 0.622073 0.064807 0.517874 0.483325 0.250083 0.408875 0.48608 0.723494 0.766687 0.299457 0.634541 0.318901 0.447083 0.7447 0.083825 0. [...]
+0.517784 0.431936 0.355716 0.731059 0.381456 0.881391 0.034649 0.13565 0.947984 0.10091 0.653269 0.687752 0.993846 0.061286 0.368557 0.019123 0.651521 0.17589 0.357752 0.744685 0.296366 0.069908 0.87506 0.277621 0.742251 0.741334 0.279937 0.508774 0.703278 0.243888 0.395709 0.824448 0.080216 0.669781 0.258828 0.770674 0.809282 0.443463 0.75106 0.746725 0.715134 0.612158 0.902273 0.81431 0.715011 0.388818 0.872662 0.024883 0.254839 0.126071 0.610694 0.11354 0.930492 0.132822 0.286343 0.75 [...]
+0.97956 0.050991 0.324998 0.72907 0.810058 0.399616 0.866546 0.540145 0.985424 0.440033 0.216294 0.82174 0.94029 0.085015 0.606565 0.15651 0.407063 0.196542 0.705694 0.805127 0.40137 0.091877 0.17577 0.524726 0.234891 0.965617 0.248761 0.444977 0.832726 0.583143 0.625188 0.603464 0.82372 0.58655 0.538916 0.879739 0.011257 0.437059 0.820136 0.903144 0.348492 0.484035 0.323843 0.893499 0.76061 0.132665 0.738508 0.115468 0.079449 0.861353 0.215118 0.129079 0.766195 0.798487 0.366377 0.96334 [...]
+0.817125 0.083105 0.382358 0.390104 0.106449 0.662338 0.230951 0.381796 0.435354 0.547779 0.923418 0.570211 0.947427 0.321461 0.349842 0.029978 0.909595 0.877922 0.124721 0.527664 0.860025 0.713068 0.943702 0.815063 0.802642 0.035936 0.819481 0.687437 0.728566 0.834875 0.305212 0.716367 0.716878 0.199706 0.639074 0.74611 0.978739 0.221338 0.713369 0.13132 0.599851 0.864093 0.115825 0.247448 0.620831 0.767951 0.962135 0.260033 0.372774 0.696136 0.030583 0.870931 0.017014 0.549701 0.836699 [...]
+0.67766 0.9688 0.511457 0.197006 0.104862 0.850098 0.705767 0.160519 0.144774 0.463515 0.267034 0.583465 0.243211 0.855742 0.172899 0.401131 0.154832 0.75098 0.647823 0.55142 0.471875 0.354511 0.460711 0.930389 0.567037 0.907092 0.369464 0.340095 0.340475 0.91964 0.027779 0.263398 0.566899 0.668001 0.834554 0.410448 0.23892 0.719085 0.499086 0.676132 0.256511 0.416754 0.959948 0.140706 0.841679 0.127567 0.396232 0.667239 0.933961 0.698218 0.503358 0.114475 0.277504 0.745354 0.672239 0.07 [...]
+0.188194 0.916446 0.378401 0.903066 0.469368 0.750121 0.714268 0.828391 0.429663 0.941879 0.396653 0.508643 0.182937 0.155219 0.684303 0.183087 0.275031 0.956289 0.829019 0.534994 0.574615 0.173827 0.249811 0.44798 0.711755 0.058011 0.792501 0.899208 0.866715 0.450058 0.777197 0.981069 0.301091 0.121862 0.999054 0.117923 0.888931 0.860772 0.131825 0.334112 0.685387 0.68098 0.810737 0.717685 0.556929 0.646903 0.44384 0.002684 0.530958 0.591368 0.035909 0.017451 0.77761 0.91445 0.525806 0. [...]
+0.982452 0.029813 0.60538 0.969419 0.93749 0.038931 0.063132 0.960369 0.756532 0.774158 0.990455 0.757604 0.461224 0.855477 0.112838 0.29067 0.881648 0.31275 0.098772 0.661404 0.179347 0.829769 0.707915 0.426629 0.276245 0.057009 0.791118 0.300662 0.471435 0.267758 0.784396 0.283947 0.873574 0.989008 0.588844 0.482002 0.953592 0.808756 0.873435 0.439442 0.191882 0.700334 0.836109 0.478107 0.515103 0.698886 0.665234 0.583414 0.401745 0.261561 0.982996 0.693329 0.255584 0.585229 0.768886 0 [...]
+0.085996 0.364889 0.427003 0.85192 0.468818 0.708301 0.105484 0.85572 0.689103 0.937239 0.332932 0.74206 0.498085 0.0593 0.654736 0.633501 0.348228 0.043789 0.082638 0.175514 0.107444 0.207003 0.923622 0.57763 0.290274 0.13281 0.844775 0.379847 0.484876 0.526053 0.261928 0.637545 0.145971 0.925386 0.622589 0.489741 0.984223 0.756111 0.727237 0.355574 0.36336 0.945961 0.643343 0.509967 0.168658 0.72171 0.467204 0.098104 0.500043 0.515801 0.753463 0.559813 0.917014 0.531186 0.164697 0.7357 [...]
+0.666243 0.450826 0.539832 0.54932 0.402082 0.246839 0.440078 0.241887 0.919542 0.276749 0.388871 0.291211 0.095569 0.999848 0.471518 0.102279 0.782301 0.098311 0.173324 0.940945 0.775792 0.812297 0.817971 0.710792 0.459248 0.674777 0.994223 0.53624 0.767359 0.212686 0.467188 0.047717 0.296403 0.756214 0.876569 0.031756 0.646721 0.958945 0.181859 0.743799 0.04482 0.385147 0.157127 0.811305 0.599094 0.383629 0.356125 0.477347 0.39576 0.169382 0.753632 0.8465 0.767861 0.232487 0.71482 0.22 [...]
+0.874948 0.892803 0.563547 0.03665 0.976076 0.862268 0.178556 0.454853 0.233651 0.015603 0.902073 0.521093 0.705994 0.900525 0.471811 0.071482 0.373811 0.533987 0.274618 0.662091 0.632224 0.260964 0.685588 0.131292 0.884221 0.356567 0.668551 0.608821 0.042452 0.752029 0.930434 0.041901 0.036699 0.051018 0.376057 0.410899 0.811015 0.30504 0.835279 0.550214 0.021578 0.926582 0.267943 0.340854 0.469044 0.737912 0.817123 0.933564 0.372785 0.666275 0.351109 0.396894 0.531424 0.830102 0.823685 [...]
+0.134435 0.092456 0.855275 0.252983 0.653549 0.646471 0.909003 0.827509 0.341818 0.445489 0.311536 0.30632 0.863472 0.015877 0.156146 0.147321 0.225268 0.504984 0.78524 0.289406 0.014557 0.561743 0.957608 0.604799 0.915411 0.622884 0.638118 0.51452 0.693562 0.627723 0.014529 0.818024 0.759689 0.344872 0.406548 0.276014 0.023311 0.774797 0.574504 0.084871 0.379549 0.091307 0.834074 0.759719 0.275411 0.506823 0.203849 0.789507 0.775182 0.689308 0.326264 0.365952 0.315953 0.055922 0.973377  [...]
+0.69295 0.181133 0.55229 0.066516 0.840015 0.682958 0.695381 0.984363 0.62865 0.538343 0.832262 0.988333 0.113391 0.842194 0.266773 0.035002 0.544551 0.180123 0.50731 0.12499 0.395645 0.001451 0.468778 0.454113 0.25728 0.221707 0.090404 0.890139 0.171739 0.905897 0.814598 0.66065 0.084778 0.581131 0.227824 0.002299 0.125947 0.629772 0.60639 0.348817 0.190494 0.87347 0.528503 0.988152 0.893797 0.017803 0.249261 0.345566 0.90807 0.68812 0.388704 0.559618 0.775749 0.515511 0.166828 0.753527 [...]
+0.07824 0.668901 0.661081 0.577195 0.520099 0.542272 0.514841 0.946395 0.495659 0.350698 0.139194 0.953197 0.19988 0.223038 0.26756 0.725459 0.961607 0.162552 0.092969 0.142509 0.842267 0.587648 0.661826 0.131817 0.863621 0.289601 0.419579 0.46364 0.086403 0.234671 0.331582 0.354895 0.126416 0.258303 0.552558 0.436109 0.889615 0.64478 0.615402 0.524599 0.492187 0.26274 0.101354 0.184129 0.191707 0.334736 0.917231 0.64657 0.340549 0.564949 0.06069 0.028111 0.064942 0.509097 0.318293 0.259 [...]
+0.648559 0.417997 0.483381 0.130618 0.065516 0.839223 0.924609 0.498483 0.181996 0.863025 0.845393 0.429336 0.72415 0.143238 0.831431 0.777783 0.783701 0.26149 0.234661 0.917482 0.950288 0.382462 0.962461 0.600196 0.299632 0.392899 0.65832 0.252191 0.912069 0.272036 0.327039 0.036764 0.68575 0.535373 0.222262 0.312134 0.568983 0.875089 0.333038 0.982555 0.291497 0.509526 0.240764 0.71915 0.893823 0.751373 0.782204 0.019198 0.809465 0.855047 0.617476 0.513767 0.575785 0.566108 0.925963 0. [...]
+0.948791 0.187836 0.314643 0.556071 0.37584 0.23767 0.229395 0.021377 0.202605 0.4598 0.595543 0.73329 0.263102 0.468319 0.319842 0.580517 0.987379 0.336499 0.217921 0.095928 0.387881 0.076357 0.584108 0.691398 0.410003 0.88814 0.718522 0.604512 0.364527 0.444779 0.864294 0.7631 0.827125 0.428564 0.695789 0.874651 0.373121 0.969091 0.507941 0.083564 0.269633 0.859492 0.4366 0.493536 0.850529 0.638193 0.317951 0.133375 0.431386 0.919022 0.348139 0.574933 0.249662 0.912516 0.945529 0.94072 [...]
+0.017683 0.070476 0.686774 0.174072 0.485338 0.92228 0.40884 0.762746 0.221214 0.923767 0.39211 0.838867 0.208067 0.90645 0.641074 0.836503 0.192873 0.867798 0.8399 0.012077 0.598747 0.099478 0.795967 0.22687 0.066361 0.610504 0.352071 0.759222 0.099708 0.496469 0.583301 0.645333 0.312593 0.595314 0.366597 0.757162 0.781073 0.491505 0.487188 0.278748 0.903893 0.803555 0.601006 0.046752 0.799108 0.372207 0.616579 0.964825 0.043625 0.978367 0.985168 0.909616 0.839955 0.210091 0.150015 0.61 [...]
+0.51032 0.116113 0.907579 0.409524 0.714839 0.834825 0.46111 0.732667 0.394996 0.995968 0.007554 0.228321 0.557235 0.308968 0.188081 0.67209 0.141869 0.079729 0.203365 0.425746 0.789195 0.406977 0.635663 0.825776 0.896385 0.647145 0.201597 0.061203 0.65206 0.894285 0.67017 0.446712 0.451625 0.830131 0.262649 0.532559 0.876902 0.774402 0.921662 0.511018 0.165831 0.041519 0.862714 0.970531 0.676281 0.670934 0.656176 0.637612 0.748336 0.395754 0.182044 0.57781 0.141472 0.436573 0.746516 0.7 [...]
+0.430143 0.443207 0.58778 0.503118 0.446874 0.558566 0.670469 0.675777 0.68859 0.190449 0.335343 0.607065 0.077752 0.621556 0.202449 0.846734 0.392015 0.842918 0.890252 0.970475 0.913089 0.515853 0.184529 0.06042 0.637775 0.517191 0.423183 0.083555 0.789535 0.665405 0.948814 0.809231 0.251908 0.065719 0.74678 0.229523 0.285117 0.480152 0.908828 0.212488 0.66223 0.677631 0.482109 0.737264 0.577979 0.625602 0.199529 0.855269 0.524926 0.174896 0.704925 0.604961 0.005543 0.262472 0.663714 0. [...]
+0.163989 0.014223 0.275323 0.140407 0.49378 0.417284 0.99644 0.002759 0.842174 0.953078 0.608777 0.942743 0.432523 0.832696 0.639225 0.923589 0.014218 0.612202 0.420585 0.594886 0.553523 0.778796 0.850152 0.217595 0.994536 0.852597 0.264459 0.875664 0.729458 0.144904 0.552485 0.536913 0.511156 0.668448 0.730673 0.943275 0.4411 0.806674 0.495088 0.946234 0.338353 0.187464 0.25241 0.64133 0.300758 0.520331 0.9247 0.759367 0.973855 0.933085 0.870611 0.650564 0.234794 0.653787 0.602846 0.818 [...]
+0.502628 0.978069 0.60807 0.049489 0.691584 0.436029 0.694556 0.853343 0.437588 0.545702 0.233053 0.248011 0.944486 0.958575 0.805691 0.47914 0.268649 0.238169 0.593174 0.47469 0.8409 0.553743 0.710105 0.631531 0.279114 0.988081 0.288858 0.446538 0.295824 0.074147 0.79161 0.576922 0.052469 0.814514 0.300326 0.171927 0.375717 0.545013 0.034101 0.268409 0.601059 0.04133 0.031486 0.965747 0.253664 0.340796 0.317977 0.170599 0.012899 0.673069 0.056899 0.11525 0.376734 0.838847 0.10211 0.4514 [...]
+0.929428 0.890372 0.453912 0.641086 0.541416 0.721035 0.311053 0.212494 0.005056 0.7313 0.993017 0.541916 0.769071 0.245347 0.700325 0.138664 0.702469 0.253045 0.687339 0.174061 0.839652 0.165392 0.33372 0.773283 0.863395 0.189498 0.317893 0.809556 0.640407 0.397114 0.165348 0.722977 0.324761 0.286027 0.244998 0.538101 0.017831 0.259721 0.216114 0.58654 0.795664 0.385586 0.34941 0.727732 0.556767 0.032706 0.265562 0.54713 0.068862 0.604911 0.940034 0.873187 0.892989 0.669579 0.594326 0.9 [...]
+0.278727 0.702335 0.428243 0.823838 0.733769 0.183207 0.989961 0.18814 0.323469 0.970305 0.180078 0.678433 0.330029 0.48105 0.025454 0.40263 0.142643 0.884787 0.315081 0.181597 0.831929 0.083927 0.33216 0.621906 0.142152 0.952978 0.968517 0.305978 0.44658 0.616326 0.443686 0.669463 0.794101 0.802413 0.551234 0.514812 0.243086 0.846365 0.043843 0.236858 0.600165 0.79233 0.787071 0.605055 0.758962 0.449298 0.772552 0.794582 0.028028 0.157166 0.093641 0.947218 0.307748 0.084471 0.643619 0.8 [...]
+0.697041 0.227402 0.93346 0.42605 0.786845 0.91135 0.979249 0.283867 0.55126 0.208161 0.920094 0.170133 0.017128 0.214977 0.729829 0.30993 0.644836 0.9701 0.477123 0.624144 0.346174 0.42965 0.901123 0.473085 0.653761 0.012064 0.03185 0.422101 0.79169 0.306983 0.148858 0.022781 0.691604 0.197206 0.390426 0.663653 0.743739 0.922622 0.438389 0.05811 0.796917 0.889961 0.113751 0.966366 0.310411 0.677258 0.730617 0.481146 0.230615 0.028431 0.396861 0.991868 0.601779 0.264365 0.026073 0.056927 [...]
+0.407816 0.538191 0.21647 0.068138 0.136898 0.233729 0.50695 0.968474 0.611557 0.594097 0.490577 0.556542 0.198004 0.286325 0.212315 0.41362 0.064624 0.87943 0.59617 0.427582 0.346948 0.168063 0.128022 0.820984 0.484867 0.653123 0.967625 0.630753 0.736542 0.043681 0.296349 0.928931 0.826937 0.192574 0.409632 0.345803 0.459549 0.491384 0.949107 0.592099 0.724189 0.267789 0.697379 0.814282 0.744028 0.366614 0.828942 0.291507 0.544447 0.91484 0.849879 0.264919 0.546114 0.479627 0.949916 0.3 [...]
+0.057965 0.978139 0.11832 0.555084 0.642574 0.899115 0.155175 0.506988 0.547048 0.369961 0.54182 0.290473 0.89939 0.118807 0.285259 0.720306 0.844194 0.722315 0.873568 0.25774 0.807289 0.124279 0.394114 0.370653 0.18925 0.703721 0.001271 0.360089 0.260084 0.663638 0.137054 0.446887 0.636057 0.34065 0.831615 0.364125 0.310572 0.419598 0.058653 0.745031 0.245305 0.498635 0.082567 0.062881 0.890532 0.795076 0.334589 0.809771 0.14443 0.641254 0.84556 0.964001 0.822558 0.334274 0.649505 0.391 [...]
+0.249541 0.784324 0.673259 0.231957 0.45254 0.805389 0.165406 0.033509 0.372512 0.823775 0.776467 0.725055 0.580828 0.026306 0.900864 0.697466 0.63719 0.236547 0.341194 0.56362 0.286285 0.490981 0.757172 0.339417 0.629639 0.587727 0.463579 0.151829 0.526372 0.66845 0.264991 0.301192 0.964001 0.20079 0.44276 0.206526 0.224194 0.51495 0.211511 0.183249 0.746099 0.65329 0.943657 0.450575 0.611879 0.590398 0.910564 0.247899 0.408002 0.560577 0.697077 0.387358 0.359259 0.108547 0.085264 0.550 [...]
+0.543403 0.330417 0.845254 0.216885 0.837784 0.59073 0.716862 0.310857 0.93767 0.247404 0.644311 0.430732 0.902769 0.303207 0.532929 0.556055 0.31953 0.742636 0.580416 0.67205 0.185172 0.581595 0.450559 0.419825 0.760333 0.159466 0.390254 0.646736 0.626879 0.415719 0.445241 0.887239 0.904638 0.150105 0.695413 0.414577 0.264329 0.820351 0.103844 0.883548 0.216285 0.774472 0.041845 0.007259 0.512826 0.520747 0.224507 0.607254 0.688523 0.486823 0.94318 0.640403 0.574215 0.418464 0.443624 0. [...]
+0.449734 0.108791 0.577277 0.944855 0.345947 0.647275 0.223405 0.588293 0.60219 0.441256 0.972879 0.350975 0.717721 0.911011 0.700873 0.538206 0.184426 0.503099 0.846253 0.821484 0.127773 0.284241 0.458384 0.301943 0.686975 0.454453 0.600716 0.16637 0.352484 0.852036 0.844632 0.512781 0.70435 0.822794 0.656244 0.658501 0.854544 0.392444 0.130749 0.797717 0.817486 0.055247 0.105125 0.549945 0.617075 0.622399 0.605671 0.937858 0.750478 0.962437 0.850689 0.237557 0.157502 0.802719 0.03137 0 [...]
+0.828206 0.161998 0.385177 0.886575 0.21622 0.388051 0.288861 0.364084 0.639547 0.82717 0.457733 0.313791 0.906442 0.557796 0.357149 0.117686 0.382474 0.465365 0.582886 0.673998 0.644362 0.291103 0.779345 0.933744 0.992194 0.79839 0.922905 0.389201 0.233674 0.725704 0.489385 0.122762 0.232554 0.663465 0.683405 0.778069 0.035533 0.303191 0.298641 0.09804 0.71152 0.922649 0.696009 0.219908 0.835727 0.732813 0.635564 0.83133 0.641368 0.319655 0.016038 0.544345 0.171998 0.498228 0.393769 0.2 [...]
+0.794377 0.242763 0.973182 0.458066 0.210771 0.090193 0.779318 0.829115 0.830388 0.640671 0.529952 0.01722 0.535495 0.801173 0.914642 0.506486 0.037636 0.645984 0.270622 0.769666 0.818281 0.715671 0.341414 0.057206 0.703771 0.757207 0.980444 0.326677 0.102987 0.112702 0.729692 0.220014 0.66628 0.19814 0.147513 0.95392 0.6104 0.254579 0.360112 0.632371 0.117791 0.704942 0.967046 0.889 0.274583 0.898358 0.856352 0.691246 0.568728 0.095563 0.16251 0.084838 0.947636 0.069258 0.53465 0.649413 [...]
+0.008613 0.823847 0.534613 0.824153 0.63922 0.85879 0.555633 0.154208 0.053362 0.576893 0.395628 0.175636 0.471254 0.904978 0.922333 0.721239 0.633708 0.209245 0.817081 0.246452 0.265335 0.124073 0.370353 0.136244 0.091396 0.847552 0.654178 0.521499 0.281054 0.63078 0.721135 0.998491 0.099289 0.836588 0.941589 0.249126 0.519015 0.406465 0.116007 0.456786 0.181887 0.730001 0.138688 0.317126 0.286045 0.948603 0.08788 0.6309 0.580311 0.058367 0.461972 0.307913 0.888022 0.796168 0.825304 0.4 [...]
+0.419685 0.16104 0.114352 0.863469 0.123676 0.539356 0.54461 0.259324 0.54586 0.313851 0.385032 0.181111 0.31413 0.070443 0.05935 0.811654 0.961527 0.077728 0.885315 0.09171 0.503281 0.921316 0.250685 0.746241 0.927741 0.561897 0.147227 0.249103 0.161047 0.624383 0.387678 0.856083 0.579389 0.619551 0.235406 0.148463 0.413263 0.745477 0.184525 0.814191 0.315771 0.256755 0.895514 0.268319 0.926741 0.184259 0.94567 0.402964 0.038767 0.632138 0.839945 0.396746 0.766108 0.376499 0.906987 0.19 [...]
+0.345978 0.768414 0.36028 0.840142 0.779323 0.550458 0.367188 0.633428 0.829206 0.230233 0.212504 0.545992 0.328664 0.245035 0.449803 0.248248 0.431203 0.053477 0.692139 0.402919 0.527188 0.558652 0.920751 0.156127 0.188527 0.937786 0.294247 0.127046 0.381644 0.280942 0.4249 0.760793 0.393433 0.359086 0.077687 0.108948 0.623321 0.512133 0.200351 0.447531 0.610219 0.314149 0.712318 0.018622 0.082488 0.139219 0.102739 0.836554 0.436216 0.553829 0.994402 0.704723 0.295173 0.27576 0.861451 0 [...]
+0.344597 0.921128 0.828348 0.74695 0.747724 0.681817 0.721887 0.002555 0.174096 0.559839 0.30843 0.124629 0.016784 0.909135 0.331464 0.396423 0.935147 0.605229 0.230753 0.814805 0.301256 0.662043 0.180545 0.625023 0.370777 0.549713 0.939064 0.960095 0.295791 0.54789 0.029763 0.549919 0.915746 0.308748 0.316339 0.045164 0.50208 0.555906 0.527048 0.779164 0.877328 0.055783 0.022737 0.394361 0.312606 0.102192 0.326955 0.865042 0.762316 0.52617 0.498878 0.014416 0.068231 0.563746 0.085885 0. [...]
+0.883922 0.420192 0.04648 0.902735 0.507706 0.00198 0.901888 0.206652 0.108642 0.230785 0.541906 0.858903 0.137224 0.554324 0.176235 0.964497 0.3539 0.285566 0.305589 0.14196 0.530137 0.613646 0.715135 0.208538 0.125152 0.278849 0.843477 0.373284 0.730152 0.466599 0.769182 0.266632 0.023902 0.353139 0.080955 0.337635 0.12565 0.670528 0.001469 0.967535 0.673276 0.136775 0.182442 0.133945 0.009396 0.108709 0.885347 0.591084 0.511127 0.311968 0.89426 0.650798 0.079048 0.990063 0.733032 0.03 [...]
+0.498765 0.635005 0.411356 0.044482 0.434621 0.501313 0.89299 0.532179 0.216495 0.866683 0.045327 0.527896 0.583747 0.033581 0.632652 0.435592 0.625878 0.742089 0.256399 0.909541 0.608786 0.118565 0.818521 0.84773 0.031464 0.562469 0.332118 0.162445 0.8479 0.412293 0.544916 0.16155 0.319553 0.82433 0.144267 0.10195 0.872966 0.008862 0.636926 0.140639 0.696867 0.395117 0.437164 0.964363 0.596069 0.666515 0.335953 0.354903 0.688478 0.312628 0.990422 0.706813 0.051178 0.063504 0.830586 0.88 [...]
+0.741944 0.388672 0.879364 0.450894 0.204167 0.42057 0.56321 0.402655 0.639095 0.002904 0.710179 0.358387 0.721226 0.043112 0.018043 0.18411 0.911053 0.601622 0.758592 0.422039 0.421246 0.84197 0.948404 0.350475 0.546374 0.868376 0.718929 0.946365 0.759682 0.531143 0.365062 0.462862 0.694343 0.045429 0.840183 0.13056 0.070791 0.297383 0.562858 0.875896 0.719622 0.225561 0.779604 0.798893 0.527647 0.488694 0.700059 0.619648 0.557394 0.791611 0.28348 0.783176 0.517334 0.841021 0.925068 0.7 [...]
+0.088841 0.144644 0.150845 0.465291 0.50175 0.299847 0.753781 0.123235 0.550345 0.284082 0.399494 0.555385 0.958838 0.5107 0.417229 0.828819 0.027503 0.118791 0.177265 0.604039 0.673634 0.379901 0.184978 0.632855 0.841979 0.614842 0.355143 0.538217 0.759517 0.163606 0.611805 0.627944 0.073138 0.788981 0.755823 0.614304 0.493095 0.317783 0.398643 0.129074 0.74276 0.814334 0.80137 0.929502 0.144058 0.600921 0.884545 0.147246 0.532093 0.274415 0.277163 0.935373 0.324133 0.091056 0.122093 0. [...]
+0.024005 0.843774 0.258176 0.570466 0.697133 0.952506 0.327276 0.889487 0.7166 0.833309 0.509893 0.748723 0.528377 0.555449 0.191989 0.625953 0.713769 0.230884 0.032001 0.211897 0.894257 0.706913 0.225003 0.738393 0.511 0.899523 0.392674 0.42919 0.424197 0.996734 0.628049 0.117208 0.364569 0.573915 0.240954 0.306017 0.541422 0.466436 0.0981 0.20603 0.872431 0.642301 0.172016 0.863207 0.207545 0.389901 0.678694 0.30843 0.544145 0.781825 0.950994 0.837245 0.819861 0.385486 0.723251 0.02410 [...]
+0.302186 0.781115 0.424979 0.387012 0.929019 0.095012 0.262901 0.879279 0.256809 0.163271 0.824959 0.746548 0.767518 0.027842 0.686322 0.618588 0.615591 0.521683 0.150093 0.257343 0.290193 0.192514 0.426972 0.22912 0.281436 0.821226 0.126456 0.102476 0.591081 0.320903 0.757911 0.976859 0.567912 0.400134 0.953334 0.998655 0.153145 0.434319 0.199839 0.920941 0.83346 0.293589 0.417221 0.241533 0.239974 0.683381 0.851892 0.27638 0.942884 0.913894 0.510556 0.542175 0.469439 0.111319 0.587938  [...]
+0.043607 0.457882 0.491261 0.978139 0.360229 0.63252 0.149823 0.899469 0.446884 0.548606 0.98929 0.114094 0.545826 0.836316 0.016137 0.539103 0.780814 0.611842 0.554111 0.87921 0.198057 0.397859 0.947014 0.720571 0.429562 0.585112 0.533713 0.915453 0.638288 0.935282 0.941114 0.506506 0.923556 0.07003 0.460351 0.872614 0.915488 0.632867 0.060683 0.085702 0.642554 0.908422 0.335075 0.858467 0.466782 0.396958 0.829413 0.090254 0.903373 0.129734 0.112729 0.705725 0.19705 0.981946 0.759654 0. [...]
+0.347437 0.846946 0.95276 0.156363 0.565837 0.335082 0.189043 0.790623 0.496933 0.339261 0.610268 0.433547 0.114928 0.584895 0.813758 0.44152 0.487535 0.435664 0.842557 0.216774 0.791755 0.351927 0.011689 0.879074 0.489424 0.224247 0.559111 0.80933 0.928584 0.021902 0.533353 0.104529 0.368574 0.851107 0.452139 0.040788 0.335258 0.964762 0.616638 0.906366 0.686275 0.695526 0.9019 0.724234 0.360175 0.018773 0.827543 0.243014 0.605842 0.34666 0.175993 0.448087 0.403727 0.992162 0.374904 0.9 [...]
+0.10975 0.629037 0.771786 0.727177 0.856398 0.274967 0.270325 0.765302 0.04604 0.343794 0.793408 0.651077 0.674661 0.951925 0.311796 0.431739 0.146885 0.529003 0.846911 0.125349 0.061978 0.343099 0.909832 0.053449 0.595165 0.818965 0.860457 0.298664 0.993482 0.729275 0.550915 0.654831 0.266777 0.58308 0.262945 0.405671 0.281164 0.597481 0.909938 0.590442 0.248069 0.521488 0.019958 0.570397 0.130291 0.772931 0.601894 0.32583 0.453127 0.230109 0.039078 0.015692 0.56093 0.402664 0.63679 0.7 [...]
+0.015122 0.566676 0.833337 0.953399 0.291445 0.909935 0.838257 0.294238 0.600403 0.255159 0.872224 0.546954 0.051141 0.981881 0.055074 0.229642 0.048331 0.186734 0.595163 0.852174 0.019637 0.34658 0.176976 0.174619 0.707877 0.912866 0.957281 0.370013 0.700795 0.224421 0.500726 0.615698 0.65358 0.176998 0.961447 0.627617 0.176176 0.450371 0.078673 0.411117 0.053751 0.609779 0.725295 0.215405 0.575172 0.851059 0.015401 0.593753 0.392728 0.127995 0.656172 0.606877 0.117506 0.059594 0.731315 [...]
+0.400497 0.275149 0.573385 0.5793 0.869103 0.439747 0.348096 0.71674 0.166237 0.590605 0.107816 0.759408 0.279311 0.789435 0.649289 0.01838 0.047488 0.384179 0.383803 0.191287 0.34025 0.988827 0.645853 0.816553 0.224882 0.233731 0.623553 0.66515 0.455268 0.222044 0.844185 0.976603 0.957378 0.186614 0.856547 0.178299 0.125101 0.894155 0.63604 0.116459 0.353845 0.426116 0.32765 0.372264 0.281149 2.2e-05 0.944757 0.36057 0.192667 0.303637 0.070239 0.301927 0.86313 0.318831 0.722265 0.310113 [...]
+0.10915 0.124841 0.627145 0.007402 0.38179 0.665091 0.422527 0.182253 0.502971 0.584233 0.546476 0.554209 0.368942 0.131882 0.110407 0.305615 0.476629 0.196546 0.514839 0.395785 0.619766 0.871069 0.127776 0.669323 0.609335 0.807443 0.523925 0.371672 0.635821 0.9361 0.781101 0.57412 0.001723 0.641349 0.256865 0.423592 0.921964 0.243303 0.797019 0.438995 0.321074 0.775968 0.129559 0.401515 0.545574 0.04014 0.070346 0.898943 0.018607 0.80743 0.748099 0.678484 0.491208 0.097618 0.835945 0.79 [...]
+0.356898 0.126251 0.972059 0.261805 0.687262 0.512397 0.539497 0.836219 0.630031 0.134636 0.492884 0.195461 0.164552 0.753248 0.383231 0.014123 0.323769 0.855896 0.4212 0.392106 0.126572 0.106281 0.525278 0.653404 0.724641 0.198805 0.076867 0.833744 0.106446 0.591647 0.547573 0.1204 0.208768 0.213947 0.827543 0.350824 0.904599 0.886043 0.018349 0.605169 0.16932 0.564187 0.760063 0.716186 0.647508 0.44083 0.459449 0.805395 0.971756 0.378874 0.475065 0.861244 0.299091 0.562958 0.806946 0.4 [...]
+0.467567 0.472034 0.583808 0.321278 0.70196 0.104169 0.882357 0.474038 0.218815 0.53899 0.68595 0.258162 0.690495 0.871988 0.778154 0.667899 0.160048 0.958719 0.626982 0.08582 0.033426 0.305579 0.77741 0.095647 0.372496 0.163021 0.29453 0.329724 0.903068 0.369285 0.200347 0.551158 0.686208 0.363918 0.903222 0.614927 0.213611 0.489297 0.585161 0.899453 0.95006 0.433872 0.256778 0.085243 0.715933 0.010578 0.927652 0.822457 0.674341 0.52733 0.595241 0.775681 0.553464 0.959838 0.896547 0.532 [...]
+0.222116 0.268123 0.654488 0.222193 0.465979 0.129217 0.204236 0.783553 0.483138 0.908962 0.331642 0.832692 0.467055 0.408248 0.723411 0.446139 0.912244 0.339602 0.747147 0.958035 0.067882 0.035159 0.917472 0.735745 0.454583 0.1567 0.801065 0.723247 0.815526 0.584502 0.500301 0.973039 0.687661 0.355482 0.815383 0.100567 0.052441 0.012357 0.168063 0.439476 0.240824 0.143171 0.072228 0.018177 0.825719 0.590595 0.971365 0.121356 0.947308 0.709128 0.012155 0.52016 0.391085 0.170074 0.141107  [...]
+0.610428 0.106525 0.386665 0.448676 0.593647 0.041589 0.39226 0.351042 0.309589 0.437994 0.035891 0.279307 0.234921 0.51234 0.845188 0.459948 0.893866 0.201495 0.835825 0.212921 0.556528 0.616698 0.577524 0.213776 0.861656 0.406609 0.7114 0.201312 0.258835 0.922044 0.715385 0.472989 0.875846 0.290807 0.541081 0.228836 0.829116 0.779174 0.539386 0.436362 0.243921 0.799832 0.676919 0.682071 0.89446 0.624595 0.717658 0.492925 0.850743 0.088803 0.596957 0.111605 0.713113 0.492538 0.10262 0.4 [...]
+0.879517 0.732564 0.871736 0.387107 0.500995 0.983412 0.463795 0.284596 0.778949 0.624594 0.028965 0.589912 0.368351 0.943799 0.415793 0.599824 0.049598 0.85684 0.54263 0.570629 0.011779 0.418435 0.032946 0.503651 0.542865 0.270324 0.5648 0.638238 0.893004 0.389846 0.682754 0.03995 0.139032 0.192401 0.14931 0.478755 0.692764 0.697382 0.536652 0.373375 0.969588 0.904329 0.34448 0.892226 0.773182 0.046146 0.540523 0.717554 0.692904 0.290874 0.317806 0.665465 0.799117 0.782001 0.030231 0.70 [...]
+0.453527 0.421382 0.36471 0.410624 0.118482 0.901118 0.19386 0.907653 0.750597 0.936054 0.847581 0.470691 0.656342 0.348058 0.351095 0.085283 0.689414 0.585536 0.917032 0.948255 0.095338 0.276834 0.608068 0.729365 0.228534 0.249611 0.229862 0.753363 0.810492 0.359697 0.707068 0.472206 0.345439 0.050488 0.26493 0.287647 0.276802 0.137515 0.921597 0.221175 0.804872 0.352837 0.970861 0.31805 0.554472 0.265613 0.546835 0.255784 0.178635 0.328625 0.593509 0.27746 0.987677 0.14886 0.08835 0.85 [...]
+0.622858 0.050028 0.573573 0.338966 0.793192 0.919443 0.519912 0.281471 0.954203 0.851357 0.215529 0.611447 0.986204 0.591399 0.698328 0.34298 0.255291 0.071802 0.930132 0.966157 0.969607 0.413164 0.061923 0.597565 0.630292 0.595992 0.171354 0.746096 0.222723 0.23542 0.116459 0.729666 0.558392 0.005035 0.338738 0.676007 0.095219 0.761739 0.889171 0.717387 0.511058 0.389628 0.88788 0.846137 0.00183 0.780954 0.584701 0.968621 0.370111 0.941155 0.998234 0.6693 0.779952 0.550017 0.041039 0.7 [...]
+0.260028 0.105711 0.287413 0.743109 0.440886 0.59796 0.743374 0.934901 0.37586 0.248417 0.27388 0.361606 0.414609 0.975052 0.786248 0.402129 0.778071 0.06576 0.996611 0.889798 0.065305 0.016286 0.248521 0.889585 0.232097 0.240039 0.03446 0.725088 0.067267 0.669898 0.832647 0.654252 0.169603 0.937252 0.509939 0.241671 0.107543 0.761164 0.343904 0.877218 0.250753 0.396047 0.86864 0.745005 0.57118 0.849419 0.302951 0.087367 0.715268 0.992356 0.110348 0.752125 0.411123 0.963553 0.746107 0.62 [...]
+0.251095 0.728351 0.074268 0.959191 0.061484 0.540457 0.64979 0.885028 0.454694 0.942866 0.531798 0.224566 0.773338 0.269635 0.74481 0.853004 0.321755 0.886412 0.924507 0.708785 0.197154 0.067876 0.292648 0.966551 0.219611 0.02943 0.680579 0.729883 0.983106 0.588641 0.975042 0.133125 0.244995 0.87094 0.508177 0.156723 0.118402 0.306027 0.307392 0.926258 0.497468 0.745124 0.062386 0.264706 0.498323 0.850294 0.293742 0.305987 0.467664 0.445102 0.791874 0.080314 0.339721 0.175787 0.199832 0 [...]
+0.524648 0.05626 0.883321 0.215197 0.058443 0.437647 0.791822 0.252767 0.532952 0.277014 0.800311 0.357093 0.31792 0.247479 0.876007 0.581417 0.131224 0.047526 0.340892 0.209192 0.908302 0.388473 0.640401 0.850985 0.670968 0.897618 0.697032 0.13522 0.948338 0.605686 0.013351 0.060475 0.914927 0.360858 0.114502 0.206552 0.444818 0.485611 0.449672 0.074125 0.19541 0.977519 0.658853 0.331802 0.081478 0.401536 0.889937 0.836175 0.632086 0.298351 0.70952 0.947124 0.223046 0.54956 0.884941 0.0 [...]
+0.623372 0.546671 0.348811 0.863018 0.486982 0.954383 0.106452 0.666171 0.781757 0.949548 0.351902 0.111517 0.458483 0.747295 0.846581 0.920023 0.520517 0.020175 0.684838 0.856511 0.373948 0.774079 0.879535 0.623785 0.313782 0.263805 0.048814 0.372079 0.331973 0.065692 0.744465 0.162539 0.373919 0.60821 0.490302 0.66484 0.645356 0.148988 0.135534 0.072072 0.185613 0.303801 0.242977 0.236655 0.608765 0.243196 0.494762 0.100895 0.308607 0.934534 0.457001 0.988274 0.103086 0.292346 0.970976 [...]
+0.26987 0.180874 0.909734 0.547972 0.257741 0.77886 0.242787 0.354602 0.294241 0.793326 0.214064 0.832266 0.232639 0.553498 0.968071 0.738941 0.427701 0.480352 0.275192 0.432284 0.514746 0.342203 0.054284 0.097455 0.291564 0.122182 0.658725 0.390411 0.405623 0.424449 0.092648 0.168417 0.985159 0.21495 0.687681 0.733415 0.439124 0.87811 0.215021 0.400984 0.337013 0.769301 0.107856 0.949126 0.352954 0.328787 0.390209 0.554449 0.439113 0.088956 0.423735 0.591948 0.702441 0.563728 0.75173 0. [...]
+0.553829 0.484707 0.709042 0.950948 0.230826 0.037241 0.266384 0.807055 0.438422 0.762746 0.784298 0.362724 0.86281 0.307728 0.476541 0.466428 0.260338 0.648095 0.278889 0.295947 0.517936 0.212367 0.744232 0.563693 0.742683 0.240923 0.578343 0.873694 0.175682 0.484037 0.558836 0.539034 0.918174 0.617679 0.047551 0.774769 0.214302 0.253879 0.216664 0.730335 0.629985 0.449386 0.56144 0.900883 0.256778 0.80443 0.856207 0.383796 0.633161 0.534548 0.626283 0.632149 0.503291 0.992078 0.448808  [...]
+0.380612 0.862014 0.353293 0.616625 0.07048 0.693203 0.94425 0.753637 0.496183 0.688133 0.873097 0.112171 0.518062 0.202115 0.690861 0.881662 0.14675 0.191987 0.30744 0.02471 0.597491 0.055415 0.409486 0.791189 0.098326 0.082068 0.537924 0.37039 0.842755 0.16541 0.855253 0.2665 0.168304 0.342641 0.940766 0.096843 0.441665 0.688528 0.563634 0.885744 0.619395 0.134061 0.595217 0.053658 0.430391 0.206305 0.57102 0.924019 0.229698 0.828531 0.180533 0.998522 0.393909 0.857077 0.13211 0.545812 [...]
+0.639076 0.569414 0.475605 0.16863 0.488634 0.338347 0.198637 0.282529 0.504941 0.769804 0.556509 0.362991 0.5307 0.990445 0.928899 0.330541 0.367789 0.241108 0.389785 0.384271 0.162566 0.661408 0.597844 0.291911 0.66075 0.687881 0.823246 0.091344 0.845165 0.554462 0.384561 0.608924 0.555267 0.850716 0.251168 0.961558 0.554064 0.44743 0.875705 0.698081 0.576085 0.505528 0.415031 0.289187 0.65709 0.625644 0.442239 0.908435 0.358541 0.080071 0.098394 0.044704 0.554607 0.446269 0.879547 0.1 [...]
+0.514498 0.514062 0.930812 0.731691 0.84488 0.245362 0.321154 0.4878 0.750158 0.297544 0.737021 0.375292 0.480139 0.999095 0.994936 0.49846 0.622554 0.778197 0.203443 0.707835 0.169782 0.915063 0.59766 0.781359 0.841123 0.55867 0.992945 0.627005 0.475372 0.370207 0.268475 0.692676 0.282559 0.52361 0.883634 0.924158 0.137871 0.439159 0.461145 0.036164 0.717703 0.094968 0.156571 0.631364 0.061098 0.207094 0.941592 0.508595 0.617641 0.508442 0.93843 0.909226 0.675525 0.242778 0.491753 0.183 [...]
+0.225796 0.564116 0.767667 0.980861 0.822541 0.058533 0.069299 0.878527 0.215541 0.310907 0.860639 0.138993 0.133894 0.017739 0.375914 0.009515 0.749372 0.034013 0.937674 0.814251 0.281633 0.233942 0.518202 0.319552 0.415426 0.653246 0.621875 0.447861 0.847739 0.726568 0.700658 0.434755 0.9231 0.862837 0.708829 0.602079 0.033934 0.877121 0.578879 0.569731 0.671326 0.532275 0.756337 0.504043 0.040176 0.974969 0.024407 0.261138 0.448682 0.124102 0.396202 0.389629 0.826081 0.659079 0.392304 [...]
+0.527702 0.27887 0.646472 0.086698 0.718099 0.325562 0.357452 0.069739 0.726991 0.791801 0.281028 0.049167 0.567455 0.546263 0.375053 0.048316 0.209017 0.53746 0.79125 0.003646 0.433749 0.212347 0.39778 0.568556 0.120296 0.139878 0.187524 0.960071 0.53861 0.585735 0.917512 0.535143 0.600303 0.429957 0.797519 0.780347 0.026109 0.318984 0.948732 0.936884 0.698375 0.046666 0.541467 0.573263 0.357765 0.211173 0.99147 0.591408 0.495421 0.476725 0.021244 0.188297 0.892944 0.321467 0.50398 0.48 [...]
+0.488118 0.595221 0.228735 0.387723 0.924843 0.136322 0.206503 0.686499 0.314856 0.530079 0.415312 0.289547 0.352415 0.906959 0.744248 0.613492 0.284664 0.828997 0.207441 0.275596 0.422185 0.722201 0.712844 0.630687 0.634291 0.619594 0.385876 0.06408 0.72005 0.040489 0.076866 0.091359 0.518177 0.405138 0.117852 0.891792 0.277723 0.792091 0.681819 0.103897 0.412599 0.54477 0.742846 0.490028 0.576905 0.065156 0.408076 0.002436 0.135065 0.934321 0.439835 0.872171 0.703108 0.262953 0.961174  [...]
+0.48542 0.270839 0.436791 0.064484 0.730926 0.406907 0.655389 0.31056 0.248266 0.28548 0.387043 0.597268 0.084199 0.705038 0.52666 0.485471 0.667947 0.95168 0.221463 0.709196 0.506377 0.44262 0.355217 0.269781 0.173796 0.617308 0.10818 0.914716 0.548242 0.848953 0.438108 0.698937 0.518586 0.187959 0.044164 0.451301 0.414701 0.901242 0.995417 0.940037 0.516374 0.06234 0.033706 0.271674 0.773401 0.368483 0.485536 0.946248 0.427183 0.976281 0.752148 0.417539 0.908269 0.463174 0.871898 0.886 [...]
+0.651576 0.113136 0.229447 0.549459 0.462295 0.88052 0.845771 0.125717 0.655618 0.334795 0.035893 0.222975 0.172341 0.19483 0.605479 0.542196 0.302703 0.131658 0.650212 0.581248 0.404564 0.731247 0.669774 0.725082 0.199694 0.157372 0.507333 0.508967 0.443012 0.551533 0.016765 0.227647 0.415369 0.777884 0.941636 0.466234 0.056657 0.427649 0.094406 0.747534 0.848712 0.377739 0.692502 0.748064 0.054588 0.22652 0.711613 0.536777 0.686141 0.95142 0.814395 0.969232 0.088006 0.659909 0.885801 0 [...]
+0.687766 0.666778 0.173884 0.193614 0.030898 0.4693 0.622449 0.244291 0.270538 0.112942 0.728693 0.504771 0.443638 0.594859 0.490739 0.449094 0.955049 0.329301 0.22403 0.140357 0.975321 0.798588 0.473717 0.289747 0.837649 0.352419 0.321596 0.287151 0.108628 0.214486 0.643755 0.42084 0.196553 0.986042 0.446448 0.352302 0.437779 0.676599 0.523599 0.446934 0.779145 0.528809 0.921714 0.530903 0.849004 0.804583 0.153648 0.872385 0.06374 0.203388 0.855917 0.073807 0.600766 0.634608 0.308893 0. [...]
+0.696244 0.0623 0.665625 0.421264 0.271972 0.795312 0.029474 0.538981 0.621533 0.089634 0.626737 0.555755 0.741809 0.360604 0.689909 0.072196 0.095043 0.225542 0.316223 0.486423 0.431245 0.997413 0.813057 0.417415 0.441647 0.779179 0.472318 0.856596 0.903639 0.517022 0.087602 0.837952 0.993342 0.687895 0.144313 0.27178 0.192595 0.351494 0.282137 0.64777 0.978362 0.449359 0.3296 0.497708 0.847299 0.2325 0.817849 0.99623 0.709929 0.347049 0.797081 0.64173 0.442191 0.556003 0.038401 0.37422 [...]
+0.584993 0.611214 0.067887 0.600391 0.892264 0.112915 0.722149 0.748579 0.678298 0.318098 0.748857 0.664814 0.370122 0.774955 0.702406 0.160473 0.933189 0.684244 0.142557 0.309515 0.647029 0.971014 0.730556 0.854198 0.919185 0.12163 0.528593 0.276461 0.855821 0.958127 0.877196 0.685873 0.529561 0.882832 0.408311 0.562544 0.015571 0.960997 0.83658 0.716667 0.742965 0.701373 0.742746 0.895643 0.76632 0.954512 0.176566 0.955827 0.082517 0.419063 0.131209 0.767245 0.365109 0.787628 0.599457  [...]
+0.358915 0.777454 0.857806 0.025065 0.092957 0.053713 0.961306 0.277433 0.623394 0.876721 0.045228 0.022583 0.196446 0.702185 0.601244 0.139299 0.755248 0.909642 0.913179 0.484984 0.210193 0.342358 0.980077 0.001151 0.040179 0.031971 0.568469 0.883766 0.361287 0.647187 0.511149 0.385193 0.572574 0.579049 0.734476 0.536584 0.395382 0.401231 0.999656 0.184871 0.131475 0.320994 0.551631 0.65627 0.443579 0.145653 0.797288 0.514111 0.114452 0.450097 0.900342 0.45755 0.662089 0.854706 0.724249 [...]
+0.194419 0.938801 0.909976 0.70863 0.371263 0.08377 0.167431 0.56094 0.652475 0.939192 0.168641 0.17625 0.926917 0.056097 0.570949 0.142407 0.467031 0.073828 0.828995 0.996054 0.955397 0.49512 0.975217 0.706781 0.04434 0.914299 0.677028 0.142965 0.206249 0.546244 0.340021 0.859391 0.062449 0.115935 0.1363 0.288888 0.167587 0.200706 0.743043 0.13667 0.426042 0.487349 0.455659 0.749981 0.251088 0.124758 0.46563 0.274855 0.69108 0.7467 0.102684 0.510206 0.694075 0.509132 0.451651 0.642736 0 [...]
+0.395447 0.941414 0.497833 0.197525 0.065582 0.488761 0.008499 0.026395 0.62415 0.702448 0.903143 0.060689 0.272432 0.768656 0.769668 0.856478 0.286227 0.936184 0.59312 0.627448 0.166495 0.886411 0.859775 0.426673 0.374846 0.899194 0.636058 0.159982 0.860896 0.995032 0.360659 0.336996 0.851289 0.294763 0.615093 0.576985 0.92443 0.42695 0.102275 0.934481 0.602859 0.267688 0.48788 0.615281 0.960176 0.814755 0.426184 0.017961 0.784208 0.663767 0.752695 0.789784 0.204857 0.195367 0.373765 0. [...]
+0.899114 0.209965 0.302277 0.617353 0.246233 0.57647 0.038457 0.605918 0.960552 0.440133 0.568867 0.966544 0.779147 0.266464 0.402766 0.124583 0.861382 0.885151 0.090482 0.867237 0.994966 0.745532 0.689923 0.360954 0.357504 0.89986 0.561581 0.118456 0.820605 0.235729 0.356047 0.351896 0.098842 0.811559 0.245331 0.18177 0.83859 0.766117 0.176446 0.171504 0.749488 0.722611 0.737202 0.481705 0.862058 0.505813 0.340609 0.090497 0.336635 0.898936 0.224375 0.369996 0.603028 0.264779 0.349222 0 [...]
+0.733417 0.792988 0.143517 0.432416 0.919185 0.646106 0.323169 0.436356 0.12217 0.538261 0.235473 0.082736 0.402526 0.239298 0.582714 0.048906 0.386061 0.283668 0.348914 0.991216 0.756318 0.858016 0.301058 0.245273 0.186526 0.41623 0.532214 0.87961 0.154139 0.827295 0.27554 0.040779 0.059351 0.166889 0.361003 0.069672 0.798928 0.986268 0.848012 0.553557 0.735251 0.497007 0.227016 0.68094 0.682393 0.572774 0.745619 0.750718 0.624261 0.335764 0.357063 0.12627 0.310312 0.660821 0.86591 0.00 [...]
+0.199783 0.278257 0.288977 0.609885 0.625403 0.071595 0.820806 0.362328 0.503361 0.034661 0.276128 0.026544 0.175968 0.291619 0.418379 0.135947 0.031604 0.330687 0.351933 0.793282 0.418214 0.955798 0.170545 0.990821 0.412176 0.72746 0.311972 0.36981 0.40238 0.62771 0.511508 0.134671 0.906036 0.842559 0.866331 0.402929 0.749266 0.692054 0.174449 0.780712 0.453908 0.555951 0.192702 0.962749 0.724827 0.720144 0.893309 0.121699 0.300884 0.785362 0.970981 0.96722 0.399165 0.149741 0.053407 0. [...]
+0.44162 0.053725 0.304532 0.281496 0.642325 0.402182 0.44318 0.652688 0.892655 0.56564 0.996893 0.284122 0.781312 0.789434 0.020879 0.208178 0.087506 0.637225 0.149663 0.815954 0.097694 0.74426 0.539308 0.434976 0.92383 0.776372 0.414631 0.814273 0.63423 0.508208 0.508739 0.294376 0.83532 0.498324 0.883118 0.097814 0.766944 0.467476 0.472162 0.923122 0.709801 0.364927 0.192685 0.015805 0.312914 0.573052 0.048372 0.388429 0.856802 0.370239 0.447282 0.512767 0.011286 0.012988 0.877375 0.20 [...]
+0.257698 0.75757 0.689291 0.808283 0.56493 0.297049 0.241315 0.777727 0.385966 0.484261 0.919619 0.325041 0.110578 0.823958 0.919764 0.004983 0.426037 0.634497 0.596497 0.86928 0.311605 0.795617 0.921601 0.341201 0.30565 0.303608 0.021959 0.035314 0.244388 0.368232 0.281699 0.898832 0.55053 0.598504 0.83201 0.312188 0.539938 0.069831 0.567012 0.938038 0.948942 0.330156 0.809426 0.517027 0.634167 0.907577 0.305018 0.271532 0.455173 0.138474 0.5383 0.187258 0.453174 0.766848 0.572203 0.292 [...]
+0.512577 0.021969 0.932563 0.989704 0.851584 0.521928 0.504518 0.433652 0.052525 0.513581 0.722491 0.207764 0.869961 0.626449 0.439453 0.75095 0.759018 0.140305 0.443 0.142814 0.164649 0.998216 0.580049 0.838614 0.960988 0.604065 0.197045 0.156546 0.956735 0.688228 0.75913 0.195279 0.667813 0.720463 0.315742 0.628067 0.947221 0.42543 0.147232 0.381156 0.690994 0.015336 0.437081 0.486257 0.716673 0.417716 0.758875 0.840603 0.021504 0.53646 0.340202 0.085105 0.417398 0.534558 0.659213 0.42 [...]
+0.307586 0.20746 0.817534 0.861579 0.426388 0.278496 0.272112 0.179437 0.716706 0.785335 0.425503 0.813043 0.594132 0.645732 0.087392 0.258323 0.347902 0.770374 0.623793 0.613624 0.889987 0.160117 0.076542 0.128105 0.979244 0.751651 0.597153 0.249194 0.999573 0.992294 0.042516 0.339115 0.610328 0.506785 0.116011 0.554193 0.27034 0.191168 0.740228 0.392943 0.002145 0.279396 0.626432 0.432153 0.774951 0.739334 0.949897 0.657361 0.20394 0.166714 0.988363 0.348196 0.248638 0.039843 0.175324  [...]
+0.339745 0.639412 0.095373 0.888626 0.965694 0.671636 0.066454 0.959192 0.343171 0.418656 0.835271 0.973161 0.677286 0.56362 0.102934 0.194411 0.459859 0.135718 0.841817 0.784286 0.359701 0.159453 0.775002 0.699433 0.563641 0.340793 0.109291 0.822189 0.704244 0.040032 0.027122 0.05374 0.506181 0.366612 0.398389 0.907109 0.902672 0.810385 0.020415 0.299552 0.897015 0.260887 0.734251 0.408949 0.147638 0.622765 0.423543 0.182876 0.660831 0.715646 0.612235 0.874991 0.595938 0.07362 0.986049  [...]
+0.122004 0.009183 0.113717 0.998885 0.447318 0.67182 0.307713 0.999748 0.418517 0.946759 0.289245 0.694536 0.105829 0.636901 0.99316 0.635175 0.421944 0.781938 0.90752 0.43422 0.204568 0.639267 0.636862 0.711216 0.284562 0.564898 0.622848 0.139172 0.687017 0.75788 0.472745 0.359656 0.403261 0.437231 0.753116 0.167707 0.527938 0.366789 0.404974 0.527746 0.691933 0.584569 0.395217 0.297686 0.645191 0.533756 0.976755 0.196428 0.878343 0.479627 0.61019 0.861313 0.203667 0.285089 0.056789 0.5 [...]
+0.340801 0.101164 0.898385 0.57718 0.459062 0.736787 0.770981 0.674912 0.852168 0.341625 0.104384 0.324078 0.762295 0.024175 0.824836 0.231191 0.690931 0.450475 0.541945 0.535755 0.734402 0.032792 0.828029 0.701076 0.577711 0.07919 0.393408 0.092621 0.877474 0.951225 0.70689 0.526035 0.066434 0.393269 0.755578 0.240978 0.764847 0.625197 0.865189 0.647829 0.89273 0.169463 0.418773 0.568911 0.683515 0.816335 0.192768 0.828877 0.239777 0.26267 0.858871 0.098363 0.394274 0.525217 0.926154 0. [...]
+0.131304 0.409269 0.822797 0.28092 0.228235 0.231675 0.94373 0.435556 0.689641 0.753921 0.496447 0.429576 0.352329 0.149214 0.422503 0.647555 0.710653 0.04511 0.79706 0.172023 0.710296 0.434053 0.92738 0.810112 0.406416 0.447416 0.914283 0.278917 0.177511 0.950363 0.02842 0.170479 0.497053 0.928331 0.516569 0.763416 0.92258 0.681669 0.714834 0.021108 0.190845 0.724969 0.777289 0.293764 0.328331 0.776744 0.719015 0.220037 0.289568 0.123291 0.708675 0.643123 0.0904 0.30365 0.173495 0.01855 [...]
+0.914544 0.923667 0.57931 0.722898 0.81768 0.421755 0.751806 0.105398 0.912243 0.881725 0.547057 0.702597 0.577496 0.613987 0.042886 0.40363 0.724833 0.033232 0.0499 0.001036 0.664012 0.332126 0.296605 0.281046 0.35122 0.523962 0.460734 0.610473 0.651911 0.166818 0.565578 0.791935 0.876139 0.062921 0.748769 0.019704 0.214941 0.707161 0.554073 0.148576 0.601569 0.595112 0.27934 0.042868 0.029688 0.290766 0.869314 0.646447 0.106337 0.805411 0.947543 0.589955 0.19569 0.89127 0.467474 0.0008 [...]
+0.949038 0.411401 0.06436 0.263556 0.337942 0.978751 0.308263 0.717701 0.893423 0.659777 0.581016 0.762903 0.914568 0.283742 0.629973 0.049688 0.953196 0.384716 0.466229 0.076846 0.758841 0.925733 0.908247 0.53792 0.401047 0.143634 0.516505 0.587249 0.25079 0.459674 0.568261 0.961259 0.770913 0.268407 0.996956 0.746484 0.019879 0.727484 0.239719 0.269616 0.24443 0.867984 0.567957 0.043768 0.719374 0.159692 0.342104 0.303917 0.70649 0.726257 0.028868 0.979748 0.556112 0.670031 0.544691 0. [...]
+0.950818 0.71847 0.865091 0.214689 0.148659 0.704476 0.75186 0.343156 0.049229 0.866638 0.097809 0.388122 0.482152 0.143983 0.663136 0.353358 0.901517 0.742672 0.259896 0.416124 0.351756 0.73178 0.996795 0.769848 0.61146 0.32667 0.765992 0.972999 0.469063 0.867929 0.151777 0.502452 0.918403 0.022417 0.467202 0.554726 0.854274 0.473447 0.396463 0.610585 0.148044 0.548906 0.952518 0.539888 0.242201 0.578332 0.723449 0.542306 0.836861 0.139292 0.371904 0.807191 0.112735 0.940514 0.0842 0.68 [...]
+0.731147 0.57121 0.240099 0.611377 0.120284 0.78356 0.306696 0.343943 0.882199 0.765168 0.240847 0.072454 0.421595 0.02256 0.167113 0.746541 0.519983 0.368228 0.0279 0.540907 0.365365 0.780113 0.1549 0.894919 0.839342 0.950234 0.613636 0.298667 0.985245 0.329542 0.555376 0.10096 0.151462 0.679856 0.003895 0.71637 0.309029 0.788327 0.842304 0.75468 0.921449 0.224314 0.198227 0.4747 0.839091 0.931688 0.149787 0.598496 0.1733 0.582984 0.243333 0.395547 0.993699 0.051347 0.301717 0.732068 0. [...]
+0.217976 0.773697 0.580888 0.461386 0.382929 0.313111 0.122718 0.805239 0.938838 0.811627 0.256878 0.140499 0.680096 0.553301 0.763757 0.563431 0.402952 0.371078 0.050114 0.508939 0.307753 0.535547 0.623822 0.856868 0.852475 0.433991 0.069301 0.935968 0.329279 0.824289 0.644081 0.29386 0.161358 0.873841 0.869264 0.812284 0.131686 0.653318 0.639318 0.906766 0.471639 0.887565 0.389857 0.178187 0.784701 0.170202 0.399426 0.305781 0.366728 0.716958 0.650756 0.399912 0.304511 0.862249 0.25830 [...]
+0.771025 0.098376 0.348801 0.93754 0.688714 0.64996 0.553241 0.866337 0.625059 0.089098 0.615872 0.686492 0.596663 0.27344 0.436514 0.120799 0.996492 0.71605 0.673325 0.87807 0.048669 0.985661 0.113098 0.516495 0.842176 0.970575 0.505691 0.46288 0.161554 0.679588 0.877834 0.784581 0.12339 0.100373 0.228652 0.990755 0.642008 0.359283 0.445659 0.941079 0.274163 0.165662 0.505604 0.588617 0.574676 0.798241 0.141691 0.697805 0.92357 0.22779 0.962457 0.121878 0.446626 0.425162 0.684618 0.3458 [...]
+0.539482 0.036393 0.295425 0.001408 0.279095 0.128794 0.037568 0.545826 0.738852 0.031817 0.948528 0.732342 0.317523 0.041568 0.566806 0.46868 0.513179 0.694121 0.195767 0.719413 0.138371 0.30192 0.478834 0.65445 0.290077 0.681563 0.851514 0.010531 0.349156 0.315515 0.199231 0.145081 0.862865 0.591087 0.706197 0.849416 0.797545 0.798571 0.254217 0.109037 0.487106 0.669574 0.744243 0.607707 0.030226 0.336082 0.865996 0.744537 0.06838 0.824997 0.860204 0.218278 0.006804 0.764765 0.74118 0. [...]
+0.31565 0.762453 0.055877 0.217477 0.660456 0.44972 0.883036 0.855115 0.510269 0.090496 0.982926 0.606762 0.265681 0.697665 0.926282 0.34311 0.697751 0.853605 0.492223 0.907346 0.790773 0.028585 0.557114 0.706758 0.330344 0.063387 0.948674 0.858609 0.09212 0.219385 0.76001 0.993849 0.203767 0.659713 0.027696 0.407644 0.649735 0.899365 0.42358 0.099958 0.783506 0.023612 0.201948 0.788968 0.150045 0.531949 0.978928 0.497295 0.03292 0.370711 0.552657 0.366685 0.688671 0.936681 0.630421 0.88 [...]
+0.345075 0.317149 0.4724 0.764613 0.94599 0.218048 0.928299 0.063464 0.177386 0.078201 0.163861 0.489831 0.116078 0.718362 0.308146 0.373336 0.003478 0.618953 0.704705 0.762077 0.301225 0.09209 0.966039 0.608163 0.390355 0.193704 0.504179 0.734837 0.655564 0.406876 0.993965 0.988109 0.311986 0.939107 0.950524 0.224621 0.41047 0.448063 0.394378 0.990012 0.024498 0.611952 0.612793 0.248459 0.412585 0.473838 0.988322 0.266781 0.516643 0.855558 0.839533 0.807658 0.202843 0.637281 0.625118 0. [...]
+0.010802 0.168923 0.251757 0.353744 0.29745 0.087696 0.061219 0.042987 0.60442 0.817442 0.884361 0.990483 0.312687 0.666443 0.628236 0.132479 0.095605 0.368053 0.95762 0.884537 0.097603 0.763488 0.038909 0.589176 0.50137 0.807487 0.122059 0.565797 0.332357 0.051349 0.140419 0.036411 0.893416 0.398022 0.620831 0.486832 0.298464 0.243248 0.454984 0.741586 0.089075 0.345637 0.648018 0.779165 0.223351 0.124099 0.665604 0.566721 0.093123 0.733386 0.899213 0.359042 0.27852 0.073259 0.604935 0. [...]
+0.653767 0.914761 0.733801 0.044238 0.415865 0.388549 0.611478 0.763786 0.739821 0.175339 0.617332 0.771895 0.751165 0.0756 0.229087 0.926454 0.671722 0.136902 0.242356 0.827194 0.75413 0.285597 0.353671 0.52465 0.05603 0.289406 0.449598 0.235658 0.936526 0.06971 0.539216 0.570376 0.632239 0.55085 0.901523 0.993762 0.603902 0.29959 0.155722 0.82156 0.152145 0.305803 0.375306 0.013368 0.929305 0.770156 0.128103 0.668489 0.853253 0.257862 0.380175 0.767093 0.979232 0.718412 0.478087 0.6361 [...]
+0.790145 0.53588 0.189478 0.963209 0.613209 0.15093 0.129599 0.341598 0.244253 0.257189 0.006871 0.499978 0.74256 0.745069 0.541176 0.028361 0.750161 0.635221 0.239974 0.401038 0.078378 0.340961 0.944189 0.59983 0.51386 0.40966 0.003042 0.950473 0.233428 0.288836 0.894048 0.447877 0.819589 0.820536 0.319526 0.283393 0.195786 0.131434 0.278381 0.01799 0.325466 0.237586 0.705468 0.25292 0.14107 0.330242 0.229719 0.389818 0.830215 0.297335 0.790326 0.225003 0.54361 0.463768 0.952783 0.09764 [...]
+0.449833 0.059026 0.490773 0.058197 0.152029 0.726587 0.54315 0.661199 0.658872 0.330847 0.296334 0.943986 0.267103 0.505201 0.254961 0.097192 0.880544 0.451669 0.571666 0.782974 0.23826 0.989457 0.346705 0.961462 0.828823 0.082022 0.843864 0.224369 0.538498 0.387556 0.074104 0.572634 0.84097 0.216243 0.028024 0.997351 0.736142 0.996049 0.633903 0.7851 0.168987 0.042934 0.609311 0.689516 0.064353 0.240867 0.200203 0.849684 0.709827 0.857522 0.456705 0.140322 0.168463 0.86364 0.50693 0.71 [...]
+0.35236 0.113617 0.184542 0.941435 0.016939 0.13587 0.260801 0.853272 0.804766 0.138936 0.141943 0.549425 0.025876 0.38861 0.158805 0.208096 0.85379 0.875206 0.350914 0.773957 0.362627 0.670276 0.022951 0.886958 0.221525 0.509365 0.890916 0.131483 0.541485 0.673047 0.858725 0.820382 0.717748 0.187847 0.059539 0.853412 0.958599 0.441491 0.45337 0.525301 0.769118 0.075154 0.71682 0.803585 0.45652 0.376002 0.201708 0.679477 0.741079 0.373148 0.706794 0.43703 0.734932 0.779683 0.426248 0.745 [...]
+0.966543 0.97331 0.780825 0.623079 0.475157 0.767117 0.139906 0.83318 0.847946 0.376521 0.590311 0.729025 0.43536 0.417893 0.340353 0.51689 0.615374 0.621615 0.535724 0.118475 0.077171 0.805627 0.634101 0.739251 0.043341 0.088113 0.884056 0.282861 0.794409 0.704552 0.150335 0.573049 0.927171 0.526659 0.355835 0.220863 0.57094 0.49343 0.126422 0.74104 0.134264 0.500636 0.117365 0.332306 0.077735 0.964551 0.385377 0.4888 0.95049 0.888703 0.920445 0.910602 0.891723 0.7081 0.7323 0.67124 0.9 [...]
+0.092011 0.045026 0.245196 0.117888 0.594908 0.697454 0.63948 0.125027 0.908646 0.413962 0.603784 0.279665 0.13946 0.154223 0.694058 0.178643 0.374297 0.502859 0.431964 0.155358 0.414589 0.964212 0.722316 0.2724 0.995939 0.185106 0.194774 0.554559 0.986504 0.439783 0.34525 0.502363 0.887175 0.56498 0.084532 0.816899 0.002039 0.418065 0.577681 0.383794 0.700957 0.215596 0.351101 0.763967 0.652179 0.58667 0.575946 0.803012 0.86792 0.92887 0.276495 0.094843 0.81552 0.155638 0.157993 0.13171 [...]
+0.957236 0.699821 0.720084 0.610253 0.30394 0.239824 0.562163 0.828825 0.668208 0.287686 0.50933 0.638394 0.570591 0.803116 0.8247 0.755037 0.146903 0.313191 0.187626 0.096051 0.785021 0.738914 0.825478 0.668713 0.544282 0.133418 0.895531 0.836303 0.936471 0.159556 0.722663 0.851434 0.343167 0.354014 0.713071 0.275572 0.817193 0.649586 0.902939 0.553366 0.670368 0.69979 0.888866 0.763292 0.031172 0.017827 0.800629 0.236919 0.171536 0.322953 0.841107 0.561234 0.990461 0.269915 0.139568 0. [...]
+0.102191 0.428131 0.940968 0.891693 0.070099 0.797389 0.60889 0.580348 0.53661 0.573631 0.763659 0.268654 0.561578 0.756286 0.788423 0.735658 0.781972 0.852585 0.615592 0.934279 0.185313 0.037849 0.68363 0.321743 0.182333 0.865739 0.528855 0.365162 0.991967 0.666201 0.001085 0.277948 0.795777 0.598979 0.037533 0.979348 0.393823 0.841127 0.162057 0.513986 0.567228 0.406643 0.797942 0.683551 0.258368 0.965314 0.100928 0.242292 0.128075 0.350592 0.397475 0.527986 0.92454 0.477257 0.347979 0 [...]
+0.004346 0.260988 0.97398 0.806882 0.979665 0.227515 0.846839 0.100239 0.894106 0.269305 0.560849 0.294974 0.972112 0.419438 0.787501 0.443101 0.028808 0.337208 0.791584 0.80791 0.552139 0.794197 0.310834 0.872753 0.667283 0.274204 0.321855 0.807545 0.27678 0.767072 0.125746 0.444238 0.108127 0.921678 0.336383 0.913631 0.143764 0.239103 0.277628 0.98027 0.168201 0.512972 0.362309 0.611684 0.487585 0.264292 0.379476 0.197811 0.13956 0.138352 0.399413 0.592351 0.759833 0.075152 0.333089 0. [...]
+0.752003 0.719621 0.368845 0.55029 0.704369 0.241384 0.836466 0.763744 0.617617 0.685388 0.43121 0.120566 0.76063 0.632833 0.503451 0.446359 0.870289 0.507651 0.804634 0.807767 0.836179 0.203933 0.732175 0.937387 0.366039 0.183951 0.746867 0.745727 0.837806 0.597098 0.479222 0.762173 0.349802 0.684268 0.45192 0.353549 0.021644 0.044772 0.234295 0.195769 0.378747 0.245121 0.045789 0.997021 0.687617 0.494764 0.894729 0.131779 0.269794 0.372946 0.721394 0.273609 0.031058 0.454816 0.304823 0 [...]
+0.607703 0.442692 0.062253 0.802208 0.456301 0.927472 0.325184 0.999837 0.409566 0.139861 0.818965 0.777614 0.926188 0.108714 0.834268 0.966483 0.464564 0.970568 0.184289 0.049597 0.045288 0.403583 0.31633 0.144462 0.346626 0.018663 0.984543 0.12947 0.884777 0.33846 0.525888 0.000813 0.384064 0.83854 0.081738 0.514488 0.445271 0.151372 0.952192 0.656439 0.368827 0.414162 0.304628 0.860745 0.444847 0.40921 0.603301 0.065744 0.380088 0.754116 0.9059 0.501144 0.522604 0.932702 0.072822 0.78 [...]
+0.495868 0.686548 0.354448 0.930389 0.834221 0.171371 0.59772 0.056738 0.740722 0.560925 0.96238 0.081058 0.751157 0.253615 0.177866 0.771146 0.539543 0.935204 0.025746 0.194329 0.204501 0.452797 0.989908 0.264422 0.693928 0.737141 0.56012 0.565339 0.454264 0.277728 0.244093 0.578605 0.319272 0.787361 0.848773 0.362714 0.480906 0.483062 0.284015 0.075819 0.4086 0.28512 0.372153 0.515333 0.756286 0.408398 0.209147 0.843404 0.846171 0.981308 0.964543 0.245818 0.60961 0.208802 0.725926 0.28 [...]
+0.440725 0.423618 0.708785 0.585373 0.940279 0.410436 0.613349 0.293999 0.214654 0.298665 0.985937 0.901471 0.145523 0.252833 0.378112 0.178892 0.718492 0.322881 0.3115 0.050995 0.482225 0.898184 0.4554 0.23735 0.393068 0.604098 0.060348 0.868238 0.094983 0.369165 0.010358 0.843804 0.880351 0.832806 0.069519 0.891973 0.872014 0.231601 0.819353 0.188985 0.432534 0.145094 0.204618 0.160965 0.667361 0.601884 0.948015 0.495782 0.810026 0.999607 0.36312 0.154 0.814674 0.607014 0.285582 0.5161 [...]
+0.151761 0.857233 0.513354 0.363944 0.375763 0.7707 0.566748 0.874366 0.058306 0.081959 0.200641 0.605168 0.203703 0.681699 0.92557 0.294756 0.528896 0.122656 0.468815 0.445387 0.968658 0.77757 0.629352 0.542778 0.302604 0.802857 0.480801 0.329786 0.000621 0.616992 0.431189 0.253496 0.089861 0.912877 0.017909 0.300548 0.396607 0.696665 0.034563 0.267579 0.458989 0.620467 0.429348 0.813227 0.859076 0.770501 0.213508 0.196421 0.46842 0.617418 0.887196 0.060176 0.275751 0.368147 0.269022 0. [...]
+0.286472 0.951762 0.363463 0.480548 0.869754 0.536318 0.889491 0.737199 0.535765 0.935807 0.554228 0.134413 0.192729 0.134116 0.643468 0.78495 0.383993 0.970971 0.222838 0.85022 0.124354 0.701925 0.388681 0.72884 0.767938 0.761532 0.323264 0.036552 0.739305 0.681396 0.341969 0.24715 0.576939 0.012885 0.798674 0.359864 0.92474 0.563696 0.172828 0.621018 0.545696 0.992284 0.349955 0.796609 0.156584 0.001098 0.984556 0.861952 0.835097 0.613281 0.834281 0.821621 0.493465 0.75106 0.523634 0.0 [...]
+0.904078 0.400312 0.521413 0.24028 0.582227 0.320791 0.967247 0.971836 0.305838 0.699613 0.531669 0.698123 0.830964 0.143301 0.036449 0.333914 0.246931 0.897117 0.883761 0.238365 0.787722 0.630085 0.337197 0.296204 0.004054 0.078166 0.808901 0.279726 0.753155 0.216244 0.817012 0.415347 0.866193 0.274368 0.219566 0.12962 0.289111 0.310697 0.806169 0.961873 0.469745 0.944876 0.090349 0.492586 0.255846 0.836608 0.075334 0.867187 0.797781 0.914637 0.911253 0.249519 0.029926 0.853479 0.966462 [...]
+0.208838 0.574984 0.628003 0.494415 0.036995 0.361892 0.90612 0.011855 0.89635 0.958829 0.10268 0.123711 0.359284 0.929694 0.172961 0.292619 0.825529 0.877943 0.435232 0.580257 0.456355 0.257631 0.502055 0.514945 0.827508 0.368008 0.627377 0.368577 0.010356 0.083167 0.404426 0.377677 0.946673 0.547859 0.254766 0.334763 0.923255 0.507178 0.084808 0.91502 0.491225 0.152969 0.755591 0.978661 0.183501 0.87321 0.283613 0.411911 0.198232 0.951413 0.71446 0.419678 0.597853 0.361254 0.36432 0.96 [...]
+0.484584 0.39824 0.340066 0.882479 0.276821 0.185463 0.003694 0.972175 0.025175 0.559488 0.744218 0.825855 0.318444 0.271297 0.541519 0.101694 0.132317 0.727966 0.692553 0.7647 0.323841 0.275704 0.505714 0.402978 0.501346 0.356144 0.287306 0.452726 0.484723 0.414702 0.698381 0.575269 0.887777 0.757825 0.00926 0.170108 0.944011 0.665814 0.065284 0.074761 0.888421 0.386117 0.484258 0.771813 0.447656 0.844614 0.121344 0.216185 0.287262 0.040722 0.586338 0.450214 0.056251 0.200685 0.325129 0 [...]
+0.301804 0.291798 0.921112 0.837134 0.86676 0.476699 0.296527 0.671409 0.165307 0.206824 0.068235 0.081477 0.046069 0.621731 0.863235 0.464094 0.06364 0.531004 0.28899 0.329682 0.119231 0.036382 0.50525 0.879503 0.4391 0.356683 0.167355 0.772587 0.285861 0.560683 0.747101 0.416402 0.817317 0.203344 0.957636 0.644705 0.993918 0.473145 0.990843 0.556591 0.445778 0.391575 0.558147 0.681871 0.572039 0.569594 0.845188 0.847625 0.775035 0.615203 0.345993 0.125268 0.551332 0.634375 0.733803 0.0 [...]
+0.94512 0.538931 0.249533 0.059388 0.667154 0.404758 0.958629 0.505144 0.325446 0.494626 0.963213 0.421478 0.32304 0.630223 0.935633 0.644529 0.163361 0.847039 0.466629 0.123288 0.793636 0.30188 0.074042 0.728596 0.036882 0.953489 0.095788 0.583115 0.229006 0.529999 0.472025 0.678171 0.278028 0.035106 0.616217 0.921273 0.098264 0.372014 0.551311 0.052345 0.998648 0.933482 0.70272 0.076861 0.711635 0.721004 0.260904 0.235069 0.993832 0.378931 0.091798 0.056361 0.124644 0.555533 0.781381 0 [...]
+0.403967 0.349547 0.591767 0.688914 0.127204 0.09973 0.963634 0.387645 0.546594 0.674364 0.014687 0.04686 0.150473 0.28797 0.00792 0.930843 0.752844 0.151719 0.409497 0.493658 0.098802 0.5956 0.685586 0.766919 0.903701 0.003453 0.177375 0.824225 0.884973 0.094944 0.117384 0.998708 0.408089 0.554717 0.342821 0.002952 0.249674 0.408154 0.748895 0.886519 0.126811 0.035723 0.197005 0.111658 0.891914 0.292185 0.067872 0.310765 0.114413 0.423036 0.880514 0.320987 0.626266 0.284527 0.652005 0.0 [...]
+0.555323 0.982578 0.987115 0.872254 0.068149 0.768489 0.935489 0.777717 0.373879 0.186369 0.95105 0.684414 0.382136 0.259231 0.055763 0.787553 0.616542 0.091876 0.83208 0.848211 0.432176 0.120166 0.751896 0.302521 0.649695 0.445477 0.346044 0.634915 0.869953 0.002893 0.508332 0.193029 0.236503 0.1734 0.723395 0.455973 0.472098 0.788177 0.366877 0.791643 0.950071 0.750137 0.502455 0.297998 0.709076 0.220554 0.371121 0.732202 0.685969 0.279962 0.430449 0.814335 0.49653 0.347087 0.102968 0. [...]
+0.225036 0.199169 0.525974 0.756278 0.698992 0.966222 0.081831 0.788151 0.305825 0.41427 0.650195 0.841459 0.517098 0.976916 0.40298 0.754632 0.758044 0.131165 0.814019 0.973023 0.275313 0.064629 0.190345 0.411531 0.992501 0.779886 0.168145 0.611936 0.012152 0.408001 0.05516 0.598261 0.057092 0.839905 0.152179 0.863548 0.365298 0.34257 0.419554 0.880204 0.19716 0.619475 0.063826 0.225672 0.516997 0.228795 0.577869 0.210557 0.970696 0.06038 0.456763 0.311288 0.286775 0.57337 0.187825 0.41 [...]
+0.272509 0.529514 0.842228 0.012539 0.252841 0.884841 0.812704 0.270166 0.807904 0.784812 0.91038 0.172484 0.316942 0.723061 0.014268 0.090881 0.48403 0.551937 0.738473 0.326959 0.884063 0.474691 0.468084 0.083675 0.934372 0.726775 0.903338 0.957396 0.175794 0.999136 0.045831 0.531528 0.480632 0.029328 0.646808 0.783026 0.247564 0.941943 0.659618 0.597769 0.626244 0.343233 0.176193 0.045964 0.551293 0.10858 0.534778 0.27084 0.028553 0.324166 0.368497 0.453916 0.06297 0.930325 0.157378 0. [...]
+0.482965 0.618453 0.577738 0.778241 0.749998 0.09838 0.392586 0.808581 0.879752 0.996267 0.594779 0.919166 0.981637 0.463982 0.961441 0.492331 0.988538 0.232035 0.520677 0.8268 0.100322 0.629633 0.171715 0.241484 0.333002 0.761899 0.687218 0.95986 0.973713 0.23886 0.219018 0.227299 0.44631 0.911015 0.389473 0.163181 0.52579 0.165249 0.279077 0.019006 0.620311 0.108152 0.119663 0.920296 0.78572 0.729765 0.932315 0.324131 0.780191 0.264118 0.872276 0.188896 0.272379 0.362979 0.741368 0.852 [...]
+0.123518 0.099687 0.089883 0.556077 0.945146 0.092846 0.920632 0.26192 0.802178 0.779535 0.11835 0.143239 0.767907 0.072501 0.551376 0.608192 0.292309 0.237202 0.279795 0.374215 0.962945 0.827533 0.590795 0.865294 0.567311 0.331855 0.564184 0.960027 0.372734 0.252104 0.521175 0.043542 0.906422 0.829191 0.059677 0.132491 0.361633 0.068422 0.964931 0.382467 0.246975 0.836765 0.865876 0.055145 0.352874 0.736622 0.629095 0.918616 0.133721 0.800116 0.560697 0.813637 0.9675 0.58783 0.861068 0. [...]
+0.677143 0.161142 0.903023 0.171858 0.964004 0.18519 0.216085 0.067425 0.960835 0.535111 0.5077 0.289104 0.662653 0.08252 0.452559 0.235645 0.458394 0.176235 0.157413 0.226901 0.354859 0.162269 0.354803 0.697329 0.524345 0.841855 0.184265 0.78947 0.027466 0.043469 0.055853 0.80601 0.694828 0.4795 0.36649 0.213571 0.335568 0.477092 0.212098 0.397155 0.575413 0.81236 0.601546 0.863581 0.848041 0.897949 0.801745 0.905272 0.677601 0.205914 0.764344 0.541627 0.530387 0.945573 0.460489 0.42406 [...]
+0.452794 0.731771 0.288526 0.570027 0.192285 0.747382 0.727038 0.412191 0.753201 0.97029 0.480209 0.839635 0.117336 0.745174 0.494289 0.977877 0.092566 0.708944 0.821875 0.704913 0.463804 0.157508 0.369849 0.957611 0.018751 0.034716 0.578338 0.667653 0.315803 0.917085 0.6108 0.315958 0.636003 0.471148 0.915419 0.699997 0.936124 0.551288 0.653126 0.895971 0.667731 0.842226 0.227374 0.922828 0.108057 0.919168 0.719758 0.097466 0.325606 0.205991 0.392992 0.567585 0.295724 0.086146 0.944091  [...]
+0.762963 0.261903 0.207893 0.37464 0.16475 0.685911 0.407386 0.886304 0.431749 0.350338 0.728383 0.7901 0.977654 0.296395 0.06641 0.805944 0.530228 0.998635 0.784729 0.42034 0.999735 0.186881 0.096838 0.028854 0.152311 0.063433 0.001592 0.92953 0.075728 0.976524 0.745308 0.269403 0.111052 0.538912 0.959805 0.314318 0.365533 0.790569 0.925169 0.102178 0.541363 0.426718 0.958992 0.163496 0.224029 0.751407 0.4528 0.366529 0.155196 0.408923 0.24811 0.893001 0.410278 0.349401 0.85729 0.092493 [...]
+0.44769 0.741213 0.35109 0.17909 0.878908 0.196473 0.937007 0.794865 0.141245 0.696219 0.614326 0.025308 0.574098 0.860935 0.869368 0.482814 0.524073 0.188854 0.49921 0.315636 0.840022 0.31477 0.794106 0.018641 0.860056 0.097027 0.9385 0.422002 0.317292 0.067226 0.421521 0.0615 0.494693 0.33734 0.857654 0.906292 0.513627 0.678235 0.379522 0.40163 0.907668 0.505435 0.915299 0.000775 0.376209 0.431211 0.227954 0.35006 0.930851 0.392416 0.969581 0.726905 0.314935 0.156729 0.277111 0.422562  [...]
+0.33131 0.295057 0.132908 0.10344 0.419584 0.66697 0.517188 0.745233 0.400797 0.043013 0.941227 0.093679 0.976119 0.988105 0.163933 0.559237 0.086924 0.359152 0.831712 0.437674 0.704467 0.930465 0.55904 0.294484 0.518427 0.277676 0.538927 0.308126 0.18425 0.820508 0.898641 0.084445 0.519056 0.697357 0.527675 0.538497 0.827606 0.048693 0.720559 0.665009 0.336719 0.980401 0.201713 0.24846 0.842147 0.107899 0.514839 0.176307 0.298675 0.999322 0.303294 0.232021 0.600425 0.388973 0.167215 0.6 [...]
+0.030387 0.170248 0.514635 0.254101 0.763315 0.82297 0.996185 0.810705 0.145645 0.535133 0.483222 0.440617 0.281516 0.401788 0.013384 0.867289 0.146784 0.296642 0.045109 0.397114 0.408863 0.316977 0.830111 0.817659 0.513924 0.012348 0.369327 0.070675 0.036022 0.804233 0.424569 0.381979 0.272714 0.627594 0.240296 0.188695 0.954783 0.509959 0.562169 0.346704 0.857787 0.053483 0.325855 0.360192 0.783036 0.83918 0.539396 0.546923 0.73753 0.022061 0.861221 0.347144 0.387922 0.1067 0.797409 0. [...]
+0.29636 0.04354 0.792394 0.187568 0.185489 0.904013 0.02784 0.76625 0.347664 0.901173 0.894859 0.426276 0.795222 0.084605 0.779497 0.907769 0.140027 0.560284 0.159611 0.189715 0.089126 0.122329 0.128897 0.486338 0.252183 0.311843 0.276355 0.390866 0.716316 0.800273 0.342143 0.06516 0.641002 0.405841 0.970071 0.73408 0.112073 0.340315 0.086253 0.716531 0.261918 0.186138 0.469464 0.298308 0.63702 0.279073 0.726642 0.47928 0.314254 0.085573 0.029995 0.520779 0.057609 0.504959 0.654373 0.675 [...]
+0.285787 0.437248 0.738303 0.157376 0.569014 0.521909 0.664823 0.975556 0.214466 0.470468 0.172324 0.631749 0.226849 0.446817 0.491317 0.493319 0.869763 0.530873 0.177226 0.630634 0.259492 0.766343 0.378879 0.313804 0.168271 0.709775 0.057233 0.934923 0.843873 0.644125 0.411758 0.686846 0.887038 0.941026 0.804287 0.966728 0.515959 0.68667 0.786836 0.548251 0.562886 0.651871 0.488984 0.596596 0.776763 0.395899 0.356959 0.926125 0.56442 0.190957 0.029833 0.32554 0.757372 0.638072 0.783115  [...]
+0.36112 0.788146 0.482851 0.092511 0.005231 0.649859 0.553392 0.109658 0.060236 0.753692 0.738144 0.898321 0.702361 0.749905 0.487384 0.35994 0.585062 0.61789 0.827857 0.36409 0.066704 0.056801 0.520952 0.760285 0.356575 0.059096 0.327537 0.814482 0.901838 0.318539 0.455216 0.655627 0.994864 0.362591 0.361578 0.584544 0.883932 0.425474 0.436635 0.400198 0.836804 0.201057 0.409641 0.483988 0.645498 0.690338 0.872364 0.234214 0.398345 0.948483 0.55913 0.089933 0.407281 0.73728 0.235094 0.4 [...]
+0.251159 0.243669 0.463764 0.639647 0.515107 0.520411 0.812047 0.825183 0.353935 0.629386 0.073306 0.221027 0.148185 0.507797 0.563405 0.071378 0.525565 0.764222 0.568895 0.561771 0.849894 0.747635 0.219789 0.942725 0.254916 0.106154 0.028579 0.30325 0.833203 0.73251 0.282418 0.06699 0.85941 0.037345 0.846 0.500606 0.155782 0.348415 0.763786 0.344951 0.626456 0.367888 0.961895 0.14576 0.038388 0.493908 0.148528 0.574453 0.100121 0.527406 0.067918 0.058981 0.59754 0.128783 0.4325 0.65468  [...]
+0.205412 0.535007 0.15863 0.566314 0.981271 0.472282 0.960076 0.951998 0.587408 0.195095 0.380253 0.768293 0.574555 0.201832 0.836722 0.69692 0.798015 0.870297 0.16578 0.473082 0.476557 0.311394 0.677737 0.763252 0.175993 0.141897 0.550474 0.732368 0.486933 0.97503 0.85586 0.179286 0.259525 0.323066 0.164602 0.871181 0.116852 0.54591 0.452324 0.234042 0.708556 0.57997 0.168507 0.620711 0.02702 0.19146 0.648193 0.234156 0.849369 0.142035 0.477389 0.177664 0.588729 0.377341 0.147049 0.6380 [...]
+0.915355 0.178943 0.680486 0.105042 0.916221 0.172591 0.360588 0.707509 0.688452 0.256378 0.136226 0.952519 0.471926 0.664023 0.066746 0.812953 0.866605 0.199725 0.518537 0.284077 0.305869 0.92417 0.595146 0.18187 0.115274 0.210748 0.904646 0.273037 0.085742 0.123096 0.878731 0.919039 0.846102 0.897978 0.717791 0.072483 0.739069 0.667378 0.659784 0.231677 0.508554 0.019263 0.648802 0.774587 0.129989 0.968358 0.797707 0.426202 0.010578 0.283067 0.203962 0.536252 0.171082 0.160846 0.724902 [...]
+0.9609 0.105538 0.559261 0.609806 0.837084 0.472674 0.752852 0.904038 0.037214 0.232241 0.334571 0.102178 0.790844 0.543111 0.340018 0.745677 0.207162 0.170811 0.947711 0.414734 0.272383 0.806089 0.236966 0.115039 0.166256 0.080495 0.447082 0.28777 0.527894 0.189464 0.053974 0.967248 0.586507 0.733607 0.753188 0.877013 0.353196 0.694096 0.398356 0.877031 0.607896 0.390914 0.579277 0.652437 0.514193 0.140008 0.932737 0.984597 0.242116 0.155399 0.993274 0.915412 0.766137 0.13803 0.211952 0 [...]
+0.731517 0.29362 0.017066 0.160926 0.070098 0.172196 0.139867 0.548763 0.615798 0.649801 0.979642 0.062425 0.662319 0.457315 0.342215 0.604242 0.137654 0.085249 0.139575 0.248865 0.794477 0.658519 0.884273 0.630799 0.68583 0.134318 0.812251 0.305289 0.898551 0.815124 0.139694 0.214676 0.911715 0.942208 0.046484 0.487587 0.581401 0.258428 0.128444 0.426057 0.303278 0.954036 0.872052 0.077212 0.597391 0.06788 0.501803 0.152453 0.970638 0.143849 0.709808 0.422143 0.233811 0.794566 0.446722  [...]
+0.360193 0.442011 0.786981 0.478524 0.515098 0.691221 0.476663 0.565341 0.636892 0.257558 0.27861 0.172961 0.430804 0.907293 0.22534 0.073868 0.706594 0.553123 0.716888 0.042318 0.470929 0.688491 0.151738 0.929187 0.376355 0.959865 0.962318 0.829844 0.632143 0.380845 0.024607 0.875282 0.066317 0.502842 0.386994 0.705712 0.651882 0.858819 0.412001 0.227426 0.279598 0.629173 0.487038 0.494607 0.07924 0.044118 0.311647 0.150125 0.422472 0.806807 0.509755 0.694453 0.846769 0.192961 0.052276  [...]
+0.222897 0.732833 0.547159 0.546746 0.65326 0.029012 0.025796 0.842711 0.251042 0.854812 0.741255 0.12854 0.475397 0.517919 0.960728 0.445624 0.531451 0.58467 0.830558 0.518061 0.967682 0.645981 0.434172 0.804732 0.619008 0.662202 0.361871 0.910822 0.520825 0.874702 0.595431 0.200335 0.662647 0.973721 0.543255 0.127631 0.42729 0.665033 0.053961 0.835329 0.252445 0.479004 0.917625 0.863313 0.842948 0.376544 0.102138 0.399186 0.961736 0.355652 0.952279 0.801031 0.162958 0.103783 0.838393 0 [...]
+0.319207 0.085989 0.384067 0.960225 0.165857 0.158795 0.422979 0.182339 0.526226 0.904389 0.019829 0.508917 0.052523 0.318587 0.024593 0.250326 0.254384 0.32221 0.741374 0.102957 0.625312 0.922951 0.471638 0.740021 0.350553 0.387268 0.305736 0.602231 0.095461 0.23256 0.471487 0.977908 0.296993 0.755393 0.905661 0.962915 0.606518 0.754576 0.564318 0.128488 0.567346 0.234619 0.711115 0.714781 0.181255 0.724628 0.267037 0.665243 0.917654 0.571041 0.735383 0.798777 0.601539 0.342773 0.95315  [...]
+0.940637 0.751119 0.266977 0.032685 0.178571 0.756876 0.996509 0.871257 0.944854 0.498018 0.6387 0.873692 0.832353 0.824738 0.211268 0.72725 0.241755 0.655999 0.305797 0.179864 0.826014 0.676606 0.63329 0.934816 0.884298 0.800878 0.584485 0.502264 0.662422 0.171799 0.9164 0.072113 0.924217 0.994092 0.331514 0.224694 0.334819 0.393418 0.107439 0.631148 0.536122 0.583646 0.391112 0.010526 0.920519 0.191792 0.945359 0.862984 0.973485 0.769243 0.163144 0.772931 0.664155 0.855522 0.977347 0.8 [...]
+0.989388 0.134896 0.082883 0.440893 0.052277 0.369981 0.606056 0.238685 0.653356 0.541303 0.451036 0.002892 0.831529 0.224181 0.573298 0.321394 0.371832 0.705852 0.204026 0.390517 0.756407 0.22482 0.363087 0.918905 0.214993 0.280517 0.564182 0.811333 0.870312 0.487348 0.013439 0.221108 0.720967 0.176108 0.413305 0.468903 0.547989 0.344976 0.666707 0.542612 0.052654 0.052035 0.942509 0.90098 0.452734 0.728529 0.890832 0.719608 0.518336 0.607357 0.535487 0.181287 0.024441 0.671101 0.762573 [...]
+0.240899 0.67808 0.8192 0.866322 0.170884 0.267389 0.748755 0.034816 0.506555 0.884098 0.744492 0.870386 0.897733 0.553609 0.917476 0.408789 0.364989 0.974439 0.007945 0.441222 0.354988 0.98442 0.742953 0.01638 0.892208 0.926119 0.564888 0.461358 0.741666 0.999077 0.010557 0.564594 0.372008 0.699895 0.671255 0.667756 0.392563 0.763389 0.708614 0.386798 0.798399 0.345393 0.102997 0.027405 0.402965 0.633623 0.815186 0.435732 0.069112 0.649515 0.148941 0.850088 0.962836 0.880079 0.571852 0. [...]
+0.888144 0.359172 0.880762 0.001717 0.163055 0.983784 0.548588 0.003388 0.754962 0.3435 0.703543 0.198584 0.761282 0.195922 0.138662 0.515366 0.399362 0.574101 0.262538 0.372163 0.887049 0.920408 0.712095 0.777638 0.146055 0.203983 0.209567 0.296912 0.48089 0.100595 0.75072 0.715036 0.340479 0.057104 0.999163 0.07389 0.99162 0.696443 0.144753 0.231635 0.256524 0.901232 0.647357 0.452477 0.164401 0.752742 0.529738 0.639985 0.443425 0.551101 0.088845 0.142789 0.588702 0.793799 0.983831 0.1 [...]
+0.895696 0.078664 0.117439 0.414101 0.580797 0.164195 0.498385 0.028473 0.309629 0.317273 0.242951 0.161662 0.7789 0.229107 0.093678 0.857755 0.136181 0.1916 0.535441 0.789455 0.637316 0.664979 0.459353 0.670089 0.253168 0.975253 0.712285 0.566166 0.19282 0.19094 0.305787 0.557687 0.756812 0.662904 0.819109 0.56653 0.748542 0.768416 0.168643 0.255432 0.047788 0.01495 0.773372 0.28621 0.390724 0.181668 0.836578 0.825917 0.59239 0.581572 0.733279 0.19979 0.573097 0.686009 0.412957 0.49312  [...]
+0.467123 0.689517 0.122476 0.259687 0.78163 0.733343 0.787928 0.676359 0.561276 0.856595 0.418676 0.617006 0.946106 0.573452 0.495384 0.625854 0.485398 0.187241 0.726038 0.034213 0.329525 0.672422 0.333209 0.276157 0.591402 0.985877 0.095591 0.274985 0.691353 0.091778 0.410225 0.632946 0.329871 0.714138 0.109994 0.823162 0.843747 0.757806 0.265194 0.752547 0.789067 0.314958 0.957842 0.92774 0.153741 0.777637 0.553127 0.702585 0.402788 0.24959 0.170036 0.042362 0.778773 0.00521 0.339033 0 [...]
+0.462251 0.960475 0.65368 0.884966 0.207391 0.096262 0.477137 0.436814 0.548891 0.733652 0.841144 0.892482 0.570204 0.353573 0.993329 0.477773 0.513315 0.737838 0.957934 0.312096 0.514076 0.563865 0.586063 0.677139 0.078013 0.603485 0.648653 0.23933 0.932389 0.404646 0.499109 0.817156 0.120245 0.07291 0.50831 0.206216 0.55796 0.449751 0.759675 0.229991 0.357219 0.026727 0.064836 0.837574 0.944953 0.951269 0.600399 0.73049 0.952091 0.509997 0.844355 0.128241 0.649906 0.506406 0.954919 0.0 [...]
+0.289371 0.371867 0.379529 0.809176 0.321144 0.402728 0.530435 0.216014 0.358944 0.282252 0.180947 0.338253 0.304514 0.878498 0.607423 0.707969 0.379792 0.554987 0.046872 0.358961 0.504491 0.757411 0.315462 0.655466 0.784202 0.528528 0.217868 0.831334 0.158109 0.534373 0.937794 0.416083 0.056955 0.649673 0.148467 0.425013 0.446718 0.126335 0.047037 0.842823 0.762891 0.364457 0.986701 0.048704 0.590317 0.403544 0.647158 0.770698 0.737234 0.952969 0.810261 0.946079 0.168746 0.08376 0.90518 [...]
+0.220771 0.11326 0.775718 0.776518 0.613672 0.793242 0.805605 0.360414 0.775822 0.222588 0.027266 0.293432 0.453874 0.875207 0.706569 0.909285 0.412871 0.922171 0.316547 0.78319 0.42004 0.77157 0.532316 0.433326 0.623164 0.071648 0.39253 0.338269 0.623595 0.649158 0.813871 0.35354 0.720007 0.696424 0.324986 0.392033 0.138483 0.899332 0.473488 0.885188 0.311719 0.512418 0.36323 0.208431 0.155399 0.738981 0.537033 0.844159 0.899691 0.900922 0.934417 0.657585 0.38566 0.841756 0.271955 0.356 [...]
+0.383776 0.712377 0.047015 0.396804 0.277859 0.988187 0.688187 0.318683 0.566617 0.05154 0.852415 0.724155 0.236617 0.439857 0.370971 0.140542 0.941667 0.136748 0.593575 0.579226 0.14561 0.697882 0.827463 0.240241 0.49495 0.007834 0.411564 0.381027 0.117771 0.646089 0.055569 0.935482 0.237066 0.262801 0.422416 0.868709 0.953265 0.181391 0.798688 0.973214 0.914625 0.792876 0.240598 0.377233 0.43788 0.354982 0.047701 0.443017 0.608183 0.704206 0.559212 0.807757 0.482753 0.305721 0.33745 0. [...]
+0.805546 0.372548 0.135596 0.682719 0.457638 0.322739 0.519651 0.44402 0.108743 0.800549 0.55603 0.309951 0.57401 0.395864 0.979996 0.678189 0.246109 0.615841 0.929922 0.535302 0.158355 0.566022 0.640591 0.860261 0.63975 0.702812 0.272636 0.030676 0.611574 0.50316 0.117615 0.680115 0.170759 0.106315 0.312057 0.448063 0.995594 0.934509 0.72994 0.803034 0.652122 0.423914 0.310026 0.12105 0.581515 0.160101 0.77916 0.982779 0.733688 0.940058 0.542491 0.994506 0.404051 0.831699 0.36865 0.1979 [...]
+0.377822 0.168493 0.118213 0.764432 0.312736 0.146616 0.075813 0.361953 0.043789 0.160042 0.665977 0.788541 0.033348 0.881946 0.193125 0.269327 0.839537 0.685655 0.163788 0.550451 0.45887 0.306384 0.878415 0.338804 0.467059 0.899305 0.485169 0.124594 0.042776 0.751322 0.336272 0.511461 0.169509 0.287266 0.535257 0.645792 0.53959 0.145521 0.651501 0.873362 0.863027 0.679981 0.718808 0.63747 0.649875 0.284895 0.723144 0.197826 0.555947 0.227262 0.385004 0.451679 0.756873 0.63359 0.618689 0 [...]
+0.741087 0.316799 0.858432 0.459897 0.147695 0.558202 0.638995 0.107143 0.247179 0.335311 0.2307 0.615256 0.224283 0.877051 0.946913 0.539219 0.560584 0.309181 0.208998 0.489833 0.039475 0.272123 0.410941 0.170213 0.564459 0.807419 0.778974 0.105533 0.022003 0.069304 0.567439 0.291889 0.817881 0.175253 0.379141 0.851091 0.651529 0.783865 0.096181 0.443546 0.187976 0.624579 0.137571 0.405292 0.876373 0.664261 0.216761 0.743232 0.577794 0.480609 0.034317 0.590449 0.631678 0.985253 0.360805 [...]
+0.746919 0.454655 0.676216 0.750305 0.23112 0.50175 0.992466 0.511372 0.040773 0.691096 0.500181 0.405023 0.116623 0.769219 0.198352 0.273809 0.764485 0.306264 0.735802 0.801851 0.798389 0.701581 0.901859 0.314063 0.311784 0.467411 0.344771 0.457353 0.503407 0.76363 0.722461 0.152908 0.937504 0.977435 0.866952 0.044839 0.453824 0.150846 0.438761 0.552408 0.712544 0.953266 0.368372 0.868408 0.409382 0.610379 0.032175 0.747535 0.972482 0.453572 0.820456 0.30638 0.889948 0.19325 0.423617 0. [...]
+0.093514 0.11982 0.072453 0.964495 0.26584 0.854793 0.693116 0.321225 0.207467 0.972039 0.840879 0.467821 0.816608 0.930624 0.766772 0.513579 0.471545 0.411653 0.752684 0.106524 0.774737 0.169478 0.270485 0.109445 0.803286 0.995686 0.600835 0.376021 0.491656 0.983485 0.916415 0.011258 0.89466 0.620825 0.517567 0.717321 0.101063 0.972146 0.46276 0.173513 0.185373 0.634357 0.846995 0.894911 0.807266 0.612482 0.692104 0.032536 0.848792 0.103794 0.17722 0.70767 0.092286 0.154659 0.775061 0.6 [...]
+0.362804 0.71742 0.544439 0.40636 0.805091 0.384257 0.085068 0.85493 0.954501 0.209809 0.437354 0.356873 0.045143 0.067845 0.414997 0.510645 0.337527 0.005746 0.730747 0.059186 0.882901 0.415892 0.469385 0.862538 0.476603 0.162214 0.904437 0.754739 0.956821 0.738498 0.874677 0.879858 0.125481 0.135761 0.884911 0.884087 0.824063 0.644375 0.989215 0.175291 0.643593 0.369709 0.273787 0.169247 0.604047 0.53652 0.962391 0.574607 0.249327 0.213061 0.040513 0.016528 0.691899 0.407564 0.55062 0. [...]
+0.4057 0.82318 0.345248 0.090264 0.257747 0.206389 0.458211 0.265528 0.893683 0.752895 0.971531 0.445183 0.51348 0.868047 0.086466 0.202423 0.990379 0.731364 0.790628 0.335594 0.988547 0.109028 0.042263 0.519319 0.762292 0.510468 0.453563 0.657414 0.332451 0.405615 0.173868 0.122938 0.950985 0.427603 0.578368 0.569764 0.612181 0.215066 0.611058 0.101939 0.886654 0.212771 0.639106 0.310601 0.627076 0.549319 0.543339 0.872706 0.87197 0.464157 0.003543 0.381491 0.220179 0.823667 0.71774 0.4 [...]
+0.125737 0.235998 0.568465 0.573548 0.901032 0.728562 0.55355 0.810139 0.171509 0.294932 0.357326 0.242449 0.647206 0.023177 0.794911 0.665962 0.777553 0.756089 0.802883 0.274254 0.829542 0.020344 0.840852 0.564432 0.0731 0.175559 0.481811 0.629797 0.025112 0.110277 0.960798 0.673648 0.314843 0.844254 0.698703 0.205774 0.498609 0.6449 0.802399 0.356405 0.968517 0.293283 0.113049 0.403667 0.35381 0.172597 0.256754 0.180318 0.213318 0.380514 0.295005 0.682366 0.923297 0.622329 0.025245 0.9 [...]
+0.742143 0.443421 0.71298 0.530871 0.205227 0.423717 0.573459 0.101758 0.757463 0.387846 0.831234 0.46418 0.418727 0.963865 0.654475 0.27438 0.284935 0.819097 0.0947 0.448001 0.925116 0.08269 0.212722 0.360264 0.317602 0.184579 0.92847 0.068628 0.520115 0.487071 0.847351 0.374638 0.30465 0.808722 0.561636 0.010937 0.726154 0.792829 0.151246 0.045048 0.617423 0.972348 0.186991 0.735351 0.058922 0.98218 0.623521 0.126463 0.568564 0.664558 0.092363 0.189812 0.692017 0.628186 0.686633 0.3056 [...]
+0.230195 0.64335 0.14341 0.51959 0.542245 0.262847 0.418798 0.854795 0.305532 0.89197 0.337472 0.262743 0.857071 0.157373 0.342978 0.393223 0.564562 0.657881 0.410209 0.944049 0.585113 0.758335 0.727042 0.212074 0.753439 0.517226 0.532506 0.780791 0.842518 0.159527 0.489497 0.505214 0.235004 0.593229 0.765176 0.863671 0.731239 0.152594 0.225052 0.61384 0.973548 0.355739 0.088255 0.039338 0.026742 0.647227 0.223266 0.371908 0.796519 0.640411 0.651611 0.315562 0.383849 0.866209 0.186769 0. [...]
+0.436206 0.509566 0.089729 0.952082 0.706146 0.029356 0.761547 0.815389 0.051641 0.771511 0.001778 0.502968 0.98685 0.070449 0.406085 0.927633 0.711525 0.570352 0.616149 0.223886 0.578234 0.614703 0.368258 0.373835 0.466187 0.868211 0.736818 0.57335 0.733643 0.800186 0.214396 0.963479 0.812289 0.119857 0.870044 0.4806 0.716852 0.845557 0.940007 0.342488 0.182166 0.887211 0.195595 0.328086 0.415686 0.410399 0.308941 0.129047 0.443458 0.854702 0.092741 0.652748 0.85388 0.730822 0.75011 0.6 [...]
+0.789365 0.93904 0.10789 0.958482 0.495634 0.931516 0.010221 0.628782 0.118104 0.242244 0.189603 0.900298 0.706622 0.579881 0.429478 0.263048 0.708448 0.229921 0.841696 0.657567 0.78801 0.002175 0.501738 0.868945 0.341149 0.474877 0.920705 0.424219 0.67292 0.583209 0.374916 0.014297 0.300235 0.45326 0.860117 0.475862 0.191121 0.986848 0.53553 0.213312 0.731906 0.738253 0.878548 0.098405 0.579324 0.844938 0.533894 0.586057 0.172385 0.933092 0.889229 0.138364 0.444532 0.509173 0.01951 0.66 [...]
+0.109114 0.974896 0.956129 0.294099 0.355256 0.72563 0.035853 0.280134 0.684531 0.791653 0.882039 0.024586 0.488425 0.841264 0.145937 0.10068 0.490129 0.031581 0.056503 0.258361 0.373106 0.302845 0.108486 0.5801 0.171662 0.314412 0.545488 0.499067 0.577741 0.780809 0.277508 0.03182 0.627326 0.650444 0.999085 0.342331 0.27858 0.946146 0.19965 0.471105 0.861314 0.821461 0.926608 0.32846 0.392045 0.059575 0.506497 0.459561 0.781035 0.136625 0.93252 0.118757 0.153065 0.104023 0.46528 0.95562 [...]
+0.836002 0.827193 0.996552 0.024703 0.953435 0.341084 0.065277 0.277811 0.833529 0.498148 0.062635 0.80388 0.821239 0.574791 0.068077 0.743675 0.704301 0.777973 0.396517 0.231127 0.202163 0.811023 0.01528 0.153811 0.28487 0.011146 0.968725 0.158564 0.274304 0.041969 0.985214 0.356145 0.517309 0.716884 0.450016 0.872504 0.977034 0.053317 0.1058 0.867473 0.142889 0.805961 0.105233 0.490638 0.46558 0.581295 0.118511 0.947168 0.945556 0.530409 0.215156 0.395922 0.446113 0.221472 0.307134 0.4 [...]
+0.942194 0.411707 0.078379 0.049933 0.913037 0.42158 0.097928 0.643597 0.65378 0.421444 0.00637 0.309736 0.304003 0.446556 0.319939 0.504589 0.692053 0.659191 0.231566 0.25624 0.521108 0.814406 0.523957 0.251987 0.580201 0.948366 0.160818 0.276986 0.913325 0.606164 0.980662 0.998319 0.31727 0.309697 0.80587 0.540383 0.951678 0.202285 0.591712 0.629833 0.341781 0.744433 0.273912 0.933767 0.216621 0.428745 0.640352 0.082565 0.229681 0.215321 0.44104 0.463779 0.273858 0.93146 0.003495 0.305 [...]
+0.775452 0.011643 0.99337 0.701514 0.579841 0.15777 0.455278 0.69399 0.033088 0.94542 0.039461 0.735383 0.848566 0.5788 0.683358 0.703848 0.606189 0.499291 0.121051 0.355476 0.599907 0.230759 0.435919 0.461303 0.73284 0.128334 0.01076 0.391606 0.017175 0.554264 0.706098 0.666818 0.495129 0.75004 0.124662 0.206625 0.426052 0.933591 0.5938 0.904594 0.980379 0.033814 0.869992 0.442224 0.119289 0.930788 0.23361 0.954376 0.12248 0.582139 0.869285 0.304701 0.51549 0.474071 0.492676 0.69847 0.9 [...]
+0.535813 0.973848 0.606951 0.288497 0.065908 0.978184 0.961664 0.720938 0.546267 0.613427 0.530642 0.100451 0.960832 0.474271 0.953049 0.718522 0.577381 0.978895 0.322078 0.865768 0.789219 0.929988 0.40925 0.63757 0.38482 0.797004 0.657725 0.502899 0.994425 0.177626 0.786222 0.741533 0.204597 0.995101 0.983582 0.079837 0.699248 0.893559 0.322994 0.937963 0.181282 0.233425 0.960518 0.345369 0.902518 0.938805 0.207973 0.540874 0.928331 0.589056 0.791689 0.606423 0.106662 0.382628 0.442213  [...]
+0.973107 0.35822 0.739901 0.810863 0.858741 0.249012 0.162959 0.390652 0.56517 0.135617 0.255785 0.963522 0.086071 0.854951 0.85564 0.502862 0.607634 0.943549 0.378441 0.272886 0.263895 0.129984 0.339677 0.034504 0.409323 0.886198 0.837325 0.470427 0.380006 0.168347 0.185284 0.536688 0.657127 0.222883 0.49263 0.850379 0.730911 0.133943 0.496298 0.29655 0.365376 0.043239 0.254701 0.169426 0.583868 0.286865 0.344116 0.351432 0.554894 0.021959 0.070992 0.591446 0.069511 0.659442 0.569689 0. [...]
+0.643751 0.601446 0.193451 0.204007 0.215811 0.316519 0.374416 0.217011 0.158158 0.897193 0.801948 0.740231 0.848169 0.543064 8.7e-05 0.881351 0.403383 0.610869 0.304 0.788638 0.471538 0.506133 0.705177 0.28371 0.352281 0.981354 0.511474 0.282159 0.291498 0.92179 0.481436 0.069617 0.059579 0.810499 0.365167 0.247636 0.246525 0.708872 0.27944 0.503616 0.737213 0.500808 0.300294 0.048637 0.408323 0.827609 0.8225 0.25502 0.930178 0.425052 0.577134 0.66946 0.484879 0.435218 0.293309 0.919327 [...]
+0.184821 0.017343 0.52981 0.516444 0.358201 0.657183 0.854926 0.300591 0.990741 0.79564 0.962953 0.84059 0.154358 0.325967 0.237701 0.578293 0.460098 0.965395 0.573029 0.23826 0.599323 0.389932 0.111708 0.458729 0.533469 0.350086 0.073878 0.253583 0.05174 0.414693 0.948573 0.217389 0.854103 0.359614 0.510043 0.111222 0.16462 0.942092 0.741326 0.260935 0.814196 0.900458 0.321693 0.357363 0.442493 0.109491 0.379575 0.103831 0.424377 0.913946 0.04372 0.646412 0.490239 0.908664 0.879799 0.11 [...]
+0.681648 0.89931 0.066222 0.781937 0.933579 0.598369 0.635111 0.530345 0.337923 0.551939 0.912126 0.434847 0.196485 0.991662 0.802043 0.498673 0.421287 0.414618 0.880582 0.768323 0.693125 0.121743 0.802894 0.224113 0.115081 0.501828 0.028881 0.464877 0.247053 0.832219 0.423898 0.607449 0.654457 0.523614 0.073669 0.60233 0.066626 0.284548 0.656551 0.436858 0.889138 0.277844 0.109379 0.214602 0.595532 0.52803 0.280515 0.773168 0.871585 0.536692 0.010349 0.608553 0.220115 0.12577 0.063523 0 [...]
+0.234843 0.764005 0.432776 0.820991 0.04415 0.443493 0.115055 0.618893 0.880951 0.478778 0.929825 0.242942 0.893386 0.212835 0.238691 0.284008 0.507292 0.148011 0.272678 0.276409 0.470982 0.875165 0.959011 0.008452 0.344564 0.474815 0.092729 0.032106 0.74381 0.734315 0.857043 0.139416 0.287119 0.40899 0.224857 0.049578 0.239594 0.883898 0.053634 0.871105 0.929348 0.367909 0.220448 0.396149 0.701085 0.567352 0.036069 0.456962 0.210176 0.27716 0.922749 0.951187 0.675592 0.986144 0.796564 0 [...]
+0.58399 0.485128 0.811074 0.302877 0.177362 0.65745 0.947126 0.473935 0.725296 0.499037 0.972609 0.210436 0.090005 0.324417 0.699949 0.350062 0.405047 0.680616 0.981145 0.064709 0.24894 0.050596 0.126674 0.185954 0.958081 0.902962 0.059411 0.735424 0.896685 0.321658 0.463546 0.675325 0.755266 0.53899 0.159781 0.854574 0.04087 0.865084 0.17924 0.221545 0.174033 0.545614 0.914778 0.793886 0.234574 0.273117 0.989998 0.933285 0.297493 0.28737 0.955037 0.459572 0.876895 0.885963 0.30324 0.101 [...]
+0.401659 0.929029 0.229356 0.964169 0.807049 0.76725 0.800856 0.467087 0.597703 0.571864 0.05277 0.650708 0.052827 0.212571 0.221217 0.184622 0.660848 0.012575 0.519078 0.960215 0.624738 0.626652 0.737826 0.511309 0.7909 0.553438 0.184303 0.702159 0.765055 0.017223 0.34008 0.994946 0.153633 0.665663 0.70121 0.788504 0.212744 0.63612 0.315121 0.240645 0.583118 0.011085 0.429653 0.868138 0.05048 0.389139 0.991403 0.923521 0.072295 0.756724 0.574982 0.453623 0.288031 0.651404 0.208443 0.917 [...]
+0.94208 0.427346 0.836807 0.184601 0.546026 0.509833 0.288715 0.961563 0.641616 0.662297 0.216309 0.124278 0.58653 0.946576 0.201221 0.669568 0.572078 0.404899 0.484679 0.715227 0.01772 0.562602 0.318726 0.674985 0.657941 0.576379 0.046658 0.791885 0.442555 0.882439 0.607532 0.937385 0.264349 0.855658 0.865318 0.358426 0.06211 0.320081 0.264505 0.988376 0.626098 0.825142 0.917612 0.450779 0.148307 0.854193 0.534254 0.869752 0.819172 0.346026 0.938668 0.082784 0.717693 0.78835 0.561071 0. [...]
+0.791336 0.049402 0.643463 0.723384 0.057437 0.169925 0.751765 0.776179 0.261976 0.376857 0.693493 0.019895 0.834564 0.45755 0.906798 0.830058 0.856553 0.897203 0.07552 0.973234 0.743901 0.033757 0.971308 0.650438 0.897199 0.551835 0.011417 0.521542 0.060612 0.704779 0.954407 0.201394 0.193349 0.559836 0.347985 0.682266 0.48126 0.871713 0.825311 0.671503 0.526011 0.421484 0.189922 0.39309 0.097707 0.809753 0.797766 0.86745 0.268344 0.514626 0.157788 0.905004 0.313445 0.704516 0.498872 0. [...]
+0.643838 0.484277 0.163063 0.486755 0.058573 0.945662 0.912823 0.907736 0.315728 0.180138 0.645746 0.317588 0.417097 0.893415 0.921872 0.496709 0.719883 0.734784 0.974406 0.160419 0.324907 0.576419 0.898614 0.83071 0.120573 0.865308 0.220972 0.72745 0.373924 0.410807 0.778386 0.586099 0.931659 0.544869 0.796293 0.67611 0.410863 0.395234 0.708626 0.04223 0.460512 0.224573 0.063175 0.630143 0.511545 0.014266 0.02768 0.389183 0.14053 0.692519 0.411303 0.568722 0.787816 0.487915 0.478375 0.0 [...]
+0.308662 0.721073 0.419494 0.015413 0.777359 0.093159 0.540928 0.022891 0.064064 0.643813 0.572758 0.440188 0.49235 0.274611 0.423407 0.913552 0.778194 0.047708 0.044952 0.511322 0.46381 0.659006 0.98001 0.943337 0.356794 0.887116 0.609298 0.518807 0.549203 0.553406 0.796459 0.589946 0.092198 0.05436 0.974255 0.902121 0.269891 0.192583 0.057643 0.551292 0.950742 0.745981 0.664035 0.370205 0.543187 0.300752 0.877027 0.399058 0.183157 0.016739 0.414947 0.048914 0.901527 0.26395 0.768743 0. [...]
+0.373518 0.331494 0.877936 0.732507 0.16336 0.096856 0.246311 0.764089 0.257975 0.443287 0.556981 0.17398 0.339148 0.133885 0.095168 0.063518 0.898594 0.628365 0.451298 0.366704 0.872078 0.087695 0.688922 0.306974 0.381461 0.294291 0.587603 0.808718 0.875814 0.216644 0.605136 0.682559 0.344148 0.458378 0.091488 0.920028 0.892818 0.536813 0.51576 0.096358 0.861727 0.008553 0.87762 0.363361 0.351458 0.308281 0.06843 0.816823 0.819429 0.18123 0.421802 0.307684 0.517012 0.319707 0.925812 0.0 [...]
+0.285341 0.509049 0.207017 0.987405 0.110839 0.630232 0.635728 0.332467 0.518069 0.338591 0.471766 0.093759 0.45714 0.951289 0.788391 0.595363 0.236378 0.47629 0.744718 0.632868 0.687128 0.485508 0.321884 0.812699 0.634382 0.496851 0.198624 0.365725 0.806734 0.360617 0.68048 0.753792 0.850391 0.772081 0.166155 0.063838 0.801134 0.237999 0.295766 0.776529 0.095864 0.382543 0.900987 0.363086 0.775315 0.061557 0.008743 0.654706 0.611276 0.517218 0.794682 0.509379 0.609602 0.595078 0.434156  [...]
+0.036788 0.743255 0.075087 0.271102 0.90188 0.253829 0.650606 0.176925 0.8654 0.771312 0.943939 0.098137 0.774658 0.32278 0.2127 0.634044 0.184729 0.054275 0.786818 0.699374 0.644239 0.436793 0.37813 0.051198 0.707821 0.882751 0.155942 0.250345 0.820111 0.107175 0.008294 0.675954 0.692465 0.27614 0.221678 0.093614 0.9385 0.801133 0.637058 0.433695 0.860064 0.508814 0.065996 0.152886 0.443562 0.577611 0.52804 0.059389 0.558786 0.613345 0.085047 0.167197 0.187117 0.218392 0.29716 0.657988  [...]
+0.364828 0.727638 0.711259 0.668508 0.074735 0.322505 0.94762 0.481193 0.049745 0.637051 0.379289 0.132208 0.565797 0.47823 0.791209 0.6224 0.213112 0.194134 0.532446 0.458859 0.280935 0.629243 0.873227 0.242437 0.636564 0.631092 0.459091 0.16926 0.062026 0.253499 0.577283 0.313014 0.127584 0.663644 0.866406 0.423876 0.630642 0.974992 0.288706 0.608828 0.552472 0.961007 0.732015 0.910999 0.865014 0.737236 0.785815 0.476488 0.541205 0.467881 0.171307 0.806246 0.594241 0.222728 0.238068 0. [...]
+0.38621 0.128011 0.205603 0.853611 0.382808 0.098941 0.383147 0.922005 0.696354 0.070137 0.539835 0.900542 0.29303 0.086397 0.992845 0.904248 0.611085 0.136161 0.839773 0.124403 0.967304 0.185877 0.863909 0.392208 0.693962 0.92141 0.234917 0.95065 0.573254 0.443545 0.345139 0.834945 0.140262 0.303093 0.538281 0.386638 0.037491 0.216703 0.430437 0.412527 0.147961 0.939758 0.409589 0.025492 0.231401 0.710248 0.784544 0.998162 0.635698 0.407912 0.219146 0.942077 0.955812 0.025108 0.358395 0 [...]
+0.954524 0.623986 0.355843 0.435076 0.085426 0.084408 0.545412 0.880148 0.329409 0.302444 0.090694 0.618987 0.172385 0.270919 0.206661 0.975371 0.688043 0.762949 0.451425 0.352136 0.283201 0.36546 0.351859 0.991702 0.395502 0.659458 0.210932 0.626242 0.20098 0.406837 0.811175 0.402717 0.82319 0.685309 0.678788 0.087333 0.830012 0.742873 0.688376 0.43143 0.044049 0.732643 0.652583 0.434826 0.210295 0.346296 0.767141 0.610516 0.235906 0.925719 0.019836 0.726238 0.016535 0.927108 0.499096 0 [...]
+0.67686 0.866771 0.79876 0.27906 0.239956 0.633497 0.034097 0.067775 0.892007 0.194091 0.396558 0.982284 0.046668 0.090497 0.40964 0.819561 0.049868 0.60349 0.931537 0.242832 0.38759 0.109868 0.795356 0.601158 0.960673 0.784582 0.788022 0.427402 0.807438 0.505683 0.661464 0.13549 0.146421 0.985938 0.284141 0.739441 0.381822 0.96388 0.280739 0.320587 0.88871 0.911409 0.249208 0.047893 0.427985 0.131367 0.47527 0.523366 0.600589 0.015198 0.455338 0.068262 0.840261 0.709033 0.898175 0.54662 [...]
+0.776346 0.938501 0.621471 0.31689 0.489797 0.480067 0.962143 0.004993 0.652276 0.903242 0.469472 0.178433 0.554414 0.388041 0.065281 0.573441 0.380841 0.75535 0.922005 0.555396 0.408579 0.409641 0.572586 0.990945 0.252825 0.007348 0.664927 0.646502 0.134652 0.277919 0.654597 0.606429 0.569408 0.237995 0.222698 0.087394 0.633658 0.745778 0.685659 0.678581 0.886715 0.877578 0.343192 0.343242 0.851035 0.230008 0.317625 0.454684 0.185231 0.192129 0.102865 0.89859 0.497424 0.246937 0.288773  [...]
+0.823655 0.424575 0.656548 0.356229 0.867883 0.429077 0.463731 0.173161 0.258667 0.920056 0.068609 0.938788 0.417468 0.180456 0.884627 0.310316 0.026614 0.481875 0.344198 0.100811 0.012099 0.718093 0.397453 0.07801 0.082295 0.865016 0.001154 0.531987 0.00445 0.19835 0.536637 0.205028 0.703954 0.551224 0.574231 0.263111 0.301798 0.909964 0.076319 0.041794 0.985687 0.391903 0.626852 0.251493 0.195286 0.215245 0.881853 0.346608 0.1355 0.474656 0.135825 0.538193 0.229741 0.904686 0.124485 0. [...]
+0.102851 0.533553 0.938129 0.06929 0.738474 0.802773 0.211417 0.886498 0.190125 0.981329 0.164539 0.132968 0.158636 0.97949 0.471439 0.97542 0.902292 0.262303 0.566464 0.343012 0.598725 0.043714 0.506486 0.836177 0.614592 0.592282 0.652474 0.680341 0.520514 0.384867 0.83813 0.410138 0.572242 0.117568 0.170456 0.597186 0.850351 0.944756 0.460665 0.875609 0.789879 0.256032 0.987989 0.952183 0.386648 0.715836 0.611898 0.577054 0.54385 0.034915 0.282049 0.772322 0.716886 0.940823 0.543939 0. [...]
+0.049806 0.763657 0.440371 0.330542 0.89884 0.512177 0.740899 0.512381 0.039038 0.054507 0.918176 0.685691 0.416411 0.064624 0.338699 0.595914 0.077873 0.326905 0.459851 0.102731 0.758526 0.614083 0.521096 0.401593 0.241347 0.861941 0.149879 0.744581 0.694454 0.226893 0.791125 0.228022 0.364623 0.807898 0.188145 0.565904 0.521282 0.626554 0.657461 0.247068 0.627949 0.234415 0.958693 0.739127 0.710675 0.013128 0.53748 0.252295 0.893549 0.38229 0.08968 0.378906 0.87686 0.770446 0.273568 0. [...]
+0.989545 0.337153 0.285923 0.106593 0.948271 0.670342 0.142299 0.771464 0.643526 0.401798 0.043924 0.790563 0.799933 0.759438 0.544565 0.771458 0.451328 0.618668 0.926331 0.101538 0.002671 0.957241 0.722609 0.83936 0.007055 0.852432 0.644794 0.016559 0.544222 0.741968 0.458505 0.275679 0.391454 0.237306 0.631981 0.897353 0.50982 0.560256 0.126908 0.966986 0.056854 0.56204 0.914046 0.531609 0.013806 0.736254 0.675082 0.661515 0.682334 0.557327 0.04196 0.860105 0.988942 0.147381 0.248605 0 [...]
+0.425762 0.588415 0.424686 0.200131 0.630336 0.160053 0.727755 0.091156 0.464877 0.661788 0.986952 0.383775 0.007433 0.148519 0.273507 0.874467 0.962093 0.538277 0.544636 0.419233 0.433231 0.740739 0.68908 0.122502 0.284028 0.680089 0.86419 0.43267 0.426621 0.885042 0.519086 0.018561 0.14906 0.932405 0.044078 0.765023 0.30839 0.79915 0.061668 0.86731 0.674114 0.44318 0.200424 0.282757 0.886768 0.048439 0.055477 0.253513 0.02146 0.87807 0.233228 0.694187 0.715232 0.271887 0.087986 0.57920 [...]
+0.574825 0.481972 0.92193 0.448396 0.36754 0.690181 0.651531 0.550652 0.077289 0.51852 0.564091 0.429715 0.736817 0.085749 0.688063 0.534885 0.250092 0.368431 0.706961 0.831067 0.418987 0.357797 0.919731 0.930388 0.286193 0.415994 0.94311 0.703664 0.621706 0.273918 0.882417 0.098522 0.596929 0.220439 0.698206 0.824904 0.87154 0.126083 0.025754 0.21832 0.686111 0.161819 0.445864 0.623719 0.763297 0.375666 0.169543 0.849679 0.381151 0.224747 0.772239 0.647284 0.767878 0.829502 0.195482 0.9 [...]
+0.58502 0.437885 0.290123 0.687687 0.87989 0.833614 0.008276 0.552435 0.477668 0.938519 0.115209 0.497543 0.676395 0.836677 0.229749 0.417318 0.824553 0.085463 0.094572 0.055184 0.929669 0.68263 0.355555 0.121698 0.358349 0.976827 0.084437 0.038949 0.643374 0.306881 0.030119 0.013697 0.152223 0.305724 0.977254 0.659256 0.19319 0.302143 0.816346 0.116579 0.494232 0.209842 0.478952 0.462893 0.789443 0.93612 0.752571 0.532203 0.715788 0.244644 0.047815 0.231671 0.373976 0.320163 0.840748 0. [...]
+0.091645 0.34337 0.93922 0.734177 0.785525 0.257613 0.634074 0.519028 0.22233 0.397821 0.461293 0.348466 0.198935 0.898662 0.488579 0.297449 0.806605 0.661728 0.235258 0.130025 0.81969 0.178906 0.916786 0.899158 0.338245 0.642215 0.409075 0.74987 0.962977 0.920157 0.741819 0.787064 0.92649 0.266324 0.870888 0.517657 0.431434 0.385273 0.228449 0.829127 0.293784 0.194708 0.840912 0.179485 0.47476 0.259559 0.617396 0.725276 0.989899 0.858129 0.308773 0.044471 0.120092 0.337761 0.547575 0.47 [...]
+0.655114 0.168117 0.976051 0.086246 0.761442 0.54105 0.020532 0.673372 0.885845 0.351672 0.342417 0.939024 0.614344 0.224615 0.658257 0.197425 0.715924 0.096201 0.064175 0.706145 0.772342 0.11659 0.921252 0.997249 0.18881 0.562716 0.175001 0.731028 0.293814 0.958068 0.017775 0.053136 0.641739 0.202914 0.687592 0.681944 0.505663 0.429119 0.963548 0.30349 0.426557 0.081773 0.696167 0.105333 0.486826 0.507309 0.800702 0.472296 0.145194 0.791036 0.766421 0.777836 0.694708 0.838581 0.548223 0 [...]
+0.553094 0.068344 0.601112 0.424232 0.56045 0.789084 0.363195 0.294031 0.493489 0.641537 0.109077 0.143506 0.475609 0.475739 0.440736 0.127736 0.204031 0.749867 0.696838 0.257025 0.216311 0.402124 0.580092 0.557352 0.538738 0.456597 0.294338 0.008323 0.161039 0.351568 0.973228 0.971292 0.82607 0.635006 0.838454 0.283302 0.344546 0.475424 0.871445 0.352367 0.671581 0.080963 0.28642 0.575451 0.106111 0.459751 0.457225 0.853263 0.046778 0.583085 0.277828 0.46899 0.993986 0.758915 0.963238 0 [...]
+0.501472 0.307938 0.425089 0.9053 0.627677 0.343856 0.629361 0.806345 0.614151 0.810089 0.293746 0.499236 0.98858 0.769383 0.493436 0.899466 0.182888 0.461838 0.625157 0.937393 0.416126 0.420372 0.267088 0.032192 0.322282 0.734351 0.753753 0.068916 0.540492 0.514037 0.111865 0.603683 0.597228 0.92211 0.851393 0.37347 0.760271 0.541032 0.317319 0.942529 0.340126 0.146559 0.734461 0.550026 0.452213 0.024351 0.001735 0.488274 0.050189 0.039977 0.544536 0.016927 0.504044 0.522817 0.757956 0. [...]
+0.73918 0.535283 0.829425 0.801207 0.121844 0.738842 0.451266 0.283266 0.695698 0.846101 0.024382 0.504419 0.589887 0.838805 0.298132 0.285153 0.206737 0.357729 0.132849 0.477453 0.637919 0.525973 0.540828 0.515657 0.199411 0.59733 0.65247 0.040054 0.045088 0.313127 0.750992 0.920873 0.255409 0.71299 0.52416 0.898208 0.164413 0.922142 0.905331 0.460019 0.24753 0.385256 0.028131 0.698653 0.552494 0.198019 0.201824 0.128072 0.161156 0.570471 0.846405 0.315658 0.907545 0.068362 0.482246 0.1 [...]
+0.380261 0.763916 0.65471 0.665878 0.903099 0.1134 0.704403 0.430239 0.631174 0.147711 0.370724 0.204233 0.727847 0.540973 0.440525 0.955073 0.290388 0.215194 0.329797 0.404477 0.955923 0.644386 0.684179 0.245306 0.658968 0.436622 0.025207 0.407124 0.677353 0.938131 0.376287 0.717372 0.427715 0.67008 0.152036 0.460617 0.899218 0.885916 0.289399 0.038641 0.538736 0.167947 0.903465 0.090729 0.364472 0.653193 0.57029 0.373622 0.937936 0.860575 0.255882 0.592013 0.093474 0.856049 0.936403 0. [...]
+0.235808 0.092324 0.91103 0.722544 0.450063 0.704885 0.283907 0.4977 0.087933 0.037049 0.998886 0.525666 0.762506 0.410081 0.920107 0.209218 0.685173 0.263423 0.361182 0.835098 0.522668 0.496727 0.74932 0.390023 0.276221 0.489364 0.179765 0.113198 0.737423 0.173248 0.549127 0.410169 0.484736 0.439229 0.965265 0.547296 0.575305 0.936841 0.007525 0.126401 0.265024 0.268452 0.073805 0.877742 0.043138 0.409391 0.648211 0.584759 0.122329 0.463653 0.118111 0.408766 0.438037 0.430257 0.540904 0 [...]
+0.085416 0.101834 0.506908 0.908653 0.188493 0.174548 0.749465 0.733908 0.395233 0.298182 0.665088 0.583107 0.057559 0.065587 0.163144 0.569822 0.787481 0.429894 0.069412 0.621291 0.771214 0.160628 0.17542 0.778836 0.192235 0.847619 0.597718 0.16565 0.368528 0.839922 0.22694 0.294161 0.00288 0.499522 0.267467 0.004209 0.683921 0.660741 0.2129 0.325315 0.997436 0.60039 0.57305 0.62189 0.191743 0.456461 0.665878 0.258038 0.790752 0.225135 0.962764 0.203307 0.652854 0.495661 0.111927 0.5342 [...]
+0.066585 0.333637 0.771841 0.146953 0.140884 0.103663 0.838986 0.316451 0.925596 0.380687 0.505585 0.316604 0.250943 0.085919 0.912159 0.21847 0.359164 0.939715 0.420535 0.605726 0.52858 0.167971 0.902474 0.360771 0.539317 0.452282 0.977455 0.628487 0.778212 0.045861 0.789645 0.299787 0.946942 0.436342 0.148569 0.633249 0.645035 0.907593 0.51356 0.003557 0.132081 0.108775 0.296828 0.245264 0.380658 0.31869 0.376111 0.224598 0.442448 0.761415 0.318006 0.493301 0.772137 0.410301 0.186385 0 [...]
+0.945182 0.778304 0.02133 0.179268 0.67049 0.892179 0.538878 0.289823 0.704892 0.622257 0.968619 0.33051 0.481276 0.507883 0.933629 0.552585 0.180781 0.454026 0.213599 0.725985 0.97592 0.344696 0.13667 0.90611 0.742607 0.83988 0.855318 0.921464 0.858837 0.476496 0.129832 0.886927 0.129755 0.082317 0.330676 0.492457 0.846642 0.715945 0.576158 0.154167 0.468039 0.275979 0.67639 0.703137 0.00542 0.520223 0.009586 0.315825 0.630104 0.719202 0.046097 0.684544 0.437467 0.350568 0.696668 0.7490 [...]
+0.057869 0.321695 0.528094 0.315203 0.06575 0.459165 0.989828 0.755494 0.688875 0.974864 0.088166 0.169682 0.226721 0.984259 0.596731 0.407276 0.404931 0.479349 0.178691 0.310381 0.895292 0.562112 0.976002 0.51786 0.833403 0.422023 0.007946 0.520458 0.022474 0.102042 0.466726 0.51463 0.157983 0.236559 0.158297 0.444779 0.476749 0.281588 0.741182 0.272247 0.585415 0.30101 0.602394 0.49329 0.277597 0.079864 0.458203 0.031643 0.883369 0.891615 0.405175 0.535583 0.507549 0.733116 0.258615 0. [...]
+0.106261 0.288807 0.476996 0.609838 0.499441 0.351016 0.011163 0.117499 0.347207 0.280556 0.960245 0.146897 0.577191 0.708063 0.458962 0.909739 0.350375 0.90422 0.619633 0.753741 0.452554 0.996299 0.202228 0.001904 0.622433 0.751691 0.775805 0.932581 0.673127 0.578977 0.248927 0.395691 0.846946 0.039697 0.497043 0.849361 0.737659 0.37873 0.041516 0.204014 0.185801 0.118866 0.351175 0.214872 0.935756 0.213211 0.165842 0.511435 0.916203 0.868055 0.858585 0.493386 0.168604 0.080158 0.568815 [...]
+0.317034 0.101295 0.724745 0.776073 0.513813 0.115482 0.458191 0.065428 0.279932 0.27567 0.49455 0.995324 0.406903 0.677406 0.066062 0.938147 0.18089 0.834728 0.240769 0.745221 0.004198 0.895687 0.538742 0.327218 0.797165 0.058137 0.578552 0.374443 0.622231 0.431858 0.87547 0.405543 0.182486 0.791547 0.43772 0.776581 0.836159 0.038776 0.38664 0.738637 0.979174 0.593775 0.155759 0.733407 0.257785 0.177366 0.853324 0.083032 0.368446 0.732963 0.078275 0.390373 0.140992 0.941921 0.706421 0.8 [...]
+0.31569 0.534279 0.776073 0.498304 0.296471 0.040785 0.092741 0.44653 0.811833 0.8574 0.723727 0.145163 0.170984 0.319626 0.831108 0.603892 0.082602 0.820316 0.637206 0.18374 0.118316 0.904448 0.970279 0.171911 0.218737 0.740564 0.141488 0.535488 0.556934 0.781224 0.563182 0.921876 0.382652 0.747205 0.160353 0.643036 0.400673 0.416884 0.156143 0.40128 0.775114 0.426045 0.589019 0.234797 0.816882 0.922864 0.677397 0.419529 0.136754 0.010758 0.055605 0.470221 0.517727 0.347634 0.325165 0.4 [...]
+0.315747 0.689583 0.764145 0.494117 0.938681 0.560475 0.614399 0.340546 0.637106 0.000807 0.925065 0.572162 0.580742 0.772799 0.784852 0.777071 0.174757 0.719061 0.355994 0.852252 0.980409 0.217125 0.181459 0.586624 0.911523 0.01955 0.066255 0.298188 0.486818 0.085953 0.283205 0.355004 0.12303 0.128937 0.334452 0.165117 0.316339 0.687985 0.55363 0.936345 0.012689 0.900176 0.252972 0.150243 0.696805 0.995939 0.188511 0.591202 0.697444 0.561297 0.509123 0.602753 0.73295 0.240823 0.56603 0. [...]
+0.488861 0.639089 0.832402 0.964059 0.936094 0.75091 0.067864 0.196792 0.550561 0.5017 0.650962 0.55673 0.096318 0.470459 0.679606 0.14618 0.453163 0.688498 0.936289 0.58951 0.536843 0.00191 0.157687 0.701749 0.677881 0.623367 0.26543 0.884141 0.179785 0.017179 0.188827 0.69264 0.613296 0.768549 0.489216 0.628032 0.286161 0.013939 0.504048 0.246639 0.529048 0.120947 0.366961 0.526297 0.549186 0.658228 0.11135 0.692938 0.394392 0.462661 0.820121 0.613592 0.422959 0.374789 0.047813 0.42000 [...]
+0.809853 0.049744 0.783729 0.661484 0.916207 0.934072 0.506625 0.061837 0.994078 0.593241 0.135246 0.079921 0.245226 0.358586 0.21035 0.50887 0.754972 0.298246 0.834208 0.222266 0.846227 0.171114 0.658873 0.263507 0.454327 0.560976 0.511415 0.718613 0.066872 0.882072 0.670574 0.781725 0.123173 0.746901 0.189307 0.241924 0.984673 0.662807 0.759431 0.141836 0.022005 0.916966 0.448592 0.084782 0.553069 0.990619 0.746523 0.477743 0.643595 0.210533 0.776184 0.376234 0.074413 0.493902 0.754107 [...]
+0.615585 0.666555 0.987598 0.606445 0.691439 0.850004 0.524198 0.475215 0.488316 0.913468 0.234463 0.426941 0.183968 0.63807 0.27143 0.20169 0.033424 0.154937 0.57138 0.307272 0.877776 0.747682 0.830726 0.803133 0.858764 0.172122 0.742104 0.308496 0.278205 0.116492 0.047722 0.087296 0.707209 0.797842 0.860857 0.75733 0.108751 0.451173 0.569207 0.254532 0.939039 0.531941 0.883152 0.254152 0.34488 0.879404 0.041077 0.61598 0.334351 0.179644 0.54896 0.470464 0.985704 0.955782 0.313216 0.960 [...]
+0.556457 0.873174 0.301768 0.765221 0.580342 0.757252 0.448094 0.152176 0.633366 0.367147 0.923051 0.335544 0.287885 0.585772 0.306838 0.18612 0.989622 0.484613 0.488363 0.965122 0.225752 0.567612 0.731255 0.634448 0.428516 0.103511 0.150233 0.755633 0.155337 0.677668 0.928913 0.053308 0.407502 0.861716 0.642215 0.471462 0.408455 0.712107 0.597135 0.438418 0.379469 0.746674 0.218766 0.117879 0.247174 0.343334 0.852819 0.747779 0.228905 0.718117 0.471136 0.969306 0.727332 0.475035 0.02713 [...]
+0.803957 0.881211 0.81365 0.726814 0.058236 0.794041 0.608074 0.659161 0.337867 0.010144 0.520412 0.784368 0.165295 0.45921 0.702494 0.403299 0.797202 0.781463 0.977946 0.705976 0.461765 0.258615 0.300876 0.993755 0.312371 0.296202 0.630461 0.987505 0.546537 0.381415 0.473763 0.046764 0.710498 0.121693 0.82055 0.150403 0.92335 0.716808 0.22736 0.396445 0.451632 0.789651 0.387554 0.44582 0.667434 0.935192 0.178945 0.796649 0.295681 0.210668 0.519427 0.250106 0.701521 0.045192 0.258797 0.1 [...]
+0.961567 0.98544 0.875275 0.363903 0.522885 0.24235 0.491598 0.479734 0.940715 0.594795 0.262624 0.922815 0.250253 0.115165 0.423567 0.958146 0.141112 0.284275 0.968571 0.220875 0.093337 0.352155 0.732142 0.630987 0.456407 0.371905 0.599907 0.773922 0.907869 0.018277 0.456992 0.809102 0.171185 0.470436 0.009294 0.627178 0.679747 0.581049 0.712876 0.130911 0.753922 0.116578 0.062602 0.32203 0.803087 0.293819 0.707279 0.950985 0.972353 0.999335 0.796935 0.039227 0.26253 0.709869 0.006622 0 [...]
+0.948843 0.968158 0.208308 0.789883 0.849013 0.107115 0.344245 0.195774 0.227882 0.191779 0.688346 0.506826 0.157955 0.723421 0.4289 0.886173 0.333687 0.63983 0.954592 0.678489 0.295403 0.132254 0.800821 0.013812 0.819056 0.97719 0.488212 0.965918 0.701638 0.479302 0.646233 0.742354 0.739108 0.82431 0.366874 0.213935 0.206489 0.473147 0.567687 0.035777 0.087258 0.928652 0.349375 0.890793 0.41373 0.247165 0.989138 0.758799 0.965726 0.911196 0.258058 0.850791 0.648124 0.056007 0.417906 0.9 [...]
+0.636725 0.550406 0.137011 0.786911 0.661383 0.50533 0.875869 0.419239 0.988599 0.541763 0.505217 0.262992 0.106677 0.191001 0.429373 0.584696 0.951073 0.457677 0.563837 0.160433 0.419587 0.465209 0.461065 0.316012 0.688565 0.114761 0.889693 0.496666 0.724036 0.232276 0.619734 0.873878 0.059734 0.387481 0.354607 0.685158 0.399742 0.982623 0.93486 0.517634 0.171848 0.196721 0.623468 0.225621 0.022933 0.130884 0.310655 0.313268 0.728738 0.76979 0.035538 0.660954 0.710327 0.818294 0.683571  [...]
+0.34313 0.246131 0.458514 0.777678 0.346308 0.991234 0.925806 0.869676 0.062137 0.039589 0.552695 0.304017 0.720452 0.81121 0.380546 0.303159 0.206936 0.742756 0.247846 0.034507 0.889432 0.913317 0.754561 0.779942 0.509726 0.359381 0.430978 0.672554 0.410819 0.535781 0.961261 0.455786 0.327764 0.198708 0.322951 0.610518 0.566852 0.749287 0.351173 0.75716 0.300897 0.665412 0.150951 0.92951 0.863346 0.671583 0.376094 0.317881 0.693951 0.556646 0.400379 0.397833 0.034218 0.189376 0.193356 0 [...]
+0.050187 0.463844 0.622398 0.072911 0.056205 0.035108 0.19964 0.817373 0.455494 0.007097 0.876358 0.57121 0.525182 0.820608 0.057478 0.428965 0.683607 0.53825 0.029451 0.912399 0.88496 0.332962 0.546249 0.158878 0.820809 0.890601 0.895631 0.950862 0.562812 0.151973 0.167202 0.320797 0.955545 0.992768 0.625826 0.550438 0.351532 0.025408 0.888678 0.671475 0.180934 0.025443 0.805326 0.991546 0.974866 0.251354 0.418381 0.38429 0.530942 0.50295 0.951776 0.604469 0.266779 0.463131 0.81256 0.88 [...]
+0.532154 0.178807 0.361135 0.080597 0.999931 0.476125 0.367728 0.485547 0.403108 0.575269 0.636855 0.815185 0.391279 0.533798 0.954278 0.867278 0.011191 0.329305 0.774201 0.344612 0.035236 0.438606 0.421604 0.568797 0.190746 0.164728 0.000711 0.776889 0.857769 0.275934 0.938232 0.809378 0.861795 0.075101 0.359084 0.877953 0.149759 0.859147 0.967419 0.744305 0.551964 0.247558 0.115912 0.610912 0.281995 0.757013 0.298537 0.728556 0.433774 0.18833 0.12755 0.4785 0.434901 0.25647 0.051301 0. [...]
+0.868164 0.766694 0.468923 0.782563 0.615763 0.657504 0.282638 0.273896 0.003846 0.733984 0.036375 0.833063 0.09296 0.33977 0.021178 0.595741 0.948832 0.89577 0.779519 0.84911 0.190114 0.687621 0.508844 0.364754 0.692047 0.74703 0.4035 0.512157 0.368074 0.096338 0.064154 0.018558 0.895229 0.385957 0.035493 0.96253 0.261148 0.250582 0.591915 0.347801 0.207265 0.585121 0.278208 0.244583 0.876258 0.285874 0.211462 0.633196 0.229391 0.135764 0.875306 0.209888 0.067288 0.562499 0.781391 0.456 [...]
+0.378913 0.234553 0.839313 0.438969 0.84837 0.541208 0.254166 0.372291 0.11873 0.598127 0.81503 0.30821 0.688798 0.743813 0.589357 0.693295 0.457102 0.766677 0.424826 0.933318 0.922242 0.590397 0.695486 0.818288 0.410684 0.026268 0.836682 0.460225 0.620353 0.544542 0.928867 0.913515 0.667263 0.39702 0.902145 0.582074 0.105273 0.093105 0.049819 0.590203 0.000791 0.232309 0.807355 0.271629 0.419175 0.626758 0.510073 0.013355 0.243326 0.294668 0.942148 0.928191 0.444896 0.713565 0.391468 0. [...]
+0.211461 0.961466 0.236623 0.18731 0.128628 0.520657 0.679075 0.108045 0.631025 0.592278 0.224985 0.552231 0.667644 0.903877 0.538572 0.606537 0.157362 0.217779 0.278935 0.386431 0.493791 0.501527 0.486695 0.748678 0.655014 0.197883 0.093829 0.990217 0.262179 0.176837 0.493237 0.581889 0.42891 0.59647 0.915277 0.580828 0.413771 0.223766 0.449005 0.520105 0.730227 0.162597 0.949561 0.047332 0.492974 0.688656 0.404838 0.584214 0.795165 0.344226 0.450181 0.29874 0.218953 0.286625 0.00908 0. [...]
+0.234954 0.519611 0.52184 0.307178 0.165548 0.177596 0.811381 0.820671 0.508361 0.528899 0.363236 0.15909 0.85628 0.753867 0.103954 0.724363 0.365286 0.595508 0.045712 0.580439 0.897923 0.552613 0.028906 0.11258 0.559204 0.824751 0.635159 0.597773 0.862743 0.588744 0.017205 0.017836 0.885208 0.429282 0.794849 0.61719 0.727561 0.425523 0.826089 0.439217 0.389326 0.045978 0.080028 0.14405 0.643827 0.153332 0.988176 0.871638 0.825796 0.555734 0.084681 0.616666 0.291762 0.673023 0.321179 0.9 [...]
+0.543127 0.183312 0.008165 0.236359 0.019852 0.711416 0.606304 0.770684 0.558525 0.604473 0.342879 0.514092 0.496339 0.968505 0.713228 0.870501 0.865396 0.026012 0.603654 0.239087 0.979506 0.951451 0.579915 0.484356 0.855778 0.162784 0.683832 0.775274 0.670938 0.485747 0.693758 0.32618 0.528681 0.538167 0.238546 0.075479 0.730633 0.854927 0.32795 0.046649 0.054198 0.057835 0.00248 0.252874 0.787139 0.303015 0.600695 0.360571 0.13096 0.022777 0.805757 0.812134 0.984527 0.597952 0.542512 0 [...]
+0.888693 0.62475 0.594488 0.636104 0.515617 0.225149 0.771508 0.344981 0.9541 0.433093 0.513048 0.268228 0.422273 0.466353 0.944569 0.289944 0.834607 0.317993 0.206858 0.126594 0.785775 0.672041 0.272606 0.899285 0.07121 0.410055 0.443944 0.441982 0.943785 0.512107 0.080161 0.152476 0.106743 0.697055 0.150265 0.741968 0.929996 0.605897 0.835469 0.362403 0.770107 0.514106 0.07033 0.709634 0.196211 0.315121 0.511644 0.706748 0.613559 0.809036 0.558399 0.285321 0.386926 0.818651 0.377124 0. [...]
+0.737917 0.853247 0.958783 0.867812 0.307994 0.101769 0.882022 0.087429 0.891448 0.60957 0.135323 0.354432 0.77707 0.414918 0.769274 0.696676 0.799722 0.243862 0.465456 0.224807 0.534465 0.106854 0.358155 0.751274 0.457909 0.392276 0.51435 0.614708 0.840429 0.73485 0.771708 0.260797 0.851235 0.083957 0.661154 0.491491 0.778849 0.367078 0.594512 0.287381 0.344823 0.901942 0.228144 0.976361 0.860884 0.92694 0.352425 0.007116 0.444738 0.102056 0.449817 0.115285 0.394852 0.455768 0.961346 0. [...]
+0.18404 0.26064 0.842139 0.261176 0.873533 0.207616 0.599864 0.823666 0.367116 0.209847 0.47795 0.09337 0.176327 0.094407 0.405339 0.575357 0.312071 0.091318 0.106374 0.562011 0.991449 0.146492 0.065516 0.004057 0.016036 0.890235 0.387196 0.95494 0.591424 0.221878 0.451087 0.36848 0.056937 0.47623 0.170025 0.468579 0.723354 0.21916 0.22745 0.232701 0.566431 0.124073 0.066591 0.645474 0.680814 0.691122 0.297727 0.287037 0.308307 0.216934 0.004429 0.611936 0.177118 0.519181 0.525827 0.1626 [...]
+0.939674 0.184135 0.275964 0.881142 0.925478 0.086532 0.948593 0.350421 0.006377 0.97984 0.409629 0.291467 0.344592 0.042241 0.792022 0.220166 0.276567 0.054342 0.134596 0.254856 0.67283 0.862703 0.99107 0.232985 0.126286 0.086447 0.530827 0.038705 0.003604 0.547741 0.832785 0.065065 0.515352 0.769758 0.841002 0.045159 0.102952 0.867958 0.504215 0.229268 0.511924 0.16443 0.506681 0.70136 0.061267 0.150191 0.112754 0.61926 0.220447 0.620066 0.519845 0.103042 0.830627 0.246812 0.102435 0.7 [...]
+0.912959 0.560664 0.165056 0.558987 0.687764 0.719762 0.524519 0.732407 0.417406 0.347963 0.41586 0.602031 0.746971 0.99134 0.482008 0.093464 0.32819 0.368764 0.034716 0.060542 0.59042 0.572658 0.704303 0.425471 0.587245 0.277582 0.576743 0.596642 0.078238 0.255996 0.220694 0.602421 0.501192 0.959215 0.256625 0.008415 0.694151 0.084771 0.670155 0.855555 0.673037 0.123759 0.656807 0.139888 0.932935 0.113375 0.696308 0.902852 0.344198 0.883737 0.143555 0.105242 0.651595 0.554395 0.116067 0 [...]
+0.149912 0.709873 0.780769 0.204585 0.252685 0.011634 0.050907 0.331883 0.203655 0.582932 0.203069 0.256695 0.928026 0.309243 0.586581 0.174053 0.763077 0.347116 0.92723 0.490355 0.441367 0.971584 0.437904 0.680247 0.38712 0.626532 0.270405 0.404184 0.926221 0.571057 0.878565 0.16616 0.162013 0.930605 0.404711 0.891246 0.491589 0.995837 0.760907 0.949206 0.858463 0.213677 0.046096 0.555872 0.99053 0.132572 0.276193 0.858064 0.271902 0.959212 0.919482 0.083301 0.432819 0.683031 0.508916 0 [...]
+0.165302 0.143633 0.359476 0.295162 0.455091 0.733219 0.198284 0.539618 0.66197 0.846243 0.203641 0.192612 0.60736 0.446595 0.08648 0.355915 0.078634 0.63502 0.577054 0.095391 0.21821 0.657574 0.453261 0.005737 0.500324 0.206939 0.561161 0.773279 0.800817 0.790763 0.089631 0.918762 0.311653 0.328135 0.713191 0.488758 0.302236 0.361581 0.71282 0.936322 0.636778 0.607694 0.551171 0.445585 0.620263 0.226832 0.722811 0.424284 0.108232 0.008053 0.937962 0.811799 0.48746 0.905705 0.733994 0.52 [...]
+0.766545 0.161359 0.004738 0.36493 0.927263 0.831339 0.592183 0.684138 0.414078 0.540508 0.541892 0.025146 0.126182 0.604194 0.326518 0.696205 0.049515 0.933257 0.337388 0.041837 0.293841 0.794362 0.985023 0.261428 0.627157 0.305026 0.129739 0.767944 0.180594 0.316824 0.614731 0.379767 0.424815 0.044016 0.53836 0.048397 0.394285 0.078462 0.825851 0.307559 0.889196 0.413979 0.999187 0.02058 0.498611 0.182816 0.50368 0.437616 0.347138 0.725617 0.382664 0.298091 0.790524 0.434659 0.222716 0 [...]
+0.932416 0.720357 0.167895 0.496773 0.019287 0.592144 0.142099 0.141437 0.838882 0.581955 0.89965 0.455682 0.060163 0.331048 0.55529 0.482884 0.208363 0.752639 0.953793 0.081237 0.622348 0.197917 0.225331 0.762593 0.265291 0.998033 0.834 0.866651 0.231104 0.052872 0.869839 0.624554 0.496612 0.005891 0.402422 0.998382 0.337479 0.828793 0.658049 0.953288 0.046392 0.919165 0.232306 0.448411 0.172837 0.50023 0.131356 0.101006 0.474285 0.280333 0.5468 0.165245 0.833819 0.684331 0.93232 0.4471 [...]
+0.160463 0.238092 0.810815 0.147277 0.671326 0.35212 0.760093 0.349252 0.073884 0.795278 0.592896 0.685311 0.347869 0.492293 0.123196 0.179507 0.943794 0.471622 0.351166 0.302508 0.827953 0.018745 0.779164 0.19231 0.718576 0.904945 0.939584 0.410217 0.354593 0.57623 0.269192 0.342594 0.562286 0.05116 0.939009 0.185202 0.508811 0.66017 0.122008 0.878303 0.87631 0.476711 0.94243 0.752907 0.594506 0.356808 0.324957 0.306093 0.511715 0.374916 0.156082 0.335467 0.46434 0.846119 0.487256 0.199 [...]
+0.213348 0.283268 0.576662 0.243762 0.848764 0.488359 0.385248 0.210836 0.177077 0.209146 0.425928 0.693047 0.050591 0.782477 0.308313 0.132594 0.550481 0.811118 0.488609 0.913812 0.013395 0.37698 0.214079 0.469961 0.120796 0.052296 0.846823 0.543731 0.853365 0.820574 0.312947 0.847432 0.4331 0.409477 0.838841 0.913547 0.763761 0.043295 0.444823 0.531527 0.283629 0.124403 0.422741 0.190435 0.78955 0.006844 0.619774 0.704423 0.79116 0.122289 0.443345 0.586201 0.60337 0.225652 0.389766 0.4 [...]
+0.913862 0.543326 0.852789 0.183476 0.349478 0.814988 0.810729 0.594286 0.120287 0.702415 0.525013 0.152704 0.549637 0.724129 0.73853 0.157463 0.558408 0.421661 0.156414 0.76333 0.774803 0.856078 0.109592 0.333974 0.042816 0.527465 0.391398 0.695307 0.073329 0.546889 0.448341 0.285844 0.335264 0.31998 0.790432 0.50696 0.322077 0.85329 0.248712 0.686939 0.604107 0.735894 0.479337 0.882454 0.320976 0.4842 0.57925 0.728966 0.266265 0.622982 0.453331 0.826054 0.703657 0.131651 0.580815 0.787 [...]
+0.873068 0.395281 0.648392 0.726237 0.008871 0.568558 0.40505 0.912059 0.361494 0.607698 0.801089 0.647981 0.045436 0.291922 0.419628 0.457218 0.64557 0.109795 0.764891 0.576548 0.731735 0.098144 0.764672 0.924117 0.152153 0.895232 0.048792 0.743823 0.513179 0.255998 0.706312 0.657777 0.254115 0.134123 0.144091 0.279023 0.014627 0.24584 0.567221 0.806799 0.059456 0.470383 0.669199 0.999585 0.492315 0.041305 0.535872 0.892124 0.971984 0.106558 0.983328 0.783238 0.966699 0.294154 0.956308  [...]
+0.508994 0.797403 0.859744 0.905718 0.257224 0.334459 0.234817 0.262731 0.784439 0.966385 0.802857 0.191872 0.927697 0.720202 0.657913 0.46905 0.661482 0.068453 0.821805 0.268869 0.686251 0.060861 0.877699 0.49008 0.624593 0.299526 0.522635 0.355381 0.001527 0.817923 0.582192 0.624393 0.370299 0.262682 0.205553 0.600435 0.788197 0.115282 0.179623 0.752908 0.42223 0.908115 0.433143 0.475289 0.765007 0.301685 0.418909 0.310831 0.720217 0.312242 0.333171 0.618711 0.020695 0.901351 0.127861  [...]
+0.661768 0.497682 0.161356 0.442809 0.750859 0.855693 0.656956 0.891327 0.741078 0.466363 0.10946 0.715018 0.648798 0.155148 0.886638 0.918533 0.749758 0.47201 0.182829 0.331556 0.54318 0.450479 0.667658 0.87711 0.86734 0.610328 0.210502 0.467333 0.69678 0.630432 0.780469 0.847143 0.692923 0.291991 0.406305 0.62128 0.441406 0.548314 0.862695 0.535506 0.549249 0.106682 0.101174 0.439488 0.526327 0.775141 0.902878 0.355561 0.628791 0.729679 0.703262 0.343252 0.307804 0.043534 0.210136 0.28 [...]
+0.820079 0.064574 0.477742 0.231397 0.806924 0.196376 0.25126 0.24027 0.273609 0.251544 0.742092 0.357411 0.028139 0.172889 0.755995 0.463298 0.557171 0.883537 0.637331 0.126302 0.255728 0.394075 0.809789 0.371278 0.81752 0.227807 0.742039 0.994109 0.262984 0.584131 0.697011 0.071944 0.240083 0.74155 0.372571 0.667254 0.720222 0.61988 0.438033 0.848178 0.247342 0.346537 0.44302 0.921585 0.44869 0.040341 0.976199 0.051441 0.15804 0.524995 0.675702 0.833539 0.292431 0.849296 0.431514 0.489 [...]
+0.932234 0.708481 0.761142 0.663798 0.956503 0.608984 0.525111 0.135849 0.180875 0.008924 0.645297 0.417462 0.208851 0.597667 0.308027 0.881226 0.915883 0.01771 0.078965 0.767282 0.863678 0.591643 0.760447 0.890538 0.853544 0.088039 0.252145 0.008414 0.958929 0.067041 0.927803 0.930724 0.341923 0.138919 0.182947 0.101086 0.261632 0.701058 0.495523 0.192516 0.994152 0.973527 0.479366 0.550719 0.739534 0.501251 0.105942 0.342943 0.916696 0.652724 0.02623 0.155614 0.529402 0.917107 0.055484 [...]
+0.014021 0.023819 0.582764 0.284201 0.451017 0.247148 0.08521 0.258171 0.620283 0.584936 0.332139 0.406905 0.13367 0.30398 0.442055 0.58651 0.573245 0.740619 0.318097 0.462364 0.070675 0.244542 0.744253 0.374234 0.15351 0.577808 0.103158 0.036209 0.294901 0.851457 0.820964 0.477005 0.095294 0.04755 0.938076 0.412361 0.515365 0.191263 0.089315 0.884348 0.167192 0.701921 0.614455 0.364227 0.882524 0.110396 0.122898 0.370578 0.525853 0.26273 0.637738 0.393884 0.806352 0.242428 0.294314 0.46 [...]
+0.347401 0.762435 0.678655 0.05292 0.490871 0.990042 0.386624 0.076983 0.847986 0.90165 0.943765 0.883812 0.134348 0.868521 0.434865 0.268993 0.920034 0.243147 0.844285 0.367842 0.368984 0.112729 0.037028 0.762427 0.897825 0.406964 0.692144 0.711465 0.651802 0.604126 0.178095 0.925578 0.384798 0.185367 0.054114 0.29897 0.297838 0.920359 0.575261 0.40109 0.608977 0.288159 0.12396 0.034666 0.126699 0.610897 0.347612 0.315071 0.082522 0.070008 0.908378 0.284144 0.217438 0.425245 0.941328 0. [...]
+0.131304 0.223966 0.321016 0.528878 0.939217 0.604298 0.279784 0.914445 0.540008 0.597883 0.495656 0.826217 0.536262 0.74655 0.624229 0.247979 0.11978 0.507643 0.681531 0.60121 0.936649 0.84369 0.218474 0.56382 0.020678 0.947654 0.815295 0.402075 0.919746 0.426017 0.115407 0.973988 0.150301 0.710628 0.833869 0.022279 0.741263 0.853256 0.768884 0.785273 0.152567 0.970056 0.64934 0.483243 0.750396 0.831526 0.933721 0.460947 0.491805 0.602855 0.252766 0.82918 0.29708 0.327593 0.641591 0.506 [...]
+0.995914 0.103502 0.673686 0.688642 0.148187 0.042161 0.852141 0.714872 0.152474 0.596203 0.102295 0.575199 0.667886 0.656831 0.324177 0.694401 0.970663 0.993485 0.55206 0.883897 0.879058 0.107277 0.284036 0.334044 0.24201 0.349729 0.181258 0.730929 0.07137 0.047139 0.284389 0.740831 0.803606 0.389814 0.636543 0.086614 0.54407 0.741866 0.166482 0.879063 0.380405 0.682928 0.959382 0.733048 0.124509 0.688437 0.770099 0.910336 0.226665 0.645847 0.463965 0.338283 0.584121 0.541147 0.425253 0 [...]
+0.192083 0.875142 0.271307 0.170676 0.518723 0.648653 0.391989 0.588366 0.35276 0.758023 0.416659 0.45048 0.783874 0.902158 0.8027 0.020441 0.791385 0.655741 0.101713 0.436637 0.419871 0.152622 0.778273 0.828902 0.711414 0.157544 0.296944 0.330757 0.76893 0.749641 0.020896 0.207705 0.27265 0.648637 0.706261 0.844426 0.070458 0.092715 0.235573 0.033923 0.155162 0.408895 0.240285 0.491597 0.705076 0.635243 0.45489 0.269214 0.171243 0.990454 0.559261 0.798455 0.325656 0.594219 0.332929 0.55 [...]
+0.833033 0.408617 0.341769 0.713127 0.106505 0.251143 0.07925 0.054127 0.013943 0.927363 0.811935 0.378241 0.141155 0.014791 0.788697 0.594572 0.564225 0.345081 0.039027 0.906271 0.484147 0.261965 0.14752 0.062531 0.105443 0.969997 0.036549 0.956066 0.609677 0.7839 0.642582 0.318379 0.656041 0.24183 0.774374 0.606102 0.983868 0.299665 0.140412 0.963348 0.573973 0.49133 0.459612 0.572277 0.008163 0.451189 0.385997 0.713414 0.65359 0.942482 0.020133 0.160785 0.178498 0.467452 0.495557 0.19 [...]
+0.038449 0.024186 0.912473 0.232628 0.692903 0.138907 0.297216 0.553878 0.37513 0.501753 0.52885 0.492239 0.549298 0.394419 0.591646 0.798276 0.331901 0.631116 0.06943 0.182 0.271592 0.852632 0.979213 0.584579 0.600893 0.58153 0.890712 0.565809 0.532673 0.764865 0.188718 0.570075 0.545885 0.374246 0.615815 0.708782 0.880281 0.232938 0.123339 0.121073 0.269523 0.997719 0.734918 0.108768 0.557154 0.448967 0.085085 0.31175 0.863146 0.691244 0.205049 0.859521 0.943613 0.479419 0.007623 0.880 [...]
+0.185432 0.807366 0.319877 0.857785 0.906905 0.353729 0.413487 0.28359 0.972569 0.358086 0.580335 0.964313 0.06283 0.92972 0.600063 0.775821 0.001144 0.096227 0.293569 0.326028 0.730092 0.363701 0.383351 0.234137 0.229583 0.205293 0.140344 0.061065 0.113716 0.961004 0.245175 0.815578 0.117978 0.019001 0.894944 0.823671 0.864423 0.576082 0.013437 0.397216 0.491276 0.469356 0.8319 0.255639 0.862589 0.63125 0.882776 0.062724 0.275432 0.833477 0.616414 0.366735 0.42787 0.750271 0.134201 0.06 [...]
+0.464195 0.984247 0.551917 0.914893 0.561467 0.837024 0.316995 0.200471 0.966977 0.231044 0.608497 0.005228 0.287851 0.028401 0.852491 0.609895 0.764992 0.562886 0.392955 0.93814 0.259033 0.637185 0.011274 0.24887 0.560366 0.658352 0.505067 0.122335 0.093177 0.025797 0.900517 0.184695 0.927834 0.075707 0.630253 0.858852 0.384328 0.458901 0.093517 0.00205 0.693245 0.465676 0.026508 0.935929 0.953237 0.96571 0.699164 0.574109 0.0115 0.750102 0.238582 0.516647 0.965482 0.153937 0.265323 0.3 [...]
+0.842787 0.66693 0.207934 0.456733 0.37785 0.233242 0.598447 0.066446 0.585066 0.992852 0.576241 0.036981 0.159612 0.055325 0.114522 0.332846 0.614571 0.069682 0.210873 0.613412 0.692374 0.554287 0.543458 0.857532 0.761906 0.175416 0.080555 0.091065 0.244442 0.962397 0.473833 0.690177 0.531867 0.025709 0.672368 0.176285 0.920263 0.597663 0.197977 0.832999 0.093924 0.173868 0.077859 0.402479 0.407651 0.381449 0.475476 0.69425 0.404359 0.251081 0.429753 0.630515 0.112932 0.318242 0.209089  [...]
+0.532565 0.320965 0.376972 0.929006 0.751612 0.441771 0.718787 0.728182 0.18481 0.022993 0.420872 0.839687 0.4991 0.063636 0.844712 0.993021 0.152878 0.521133 0.46593 0.338563 0.22268 0.306922 0.328144 0.894525 0.638427 0.807128 0.047497 0.324591 0.265819 0.339273 0.299221 0.462967 0.172456 0.034436 0.62922 0.31372 0.49748 0.622876 0.847049 0.634444 0.282457 0.584079 0.665919 0.961586 0.809067 0.378467 0.452299 0.452855 0.18037 0.539091 0.87138 0.119265 0.330112 0.281777 0.276448 0.39466 [...]
+0.309831 0.746142 0.931286 0.02285 0.453201 0.840107 0.674848 0.766172 0.386908 0.675043 0.229874 0.785286 0.887954 0.047317 0.591802 0.332789 0.906692 0.806925 0.134363 0.578378 0.683069 0.222301 0.188692 0.760586 0.185078 0.38206 0.228745 0.905953 0.845837 0.60287 0.633906 0.030392 0.418713 0.982728 0.626291 0.080178 0.462884 0.076522 0.841315 0.202301 0.677588 0.904932 0.439761 0.874909 0.777113 0.626505 0.510439 0.524784 0.038005 0.21248 0.581773 0.038156 0.853272 0.811062 0.72324 0. [...]
+0.323001 0.74532 0.569454 0.742503 0.227512 0.550569 0.555212 0.187575 0.290542 0.143731 0.55313 0.873971 0.727719 0.644765 0.951063 0.327265 0.331507 0.446822 0.71268 0.854391 0.334142 0.176621 0.647921 0.787522 0.864662 0.386946 0.210127 0.16953 0.816733 0.533702 0.958464 0.175718 0.728877 0.516255 0.289089 0.520548 0.643486 0.513226 0.29858 0.446168 0.016183 0.6364 0.464824 0.291445 0.276897 0.789861 0.238437 0.554477 0.145339 0.745435 0.467917 0.746583 0.400241 0.276576 0.944088 0.82 [...]
+0.350017 0.498827 0.200792 0.30368 0.617913 0.42551 0.039645 0.707859 0.960246 0.151223 0.798292 0.186164 0.420377 0.708343 0.865258 0.526395 0.846464 0.780009 0.434924 0.670464 0.596909 0.615321 0.788378 0.765764 0.638506 0.279645 0.850003 0.725949 0.535421 0.317631 0.25217 0.359638 0.039677 0.431798 0.319939 0.621201 0.82973 0.214189 0.576086 0.255625 0.402615 0.128718 0.619915 0.80966 0.499919 0.245355 0.362683 0.847151 0.562531 0.0117 0.862421 0.243066 0.80102 0.902984 0.89514 0.8894 [...]
+0.302797 0.167517 0.452903 0.408696 0.654249 0.983908 0.06307 0.303737 0.343672 0.262325 0.571806 0.628447 0.223745 0.732822 0.551083 0.495899 0.106134 0.678924 0.342761 0.242011 0.501941 0.532373 0.01125 0.735025 0.609178 0.838089 0.52358 0.160143 0.815208 0.772024 0.979897 0.315796 0.301974 0.312828 0.09452 0.729191 0.700465 0.889555 0.992853 0.009833 0.275858 0.178408 0.398645 0.795038 0.458365 0.851266 0.343317 0.22052 0.117876 0.509909 0.385837 0.042763 0.083455 0.641788 0.300745 0. [...]
+0.458477 0.02719 0.138854 0.905004 0.890844 0.753416 0.371943 0.517781 0.793358 0.716882 0.973073 0.421571 0.559971 0.971411 0.178977 0.818219 0.249581 0.955861 0.080254 0.115369 0.62078 0.974852 0.199682 0.23781 0.094819 0.468064 0.745667 0.583073 0.03026 0.558007 0.161733 0.724177 0.183399 0.842233 0.954658 0.310969 0.865543 0.071949 0.815858 0.638222 0.16241 0.401772 0.004537 0.844736 0.552313 0.537991 0.921206 0.501531 0.465216 0.74322 0.355999 0.850796 0.037733 0.169155 0.517329 0.7 [...]
+0.840426 0.570441 0.159652 0.766554 0.125099 0.63686 0.733434 0.948977 0.922422 0.333652 0.62163 0.678938 0.668615 0.499931 0.725905 0.026173 0.774074 0.820364 0.769899 0.266463 0.662622 0.648981 0.895337 0.837605 0.845718 0.588908 0.818911 0.125072 0.41512 0.447982 0.797559 0.687831 0.301944 0.25574 0.771295 0.180338 0.621793 0.068846 0.437423 0.403796 0.62264 0.723578 0.517364 0.259544 0.442755 0.831009 0.731408 0.11738 0.590997 0.645359 0.202079 0.566732 0.532598 0.792079 0.559608 0.8 [...]
+0.719703 0.127376 0.529456 0.6477 0.800888 0.178171 0.590449 0.167958 0.687245 0.181632 0.349489 0.461784 0.094936 0.44209 0.52831 0.997162 0.292363 0.688398 0.855992 0.995226 0.294583 0.540854 0.358333 0.360365 0.221342 0.487119 0.765682 0.879656 0.227374 0.508028 0.420635 0.67534 0.222227 0.705741 0.081867 0.411643 0.657657 0.344638 0.724796 0.985387 0.214345 0.194433 0.998612 0.522858 0.591322 0.796833 0.090423 0.106066 0.855925 0.395914 0.368994 0.739476 0.892883 0.945133 0.926394 0. [...]
+0.599437 0.991525 0.611894 0.971517 0.13933 0.646728 0.462765 0.277888 0.867654 0.650862 0.203434 0.541095 0.08841 0.834129 0.128585 0.968461 0.267652 0.772026 0.319619 0.083661 0.544862 0.232383 0.787449 0.915332 0.590963 0.286061 0.539967 0.010705 0.272822 0.781112 0.632007 0.553526 0.450373 0.21455 0.158161 0.732794 0.377514 0.281584 0.777856 0.397407 0.876383 0.571339 0.789191 0.261103 0.885511 0.178311 0.456819 0.045834 0.713198 0.805988 0.436757 0.402445 0.125678 0.894287 0.064011  [...]
+0.501357 0.948879 0.949051 0.901575 0.576955 0.747223 0.808169 0.025517 0.720924 0.664618 0.228669 0.921335 0.830395 0.446121 0.262112 0.829399 0.233666 0.48468 0.277389 0.715719 0.931461 0.205661 0.327995 0.242096 0.367261 0.831961 0.787923 0.14042 0.572059 0.74481 0.786481 0.581214 0.99985 0.015566 0.0922 0.430829 0.505754 0.250237 0.299835 0.270884 0.277492 0.544697 0.185424 0.700851 0.446434 0.257333 0.05274 0.752705 0.473806 0.663297 0.770012 0.548738 0.731818 0.025471 0.160995 0.12 [...]
+0.358584 0.185248 0.380517 0.630504 0.066282 0.100595 0.844878 0.807397 0.341501 0.352544 0.357779 0.330385 0.122065 0.04143 0.607575 0.033898 0.205223 0.960393 0.914978 0.748841 0.423762 0.352009 0.581852 0.878171 0.035179 0.121339 0.877219 0.801445 0.463355 0.03482 0.40439 0.312907 0.388164 0.963237 0.734266 0.560996 0.877935 0.957977 0.472768 0.402141 0.936035 0.574159 0.560983 0.606569 0.065428 0.577175 0.216732 0.512508 0.485042 0.981886 0.530771 0.325858 0.850471 0.088651 0.4777 0. [...]
+0.353377 0.123704 0.20273 0.143902 0.974869 0.830758 0.232769 0.276703 0.488227 0.244357 0.21472 0.315685 0.520903 0.113601 0.843277 0.469301 0.657075 0.053765 0.991946 0.484162 0.85307 0.364021 0.120365 0.363452 0.489646 0.4663 0.961855 0.248001 0.859966 0.991104 0.442622 0.006288 0.306343 0.951332 0.439077 0.285199 0.977797 0.801413 0.523328 0.028166 0.706698 0.129766 0.648401 0.311767 0.618154 0.353711 0.824433 0.260728 0.167037 0.544152 0.45518 0.155489 0.074044 0.70462 0.758523 0.18 [...]
+0.071 0.086003 0.610524 0.910616 0.150346 0.981769 0.005729 0.897731 0.710449 0.044565 0.136137 0.132888 0.851735 0.437757 0.746061 0.351765 0.888483 0.4138 0.218153 0.229903 0.16294 0.394445 0.185802 0.026079 0.636709 0.327809 0.097284 0.209496 0.203208 0.043179 0.692068 0.337254 0.177465 0.502936 0.426389 0.455939 0.777244 0.622925 0.060429 0.804395 0.428007 0.245878 0.480996 0.173781 0.010053 0.820923 0.963576 0.440599 0.10739 0.590961 0.723074 0.7666 0.281973 0.268467 0.698044 0.7564 [...]
+0.359405 0.07133 0.687096 0.211721 0.609339 0.699926 0.638696 0.336688 0.718275 0.555101 0.270429 0.481164 0.735598 0.675534 0.370308 0.06533 0.377945 0.240048 0.291794 0.160711 0.178289 0.768362 0.349633 0.003217 0.43806 0.77928 0.108498 0.539741 0.15256 0.153411 0.77573 0.129996 0.513569 0.719827 0.927674 0.495656 0.644505 0.867875 0.24335 0.34826 0.301676 0.225963 0.708084 0.328189 0.584904 0.12402 0.896325 0.439001 0.360965 0.237961 0.098478 0.474442 0.634278 0.66349 0.32382 0.718146 [...]
+0.376985 0.737948 0.16403 0.509148 0.955399 0.672834 0.432262 0.095298 0.306057 0.634205 0.870677 0.573067 0.341872 0.554922 0.637619 0.505274 0.883915 0.753066 0.568372 0.385097 0.459505 0.102537 0.47393 0.390982 0.904497 0.939505 0.082658 0.998235 0.558913 0.972563 0.223034 0.037264 0.426522 0.152072 0.464305 0.441198 0.780129 0.214284 0.926809 0.8362 0.756617 0.657958 0.450563 0.973245 0.645392 0.184154 0.948373 0.955718 0.105898 0.739001 0.243673 0.807725 0.875319 0.503915 0.892739 0 [...]
+0.337626 0.941427 0.492243 0.792895 0.240566 0.34576 0.308423 0.523002 0.501911 0.467129 0.050481 0.242201 0.912259 0.378754 0.20641 0.801002 0.338162 0.908295 0.194651 0.939513 0.55699 0.62931 0.533649 0.56076 0.069989 0.923685 0.102879 0.001018 0.989373 0.6085 0.186319 0.326838 0.8305 0.600426 0.551719 0.688321 0.223891 0.538173 0.866429 0.263402 0.43094 0.326351 0.969106 0.498234 0.151349 0.324824 0.130525 0.029951 0.445324 0.058833 0.819855 0.786764 0.929608 0.112032 0.699784 0.16014 [...]
+0.779292 0.945196 0.434304 0.417622 0.804408 0.365027 0.938378 0.683584 0.018553 0.600043 0.168215 0.498127 0.462227 0.980168 0.373562 0.337798 0.462025 0.937269 0.151405 0.514882 0.835652 0.405663 0.481448 0.309357 0.258672 0.488036 0.91579 0.779295 0.100615 0.684327 0.488453 0.7773 0.134427 0.993736 0.153964 0.666016 0.862263 0.318264 0.247637 0.006408 0.710836 0.295843 0.323676 0.169567 0.510333 0.272793 0.540174 0.177028 0.555955 0.546049 0.065761 0.539335 0.613326 0.01368 0.005136 0 [...]
+0.640283 0.283434 0.204694 0.586055 0.112891 0.743314 0.731335 0.478747 0.051149 0.340049 0.178354 0.508978 0.349267 0.662874 0.399194 0.242097 0.860094 0.153093 0.438665 0.0081 0.016181 0.376071 0.862693 0.257538 0.720972 0.197098 0.706197 0.191482 0.386067 0.349629 0.538579 0.947496 0.505331 0.913661 0.337576 0.671318 0.453505 0.932545 0.398536 0.513714 0.329726 0.869709 0.7443 0.922947 0.43229 0.324398 0.931607 0.673792 0.688015 0.022866 0.203833 0.226229 0.70953 0.267251 0.896025 0.7 [...]
+0.683736 0.682411 0.445403 0.44641 0.794623 0.600227 0.424827 0.446737 0.3421 0.197815 0.454823 0.055803 0.657337 0.603869 0.864962 0.955897 0.468344 0.383448 0.996923 0.966084 0.39845 0.610876 0.683793 0.081527 0.731845 0.018003 0.570162 0.78053 0.587896 0.542449 0.353928 0.957319 0.815278 0.401963 0.447776 0.297945 0.616322 0.44617 0.880168 0.303641 0.864415 0.474134 0.919804 0.253202 0.317587 0.85839 0.659492 0.071561 0.411402 0.008411 0.896669 0.382764 0.521598 0.051135 0.233253 0.52 [...]
+0.122848 0.0865 0.957309 0.250126 0.04369 0.230795 0.95571 0.374498 0.738642 0.247759 0.220589 0.38959 0.580901 0.723372 0.982674 0.881197 0.790385 0.254666 0.546216 0.038077 0.316373 0.083046 0.039002 0.359715 0.612109 0.880337 0.759223 0.635511 0.887952 0.923702 0.827477 0.194901 0.492624 0.55533 0.086358 0.398905 0.102397 0.877547 0.204317 0.751739 0.893646 0.042483 0.272215 0.673032 0.442449 0.703925 0.53301 0.232664 0.950923 0.200798 0.316296 0.933508 0.1506 0.107864 0.956639 0.1316 [...]
+0.775132 0.867022 0.9443 0.062162 0.133293 0.446435 0.50095 0.936175 0.688885 0.920088 0.615426 0.893038 0.174257 0.22544 0.41919 0.100661 0.189519 0.248818 0.721059 0.518291 0.261934 0.6943 0.594728 0.763981 0.074261 0.466666 0.68416 0.541834 0.803622 0.685304 0.864795 0.020449 0.941997 0.419248 0.126017 0.260718 0.204977 0.41756 0.386995 0.143084 0.960711 0.576112 0.569032 0.3814 0.736882 0.508197 0.398317 0.428686 0.823486 0.35725 0.56601 0.240925 0.747312 0.151793 0.433761 0.384244 0 [...]
+0.176048 0.887874 0.314068 0.628404 0.008166 0.382233 0.311035 0.561692 0.916995 0.981022 0.398856 0.884989 0.717344 0.737143 0.347475 0.094078 0.71673 0.569346 0.048012 0.360886 0.349238 0.671166 0.782 0.07863 0.043894 0.355798 0.33682 0.926151 0.952827 0.468811 0.906848 0.292058 0.394256 0.214682 0.952664 0.720505 0.341147 0.47655 0.06515 0.667202 0.141569 0.885159 0.236844 0.774434 0.219039 0.142126 0.164011 0.692684 0.09352 0.261135 0.867083 0.241609 0.322606 0.228489 0.744655 0.8522 [...]
+0.083314 0.682131 0.721537 0.160487 0.462169 0.395832 0.045354 0.748368 0.12994 0.101336 0.315452 0.868144 0.852316 0.651496 0.457086 0.315617 0.625184 0.087245 0.315416 0.012866 0.532888 0.941512 0.045177 0.647849 0.483239 0.499139 0.488008 0.397989 0.215542 0.690616 0.090118 0.91497 0.132297 0.235667 0.257793 0.752122 0.915057 0.308306 0.013355 0.490975 0.729841 0.865683 0.215578 0.9391 0.176858 0.195988 0.412454 0.411525 0.615374 0.441885 0.021739 0.244053 0.943522 0.078065 0.675088 0 [...]
+0.562555 0.584388 0.051599 0.858804 0.709305 0.716078 0.349632 0.561004 0.360153 0.113141 0.96018 0.576631 0.28592 0.559882 0.516812 0.121692 0.01649 0.30496 0.415863 0.859912 0.92192 0.624398 0.827678 0.22236 0.717034 0.416102 0.024551 0.556713 0.796455 0.353051 0.632832 0.296875 0.781739 0.691583 0.076844 0.768696 0.961235 0.227948 0.027166 0.621244 0.939298 0.466262 0.067741 0.514461 0.181007 0.271013 0.05905 0.745603 0.93483 0.279658 0.747926 0.778447 0.609732 0.937677 0.381488 0.009 [...]
+0.717259 0.28868 0.614088 0.660832 0.631389 0.243559 0.025069 0.622638 0.652488 0.146872 0.853181 0.434583 0.204147 0.68562 0.511703 0.160993 0.652549 0.233356 0.977389 0.937613 0.065915 0.689915 0.742614 0.40094 0.624535 0.73153 0.133365 0.518403 0.030361 0.572738 0.719578 0.973046 0.642226 0.970062 0.099268 0.366268 0.968491 0.355633 0.392949 0.275448 0.270646 0.030431 0.248541 0.121165 0.303159 0.960219 0.964643 0.808863 0.895976 0.663196 0.585071 0.616956 0.020494 0.526343 0.930554 0 [...]
+0.680291 0.121031 0.614587 0.783375 0.613634 0.552141 0.794665 0.120973 0.774098 0.536616 0.881695 0.143524 0.727079 0.261223 0.918232 0.903904 0.895664 0.646967 0.693565 0.181013 0.911943 0.631946 0.135751 0.238409 0.980073 0.437543 0.321283 0.327169 0.015423 0.646282 0.659721 0.732549 0.807406 0.571281 0.806119 0.566653 0.666141 0.380062 0.852495 0.047917 0.748594 0.204245 0.533036 0.921775 0.604315 0.956681 0.935853 0.533321 0.486022 0.108776 0.470469 0.355559 0.303767 0.755186 0.3618 [...]
+0.506499 0.992577 0.623853 0.713503 0.755365 0.165757 0.824795 0.645671 0.602821 0.519274 0.076052 0.062612 0.71469 0.687172 0.377602 0.702628 0.460032 0.927271 0.466496 0.218821 0.723675 0.727047 0.437641 0.68513 0.694459 0.548055 0.819584 0.618306 0.337135 0.407008 0.265809 0.582615 0.480872 0.858746 0.164724 0.180527 0.480318 0.219178 0.760766 0.309829 0.497475 0.500023 0.462978 0.956237 0.197277 0.405072 0.923796 0.630192 0.381151 0.474449 0.962931 0.105611 0.121476 0.331292 0.437655 [...]
+0.466268 0.564245 0.531572 0.834146 0.292188 0.027389 0.218889 0.195207 0.49627 0.563153 0.673757 0.320898 0.934625 0.576162 0.453057 0.134902 0.182819 0.80428 0.894753 0.561806 0.292498 0.624812 0.802984 0.896452 0.961415 0.356903 0.149209 0.986475 0.630469 0.168004 0.75252 0.310101 0.277483 0.598116 0.441609 0.040156 0.640208 0.695955 0.054091 0.269687 0.818054 0.996863 0.151579 0.263378 0.154328 0.992276 0.375068 0.831706 0.255547 0.636543 0.495428 0.078615 0.006631 0.638348 0.642517  [...]
+0.87628 0.549489 0.217395 0.731464 0.975711 0.681017 0.5483 0.708437 0.698371 0.474364 0.992921 0.828985 0.631052 0.112437 0.59422 0.160652 0.831521 0.584555 0.490975 0.71384 0.242112 0.033413 0.583645 0.845354 0.556659 0.967576 0.707103 0.559193 0.390387 0.73115 0.964607 0.445991 0.634294 0.026083 0.538535 0.837847 0.913066 0.884828 0.127326 0.389254 0.169025 0.506738 0.297173 0.756717 0.984862 0.719768 0.344127 0.777543 0.729269 0.089143 0.386195 0.33347 0.895552 0.712972 0.733582 0.41 [...]
+0.325625 0.100962 0.373268 0.936329 0.212026 0.23122 0.99164 0.181287 0.435923 0.034442 0.522132 0.763979 0.442686 0.236335 0.70177 0.100259 0.604839 0.863772 0.23083 0.548769 0.277344 0.160173 0.788096 0.344 0.288144 0.807997 0.147438 0.567964 0.6799 0.941971 0.78442 0.319149 0.674321 0.898056 0.960205 0.515246 0.290415 0.57798 0.572496 0.37735 0.193976 0.694298 0.492285 0.553071 0.043376 0.926475 0.192382 0.479093 0.627738 0.344775 0.444226 0.947214 0.459982 0.898432 0.480665 0.483044  [...]
+0.787433 0.817032 0.802419 0.590407 0.083208 0.074951 0.368307 0.276026 0.317504 0.038587 0.687088 0.015098 0.40135 0.485254 0.81015 0.918076 0.517867 0.866864 0.42654 0.469848 0.149342 0.641362 0.576177 0.471234 0.142259 0.645929 0.371688 0.2768 0.66627 0.095729 0.970125 0.098367 0.290299 0.288951 0.029824 0.29691 0.222351 0.075047 0.942881 0.522349 0.523795 0.171619 0.999265 0.119743 0.208969 0.60354 0.376857 0.5559 0.618216 0.306821 0.996668 0.745746 0.883229 0.091152 0.952878 0.90716 [...]
+0.466445 0.973054 0.879184 0.925927 0.170285 0.400088 0.867806 0.337735 0.002649 0.111626 0.582216 0.760383 0.710681 0.778269 0.668934 0.144683 0.088777 0.990939 0.884206 0.706032 0.646845 0.58024 0.082033 0.698144 0.043936 0.157377 0.838392 0.040373 0.317952 0.638597 0.176692 0.654371 0.345177 0.500063 0.726064 0.393958 0.058596 0.997565 0.10346 0.310755 0.526479 0.47726 0.510964 0.565473 0.810604 0.397802 0.645704 0.828539 0.273034 0.487109 0.588212 0.709013 0.486494 0.961878 0.880376  [...]
+0.435235 0.737557 0.923891 0.218659 0.777231 0.69403 0.657871 0.25392 0.713547 0.186607 0.77235 0.199356 0.272153 0.79364 0.686043 0.430766 0.906381 0.050675 0.459774 0.821699 0.584648 0.43096 0.285973 0.665764 0.252398 0.090711 0.41225 0.703467 0.701839 0.47356 0.961379 0.16204 0.147211 0.41907 0.382569 0.879315 0.507698 0.26078 0.045778 0.139636 0.237561 0.436292 0.683864 0.274969 0.759605 0.899328 0.187353 0.190899 0.688407 0.359206 0.194153 0.24723 0.872196 0.805842 0.862328 0.6429 0 [...]
+0.013145 0.166009 0.616676 0.496305 0.991515 0.928962 0.609091 0.484437 0.443517 0.650182 0.803023 0.916695 0.88497 0.880812 0.524839 0.119854 0.530723 0.726048 0.037359 0.053312 0.264448 0.400167 0.591784 0.557527 0.37259 0.38645 0.209511 0.061095 0.78897 0.229443 0.196437 0.426879 0.060012 0.392262 0.835709 0.668497 0.236931 0.965868 0.244987 0.502394 0.106085 0.586374 0.853021 0.854178 0.930541 0.145182 0.430127 0.259047 0.807205 0.032962 0.60567 0.592766 0.989041 0.333515 0.3817 0.41 [...]
+0.123735 0.061435 0.547813 0.745619 0.026565 0.325354 0.291543 0.574307 0.445914 0.423134 0.309061 0.620019 0.132029 0.530732 0.118537 0.370071 0.155346 0.248512 0.899869 0.279188 0.153565 0.360534 0.330381 0.415082 0.976532 0.591909 0.501469 0.708206 0.120379 0.85171 0.266235 0.611259 0.877042 0.004679 0.043748 0.532091 0.137724 0.673957 0.437714 0.2861 0.846675 0.530583 0.729725 0.141185 0.258253 0.395004 0.69241 0.775456 0.764817 0.038195 0.649229 0.697291 0.950093 0.172246 0.252927 0 [...]
+0.10538 0.786115 0.401453 0.380969 0.152905 0.111128 0.169392 0.089969 0.679972 0.539817 0.069417 0.182782 0.087503 0.073973 0.698485 0.212859 0.627676 0.527075 0.903424 0.989028 0.038575 0.827346 0.972632 0.050536 0.129016 0.332421 0.758826 0.83279 0.782027 0.970652 0.414067 0.345404 0.830668 0.285128 0.211228 0.171873 0.147884 0.116886 0.973644 0.902666 0.471632 0.801881 0.497158 0.377652 0.607423 0.147967 0.337671 0.384273 0.296027 0.319018 0.78324 0.198603 0.876791 0.954974 0.031909  [...]
+0.583426 0.832385 0.550729 0.816545 0.886026 0.715286 0.396888 0.498522 0.810062 0.265627 0.002684 0.722012 0.265724 0.937341 0.534244 0.354598 0.118496 0.551975 0.851709 0.412308 0.502852 0.048482 0.969261 0.666334 0.549615 0.182674 0.699714 0.958572 0.584237 0.714997 0.982632 0.452818 0.626632 0.913575 0.707151 0.489952 0.896072 0.350318 0.255133 0.220835 0.119432 0.793836 0.831447 0.251281 0.991262 0.554194 0.254769 0.541552 0.265017 0.032922 0.543392 0.51259 0.650653 0.566412 0.13991 [...]
+0.304094 0.65703 0.220899 0.95608 0.797651 0.613527 0.824522 0.668097 0.297002 0.994673 0.498391 0.448666 0.740518 0.593104 0.411159 0.35442 0.767781 0.48859 0.085685 0.364508 0.9836 0.135273 0.839953 0.148998 0.272999 0.2899 0.021718 0.421006 0.192124 0.246184 0.379154 0.549308 0.714389 0.017439 0.107661 0.09077 0.61908 0.750338 0.016697 0.905409 0.377421 0.923931 0.208478 0.185238 0.940274 0.291372 0.600511 0.325465 0.669928 0.71927 0.225381 0.177114 0.609617 0.495953 0.396466 0.974828 [...]
+0.283723 0.096447 0.500016 0.143427 0.098091 0.499304 0.091605 0.62191 0.854224 0.746504 0.035373 0.020136 0.864003 0.141605 0.84055 0.096413 0.807005 0.904587 0.745964 0.465633 0.085081 0.790692 0.094657 0.499025 0.798018 0.903662 0.789509 0.329116 0.627435 0.84305 0.317123 0.372212 0.84141 0.674174 0.833379 0.936042 0.121785 0.154033 0.737744 0.182197 0.159053 0.194357 0.828078 0.673409 0.303272 0.292506 0.504631 0.031207 0.29669 0.736689 0.050169 0.083603 0.626477 0.292201 0.02176 0.3 [...]
+0.545385 0.417385 0.235758 0.195829 0.96534 0.21106 0.801118 0.296809 0.378672 0.078253 0.89471 0.38582 0.227479 0.183316 0.514061 0.306873 0.843198 0.74701 0.429263 0.609609 0.447394 0.450692 0.222728 0.092641 0.102499 0.013924 0.267159 0.498146 0.565474 0.503998 0.69075 0.682578 0.619824 0.36704 0.039547 0.893531 0.377195 0.170491 0.934329 0.387376 0.800629 0.357721 0.976862 0.499563 0.181311 0.520638 0.674309 0.469264 0.260759 0.822695 0.287893 0.635602 0.955116 0.67906 0.728941 0.797 [...]
+0.375944 0.249629 0.043802 0.650455 0.231077 0.096351 0.251297 0.67111 0.528386 0.541616 0.609907 0.683817 0.499505 0.226791 0.584836 0.167613 0.527442 0.510371 0.264241 0.132741 0.207391 0.480402 0.266668 0.937383 0.703123 0.61622 0.990737 0.890054 0.444406 0.892263 0.267369 0.621609 0.172525 0.202702 0.616987 0.35254 0.549177 0.246408 0.673209 0.364073 0.929685 0.402994 0.046502 0.057167 0.299499 0.964183 0.231223 0.65829 0.69908 0.17852 0.770191 0.92932 0.119125 0.016524 0.106597 0.56 [...]
+0.722622 0.591764 0.647907 0.180804 0.984284 0.766404 0.750368 0.726874 0.16592 0.131196 0.573336 0.570394 0.471788 0.448973 0.974196 0.516312 0.771413 0.035091 0.186687 0.033943 0.695026 0.733911 0.709865 0.562572 0.14965 0.260686 0.548318 0.046257 0.686141 0.502669 0.109611 0.985204 0.529411 0.799698 0.668238 0.788356 0.794487 0.852348 0.997499 0.097941 0.804016 0.821072 0.654953 0.563064 0.711393 0.38311 0.030977 0.725071 0.445434 0.314045 0.314882 0.842246 0.106983 0.980325 0.851489  [...]
+0.606394 0.651072 0.803568 0.826945 0.771795 0.037213 0.23968 0.176499 0.436204 0.816914 0.913209 0.870751 0.542484 0.297554 0.039035 0.933208 0.153476 0.221104 0.947643 0.878424 0.037723 0.292516 0.826146 0.856458 0.59969 0.729713 0.390727 0.438439 0.81273 0.628277 0.383658 0.68009 0.90416 0.630807 0.374871 0.000279 0.431759 0.657665 0.262159 0.099852 0.292404 0.278749 0.233133 0.635297 0.580658 0.283162 0.039922 0.134111 0.288075 0.789671 0.670263 0.18029 0.296131 0.652 0.749885 0.0673 [...]
+0.727648 0.008156 0.350961 0.514064 0.423719 0.986932 0.82695 0.79604 0.97725 0.655876 0.567991 0.991928 0.570464 0.709889 0.738573 0.972906 0.945438 0.996609 0.314567 0.356366 0.122144 0.462173 0.896028 0.97896 0.146146 0.696647 0.940769 0.252248 0.458344 0.321223 0.25328 0.346225 0.004357 0.615297 0.805873 0.077475 0.487714 0.004267 0.96398 0.893309 0.946972 0.552415 0.940299 0.850217 0.4615 0.20636 0.838783 0.632311 0.737899 0.804819 0.940437 0.970444 0.710397 0.526297 0.943673 0.4855 [...]
+0.003491 0.725867 0.572525 0.402741 0.392088 0.562773 0.86756 0.858856 0.134234 0.202409 0.807964 0.646191 0.441237 0.808515 0.952941 0.641448 0.438891 0.214789 0.260022 0.817592 0.347227 0.298236 0.019038 0.251202 0.828819 0.919793 0.328569 0.685571 0.332938 0.236748 0.74769 0.843947 0.496729 0.760582 0.295114 0.111543 0.396932 0.778616 0.317455 0.767843 0.050495 0.06317 0.470259 0.79103 0.093632 0.987188 0.30806 0.220004 0.165218 0.090641 0.431241 0.850759 0.882197 0.556658 0.137218 0. [...]
+0.817747 0.366676 0.417222 0.62757 0.630859 0.114466 0.846168 0.30007 0.009821 0.337734 0.013809 0.452349 0.712095 0.861544 0.571174 0.239808 0.651181 0.244127 0.361474 0.365326 0.751941 0.519625 0.933128 0.157137 0.42146 0.33411 0.977775 0.468467 0.093796 0.267674 0.656034 0.108853 0.456769 0.254719 0.810786 0.813231 0.181064 0.435083 0.924039 0.888029 0.662284 0.459194 0.829944 0.114289 0.2968 0.264191 0.146993 0.807953 0.254871 0.307283 0.65052 0.390639 0.819661 0.486767 0.427045 0.73 [...]
+0.546832 0.562005 0.326024 0.741686 0.771706 0.389764 0.361369 0.012445 0.699798 0.602424 0.885964 0.684192 0.848957 0.617276 0.345961 0.921737 0.603169 0.963083 0.074525 0.811209 0.261393 0.586272 0.942663 0.624341 0.733421 0.812531 0.44824 0.729134 0.369623 0.776847 0.672884 0.982412 0.71322 0.975855 0.760357 0.202182 0.1892 0.201767 0.200099 0.264249 0.716176 0.013591 0.180041 0.689282 0.298678 0.32118 0.399106 0.562607 0.819297 0.238775 0.837138 0.159208 0.557779 0.124196 0.723649 0. [...]
+0.204571 0.254222 0.915342 0.556492 0.222235 0.366846 0.485549 0.797863 0.884885 0.07254 0.502949 0.893276 0.930369 0.093164 0.790985 0.314602 0.031372 0.225728 0.052651 0.489212 0.609451 0.798693 0.870093 0.961675 0.809871 0.261198 0.080368 0.037468 0.72305 0.463457 0.050211 0.178401 0.567235 0.211813 0.820085 0.086661 0.73404 0.243283 0.93564 0.799732 0.681069 0.514246 0.79999 0.298139 0.609131 0.156359 0.249681 0.478597 0.475816 0.535239 0.475236 0.784902 0.077922 0.14366 0.690411 0.5 [...]
+0.796712 0.772981 0.039564 0.399412 0.975846 0.350061 0.670575 0.784678 0.943771 0.049434 0.928942 0.074716 0.191216 0.325775 0.078606 0.54021 0.62521 0.48828 0.112015 0.312462 0.417745 0.423523 0.378148 0.33277 0.927474 0.011823 0.39929 0.534143 0.555963 0.278446 0.164573 0.225763 0.959288 0.381493 0.433779 0.555978 0.688109 0.756222 0.42715 0.061324 0.667846 0.902004 0.58203 0.996246 0.966573 0.777435 0.145828 0.845992 0.978784 0.29983 0.368161 0.880049 0.781103 0.438476 0.806441 0.631 [...]
+0.506467 0.34805 0.266277 0.390952 0.357315 0.674512 0.115293 0.226549 0.97484 0.000488 0.998965 0.670013 0.420398 0.062592 0.424656 0.805766 0.39936 0.995174 0.435632 0.991422 0.018252 0.72086 0.521582 0.316133 0.102964 0.091707 0.120789 0.206323 0.148907 0.133418 0.993244 0.551108 0.207049 0.88021 0.403126 0.976475 0.959743 0.547264 0.968886 0.872016 0.522589 0.36444 0.507286 0.849156 0.145017 0.984308 0.763871 0.519463 0.17532 0.450489 0.492279 0.864733 0.78401 0.766557 0.783651 0.957 [...]
+0.082437 0.757138 0.179891 0.450177 0.354328 0.073191 0.682744 0.546091 0.816553 0.579399 0.255361 0.86938 0.173363 0.103842 0.762539 0.818895 0.225891 0.091525 0.736691 0.173814 0.912959 0.695492 0.38707 0.506763 0.324087 0.241006 0.835104 0.500481 0.077348 0.520856 0.755279 0.294881 0.080051 0.399729 0.865516 0.892558 0.103299 0.104412 0.134182 0.279263 0.014952 0.66868 0.307146 0.288692 0.010563 0.338533 0.561754 0.645759 0.717266 0.298629 0.899834 0.894107 0.437334 0.028427 0.547623  [...]
+0.149058 0.056915 0.782564 0.589234 0.759216 0.67257 0.605258 0.88679 0.14776 0.175675 0.742 0.162762 0.738342 0.75504 0.171946 0.381679 0.092799 0.773969 0.949944 0.434811 0.250784 0.060058 0.901595 0.811432 0.615869 0.281939 0.153024 0.387259 0.298893 0.657776 0.772851 0.856822 0.841027 0.832524 0.568886 0.699888 0.933187 0.298389 0.713544 0.642254 0.383586 0.104112 0.662833 0.369363 0.305836 0.425904 0.567146 0.981387 0.440796 0.306555 0.764813 0.58008 0.604008 0.541602 0.871981 0.108 [...]
+0.566113 0.666365 0.62261 0.416962 0.107327 0.806638 0.152379 0.437889 0.286995 0.12541 0.847258 0.300292 0.660532 0.378344 0.058612 0.991553 0.614098 0.637406 0.320755 0.55272 0.839004 0.335488 0.369199 0.415417 0.864125 0.916597 0.177804 0.165328 0.399764 0.571154 0.146204 0.705986 0.15705 0.886763 0.529212 0.820771 0.571836 0.848655 0.784355 0.533942 0.909457 0.983274 0.934954 0.882385 0.029015 0.677465 0.014166 0.883918 0.118186 0.085387 0.932125 0.909026 0.206572 0.745505 0.387324 0 [...]
+0.282801 0.741681 0.992865 0.565004 0.347049 0.994639 0.083234 0.672207 0.718515 0.862141 0.653677 0.643061 0.561031 0.325881 0.27009 0.511987 0.301958 0.346352 0.608402 0.269249 0.671504 0.958714 0.75179 0.751538 0.419248 0.038854 0.128399 0.392107 0.960321 0.822938 0.70314 0.261859 0.992859 0.150575 0.188526 0.183633 0.302963 0.097207 0.105078 0.892442 0.17038 0.993933 0.265116 0.318025 0.07757 0.087441 0.402178 0.811763 0.15536 0.929418 0.873941 0.097287 0.645004 0.657167 0.141665 0.3 [...]
+0.33539 0.499774 0.410815 0.896748 0.451749 0.797318 0.260782 0.711757 0.934724 0.89029 0.17964 0.672625 0.259568 0.973018 0.582792 0.428846 0.645004 0.371583 0.032623 0.225128 0.132479 0.824952 0.052774 0.135657 0.286068 0.431894 0.449858 0.981931 0.576587 0.894431 0.048093 0.936021 0.371032 0.945508 0.023695 0.375017 0.786729 0.993565 0.124269 0.755517 0.814116 0.285028 0.363652 0.698006 0.556501 0.312477 0.42569 0.418591 0.739792 0.939518 0.596154 0.295492 0.950267 0.944271 0.368846 0 [...]
+0.155036 0.396865 0.695955 0.82774 0.288557 0.708186 0.213075 0.578116 0.38726 0.826502 0.931153 0.532345 0.217721 0.421099 0.271882 0.543082 0.674675 0.479874 0.999603 0.363663 0.202204 0.560673 0.73712 0.794887 0.963529 0.032555 0.559059 0.322522 0.324926 0.001015 0.679754 0.567242 0.095308 0.015619 0.284693 0.628467 0.840147 0.656043 0.355673 0.075364 0.198077 0.373006 0.44792 0.107717 0.322439 0.408288 0.704817 0.71244 0.436154 0.329776 0.080529 0.566451 0.069087 0.070791 0.63046 0.6 [...]
+0.195754 0.96186 0.704216 0.270671 0.878916 0.052717 0.229144 0.116449 0.11958 0.780682 0.701024 0.225091 0.525424 0.430988 0.577511 0.927251 0.237917 0.554952 0.726081 0.089966 0.410356 0.059889 0.848395 0.199291 0.520686 0.62222 0.103278 0.150678 0.104575 0.607296 0.279257 0.397471 0.630802 0.211036 0.981358 0.267089 0.687771 0.989455 0.796418 0.570599 0.952013 0.574431 0.841567 0.709732 0.373459 0.74514 0.914394 0.392475 0.673667 0.089527 0.730801 0.767045 0.829534 0.332227 0.959814 0 [...]
+0.364917 0.979053 0.681408 0.25392 0.025921 0.43507 0.39117 0.553696 0.406052 0.805504 0.240506 0.185896 0.493979 0.030095 0.893126 0.418031 0.307285 0.179882 0.290654 0.259893 0.632744 0.680992 0.108556 0.273862 0.878226 0.276547 0.908552 0.118326 0.821332 0.785819 0.318234 0.673728 0.129255 0.473144 0.616615 0.666066 0.017675 0.599317 0.490602 0.701762 0.406214 0.944221 0.070822 0.590935 0.082871 0.330425 0.249493 0.073742 0.705689 0.001985 0.0374 0.786993 0.209894 0.0847 0.339644 0.29 [...]
+0.626398 0.177272 0.652977 0.579059 0.045526 0.388117 0.299625 0.332873 0.690653 0.681239 0.022464 0.437772 0.172232 0.933238 0.225683 0.510549 0.392684 0.633437 0.283895 0.743715 0.961279 0.134782 0.596851 0.133752 0.848658 0.986884 0.666983 0.91206 0.644598 0.359347 0.700791 0.413758 0.318911 0.966768 0.853577 0.386053 0.368565 0.470127 0.960509 0.464982 0.57954 0.454601 0.77913 0.661087 0.602264 0.496961 0.937874 0.929974 0.554367 0.781967 0.873 0.66121 0.195549 0.886603 0.347907 0.01 [...]
+0.261766 0.105987 0.388277 0.228281 0.399269 0.226065 0.095908 0.603932 0.089831 0.468326 0.73807 0.728242 0.779855 0.102035 0.455108 0.916994 0.341711 0.341188 0.633663 0.710929 0.629798 0.91021 0.961539 0.046501 0.567342 0.043479 0.038266 0.181023 0.580604 0.867793 0.881143 0.764517 0.176912 0.263559 0.87175 0.893285 0.035962 0.437703 0.539115 0.123296 0.669075 0.168145 0.120742 0.121354 0.730238 0.277728 0.657815 0.741703 0.369066 0.177004 0.255781 0.829582 0.097976 0.645338 0.118151  [...]
+0.780793 0.141017 0.932999 0.360701 0.577973 0.731201 0.435147 0.775046 0.813796 0.479133 0.49304 0.594717 0.50188 0.861732 0.697172 0.901122 0.225034 0.03425 0.663099 0.582688 0.941169 0.095298 0.448322 0.876597 0.428146 0.14625 0.822717 0.626409 0.723502 0.272281 0.442275 0.075205 0.722639 0.50267 0.205014 0.164092 0.875067 0.668683 0.385743 0.160364 0.228743 0.114849 0.4664 0.473554 0.652499 0.412846 0.817158 0.216585 0.864891 0.668923 0.478855 0.347135 0.831825 0.105298 0.508908 0.64 [...]
+0.966562 0.623096 0.280881 0.819717 0.763751 0.761851 0.259589 0.891072 0.657524 0.634769 0.482228 0.405304 0.51374 0.155318 0.998059 0.691627 0.553345 0.302451 0.064511 0.116704 0.281507 0.114683 0.932441 0.66168 0.312695 0.480181 0.655757 0.757427 0.042844 0.18084 0.020356 0.509967 0.555758 0.605957 0.628505 0.228438 0.329751 0.827973 0.768379 0.850594 0.699903 0.418017 0.396348 0.747197 0.176219 0.602455 0.941758 0.201059 0.079269 0.671465 0.551842 0.402511 0.005143 0.282224 0.252691  [...]
+0.188661 0.439479 0.267974 0.557212 0.691665 0.999151 0.883232 0.720773 0.167577 0.652291 0.458013 0.522446 0.894372 0.652215 0.491563 0.131095 0.357966 0.009189 0.425356 0.731809 0.434286 0.580576 0.61246 0.892174 0.67 0.915263 0.513201 0.025421 0.225616 0.769024 0.418177 0.443546 0.728411 0.528079 0.411745 0.308243 0.454004 0.801078 0.851549 0.162473 0.889304 0.240024 0.054677 0.856221 0.943032 0.874742 0.908877 0.055154 0.356718 0.503164 0.210813 0.408073 0.619089 0.86337 0.770791 0.0 [...]
+0.20419 0.666504 0.155096 0.620923 0.553463 0.843434 0.779452 0.570724 0.763916 0.416997 0.656803 0.312855 0.282762 0.872301 0.38346 0.129571 0.985232 0.921132 0.780507 0.299342 0.225652 0.141166 0.876313 0.703315 0.974889 0.145132 0.892399 0.18873 0.59985 0.966733 0.822592 0.294656 0.570187 0.011815 0.06412 0.040609 0.863328 0.679878 0.4325 0.963316 0.608769 0.696359 0.852249 0.534893 0.261986 0.337906 0.839264 0.724185 0.28019 0.04297 0.022558 0.469052 0.419531 0.140921 0.392555 0.9927 [...]
+0.428856 0.632457 0.933495 0.025853 0.441204 0.409525 0.88299 0.392493 0.090327 0.721621 0.407178 0.537502 0.09147 0.641026 0.407833 0.456347 0.134414 0.964459 0.985751 0.962912 0.268448 0.666955 0.564314 0.845481 0.094169 0.937797 0.745774 0.014928 0.481594 0.154732 0.196086 0.305384 0.172038 0.245568 0.747544 0.566952 0.951414 0.067215 0.955988 0.591601 0.721392 0.178292 0.892254 0.53619 0.693275 0.218735 0.789633 0.089776 0.90943 0.478522 0.557965 0.506304 0.871757 0.740485 0.496481 0 [...]
+0.693835 0.132782 0.42763 0.43394 0.776838 0.483566 0.147022 0.093661 0.411563 0.141933 0.903782 0.701835 0.150177 0.972864 0.535866 0.367603 0.005431 0.991024 0.915426 0.365183 0.812188 0.315066 0.347484 0.387103 0.591855 0.599778 0.521411 0.619063 0.564069 0.015617 0.78517 0.221619 0.700634 0.014333 0.827251 0.812255 0.55396 0.210565 0.758378 0.371268 0.398396 0.319594 0.012917 0.461096 0.419101 0.545554 0.21154 0.534502 0.637256 0.052848 0.156199 0.57973 0.96211 0.753198 0.102292 0.25 [...]
+0.697654 0.010187 0.083254 0.393465 0.732433 0.647132 0.076059 0.482419 0.196336 0.943261 0.488546 0.948576 0.932268 0.114893 0.440611 0.663766 0.437781 0.490434 0.952016 0.865236 0.126252 0.500011 0.295564 0.930471 0.09509 0.75659 0.878052 0.034297 0.983223 0.710947 0.296867 0.304904 0.6267 0.650475 0.21688 0.224555 0.294804 0.300443 0.129248 0.50095 0.996268 0.471511 0.684618 0.624524 0.842484 0.906303 0.938961 0.685554 0.203293 0.196009 0.211151 0.848483 0.414815 0.366635 0.408758 0.4 [...]
+0.958939 0.618629 0.507126 0.457234 0.897371 0.111977 0.886977 0.650466 0.87571 0.558379 0.15879 0.655103 0.459729 0.632664 0.191788 0.28247 0.224931 0.128083 0.406982 0.929204 0.663293 0.625595 0.501299 0.490716 0.602471 0.739342 0.29343 0.445374 0.053182 0.529597 0.801445 0.413081 0.165429 0.745226 0.291463 0.959321 0.649287 0.062815 0.190135 0.832582 0.089436 0.506841 0.379051 0.471196 0.808656 0.591878 0.708772 0.060486 0.907114 0.279767 0.140662 0.141481 0.885389 0.838456 0.323951 0 [...]
+0.743142 0.133265 0.802563 0.330302 0.700526 0.642202 0.13639 0.557426 0.522019 0.240794 0.318733 0.762313 0.86541 0.955836 0.930812 0.712795 0.435477 0.098148 0.618649 0.181644 0.259259 0.483277 0.536015 0.035911 0.932905 0.747507 0.7019 0.017634 0.515338 0.815298 0.296976 0.560184 0.971944 0.447866 0.245459 0.008688 0.727457 0.158519 0.25785 0.867885 0.354306 0.435924 0.413287 0.039704 0.395025 0.619677 0.719809 0.921677 0.364492 0.519795 0.717588 0.199726 0.301413 0.784573 0.710147 0. [...]
+0.134019 0.585976 0.413668 0.395764 0.631451 0.523222 0.695331 0.022938 0.817807 0.53375 0.769751 0.794381 0.076214 0.040232 0.410575 0.954872 0.066009 0.446956 0.28745 0.885965 0.784311 0.210335 0.272411 0.398343 0.707798 0.730687 0.520267 0.905051 0.549432 0.465535 0.896227 0.628343 0.594999 0.163294 0.68797 0.636334 0.602159 0.786345 0.781792 0.694184 0.216655 0.289811 0.335664 0.937644 0.850904 0.621267 0.045519 0.281779 0.055078 0.605903 0.541688 0.318705 0.308959 0.964023 0.205025  [...]
+0.668847 0.861019 0.922264 0.855756 0.272186 0.609355 0.654924 0.323252 0.133214 0.254492 0.120642 0.763066 0.141312 0.290942 0.081249 0.502478 0.923121 0.420039 0.489831 0.893421 0.29263 0.224986 0.760646 0.233561 0.258636 0.552203 0.757557 0.89233 0.313169 0.919429 0.976307 0.389026 0.564709 0.889652 0.384277 0.811915 0.790089 0.27202 0.903166 0.247147 0.887211 0.777123 0.246187 0.905915 0.991912 0.972941 0.477644 0.263116 0.232691 0.225252 0.304853 0.062714 0.227531 0.443798 0.078783  [...]
+0.844401 0.151248 0.899854 0.677698 0.077225 0.397302 0.278253 0.604733 0.466174 0.85805 0.678106 0.161714 0.638612 0.152762 0.461089 0.804455 0.541278 0.774023 0.373873 0.041041 0.984877 0.659667 0.376228 0.510358 0.635733 0.836111 0.566737 0.797861 0.922037 0.653278 0.799296 0.904485 0.821363 0.032237 0.974955 0.102833 0.608605 0.390898 0.721848 0.69891 0.905437 0.668603 0.94188 0.388747 0.762216 0.16113 0.065982 0.666166 0.792946 0.762011 0.153733 0.4616 0.75407 0.875948 0.995307 0.41 [...]
+0.414583 0.746497 0.031135 0.628819 0.8571 0.445719 0.568666 0.847993 0.398955 0.044237 0.481291 0.561314 0.32596 0.631988 0.26782 0.236135 0.406686 0.814812 0.36304 0.209323 0.145149 0.751884 0.252758 0.954616 0.594061 0.464705 0.869618 0.214014 0.078778 0.635739 0.385812 0.532765 0.456543 0.825977 0.063716 0.084053 0.946893 0.731113 0.380367 0.753251 0.414278 0.961588 0.746486 0.330235 0.067932 0.136601 0.891712 0.966393 0.644515 0.747485 0.487836 0.3674 0.666694 0.847444 0.972772 0.43 [...]
+0.178745 0.979126 0.635304 0.439913 0.319498 0.010083 0.710442 0.754453 0.781248 0.945863 0.2876 0.477777 0.451837 0.611185 0.825438 0.458162 0.976435 0.577201 0.95089 0.282394 0.996112 0.541874 0.121826 0.37097 0.943209 0.562042 0.776037 0.65102 0.172433 0.315021 0.440026 0.342552 0.987765 0.525032 0.576345 0.801498 0.904826 0.12275 0.683207 0.709386 0.451713 0.359746 0.323012 0.779198 0.026547 0.183727 0.252246 0.554122 0.696495 0.17139 0.111357 0.041736 0.652503 0.16186 0.296381 0.964 [...]
+0.139634 0.913743 0.520167 0.895682 0.217605 0.282336 0.336593 0.252508 0.991878 0.829179 0.465149 0.653389 0.242773 0.167742 0.04785 0.838135 0.487653 0.341514 0.020793 0.541939 0.967072 0.50517 0.109748 0.114067 0.74526 0.187056 0.016984 0.947327 0.501207 0.7861 0.649213 0.884121 0.739702 0.033508 0.670157 0.775934 0.343018 0.258543 0.574996 0.736931 0.556284 0.012019 0.018983 0.115245 0.501989 0.479537 0.661795 0.300304 0.186435 0.645899 0.763965 0.669666 0.388758 0.881811 0.081343 0. [...]
+0.587234 0.608498 0.543545 0.914651 0.093064 0.426926 0.578705 0.168085 0.111346 0.744051 0.337073 0.456602 0.491741 0.827036 0.540776 0.874075 0.007161 0.607545 0.03016 0.97073 0.452803 0.795566 0.786365 0.093077 0.980941 0.042294 0.915259 0.805976 0.545294 0.765517 0.971491 0.696746 0.833064 0.090324 0.30907 0.449087 0.209118 0.111472 0.399013 0.831855 0.269827 0.242535 0.429253 0.823646 0.279602 0.804445 0.311036 0.049541 0.379402 0.663163 0.608121 0.67356 0.149076 0.17722 0.56314 0.7 [...]
+0.055498 0.770995 0.177383 0.635093 0.480358 0.269341 0.72786 0.1392 0.55402 0.505568 0.251115 0.743206 0.955362 0.602601 0.085179 0.953398 0.928619 0.667488 0.380064 0.804589 0.620895 0.134486 0.641994 0.892203 0.27528 0.825703 0.647402 0.572134 0.103832 0.782111 0.000643 0.581215 0.67296 0.35198 0.147653 0.420977 0.925884 0.084069 0.680787 0.787908 0.030818 0.128437 0.262567 0.380776 0.041261 0.795151 0.242474 0.166308 0.185446 0.399381 0.628169 0.697587 0.696045 0.926689 0.649105 0.21 [...]
+0.185153 0.147993 0.839089 0.581543 0.025785 0.718839 0.242855 0.229168 0.889355 0.382545 0.122556 0.458006 0.705507 0.646284 0.991625 0.844782 0.315184 0.418887 0.643787 0.044029 0.895867 0.546559 0.170959 0.365594 0.616538 0.133895 0.987179 0.967044 0.707985 0.408219 0.543035 0.061718 0.595775 0.686953 0.524016 0.062274 0.485868 0.462929 0.150593 0.773895 0.500015 0.352565 0.950674 0.649032 0.010958 0.637414 0.337401 0.763756 0.700891 0.139209 0.112474 0.581706 0.131757 0.200756 0.4082 [...]
+0.097408 0.21796 0.236498 0.906142 0.53404 0.481655 0.170712 0.457808 0.690631 0.597962 0.048904 0.95356 0.304478 0.907617 0.024676 0.531756 0.342988 0.66795 0.608754 0.024335 0.576114 0.775453 0.024941 0.363179 0.803473 0.159511 0.337827 0.939748 0.651114 0.634355 0.069966 0.785149 0.261193 0.289818 0.87544 0.978921 0.914679 0.406916 0.990221 0.440834 0.862556 0.121076 0.761236 0.210137 0.546459 0.623534 0.452367 0.143281 0.749041 0.30589 0.370783 0.913928 0.73903 0.54197 0.114584 0.730 [...]
+0.737732 0.723681 0.271024 0.886087 0.791992 0.621023 0.238614 0.611145 0.420589 0.849924 0.728046 0.079148 0.38289 0.166324 0.613358 0.82 0.781728 0.571029 0.868418 0.900985 0.506147 0.112444 0.553513 0.970304 0.468654 0.765776 0.802743 0.946613 0.122144 0.936484 0.265043 0.075054 0.641835 0.058775 0.551797 0.167767 0.131593 0.695469 0.439054 0.046146 0.485911 0.089841 0.99873 0.095433 0.001492 0.571091 0.735019 0.048939 0.100144 0.361093 0.169224 0.33753 0.370921 0.905857 0.225494 0.75 [...]
+0.082255 0.829205 0.774553 0.694507 0.691109 0.798236 0.316989 0.88445 0.655484 0.537217 0.669441 0.943387 0.906858 0.929939 0.535756 0.313599 0.178019 0.712923 0.324579 0.059476 0.522443 0.752367 0.239451 0.971659 0.983174 0.619411 0.369581 0.521203 0.423459 0.093787 0.908947 0.875674 0.886434 0.101786 0.101335 0.184731 0.213648 0.701032 0.862462 0.912045 0.8904 0.281355 0.861979 0.254476 0.567631 0.026286 0.609726 0.001399 0.629978 0.194648 0.734254 0.017484 0.890541 0.136575 0.715546  [...]
+0.60437 0.69521 0.830192 0.996393 0.90679 0.86024 0.122957 0.744756 0.256965 0.483461 0.074805 0.183563 0.375027 0.518568 0.210156 0.650144 0.062626 0.552153 0.463291 0.249877 0.811507 0.477188 0.16203 0.221697 0.547196 0.518546 0.197393 0.033451 0.286109 0.227684 0.145165 0.418323 0.528383 0.331185 0.438151 0.113158 0.028056 0.817152 0.54839 0.487385 0.319621 0.909364 0.685005 0.165462 0.182821 0.573359 0.324179 0.083982 0.69295 0.865126 0.718445 0.583342 0.430538 0.181934 0.395494 0.18 [...]
+0.439409 0.702986 0.726978 0.866543 0.913219 0.742388 0.05342 0.930753 0.522149 0.596031 0.076042 0.028877 0.474392 0.651412 0.025084 0.292524 0.707419 0.631328 0.158046 0.363651 0.219804 0.092316 0.218873 0.624577 0.789673 0.972776 0.511689 0.177847 0.242853 0.973396 0.424212 0.270313 0.376511 0.193646 0.150181 0.427706 0.763427 0.703188 0.021469 0.911207 0.889964 0.145886 0.871286 0.251188 0.26999 0.476925 0.191546 0.13642 0.976819 0.931816 0.141802 0.551126 0.121327 0.6763 0.215315 0. [...]
+0.094346 0.884296 0.815134 0.803306 0.067073 0.069077 0.457204 0.357072 0.41793 0.5815 0.441773 0.850437 0.308201 0.675646 0.99767 0.893013 0.132996 0.799046 0.874502 0.362996 0.670111 0.704831 0.089094 0.204091 0.111863 0.928658 0.638677 0.398189 0.904668 0.419018 0.515452 0.617899 0.937206 0.225201 0.253403 0.464146 0.325803 0.220439 0.206088 0.230696 0.299368 0.903269 0.1699 0.173719 0.24071 0.811277 0.562345 0.082828 0.567519 0.010809 0.044682 0.736362 0.408173 0.2814 0.802357 0.0758 [...]
+0.477111 0.836276 0.305754 0.548076 0.868835 0.916988 0.342933 0.841995 0.72121 0.737641 0.36259 0.746533 0.141703 0.080889 0.442551 0.242992 0.301998 0.182855 0.495736 0.354001 0.687057 0.202415 0.052777 0.205089 0.319308 0.924627 0.679386 0.587206 0.306661 0.849854 0.324808 0.063313 0.364211 0.870585 0.02676 0.001974 0.31699 0.735198 0.413646 0.384948 0.111499 0.258615 0.361025 0.45854 0.044332 0.555842 0.406111 0.778251 0.592308 0.561316 0.917186 0.658949 0.611696 0.520774 0.837123 0. [...]
+0.871581 0.868485 0.763417 0.010563 0.919096 0.49964 0.144994 0.27652 0.515533 0.133735 0.261284 0.143771 0.893692 0.047143 0.914321 0.949498 0.6674 0.614073 0.604925 0.111972 0.910207 0.863759 0.852913 0.53337 0.376433 0.822404 0.385084 0.407987 0.908449 0.438806 0.914206 0.122571 0.812605 0.99459 0.178602 0.7862 0.943518 0.294056 0.427727 0.834199 0.429293 0.993058 0.342713 0.984466 0.100423 0.785069 0.03192 0.358562 0.282102 0.560262 0.322695 0.282215 0.847655 0.454863 0.920496 0.1235 [...]
+0.8373 0.636444 0.236655 0.394544 0.038739 0.990349 0.352956 0.741802 0.08259 0.549931 0.443619 0.373939 0.530751 0.010175 0.191082 0.640115 0.501624 0.174382 0.101526 0.079676 0.809239 0.692502 0.427418 0.997415 0.966291 0.062274 0.831887 0.208685 0.09417 0.718307 0.898092 0.87051 0.556249 0.965839 0.250099 0.51449 0.202927 0.140332 0.129598 0.637534 0.092663 0.567669 0.022574 0.655069 0.44293 0.879408 0.07686 0.959045 0.599402 0.807378 0.372091 0.945608 0.432572 0.799424 0.923392 0.126 [...]
+0.050871 0.543405 0.464299 0.123846 0.613317 0.956225 0.508663 0.861485 0.558702 0.396414 0.150457 0.654677 0.791486 0.955224 0.161877 0.412604 0.142913 0.339489 0.60319 0.353385 0.305636 0.347595 0.602724 0.584871 0.99482 0.935676 0.687135 0.993138 0.340045 0.473518 0.830191 0.48146 0.434526 0.094753 0.068581 0.707734 0.333864 0.029261 0.72204 0.23738 0.332714 0.633573 0.415737 0.73008 0.318746 0.449891 0.86897 0.384474 0.90204 0.286223 0.02508 0.927775 0.565596 0.797147 0.673108 0.2580 [...]
+0.781875 0.899704 0.450537 0.819323 0.506348 0.833055 0.28699 0.390016 0.86177 0.764982 0.715772 0.326182 0.505069 0.557869 0.560051 0.038857 0.322676 0.577799 0.070151 0.187067 0.070058 0.380022 0.91085 0.85016 0.90589 0.927219 0.29953 0.420076 0.806266 0.127461 0.142131 0.426503 0.263846 0.788313 0.011006 0.315238 0.4333 0.630985 0.10113 0.741792 0.412712 0.167333 0.70653 0.075714 0.732012 0.070031 0.508152 0.496845 0.84755 0.123008 0.586647 0.044249 0.759234 0.250599 0.508746 0.389945 [...]
+0.193458 0.566971 0.143211 0.808924 0.541137 0.811442 0.818447 0.255519 0.463313 0.665551 0.798482 0.256396 0.882644 0.535815 0.010393 0.269909 0.566912 0.107825 0.793253 0.767278 0.086369 0.524722 0.872869 0.157993 0.23587 0.557866 0.833394 0.311947 0.575235 0.997928 0.855769 0.795128 0.573151 0.201246 0.494992 0.48376 0.604573 0.859081 0.285446 0.988362 0.199879 0.096282 0.481847 0.91867 0.428786 0.172291 0.34542 0.539688 0.406188 0.986034 0.023427 0.052375 0.076148 0.348802 0.03408 0. [...]
+0.558767 0.824826 0.90774 0.333641 0.804903 0.075912 0.426262 0.62318 0.195165 0.07267 0.524294 0.153673 0.900738 0.445035 0.878191 0.518347 0.928479 0.873513 0.359272 0.635358 0.924949 0.993594 0.971503 0.658796 0.235742 0.301688 0.375071 0.544308 0.671427 0.932537 0.166331 0.529341 0.915456 0.490466 0.396428 0.681473 0.263762 0.471822 0.65695 0.727217 0.837555 0.009281 0.801377 0.961792 0.827891 0.08943 0.552332 0.004802 0.895456 0.471112 0.629639 0.813397 0.161879 0.327078 0.061646 0. [...]
+0.692087 0.729031 0.353253 0.891512 0.615567 0.391076 0.269257 0.348579 0.123904 0.834734 0.413983 0.955642 0.075299 0.68192 0.114764 0.704369 0.439877 0.155495 0.462821 0.66605 0.424279 0.359643 0.004197 0.124342 0.413606 0.573366 0.522697 0.923861 0.670253 0.947805 0.239219 0.716103 0.600528 0.167547 0.607061 0.336214 0.556563 0.403329 0.849462 0.147103 0.208112 0.5538 0.381234 0.596184 0.765962 0.93152 0.445112 0.132021 0.511021 0.736815 0.745261 0.478789 0.045964 0.387891 0.431301 0. [...]
+0.641086 0.34327 0.440861 0.763461 0.894989 0.607448 0.122106 0.437276 0.057577 0.820751 0.381885 0.001327 0.978219 0.314468 0.090399 0.668269 0.342372 0.788458 0.31971 0.78727 0.347114 0.388954 0.740065 0.874114 0.688872 0.314969 0.143186 0.889286 0.914707 0.697602 0.781484 0.580055 0.791812 0.166275 0.390393 0.862023 0.260831 0.495749 0.762946 0.962124 0.461555 0.4875 0.886917 0.73607 0.272105 0.240127 0.410523 0.894812 0.197662 0.657747 0.20204 0.428975 0.75819 0.962601 0.015367 0.194 [...]
+0.251453 0.40024 0.574358 0.351596 0.349328 0.8598 0.389341 0.918973 0.705236 0.937767 0.431941 0.357839 0.973364 0.816896 0.430351 0.728883 0.194608 0.22048 0.385258 0.626588 0.010428 0.38299 0.764684 0.895645 0.240356 0.022082 0.459057 0.244446 0.917968 0.871115 0.486779 0.92896 0.670772 0.440511 0.719825 0.193241 0.381688 0.155793 0.970767 0.702279 0.403415 0.511667 0.15564 0.150323 0.893343 0.801887 0.798736 0.187227 0.436129 0.901099 0.55812 0.733861 0.261063 0.363064 0.200362 0.851 [...]
+0.748838 0.911686 0.180146 0.243692 0.490313 0.87326 0.474296 0.66687 0.331205 0.177638 0.979331 0.329615 0.424285 0.559212 0.404796 0.929022 0.250194 0.308174 0.53538 0.897079 0.887952 0.139495 0.709708 0.936092 0.511181 0.435972 0.46485 0.270971 0.394376 0.364477 0.884572 0.918606 0.492395 0.289583 0.141789 0.204392 0.331991 0.829906 0.754307 0.052211 0.229417 0.79998 0.849682 0.094218 0.318976 0.128441 0.075598 0.746303 0.667065 0.868733 0.199238 0.60618 0.534452 0.441236 0.19211 0.86 [...]
+0.440059 0.612987 0.941631 0.091873 0.666805 0.58338 0.6699 0.549759 0.566441 0.890818 0.705469 0.39185 0.021056 0.595525 0.210118 0.93832 0.732752 0.04528 0.283138 0.770962 0.837404 0.232403 0.898452 0.242698 0.332782 0.800948 0.275907 0.86139 0.09478 0.528415 0.420625 0.872835 0.796749 0.181597 0.523231 0.940392 0.068682 0.951556 0.969227 0.514037 0.175139 0.640268 0.593377 0.448829 0.903148 0.766726 0.433021 0.989855 0.538818 0.536742 0.898593 0.716471 0.774595 0.525905 0.938147 0.470 [...]
+0.930203 0.066096 0.470968 0.926831 0.823751 0.322519 0.27734 0.305505 0.986711 0.735241 0.384077 0.014989 0.650257 0.548354 0.923507 0.137014 0.231731 0.092163 0.217154 0.521248 0.542347 0.003441 0.988901 0.627985 0.609296 0.622284 0.612137 0.49807 0.48658 0.204912 0.60077 0.282629 0.342163 0.050034 0.567155 0.769984 0.964459 0.718389 0.475405 0.244358 0.48145 0.734204 0.887148 0.363086 0.868545 0.08114 0.686219 0.550122 0.59953 0.382746 0.773974 0.85032 0.184575 0.131086 0.48342 0.8545 [...]
+0.235943 0.994033 0.450783 0.646203 0.603826 0.583145 0.847052 0.818188 0.855483 0.501567 0.323969 0.266036 0.382413 0.623518 0.831525 0.006779 0.930139 0.239951 0.244108 0.146328 0.428488 0.566815 0.61526 0.645497 0.646609 0.641265 0.343502 0.175394 0.126189 0.571178 0.671096 0.098467 0.52591 0.779945 0.114219 0.393289 0.727834 0.897092 0.532467 0.403019 0.863691 0.026727 0.409567 0.133876 0.873628 0.742736 0.724313 0.377391 0.482483 0.339179 0.032164 0.061976 0.935627 0.960208 0.039576 [...]
+0.220296 0.211564 0.707278 0.275444 0.423314 0.792123 0.65207 0.009246 0.333481 0.344819 0.234541 0.612364 0.084173 0.830658 0.056825 0.00712 0.219334 0.349278 0.371252 0.921371 0.152015 0.809231 0.339686 0.063867 0.641862 0.812392 0.314415 0.336021 0.059083 0.654881 0.966682 0.523124 0.527995 0.951969 0.070215 0.418911 0.351718 0.064843 0.373932 0.332355 0.384583 0.282773 0.292399 0.871021 0.763204 0.693364 0.329727 0.775055 0.00587 0.417185 0.303149 0.721616 0.366709 0.831117 0.726522  [...]
+0.282715 0.892136 0.286562 0.501822 0.798131 0.905312 0.68022 0.28 0.874946 0.420532 0.838117 0.828141 0.343736 0.142246 0.293496 0.344297 0.233381 0.87142 0.412088 0.973682 0.088936 0.442923 0.190073 0.156686 0.993611 0.088975 0.606648 0.5786 0.322433 0.456644 0.906569 0.779221 0.568881 0.63655 0.562819 0.81129 0.090616 0.655326 0.103694 0.022569 0.219666 0.976002 0.727984 0.032121 0.037064 0.948595 0.188124 0.019848 0.445191 0.357999 0.014256 0.021274 0.190945 0.868074 0.584995 0.70491 [...]
+0.107028 0.581607 0.299964 0.550259 0.560913 0.318251 0.356461 0.882291 0.638929 0.282011 0.449732 0.202824 0.071531 0.776493 0.903653 0.410129 0.505576 0.427552 0.478522 0.094686 0.026967 0.076821 0.395318 0.247391 0.044675 0.38487 0.698236 0.849112 0.472822 0.959024 0.352143 0.3504 0.470681 0.423201 0.859483 0.098429 0.988054 0.498734 0.609955 0.340294 0.160615 0.339345 0.663505 0.186552 0.134085 0.033068 0.06015 0.18359 0.133312 0.744666 0.907699 0.52662 0.984463 0.470827 0.946974 0.1 [...]
+0.241612 0.591697 0.043875 0.010109 0.289714 0.945015 0.530878 0.922032 0.074008 0.124364 0.259406 0.520572 0.854531 0.563848 0.682503 0.804715 0.93351 0.750394 0.99091 0.894313 0.331322 0.585393 0.486943 0.820696 0.921742 0.605871 0.193717 0.639337 0.620176 0.451157 0.617417 0.31748 0.18253 0.688911 0.761098 0.17809 0.910763 0.229084 0.012338 0.101316 0.057961 0.136776 0.849708 0.837015 0.741137 0.137052 0.704893 0.185953 0.729231 0.341651 0.761004 0.065244 0.674016 0.667194 0.584223 0. [...]
+0.241528 0.939755 0.002132 0.853796 0.7509 0.623462 0.726671 0.446309 0.570638 0.648687 0.853591 0.625251 0.764793 0.345608 0.710082 0.859298 0.290674 0.305844 0.06338 0.194167 0.601281 0.355596 0.753422 0.537534 0.067262 0.154968 0.795456 0.518771 0.879006 0.588602 0.73814 0.237885 0.189594 0.124754 0.681018 0.019924 0.689211 0.646075 0.147966 0.71615 0.877995 0.921619 0.000751 0.311369 0.369726 0.041786 0.749497 0.354803 0.874994 0.595067 0.925032 0.279133 0.245487 0.62563 0.586512 0.6 [...]
+0.058875 0.286709 0.532032 0.10436 0.355959 0.693837 0.779266 0.890703 0.126277 0.476062 0.865567 0.070708 0.748819 0.766052 0.424181 0.91581 0.560965 0.27328 0.101911 0.910464 0.256322 0.884428 0.621482 0.974981 0.897908 0.214412 0.213681 0.93499 0.953209 0.692349 0.519785 0.108894 0.713813 0.010649 0.558506 0.721973 0.352308 0.808514 0.889568 0.452941 0.119221 0.206798 0.327782 0.818542 0.622485 0.693704 0.544873 0.65765 0.408643 0.292734 0.347539 0.571566 0.04841 0.787434 0.614306 0.6 [...]
+0.792419 0.142203 0.324063 0.058885 0.256879 0.062378 0.079903 0.796588 0.514441 0.950495 0.746453 0.715958 0.398947 0.147615 0.282252 0.720352 0.563188 0.530032 0.865911 0.950042 0.699103 0.049322 0.558079 0.264119 0.811257 0.674889 0.928862 0.582597 0.675438 0.786319 0.790612 0.856965 0.731539 0.81656 0.269212 0.849964 0.0194 0.96582 0.795402 0.198179 0.894783 0.323365 0.869655 0.473552 0.315814 0.890006 0.972142 0.47839 0.68203 0.154194 0.915615 0.22573 0.966113 0.626756 0.635008 0.87 [...]
+0.230527 0.946248 0.120619 0.85118 0.224441 0.386421 0.735255 0.192366 0.63231 0.31127 0.506648 0.928514 0.615053 0.679499 0.729683 0.992509 0.329407 0.24187 0.368046 0.763108 0.55372 0.038662 0.956307 0.711553 0.779064 0.295943 0.984582 0.060733 0.034332 0.010247 0.496998 0.749121 0.527435 0.763972 0.223944 0.815274 0.234649 0.461497 0.797645 0.984207 0.878168 0.240074 0.505538 0.302502 0.145313 0.300793 0.063675 0.332276 0.66826 0.157227 0.40831 0.925472 0.256149 0.842685 0.082514 0.43 [...]
+0.856606 0.335333 0.076741 0.034171 0.753522 0.400981 0.382418 0.112002 0.370373 0.500537 0.045081 0.641217 0.019215 0.858794 0.53405 0.68423 0.332795 0.637904 0.42265 0.450126 0.595028 0.655367 0.3653 0.878899 0.968641 0.618063 0.303067 0.016675 0.325766 0.237174 0.74495 0.025789 0.462773 0.044488 0.999044 0.417844 0.758335 0.748056 0.802071 0.729642 0.662332 0.086065 0.626304 0.822342 0.128392 0.787616 0.807179 0.065956 0.040362 0.090286 0.856002 0.384153 0.808079 0.484632 0.596879 0.9 [...]
+0.214958 0.494792 0.579512 0.501857 0.575236 0.095324 0.8277 0.072267 0.749729 0.393772 0.814083 0.869981 0.37184 0.962491 0.844297 0.443801 0.584573 0.209865 0.942802 0.353609 0.324447 0.435061 0.069929 0.577693 0.356964 0.059146 0.287913 0.942992 0.893617 0.785864 0.575014 0.291915 0.249638 0.779427 0.370373 0.426387 0.910652 0.389638 0.397952 0.301183 0.047125 0.292175 0.005489 0.091173 0.498384 0.11098 0.157691 0.955512 0.130642 0.02167 0.73566 0.282501 0.417704 0.144781 0.623995 0.5 [...]
+0.050286 0.328559 0.798384 0.93258 0.476971 0.184907 0.023313 0.851738 0.352427 0.758398 0.052301 0.679353 0.044123 0.312169 0.417918 0.49727 0.650934 0.187844 0.807527 0.13437 0.591523 0.145138 0.42432 0.734215 0.342174 0.736194 0.096987 0.327199 0.202741 0.106464 0.327167 0.799641 0.844897 0.3655 0.472134 0.147335 0.704284 0.955172 0.55414 0.903036 0.194063 0.199483 0.817971 0.934922 0.778392 0.163526 0.963416 0.581279 0.848244 0.919564 0.835821 0.079785 0.619593 0.960308 0.707253 0.42 [...]
+0.130104 0.781541 0.672837 0.892979 0.428249 0.18266 0.158509 0.793301 0.520607 0.576978 0.063045 0.176732 0.812444 0.406881 0.164488 0.017339 0.902844 0.634618 0.718104 0.901754 0.799167 0.130291 0.537689 0.627566 0.965356 0.076419 0.422806 0.178264 0.201664 0.410588 0.036273 0.171281 0.25004 0.344822 0.235716 0.158425 0.671538 0.850488 0.723645 0.567841 0.903148 0.968738 0.569277 0.872796 0.21741 0.707249 0.323482 0.130791 0.770807 0.682459 0.070365 0.952156 0.968135 0.192265 0.52294 0 [...]
+0.420011 0.966067 0.361511 0.961896 0.555401 0.900723 0.094105 0.576148 0.14545 0.04706 0.78079 0.368371 0.922804 0.465879 0.155478 0.317263 0.141455 0.532583 0.130408 0.019359 0.169905 0.222478 0.864936 0.79517 0.944582 0.951423 0.92602 0.023192 0.844701 0.971693 0.714027 0.202357 0.351343 0.048873 0.127545 0.776136 0.514489 0.517055 0.436229 0.911812 0.15999 0.73307 0.016551 0.118338 0.450605 0.845333 0.443887 0.67213 0.943119 0.761789 0.294903 0.610298 0.842044 0.870801 0.271801 0.684 [...]
+0.706401 0.31183 0.670717 0.369495 0.08338 0.774046 0.219232 0.844577 0.233335 0.667906 0.052648 0.166361 0.771368 0.486373 0.28788 0.751498 0.715456 0.871532 0.341333 0.712774 0.933951 0.809677 0.50229 0.144755 0.883973 0.625837 0.68776 0.704685 0.708194 0.639917 0.30065 0.251458 0.544088 0.653922 0.751264 0.319845 0.114384 0.535329 0.311186 0.828007 0.999171 0.410289 0.029281 0.790195 0.263427 0.636578 0.367669 0.947562 0.865402 0.065122 0.674376 0.310866 0.834424 0.289975 0.115675 0.9 [...]
+0.727137 0.890998 0.851309 0.7497 0.845354 0.30041 0.238078 0.367234 0.798125 0.023975 0.885119 0.450015 0.08843 0.295908 0.211813 0.269582 0.256389 0.0748 0.514697 0.917347 0.261257 0.402821 0.267474 0.575138 0.826014 0.778188 0.045159 0.960136 0.709863 0.068354 0.901219 0.759573 0.223364 0.17427 0.478862 0.190775 0.683531 0.633905 0.369927 0.221166 0.247658 0.919644 0.705479 0.169709 0.397338 0.984545 0.83205 0.957824 0.702968 0.375215 0.160911 0.559149 0.98609 0.216185 0.606632 0.5639 [...]
+0.582798 0.072804 0.461533 0.90615 0.480582 0.154207 0.920962 0.143487 0.649446 0.563829 0.389762 0.624442 0.555546 0.704428 0.288225 0.898899 0.443749 0.192826 0.195719 0.737448 0.709339 0.459601 0.278664 0.250529 0.030098 0.90874 0.432975 0.473898 0.233065 0.344198 0.229578 0.796629 0.794136 0.225461 0.743451 0.360169 0.222591 0.968666 0.601448 0.052713 0.994469 0.786288 0.996689 0.39851 0.057664 0.927034 0.423001 0.2987 0.630938 0.382788 0.192773 0.991359 0.370482 0.497991 0.27982 0.4 [...]
+0.831511 0.843073 0.200524 0.380511 0.163067 0.563913 0.523266 0.6327 0.229055 0.518937 0.103492 0.926019 0.307525 0.527839 0.109817 0.890751 0.122387 0.844596 0.711568 0.749517 0.907839 0.21357 0.145163 0.321245 0.357378 0.951732 0.556973 0.694421 0.263161 0.578927 0.114211 0.986884 0.242303 0.88683 0.837315 0.964012 0.93398 0.277898 0.937088 0.295385 0.601444 0.698835 0.177428 0.417215 0.051495 0.616276 0.4676 0.536704 0.924014 0.583375 0.136716 0.726932 0.834192 0.417418 0.818734 0.40 [...]
+0.517866 0.633844 0.291054 0.406128 0.592776 0.829446 0.52687 0.86766 0.677581 0.438159 0.464014 0.721379 0.065677 0.391435 0.425433 0.564335 0.343047 0.557159 0.08249 0.021007 0.587532 0.05925 0.048061 0.468921 0.86133 0.160859 0.827856 0.498896 0.797861 0.696765 0.911549 0.214648 0.005753 0.792594 0.985844 0.67126 0.183577 0.451764 0.331555 0.656714 0.558334 0.097851 0.813971 0.689835 0.187189 0.421492 0.435314 0.49429 0.488882 0.482184 0.756468 0.610241 0.404299 0.686229 0.836611 0.22 [...]
+0.464031 0.928862 0.875918 0.493385 0.20676 0.947674 0.336975 0.834498 0.06823 0.766992 0.251486 0.659095 0.666862 0.159773 0.870358 0.752347 0.240654 0.196038 0.986015 0.13866 0.702624 0.378515 0.52719 0.10492 0.293451 0.887922 0.467033 0.77577 0.543083 0.764063 0.981275 0.607214 0.058245 0.388131 0.282068 0.911201 0.040326 0.612367 0.060187 0.097314 0.425853 0.652262 0.250621 0.757542 0.840787 0.259951 0.258894 0.753836 0.979994 0.308226 0.395597 0.521834 0.996245 0.720664 0.23007 0.33 [...]
+0.21501 0.743151 0.55157 0.559375 0.578956 0.541219 0.092931 0.903772 0.740091 0.81644 0.270889 0.288703 0.688427 0.549853 0.058826 0.719859 0.966325 0.856596 0.346899 0.68136 0.974171 0.495182 0.400076 0.501631 0.618475 0.326746 0.573178 0.112902 0.456428 0.73438 0.069042 0.861111 0.525251 0.316299 0.948342 0.95434 0.251998 0.399556 0.745137 0.326458 0.527247 0.63171 0.433753 0.52005 0.091376 0.74035 0.458125 0.051993 0.368164 0.006302 0.673651 0.161519 0.894772 0.890169 0.997393 0.4531 [...]
+0.780003 0.931188 0.287803 0.844327 0.036993 0.029202 0.251896 0.744611 0.023289 0.447212 0.1371 0.077589 0.482808 0.1643 0.744154 0.428521 0.44857 0.658759 0.734773 0.455921 0.161517 0.366815 0.10002 0.609136 0.94797 0.171355 0.522968 0.332323 0.082333 0.657943 0.101484 0.092257 0.516567 0.422635 0.103216 0.984389 0.379832 0.922489 0.804012 0.929354 0.671677 0.774688 0.15233 0.231761 0.752912 0.952584 0.905527 0.203031 0.790389 0.061341 0.659894 0.989601 0.071764 0.079431 0.665402 0.984 [...]
+0.891877 0.15625 0.172226 0.998437 0.426184 0.546972 0.49737 0.906484 0.458084 0.87898 0.635081 0.500169 0.760402 0.384824 0.307642 0.021899 0.371653 0.123216 0.975768 0.071123 0.932243 0.638555 0.187293 0.543247 0.192829 0.167493 0.008038 0.947879 0.508392 0.87305 0.358947 0.843495 0.31423 0.69429 0.20807 0.461857 0.495348 0.585864 0.675533 0.932153 0.776777 0.587198 0.85875 0.509466 0.660493 0.235628 0.242977 0.499043 0.445138 0.552683 0.175528 0.299427 0.456812 0.008881 0.715087 0.929 [...]
+0.638929 0.760497 0.029036 0.196052 0.091898 0.017433 0.752602 0.97291 0.105246 0.668971 0.12616 0.437493 0.419602 0.019111 0.996923 0.549435 0.533462 0.695349 0.895316 0.736302 0.622119 0.463296 0.712246 0.411211 0.264028 0.132763 0.619453 0.13334 0.30892 0.522619 0.181694 0.080889 0.388811 0.713264 0.742413 0.84966 0.415766 0.106273 0.901215 0.857806 0.924278 0.651324 0.80692 0.896551 0.78332 0.91584 0.768795 0.012335 0.199877 0.688024 0.988061 0.985199 0.204105 0.947293 0.833363 0.438 [...]
+0.448313 0.121462 0.948116 0.75653 0.85859 0.648816 0.212211 0.221459 0.055917 0.899159 0.533397 0.074919 0.346371 0.308675 0.165775 0.089619 0.680026 0.579619 0.878805 0.918309 0.140305 0.807397 0.859268 0.875944 0.816997 0.217162 0.293708 0.278065 0.081056 0.196869 0.947688 0.723112 0.881985 0.990525 0.208406 0.020333 0.536809 0.199008 0.844301 0.246485 0.748387 0.860482 0.803875 0.296383 0.784665 0.544166 0.737728 0.930805 0.197586 0.946775 0.397146 0.559454 0.071635 0.026863 0.108891 [...]
+0.242628 0.716476 0.55131 0.234682 0.911516 0.171532 0.612396 0.877825 0.873703 0.961725 0.788728 0.373275 0.172605 0.901111 0.344817 0.95572 0.391984 0.822098 0.730178 0.099755 0.836926 0.622727 0.745221 0.849036 0.961441 0.024626 0.163683 0.270775 0.540611 0.817161 0.450579 0.277202 0.175578 0.930519 0.683728 0.765042 0.853952 0.050864 0.453228 0.727515 0.116584 0.2722 0.231139 0.108743 0.845698 0.105736 0.425474 0.802365 0.686251 0.321267 0.732219 0.900921 0.112698 0.616113 0.21542 0. [...]
+0.098488 0.052284 0.684761 0.729601 0.171902 0.233525 0.550544 0.490868 0.505321 0.524488 0.886108 0.944274 0.836001 0.222248 0.212357 0.976434 0.575433 0.100866 0.420238 0.596629 0.260685 0.834758 0.297042 0.447876 0.821267 0.744355 0.283708 0.906044 0.315878 0.284925 0.344043 0.64412 0.633571 0.487156 0.915534 0.310826 0.359054 0.715343 0.047807 0.215848 0.870944 0.236084 0.747688 0.799252 0.053702 0.296322 0.973476 0.61833 0.664165 0.547154 0.886802 0.318586 0.955192 0.015306 0.999979 [...]
+0.515237 0.598232 0.560163 0.915379 0.301586 0.784116 0.244357 0.861352 0.689329 0.488332 0.215618 0.318057 0.051208 0.749325 0.644339 0.138605 0.217382 0.836932 0.129094 0.143665 0.400386 0.186238 0.598018 0.503476 0.404567 0.656834 0.94013 0.587864 0.5665 0.342153 0.349053 0.811266 0.008578 0.927812 0.685038 0.260029 0.967677 0.60896 0.884211 0.763114 0.380014 0.244021 0.886742 0.099557 0.155912 0.549198 0.338007 0.28943 0.079905 0.369989 0.174478 0.914204 0.386003 0.718333 0.836227 0. [...]
+0.722606 0.055938 0.885428 0.020928 0.491477 0.339845 0.27636 0.289298 0.302535 0.929652 0.285781 0.749996 0.218402 0.418948 0.027902 0.604527 0.498354 0.183161 0.615319 0.367031 0.691577 0.937989 0.001338 0.26408 0.801022 0.905647 0.910038 0.846358 0.134818 0.344266 0.475933 0.224712 0.241984 0.531551 0.536999 0.38213 0.8097 0.85805 0.881974 0.744092 0.623603 0.092863 0.261145 0.249482 0.181492 0.673616 0.165084 0.016777 0.084519 0.8709 0.986911 0.970453 0.208202 0.315238 0.543152 0.270 [...]
+0.832605 0.98722 0.284284 0.751407 0.532806 0.121693 0.754231 0.946471 0.680056 0.150955 0.150763 0.262727 0.026562 0.547173 0.14298 0.449218 0.928183 0.096141 0.006053 0.607218 0.260685 0.20847 0.927277 0.505007 0.284197 0.692382 0.217257 0.966623 0.166055 0.545581 0.811279 0.448058 0.031886 0.94537 0.406988 0.18798 0.663125 0.940696 0.093427 0.545771 0.510525 0.222196 0.386071 0.644687 0.675685 0.847135 0.699376 0.808463 0.233668 0.22417 0.59265 0.2535 0.487185 0.808581 0.894524 0.5352 [...]
+0.594737 0.512271 0.334205 0.510356 0.228535 0.949858 0.312046 0.717829 0.76087 0.551412 0.044751 0.31949 0.848904 0.811126 0.264422 0.89577 0.33671 0.192174 0.208792 0.499493 0.743687 0.469863 0.922802 0.703995 0.961321 0.209747 0.958472 0.931283 0.538104 0.853158 0.489822 0.172215 0.121154 0.781285 0.446728 0.89853 0.82208 0.194303 0.764277 0.529282 0.646832 0.169373 0.454491 0.280743 0.448824 0.168123 0.174093 0.426037 0.510614 0.567456 0.531119 0.654898 0.779718 0.237794 0.58562 0.48 [...]
+0.136344 0.298571 0.525491 0.48884 0.291171 0.248323 0.732246 0.335377 0.864871 0.414416 0.263708 0.474301 0.120496 0.75453 0.512015 0.967303 0.715984 0.610362 0.414518 0.283758 0.956779 0.967992 0.107642 0.635779 0.473769 0.371294 0.570974 0.680832 0.288059 0.884911 0.006804 0.281233 0.153765 0.130114 0.519375 0.59013 0.75452 0.854712 0.231958 0.129848 0.418713 0.033866 0.764702 0.602547 0.46444 0.474414 0.995153 0.696549 0.397211 0.823372 0.688503 0.689568 0.54263 0.67392 0.528027 0.26 [...]
+0.788766 0.187888 0.874847 0.928754 0.900833 0.830103 0.501916 0.89299 0.593183 0.197613 0.597059 0.676534 0.63162 0.754616 0.964249 0.649297 0.745455 0.205257 0.841213 0.565342 0.527187 0.698515 0.647239 0.87421 0.769925 0.0608 0.20691 0.539661 0.656712 0.615459 0.399159 0.891816 0.221804 0.546245 0.45021 0.006572 0.084906 0.61363 0.091407 0.003056 0.218907 0.489474 0.03379 0.095846 0.892227 0.574563 0.553198 0.916789 0.272938 0.756333 0.074213 0.085065 0.251282 0.667255 0.827196 0.4760 [...]
+0.944352 0.354418 0.549437 0.776678 0.8441 0.362678 0.358843 0.832016 0.385238 0.707442 0.283874 0.745976 0.133424 0.916372 0.65354 0.207645 0.262299 0.929136 0.564992 0.244779 0.095678 0.882766 0.494766 0.804238 0.751312 0.289011 0.165214 0.347886 0.174175 0.333143 0.687427 0.383615 0.211096 0.273235 0.892089 0.007567 0.134087 0.382345 0.680721 0.860673 0.865525 0.012098 0.573733 0.819056 0.629315 0.841802 0.859789 0.614338 0.207539 0.534387 0.662207 0.107358 0.059808 0.867585 0.073913  [...]
+0.232489 0.304645 0.170877 0.632403 0.731918 0.792626 0.28357 0.393332 0.969508 0.782204 0.198658 0.41223 0.763271 0.864994 0.196743 0.627501 0.742459 0.9217 0.995915 0.165519 0.487264 0.908878 0.111007 0.519582 0.663337 0.439734 0.385211 0.764729 0.882867 0.061839 0.132719 0.349803 0.358122 0.96441 0.298575 0.921279 0.85692 0.884023 0.185104 0.886441 0.165052 0.934929 0.90804 0.696601 0.849272 0.820058 0.925733 0.117654 0.748275 0.06424 0.462205 0.716622 0.451927 0.433361 0.919562 0.269 [...]
+0.085873 0.133816 0.024778 0.289636 0.958156 0.339834 0.880021 0.291175 0.437208 0.103245 0.330008 0.537639 0.335545 0.783522 0.686939 0.35577 0.057824 0.539559 0.599471 0.784497 0.479125 0.329788 0.723671 0.560841 0.914782 0.136403 0.017632 0.068198 0.656777 0.010714 0.386118 0.32137 0.30444 0.151033 0.658627 0.713964 0.07062 0.229506 0.786103 0.44026 0.891436 0.353142 0.586755 0.095467 0.283565 0.806851 0.407346 0.365761 0.20749 0.086268 0.49082 0.360536 0.996544 0.730032 0.460365 0.13 [...]
+0.75643 0.243258 0.106948 0.625987 0.053383 0.990229 0.171668 0.688864 0.897455 0.917046 0.074632 0.135732 0.245171 0.007574 0.99536 0.713502 0.512259 0.526674 0.12675 0.651645 0.330905 0.620944 0.899307 0.81606 0.751101 0.20148 0.322406 0.015364 0.820776 0.307031 0.981911 0.477836 0.453872 0.206212 0.783667 0.220124 0.739098 0.917601 0.33556 0.756125 0.834753 0.306706 0.38265 0.064362 0.330691 0.582128 0.860939 0.385852 0.598833 0.547941 0.52151 0.007901 0.745128 0.751504 0.266428 0.469 [...]
+0.318277 0.360125 0.610717 0.992244 0.032775 0.315578 0.014287 0.510165 0.657818 0.247101 0.859779 0.606147 0.371086 0.887767 0.635926 0.457389 0.324357 0.691598 0.917999 0.645096 0.323198 0.876575 0.399253 0.539371 0.851689 0.533534 0.715487 0.529346 0.456226 0.799904 0.650184 0.684509 0.860727 0.465784 0.43698 0.018057 0.30924 0.680789 0.395829 0.897825 0.158172 0.626605 0.023307 0.347086 0.042567 0.78287 0.140606 0.338917 0.654623 0.740576 0.84114 0.206811 0.315404 0.623307 0.518794 0 [...]
+0.940822 0.3052 0.042805 0.546851 0.48966 0.542171 0.414807 0.579083 0.928772 0.593623 0.004856 0.85785 0.460995 0.663373 0.702628 0.693628 0.581783 0.927621 0.482468 0.89887 0.392724 0.030495 0.082035 0.849563 0.216102 0.458213 0.893376 0.771072 0.778021 0.649846 0.078165 0.922802 0.792929 0.676071 0.077028 0.859156 0.725532 0.557586 0.445255 0.417488 0.503335 0.648934 0.465447 0.861857 0.94255 0.988987 0.01473 0.855091 0.931892 0.760294 0.868547 0.958611 0.635719 0.510607 0.260179 0.45 [...]
+0.967419 0.968666 0.647459 0.460063 0.776885 0.005049 0.703513 0.315535 0.075906 0.926111 0.974475 0.131101 0.971889 0.062677 0.186461 0.023826 0.552828 0.25879 0.716788 0.67589 0.936967 0.04871 0.080263 0.475737 0.071716 0.691064 0.979057 0.743239 0.750943 0.102025 0.638838 0.501361 0.120387 0.38126 0.035471 0.318673 0.565308 0.336496 0.817171 0.525812 0.81575 0.783316 0.055702 0.594961 0.595116 0.709761 0.749735 0.896958 0.72515 0.718379 0.04701 0.802827 0.130357 0.292829 0.349556 0.08 [...]
+0.529922 0.993022 0.769409 0.674102 0.312892 0.339561 0.886192 0.813589 0.183201 0.977432 0.077345 0.628005 0.377156 0.678532 0.139425 0.511538 0.482527 0.806039 0.225612 0.628636 0.624634 0.537294 0.39916 0.825594 0.317879 0.082899 0.092128 0.572164 0.477772 0.814396 0.728734 0.779952 0.292085 0.279378 0.137446 0.249452 0.478529 0.665715 0.790505 0.840123 0.358472 0.019067 0.591976 0.442604 0.383594 0.375308 0.780712 0.556578 0.597626 0.58421 0.718682 0.216974 0.972196 0.655993 0.752595 [...]
+0.642234 0.847932 0.127931 0.50024 0.579778 0.080887 0.675498 0.20887 0.925823 0.406131 0.282371 0.678155 0.815786 0.274359 0.249632 0.322325 0.85322 0.477369 0.070563 0.207714 0.532685 0.608599 0.446868 0.051436 0.849217 0.473779 0.627187 0.22867 0.839101 0.076502 0.313494 0.680084 0.91371 0.610597 0.095251 0.102443 0.245794 0.280459 0.596741 0.865551 0.83929 0.317839 0.428408 0.823595 0.505651 0.076264 0.711806 0.486503 0.681651 0.828302 0.724697 0.740427 0.567188 0.828583 0.03027 0.77 [...]
+0.87407 0.065077 0.665065 0.000899 0.163408 0.066686 0.620959 0.238517 0.072476 0.215381 0.348924 0.796304 0.170045 0.192082 0.763125 0.760365 0.816563 0.212841 0.179671 0.080224 0.929209 0.340254 0.085168 0.661228 0.469935 0.401282 0.416471 0.485691 0.172068 0.143939 0.431193 0.947062 0.013133 0.377481 0.724689 0.720091 0.66592 0.771747 0.405503 0.203921 0.663943 0.900016 0.305678 0.258007 0.942921 0.851509 0.181341 0.25396 0.727803 0.889534 0.224696 0.27428 0.161357 0.084144 0.921965 0 [...]
+0.154881 0.483971 0.18562 0.356296 0.833501 0.228889 0.836612 0.22901 0.330079 0.610976 0.166981 0.722465 0.92822 0.758446 0.819376 0.319367 0.078071 0.395363 0.987636 0.245463 0.711234 0.768129 0.380567 0.829561 0.009225 0.065694 0.713604 0.770708 0.863141 0.024584 0.964236 0.354493 0.863955 0.055633 0.393711 0.754889 0.253605 0.877694 0.1987 0.281886 0.098752 0.802542 0.112173 0.641894 0.419524 0.888842 0.490918 0.583751 0.226236 0.062465 0.569669 0.526613 0.725503 0.821614 0.803161 0. [...]
+0.60891 0.98281 0.810771 0.626322 0.97402 0.580032 0.605473 0.801408 0.772065 0.601971 0.957719 0.015092 0.840484 0.50075 0.548568 0.885505 0.430984 0.094723 0.754585 0.115698 0.834845 0.858171 0.492414 0.890946 0.098049 0.662215 0.103009 0.525985 0.95278 0.936027 0.148422 0.684384 0.215537 0.048802 0.411537 0.741318 0.333254 0.437628 0.255209 0.612887 0.17118 0.949621 0.040875 0.867758 0.829838 0.47499 0.766892 0.26252 0.14565 0.224684 0.542544 0.863832 0.966277 0.135179 0.078778 0.1909 [...]
+0.187787 0.923895 0.458044 0.725488 0.9221 0.762831 0.312139 0.616613 0.387822 0.188809 0.21806 0.400359 0.179689 0.698958 0.149976 0.443427 0.158453 0.914398 0.494297 0.632149 0.895969 0.607718 0.777421 0.766424 0.291844 0.454665 0.929106 0.68438 0.339226 0.730741 0.754873 0.73968 0.121988 0.240019 0.791894 0.078158 0.031083 0.531964 0.941528 0.309799 0.217418 0.792787 0.768267 0.283464 0.388996 0.251494 0.634409 0.959383 0.92268 0.842082 0.32121 0.744985 0.750134 0.866196 0.91692 0.951 [...]
+0.387075 0.225651 0.939602 0.853905 0.967709 0.010523 0.195172 0.623767 0.69548 0.71264 0.170452 0.419395 0.318434 0.254005 0.767863 0.124988 0.090335 0.649312 0.663624 0.558677 0.750797 0.655993 0.724747 0.220459 0.984131 0.507106 0.545717 0.088274 0.795759 0.618975 0.629934 0.48455 0.896376 0.536394 0.924113 0.483204 0.335847 0.292451 0.644423 0.463029 0.959003 0.92608 0.402941 0.134387 0.332232 0.56794 0.448038 0.52522 0.068292 0.816445 0.011664 0.896835 0.199824 0.271379 0.11378 0.28 [...]
+0.204931 0.28907 0.378584 0.608594 0.609168 0.050852 0.098468 0.969719 0.433575 0.545587 0.980042 0.592719 0.815103 0.706985 0.481474 0.122118 0.515451 0.019269 0.288608 0.947354 0.277859 0.312303 0.456759 0.025352 0.840855 0.327323 0.216203 0.791502 0.219603 0.454342 0.53303 0.420896 0.203262 0.392238 0.968928 0.697862 0.566094 0.512927 0.435513 0.738505 0.924036 0.212111 0.769214 0.408188 0.053223 0.698226 0.024464 0.40784 0.07925 0.512264 0.562664 0.072906 0.718774 0.778248 0.043557 0 [...]
+0.257122 0.536964 0.926525 0.214183 0.368301 0.892689 0.838739 0.666139 0.035511 0.465021 0.66594 0.381183 0.258808 0.082212 0.631918 0.903003 0.451789 0.047822 0.506865 0.524226 0.400096 0.086896 0.112117 0.225876 0.841569 0.513675 0.364553 0.065569 0.913068 0.800594 0.439556 0.204982 0.254542 0.207756 0.137037 0.705533 0.847533 0.703556 0.213294 0.276474 0.740377 0.308425 0.072024 0.151349 0.59466 0.702583 0.182234 0.481178 0.706304 0.992 0.573786 0.684416 0.688386 0.03849 0.475351 0.9 [...]
+0.527416 0.115842 0.357435 0.984469 0.409423 0.703084 0.547217 0.8469 0.594232 0.462734 0.876439 0.504631 0.103484 0.093317 0.410378 0.061769 0.69779 0.552489 0.015644 0.665728 0.129607 0.742615 0.561405 0.077824 0.323303 0.194285 0.115231 0.302851 0.565218 0.451151 0.50562 0.84887 0.281629 0.144662 0.5763 0.703157 0.921376 0.215189 0.998714 0.33851 0.862214 0.884984 0.095206 0.720358 0.993425 0.798329 0.928837 0.595869 0.516383 0.067433 0.240015 0.948156 0.400915 0.057742 0.614049 0.698 [...]
+0.109577 0.478456 0.520085 0.322397 0.704797 0.843299 0.051343 0.731206 0.411698 0.016102 0.462356 0.627627 0.793975 0.190026 0.070513 0.422069 0.748949 0.55205 0.172144 0.587502 0.027565 0.666191 0.832555 0.925526 0.474557 0.804766 0.821781 0.122532 0.012303 0.806564 0.034276 0.089954 0.550216 0.923098 0.266359 0.556913 0.374515 0.163487 0.532218 0.266329 0.269925 0.606135 0.892892 0.866273 0.958216 0.008652 0.811254 0.708501 0.947105 0.107253 0.184817 0.965216 0.702266 0.753279 0.54558 [...]
+0.663776 0.2438 0.200839 0.555724 0.848043 0.481317 0.926985 0.32307 0.573713 0.297294 0.274951 0.570009 0.796487 0.89856 0.071451 0.966326 0.640859 0.269659 0.05804 0.923662 0.835857 0.428277 0.332729 0.756275 0.23319 0.944653 0.952903 0.277002 0.059897 0.329306 0.261124 0.610325 0.331028 0.870405 0.291468 0.929082 0.45372 0.723258 0.80798 0.685201 0.311034 0.618449 0.039935 0.768492 0.582265 0.305539 0.987347 0.614531 0.919851 0.755615 0.767926 0.554425 0.884203 0.841639 0.305233 0.926 [...]
+0.687525 0.665264 0.200356 0.327397 0.11997 0.077249 0.865664 0.75111 0.025306 0.973377 0.629809 0.388183 0.217187 0.080141 0.604506 0.41111 0.279543 0.792057 0.111432 0.121459 0.107128 0.715124 0.923369 0.712368 0.897004 0.53251 0.696255 0.736837 0.216095 0.121311 0.151413 0.627468 0.492293 0.046931 0.456821 0.618097 0.188872 0.59757 0.645548 0.98062 0.470728 0.206993 0.260493 0.850504 0.388431 0.711321 0.875215 0.902485 0.04519 0.197797 0.740232 0.990556 0.758369 0.770648 0.990525 0.79 [...]
+0.977141 0.075745 0.464623 0.252269 0.646898 0.193921 0.178711 0.67075 0.154091 0.296984 0.356371 0.444398 0.719686 0.836535 0.154382 0.332019 0.123296 0.846418 0.523668 0.927614 0.30669 0.094234 0.281746 0.495841 0.59901 0.061833 0.084617 0.42724 0.898253 0.006737 0.07108 0.196211 0.994855 0.811892 0.088529 0.449795 0.832745 0.289197 0.866257 0.361266 0.169684 0.812616 0.475266 0.045976 0.576527 0.46831 0.116472 0.246289 0.126539 0.12212 0.441591 0.714839 0.600909 0.064496 0.158603 0.92 [...]
+0.489479 0.150065 0.067132 0.508028 0.749204 0.642601 0.612003 0.051547 0.226771 0.816995 0.412533 0.756208 0.300879 0.878525 0.267361 0.857333 0.741702 0.718552 0.352439 0.278908 0.680761 0.925538 0.935931 0.958014 0.245454 0.365003 0.575653 0.233584 0.66868 0.090097 0.924651 0.296183 0.917677 0.004735 0.866367 0.142926 0.439273 0.016735 0.288216 0.080099 0.98879 0.813015 0.625793 0.87505 0.755703 0.190379 0.85185 0.43269 0.578725 0.724359 0.524631 0.139522 0.395029 0.805826 0.322153 0. [...]
+0.604536 0.340215 0.65271 0.99239 0.401002 0.848948 0.39476 0.949481 0.58376 0.171112 0.382228 0.757454 0.829193 0.236376 0.754098 0.335169 0.899728 0.432588 0.772776 0.109934 0.007232 0.404387 0.317007 0.827831 0.455496 0.246313 0.215704 0.414221 0.449493 0.897558 0.683043 0.784933 0.322484 0.467343 0.985434 0.602237 0.418623 0.829147 0.996089 0.079494 0.178806 0.512091 0.783887 0.34273 0.54338 0.780486 0.085491 0.463475 0.456766 0.018171 0.104024 0.86601 0.213179 0.249792 0.632711 0.27 [...]
+0.076244 0.910858 0.041737 0.736696 0.071391 0.259591 0.05572 0.756073 0.208955 0.387971 0.635461 0.0579 0.948182 0.473631 0.410135 0.955588 0.988995 0.454045 0.544234 0.321778 0.060644 0.095339 0.284655 0.401972 0.901841 0.146078 0.792447 0.183231 0.92383 0.848488 0.454807 0.992199 0.656617 0.837418 0.033974 0.589714 0.299275 0.371069 0.447622 0.59836 0.823077 0.45002 0.451745 0.999566 0.858072 0.077025 0.939675 0.862295 0.201895 0.755636 0.525588 0.898335 0.458747 0.661514 0.163296 0.1 [...]
+0.931134 0.187577 0.839606 0.918881 0.633658 0.388887 0.467498 0.084295 0.793621 0.204212 0.243098 0.150143 0.119875 0.287315 0.883595 0.880067 0.663616 0.664577 0.021017 0.962753 0.911068 0.47082 0.707079 0.192426 0.762501 0.09621 0.146633 0.332685 0.324249 0.335047 0.806949 0.194908 0.085261 0.132206 0.008163 0.787047 0.540174 0.354686 0.592418 0.403086 0.078286 0.663004 0.571839 0.40327 0.19934 0.659046 0.063401 0.207764 0.748566 0.507683 0.641134 0.72422 0.614891 0.671474 0.858129 0. [...]
+0.324315 0.923233 0.581886 0.966705 0.249709 0.33083 0.274492 0.717202 0.000598 0.963444 0.64549 0.577689 0.377157 0.73246 0.93047 0.399004 0.695214 0.516129 0.199354 0.114128 0.212366 0.46392 0.20109 0.797193 0.039232 0.843348 0.937683 0.433329 0.971881 0.237649 0.820388 0.979026 0.742171 0.59518 0.090218 0.126735 0.381598 0.642877 0.973535 0.094336 0.243206 0.858303 0.573459 0.254095 0.371072 0.029744 0.264504 0.025286 0.273525 0.148373 0.989457 0.356853 0.669629 0.42576 0.587517 0.428 [...]
+0.540879 0.791691 0.130686 0.960053 0.618283 0.381935 0.981609 0.263311 0.235322 0.766513 0.78701 0.473618 0.681177 0.386751 0.199414 0.801602 0.304711 0.058612 0.707397 0.138345 0.315489 0.197021 0.557387 0.052699 0.237261 0.943949 0.663247 0.239759 0.4285 0.888166 0.229623 0.583728 0.218532 0.890754 0.297205 0.775206 0.201366 0.49453 0.112579 0.090575 0.242964 0.678539 0.973817 0.546368 0.955579 0.377756 0.846818 0.355211 0.221616 0.033431 0.066433 0.419114 0.97866 0.489105 0.558306 0. [...]
+0.935896 0.346416 0.450991 0.975841 0.623122 0.872464 0.386023 0.420137 0.092706 0.104024 0.629988 0.881784 0.806402 0.929624 0.987628 0.780876 0.502071 0.927472 0.463805 0.335876 0.153742 0.522018 0.110318 0.522336 0.091687 0.001419 0.213114 0.101054 0.329971 0.932697 0.042255 0.64576 0.601362 0.650104 0.558663 0.668298 0.882717 0.715524 0.075823 0.490369 0.61732 0.834286 0.324571 0.643844 0.666394 0.160613 0.580928 0.970176 0.113243 0.730904 0.485938 0.351786 0.123586 0.297848 0.632434 [...]
+0.184667 0.102951 0.950581 0.281021 0.895952 0.507477 0.722025 0.082459 0.519302 0.978487 0.090106 0.138272 0.895586 0.305772 0.531619 0.120698 0.979248 0.750899 0.920057 0.712396 0.936436 0.398835 0.970136 0.826054 0.759822 0.243538 0.98719 0.596336 0.372209 0.450478 0.915832 0.025077 0.759505 0.325852 0.659775 0.727378 0.390655 0.010263 0.131582 0.344869 0.133182 0.205226 0.839707 0.494628 0.97783 0.779214 0.507858 0.935224 0.983864 0.380397 0.533971 0.970638 0.38293 0.647529 0.28583 0 [...]
+0.782032 0.899795 0.398976 0.037733 0.598079 0.84009 0.850763 0.847094 0.557512 0.112111 0.429762 0.90463 0.059367 0.87185 0.594384 0.973522 0.536629 0.192092 0.686318 0.427702 0.310605 0.155846 0.019604 0.324391 0.401201 0.065077 0.081595 0.110543 0.51631 0.690568 0.905645 0.311605 0.34552 0.178085 0.206723 0.481004 0.269094 0.804871 0.2271 0.714026 0.754737 0.187865 0.516143 0.775074 0.17026 0.051189 0.893252 0.54513 0.652589 0.131669 0.416423 0.164762 0.240468 0.247324 0.674154 0.8076 [...]
+0.758251 0.433213 0.324393 0.576423 0.20019 0.677987 0.650276 0.134991 0.355717 0.603058 0.776847 0.623301 0.132954 0.352181 0.01407 0.696665 0.371424 0.348122 0.914209 0.435323 0.549209 0.08655 0.815151 0.970544 0.923793 0.0676 0.140767 0.891356 0.865774 0.750724 0.179103 0.14681 0.385229 0.405824 0.176991 0.503439 0.543341 0.236652 0.241558 0.101327 0.974017 0.661887 0.021619 0.695524 0.772227 0.68505 0.272565 0.520635 0.797002 0.175026 0.730707 0.42639 0.489237 0.667611 0.4876 0.34674 [...]
+0.991335 0.219862 0.531379 0.40214 0.986694 0.776617 0.404153 0.09346 0.191267 0.456603 0.701572 0.894345 0.443968 0.213207 0.346408 0.741509 0.153396 0.82511 0.467801 0.681935 0.705045 0.650959 0.944304 0.713625 0.174805 0.6911 0.572685 0.174432 0.604548 0.446421 0.918545 0.967215 0.989134 0.250232 0.374068 0.757846 0.41977 0.030979 0.844597 0.856994 0.464563 0.50807 0.982504 0.038358 0.374593 0.563668 0.030954 0.243698 0.34257 0.478426 0.352506 0.224261 0.844582 0.291244 0.918879 0.082 [...]
+0.376744 0.232634 0.12821 0.570398 0.311596 0.604366 0.019968 0.939592 0.022657 0.491696 0.42938 0.642635 0.144686 0.95108 0.817776 0.146818 0.277995 0.676631 0.454478 0.225158 0.654316 0.46364 0.483281 0.780805 0.167839 0.411044 0.48209 0.25844 0.184941 0.845862 0.836101 0.518393 0.137027 0.191738 0.346802 0.189328 0.107116 0.705985 0.836277 0.522418 0.522514 0.319051 0.440003 0.628254 0.293623 0.416331 0.8031 0.16835 0.795749 0.253246 0.352117 0.250122 0.322666 0.496059 0.893182 0.3115 [...]
+0.879443 0.592889 0.638527 0.797888 0.17874 0.956413 0.5087 0.545855 0.92709 0.364142 0.049804 0.634407 0.241229 0.425069 0.39487 0.57334 0.629856 0.076926 0.600424 0.486044 0.113719 0.940705 0.9532 0.697629 0.970288 0.699761 0.708118 0.3606 0.629892 0.211 0.681313 0.365673 0.804298 0.41353 0.427206 0.270973 0.060171 0.406757 0.631878 0.562066 0.676361 0.589694 0.755562 0.025846 0.166369 0.143895 0.192759 0.42757 0.500023 0.279654 0.19653 0.910794 0.493731 0.765583 0.92442 0.220936 0.365 [...]
+0.479409 0.029766 0.108195 0.171548 0.593317 0.546408 0.419144 0.272873 0.662883 0.143496 0.541172 0.330549 0.58222 0.466649 0.51701 0.885866 0.931451 0.33545 0.170966 0.549694 0.898264 0.851613 0.086277 0.590126 0.353725 0.896406 0.203044 0.614324 0.679624 0.449275 0.216761 0.924212 0.66523 0.010868 0.836691 0.611473 0.048489 0.48403 0.616092 0.424245 0.582305 0.381528 0.542563 0.873677 0.936173 0.324607 0.925997 0.90445 0.410578 0.042439 0.568934 0.411964 0.40143 0.420364 0.144441 0.06 [...]
+0.636816 0.641583 0.863727 0.736587 0.822094 0.345707 0.326016 0.244683 0.929886 0.031982 0.891959 0.66828 0.455897 0.769675 0.091605 0.169897 0.631171 0.854856 0.296785 0.872508 0.106634 0.092893 0.758513 0.867348 0.808788 0.89182 0.442636 0.631258 0.007785 0.464761 0.35567 0.858302 0.097177 0.839974 0.687783 0.77747 0.18904 0.00619 0.792815 0.517236 0.552489 0.85784 0.346185 0.451216 0.098239 0.327506 0.482292 0.405521 0.940943 0.402154 0.642788 0.403044 0.187707 0.234452 0.076648 0.06 [...]
+0.247968 0.402353 0.504197 0.46182 0.283003 0.837007 0.128281 0.655946 0.831648 0.485416 0.588387 0.006593 0.627418 0.137434 0.295431 0.315605 0.518082 0.723235 0.300837 0.570935 0.97582 0.680081 0.023989 0.016655 0.870367 0.923655 0.852729 0.690101 0.948921 0.340932 0.003871 0.442222 0.325506 0.401044 0.938319 0.67175 0.383687 0.503626 0.705983 0.441255 0.560467 0.60706 0.525639 0.182672 0.901502 0.720553 0.047373 0.242752 0.905485 0.054383 0.470256 0.619273 0.054129 0.952029 0.001897 0 [...]
+0.467366 0.397965 0.99665 0.391928 0.774281 0.362701 0.4984 0.134551 0.510099 0.992556 0.29621 0.077036 0.628212 0.4853 0.935016 0.418027 0.135257 0.139884 0.447541 0.571389 0.049587 0.808069 0.722013 0.885218 0.701305 0.880666 0.859784 0.110284 0.704853 0.220686 0.727837 0.496146 0.902978 0.530261 0.651957 0.097759 0.087529 0.071306 0.822512 0.084394 0.879397 0.133355 0.861563 0.233277 0.075542 0.477979 0.416922 0.130476 0.615469 0.142584 0.171588 0.980375 0.672171 0.157304 0.730835 0.4 [...]
+0.298075 0.983485 0.007439 0.108438 0.50749 0.749068 0.171347 0.681394 0.767938 0.505541 0.364708 0.997688 0.764344 0.68615 0.910583 0.312356 0.798823 0.470958 0.905698 0.917715 0.061164 0.034323 0.461178 0.340824 0.99549 0.278604 0.177795 0.159561 0.662047 0.010934 0.353978 0.201972 0.326549 0.994858 0.165207 0.63068 0.320093 0.519605 0.103294 0.87822 0.101264 0.342626 0.246636 0.539703 0.61665 0.003552 0.613418 0.42992 0.984666 0.913096 0.130272 0.292175 0.92232 0.408314 0.949853 0.702 [...]
+0.541159 0.756117 0.745593 0.720525 0.89903 0.442174 0.825572 0.752345 0.161229 0.290627 0.104095 0.851859 0.266028 0.036648 0.412983 0.108138 0.572702 0.95632 0.137609 0.283232 0.232837 0.624603 0.502652 0.238344 0.011793 0.488253 0.745027 0.292804 0.152205 0.284759 0.963185 0.004058 0.63689 0.663846 0.066703 0.864956 0.515837 0.043896 0.781596 0.453058 0.648373 0.015881 0.049089 0.672477 0.47582 0.6196 0.680855 0.866414 0.62753 0.380047 0.808437 0.748223 0.839588 0.001358 0.325781 0.41 [...]
+0.034203 0.484856 0.365162 0.024638 0.985037 0.244009 0.620372 0.049151 0.402548 0.788448 0.325724 0.231278 0.389693 0.096777 0.21785 0.867731 0.617712 0.903174 0.252322 0.579756 0.936431 0.664822 0.166113 0.606512 0.89162 0.829459 0.101807 0.73443 0.936942 0.031928 0.93322 0.398484 0.612667 0.272639 0.024447 0.818294 0.782745 0.753289 0.673195 0.815536 0.615559 0.819307 0.796737 0.079504 0.046509 0.784057 0.515562 0.818421 0.819987 0.283367 0.688013 0.076043 0.45518 0.89733 0.862634 0.7 [...]
+0.129935 0.915426 0.145778 0.884682 0.067449 0.226963 0.446948 0.898514 0.988815 0.039533 0.926295 0.319684 0.074253 0.476176 0.12256 0.875071 0.720463 0.193355 0.552831 0.055662 0.154569 0.334289 0.46496 0.074083 0.981223 0.934049 0.974426 0.453919 0.44549 0.444672 0.328877 0.483653 0.962945 0.805086 0.549778 0.102679 0.551285 0.715452 0.304806 0.146269 0.206215 0.433026 0.996976 0.384416 0.787968 0.876885 0.823715 0.328459 0.509776 0.843982 0.680705 0.709593 0.623799 0.877332 0.029341  [...]
+0.021677 0.356029 0.13857 0.362988 0.118101 0.613778 0.370243 0.737613 0.870496 0.678889 0.387554 0.804801 0.046079 0.371861 0.574349 0.020532 0.477401 0.75759 0.945035 0.618031 0.787749 0.451171 0.091915 0.555959 0.324277 0.944805 0.210144 0.430524 0.000326 0.742287 0.651109 0.170725 0.000701 0.757307 0.328735 0.06557 0.830121 0.992292 0.27773 0.591966 0.286982 0.935886 0.472435 0.856999 0.734698 0.64066 0.619251 0.138537 0.818913 0.28984 0.857808 0.892796 0.674702 0.16694 0.666021 0.93 [...]
+0.740394 0.472935 0.370929 0.700327 0.000176 0.006296 0.464389 0.57969 0.616767 0.128228 0.701583 0.786184 0.317564 0.5853 0.639605 0.919691 0.859857 0.647283 0.676515 0.420508 0.982661 0.809263 0.181132 0.090262 0.249636 0.837697 0.213114 0.071444 0.977426 0.08403 0.366987 0.06134 0.464363 0.889189 0.883012 0.761757 0.720606 0.692298 0.828687 0.232327 0.002229 0.95389 0.358441 0.189706 0.709513 0.675013 0.23931 0.003584 0.925553 0.021145 0.250068 0.425187 0.844458 0.574788 0.85881 0.613 [...]
+0.506884 0.0184 0.01557 0.490875 0.585245 0.66702 0.129257 0.31666 0.508371 0.522701 0.920091 0.878928 0.115342 0.448557 0.892986 0.548605 0.514195 0.743097 0.895817 0.588693 0.366862 0.694278 0.596077 0.763099 0.797746 0.93962 0.510825 0.055967 0.522508 0.100553 0.189015 0.411075 0.457695 0.837994 0.464304 0.808721 0.312133 0.89303 0.259851 0.095045 0.950577 0.995434 0.127415 0.742577 0.578245 0.792366 0.204333 0.449129 0.601698 0.853282 0.54249 0.371008 0.022031 0.526291 0.26575 0.8292 [...]
+0.879499 0.556651 0.25899 0.691254 0.877289 0.417145 0.411791 0.094214 0.158333 0.807837 0.617642 0.590636 0.209683 0.933319 0.459819 0.491676 0.462503 0.797677 0.372036 0.364164 0.854122 0.11919 0.645396 0.071135 0.393256 0.612656 0.89614 0.253405 0.996996 0.36542 0.764351 0.41036 0.230426 0.203948 0.59299 0.18575 0.748746 0.02498 0.596225 0.648393 0.125713 0.00535 0.315331 0.813905 0.258549 0.29633 0.01386 0.4288 0.480426 0.023753 0.140777 0.992639 0.140211 0.558176 0.477325 0.683813 0 [...]
+0.002332 0.479649 0.316425 0.785352 0.107489 0.370543 0.02668 0.501869 0.498437 0.61038 0.187884 0.294741 0.707143 0.636565 0.137936 0.86478 0.18201 0.199627 0.517852 0.812437 0.554238 0.752681 0.789754 0.437663 0.344098 0.055346 0.704423 0.327946 0.741235 0.154979 0.194626 0.57607 0.154884 0.487278 0.679801 0.394748 0.62887 0.509574 0.294823 0.437226 0.342793 0.267914 0.134742 0.780963 0.766963 0.178472 0.936645 0.8147 0.369313 0.78343 0.306803 0.140977 0.466691 0.981577 0.204819 0.5813 [...]
+0.1057 0.18178 0.498935 0.479836 0.348895 0.22186 0.838903 0.566189 0.927205 0.38376 0.726208 0.135862 0.697716 0.527718 0.121441 0.975595 0.998587 0.14182 0.487542 0.479799 0.516738 0.132723 0.661203 0.944913 0.918751 0.4776 0.96795 0.643212 0.877172 0.632425 0.400331 0.443154 0.13653 0.879778 0.921224 0.71701 0.681052 0.640165 0.283599 0.604239 0.130419 0.111841 0.206118 0.156774 0.151601 0.740246 0.674477 0.988233 0.04581 0.619083 0.92451 0.556411 0.018171 0.124749 0.654361 0.458393 0 [...]
+0.04276 0.459964 0.25436 0.158507 0.33903 0.94035 0.253114 0.358056 0.227326 0.172354 0.238319 0.305704 0.072685 0.133293 0.577296 0.449214 0.635686 0.786765 0.278142 0.891208 0.926116 0.184438 0.703075 0.615218 0.721351 0.288053 0.350643 0.25514 0.194166 0.664557 0.308012 0.730626 0.822834 0.956815 0.032813 0.152615 0.455058 0.745234 0.126125 0.304096 0.533954 0.328751 0.330839 0.674485 0.47044 0.068586 0.270436 0.51183 0.572676 0.562954 0.053138 0.101905 0.773625 0.203996 0.695 0.28694 [...]
+0.181972 0.841254 0.327125 0.661617 0.478268 0.85294 0.52043 0.517887 0.934829 0.167487 0.661995 0.761818 0.33981 0.205175 0.564305 0.750769 0.070009 0.675261 0.176763 0.015669 0.048145 0.002005 0.732599 0.367225 0.980062 0.719647 0.715728 0.906278 0.362313 0.670758 0.233293 0.923084 0.087334 0.932455 0.395567 0.665038 0.103355 0.320485 0.063476 0.044444 0.74358 0.635813 0.319815 0.763481 0.532128 0.207325 0.796121 0.556119 0.428734 0.455595 0.603823 0.42401 0.853683 0.16187 0.020195 0.0 [...]
+0.701829 0.712599 0.273724 0.599672 0.780353 0.030378 0.884234 0.463988 0.576675 0.53879 0.529744 0.670413 0.70301 0.41685 0.404311 0.736019 0.363771 0.874387 0.908115 0.412206 0.629593 0.286221 0.971167 0.571986 0.243411 0.01355 0.053569 0.586411 0.692292 0.818975 0.24606 0.460226 0.410849 0.41749 0.317095 0.944237 0.141289 0.488756 0.980817 0.077474 0.262122 0.377331 0.659346 0.765696 0.921039 0.171832 0.787163 0.132512 0.534844 0.031031 0.03047 0.23635 0.601651 0.435981 0.073547 0.633 [...]
+0.494637 0.828697 0.807316 0.837921 0.165668 0.436361 0.913315 0.119676 0.054959 0.240192 0.32567 0.192492 0.744931 0.336434 0.104538 0.574966 0.805262 0.30772 0.594896 0.909756 0.726656 0.965799 0.443879 0.626578 0.396938 0.520865 0.381543 0.107141 0.923602 0.043363 0.133736 0.873665 0.29775 0.118378 0.478968 0.494317 0.877016 0.522123 0.285339 0.015214 0.12825 0.530216 0.704561 0.725064 0.912709 0.85613 0.927762 0.848601 0.384327 0.332282 0.984106 0.221285 0.47727 0.00693 0.795807 0.26 [...]
+0.155609 0.453981 0.22866 0.604428 0.133991 0.590973 0.66291 0.643114 0.401142 0.456501 0.038447 0.629844 0.319192 0.528157 0.404918 0.951923 0.359276 0.8871 0.77691 0.542223 0.540625 0.420934 0.699465 0.697025 0.816673 0.923124 0.956284 0.872244 0.829836 0.267379 0.61334 0.497395 0.939913 0.37618 0.81562 0.14899 0.33478 0.480652 0.879345 0.508851 0.660439 0.642362 0.389665 0.353877 0.063993 0.250415 0.701669 0.357289 0.388253 0.640447 0.265764 0.757966 0.904491 0.010449 0.110438 0.10728 [...]
+0.587865 0.365516 0.765856 0.83217 0.904597 0.130434 0.552914 0.194715 0.825453 0.430911 0.180599 0.72425 0.182908 0.246582 0.862385 0.637492 0.370135 0.869244 0.545932 0.279804 0.959112 0.855288 0.919894 0.407311 0.04528 0.087485 0.448935 0.87701 0.488855 0.053187 0.021767 0.013914 0.503669 0.430301 0.930733 0.592656 0.069168 0.818315 0.088665 0.186991 0.003142 0.360675 0.002557 0.727491 0.637399 0.589133 0.46143 0.597212 0.529466 0.451725 0.361802 0.72021 0.54035 0.036161 0.499185 0.24 [...]
+0.01382 0.967528 0.418985 0.839066 0.671517 0.470433 0.922494 0.659771 0.476158 0.936421 0.755125 0.646416 0.915212 0.846124 0.951771 0.492154 0.532019 0.594354 0.887791 0.753951 0.844578 0.406019 0.538394 0.434794 0.264991 0.71604 0.666265 0.427778 0.207168 0.259169 0.163059 0.160042 0.084505 0.157407 0.89889 0.471554 0.151953 0.134748 0.197283 0.077429 0.981071 0.214603 0.328558 0.819655 0.162189 0.476029 0.406212 0.688506 0.41551 0.958828 0.167976 0.676344 0.026184 0.14346 0.154324 0. [...]
+0.98235 0.181234 0.430712 0.928272 0.305913 0.117865 0.247702 0.410045 0.414968 0.429006 0.676074 0.086201 0.85272 0.086384 0.569736 0.23783 0.138349 0.214345 0.252258 0.553926 0.381308 0.074408 0.225649 0.50817 0.536738 0.763111 0.351363 0.08169 0.397339 0.639128 0.241211 0.213022 0.549961 0.333717 0.140158 0.364967 0.217026 0.950009 0.649001 0.762745 0.490451 0.281804 0.264828 0.288181 0.553791 0.976347 0.385748 0.850503 0.752621 0.841345 0.084757 0.789119 0.189915 0.800499 0.72621 0.1 [...]
+0.966591 0.934192 0.06935 0.926682 0.388839 0.347452 0.554368 0.07959 0.907525 0.033048 0.915692 0.741502 0.302471 0.639878 0.299447 0.324217 0.872562 0.952014 0.193191 0.841216 0.58543 0.324398 0.583682 0.503666 0.768181 0.18017 0.290735 0.101652 0.97779 0.108963 0.378361 0.293281 0.84184 0.602986 0.150241 0.653195 0.827464 0.149207 0.806561 0.318255 0.903182 0.828645 0.880468 0.226704 0.447078 0.641785 0.148535 0.163641 0.0551 0.927133 0.474856 0.76585 0.760026 0.215454 0.095234 0.1921 [...]
+0.940264 0.429483 0.642319 0.432033 0.34068 0.019886 0.451167 0.560807 0.74763 0.68672 0.726979 0.565052 0.545903 0.813974 0.370468 0.555807 0.541074 0.099825 0.165855 0.192672 0.50025 0.216798 0.351596 0.818853 0.631152 0.905167 0.791559 0.851352 0.630166 0.77369 0.037427 0.705506 0.397807 0.464595 0.748452 0.687976 0.325808 0.706962 0.024498 0.390904 0.41807 0.865139 0.283712 0.154254 0.034874 0.2555 0.243565 0.725288 0.264007 0.43409 0.13977 0.098328 0.620952 0.314768 0.952318 0.98089 [...]
+0.370529 0.487048 0.833169 0.813939 0.650275 0.489459 0.182819 0.295363 0.59843 0.465842 0.268551 0.268577 0.175413 0.115643 0.548813 0.250649 0.816251 0.190578 0.350776 0.052562 0.487157 0.837569 0.019444 0.023307 0.302626 0.392462 0.875982 0.975178 0.021202 0.403881 0.621869 0.485837 0.142782 0.721806 0.872937 0.006349 0.148321 0.241009 0.316744 0.510907 0.589238 0.426102 0.908816 0.455229 0.955588 0.743876 0.001755 0.733016 0.027869 0.247337 0.589751 0.168797 0.145507 0.921127 0.20430 [...]
+0.582886 0.155664 0.92014 0.690707 0.466054 0.710321 0.696563 0.108556 0.107064 0.521271 0.550377 0.353583 0.139246 0.319322 0.627247 0.903366 0.741477 0.505701 0.373804 0.847248 0.963885 0.267805 0.090237 0.060197 0.602767 0.334895 0.585063 0.802237 0.99938 0.566306 0.748875 0.016679 0.897556 0.214789 0.026553 0.075995 0.196966 0.647724 0.767299 0.733938 0.10549 0.86221 0.208495 0.813412 0.227269 0.547628 0.477579 0.186048 0.191179 0.416141 0.530188 0.175692 0.85316 0.927059 0.26074 0.3 [...]
+0.78004 0.06398 0.647603 0.108664 0.492374 0.879929 0.367909 0.461389 0.409068 0.0586 0.200623 0.844011 0.915062 0.387588 0.655601 0.619025 0.06678 0.807366 0.40283 0.141435 0.735536 0.109994 0.815457 0.557588 0.339908 0.595921 0.175465 0.268203 0.004094 0.507159 0.886677 0.521009 0.094824 0.019185 0.491356 0.404908 0.458694 0.850361 0.526226 0.916332 0.932549 0.148968 0.891151 0.661913 0.885534 0.862563 0.059349 0.87509 0.66722 0.358587 0.128226 0.469505 0.236463 0.95313 0.203177 0.4156 [...]
+0.392286 0.301881 0.550336 0.905508 0.526695 0.807019 0.430555 0.723492 0.496459 0.241425 0.547886 0.978183 0.556333 0.703238 0.67189 0.721631 0.728766 0.581694 0.653759 0.404588 0.483788 0.253821 0.120393 0.117492 0.428927 0.230637 0.095496 0.712984 0.944024 0.064792 0.184218 0.068686 0.093198 0.995289 0.517639 0.839752 0.003397 0.416714 0.113361 0.2901 0.167165 0.098671 0.349454 0.133288 0.959519 0.568974 0.416515 0.034676 0.164551 0.839913 0.561024 0.394279 0.017668 0.564785 0.700567  [...]
+0.052855 0.048325 0.804606 0.033854 0.6085 0.527151 0.952764 0.850998 0.260258 0.367158 0.909684 0.502379 0.072358 0.354238 0.209494 0.741821 0.29125 0.197362 0.777544 0.804092 0.212993 0.674002 0.243791 0.570816 0.471431 0.444736 0.917392 0.42178 0.154837 0.777725 0.948245 0.438808 0.252158 0.025358 0.162816 0.836446 0.529836 0.473648 0.061622 0.593136 0.344135 0.817587 0.809096 0.046904 0.975094 0.345181 0.469223 0.366644 0.76841 0.589518 0.934555 0.215691 0.10308 0.024781 0.476539 0.9 [...]
+0.964749 0.805909 0.567125 0.564263 0.422154 0.928112 0.993501 0.687269 0.579478 0.408065 0.554091 0.455447 0.528949 0.118364 0.880781 0.406084 0.013716 0.091565 0.922465 0.586005 0.494868 0.429887 0.497251 0.287042 0.983311 0.747476 0.300866 0.32711 0.652964 0.748823 0.830518 0.903024 0.455329 0.548597 0.922371 0.127225 0.254063 0.880357 0.280811 0.901408 0.121547 0.960646 0.782033 0.618916 0.229271 0.283822 0.54718 0.593596 0.997654 0.127409 0.13216 0.119511 0.412417 0.734365 0.561849  [...]
+0.386711 0.110064 0.314092 0.675962 0.145729 0.250639 0.605461 0.188208 0.275826 0.932572 0.982285 0.339777 0.645643 0.547642 0.388455 0.973541 0.969914 0.250159 0.578179 0.653157 0.208295 0.880926 0.20764 0.596691 0.107005 0.943763 0.669372 0.661536 0.539858 0.885029 0.779353 0.918723 0.972915 0.320123 0.492206 0.12097 0.289535 0.91487 0.611585 0.588197 0.581649 0.893392 0.191619 0.323793 0.208735 0.173764 0.054212 0.065208 0.862001 0.96935 0.882902 0.464281 0.548763 0.898407 0.849037 0 [...]
+0.195557 0.88163 0.920308 0.639939 0.826953 0.340605 0.180015 0.773808 0.85731 0.803047 0.747903 0.167785 0.914368 0.501665 0.592836 0.466978 0.035628 0.772544 0.471525 0.427672 0.795118 0.158088 0.740826 0.402013 0.346781 0.621801 0.059476 0.717602 0.901731 0.76462 0.458785 0.930707 0.307343 0.325183 0.605407 0.367269 0.38358 0.390289 0.728675 0.193091 0.40321 0.145762 0.84342 0.907609 0.593624 0.008616 0.707181 0.532017 0.509625 0.823247 0.812245 0.185697 0.51454 0.727828 0.029099 0.96 [...]
+0.20902 0.96767 0.04539 0.170133 0.60203 0.824372 0.121712 0.978857 0.99931 0.570635 0.144981 0.628137 0.489272 0.954284 0.19739 0.30881 0.751525 0.848669 0.686419 0.096799 0.225392 0.117769 0.551517 0.005594 0.951734 0.43253 0.332607 0.557261 0.953914 0.218868 0.341894 0.716774 0.746043 0.338997 0.118571 0.259862 0.711384 0.263554 0.70777 0.378841 0.575296 0.232226 0.194349 0.400025 0.185918 0.40441 0.561253 0.609245 0.111889 0.687165 0.83651 0.530973 0.157998 0.935435 0.439323 0.369245 [...]
+0.777663 0.458436 0.005318 0.006888 0.724606 0.863975 0.546272 0.907362 0.034721 0.569605 0.481282 0.462002 0.667117 0.397319 0.955707 0.462225 0.240978 0.002273 0.72757 0.913031 0.563121 0.812513 0.815382 0.160669 0.514606 0.250725 0.417663 0.537579 0.796233 0.194503 0.366029 0.55487 0.559297 0.695541 0.879375 0.470612 0.863333 0.110557 0.071638 0.233465 0.346308 0.945917 0.476437 0.587955 0.023 0.880843 0.129266 0.90328 0.572676 0.33444 0.544552 0.988671 0.240502 0.208907 0.364146 0.60 [...]
+0.752221 0.087033 0.814022 0.455773 0.563278 0.430244 0.827371 0.93725 0.469712 0.913372 0.2045 0.042746 0.908952 0.911959 0.268941 0.039809 0.925377 0.797304 0.506563 0.442286 0.847732 0.468063 0.875271 0.882743 0.2679 0.897532 0.641637 0.348932 0.118149 0.581571 0.451564 0.746487 0.083696 0.590891 0.457712 0.925519 0.809181 0.617055 0.476428 0.735202 0.800572 0.703512 0.446173 0.881502 0.178311 0.306167 0.982297 0.574477 0.069049 0.609975 0.683942 0.101785 0.791777 0.701903 0.503841 0. [...]
+0.680811 0.441547 0.838755 0.654452 0.616236 0.635005 0.731812 0.584444 0.829686 0.777325 0.680674 0.909074 0.285368 0.973966 0.928712 0.028905 0.92221 0.948445 0.710755 0.424997 0.691137 0.983263 0.815783 0.885661 0.127576 0.713511 0.639405 0.144254 0.754028 0.358144 0.594065 0.731074 0.538976 0.24773 0.427595 0.849809 0.517601 0.815725 0.145071 0.777026 0.541044 0.097922 0.318551 0.663269 0.802961 0.344227 0.493755 0.088774 0.398962 0.352577 0.664156 0.304675 0.406279 0.428332 0.427735 [...]
+0.198634 0.832807 0.738526 0.354313 0.834819 0.425102 0.861264 0.26112 0.945375 0.556549 0.011082 0.833148 0.731812 0.281165 0.190546 0.258314 0.201033 0.371514 0.57521 0.601427 0.647952 0.87938 0.893018 0.96252 0.548293 0.137339 0.23478 0.536008 0.57525 0.647036 0.713449 0.987356 0.124259 0.624965 0.17513 0.916827 0.086306 0.461937 0.087138 0.970152 0.367171 0.045855 0.738424 0.908495 0.399146 0.501924 0.585497 0.825719 0.534167 0.039335 0.269761 0.401525 0.72578 0.810287 0.681677 0.293 [...]
+0.923055 0.580939 0.245538 0.13557 0.995502 0.526075 0.733731 0.172678 0.695639 0.269446 0.412551 0.211561 0.825692 0.460121 0.30138 0.043367 0.017104 0.47133 0.437092 0.666388 0.714892 0.879356 0.403892 0.193543 0.476878 0.824397 0.96122 0.62317 0.724299 0.462276 0.772777 0.74882 0.398087 0.980646 0.084331 0.754768 0.697917 0.357584 0.550112 0.575187 0.785682 0.52526 0.357248 0.226094 0.335473 0.613858 0.350299 0.186877 0.326284 0.747467 0.014332 0.519375 0.605992 0.516736 0.576142 0.29 [...]
+0.337578 0.968009 0.591266 0.013321 0.675567 0.49764 0.52473 0.315267 0.264613 0.130462 0.804983 0.311061 0.687228 0.912254 0.762887 0.757662 0.683574 0.303282 0.271835 0.042656 0.883097 0.404552 0.081754 0.400382 0.720497 0.170839 0.549795 0.165727 0.937373 0.783675 0.688802 0.787879 0.442947 0.656179 0.833663 0.196007 0.807832 0.275152 0.383234 0.447746 0.651061 0.005728 0.057635 0.274996 0.158637 0.00852 0.427011 0.676483 0.139389 0.819318 0.941936 0.067798 0.91564 0.203461 0.483388 0 [...]
+0.140507 0.894398 0.369344 0.959417 0.671621 0.547975 0.990973 0.154396 0.734897 0.234436 0.789574 0.574223 0.164128 0.928897 0.709101 0.74634 0.762549 0.426721 0.996731 0.36272 0.72397 0.147285 0.440453 0.368176 0.427869 0.497046 0.722012 0.300344 0.614637 0.893985 0.865719 0.789048 0.393972 0.198477 0.63568 0.711669 0.358805 0.218825 0.888566 0.507658 0.193583 0.605293 0.341751 0.707084 0.135619 0.674399 0.028728 0.86229 0.400348 0.817897 0.353094 0.923023 0.898916 0.181748 0.409302 0. [...]
+0.984681 0.471488 0.679874 0.421271 0.894038 0.956809 0.345847 0.761234 0.413322 0.136187 0.86975 0.469229 0.243276 0.256928 0.938456 0.124416 0.857414 0.029979 0.197615 0.457782 0.468363 0.079439 0.983491 0.732127 0.824833 0.041795 0.684807 0.838211 0.927584 0.508027 0.754713 0.718912 0.924029 0.475524 0.381052 0.166885 0.638054 0.004818 0.688745 0.192458 0.182997 0.333427 0.744597 0.075679 0.671592 0.380237 0.696583 0.50725 0.648076 0.331053 0.07671 0.304099 0.775799 0.924476 0.827817  [...]
+0.669988 0.484077 0.264714 0.85791 0.939958 0.378684 0.112719 0.943964 0.706103 0.084518 0.923426 0.670283 0.860029 0.293499 0.365439 0.078902 0.017038 0.270908 0.192804 0.083486 0.618723 0.063067 0.507419 0.338218 0.818208 0.468299 0.771796 0.197328 0.064426 0.159492 0.089446 0.654963 0.799592 0.613286 0.885764 0.125781 0.235452 0.708298 0.031411 0.502494 0.930785 0.79705 0.832613 0.116914 0.777152 0.694389 0.444456 0.858229 0.898338 0.658958 0.986262 0.223252 0.49095 0.189603 0.525751  [...]
+0.014777 0.755023 0.238897 0.721764 0.523482 0.136004 0.256584 0.747423 0.112191 0.417551 0.531806 0.608092 0.246372 0.833957 0.600111 0.612047 0.012578 0.543237 0.920214 0.279835 0.868439 0.163836 0.06769 0.637294 0.866153 0.311998 0.960181 0.239739 0.543686 0.971149 0.459578 0.08344 0.633548 0.134821 0.125849 0.318508 0.147873 0.169631 0.237515 0.73289 0.065845 0.562126 0.185116 0.214256 0.64238 0.415734 0.940191 0.444264 0.1716 0.56159 0.672069 0.228612 0.009624 0.222061 0.862325 0.21 [...]
+0.555767 0.621308 0.05284 0.707162 0.46641 0.184726 0.907255 0.745505 0.312743 0.007087 0.404895 0.244348 0.786635 0.31808 0.092235 0.588675 0.885873 0.030885 0.442848 0.555767 0.008235 0.33056 0.223916 0.836522 0.505753 0.265813 0.345087 0.720579 0.899932 0.382171 0.552352 0.709604 0.842147 0.115501 0.620169 0.000534 0.913204 0.489373 0.800341 0.64078 0.682278 0.701735 0.535271 0.660944 0.893054 0.596717 0.031001 0.663413 0.878893 0.830319 0.161871 0.647842 0.71708 0.444166 0.091076 0.0 [...]
+0.422181 0.151969 0.019666 0.252341 0.27559 0.58183 0.5125 0.827748 0.980223 0.198622 0.43149 0.560682 0.025218 0.176883 0.910035 0.877718 0.295098 0.489433 0.835855 0.676551 0.220555 0.271655 0.955992 0.77684 0.125655 0.164521 0.947768 0.117327 0.402917 0.683337 0.154611 0.047574 0.636318 0.040067 0.567677 0.024866 0.157356 0.960456 0.843053 0.076125 0.744762 0.269637 0.270539 0.964518 0.460396 0.70898 0.85584 0.798317 0.218194 0.632623 0.025353 0.798431 0.416997 0.42913 0.395049 0.5107 [...]
+0.257921 0.688747 0.56319 0.245773 0.506727 0.338031 0.198675 0.376224 0.471849 0.423529 0.931983 0.329201 0.869369 0.6922 0.995868 0.494414 0.71324 0.753326 0.497131 0.604579 0.542811 0.277613 0.981989 0.193094 0.611538 0.418581 0.48597 0.577608 0.823295 0.761369 0.367678 0.325208 0.623836 0.91868 0.365637 0.071531 0.097196 0.222555 0.509741 0.636954 0.085125 0.820404 0.584905 0.225813 0.626903 0.362557 0.723659 0.830569 0.546466 0.392761 0.729101 0.31296 0.530078 0.895137 0.571044 0.71 [...]
+0.732724 0.713133 0.353131 0.971157 0.020853 0.152687 0.919292 0.84228 0.670326 0.170791 0.670898 0.944492 0.034408 0.339796 0.098054 0.942392 0.329826 0.864287 0.640923 0.156753 0.183284 0.579239 0.2045 0.636549 0.644428 0.394762 0.093819 0.606834 0.994634 0.479998 0.129527 0.2138 0.879274 0.842167 0.891304 0.42534 0.859419 0.582557 0.413163 0.726083 0.452657 0.381492 0.954173 0.244581 0.415476 0.74845 0.73543 0.483662 0.072519 0.356522 0.811871 0.574677 0.884574 0.717488 0.851602 0.972 [...]
+0.600988 0.77326 0.228522 0.24346 0.067936 0.121263 0.131634 0.524394 0.208768 0.835772 0.139106 0.486145 0.104429 0.201625 0.225201 0.839371 0.488182 0.461772 0.020669 0.85429 0.028452 0.895073 0.827959 0.097003 0.578735 0.262968 0.53055 0.401657 0.400373 0.115186 0.424097 0.398092 0.368848 0.675238 0.209851 0.525336 0.617364 0.962307 0.477756 0.194674 0.822812 0.754426 0.789725 0.795097 0.426962 0.511273 0.947133 0.671419 0.726164 0.656545 0.86533 0.02666 0.156907 0.669645 0.693669 0.0 [...]
+0.952131 0.110612 0.694175 0.373063 0.575379 0.103109 0.293096 0.977861 0.773537 0.915582 0.986563 0.984658 0.919013 0.052922 0.057502 0.496555 0.336491 0.888194 0.569538 0.519367 0.515806 0.80374 0.886711 0.15036 0.327196 0.628576 0.300055 0.4067 0.777868 0.361801 0.435844 0.381586 0.6664 0.159829 0.441093 0.407933 0.854628 0.95369 0.026456 0.548377 0.979984 0.562511 0.518861 0.557093 0.498317 0.850678 0.997158 0.298639 0.849018 0.101528 0.997709 0.972752 0.778873 0.073287 0.830975 0.27 [...]
+0.739923 0.535688 0.564207 0.112194 0.008834 0.430858 0.080808 0.199344 0.781889 0.499351 0.117039 0.30109 0.426553 0.42501 0.29534 0.534931 0.626933 0.973034 0.872461 0.368607 0.954887 0.445601 0.915102 0.077856 0.066187 0.435493 0.719501 0.226107 0.641068 0.995156 0.73674 0.461855 0.855441 0.681858 0.45711 0.478718 0.654428 0.739253 0.280331 0.712681 0.129115 0.673256 0.535043 0.807085 0.351127 0.073793 0.082358 0.717717 0.030073 0.185115 0.135819 0.369027 0.869111 0.799993 0.081647 0. [...]
+0.464546 0.445963 0.443006 0.148446 0.047739 0.149967 0.252398 0.272881 0.107972 0.214966 0.263435 0.887096 0.773987 0.191131 0.489919 0.741437 0.182905 0.972167 0.237482 0.723627 0.238705 0.30337 0.505759 0.930735 0.047446 0.266144 0.551707 0.829362 0.157368 0.940474 0.768231 0.957403 0.182423 0.551814 0.537133 0.896723 0.376841 0.072925 0.484645 0.207119 0.899729 0.723673 0.530157 0.617581 0.88355 0.957494 0.774468 0.45228 0.893055 0.56045 0.185384 0.78358 0.263779 0.381155 0.930444 0. [...]
+0.598848 0.931581 0.398567 0.208387 0.867812 0.357886 0.048227 0.718535 0.564055 0.097712 0.980965 0.707086 0.307034 0.795733 0.011527 0.342181 0.335255 0.061847 0.102117 0.65082 0.238808 0.2299 0.815252 0.747282 0.792294 0.160387 0.935294 0.105726 0.064918 0.78869 0.088292 0.458358 0.229778 0.34665 0.17066 0.685091 0.433467 0.555755 0.576389 0.196027 0.342685 0.512883 0.62313 0.296832 0.285731 0.875535 0.215898 0.2753 0.00539 0.541737 0.024092 0.14335 0.029459 0.336455 0.107129 0.744184 [...]
+0.211719 0.068893 0.410562 0.958507 0.153906 0.36904 0.248872 0.924873 0.368797 0.158803 0.368857 0.574473 0.66027 0.973738 0.024033 0.993845 0.835757 0.787565 0.252577 0.436793 0.373229 0.289163 0.35736 0.49102 0.901812 0.799048 0.114897 0.279405 0.019656 0.286793 0.169181 0.815432 0.165822 0.593113 0.247646 0.333456 0.448495 0.908786 0.377467 0.867072 0.62992 0.243043 0.914671 0.380675 0.226753 0.578779 0.763298 0.00631 0.725683 0.247128 0.99384 0.028146 0.172082 0.588657 0.209859 0.83 [...]
+0.837359 0.271708 0.59411 0.43699 0.837759 0.750414 0.640391 0.06798 0.394867 0.685631 0.89595 0.020233 0.323161 0.985002 0.461902 0.728532 0.943383 0.099183 0.00143 0.593803 0.971395 0.007738 0.424448 0.706058 0.807584 0.915503 0.230735 0.884355 0.906791 0.777227 0.086679 0.10033 0.495774 0.844751 0.440535 0.050041 0.604017 0.183778 0.685546 0.694167 0.331542 0.026742 0.053572 0.814539 0.757326 0.116991 0.390535 0.063701 0.914462 0.564797 0.80584 0.83654 0.461489 0.378687 0.889777 0.426 [...]
+0.614788 0.604668 0.455128 0.000668 0.297983 0.12946 0.360799 0.025773 0.379363 0.927281 0.524615 0.419426 0.337315 0.157962 0.042617 0.013186 0.037537 0.579546 0.709946 0.077756 0.313697 0.804464 0.149079 0.956591 0.088473 0.258383 0.254152 0.487173 0.38918 0.31589 0.037239 0.991176 0.039897 0.848327 0.398474 0.002297 0.312407 0.635789 0.196271 0.912581 0.623703 0.493815 0.871697 0.029916 0.021754 0.636865 0.434152 0.621673 0.797127 0.010662 0.921484 0.512384 0.03109 0.07716 0.528045 0. [...]
+0.755157 0.881989 0.141703 0.531543 0.106067 0.33865 0.687841 0.162955 0.554712 0.220943 0.78588 0.207663 0.471887 0.583111 0.236548 0.376058 0.889938 0.153008 0.099703 0.70652 0.561963 0.474186 0.447982 0.927553 0.743641 0.061975 0.26876 0.976557 0.961217 0.035827 0.322622 0.856592 0.970358 0.235666 0.870238 0.039338 0.492272 0.068397 0.980268 0.864054 0.233571 0.845346 0.486201 0.366576 0.570743 0.084102 0.714075 0.35407 0.727919 0.63166 0.279802 0.711298 0.168221 0.826861 0.430182 0.5 [...]
+0.603717 0.03338 0.841034 0.095051 0.82104 0.45151 0.911813 0.340188 0.66997 0.693745 0.60997 0.981716 0.333034 0.638301 0.012172 0.820709 0.247272 0.785435 0.870016 0.590957 0.718018 0.209869 0.445618 0.432329 0.332895 0.816873 0.483923 0.908667 0.442062 0.627437 0.905935 0.442782 0.035016 0.861641 0.211872 0.779487 0.605279 0.081045 0.869571 0.527702 0.09139 0.589228 0.013159 0.902612 0.675209 0.228191 0.682907 0.855819 0.177238 0.377963 0.645771 0.851711 0.632936 0.472607 0.720585 0.1 [...]
+0.873899 0.68348 0.027931 0.873006 0.178957 0.936876 0.255579 0.595255 0.5023 0.577521 0.382018 0.174814 0.993606 0.13591 0.292095 0.988507 0.913676 0.69467 0.523292 0.275675 0.382737 0.057062 0.060263 0.847682 0.501804 0.675416 0.207187 0.681535 0.310701 0.979246 0.671884 0.752521 0.528942 0.795855 0.43422 0.086059 0.140159 0.64294 0.758743 0.621032 0.968527 0.826897 0.49738 0.058191 0.84031 0.631464 0.93784 0.487462 0.850455 0.403756 0.63775 0.699347 0.872695 0.537257 0.4741 0.009308 0 [...]
+0.435864 0.780313 0.359644 4.2e-05 0.842534 0.936799 0.481393 0.368354 0.460448 0.593669 0.304988 0.587733 0.215324 0.834545 0.547461 0.894412 0.963278 0.397876 0.255398 0.237319 0.884271 0.313677 0.083902 0.39071 0.410507 0.3271 0.317854 0.623022 0.306108 0.617921 0.00055 0.969372 0.014834 0.538313 0.629769 0.810508 0.74584 0.304723 0.093337 0.795192 0.786851 0.713858 0.652606 0.788087 0.250676 0.205633 0.440697 0.894607 0.017722 0.774988 0.713774 0.619231 0.340579 0.220396 0.382724 0.8 [...]
+0.498718 0.478628 0.090865 0.588954 0.544745 0.303298 0.326896 0.966351 0.208672 0.418483 0.294238 0.804003 0.005065 0.194245 0.297008 0.299747 0.935173 0.24209 0.783006 0.38113 0.719408 0.691896 0.335817 0.695859 0.560528 0.953935 0.761446 0.960044 0.321347 0.445824 0.849996 0.835005 0.328907 0.012206 0.941856 0.733171 0.951631 0.640809 0.706391 0.543858 0.798219 0.076772 0.265958 0.51647 0.356255 0.347815 0.866525 0.355563 0.041249 0.251457 0.327753 0.685387 0.31355 0.931299 0.729805 0 [...]
+0.675178 0.267987 0.767235 0.369291 0.190687 0.503567 0.040385 0.526984 0.578282 0.582514 0.142185 0.122456 0.837685 0.393147 0.901641 0.808574 0.790596 0.479725 0.465048 0.361569 0.327802 0.716088 0.277067 0.978061 0.362861 0.260302 0.701016 0.420575 0.292696 0.875992 0.711035 0.801103 0.688002 0.580684 0.845585 0.468176 0.956011 0.413366 0.43437 0.194463 0.935335 0.559476 0.843446 0.280308 0.563294 0.999359 0.859801 0.879823 0.810112 0.128791 0.246171 0.219322 0.922954 0.16917 0.850827 [...]
+0.729517 0.30494 0.663806 0.776473 0.358972 0.337075 0.127522 0.06978 0.377853 0.575673 0.641836 0.871843 0.69715 0.22862 0.979495 0.219615 0.620036 0.262501 0.880864 0.495455 0.709946 0.512073 0.8211 0.959423 0.782749 0.900694 0.210674 0.191656 0.172053 0.688895 0.537858 0.039366 0.76652 0.09872 0.243894 0.509915 0.79258 0.18773 0.05784 0.737586 0.453015 0.582609 0.590925 0.109949 0.6807 0.668802 0.037016 0.220778 0.987882 0.613532 0.160285 0.753121 0.896374 0.601232 0.007594 0.664207 0 [...]
+0.338933 0.617408 0.37276 0.918208 0.772978 0.32324 0.930048 0.821974 0.567667 0.708216 0.613742 0.981596 0.6326 0.621376 0.528606 0.000379 0.093665 0.128224 0.00633 0.630991 0.449065 0.862715 0.46416 0.336113 0.304395 0.553248 0.20678 0.921227 0.680876 0.455417 0.915067 0.400382 0.59073 0.957437 0.877277 0.279377 0.457956 0.506433 0.441886 0.011241 0.233305 0.260462 0.739549 0.616619 0.700278 0.342499 0.784573 0.152287 0.820119 0.331192 0.743166 0.75274 0.16861 0.77118 0.545679 0.860919 [...]
+0.127515 0.89048 0.00274 0.280995 0.529432 0.8044 0.521226 0.715016 0.153599 0.754768 0.285796 0.10419 0.195577 0.104209 0.284061 0.324246 0.523924 0.396332 0.324471 0.625477 0.728744 0.025909 0.563776 0.787475 0.234171 0.795939 0.380636 0.222797 0.158227 0.003938 0.44787 0.517526 0.246828 0.203564 0.866686 0.51951 0.636783 0.904792 0.958201 0.203436 0.224302 0.589832 0.727151 0.958181 0.769311 0.332503 0.867224 0.560049 0.361279 0.428316 0.411616 0.504288 0.898189 0.523104 0.574632 0.17 [...]
+0.721335 0.26994 0.607998 0.378284 0.890234 0.459662 0.802675 0.641797 0.579046 0.571888 0.812908 0.234993 0.947223 0.819157 0.76894 0.549624 0.378968 0.963327 0.323442 0.029199 0.322272 0.364415 0.779017 0.487883 0.209331 0.75763 0.973773 0.138623 0.424741 0.335916 0.162814 0.397722 0.430978 0.485764 0.575151 0.758232 0.204513 0.199411 0.342228 0.957954 0.022189 0.313706 0.144245 0.39583 0.202379 0.27254 0.564044 0.890137 0.115874 0.276811 0.041039 0.358029 0.538151 0.284665 0.869839 0. [...]
+0.29344 0.034482 0.526911 0.050379 0.327881 0.390029 0.080056 0.961875 0.591087 0.938196 0.658202 0.354 0.058028 0.345772 0.857639 0.257065 0.977084 0.937543 0.337466 0.926682 0.410827 0.596854 0.060106 0.221655 0.86492 0.861898 0.470531 0.611265 0.506029 0.725785 0.346813 0.925018 0.276732 0.645908 0.424128 0.693497 0.311644 0.999236 0.967694 0.869606 0.07375 0.436546 0.040266 0.512088 0.880946 0.314343 0.79755 0.059309 0.614515 0.143245 0.16391 0.857501 0.322442 0.986017 0.718185 0.290 [...]
+0.265507 0.309722 0.32533 0.269409 0.732886 0.682338 0.830042 0.494197 0.266336 0.869863 0.938717 0.717349 0.364523 0.236882 0.485085 0.526389 0.568086 0.087243 0.417618 0.626394 0.02197 0.760759 0.142697 0.23354 0.420343 0.091042 0.173687 0.406924 0.735657 0.863498 0.728325 0.15385 0.612434 0.065502 0.984284 0.817116 0.260723 0.087136 0.861316 0.658895 0.104969 0.803181 0.125323 0.594097 0.988917 0.561304 0.485738 0.421693 0.298299 0.996478 0.865868 0.796867 0.231827 0.147233 0.765816 0 [...]
+0.481224 0.765808 0.454641 0.752605 0.942847 0.309941 0.036827 0.549106 0.323589 0.908737 0.797492 0.753672 0.035389 0.480375 0.130818 0.795128 0.712103 0.393462 0.146587 0.394348 0.973361 0.842872 0.408079 0.791285 0.522274 0.60544 0.400882 0.387039 0.635458 0.700368 0.095061 0.647853 0.314701 0.226746 0.431473 0.398213 0.910147 0.646339 0.341626 0.032618 0.236674 0.787772 0.941073 0.996441 0.175795 0.116435 0.287916 0.182003 0.411283 0.521752 0.125956 0.766421 0.487114 0.386541 0.11852 [...]
+0.002727 0.063325 0.13464 0.570618 0.362615 0.470593 0.61597 0.477971 0.551373 0.925132 0.120443 0.310606 0.179212 0.181283 0.043196 0.229232 0.132475 0.185132 0.413193 0.698401 0.837212 0.865974 0.049473 0.445031 0.552609 0.353011 0.719773 0.269676 0.79609 0.861122 0.355334 0.213538 0.845363 0.504661 0.753883 0.093839 0.109659 0.03265 0.037546 0.536209 0.779341 0.299419 0.621729 0.641403 0.338667 0.554726 0.241046 0.564764 0.722989 0.444781 0.783822 0.733784 0.176522 0.866505 0.580093 0 [...]
+0.966052 0.729602 0.905963 0.46105 0.489401 0.459639 0.567599 0.940313 0.818902 0.701543 0.235812 0.460521 0.292141 0.860209 0.93628 0.097334 0.306091 0.461039 0.887883 0.941304 0.603916 0.063578 0.999172 0.217055 0.388254 0.128603 0.240526 0.492801 0.387345 0.305749 0.50508 0.347001 0.820354 0.143006 0.2094 0.570621 0.955126 0.132997 0.700794 0.668082 0.862065 0.4991 0.823673 0.150204 0.849085 0.060999 0.926882 0.875189 0.661949 0.046695 0.868553 0.906924 0.122562 0.360476 0.907506 0.09 [...]
+0.213897 0.459107 0.747649 0.417098 0.867547 0.222331 0.873388 0.447182 0.320042 0.086796 0.289532 0.078917 0.738239 0.5674 0.301815 0.53377 0.989954 0.993092 0.245295 0.384934 0.921744 0.000355 0.946269 0.38674 0.870416 0.522956 0.655318 0.782353 0.796954 0.680622 0.505952 0.341226 0.68276 0.203402 0.683759 0.084515 0.340573 0.087881 0.860813 0.228449 0.104656 0.592875 0.254625 0.32519 0.773563 0.87152 0.12477 0.158812 0.704034 0.898766 0.033677 0.913223 0.598051 0.89871 0.55993 0.57174 [...]
+0.043952 0.518589 0.735389 0.930001 0.930197 0.775103 0.925522 0.821396 0.297852 0.19776 0.629484 0.630372 0.9261 0.43256 0.960102 0.37573 0.536441 0.913999 0.119728 0.871207 0.597611 0.685136 0.009861 0.207361 0.302184 0.339722 0.256891 0.594475 0.154318 0.007298 0.047292 0.502364 0.272177 0.37634 0.472165 0.574615 0.063158 0.684594 0.273836 0.610807 0.673662 0.722131 0.92657 0.432548 0.420855 0.179555 0.895787 0.673978 0.094675 0.147052 0.022475 0.725296 0.297083 0.386238 0.776244 0.02 [...]
+0.498526 0.07448 0.675995 0.891674 0.703539 0.017851 0.195334 0.494839 0.238642 0.376025 0.059317 0.409584 0.259268 0.80501 0.562998 0.249214 0.619863 0.386881 0.812921 0.216827 0.85721 0.692332 0.702307 0.659699 0.047871 0.321574 0.192253 0.194104 0.483718 0.143185 0.968715 0.839725 0.520927 0.795409 0.885268 0.548312 0.431761 0.389709 0.77922 0.592568 0.488589 0.121325 0.374613 0.963822 0.152488 0.155474 0.402831 0.753191 0.509891 0.459636 0.259216 0.840861 0.550227 0.241521 0.724612 0 [...]
+0.969426 0.027943 0.623929 0.013286 0.086233 0.762758 0.946189 0.858645 0.457231 0.276674 0.051168 0.241539 0.120561 0.957712 0.478208 0.727807 0.826018 0.623754 0.55044 0.763435 0.394721 0.11386 0.128234 0.805218 0.295864 0.144252 0.19692 0.448193 0.967562 0.16728 0.907811 0.445937 0.876162 0.100953 0.884638 0.173951 0.656676 0.294159 0.915778 0.974471 0.90725 0.534058 0.676926 0.382981 0.425882 0.319432 0.492819 0.21364 0.593601 0.257467 0.260221 0.228922 0.837307 0.227749 0.815895 0.0 [...]
+0.309922 0.052892 0.281328 0.915372 0.855225 0.78556 0.026515 0.860878 0.774764 0.650031 0.232211 0.689505 0.379233 0.629671 0.855089 0.243152 0.167001 0.181382 0.143796 0.71868 0.111311 0.221018 0.731367 0.009134 0.983472 0.677346 0.563842 0.820677 0.724111 0.829864 0.694139 0.298109 0.52787 0.051473 0.960909 0.959922 0.114098 0.531445 0.030536 0.824806 0.568015 0.464101 0.657756 0.184279 0.365709 0.400518 0.85653 0.981698 0.877112 0.077682 0.049849 0.787082 0.028271 0.12235 0.567176 0. [...]
+0.988303 0.009902 0.970672 0.639836 0.448595 0.145612 0.537314 0.052152 0.783168 0.079839 0.780247 0.35926 0.686483 0.274826 0.692347 0.047792 0.013978 0.180617 0.424387 0.327223 0.010935 0.326927 0.103114 0.312288 0.41063 0.614807 0.160725 0.300585 0.234984 0.517467 0.194428 0.912095 0.870155 0.034636 0.518093 0.52983 0.546864 0.772459 0.074302 0.585081 0.110033 0.243792 0.643275 0.127733 0.184632 0.287984 0.601108 0.164523 0.088834 0.281881 0.130468 0.593539 0.527138 0.288989 0.555615  [...]
+0.024723 0.633062 0.208803 0.562445 0.095671 0.455876 0.246883 0.365314 0.693502 0.04526 0.874665 0.332803 0.941858 0.70757 0.171317 0.14927 0.377684 0.500595 0.494601 0.93032 0.45493 0.15968 0.68076 0.187546 0.925604 0.434372 0.101457 0.512465 0.611819 0.122793 0.602998 0.686229 0.385585 0.602266 0.807878 0.973074 0.302491 0.717063 0.69734 0.59852 0.863107 0.281469 0.72056 0.731142 0.965968 0.023005 0.306746 0.714645 0.101188 0.092001 0.494685 0.750095 0.044764 0.13293 0.404586 0.97405  [...]
+0.567235 0.525636 0.096088 0.748455 0.51802 0.494514 0.038563 0.59787 0.128856 0.66675 0.596717 0.605899 0.315789 0.995672 0.640705 0.560716 0.115819 0.245526 0.719297 0.520605 0.015957 0.237326 0.506678 0.462852 0.218106 0.854173 0.366256 0.489754 0.295458 0.839991 0.030608 0.838003 0.267938 0.568234 0.970121 0.812326 0.542358 0.692472 0.021437 0.656903 0.10656 0.741832 0.359862 0.628924 0.029226 0.126041 0.284252 0.862182 0.298191 0.56659 0.426467 0.005168 0.761992 0.218573 0.892016 0. [...]
+0.292276 0.659202 0.880965 0.13108 0.963297 0.78416 0.176773 0.062114 0.927102 0.414297 0.43779 0.083804 0.815934 0.302293 0.281043 0.265054 0.342279 0.171823 0.225226 0.304937 0.631739 0.485297 0.46734 0.906581 0.493485 0.918584 0.00191 0.122189 0.459905 0.532798 0.441822 0.987772 0.365992 0.092745 0.637185 0.609758 0.38324 0.497097 0.774767 0.097883 0.574499 0.027679 0.643792 0.621559 0.944484 0.365071 0.389897 0.341461 0.126299 0.683043 0.0188 0.635962 0.019527 0.469567 0.780113 0.027 [...]
+0.180645 0.362023 0.98079 0.757377 0.185812 0.741261 0.697021 0.67591 0.600688 0.56572 0.313506 0.92882 0.061186 0.786406 0.508632 0.057876 0.730281 0.462756 0.608754 0.583161 0.177896 0.487355 0.407131 0.782882 0.68022 0.830469 0.421803 0.178612 0.890105 0.247784 0.575567 0.559842 0.610454 0.946984 0.48389 0.77716 0.358629 0.700645 0.45732 0.40222 0.019387 0.529599 0.894447 0.385441 0.935718 0.737595 0.626757 0.566887 0.320948 0.973984 0.57217 0.037497 0.398132 0.173944 0.445204 0.51581 [...]
+0.549184 0.89434 0.189144 0.648958 0.066436 0.760144 0.122257 0.985575 0.208302 0.48594 0.243732 0.99438 0.840047 0.422493 0.477447 0.348848 0.516523 0.39607 0.915039 0.212388 0.03579 0.16322 0.002807 0.7884 0.051078 0.053679 0.518838 0.29187 0.031918 0.08437 0.422377 0.97274 0.124402 0.945396 0.307891 0.019715 0.519398 0.806974 0.243443 0.358563 0.957398 0.242283 0.759049 0.686306 0.027276 0.022145 0.770853 0.496826 0.592867 0.68281 0.611401 0.637761 0.806068 0.508448 0.549365 0.770444  [...]
+0.927282 0.595973 0.714107 0.502014 0.194086 0.480074 0.74023 0.934041 0.700723 0.020556 0.26312 0.46886 0.98885 0.246936 0.767642 0.668079 0.662836 0.796041 0.160862 0.642157 0.19583 0.892715 0.36506 0.726617 0.73488 0.087216 0.579644 0.948817 0.34219 0.063743 0.438037 0.181119 0.690992 0.595952 0.056976 0.521256 0.259857 0.861524 0.665262 0.693363 0.409663 0.11406 0.919887 0.836334 0.030679 0.618435 0.05634 0.412623 0.053136 0.480695 0.866803 0.059993 0.638399 0.512861 0.193189 0.79314 [...]
+0.886375 0.050529 0.957048 0.722854 0.00447 0.747386 0.470221 0.940306 0.07374 0.154919 0.508058 0.203569 0.056967 0.71809 0.756118 0.704362 0.955717 0.10599 0.747373 0.772418 0.335654 0.398997 0.347444 0.852654 0.904753 0.464472 0.859533 0.912617 0.184325 0.259718 0.110002 0.488824 0.26471 0.597734 0.345285 0.880591 0.257282 0.056176 0.443296 0.411452 0.867727 0.562254 0.216614 0.793968 0.807587 0.258223 0.800938 0.649677 0.796862 0.128983 0.531184 0.242956 0.68369 0.725363 0.532464 0.2 [...]
+0.434176 0.601418 0.47635 0.745637 0.49531 0.44764 0.207553 0.237946 0.565637 0.163415 0.449282 0.250126 0.091528 0.60632 0.360171 0.420902 0.311659 0.84708 0.451892 0.162153 0.18186 0.552569 0.118122 0.170901 0.720013 0.497912 0.98171 0.375686 0.159305 0.653552 0.499732 0.199599 0.696811 0.142626 0.357954 0.887297 0.098957 0.782629 0.074211 0.616414 0.260295 0.790766 0.967322 0.119512 0.427014 0.651146 0.191443 0.24214 0.093923 0.022003 0.637504 0.987513 0.508039 0.348618 0.987383 0.432 [...]
+0.392003 0.108838 0.857029 0.975137 0.207465 0.091145 0.654273 0.758363 0.312112 0.68297 0.785969 0.509022 0.029353 0.02098 0.159362 0.574053 0.637482 0.856565 0.510551 0.82681 0.050582 0.20165 0.087611 0.596505 0.096765 0.002963 0.534617 0.689156 0.162635 0.687781 0.91882 0.986738 0.532575 0.257171 0.068875 0.311308 0.953031 0.352992 0.792908 0.508211 0.925868 0.940474 0.960328 0.05884 0.826292 0.219666 0.322076 0.621478 0.607226 0.745899 0.935004 0.284687 0.457192 0.944142 0.326033 0.5 [...]
+0.631813 0.955593 0.706979 0.35127 0.341421 0.750069 0.7944 0.450603 0.122279 0.09032 0.519077 0.39628 0.628031 0.876321 0.943323 0.334442 0.008137 0.339086 0.813768 0.521193 0.014003 0.307196 0.150122 0.10093 0.707017 0.702632 0.812096 0.248349 0.056034 0.730222 0.197307 0.487676 0.051141 0.63025 0.36733 0.587249 0.504972 0.924303 0.776177 0.165977 0.797778 0.578582 0.880819 0.34771 0.401513 0.458061 0.271129 0.932119 0.867585 0.999539 0.320689 0.373929 0.750634 0.312058 0.286307 0.3818 [...]
+0.862277 0.748992 0.011482 0.955584 0.462693 0.1745 0.551121 0.015106 0.74229 0.692221 0.483472 0.77921 0.507532 0.258854 0.759093 0.091273 0.446378 0.622927 0.096065 0.449257 0.33611 0.991128 0.498883 0.398276 0.692834 0.740176 0.351592 0.826993 0.90328 0.275326 0.368131 0.305568 0.398957 0.891271 0.009968 0.630966 0.17181 0.500977 0.496671 0.593455 0.908148 0.661174 0.290462 0.541507 0.766261 0.390527 0.775989 0.560351 0.918758 0.048689 0.012499 0.783599 0.746607 0.636511 0.011193 0.20 [...]
+0.787504 0.169283 0.450464 0.645516 0.880301 0.895499 0.44072 0.584852 0.768097 0.405525 0.866597 0.349586 0.174284 0.296738 0.029412 0.136203 0.07272 0.116641 0.593643 0.013911 0.479474 0.84749 0.070192 0.00439 0.717627 0.172662 0.69225 0.685686 0.143736 0.589083 0.182088 0.160004 0.426463 0.501816 0.480212 0.054101 0.917146 0.41848 0.951281 0.060245 0.84318 0.709772 0.650682 0.781005 0.152944 0.379501 0.866973 0.229054 0.858542 0.734009 0.220477 0.878598 0.864303 0.844303 0.7903 0.5719 [...]
+0.107742 0.035328 0.051342 0.352096 0.394593 0.560855 0.612166 0.270139 0.261092 0.137123 0.782446 0.981797 0.995539 0.567629 0.105519 0.346054 0.712301 0.151723 0.389821 0.834221 0.324187 0.830016 0.961855 0.390906 0.045172 0.135189 0.476933 0.046188 0.085404 0.895809 0.236144 0.345149 0.037825 0.857558 0.235626 0.76004 0.259274 0.511349 0.102887 0.138333 0.389993 0.456858 0.471922 0.813897 0.945183 0.888477 0.053726 0.201063 0.102732 0.778907 0.925296 0.352311 0.149824 0.10703 0.743216 [...]
+0.65129 0.779537 0.08709 0.229269 0.462791 0.419783 0.245247 0.350494 0.588068 0.216763 0.396628 0.428126 0.965661 0.187993 0.422701 0.731946 0.437976 0.568136 0.373742 0.445723 0.904902 0.889657 0.194965 0.721703 0.779091 0.861887 0.720262 0.443538 0.702701 0.224825 0.901822 0.497168 0.522913 0.209602 0.310974 0.991303 0.624627 0.007767 0.962896 0.134457 0.024605 0.812923 0.018326 0.126435 0.811326 0.890683 0.536228 0.799735 0.173692 0.930998 0.248466 0.463422 0.198057 0.865181 0.620289 [...]
+0.747207 0.108917 0.32348 0.720748 0.211441 0.965377 0.816571 0.504292 0.641604 0.298463 0.180983 0.034249 0.929653 0.226269 0.566506 0.45546 0.083058 0.900475 0.41557 0.128857 0.494637 0.986222 0.258841 0.801447 0.758854 0.828494 0.874695 0.696352 0.822146 0.357054 0.01704 0.883939 0.592206 0.75246 0.555429 0.430574 0.666047 0.987605 0.462457 0.740514 0.020628 0.683699 0.939562 0.647914 0.536318 0.18677 0.088853 0.779605 0.604406 0.411128 0.776988 0.631802 0.287403 0.047464 0.783595 0.4 [...]
+0.807972 0.410795 0.618403 0.949829 0.597689 0.537864 0.053693 0.743684 0.454222 0.116913 0.191778 0.666734 0.514016 0.588556 0.226548 0.114545 0.992718 0.394266 0.767758 0.143468 0.77908 0.585436 0.171495 0.829025 0.487776 0.918489 0.180971 0.867104 0.887974 0.568085 0.006962 0.925864 0.497469 0.842117 0.229489 0.526022 0.725465 0.864719 0.306319 0.728588 0.752755 0.341515 0.556873 0.253384 0.573657 0.707999 0.182377 0.47341 0.518885 0.663312 0.644112 0.54502 0.350825 0.677913 0.11523 0 [...]
+0.948542 0.837332 0.699829 0.890985 0.740502 0.471026 0.039148 0.994491 0.331372 0.001275 0.026063 0.424963 0.642782 0.524355 0.638919 0.303751 0.693256 0.048306 0.014931 0.201907 0.175048 0.887073 0.238404 0.630272 0.89373 0.844301 0.225133 0.417291 0.965599 0.53927 0.577093 0.164601 0.833699 0.450272 0.528745 0.268129 0.281131 0.460458 0.633528 0.866305 0.545808 0.698182 0.936728 0.917972 0.0758 0.720413 0.097585 0.420132 0.156887 0.888684 0.675197 0.957689 0.43092 0.943319 0.606856 0. [...]
+0.254199 0.117174 0.869639 0.027648 0.214371 0.13861 0.65722 0.119636 0.470386 0.485616 0.648728 0.256534 0.361358 0.283682 0.600833 0.490921 0.421458 0.054124 0.851971 0.387844 0.682883 0.013762 0.545089 0.068179 0.930551 0.865965 0.118836 0.86433 0.425938 0.070376 0.157305 0.57433 0.42563 0.885002 0.140868 0.07984 0.943545 0.789395 0.665884 0.395263 0.877453 0.656363 0.747095 0.650954 0.767107 0.674898 0.124061 0.774035 0.338896 0.83469 0.007711 0.046338 0.183188 0.259533 0.88509 0.731 [...]
+0.475072 0.495258 0.195687 0.762157 0.148406 0.026849 0.577992 0.570951 0.88964 0.910045 0.980823 0.689226 0.087198 0.070279 0.1165 0.703258 0.464283 0.047487 0.467033 0.265894 0.32933 0.063302 0.630012 0.275931 0.885573 0.509124 0.686111 0.006249 0.355321 0.650524 0.565766 0.138425 0.006337 0.235123 0.885001 0.09729 0.333686 0.325939 0.703411 0.613377 0.275369 0.104151 0.050113 0.171618 0.393228 0.55639 0.137443 0.623577 0.524674 0.317931 0.415376 0.854789 0.59755 0.323819 0.507812 0.81 [...]
+0.085021 0.07843 0.87586 0.887814 0.189796 0.682771 0.062297 0.33969 0.322869 0.164405 0.908965 0.515699 0.600844 0.515351 0.564867 0.917519 0.573899 0.839517 0.915998 0.467817 0.499918 0.896863 0.227344 0.677762 0.943917 0.22443 0.019033 0.247152 0.361415 0.614996 0.631853 0.015313 0.414576 0.21846 0.930731 0.695301 0.79121 0.012123 0.618071 0.383197 0.568661 0.697044 0.425681 0.546346 0.383883 0.603181 0.668809 0.386314 0.448133 0.623773 0.637288 0.261052 0.264011 0.316816 0.773695 0.0 [...]
+0.434429 0.678411 0.190974 0.326167 0.691201 0.467103 0.85116 0.226103 0.413744 0.559884 0.947047 0.470052 0.172284 0.456734 0.405143 0.363412 0.647002 0.287835 0.038354 0.603973 0.649355 0.651045 0.733751 0.0992 0.445489 0.570332 0.043205 0.424765 0.459433 0.447431 0.684977 0.434046 0.414343 0.458758 0.684239 0.266087 0.888857 0.551111 0.146688 0.202203 0.535606 0.554355 0.037577 0.012234 0.302699 0.689539 0.99113 0.239573 0.0641 0.886806 0.466417 0.159864 0.230027 0.561072 0.165742 0.5 [...]
+0.417847 0.013695 0.03549 0.225904 0.196949 0.367212 0.83385 0.18894 0.36276 0.243957 0.467094 0.164264 0.443733 0.008422 0.4622 0.713138 0.870853 0.301417 0.18662 0.393778 0.370884 0.255227 0.005971 0.342882 0.359083 0.807882 0.807946 0.748895 0.288761 0.08244 0.655277 0.756498 0.08896 0.691011 0.93719 0.848425 0.590234 0.187539 0.803171 0.32701 0.499081 0.442931 0.244669 0.517899 0.909304 0.222184 0.276818 0.267448 0.641536 0.804868 0.204107 0.271777 0.204915 0.551584 0.433118 0.745564 [...]
+0.353711 0.349988 0.997847 0.480354 0.010769 0.405491 0.075478 0.999364 0.539698 0.612931 0.229598 0.14914 0.206134 0.977992 0.738411 0.888447 0.232915 0.149665 0.683799 0.94953 0.846888 0.42204 0.903917 0.601924 0.375596 0.829196 0.16821 0.014642 0.634703 0.057574 0.866009 0.715451 0.367072 0.164476 0.609197 0.029919 0.664763 0.022542 0.508901 0.482819 0.104945 0.942066 0.596073 0.587708 0.389402 0.443918 0.742693 0.735813 0.609405 0.019377 0.412107 0.053437 0.036813 0.779655 0.956475 0 [...]
+0.246728 0.019353 0.029658 0.917884 0.842405 0.639964 0.833751 0.592018 0.775832 0.503336 0.024221 0.007867 0.29159 0.935038 0.294411 0.514941 0.956961 0.14112 0.150355 0.792862 0.820882 0.100514 0.942736 0.802597 0.749116 0.818429 0.022114 0.826883 0.755839 0.04916 0.989945 0.372794 0.195893 0.756463 0.353414 0.058609 0.048815 0.10174 0.004356 0.936473 0.084947 0.110393 0.50026 0.323395 0.673647 0.977737 0.088639 0.410751 0.126474 0.049197 0.24344 0.633915 0.735697 0.283992 0.781561 0.0 [...]
+0.208746 0.475685 0.347607 0.488718 0.87247 0.964578 0.965796 0.123191 0.739205 0.519741 0.838181 0.421791 0.878217 0.19544 0.12879 0.961084 0.442362 0.154433 0.92048 0.722798 0.826437 0.931242 0.151618 0.710591 0.312571 0.633933 0.488972 0.664891 0.298461 0.860643 0.417454 0.006341 0.262981 0.166648 0.243673 0.65264 0.466405 0.64069 0.644594 0.571418 0.280151 0.790444 0.759477 0.261142 0.396038 0.951672 0.454475 0.09482 0.114858 0.238695 0.172924 0.870972 0.141311 0.351793 0.898607 0.37 [...]
+0.021626 0.22859 0.865201 0.319917 0.301829 0.182983 0.780476 0.923598 0.314195 0.72392 0.754405 0.734964 0.200904 0.601088 0.380352 0.740276 0.32523 0.019826 0.918316 0.944156 0.335221 0.45754 0.499252 0.914687 0.791001 0.61187 0.05283 0.114681 0.130365 0.15776 0.097004 0.033141 0.359989 0.578628 0.681028 0.657327 0.414413 0.661891 0.340504 0.322742 0.91123 0.596503 0.852828 0.465769 0.411132 0.009372 0.746023 0.126838 0.067204 0.015386 0.15725 0.531235 0.388847 0.125175 0.474401 0.0409 [...]
+0.239819 0.42286 0.822948 0.553762 0.706536 0.311422 0.08123 0.326614 0.560554 0.845287 0.70421 0.087115 0.546098 0.051538 0.566617 0.357261 0.933904 0.036846 0.790104 0.747597 0.094544 0.224771 0.640015 0.136808 0.404554 0.498445 0.155123 0.313435 0.494706 0.044277 0.270988 0.224821 0.646907 0.411285 0.186622 0.185385 0.056376 0.595714 0.028199 0.773058 0.562372 0.795059 0.683026 0.983008 0.167509 0.47991 0.258962 0.467889 0.599947 0.561032 0.4638 0.701901 0.929592 0.382585 0.594547 0.3 [...]
+0.952159 0.251412 0.465223 0.415018 0.881214 0.526978 0.054608 0.37043 0.000174 0.072106 0.399028 0.48599 0.32435 0.652639 0.619346 0.440887 0.202664 0.48695 0.910015 0.712581 0.595379 0.966914 0.951759 0.81129 0.355184 0.010293 0.44815 0.385833 0.589158 0.126715 0.860054 0.619503 0.034865 0.894448 0.601983 0.36451 0.443723 0.669341 0.341845 0.215655 0.929479 0.822874 0.988486 0.398176 0.534695 0.874837 0.44981 0.942686 0.988482 0.960611 0.976079 0.632017 0.278574 0.450773 0.925287 0.429 [...]
+0.551416 0.328838 0.148566 0.636727 0.36368 0.598577 0.229545 0.315667 0.512229 0.158837 0.023772 0.753783 0.651966 0.238918 0.402948 0.395219 0.15108 0.291697 0.660996 0.039909 0.719807 0.482779 0.545752 0.036406 0.004369 0.618862 0.228478 0.583917 0.768903 0.153499 0.141069 0.328905 0.351615 0.677843 0.221313 0.967317 0.446412 0.46631 0.967234 0.122283 0.261126 0.021635 0.286766 0.026797 0.189051 0.344984 0.365106 0.828672 0.102252 0.448962 0.121702 0.734684 0.215202 0.387387 0.122562  [...]
+0.740127 0.358311 0.941752 0.909732 0.565806 0.761247 0.656944 0.014021 0.539563 0.410456 0.297894 0.514322 0.900836 0.30026 0.93486 0.569185 0.905757 0.063555 0.058595 0.631788 0.604132 0.55108 0.300968 0.64258 0.271012 0.408309 0.709563 0.791242 0.172018 0.837228 0.326187 0.852131 0.496192 0.709752 0.053819 0.906016 0.416104 0.637582 0.583573 0.610738 0.130105 0.028288 0.907437 0.616299 0.236057 0.476152 0.596754 0.005683 0.986457 0.739204 0.381199 0.201649 0.977181 0.568985 0.857664 0 [...]
+0.379788 0.930915 0.310591 0.210541 0.114937 0.974205 0.881072 0.398148 0.73908 0.756581 0.484013 0.867148 0.019847 0.979173 0.242047 0.623847 0.599131 0.558797 0.120359 0.340056 0.195537 0.45318 0.601837 0.971198 0.73264 0.541462 0.729157 0.165397 0.116958 0.044028 0.781399 0.965285 0.098684 0.012351 0.74138 0.169475 0.675188 0.876829 0.602444 0.057666 0.951522 0.352523 0.390395 0.679024 0.779931 0.011078 0.557117 0.319439 0.678641 0.476458 0.19721 0.231103 0.92517 0.645178 0.832426 0.6 [...]
+0.661872 0.632531 0.360498 0.906297 0.615463 0.459469 0.43783 0.551122 0.31786 0.618873 0.369921 0.19902 0.323572 0.842559 0.62173 0.616453 0.772369 0.335264 0.84873 0.251193 0.509435 0.340642 0.869963 0.222014 0.69355 0.262148 0.488668 0.598724 0.719941 0.748771 0.093755 0.972906 0.194157 0.33358 0.194179 0.94034 0.415721 0.337279 0.418294 0.501111 0.014417 0.277019 0.454472 0.848804 0.714437 0.174365 0.482633 0.638485 0.446952 0.512925 0.646014 0.943064 0.414584 0.780275 0.728626 0.881 [...]
+0.604029 0.848269 0.017627 0.505245 0.000352 0.894373 0.858993 0.215608 0.752572 0.343225 0.399288 0.567574 0.727237 0.218627 0.692074 0.678089 0.619528 0.455968 0.935882 0.729929 0.286804 0.403013 0.815283 0.255632 0.126126 0.154788 0.267403 0.84476 0.731572 0.633942 0.138207 0.686166 0.69791 0.291561 0.331835 0.313226 0.760501 0.794133 0.9743 0.666307 0.807593 0.955688 0.983668 0.731833 0.264717 0.892454 0.814718 0.608143 0.205253 0.950267 0.367095 0.068812 0.871864 0.858928 0.794226 0 [...]
+0.003936 0.467161 0.292965 0.337879 0.970621 0.467689 0.66887 0.999678 0.975655 0.29286 0.247846 0.427187 0.218341 0.373239 0.15059 0.688365 0.228819 0.741051 0.447667 0.897372 0.185115 0.679345 0.048143 0.330075 0.215 0.025061 0.904611 0.206663 0.131441 0.391971 0.601478 0.526444 0.273515 0.283725 0.239892 0.742983 0.087826 0.376362 0.888951 0.672013 0.363885 0.073301 0.514476 0.679215 0.519937 0.350798 0.767646 0.85659 0.765088 0.339776 0.778393 0.172481 0.244746 0.781487 0.156821 0.38 [...]
+0.477898 0.69342 0.371295 0.70941 0.234053 0.130479 0.357999 0.704438 0.500622 0.465496 0.098413 0.333848 0.042456 0.658794 0.391385 0.159591 0.54681 0.094825 0.942925 0.365649 0.051009 0.34041 0.673797 0.690968 0.371636 0.701602 0.045518 0.866497 0.327375 0.662095 0.340734 0.753533 0.221346 0.760969 0.249001 0.579205 0.167077 0.257595 0.813951 0.646892 0.862179 0.083434 0.629838 0.461942 0.159748 0.074942 0.353041 0.170842 0.767116 0.398549 0.015069 0.308911 0.146175 0.748098 0.10665 0. [...]
+0.8442 0.0554 0.822262 0.736728 0.877882 0.245171 0.839388 0.101883 0.008936 0.015284 0.417208 0.180216 0.324161 0.508357 0.539419 0.9084 0.71362 0.016779 0.639689 0.704948 0.908094 0.328488 0.041771 0.308334 0.216044 0.162132 0.891399 0.457863 0.109251 0.799539 0.602841 0.707201 0.610331 0.984041 0.942636 0.894665 0.445888 0.911614 0.959535 0.261259 0.506992 0.06073 0.880753 0.730024 0.271665 0.892867 0.432804 0.840241 0.487316 0.88083 0.640288 0.544088 0.690011 0.089569 0.17222 0.65644 [...]
+0.618291 0.093177 0.388997 0.163269 0.745734 0.223821 0.557733 0.416483 0.716933 0.366672 0.704736 0.168179 0.881004 0.246367 0.907939 0.885731 0.938101 0.079449 0.372003 0.041179 0.030317 0.923259 0.741631 0.702657 0.584083 0.369528 0.15854 0.194363 0.957678 0.971992 0.441093 0.719992 0.661345 0.937485 0.478652 0.600004 0.669327 0.556544 0.405346 0.190697 0.92521 0.852676 0.472019 0.518865 0.32592 0.353638 0.898534 0.646066 0.431324 0.369878 0.300884 0.154603 0.956146 0.543848 0.542364  [...]
+0.537245 0.175323 0.54662 0.222343 0.823988 0.75941 0.193028 0.676031 0.828859 0.823612 0.554327 0.01281 0.842652 0.469896 0.535989 0.533005 0.214547 0.231994 0.981856 0.206786 0.922803 0.147439 0.615575 0.252832 0.276512 0.381497 0.741109 0.99779 0.022162 0.224236 0.995395 0.235407 0.155118 0.030423 0.611514 0.506235 0.458018 0.353386 0.517984 0.273612 0.531099 0.340666 0.078462 0.056459 0.270068 0.466758 0.281118 0.479345 0.157537 0.815264 0.38624 0.499218 0.122378 0.082493 0.645822 0. [...]
+0.01764 0.990347 0.780809 0.688239 0.645633 0.005622 0.919955 0.771404 0.054841 0.030075 0.464729 0.28108 0.429536 0.182916 0.209924 0.284367 0.94982 0.148061 0.855219 0.565357 0.890521 0.377889 0.074851 0.709358 0.16101 0.073525 0.261155 0.288762 0.431404 0.507783 0.740421 0.883156 0.605445 0.211845 0.612983 0.016534 0.559954 0.749439 0.099156 0.709155 0.867054 0.350071 0.614531 0.949234 0.085545 0.613504 0.553416 0.078817 0.771526 0.21011 0.691704 0.77412 0.585193 0.270029 0.10448 0.91 [...]
+0.135686 0.653555 0.50942 0.753642 0.49884 0.870012 0.407106 0.940667 0.975491 0.223962 0.715105 0.884696 0.220031 0.893312 0.8903 0.063783 0.086764 0.959504 0.92699 0.20528 0.55124 0.295196 0.963535 0.158944 0.827867 0.74049 0.531066 0.894357 0.372249 0.939707 0.835598 0.87105 0.45167 0.902545 0.483562 0.95866 0.940088 0.607115 0.701583 0.433279 0.640005 0.999953 0.951777 0.873994 0.645562 0.228825 0.761639 0.11678 0.094106 0.901286 0.007297 0.006682 0.13638 0.623324 0.264584 0.653372 0 [...]
+0.934472 0.054444 0.246072 0.97267 0.981425 0.770185 0.918063 0.555571 0.621511 0.257427 0.914173 0.606024 0.754208 0.411079 0.329532 0.078557 0.528765 0.02782 0.965928 0.089876 0.979728 0.612309 0.361286 0.257178 0.666592 0.369806 0.39403 0.886146 0.454179 0.585946 0.978899 0.191507 0.188788 0.395221 0.936242 0.551803 0.685302 0.612352 0.024042 0.746761 0.297493 0.403549 0.230934 0.473414 0.170213 0.159788 0.350856 0.465425 0.703918 0.094122 0.173864 0.421263 0.683462 0.936052 0.825473  [...]
+0.156324 0.982693 0.033227 0.441014 0.388416 0.873281 0.235873 0.254416 0.030737 0.798506 0.547856 0.852522 0.413248 0.460088 0.528379 0.643401 0.816078 0.559177 0.278413 0.255316 0.807378 0.129842 0.464404 0.769936 0.230067 0.446469 0.094575 0.021036 0.286604 0.206651 0.863743 0.807849 0.567784 0.578242 0.244468 0.858136 0.645654 0.488876 0.238507 0.670565 0.113726 0.489603 0.599706 0.926839 0.65649 0.375558 0.988586 0.349894 0.743948 0.975487 0.906189 0.524423 0.207325 0.94899 0.700461 [...]
+0.276944 0.361223 0.000597 0.965844 0.665218 0.587024 0.889186 0.627873 0.088331 0.058666 0.333475 0.12688 0.821722 0.092124 0.710987 0.454835 0.769789 0.850128 0.654844 0.893344 0.218233 0.592009 0.388906 0.844299 0.039094 0.0327 0.086215 0.077804 0.36833 0.548814 0.38458 0.869463 0.436415 0.815794 0.002955 0.18063 0.950623 0.817711 0.201268 0.819527 0.525606 0.443294 0.999592 0.793481 0.941002 0.840826 0.000654 0.744503 0.899173 0.387905 0.989622 0.030222 0.120681 0.666431 0.42177 0.63 [...]
+0.738365 0.141128 0.641406 0.645268 0.819929 0.860798 0.231642 0.018964 0.907751 0.132378 0.852998 0.781133 0.76941 0.251096 0.559669 0.752744 0.097533 0.905341 0.364656 0.414202 0.822791 0.317666 0.615068 0.076527 0.44045 0.567259 0.03738 0.192857 0.51016 0.320841 0.862197 0.361602 0.344832 0.01586 0.992716 0.999192 0.782081 0.842753 0.660495 0.827987 0.34087 0.102783 0.905896 0.296229 0.744479 0.91818 0.314286 0.431676 0.067875 0.828457 0.331442 0.318126 0.436512 0.609525 0.907501 0.09 [...]
+0.35654 0.419879 0.534064 0.134289 0.375033 0.128425 0.756286 0.327043 0.813659 0.526361 0.698406 0.245625 0.058443 0.193093 0.633432 0.989427 0.979863 0.070062 0.239701 0.37107 0.76158 0.802876 0.521386 0.842845 0.891447 0.22947 0.239128 0.032029 0.471809 0.905623 0.85092 0.225173 0.636813 0.912843 0.943328 0.660178 0.303337 0.350239 0.788017 0.363153 0.662445 0.981276 0.324988 0.973784 0.770476 0.418699 0.925277 0.186146 0.209215 0.130394 0.65283 0.463917 0.723157 0.319201 0.750491 0.1 [...]
+0.237962 0.805378 0.847803 0.657731 0.71517 0.116474 0.047196 0.407068 0.781884 0.227091 0.610165 0.590713 0.72946 0.839239 0.554386 0.513904 0.60495 0.762651 0.117102 0.601303 0.524804 0.987807 0.258736 0.68654 0.46358 0.049309 0.734499 0.380389 0.258352 0.949824 0.769901 0.770385 0.307077 0.57172 0.839468 0.423036 0.588675 0.84635 0.592433 0.049481 0.243952 0.038645 0.713771 0.632715 0.84311 0.813529 0.641038 0.534755 0.043924 0.249486 0.254833 0.518962 0.037718 0.295375 0.702698 0.509 [...]
+0.892871 0.525016 0.162712 0.91589 0.949249 0.631745 0.071991 0.911032 0.463184 0.303268 0.840186 0.02538 0.112179 0.865469 0.354422 0.049905 0.154699 0.711927 0.805765 0.963284 0.079625 0.050439 0.929939 0.103402 0.157817 0.807065 0.156515 0.619832 0.549768 0.096829 0.696651 0.151601 0.599304 0.480101 0.531431 0.381372 0.411546 0.016099 0.349246 0.037296 0.47876 0.843105 0.845737 0.016631 0.385395 0.271282 0.411441 0.483234 0.061133 0.70336 0.214503 0.848559 0.42826 0.85622 0.99889 0.61 [...]
+0.299498 0.159251 0.205351 0.522758 0.00983 0.496617 0.253657 0.884509 0.785405 0.463136 0.152175 0.670522 0.713874 0.49025 0.661005 0.398543 0.381649 0.119159 0.497077 0.416178 0.392062 0.835136 0.064441 0.320799 0.931012 0.902756 0.270204 0.888451 0.081844 0.74056 0.311877 0.544069 0.32196 0.219751 0.780085 0.081956 0.860225 0.806665 0.494248 0.341817 0.39322 0.060567 0.788384 0.006181 0.053427 0.636095 0.196913 0.218758 0.528223 0.575566 0.130204 0.86924 0.603645 0.129834 0.222499 0.8 [...]
+ 459.52871 -2.4688646 -2.4688646 7.8471802 7.8471802 4.564311 4.564311 -6.7609376 -6.7609376 1.8082646 1.8082646 8.7225684 -8.6763154 -8.6763154 4.7914295 4.7914295 8.5851948 8.5851948 -4.5847101 -4.5847101 -2.9945451 -2.9945451 -7.3465207 -7.3465207 8.1287789 8.1287789 -5.0439741 -5.0439741 -7.7324818 -7.7324818 6.157295 6.157295 -3.9393444 -3.9393444 -8.5737561 -8.5737561 4.9696353 4.9696353 2.2241786 2.2241786 8.5160174 8.5160174 -7.7159986 -7.7159986 0.5252973 0.5252973 -0.7283019 -0 [...]
\ No newline at end of file
diff --git a/examples/testdata/eigen/symm1.example b/examples/testdata/eigen/symm1.example
new file mode 100644
index 0000000..61daa9f
--- /dev/null
+++ b/examples/testdata/eigen/symm1.example
@@ -0,0 +1,6 @@
+4 
+4 1 -2 2
+1 2 0 1
+-2 0 3 -2
+2 1 -2 -1
+6.8446211 2.2685314 1.0843645 -2.197517
diff --git a/examples/testdata/eigen/symm2.example b/examples/testdata/eigen/symm2.example
new file mode 100644
index 0000000..81cd7ee
--- /dev/null
+++ b/examples/testdata/eigen/symm2.example
@@ -0,0 +1,12 @@
+10
+3 5 3 0 5 0 10 1 6 7
+5 6 3 4 5 9 1 2 6 1
+3 3 5 8 2 4 0 1 0 1
+0 4 8 10 5 9 7 10 5 0
+5 5 2 5 6 3 6 9 6 1
+0 9 4 9 3 7 2 7 7 5
+10 1 0 7 6 2 7 2 0 9
+1 2 1 10 9 7 2 0 0 6
+6 6 0 5 6 7 0 0 9 4
+7 1 1 0 1 5 9 6 4 2
+44.556858 16.790534 12.102353 5.587039 4.5331425 2.1119679 -0.5828338 -6.2187209 -9.0311173 -14.849222
diff --git a/examples/testdata/eigen/symm3.example b/examples/testdata/eigen/symm3.example
new file mode 100644
index 0000000..50eee93
--- /dev/null
+++ b/examples/testdata/eigen/symm3.example
@@ -0,0 +1,922 @@
+919
+4 3 10 7 6 8 10 5 9 5 8 2 6 3 3 8 1 10 3 6 7 5 8 5 4 10 4 0 2 0 10 8 6 6 9 3 1 5 1 1 1 8 6 2 6 9 10 3 2 5 3 3 8 3 4 2 3 3 6 5 9 0 1 4 2 4 2 9 5 2 3 1 8 6 1 0 5 5 3 5 0 3 9 2 6 2 1 10 3 4 5 2 9 1 1 3 9 9 9 1 2 3 4 9 1 5 3 8 9 6 2 10 0 6 2 4 6 4 0 7 3 3 4 1 7 7 1 7 4 6 5 6 4 6 9 9 2 9 3 4 3 10 0 6 5 1 0 4 4 2 3 0 8 1 10 1 8 1 4 4 7 4 3 2 8 6 7 10 5 9 2 7 9 9 7 7 0 9 9 1 6 2 5 1 9 7 1 2 2 9 1 1 6 10 7 3 3 4 7 8 9 6 7 3 2 10 2 7 6 10 3 7 8 4 6 1 10 1 0 0 5 1 7 2 0 7 2 1 7 6 0 4 1 3 10 0 4 5  [...]
+3 8 10 3 1 6 8 6 7 1 10 0 8 9 5 10 5 10 7 0 4 9 9 3 10 5 10 1 1 0 9 8 3 6 3 8 10 0 2 9 8 0 0 6 1 4 9 3 10 1 6 7 1 8 4 8 3 10 5 7 0 8 1 7 1 2 10 4 4 1 9 7 0 8 3 10 5 3 8 1 10 3 4 6 9 9 9 10 0 7 2 9 6 7 2 8 5 6 9 8 2 8 2 5 1 9 2 7 6 3 6 10 0 10 9 9 9 7 7 3 0 3 2 10 6 10 10 3 7 7 10 5 9 9 1 5 6 8 3 1 9 3 0 10 6 1 3 7 1 0 6 4 8 9 8 9 6 0 3 4 5 5 7 4 9 2 0 0 10 7 5 6 6 0 3 9 6 5 3 4 1 6 5 4 2 9 3 3 10 9 0 10 1 6 10 0 0 0 9 3 1 5 7 4 7 4 4 0 4 3 10 0 9 2 3 10 6 1 5 2 2 2 9 8 6 2 1 2 3 4 4 2 8  [...]
+10 10 8 2 7 6 9 9 5 3 7 9 4 8 6 0 2 5 4 6 3 4 2 4 9 5 8 3 0 6 4 0 8 0 3 0 7 7 3 6 5 1 10 6 1 8 3 6 10 3 5 0 5 2 6 0 10 3 10 1 5 10 5 2 3 2 4 3 7 4 8 3 3 1 9 7 9 10 0 0 8 2 4 5 7 3 8 0 9 8 4 5 4 2 5 9 3 3 0 6 7 1 9 1 3 2 7 5 9 7 7 5 4 10 7 7 6 4 6 5 1 9 1 3 9 2 0 2 6 8 9 9 5 9 5 10 3 2 9 1 3 10 9 8 6 1 9 1 3 9 2 1 6 7 4 10 8 4 10 0 9 4 0 6 0 8 7 1 8 1 3 9 10 8 9 9 3 9 4 2 10 9 8 5 3 8 6 4 0 7 6 4 1 6 4 9 10 9 6 4 2 4 3 9 7 3 6 0 4 1 0 2 9 1 4 2 6 5 10 0 3 9 4 10 6 8 10 5 1 9 3 1 1 1 9 1 7 [...]
+7 3 2 6 0 3 5 10 1 3 9 4 6 1 4 3 2 9 2 2 9 4 9 6 1 5 10 7 8 5 9 3 3 7 9 9 5 6 10 5 2 1 8 6 5 10 1 0 9 3 10 7 8 5 9 9 2 2 3 5 2 5 2 2 2 5 5 2 10 9 2 2 3 1 2 9 4 2 2 3 6 4 7 9 5 6 9 5 0 0 10 8 8 8 1 6 5 2 4 2 6 7 5 1 3 8 9 0 5 4 0 7 8 5 7 6 6 10 2 0 0 4 3 10 1 0 4 4 1 9 4 3 2 5 2 2 2 6 9 8 3 7 10 3 7 9 5 5 6 7 0 5 7 10 7 0 3 1 5 3 0 5 9 5 2 5 10 5 8 5 8 0 8 7 10 0 2 0 1 5 8 1 0 10 7 6 1 1 7 2 6 9 8 4 2 4 1 8 9 9 3 4 0 7 1 8 6 7 8 0 10 9 6 3 1 7 3 1 6 8 10 4 1 1 9 7 7 3 8 2 2 5 9 7 0 2 3 1  [...]
+6 1 7 0 1 5 6 9 9 9 2 3 3 9 5 2 10 3 5 3 10 4 3 1 5 3 9 10 5 7 8 1 1 7 8 1 4 8 3 10 4 0 5 10 3 1 10 6 2 10 2 3 5 5 7 2 3 0 7 10 2 2 8 10 6 3 6 7 5 5 10 5 3 8 8 6 7 7 8 5 9 3 9 1 4 3 1 0 3 9 9 6 9 10 4 4 4 8 6 7 0 6 5 4 2 10 7 4 0 1 2 9 7 4 10 3 3 0 0 3 5 10 7 8 2 9 7 4 5 10 1 1 9 1 8 8 1 4 2 5 8 4 1 7 4 3 9 8 8 7 8 8 6 1 7 1 7 9 4 7 10 3 2 5 7 9 9 5 3 2 5 4 10 10 3 1 10 7 4 10 8 5 10 5 8 5 1 1 6 7 0 3 6 8 1 8 5 2 5 4 3 1 10 7 6 8 4 8 3 6 2 3 8 2 10 6 4 5 4 9 3 3 3 3 0 2 5 5 1 8 10 1 1 0  [...]
+8 6 6 3 5 7 0 2 9 1 9 1 9 3 5 7 6 6 7 0 5 9 7 3 6 1 1 7 1 8 3 9 5 7 5 1 7 2 7 8 10 1 8 9 7 1 3 1 7 9 1 4 10 3 1 6 4 0 3 1 0 0 2 4 2 2 9 5 7 2 2 2 9 7 9 9 5 0 9 6 7 7 5 10 10 0 5 8 10 4 1 6 2 7 4 0 9 8 7 7 4 8 7 0 0 5 4 3 4 8 5 6 6 4 8 6 3 0 3 7 5 2 7 7 6 4 4 4 7 10 9 9 2 4 8 1 3 7 1 2 3 8 8 8 10 5 1 6 10 1 6 7 2 4 10 0 9 10 6 5 6 10 1 2 7 6 4 9 10 1 4 4 2 0 9 9 4 4 7 5 3 4 7 8 6 10 3 0 3 9 8 3 7 0 7 5 10 1 8 10 1 8 8 0 10 0 0 10 1 10 10 2 2 2 4 10 1 8 7 10 2 9 4 2 4 5 8 3 2 6 2 5 10 10 2 [...]
+10 8 9 5 6 0 10 0 9 0 10 8 10 10 9 2 5 9 3 4 7 2 8 4 2 10 0 10 4 5 7 10 0 7 2 2 5 3 10 10 9 0 1 0 4 0 6 2 6 5 1 0 9 4 2 4 3 4 4 4 4 4 7 1 9 10 7 6 0 1 7 2 6 6 7 9 1 3 4 1 5 0 0 3 8 4 8 9 3 9 2 9 7 9 5 10 5 6 1 10 1 5 3 10 9 8 9 7 7 0 10 7 8 8 3 2 0 3 3 3 7 7 7 10 0 6 0 8 9 9 9 1 2 4 2 7 5 10 4 9 5 7 0 2 1 3 7 3 2 6 6 2 0 6 8 8 9 1 8 7 7 6 5 1 2 0 6 5 5 2 7 6 1 5 0 5 0 5 9 4 5 7 6 10 6 1 2 9 10 9 1 5 7 6 4 8 9 9 10 0 3 5 1 8 7 10 3 8 0 8 8 7 8 8 2 8 5 2 2 1 5 6 6 6 8 5 1 10 0 1 3 7 3 10 3 [...]
+5 6 9 10 9 2 0 8 7 7 3 7 10 0 1 9 0 5 7 2 8 5 1 5 3 6 9 2 2 1 4 10 5 5 0 2 6 0 5 0 6 1 5 3 8 10 1 0 8 7 7 8 5 6 7 7 10 4 3 7 7 1 5 6 1 7 2 5 10 0 5 8 8 6 8 0 8 2 2 9 6 10 0 4 5 1 3 5 7 10 1 1 4 9 3 4 8 9 5 3 8 6 4 10 2 9 2 10 3 1 5 6 1 2 1 7 5 5 1 7 0 8 5 8 1 9 7 2 0 8 1 3 6 5 4 8 9 10 2 4 2 4 7 2 2 3 0 3 1 0 4 9 4 2 5 4 8 1 4 0 8 5 0 0 9 5 9 9 0 6 9 1 1 6 3 1 4 4 4 2 4 8 10 9 4 9 3 9 4 9 2 3 0 0 7 10 1 0 7 6 4 2 8 7 8 5 7 5 6 5 8 2 0 0 10 6 4 4 4 0 6 4 6 10 2 3 9 4 0 6 8 1 10 4 5 7 2 7  [...]
+9 7 5 1 9 9 9 7 4 6 0 5 1 2 1 10 4 4 2 8 7 9 4 3 6 6 7 10 0 4 2 10 10 1 2 3 5 0 1 8 1 6 9 2 4 3 4 2 2 5 9 4 3 5 6 8 2 7 2 10 1 8 6 9 6 7 1 0 0 10 4 9 0 10 1 1 7 9 2 9 7 3 0 0 5 7 4 8 3 1 4 2 7 1 0 5 1 2 4 7 9 9 7 5 6 1 0 6 10 0 6 2 7 2 6 1 2 4 1 7 6 9 4 6 4 10 10 0 2 10 1 8 3 0 9 0 8 7 3 1 4 9 10 6 5 7 10 2 5 10 9 4 2 7 0 4 4 10 9 5 6 5 3 2 2 0 9 5 10 8 6 2 0 7 9 7 10 4 5 5 8 5 2 4 2 10 9 5 6 1 1 1 3 8 1 1 2 9 1 2 2 9 10 2 9 7 0 2 8 3 2 1 5 8 0 9 0 5 9 4 2 7 0 8 10 4 6 1 6 5 9 1 0 0 0 0  [...]
+5 1 3 3 9 1 0 7 6 7 4 3 7 7 6 4 9 7 7 1 3 6 9 6 3 6 6 6 5 9 8 6 3 2 10 2 7 3 1 8 2 9 3 5 4 9 7 9 7 8 4 8 7 6 0 5 10 3 4 8 4 2 4 1 4 6 4 1 8 5 0 1 0 6 9 10 8 1 0 0 4 7 8 2 10 10 8 4 2 3 7 0 8 3 8 8 9 2 2 8 4 2 6 8 0 7 7 10 8 3 6 1 7 10 0 7 3 2 0 5 4 10 10 5 1 0 0 4 8 8 3 10 3 8 8 2 1 8 4 0 5 0 0 6 9 8 10 0 4 3 0 2 9 4 3 4 3 4 7 6 5 2 8 2 4 8 5 7 4 9 2 9 10 1 1 1 6 1 8 2 1 10 6 4 7 4 2 8 6 6 5 5 0 5 5 8 2 1 5 5 9 6 8 8 1 2 6 4 10 3 5 6 0 1 7 8 4 5 10 4 8 8 3 7 9 3 4 10 7 7 5 6 5 2 8 7 7 3  [...]
+8 10 7 9 2 9 10 3 0 4 5 8 6 5 5 9 8 5 8 9 2 9 10 4 2 7 10 4 8 0 0 8 1 0 7 3 5 7 10 6 0 4 0 0 8 0 9 9 1 10 5 3 0 7 1 8 10 4 4 1 1 0 6 5 2 7 5 0 9 10 0 7 1 4 3 0 1 2 8 9 5 6 7 8 8 8 9 10 1 8 6 6 9 9 5 2 4 3 1 10 7 2 5 8 8 3 8 9 7 7 4 4 5 1 9 8 9 9 1 5 1 3 8 9 6 3 0 0 5 9 2 9 4 7 4 0 10 3 9 1 7 5 8 3 6 8 9 2 3 1 0 4 6 1 9 7 5 6 2 1 0 10 5 5 8 5 8 4 8 3 2 10 7 6 8 9 2 10 7 5 3 6 10 2 3 7 0 8 1 7 10 5 5 10 6 5 6 1 5 10 8 2 0 2 7 7 6 2 8 6 8 0 1 4 6 0 6 4 4 2 0 9 6 10 4 10 6 7 7 3 2 3 0 4 7 7  [...]
+2 0 9 4 3 1 8 7 5 3 8 9 6 8 6 0 1 8 3 4 4 7 1 4 7 4 8 4 3 4 9 0 7 2 3 6 10 5 8 1 6 3 3 0 7 0 1 1 8 2 8 1 1 3 5 1 3 9 2 9 7 5 5 5 3 7 8 9 2 1 2 6 2 9 7 10 10 8 5 6 7 8 7 7 9 1 2 2 7 9 0 9 8 8 1 10 5 6 5 10 6 8 10 10 5 2 9 10 3 10 6 8 0 3 1 9 0 4 0 6 3 6 3 1 10 5 0 4 6 4 6 9 2 7 8 9 5 6 4 0 10 5 5 2 6 6 8 9 7 1 9 3 2 5 0 7 10 9 8 5 5 7 1 3 4 8 6 7 0 6 8 6 1 9 9 10 4 7 10 1 10 10 8 5 3 6 5 10 10 0 8 0 5 1 7 3 9 0 10 3 0 9 8 10 6 1 9 7 7 2 3 0 3 6 3 3 9 5 3 7 10 8 2 10 0 1 2 2 8 6 7 9 8 10 5 [...]
+6 8 4 6 3 9 10 10 1 7 6 6 5 2 9 0 3 10 4 0 7 4 4 7 9 0 6 4 10 6 0 5 8 1 7 6 7 9 8 0 4 8 7 3 2 6 6 8 5 1 7 8 5 2 4 7 6 9 4 2 3 5 2 4 8 4 5 5 8 3 7 1 6 4 3 8 3 9 8 10 6 1 9 8 8 9 7 2 2 5 6 4 2 6 10 2 10 6 4 2 8 5 4 0 3 10 6 4 4 2 7 1 3 5 2 8 0 5 10 8 8 3 2 7 7 2 0 8 8 3 7 5 4 5 2 9 8 10 5 2 8 5 0 10 9 2 6 2 1 4 5 2 7 9 2 8 1 7 8 7 3 4 0 8 8 5 10 5 6 2 0 0 7 1 10 2 6 9 3 9 5 9 0 6 10 0 6 8 0 7 9 5 10 7 0 8 6 5 1 1 8 5 3 6 8 0 0 8 5 9 1 7 5 7 9 8 6 5 9 8 3 2 4 7 6 1 2 1 6 5 1 9 9 3 1 7 1 7 1 [...]
+3 9 8 1 9 3 10 0 2 7 5 8 2 10 1 8 7 4 1 5 0 5 3 6 10 8 9 3 0 2 7 9 3 1 6 6 6 4 0 2 3 3 6 2 3 2 7 5 5 10 2 2 6 0 6 3 3 10 2 6 7 9 7 9 3 2 7 10 2 9 1 5 0 10 3 7 3 5 1 3 9 7 9 10 6 7 3 9 2 8 6 4 4 10 8 5 3 9 7 10 4 8 3 4 3 10 9 1 0 8 6 4 0 1 6 0 7 0 6 5 7 0 1 8 9 9 8 1 4 6 8 0 1 5 0 5 5 4 7 10 2 10 5 8 7 2 2 6 8 7 1 4 2 6 10 6 1 2 8 3 6 8 6 1 2 3 2 6 4 2 2 1 8 2 5 1 3 3 2 1 5 3 10 6 8 1 2 4 3 0 0 1 2 9 4 5 5 6 8 0 8 6 7 2 6 5 3 7 7 1 9 4 4 7 5 6 0 8 5 4 4 7 8 8 5 6 2 1 8 7 3 5 0 2 6 6 3 1 7 [...]
+3 5 6 4 5 5 9 1 1 6 5 6 9 1 0 7 7 4 3 5 2 9 7 8 0 3 2 0 8 9 4 2 2 3 9 8 7 4 3 0 6 10 9 10 9 2 6 3 10 9 5 7 5 0 9 8 2 10 4 10 4 4 5 6 8 1 1 6 7 10 10 2 7 7 7 8 4 2 8 2 4 4 2 6 3 2 5 10 8 4 10 6 3 3 8 4 10 1 8 4 0 4 4 5 0 4 6 0 4 6 1 2 5 5 1 7 8 2 10 0 10 7 10 5 6 7 9 6 5 8 6 5 6 5 7 2 3 3 0 4 9 9 1 2 1 1 8 2 0 6 6 5 0 7 1 0 10 1 3 2 0 10 6 3 1 8 2 0 10 6 1 2 8 0 4 1 6 9 7 1 8 0 0 6 1 3 7 8 8 6 9 10 3 3 9 7 10 1 10 1 0 2 7 3 8 2 4 10 10 8 1 6 10 5 0 1 2 1 5 6 0 6 9 10 4 4 7 4 0 3 10 8 10 3 [...]
+8 10 0 3 2 7 2 9 10 4 9 0 0 8 7 7 2 1 3 8 10 9 7 7 4 5 0 2 8 9 0 9 2 2 3 6 1 9 7 3 6 10 5 10 1 5 5 1 1 7 4 3 3 8 5 3 5 3 7 1 7 0 4 10 4 2 9 9 4 6 5 0 9 10 8 6 8 1 8 9 10 8 8 1 10 6 8 10 7 5 4 1 2 5 9 9 7 1 4 0 2 1 2 9 5 7 7 2 8 3 0 2 2 6 3 7 4 0 1 0 5 0 8 7 6 5 7 6 4 1 0 3 8 2 1 6 10 10 0 2 8 0 2 8 5 1 8 3 1 5 8 10 8 0 0 1 6 3 1 8 9 6 9 9 8 9 6 5 1 9 10 4 4 2 5 9 8 9 4 8 1 1 10 10 2 4 2 5 2 9 6 8 4 9 10 0 9 2 6 7 6 2 9 9 7 4 8 0 8 7 4 7 10 3 10 8 7 2 1 2 10 0 8 6 7 7 1 2 3 4 1 2 7 2 10 8 [...]
+1 5 2 2 10 6 5 0 4 9 8 1 3 7 7 2 1 2 2 2 2 2 8 6 1 5 5 5 5 10 1 2 0 9 7 8 4 8 1 2 2 2 9 8 3 4 7 2 9 9 5 6 3 4 0 2 4 5 4 2 2 9 3 8 9 0 6 3 10 8 4 0 3 9 6 1 1 3 8 2 2 4 10 1 9 0 5 2 10 0 5 2 6 3 7 2 4 6 5 2 6 8 2 1 1 6 2 6 7 7 3 6 8 10 9 5 0 3 5 0 5 6 2 10 10 9 8 6 1 4 7 1 4 0 0 7 6 7 6 8 3 0 5 10 3 5 9 7 0 2 9 3 8 6 9 7 5 9 4 0 7 9 6 1 7 4 10 5 9 8 2 2 5 5 8 1 3 5 3 4 10 8 1 9 2 0 6 4 3 10 5 10 8 7 2 3 8 4 8 7 6 1 1 10 3 8 5 4 10 5 5 2 5 1 0 0 3 1 2 3 7 9 6 1 1 5 0 6 10 1 1 6 9 9 6 4 8 7  [...]
+10 10 5 9 3 6 9 5 4 7 5 8 10 4 4 1 2 4 7 5 4 9 7 3 10 7 6 3 1 5 9 6 9 10 2 2 5 5 8 2 7 3 1 4 10 1 3 4 0 7 5 5 1 2 1 6 10 8 8 8 6 5 8 5 1 10 8 5 1 7 8 2 0 1 10 9 10 1 6 7 6 9 5 3 0 10 4 3 6 10 8 2 7 7 4 8 5 4 0 3 1 6 10 9 6 10 5 9 5 1 3 3 10 8 2 6 2 1 5 7 8 9 5 2 4 1 5 8 2 3 7 2 0 5 6 10 5 7 1 10 2 9 1 4 6 9 6 6 2 4 7 9 3 4 5 0 0 10 5 5 10 3 6 10 4 7 9 4 10 0 2 0 9 4 7 6 9 2 2 6 2 2 9 6 0 1 1 9 9 2 1 4 10 0 1 4 6 4 6 7 1 1 6 10 6 9 3 4 1 10 0 7 0 4 1 9 2 0 2 4 1 5 10 8 6 7 1 1 8 4 1 5 0 1 [...]
+3 7 4 2 5 7 3 7 2 7 8 3 4 1 3 3 2 7 1 10 10 7 4 10 7 6 6 0 10 6 0 7 3 2 4 0 5 1 9 8 5 3 5 2 3 10 10 9 5 4 4 6 10 3 3 2 8 0 3 9 5 9 6 0 9 1 0 7 7 1 2 4 7 1 10 8 5 0 8 7 9 0 3 10 5 9 0 7 3 4 2 3 0 9 10 1 9 1 2 2 2 10 9 10 5 7 8 9 6 3 2 0 0 8 9 2 7 4 2 7 10 6 0 0 8 1 3 6 2 10 2 6 6 0 2 0 2 9 9 1 3 6 8 10 0 4 7 4 2 3 6 8 9 0 8 5 0 6 9 9 2 6 3 8 3 1 2 3 10 5 9 8 9 10 5 9 10 9 0 4 3 2 10 6 6 5 7 1 10 9 1 3 7 4 5 5 6 2 0 8 0 7 6 3 5 10 8 2 4 5 7 2 2 3 0 0 9 5 9 9 1 10 2 3 0 8 9 0 0 2 7 2 9 5 10 [...]
+6 0 6 2 3 0 4 2 8 1 9 4 0 5 5 8 2 5 10 7 3 6 2 5 3 3 5 2 9 1 8 4 4 9 5 5 1 4 9 4 0 5 10 0 1 2 1 8 1 4 5 0 5 6 6 7 1 9 10 2 0 4 5 10 1 9 2 2 9 2 0 5 6 3 3 3 1 7 3 1 5 2 10 6 9 0 8 10 0 4 4 0 9 5 6 9 7 7 10 5 8 2 7 0 10 0 5 6 3 4 4 5 10 7 3 2 2 10 2 4 1 0 0 6 1 7 5 5 4 9 8 6 6 8 3 0 0 4 0 5 10 0 8 4 8 3 2 7 3 6 7 10 10 9 6 4 5 3 8 1 0 1 4 3 6 6 6 8 7 1 3 9 5 0 5 10 10 7 0 9 4 2 2 3 3 5 10 6 4 7 7 5 6 9 8 6 3 8 7 5 6 6 3 8 7 10 7 2 6 4 7 0 5 6 7 3 1 0 4 6 7 10 4 6 0 0 1 0 8 6 3 2 7 3 9 0 5  [...]
+7 4 3 9 10 5 7 8 7 3 2 4 7 0 2 10 2 4 10 3 4 9 6 5 3 6 3 8 2 5 1 4 0 1 3 4 10 7 7 6 2 10 6 4 1 9 10 6 10 3 8 4 1 9 6 3 5 8 4 2 7 6 3 5 2 9 8 3 6 0 0 10 8 10 6 3 1 5 2 7 0 4 8 4 9 6 3 4 4 4 10 5 5 1 4 5 8 7 0 8 3 7 4 7 8 2 6 5 1 6 6 1 3 6 5 6 0 3 9 2 9 4 5 4 0 3 8 6 4 9 1 9 10 9 6 10 8 7 5 1 0 3 10 7 6 9 1 1 1 10 10 4 0 8 0 5 3 0 0 1 2 7 9 5 3 7 5 2 7 7 8 8 4 7 8 3 7 0 6 10 6 4 10 1 5 1 7 9 4 10 5 0 9 6 8 10 4 3 2 1 7 3 1 8 10 7 6 3 10 2 8 9 10 1 3 4 4 7 10 8 2 4 7 4 1 10 9 5 10 7 10 8 9  [...]
+5 9 4 4 4 9 2 5 9 6 9 7 4 5 9 9 2 9 7 6 9 7 7 2 1 9 0 9 5 4 7 2 1 10 4 2 6 1 9 4 7 10 10 1 4 1 0 5 10 10 3 9 5 5 10 7 5 2 6 2 1 3 1 7 3 4 10 0 4 4 10 7 10 0 3 2 6 0 10 2 6 6 9 10 8 7 6 1 9 4 3 2 10 6 8 8 9 10 9 8 6 1 9 10 1 5 1 7 7 0 8 2 10 0 1 4 2 10 0 5 7 3 9 7 10 8 2 2 5 3 10 10 5 6 0 0 10 5 0 8 5 8 9 5 0 4 0 7 2 6 8 6 1 10 4 10 9 6 7 8 6 1 0 10 8 9 7 10 0 0 6 0 5 10 6 1 0 9 0 5 0 0 2 4 6 7 6 4 8 1 9 5 7 10 0 3 4 8 8 2 7 7 7 6 9 1 4 8 4 5 6 5 3 5 6 5 10 4 6 2 8 1 2 3 4 6 7 1 1 1 10 2  [...]
+8 9 2 9 3 7 8 1 4 9 10 1 4 3 7 7 8 7 4 2 6 7 9 7 9 3 9 2 7 1 1 10 1 10 1 0 5 4 2 6 0 1 1 10 5 0 0 4 10 1 9 5 5 9 10 2 2 7 2 5 5 7 5 1 8 6 0 1 4 6 7 10 1 10 5 0 3 9 3 2 9 8 4 4 3 9 3 5 1 7 0 5 7 4 1 6 0 9 0 7 1 4 4 0 4 3 4 10 4 5 3 9 5 0 2 0 3 2 2 8 4 0 5 6 5 8 5 4 7 3 4 1 7 7 7 6 0 4 7 2 8 10 8 9 9 0 10 1 1 3 7 6 6 6 1 1 2 8 3 3 6 8 8 1 9 9 2 6 9 9 4 10 4 10 7 4 5 2 5 1 7 9 4 10 6 7 9 6 9 8 4 8 2 7 7 1 10 10 3 6 0 0 6 1 0 1 7 5 1 0 10 3 5 3 8 8 6 7 3 2 7 4 4 0 2 7 5 5 5 3 10 3 3 2 1 5 6  [...]
+5 3 4 6 1 3 4 5 3 6 4 4 7 6 8 7 6 3 10 5 5 2 7 0 7 1 1 10 3 6 10 1 10 6 5 1 4 10 0 10 6 8 6 3 2 5 9 2 4 2 4 7 4 6 9 2 6 2 7 6 6 5 6 2 7 5 9 10 0 6 7 10 2 6 10 2 7 8 1 2 4 2 3 10 2 4 2 8 9 10 5 2 1 9 10 2 5 5 10 5 6 8 3 7 8 6 9 4 3 5 6 3 4 10 10 7 3 7 9 1 6 6 10 2 6 7 9 1 1 4 1 0 5 7 8 5 1 4 7 1 3 0 0 4 4 0 8 3 6 1 5 4 1 3 0 9 2 3 7 3 10 4 0 9 0 1 0 1 0 5 2 7 5 0 8 6 2 8 8 5 9 9 9 3 1 0 9 3 3 1 0 5 5 9 9 9 1 10 7 6 4 6 1 0 8 6 10 0 1 2 2 10 5 4 7 7 6 3 10 3 6 10 10 9 10 3 3 3 1 3 9 5 4 0  [...]
+4 10 9 1 5 6 2 3 6 3 2 7 9 10 0 4 1 10 7 3 3 1 9 7 5 2 0 2 10 3 8 2 0 8 7 3 3 8 3 9 1 2 6 9 0 5 8 6 4 2 0 4 8 6 9 1 1 5 8 9 8 1 0 5 4 4 1 0 4 1 3 7 5 2 6 3 1 3 10 5 2 1 5 4 0 6 2 8 5 1 6 0 2 8 1 0 9 5 7 2 9 0 10 7 7 6 1 9 5 8 5 8 1 6 7 5 2 4 7 6 6 3 6 5 2 7 5 5 8 0 0 5 1 10 0 9 1 7 0 6 0 4 2 10 10 9 4 8 9 6 0 3 3 2 3 7 1 5 8 5 3 5 10 4 4 10 9 8 6 9 1 10 3 6 4 0 0 5 7 7 8 3 8 5 9 3 5 8 2 4 2 8 10 7 7 9 1 1 6 5 6 10 9 4 4 8 7 9 0 5 5 0 0 6 9 1 2 7 10 0 4 5 3 9 0 0 0 6 7 3 4 9 1 4 7 0 4 7 2 [...]
+10 5 5 5 3 1 10 6 6 6 7 4 0 8 3 5 5 7 6 3 6 9 3 1 2 0 0 4 10 8 7 7 6 1 8 3 7 3 4 6 10 0 9 10 7 3 2 5 3 0 5 4 9 7 8 8 2 9 6 6 2 8 9 4 1 4 6 2 3 6 10 1 2 5 8 4 4 1 6 7 10 9 3 9 2 8 10 5 9 6 3 0 1 9 5 5 3 6 5 6 6 10 7 3 0 8 10 5 7 3 0 0 4 4 2 4 8 5 3 2 7 6 10 7 5 1 6 9 6 4 0 3 5 6 6 3 7 5 7 2 0 10 0 10 7 4 5 3 1 8 3 5 0 5 0 5 2 9 9 5 1 5 3 3 9 8 4 10 4 3 5 2 1 1 4 10 5 0 8 3 8 2 2 2 8 8 2 7 6 0 8 0 6 1 5 3 4 4 4 4 4 8 0 10 9 0 6 5 3 1 10 1 7 7 6 4 1 0 5 2 2 5 0 6 7 5 9 10 8 7 1 3 8 4 2 2 10 [...]
+4 10 8 10 9 1 0 9 7 6 10 8 6 9 2 0 5 6 6 5 3 0 9 1 0 0 4 0 8 10 6 5 3 2 8 4 3 4 6 9 10 0 5 3 2 6 6 9 4 2 10 3 8 6 4 10 2 7 7 3 1 4 4 6 7 6 9 3 10 9 9 6 1 1 8 2 10 7 3 9 5 4 7 4 10 0 5 4 7 8 10 2 3 6 0 7 10 1 5 7 4 5 9 1 4 6 5 8 8 4 7 6 0 1 3 7 1 5 1 5 2 10 8 8 9 4 0 4 2 4 10 3 2 10 9 10 3 1 5 0 10 7 3 5 8 0 3 3 1 2 3 9 10 0 5 1 8 1 3 4 5 3 4 9 10 3 8 7 7 1 5 7 7 4 8 6 0 10 3 1 5 3 5 6 5 7 8 2 7 5 8 9 3 0 8 3 5 4 2 2 1 7 10 4 6 1 6 4 7 5 1 8 7 2 9 9 9 6 1 10 2 1 8 4 10 1 5 3 2 3 4 5 10 10 [...]
+0 1 3 7 10 7 10 2 10 6 4 4 4 3 0 2 5 3 0 2 8 9 2 10 2 4 0 8 9 9 9 6 6 0 0 1 0 3 7 3 10 0 6 4 1 4 4 9 2 4 7 9 5 0 3 3 10 5 7 2 6 10 6 2 7 0 0 5 8 1 10 3 4 5 0 10 1 1 0 9 6 0 10 5 4 10 5 3 1 3 4 10 4 3 7 8 10 9 4 9 6 3 4 4 6 1 6 4 7 6 5 7 0 1 0 3 3 4 10 8 6 6 2 4 2 5 1 2 4 7 10 1 1 6 7 8 9 2 10 6 5 0 8 1 4 10 10 8 8 7 9 9 9 2 3 4 7 3 2 10 8 3 4 6 4 1 1 8 7 6 8 5 5 5 1 5 9 4 5 9 4 5 7 7 10 2 8 3 5 7 7 6 10 5 2 0 5 0 10 3 2 9 3 9 8 8 9 4 2 10 8 10 5 0 9 10 6 9 8 4 0 8 3 4 0 3 6 4 0 2 4 1 1 5 [...]
+2 1 0 8 5 1 4 2 0 5 8 3 10 0 8 8 5 1 10 9 2 5 7 3 10 10 8 9 6 9 7 5 5 0 8 5 0 9 9 0 1 10 5 10 10 8 2 0 6 6 3 4 4 8 2 0 1 5 4 8 8 1 6 0 7 4 3 6 9 7 7 5 4 0 4 6 7 10 6 8 9 3 9 5 7 10 7 3 9 4 7 5 2 3 2 3 1 7 2 4 6 3 10 9 9 10 8 10 2 8 9 0 3 9 8 0 10 2 9 4 1 10 8 6 3 7 6 2 8 4 5 0 4 2 2 8 3 0 10 1 2 9 2 0 5 7 0 9 10 6 7 8 0 2 9 2 4 10 2 7 9 10 0 9 10 5 0 8 6 3 0 0 1 0 9 3 2 10 0 8 8 4 3 7 0 5 9 5 2 1 0 7 2 5 1 4 10 8 3 5 1 1 5 6 3 0 10 6 9 6 4 9 3 0 9 1 9 9 1 1 10 0 6 0 4 4 6 2 8 8 10 4 4 6  [...]
+0 0 6 5 7 8 5 1 4 9 0 4 6 2 9 9 10 5 6 1 5 4 1 6 3 8 10 9 9 3 4 4 7 4 6 4 6 1 1 1 2 4 1 4 9 7 4 7 8 2 8 4 10 5 7 5 8 0 8 0 5 10 1 2 0 7 1 7 3 2 10 9 0 7 2 7 2 2 7 9 0 8 0 0 10 1 3 2 1 8 0 6 4 10 5 7 3 7 9 2 2 4 5 1 4 7 1 4 1 5 3 2 2 9 0 0 10 5 10 10 6 8 9 2 8 1 7 1 5 8 10 3 5 0 6 3 2 9 5 6 6 9 7 5 1 6 5 3 4 5 10 2 3 6 10 2 10 10 7 2 3 6 8 10 8 5 4 6 3 4 7 4 2 0 6 2 3 9 10 6 1 6 9 7 4 5 6 8 6 6 8 4 1 9 9 0 4 7 3 4 10 2 2 8 6 0 5 2 6 0 4 6 9 6 9 7 5 10 9 2 3 6 6 9 10 7 0 4 7 10 5 0 1 4 6 7 [...]
+10 9 4 9 8 3 7 4 2 8 0 9 0 7 4 0 1 9 0 8 1 7 1 10 8 7 6 9 7 4 6 5 6 9 0 3 9 2 9 7 5 7 1 6 3 8 9 0 0 7 10 9 10 1 5 7 1 7 3 10 1 9 5 0 4 4 0 8 10 4 2 1 6 3 3 5 2 7 7 7 2 8 10 7 4 1 3 5 1 7 4 2 1 7 3 4 8 7 3 8 10 10 0 4 7 2 8 8 2 7 9 1 6 7 10 8 1 3 10 7 5 5 2 0 5 6 2 1 3 3 4 2 6 6 1 10 7 4 10 4 3 1 7 2 0 1 3 0 6 8 8 10 0 0 0 6 5 10 4 5 2 4 6 6 6 3 7 7 10 5 4 9 6 8 2 3 2 1 3 10 4 8 0 2 7 6 10 8 6 9 3 2 7 8 3 6 2 3 4 8 9 1 3 2 10 10 10 2 7 8 7 2 4 0 10 1 1 3 4 10 10 7 6 2 1 0 5 3 1 9 5 7 7 0  [...]
+8 8 0 3 1 9 10 10 10 6 8 0 5 9 2 9 2 6 7 4 4 2 10 1 2 7 5 6 5 4 5 2 1 10 1 10 4 8 9 8 2 6 2 6 3 10 0 6 3 1 9 0 3 0 8 10 3 9 4 10 6 8 6 8 1 9 9 8 2 5 3 6 10 7 3 10 7 8 2 4 1 2 8 7 4 4 3 3 1 2 1 10 8 6 1 4 4 5 7 1 3 0 2 4 4 2 0 3 4 2 5 6 8 0 8 4 10 10 10 9 6 5 8 4 7 8 10 8 5 5 9 2 2 10 5 8 6 3 6 7 1 5 0 6 6 0 8 10 4 1 2 8 0 3 7 4 0 10 4 2 2 2 7 1 3 10 2 5 4 1 9 9 8 3 8 10 1 8 7 8 9 5 3 10 9 4 1 9 4 1 7 6 10 3 2 6 1 4 4 4 10 3 7 0 7 5 3 9 8 7 6 5 9 8 9 4 2 1 9 10 9 10 4 4 1 9 3 2 4 0 5 0 10 [...]
+6 3 8 3 1 5 0 5 10 3 1 7 8 3 2 2 0 9 3 4 0 1 1 10 0 6 3 6 5 7 6 1 9 7 3 9 6 6 0 8 8 8 0 0 8 2 10 6 5 2 3 2 8 6 8 3 4 7 8 10 4 2 4 10 5 9 9 1 4 5 5 4 3 10 8 3 9 0 8 4 0 4 4 10 2 4 6 7 2 9 9 10 10 1 4 4 10 0 7 10 10 10 7 5 4 9 7 4 5 4 9 2 8 3 6 3 0 4 0 5 4 0 6 4 0 1 0 3 10 0 1 3 1 5 1 9 1 1 5 4 10 4 6 5 7 7 4 2 4 4 5 5 4 7 4 1 9 0 10 0 6 4 8 0 1 3 9 8 10 1 6 7 10 1 0 1 10 1 6 10 6 8 3 6 6 10 9 3 2 2 3 2 7 8 10 6 3 8 5 9 8 2 10 7 3 6 1 6 3 3 1 7 1 2 5 2 9 3 6 7 5 2 1 7 9 6 3 9 9 3 7 7 2 4 1 [...]
+6 6 0 7 7 7 7 5 1 2 0 2 1 1 3 2 9 10 2 9 1 10 10 6 8 1 2 0 0 4 9 10 7 7 9 9 3 4 3 3 0 7 3 5 0 9 4 0 1 4 1 9 10 1 4 10 9 7 9 6 2 7 10 6 3 7 2 8 0 9 8 4 1 7 9 7 3 6 2 2 0 5 3 7 6 6 5 5 7 9 10 6 4 10 6 9 4 2 8 0 2 7 6 7 6 1 8 8 3 0 10 7 1 7 10 7 3 5 6 6 9 3 8 8 3 2 9 8 3 0 4 10 7 6 10 9 9 10 6 4 9 6 6 6 6 10 7 2 7 8 0 10 0 3 3 0 7 7 4 9 7 1 9 10 4 2 4 2 7 8 10 5 5 4 4 4 10 3 9 7 0 10 10 5 2 1 5 4 6 0 10 8 4 8 0 3 4 6 3 4 4 4 7 7 2 1 0 2 1 6 4 3 3 3 10 9 4 0 3 8 6 4 3 6 1 6 3 6 7 0 10 5 7 3  [...]
+9 3 3 9 8 5 2 0 2 10 7 3 7 6 9 3 7 2 4 5 3 4 1 5 7 8 8 0 8 6 0 1 3 9 2 10 2 5 1 0 0 10 5 10 0 0 7 2 6 6 3 7 10 3 9 1 7 10 1 3 8 3 10 2 5 2 4 0 9 2 4 3 5 9 3 8 3 8 10 2 10 0 9 2 1 4 4 5 9 8 3 0 0 6 8 6 5 0 8 9 9 0 3 0 5 2 2 3 0 4 1 10 1 8 8 1 4 4 7 9 5 9 8 4 10 0 4 8 1 8 9 0 7 3 8 1 9 1 0 2 1 8 2 5 5 0 9 5 4 2 2 3 4 3 2 1 8 7 9 9 0 8 8 6 1 1 10 5 9 2 4 4 8 4 2 6 9 10 2 6 4 9 5 10 3 2 8 2 9 9 6 8 0 2 8 3 3 8 3 7 10 9 9 3 10 1 7 7 9 6 1 9 5 5 4 6 3 9 3 4 0 4 3 0 9 6 0 2 8 3 2 5 10 2 5 3 5 2 [...]
+3 8 0 9 1 1 2 2 3 2 3 6 6 6 8 6 8 2 0 5 4 2 0 1 3 3 4 1 5 4 3 10 9 9 10 8 0 9 7 3 6 9 2 2 6 9 0 4 2 4 4 6 5 9 5 6 1 4 4 2 0 2 4 9 9 7 9 9 1 3 3 4 6 5 3 10 7 3 8 5 6 4 3 3 0 4 1 7 4 0 5 0 2 3 6 10 1 3 8 1 10 8 6 6 7 4 1 9 3 2 8 2 6 4 10 1 0 3 2 10 3 10 0 0 8 3 8 3 9 6 10 5 4 0 7 1 3 2 10 0 0 9 3 2 1 3 4 9 1 10 3 4 4 0 7 7 10 4 9 10 8 1 10 2 10 4 1 6 8 4 2 2 1 9 4 5 3 7 8 8 4 3 7 8 9 7 2 10 8 1 4 4 4 1 9 10 3 6 10 10 2 3 2 8 8 6 0 4 6 8 3 3 8 7 1 2 0 6 10 10 4 2 0 4 0 6 8 10 5 9 2 8 7 6 1  [...]
+1 10 7 5 4 7 5 6 5 7 5 10 7 6 7 1 4 5 5 1 10 6 5 4 3 7 3 0 0 6 9 4 6 3 2 0 2 7 3 1 0 3 6 7 2 5 5 6 5 1 0 4 6 8 8 2 9 0 7 4 5 5 2 4 6 9 4 5 9 3 6 9 6 1 0 7 5 6 8 8 1 8 1 3 2 2 2 7 7 6 7 3 2 6 8 2 8 6 8 9 5 5 3 1 9 8 5 8 8 0 5 7 10 0 6 7 4 2 8 4 1 9 7 3 2 10 4 4 0 2 4 3 0 6 9 1 10 5 5 0 0 9 5 1 2 5 3 1 3 8 5 3 3 4 9 1 7 5 1 8 4 2 7 4 7 6 8 3 5 6 0 2 10 10 9 9 6 2 4 8 1 3 3 1 3 6 6 5 2 9 7 2 8 0 10 9 6 3 10 6 0 4 1 8 9 4 9 10 2 6 8 4 4 2 2 9 9 3 2 7 6 4 7 3 4 9 3 2 5 2 7 3 6 3 10 4 2 4 7 3  [...]
+5 0 7 6 8 2 3 0 0 3 7 5 9 4 4 9 8 5 1 4 7 1 4 10 8 3 4 3 9 1 2 8 6 4 5 9 7 3 2 6 9 6 1 3 9 8 8 9 10 8 8 9 3 5 8 8 2 6 3 9 9 10 10 5 1 0 9 9 1 9 3 3 5 6 6 5 3 0 1 1 4 4 0 4 7 2 6 8 5 10 9 10 4 4 8 6 7 6 9 3 8 7 0 8 7 10 0 6 10 6 4 6 5 3 8 2 2 4 7 0 2 7 4 8 7 2 8 4 2 8 8 2 7 9 10 3 2 7 9 10 0 2 10 7 8 7 0 0 8 0 2 10 7 9 9 6 9 0 8 6 5 5 6 0 2 10 3 5 2 3 4 5 10 2 1 4 8 2 1 4 0 8 2 4 0 1 9 4 10 9 8 10 6 9 3 10 5 5 9 0 4 0 5 4 5 3 1 7 10 4 2 6 5 3 1 5 1 6 8 9 10 5 10 1 5 10 0 0 8 6 0 6 2 7 5 2 [...]
+1 2 3 10 3 7 10 5 1 1 10 8 8 0 3 7 1 8 9 9 7 9 2 0 3 4 6 7 9 1 9 9 0 3 1 7 3 2 2 10 7 7 3 5 8 8 2 8 2 6 8 3 1 3 4 5 2 0 4 1 8 4 5 3 9 0 2 2 3 10 5 9 10 5 0 2 7 10 4 9 6 5 3 8 7 1 5 4 5 9 1 8 3 2 1 6 5 6 7 8 10 9 10 9 4 10 8 0 4 4 0 9 5 6 6 7 7 6 8 7 4 4 10 4 4 3 5 2 7 10 5 2 0 3 0 2 5 8 5 10 7 9 10 9 1 7 5 4 7 1 10 8 10 9 6 5 5 7 1 4 8 2 4 8 1 10 8 5 8 3 6 9 7 5 10 9 7 2 1 9 7 6 8 6 6 7 10 2 0 2 3 8 4 4 9 1 6 7 8 2 10 7 0 3 5 8 1 2 8 3 0 5 4 7 3 1 2 5 8 4 9 5 9 0 1 6 6 8 6 10 5 7 9 5 10  [...]
+1 9 6 5 10 8 10 0 8 8 6 1 0 2 0 3 2 2 8 4 6 4 6 10 9 6 9 3 0 1 7 8 8 3 0 3 1 6 10 2 7 0 6 1 9 1 8 0 8 5 1 7 7 5 0 9 4 9 8 3 6 9 0 3 1 5 1 7 9 2 1 9 9 2 9 2 5 8 5 2 9 0 4 10 10 6 8 6 3 0 1 7 0 2 1 10 0 5 7 9 6 6 7 9 9 3 0 6 5 8 6 1 0 9 8 0 9 6 3 8 3 7 4 7 1 10 0 3 9 5 10 4 5 3 8 2 1 5 4 4 1 4 7 6 8 9 6 7 10 6 6 0 5 0 1 3 4 8 4 3 5 6 4 6 7 5 9 0 1 5 2 1 10 10 4 0 1 6 10 3 10 10 10 3 0 9 1 6 2 5 5 9 6 8 2 7 2 8 3 3 7 0 7 0 2 5 0 8 0 8 7 1 3 4 4 5 0 10 1 0 8 5 8 8 6 3 2 8 9 10 9 1 2 1 0 7 10 [...]
+1 8 5 2 4 10 9 6 1 2 0 6 4 3 6 6 2 7 5 0 2 7 0 6 1 10 10 10 1 2 5 2 8 0 0 6 0 9 7 7 10 0 7 10 3 8 4 2 5 7 6 10 4 2 1 5 2 0 8 6 5 10 5 7 9 0 6 7 4 8 3 1 0 0 10 5 10 8 10 7 1 7 6 5 5 1 4 6 9 7 0 9 7 5 0 10 10 9 6 0 5 3 3 5 8 0 4 8 5 0 10 3 7 10 8 9 9 10 4 9 9 2 9 9 0 8 2 10 0 7 6 10 6 5 2 7 8 9 8 4 6 6 7 7 9 7 7 4 6 5 4 0 1 10 5 9 5 3 7 9 5 4 8 6 6 3 8 4 10 6 5 5 7 1 4 0 4 5 10 5 0 4 0 1 7 4 3 7 6 5 5 0 2 10 3 6 8 3 2 5 9 3 2 2 10 6 4 5 7 9 6 6 3 1 9 4 1 4 5 9 5 1 7 9 4 6 6 6 4 1 6 10 8 6  [...]
+8 0 1 1 0 1 0 1 6 9 4 3 8 3 10 10 2 3 3 5 10 10 1 8 2 0 0 0 10 4 7 6 8 7 10 9 3 6 7 0 0 7 1 5 7 7 1 9 3 10 10 10 4 4 8 3 9 4 6 0 3 4 2 2 10 6 7 1 8 5 7 8 1 3 3 8 9 1 10 8 9 5 4 10 9 0 4 2 9 9 9 9 8 3 4 6 8 8 0 3 4 10 1 7 5 7 9 7 8 6 10 2 8 4 4 2 3 6 2 7 8 2 4 2 1 2 3 9 8 0 1 10 3 8 6 4 6 0 10 9 9 8 8 5 10 3 6 0 3 0 0 8 4 1 10 7 5 8 0 7 7 7 3 6 7 0 0 7 9 3 4 8 9 6 9 0 3 8 5 4 3 0 7 6 1 2 10 7 7 2 7 7 5 8 3 2 8 10 1 8 10 1 9 6 2 1 2 0 4 5 4 4 3 10 7 4 7 4 9 6 4 2 6 5 9 2 1 8 2 3 3 9 2 5 0  [...]
+6 0 10 8 5 8 1 5 9 3 0 3 7 6 9 5 9 1 5 10 6 10 1 6 6 9 5 6 5 1 1 2 0 3 5 2 6 1 3 6 7 1 1 10 9 6 1 1 8 8 5 2 9 10 3 1 3 1 7 3 1 0 2 10 10 5 7 0 6 10 1 10 10 10 3 6 7 3 4 4 10 3 8 3 0 2 6 9 7 7 5 0 1 4 10 3 3 1 1 2 10 0 10 1 10 3 5 8 7 0 9 6 0 2 3 6 10 7 7 5 2 8 7 9 8 10 8 2 4 5 5 0 9 0 1 5 6 5 2 6 2 3 1 7 0 8 1 4 5 0 5 9 4 2 0 6 1 9 10 8 10 2 5 2 6 6 4 8 10 4 9 9 9 1 6 7 1 9 2 1 5 4 7 10 6 3 1 5 2 10 3 5 1 1 4 4 9 4 7 3 5 4 2 2 5 0 2 2 8 0 5 5 0 7 6 0 7 9 2 6 5 3 7 0 3 6 5 4 8 9 8 0 3 8 1 [...]
+2 6 6 6 10 9 0 3 2 5 0 0 3 2 10 10 8 4 2 0 4 1 10 3 9 10 3 4 10 4 6 6 0 5 10 2 7 3 5 1 10 5 10 0 8 2 10 6 5 10 5 6 3 0 10 7 3 3 1 4 1 2 1 6 2 0 2 5 1 4 3 3 1 3 0 3 3 5 5 6 5 1 1 3 2 1 4 1 4 4 2 5 3 4 4 1 2 8 3 5 4 10 7 8 8 8 4 5 5 5 7 1 5 0 6 4 2 9 0 1 4 9 8 2 1 10 6 10 9 8 2 8 8 0 9 9 0 7 7 5 10 9 7 10 6 7 1 8 2 7 9 4 10 5 6 9 3 10 2 1 9 10 5 9 4 5 5 1 8 5 8 2 6 1 5 4 5 8 8 5 6 5 0 3 2 5 7 0 0 3 4 1 4 6 7 3 7 4 7 5 3 7 2 3 6 0 4 7 8 5 10 3 3 6 0 10 7 6 2 5 2 8 9 5 2 1 4 7 5 7 2 1 8 0 5  [...]
+6 1 1 5 3 7 4 8 4 4 8 7 2 3 9 1 3 10 3 1 1 4 5 2 0 7 2 1 10 9 3 3 8 0 0 6 2 9 8 9 3 7 9 8 6 0 6 5 1 8 6 3 1 2 8 0 1 4 7 10 10 10 0 5 7 10 4 0 10 1 3 7 0 0 3 6 8 3 0 1 3 3 3 2 1 10 4 5 9 7 3 2 10 4 8 7 8 1 10 3 10 3 8 3 6 3 3 5 4 6 5 7 2 4 0 0 5 10 3 10 3 7 6 4 3 9 4 10 0 0 4 3 6 3 10 1 1 3 2 4 2 2 4 10 2 6 0 6 10 10 3 5 4 3 2 0 2 1 6 2 4 10 6 2 8 1 7 4 0 7 8 3 2 6 0 9 9 6 7 5 6 8 1 4 10 5 7 9 5 2 1 8 3 8 4 2 1 5 5 3 10 5 3 6 9 8 2 7 4 9 9 4 1 9 1 10 0 6 7 2 7 4 8 6 8 8 8 4 10 3 4 6 3 5 6 [...]
+9 4 8 10 1 1 0 10 3 9 0 0 6 2 2 5 4 1 10 2 9 1 0 5 5 3 6 4 8 7 8 10 2 9 0 9 5 8 8 1 8 7 6 2 0 2 7 10 9 10 8 7 8 2 3 7 6 3 8 4 4 3 3 2 7 10 3 6 8 7 2 3 9 10 9 7 6 3 1 3 8 3 7 2 9 0 3 10 8 8 8 3 0 9 0 0 7 9 0 6 4 7 0 0 1 2 2 0 1 2 5 5 5 7 6 6 8 10 8 4 6 5 5 9 2 2 9 10 9 4 3 0 5 2 10 6 8 7 10 5 8 6 1 8 5 6 5 1 10 8 10 8 6 2 2 4 8 4 6 1 5 0 6 7 2 1 9 6 6 5 10 8 7 3 2 8 5 5 3 4 5 0 6 10 4 6 2 5 4 8 7 3 2 6 1 8 5 3 6 10 9 4 1 1 3 1 9 9 4 2 2 6 5 2 1 4 10 4 6 5 2 4 8 3 9 7 5 2 7 10 9 1 0 2 6 3  [...]
+10 9 3 1 10 3 6 1 4 7 9 1 6 7 6 5 7 3 10 1 10 0 0 9 8 2 6 4 2 4 9 0 10 4 7 0 5 8 2 8 4 1 1 10 6 7 0 4 4 10 1 6 5 9 5 4 8 5 0 7 6 3 6 5 1 0 9 5 6 9 5 5 8 0 9 7 7 5 5 4 5 3 0 6 1 0 3 4 6 2 0 0 1 7 9 4 10 4 1 1 3 3 7 1 7 0 3 8 1 8 1 8 6 3 2 7 1 3 6 7 3 0 6 1 5 1 7 4 2 7 9 6 5 10 6 8 9 0 7 7 9 5 5 4 7 7 6 4 0 10 1 3 1 1 10 8 1 4 2 0 3 1 1 0 8 2 6 9 10 7 7 4 2 6 6 0 0 3 8 0 4 6 1 2 2 2 6 7 1 7 5 0 1 1 0 2 10 6 2 5 9 9 0 3 7 8 3 3 5 3 9 10 9 3 3 9 7 4 9 10 2 2 7 4 2 1 2 8 0 9 9 10 6 6 4 7 8 8  [...]
+3 3 6 0 6 1 2 0 2 9 9 1 8 5 3 1 2 4 9 8 6 5 4 2 6 5 9 9 0 7 0 6 6 0 2 4 6 9 8 0 2 9 1 6 5 10 4 6 1 1 5 7 2 6 0 1 1 9 5 1 1 9 7 0 2 5 7 5 7 10 1 0 2 5 10 3 5 4 6 1 7 2 0 1 1 6 3 8 10 0 8 8 5 4 0 9 6 3 7 4 8 7 5 2 10 6 10 1 4 2 3 10 6 2 1 2 6 0 1 10 2 9 4 4 3 7 1 6 9 5 9 7 1 2 10 2 0 4 4 5 5 7 4 9 7 5 3 3 9 0 9 2 0 1 7 0 1 10 0 1 9 2 2 1 1 10 3 6 0 6 8 1 6 7 10 9 10 0 9 4 0 10 2 8 8 5 9 1 10 2 0 6 8 6 3 9 4 6 2 10 10 3 0 1 0 3 10 9 6 6 6 2 9 5 4 10 7 2 3 2 2 6 1 0 2 6 8 3 6 9 4 0 10 9 3 7  [...]
+2 10 10 9 2 7 6 8 2 7 1 8 5 5 10 1 9 0 5 1 10 10 10 4 4 3 4 2 6 8 0 3 5 1 6 2 5 10 2 8 5 3 8 5 1 9 4 1 8 7 3 5 4 4 1 3 2 8 10 9 9 3 5 10 9 8 9 0 0 9 8 8 0 6 7 6 9 3 0 5 4 4 7 7 4 4 2 6 1 1 0 0 8 9 0 8 10 6 1 8 2 10 10 6 8 4 0 8 0 3 1 7 10 0 10 2 9 7 5 0 9 5 10 5 4 5 7 2 1 5 6 0 8 2 0 6 7 0 6 4 9 10 5 8 5 0 3 8 6 0 0 5 10 6 7 8 5 8 4 6 8 9 9 2 7 3 3 6 2 3 5 8 6 7 9 6 8 5 7 2 0 8 3 8 6 1 10 4 4 10 8 6 7 9 7 8 9 6 9 7 3 4 7 2 6 6 0 5 10 4 2 8 5 2 7 3 10 6 7 2 10 7 5 3 6 3 9 5 7 0 6 3 3 3 0  [...]
+5 1 3 3 10 9 5 7 5 8 10 2 1 10 9 7 9 7 4 4 3 10 1 2 2 0 2 4 6 2 7 1 2 4 6 4 1 8 6 5 7 10 8 10 8 10 10 1 7 4 2 6 7 1 1 0 0 6 2 4 8 7 9 2 10 2 6 6 9 5 8 3 5 10 1 9 5 2 2 7 1 1 6 0 1 3 4 4 6 2 0 10 8 5 8 2 9 0 10 6 1 6 5 9 4 3 3 2 5 2 6 3 8 6 8 8 10 7 7 8 10 1 3 9 3 5 9 6 6 7 10 10 1 10 4 7 3 0 7 5 10 7 1 8 9 4 5 3 0 3 1 4 6 6 7 7 1 3 1 3 4 7 0 7 7 10 4 6 0 4 5 5 8 8 9 2 6 8 8 6 9 4 3 4 5 1 0 7 8 2 0 8 0 10 5 1 6 5 6 4 1 5 7 4 1 2 6 5 7 7 0 5 1 7 10 10 9 2 1 7 6 6 0 8 4 9 8 3 9 7 5 3 3 3 10 [...]
+3 6 5 10 2 1 1 7 9 4 5 8 7 2 5 4 5 5 4 5 8 3 9 4 0 5 10 7 3 8 10 9 3 1 3 4 0 8 8 1 6 10 5 5 6 8 1 5 3 2 5 1 8 5 10 10 9 7 8 10 4 3 9 1 1 7 5 7 3 10 7 5 9 6 5 3 7 9 10 1 8 9 4 6 3 3 6 8 4 4 0 1 0 7 2 10 6 7 5 3 0 7 5 10 2 3 5 8 10 5 10 8 9 6 1 5 4 8 6 3 2 2 8 4 8 5 10 4 6 10 0 8 0 6 5 3 2 3 6 7 5 10 9 2 2 2 10 7 6 2 5 3 2 6 7 6 6 6 9 3 3 9 1 4 4 1 9 5 5 8 1 5 7 10 7 9 2 1 1 0 5 0 0 8 2 1 4 4 4 10 3 4 5 3 9 8 2 10 10 5 2 1 1 8 7 8 9 1 8 7 4 5 6 3 9 3 9 8 5 0 10 10 2 4 1 4 7 10 6 3 7 6 1 6  [...]
+3 7 0 7 3 4 0 8 4 8 3 1 8 2 7 3 6 5 6 0 4 9 5 7 4 4 3 9 4 4 9 0 2 9 7 6 4 9 3 7 10 10 2 6 3 7 6 7 5 6 1 10 5 8 2 5 10 9 10 10 3 4 5 1 0 3 0 1 5 10 7 6 5 0 2 4 9 4 2 5 4 10 7 3 3 3 10 9 6 2 10 9 7 5 8 6 1 2 10 9 3 7 9 4 8 8 1 7 3 2 6 2 9 9 0 5 8 1 1 0 9 10 3 1 8 8 4 8 8 7 9 4 0 8 1 3 6 4 0 6 0 8 2 5 8 1 1 9 10 10 9 10 6 9 0 2 3 10 4 2 7 7 0 10 6 3 2 5 8 2 0 6 1 1 10 8 0 8 4 10 5 0 5 5 0 4 8 10 1 9 2 7 3 2 10 3 2 5 4 0 3 7 8 0 0 1 10 0 6 6 6 1 1 9 4 4 1 6 2 0 7 4 3 6 4 1 3 9 5 2 1 5 7 1 6  [...]
+8 1 5 8 5 10 9 5 3 7 0 1 5 6 5 3 3 1 10 5 1 5 5 4 8 9 8 5 4 10 10 3 8 10 10 5 6 3 1 7 4 4 9 3 1 8 5 2 4 7 8 5 4 7 10 3 7 2 0 10 6 6 5 6 1 1 4 6 0 3 2 6 1 6 1 4 6 0 0 0 3 4 0 6 4 6 8 7 2 3 9 8 0 7 5 9 3 3 1 1 2 4 7 8 6 0 2 1 4 1 2 7 0 5 4 10 1 2 7 7 8 6 3 8 2 4 10 0 9 7 0 4 10 2 1 10 10 6 3 1 4 7 2 5 5 7 3 3 2 5 6 1 8 7 9 2 3 7 7 9 6 7 9 8 7 10 7 3 10 6 5 7 1 4 6 1 6 10 8 2 7 3 9 3 3 10 2 2 2 9 0 4 0 3 3 6 7 7 1 9 8 5 0 9 9 5 7 2 10 6 6 5 9 8 6 2 9 9 9 8 1 3 0 3 5 5 6 4 8 10 9 9 9 2 5 4 2 [...]
+3 8 2 5 5 3 4 6 5 6 7 3 2 0 0 8 4 2 3 6 9 5 9 6 6 7 6 0 8 5 1 0 6 1 3 9 8 5 3 5 2 4 10 0 2 2 9 6 4 1 5 8 7 6 6 4 1 2 3 5 10 2 0 9 10 1 0 8 10 7 3 7 2 5 5 1 4 4 1 7 10 1 9 3 8 1 7 3 2 8 5 1 1 3 3 8 7 10 2 4 6 6 4 3 4 9 10 8 1 6 3 7 0 5 5 2 8 0 2 6 7 0 2 5 0 0 4 7 9 0 1 1 8 5 1 1 9 2 3 2 10 2 5 4 10 6 2 1 7 7 4 6 4 4 1 4 8 7 4 6 10 8 1 1 6 9 8 5 5 1 6 9 3 0 8 5 9 5 4 0 10 9 10 2 0 9 10 7 4 7 8 0 8 3 10 6 4 6 6 10 2 8 1 7 0 5 4 1 4 9 2 3 6 7 0 3 7 5 0 6 3 9 5 5 1 6 9 2 1 1 3 0 1 5 9 2 7 8 1 [...]
+4 4 6 9 7 1 2 7 6 0 1 5 4 6 9 5 0 1 3 6 6 10 10 9 9 8 4 3 2 7 5 8 8 4 9 5 8 8 4 0 1 8 3 10 8 3 5 0 1 1 10 2 10 6 5 4 9 5 1 7 1 9 3 5 8 7 4 4 9 1 4 3 0 8 0 8 9 7 2 6 6 6 10 5 9 6 3 0 9 7 4 8 2 5 10 0 8 7 8 8 8 3 10 4 1 6 5 7 8 1 8 8 5 1 5 1 3 2 5 8 6 3 2 0 5 6 9 5 6 8 3 8 9 6 7 5 5 1 5 0 9 1 10 2 0 2 6 9 2 8 5 7 3 4 1 4 9 8 8 10 1 1 0 2 0 5 0 6 10 3 1 2 7 8 9 5 0 2 2 9 1 8 7 0 7 9 6 0 2 3 4 8 9 4 3 6 0 7 0 5 2 5 8 9 0 10 0 7 7 1 7 6 8 2 6 6 0 8 7 5 9 2 9 0 8 8 6 8 3 2 3 3 5 3 5 10 7 5 8 7 [...]
+2 8 0 9 2 6 4 7 8 5 8 1 7 3 8 3 2 6 2 7 3 7 2 2 1 8 10 3 0 5 7 10 3 10 1 6 2 8 5 9 5 3 1 7 0 7 4 1 3 0 10 5 3 4 4 4 1 6 10 1 7 7 2 1 9 8 4 1 1 10 9 7 3 7 10 7 0 4 5 9 10 0 8 4 4 2 2 3 2 7 6 0 4 2 4 6 7 1 9 4 9 2 7 1 10 2 7 2 6 4 9 7 4 5 0 7 9 4 4 0 8 6 6 6 7 0 4 8 6 10 9 7 2 9 7 5 4 9 8 3 9 10 6 8 0 6 0 8 3 4 4 4 5 7 3 2 9 4 6 3 3 8 7 5 8 10 2 6 9 10 0 9 3 5 6 6 5 3 2 8 8 3 0 1 4 4 9 1 7 9 6 8 2 9 2 10 3 0 2 5 6 0 6 3 6 8 4 7 5 5 4 3 6 6 9 4 3 1 9 3 9 8 6 2 1 0 9 1 7 4 0 1 4 0 8 6 8 4 2  [...]
+3 3 10 2 3 4 3 10 2 10 10 3 6 3 2 5 4 10 8 1 5 5 2 6 1 2 2 10 1 8 1 3 4 9 7 1 9 2 2 4 2 9 3 3 1 6 8 1 2 0 9 10 7 1 9 1 8 3 3 6 2 9 7 1 8 8 0 4 8 7 9 0 8 1 2 3 5 1 7 6 10 2 8 6 4 8 8 5 1 2 2 2 8 0 3 5 10 7 3 10 9 6 10 3 9 6 1 3 10 1 5 2 7 10 8 5 1 4 3 7 5 0 7 9 3 2 9 9 8 8 0 7 1 1 6 0 9 0 0 1 1 8 2 7 8 4 6 1 6 3 6 1 2 6 0 6 3 7 8 9 2 3 7 0 4 3 4 5 4 6 1 10 7 10 2 4 10 9 3 0 9 1 3 0 1 4 5 5 8 0 10 3 3 3 9 4 10 4 7 5 9 2 4 7 0 3 4 5 10 9 1 1 8 3 3 3 0 3 3 7 10 1 7 1 2 1 1 7 5 1 4 6 0 7 7 2  [...]
+3 10 3 2 0 0 4 4 7 3 4 9 9 10 10 3 5 8 0 9 8 2 7 2 5 9 7 5 5 0 7 9 7 7 10 4 0 6 0 9 0 4 1 3 4 3 5 9 8 6 7 9 2 2 5 6 3 10 10 4 4 10 3 9 2 3 1 9 10 3 8 7 6 5 9 9 7 3 10 5 4 7 10 1 6 1 1 10 4 8 4 9 1 6 5 7 1 4 6 9 10 10 1 2 2 7 10 8 10 1 6 5 9 0 1 10 0 5 8 2 9 1 3 0 7 7 5 10 5 2 1 0 5 1 8 8 6 10 3 2 2 5 2 5 10 4 3 3 4 9 2 9 2 1 9 2 2 1 7 9 2 7 9 10 6 3 2 7 10 5 9 6 7 7 6 2 5 8 2 4 9 2 10 3 2 7 4 8 0 3 0 3 7 1 4 3 5 6 2 1 5 4 0 3 6 1 8 7 1 5 5 2 7 8 1 0 3 3 4 5 1 0 2 2 10 5 4 8 7 10 5 1 0 5  [...]
+6 5 10 3 7 3 4 3 2 4 4 2 4 2 4 7 4 8 3 10 4 6 2 7 8 6 7 7 4 8 3 4 8 9 1 4 7 3 4 8 8 6 7 1 7 8 0 5 10 2 8 10 0 3 1 10 3 10 5 10 1 4 8 2 3 1 3 9 1 0 0 4 6 4 1 6 9 8 0 3 6 6 2 0 3 7 7 5 6 6 2 0 2 6 7 1 2 2 5 4 10 3 0 3 4 2 6 4 5 2 2 6 8 8 7 9 5 6 7 0 4 4 6 6 6 4 0 0 5 7 3 5 10 6 0 2 3 4 3 2 10 9 4 9 3 2 5 10 10 1 2 4 3 1 3 6 0 0 2 7 2 5 7 2 4 2 1 8 5 5 3 10 1 0 8 1 5 7 3 3 2 10 4 0 6 6 5 8 1 7 10 8 8 9 0 7 8 10 9 3 0 3 4 10 5 5 10 1 6 5 7 3 4 5 6 8 5 2 2 4 7 4 9 5 7 1 8 10 10 7 2 0 5 4 10 5 [...]
+5 7 1 5 10 1 4 7 10 8 1 9 2 6 10 1 2 8 9 2 2 2 5 6 9 6 3 2 8 0 10 10 10 6 3 2 4 9 1 3 6 0 3 4 10 4 7 1 9 4 10 10 10 5 7 1 6 4 10 1 8 3 8 2 9 10 6 2 7 1 5 1 3 8 4 6 8 6 2 0 1 6 3 0 10 5 8 2 5 3 5 5 9 0 8 6 2 0 1 7 6 9 2 6 5 0 9 1 3 7 6 1 2 6 8 7 9 7 5 7 4 8 1 6 2 6 7 8 3 6 9 0 4 9 1 8 6 5 1 1 7 7 2 2 4 1 7 0 0 9 2 9 10 3 1 7 3 5 10 8 6 2 0 5 0 7 2 6 8 5 3 7 3 2 9 5 3 4 2 4 6 0 6 6 5 2 9 1 6 4 6 1 10 10 2 10 5 10 1 10 3 0 6 10 5 6 5 2 9 4 0 1 6 5 4 5 8 5 10 8 3 9 3 9 4 6 9 7 7 1 6 8 8 3 1  [...]
+9 0 5 2 2 0 4 7 1 4 1 7 3 7 4 7 2 6 5 0 7 1 5 6 8 2 1 6 8 5 1 6 4 2 8 0 5 9 8 6 5 3 1 1 10 4 6 1 9 8 4 3 6 10 1 7 2 4 1 8 1 9 4 4 7 8 10 10 8 10 7 2 9 1 9 8 8 6 7 5 4 0 0 1 0 6 7 6 5 10 5 5 8 6 1 5 10 5 0 3 6 4 8 6 9 8 9 5 5 1 4 4 4 10 6 3 9 5 10 9 0 7 7 3 2 9 6 1 6 4 8 8 1 10 4 8 5 6 7 9 9 5 7 1 10 0 2 7 7 2 1 2 10 9 4 7 9 9 0 2 8 4 4 9 8 2 8 6 6 6 10 7 9 1 3 5 0 5 5 3 9 4 8 3 2 1 3 9 1 7 10 3 9 5 10 1 10 4 8 9 3 4 6 4 4 10 7 0 8 0 5 4 8 0 7 0 5 4 10 2 9 7 7 7 2 2 7 1 7 0 1 10 0 7 6 8 1 [...]
+0 8 10 5 2 0 4 1 8 2 0 5 5 9 4 0 9 5 9 4 6 3 7 5 1 8 4 10 1 10 9 8 2 7 3 2 5 10 4 9 10 4 0 2 10 3 3 9 3 7 3 4 6 2 9 7 9 10 4 3 9 10 2 3 9 5 9 4 10 0 2 6 8 8 6 6 8 3 0 3 10 8 2 0 4 5 9 8 6 1 3 6 2 2 5 5 5 6 8 6 7 0 2 4 3 10 5 2 0 9 0 1 2 10 2 5 4 3 3 9 8 3 6 3 6 7 6 5 10 5 7 9 3 4 2 10 1 3 1 2 10 4 7 0 7 5 1 7 2 6 6 0 1 2 1 10 3 1 9 8 10 0 3 7 1 0 7 1 10 4 7 9 3 8 4 10 7 3 10 3 5 3 5 3 10 7 2 4 2 6 1 0 3 7 8 6 0 0 3 0 9 4 7 6 2 10 4 1 9 1 7 9 7 0 2 1 3 1 6 6 2 0 10 1 5 10 9 0 6 8 2 10 0 0 [...]
+1 1 5 2 8 2 7 5 6 4 6 5 2 7 5 4 3 8 6 5 3 1 5 6 0 9 4 6 6 1 5 6 4 10 10 4 2 10 5 0 5 2 2 1 0 3 6 7 5 9 9 5 5 0 3 2 7 3 8 8 4 2 10 6 10 0 9 0 7 5 8 1 9 3 9 3 4 7 1 2 3 7 0 4 9 2 1 2 0 0 2 7 8 9 3 10 0 1 9 10 1 8 3 7 1 6 6 2 3 1 5 8 1 6 10 3 7 1 3 3 3 3 9 9 9 0 10 9 2 6 4 6 0 3 0 7 10 3 4 9 1 8 8 1 8 9 1 6 1 0 5 4 0 5 10 8 5 1 4 1 0 0 10 10 2 3 0 8 8 0 2 5 7 4 5 7 10 10 7 6 3 8 5 9 10 7 1 9 2 3 7 4 2 9 9 7 7 9 7 8 9 3 5 1 0 8 2 9 3 4 3 10 6 9 10 10 2 1 10 1 3 0 7 3 0 5 6 3 6 8 6 6 5 2 3 5  [...]
+4 7 2 2 10 4 1 6 9 1 5 5 4 9 6 10 8 5 0 10 5 7 1 2 5 4 6 2 0 2 0 8 10 6 2 9 4 5 3 3 7 2 10 6 5 2 5 0 10 2 1 1 6 9 5 1 1 9 2 2 4 3 6 5 5 10 8 9 7 2 8 2 5 4 3 8 6 4 10 6 2 10 2 9 3 9 2 6 1 8 0 9 8 6 10 1 1 6 8 5 8 2 2 5 1 6 7 9 10 8 7 0 4 8 10 2 10 6 9 5 4 5 6 6 6 10 0 8 2 2 10 9 0 9 1 9 2 3 8 6 10 2 9 10 0 9 5 7 9 9 6 10 1 3 2 10 4 3 6 8 9 5 7 2 4 9 9 0 3 8 7 7 7 7 1 4 7 2 1 6 4 0 6 6 3 2 8 8 2 0 4 5 4 9 7 2 5 8 9 4 7 7 8 1 4 3 1 8 5 7 8 0 3 5 5 7 4 1 0 10 0 2 7 8 7 8 10 6 0 2 3 0 0 7 8 9 [...]
+2 1 3 2 6 2 9 1 6 4 2 3 8 3 8 4 9 1 9 1 2 3 8 7 4 1 7 7 7 0 4 1 5 3 5 9 6 1 9 1 9 10 10 2 7 7 1 2 9 10 1 0 1 10 8 9 8 2 3 9 7 9 10 5 10 6 1 2 9 9 8 8 10 10 9 6 1 2 10 9 7 6 3 9 0 1 10 8 3 5 0 7 1 10 3 1 6 7 2 9 2 9 10 4 4 1 6 10 4 2 10 5 0 6 0 2 9 1 2 8 0 3 0 3 10 7 8 2 4 7 6 2 7 10 4 3 1 2 3 5 1 7 0 6 10 10 9 10 7 6 4 10 6 6 0 8 2 5 2 3 5 8 4 1 5 10 4 0 10 9 0 8 4 1 3 1 2 2 2 10 9 1 0 5 1 10 0 6 8 7 9 3 1 0 3 2 3 4 5 10 2 5 3 1 6 9 2 7 6 2 9 1 1 1 9 2 0 9 0 2 0 0 0 0 1 9 10 6 1 2 2 2 6  [...]
+4 2 2 5 3 2 10 7 7 6 7 7 4 2 1 2 0 10 1 9 9 4 6 5 4 4 6 0 4 7 4 9 9 7 2 7 9 0 0 5 0 6 5 0 10 10 0 5 8 2 7 3 1 1 7 8 8 3 1 10 8 5 0 10 6 2 9 2 6 6 6 2 2 7 8 5 1 6 9 10 8 10 4 10 2 10 5 10 9 6 8 8 10 8 6 7 8 9 7 7 6 3 10 5 9 1 8 7 7 5 4 9 9 2 9 8 4 0 0 0 5 8 4 0 6 7 10 10 2 5 10 7 6 2 4 0 1 6 8 9 5 5 10 5 7 6 1 4 9 0 1 10 7 1 0 3 2 0 9 6 10 8 8 10 4 9 2 9 10 4 8 0 4 4 5 1 3 0 3 6 6 3 8 5 0 3 9 4 1 3 2 5 6 8 4 4 6 2 5 6 8 3 3 2 8 1 5 8 7 1 9 1 8 7 10 0 4 0 8 9 5 4 5 6 6 9 7 0 4 9 4 0 7 0 2  [...]
+2 10 4 5 6 9 7 2 1 4 5 8 5 7 1 9 6 8 0 2 8 10 0 9 1 6 9 0 3 1 0 9 9 2 4 9 4 9 2 1 6 7 7 2 4 3 9 7 9 6 5 0 4 0 4 4 0 1 3 6 10 9 9 8 1 9 5 9 9 7 3 8 1 2 4 5 5 3 4 4 4 0 8 1 8 9 2 4 5 2 10 3 3 5 7 2 6 10 6 7 2 5 2 10 5 6 0 0 4 6 10 9 6 1 2 7 5 5 9 8 0 2 0 0 5 4 6 10 9 9 10 10 8 4 8 10 4 9 5 2 10 8 0 2 8 4 1 2 5 6 5 6 3 0 0 3 3 8 1 1 8 9 6 3 2 9 4 3 2 5 0 10 0 8 6 0 8 9 9 0 7 2 4 10 0 7 3 6 3 2 0 4 5 4 3 2 7 10 5 3 9 2 8 4 6 3 3 3 3 4 9 1 4 0 4 5 2 10 6 2 2 8 9 3 4 3 9 0 5 9 3 10 2 9 1 3 2 7 [...]
+9 4 3 2 7 5 6 5 0 1 0 9 5 10 6 9 3 5 7 2 3 0 1 10 0 2 3 5 6 7 8 8 1 8 0 9 5 9 2 7 7 1 0 5 0 6 5 5 0 6 7 1 6 8 4 1 4 9 9 2 10 4 0 9 2 2 9 2 0 1 3 3 1 3 7 9 3 6 0 7 7 5 2 0 3 4 1 1 4 5 0 2 8 9 1 10 2 5 6 4 4 2 8 3 0 2 2 7 7 0 7 0 6 3 2 1 8 0 9 5 2 10 1 0 9 10 9 7 2 4 3 3 5 1 2 7 7 10 8 10 8 6 1 7 10 4 2 10 6 3 3 2 5 1 5 4 6 8 6 7 1 5 8 2 7 6 1 4 10 2 10 1 1 0 0 3 10 5 6 0 4 4 7 3 5 1 6 8 0 6 10 1 8 5 4 9 7 5 6 0 3 7 6 3 0 10 1 7 4 4 9 6 6 0 5 8 9 9 4 9 1 3 7 8 3 2 0 8 0 7 7 2 9 4 8 3 6 0 7 [...]
+5 4 7 10 5 7 0 10 0 8 9 2 8 2 7 4 10 1 7 9 6 4 4 0 4 3 10 8 9 3 10 2 4 0 9 1 9 1 3 9 4 8 6 1 10 8 6 7 0 9 3 5 0 10 9 1 8 10 1 7 8 10 7 7 9 6 9 0 4 2 4 6 0 10 6 6 2 5 2 1 3 8 0 6 2 0 3 3 0 5 1 5 8 4 5 9 8 8 2 5 6 3 7 10 7 7 9 6 5 7 9 2 1 8 5 8 9 9 0 9 5 6 8 2 4 0 3 10 7 3 10 7 7 4 4 6 4 10 3 5 0 2 3 9 1 10 7 0 6 5 10 1 2 9 7 0 3 0 1 10 1 1 5 5 5 2 5 1 10 7 8 4 0 9 9 1 5 8 4 7 1 0 6 6 6 9 1 4 4 0 6 6 10 9 9 6 8 3 3 6 10 4 10 8 7 10 5 0 7 0 0 5 8 10 8 1 6 1 0 1 8 10 7 5 3 3 5 10 5 1 4 9 3 3 [...]
+2 1 4 9 5 2 1 0 10 5 10 1 3 9 10 6 8 7 1 2 0 4 6 6 1 6 9 1 7 2 4 5 5 9 2 3 3 9 10 2 8 5 10 4 1 7 9 10 9 5 10 10 3 7 1 10 7 3 0 1 10 0 5 2 9 6 7 1 2 1 1 1 1 9 4 6 6 7 0 0 3 4 6 0 9 6 3 4 9 2 9 9 3 10 9 0 1 9 3 6 8 5 8 0 0 7 2 10 4 0 9 4 0 6 0 1 4 9 8 7 4 2 7 10 0 1 2 9 0 6 9 9 5 10 7 0 10 10 7 1 6 0 5 4 5 2 2 9 2 5 2 5 5 5 6 10 1 1 2 7 7 7 2 9 3 1 10 1 0 2 0 1 1 1 5 2 7 5 5 6 10 4 1 4 10 7 9 7 7 2 1 1 0 5 0 6 2 5 7 4 7 8 6 6 2 1 10 4 9 10 6 2 5 4 9 4 3 9 1 9 9 1 1 4 3 9 9 3 1 4 6 4 1 5 4  [...]
+3 9 8 2 10 2 7 5 4 0 0 2 7 1 10 5 4 8 2 0 0 10 7 7 3 10 9 10 7 10 2 3 5 8 4 3 6 3 5 1 3 7 1 3 3 2 5 1 8 8 7 7 2 3 4 9 9 8 0 5 7 2 8 8 8 6 3 3 4 1 7 2 9 9 1 1 1 10 10 6 9 7 5 1 5 6 8 7 1 7 1 6 7 0 8 8 8 2 6 4 5 1 9 6 3 2 7 2 7 4 6 10 8 3 6 4 10 4 0 3 2 7 2 5 9 9 7 5 3 9 6 6 3 0 9 2 5 1 7 0 5 3 10 9 5 9 10 10 1 2 0 2 0 3 7 8 4 0 7 7 2 1 4 2 5 9 10 4 4 4 4 0 10 3 0 2 9 10 7 5 8 4 0 5 1 5 10 6 4 3 6 0 9 3 10 9 7 8 8 8 5 2 10 0 2 6 6 2 1 9 0 10 8 8 0 6 0 7 9 1 8 5 8 7 9 7 4 7 10 10 2 2 0 8 0  [...]
+1 7 3 2 5 2 2 8 9 1 7 6 1 5 2 0 0 2 4 5 10 7 10 10 7 1 6 3 5 9 1 6 4 4 3 4 9 3 9 9 1 8 10 3 7 3 5 0 8 3 5 6 6 7 3 7 0 7 4 1 2 6 1 2 8 2 8 3 6 1 2 3 6 9 1 0 3 6 9 3 7 1 2 1 8 3 5 7 8 1 7 0 0 5 9 8 6 9 4 9 4 6 7 6 7 2 6 2 8 0 1 3 3 4 1 10 9 8 2 2 9 2 8 8 0 10 5 3 4 4 4 3 7 1 6 4 1 9 10 7 5 2 6 4 6 8 6 6 6 8 2 3 10 3 6 5 2 0 4 9 2 1 4 1 4 4 3 10 8 5 4 5 5 10 6 9 2 8 10 1 0 6 2 1 6 3 9 7 6 0 7 6 10 1 10 6 10 4 5 0 2 0 6 9 6 6 9 10 10 0 7 0 8 10 10 5 10 4 6 5 1 1 4 8 8 6 9 3 1 1 4 1 10 2 10 8 [...]
+8 0 3 3 3 9 6 8 0 0 1 2 6 0 7 9 3 0 7 6 8 10 1 2 5 2 1 4 4 0 6 10 3 1 5 6 6 5 10 9 0 1 10 1 0 9 8 2 0 5 9 5 1 2 0 3 8 6 6 3 9 8 9 5 10 2 1 1 0 1 9 6 3 2 8 1 0 5 6 6 2 9 7 3 6 9 10 10 6 2 3 9 0 3 7 10 1 2 9 10 8 3 4 3 0 6 7 2 1 4 3 2 3 5 6 10 5 0 9 1 9 1 4 6 3 3 4 3 3 5 5 10 3 1 2 6 9 9 5 2 9 5 4 5 0 3 9 9 1 8 6 0 10 6 2 8 6 8 2 8 2 8 8 10 9 1 2 4 5 4 1 8 1 5 7 4 0 9 5 3 8 1 7 6 4 2 0 4 6 2 1 9 6 9 4 3 0 4 10 7 3 0 2 2 4 3 3 4 0 5 8 8 0 8 10 9 1 4 8 7 1 4 3 10 3 4 0 4 9 6 0 9 10 5 2 1 4 8 [...]
+6 8 1 1 8 7 6 6 10 6 4 9 4 10 7 10 9 1 1 3 10 0 10 6 2 5 1 5 0 7 3 7 10 7 9 5 1 6 5 2 0 3 10 3 0 10 0 5 6 10 6 0 6 5 8 7 1 5 4 8 1 8 3 4 10 7 2 3 10 9 9 9 2 4 9 5 6 7 1 8 8 1 10 1 6 7 5 10 2 8 8 2 7 4 7 1 8 3 6 4 8 8 7 2 9 3 8 6 8 4 3 8 8 6 8 8 8 9 5 9 9 10 2 8 4 7 1 6 0 7 8 4 0 3 3 0 5 7 7 0 8 0 3 0 7 1 0 10 1 3 8 10 8 1 2 1 4 6 9 10 9 5 3 3 10 6 5 0 5 9 6 10 1 7 5 2 7 8 2 10 10 10 10 3 1 4 2 4 10 10 8 5 8 4 1 3 10 2 1 9 4 5 10 0 8 10 0 1 5 10 2 6 6 5 2 0 3 1 7 4 4 1 4 5 7 10 9 3 2 9 7  [...]
+1 3 9 2 8 9 7 8 1 9 3 7 3 3 7 8 6 10 10 3 6 3 5 10 6 8 8 0 4 2 3 3 8 9 3 3 0 6 0 9 10 3 3 0 3 9 9 10 7 1 5 2 1 5 0 10 2 9 1 4 9 6 9 3 9 8 4 7 6 4 1 1 8 9 2 9 6 0 2 9 7 1 1 6 2 2 8 7 9 3 2 0 4 3 2 5 6 6 6 10 1 8 8 7 3 9 9 5 4 1 3 0 10 5 7 8 0 3 3 1 4 7 8 10 6 9 3 7 3 6 8 10 9 10 6 9 4 2 10 6 6 7 9 3 3 8 4 4 4 8 5 7 6 7 4 5 3 4 8 0 6 8 2 5 3 10 7 5 6 10 10 10 10 9 5 0 1 2 9 6 9 7 7 2 7 2 10 10 2 0 0 2 0 4 8 2 1 10 4 3 8 2 5 4 6 7 4 6 0 5 7 10 0 8 2 5 0 3 7 4 8 4 8 6 0 4 9 5 0 7 8 10 6 3 2  [...]
+0 10 7 9 6 9 9 0 1 10 0 10 8 7 8 6 1 9 8 3 3 2 0 2 3 4 2 10 6 7 5 10 3 7 8 10 7 5 2 2 5 8 6 3 6 7 7 3 6 9 3 4 4 1 8 7 3 9 6 6 8 6 3 8 6 5 5 9 6 6 1 0 1 5 9 7 10 8 1 6 1 2 5 7 10 3 9 7 7 0 6 7 10 6 9 8 0 3 3 3 7 5 4 5 4 7 5 3 1 10 10 0 3 1 7 3 9 2 9 1 0 7 8 6 6 1 4 10 9 10 6 3 10 1 7 5 10 8 9 7 2 4 7 9 0 4 5 3 9 10 3 9 9 5 4 4 3 10 2 8 8 1 0 3 4 4 10 7 4 1 8 10 4 4 2 3 5 3 10 6 3 7 9 9 2 8 5 10 1 2 10 10 0 0 0 6 4 9 7 8 3 5 5 5 6 2 8 6 7 0 5 9 3 1 2 7 5 2 4 3 2 7 6 7 3 1 4 10 4 1 9 7 7 10 [...]
+5 5 9 4 7 5 1 8 7 8 1 10 3 3 4 8 1 10 5 1 1 6 3 7 1 4 10 1 7 2 2 7 9 3 3 7 5 3 7 5 10 9 7 3 8 6 7 5 9 5 7 9 6 4 9 0 5 7 9 8 8 8 4 6 1 1 5 3 2 6 1 3 0 6 6 10 2 1 5 9 2 7 4 3 3 6 10 1 8 1 0 3 8 6 4 5 0 1 10 7 1 5 7 0 5 2 6 2 2 7 9 10 1 0 10 7 4 8 9 2 0 3 8 6 9 10 2 5 1 3 4 6 1 0 0 7 9 4 5 6 1 0 7 2 1 7 2 3 6 10 1 2 8 8 6 6 2 6 0 6 6 2 8 0 5 6 10 0 1 2 5 9 4 3 0 1 8 4 4 6 2 5 1 5 2 1 7 10 6 6 1 3 5 3 1 9 1 0 8 10 2 3 7 5 6 4 10 1 1 6 8 5 10 8 1 1 6 10 8 9 5 7 8 8 8 1 8 3 0 3 10 2 4 10 2 8 1 [...]
+5 3 10 2 7 0 3 2 9 1 2 8 9 5 2 1 3 1 0 7 5 0 9 8 3 1 7 1 10 2 7 8 0 6 8 3 6 0 10 8 8 1 3 5 3 3 5 4 3 2 9 4 0 4 7 4 1 3 8 6 6 3 7 4 2 6 3 6 5 7 10 6 5 7 0 8 1 1 4 10 5 7 10 9 9 7 5 8 6 1 9 1 8 4 2 6 3 2 10 2 6 4 6 0 1 0 5 7 5 10 4 0 2 8 7 10 8 7 3 8 10 0 3 9 1 9 10 5 1 8 8 10 5 10 5 8 2 10 1 0 5 7 0 4 2 0 3 8 9 0 4 1 1 0 5 10 6 1 4 2 7 3 8 5 1 7 1 4 2 3 3 3 1 2 6 10 10 3 0 5 7 3 0 5 3 9 8 9 2 3 6 0 7 0 8 5 10 4 6 6 5 5 10 8 7 4 5 2 9 9 2 9 7 3 1 0 2 7 4 4 8 1 7 10 7 6 7 10 8 0 9 10 6 5 7  [...]
+3 8 0 2 8 9 4 2 2 0 8 5 8 1 8 8 8 6 8 3 2 10 3 1 10 6 3 0 6 7 7 2 8 2 10 8 8 1 4 5 10 10 4 5 0 1 5 6 0 2 10 2 0 1 2 5 7 10 0 2 7 0 1 10 10 9 4 0 2 0 10 9 6 1 2 1 5 4 8 2 8 6 9 2 8 3 8 3 3 2 7 10 7 8 6 7 8 0 8 8 3 1 5 10 3 0 8 7 2 1 4 2 10 8 9 0 3 8 0 1 4 7 5 8 1 4 8 5 8 8 4 4 0 10 3 1 1 5 0 8 10 1 1 8 4 7 2 8 7 10 3 1 9 6 9 4 7 1 10 8 4 4 9 10 7 10 0 1 3 9 2 2 7 6 3 8 4 2 3 1 2 7 4 5 3 10 1 7 8 5 1 1 6 5 2 8 5 0 0 10 6 1 4 7 8 1 0 10 6 10 6 8 10 9 9 0 0 5 1 1 5 6 5 2 10 10 6 5 9 1 6 4 0  [...]
+5 1 0 3 5 6 1 9 9 0 9 6 10 3 2 9 2 7 7 1 7 2 2 2 5 7 9 9 8 9 7 4 4 2 2 5 8 1 9 2 7 8 4 6 1 3 4 1 5 7 1 5 0 7 6 9 6 5 3 0 5 3 2 6 9 10 4 7 1 0 6 3 6 8 9 6 9 10 2 10 8 10 0 2 5 3 0 9 1 8 10 6 1 2 5 7 8 9 8 1 2 7 9 0 1 6 1 6 5 7 2 9 10 9 10 9 1 7 2 7 4 1 2 8 7 10 9 4 5 9 7 9 4 4 6 4 5 0 3 8 10 10 8 2 0 10 4 2 9 6 0 7 0 10 1 9 0 10 1 1 2 10 4 4 6 5 0 6 6 1 7 7 5 10 9 4 2 3 9 2 0 3 5 4 5 10 8 10 7 10 8 4 5 2 9 2 6 9 10 0 0 3 0 6 6 6 2 4 6 5 2 10 0 10 5 4 7 0 5 3 5 5 8 2 7 1 2 8 7 4 1 0 8 1 8  [...]
+0 10 8 6 9 7 5 6 7 4 5 7 6 9 4 10 2 6 9 5 0 6 9 4 2 10 5 6 9 0 2 1 0 0 10 6 1 4 6 9 1 9 10 5 3 8 5 7 4 1 8 4 3 10 6 10 10 4 6 1 4 10 3 2 7 8 4 7 3 3 9 7 2 8 7 1 2 5 8 8 0 7 3 4 3 4 7 8 2 7 3 2 7 5 5 5 2 2 3 8 4 6 3 1 3 2 7 2 2 5 0 6 9 8 2 1 3 10 2 5 2 1 0 6 7 4 6 10 3 3 1 7 9 8 0 8 2 1 4 6 6 2 3 9 8 7 6 8 3 0 5 9 9 0 7 3 8 6 10 10 4 7 3 8 9 0 8 3 1 9 7 7 1 8 5 5 3 2 7 8 0 6 4 8 7 1 2 9 5 6 3 6 5 9 8 10 5 1 5 1 9 10 2 1 2 2 2 5 4 9 4 10 3 2 1 9 10 4 1 4 3 9 1 8 3 9 6 1 4 8 5 0 3 0 5 5 5 0 [...]
+3 3 2 4 3 7 0 10 3 7 6 8 1 7 4 8 4 9 0 2 4 6 8 2 1 9 4 0 3 8 8 2 4 5 0 4 8 4 5 0 7 5 3 1 3 3 3 2 4 1 9 10 4 1 6 0 2 7 6 6 0 8 7 10 6 10 0 5 8 4 7 1 9 1 1 2 7 7 6 10 7 3 7 3 8 8 0 6 2 5 1 1 1 9 4 9 1 9 7 8 2 8 4 4 0 7 1 0 2 2 6 2 3 6 0 0 9 0 3 1 3 2 0 9 0 5 6 3 10 5 1 9 7 2 4 9 3 6 6 9 1 5 6 3 9 5 5 4 1 2 0 5 7 9 2 10 5 6 9 0 3 0 10 8 4 0 9 10 1 5 5 4 6 10 2 7 9 0 10 1 3 4 8 3 10 7 7 4 3 10 5 6 9 5 6 6 0 7 1 6 2 6 2 2 5 4 7 6 3 8 9 6 7 0 4 8 1 3 4 1 6 6 0 5 4 8 10 0 7 10 9 9 7 4 9 4 9 0 0 [...]
+9 4 4 7 9 5 0 0 0 8 7 7 9 9 2 8 10 5 3 10 8 9 4 3 5 3 7 10 9 0 10 8 4 3 9 3 1 0 3 4 6 4 8 1 3 7 0 0 7 6 4 7 0 9 10 8 8 10 2 3 0 2 0 2 3 4 8 2 0 6 5 2 7 10 1 5 4 10 9 0 3 7 4 5 4 9 1 7 7 3 10 7 5 4 8 5 4 6 2 10 6 7 7 0 3 7 1 4 3 5 0 0 8 0 6 9 10 4 4 7 10 9 0 6 1 4 9 7 8 10 7 9 6 9 2 8 1 10 10 7 0 4 10 7 6 1 1 0 5 5 9 2 4 5 10 9 9 0 2 0 0 3 5 3 7 7 3 7 9 8 6 6 7 3 3 6 3 10 4 2 2 5 2 10 0 10 8 10 10 6 5 3 4 3 7 2 4 10 6 0 7 7 8 0 6 1 3 2 6 9 7 6 6 1 7 8 4 10 6 5 4 4 1 2 9 1 4 3 4 6 1 5 3 8  [...]
+2 6 5 9 1 10 3 4 0 2 8 7 8 10 6 1 1 3 10 6 4 10 4 10 4 9 4 5 5 0 7 7 10 7 2 3 3 4 8 10 5 10 3 3 2 2 6 1 7 0 6 3 6 3 5 4 6 1 0 0 1 0 4 9 9 10 1 0 6 0 1 1 3 1 6 7 3 9 2 2 4 3 5 8 0 10 10 1 9 1 5 10 7 10 10 5 5 5 10 8 7 7 10 1 1 9 10 0 0 1 3 1 1 6 5 4 6 2 8 10 2 0 8 4 8 10 8 7 6 4 7 5 0 10 7 1 0 0 5 10 9 10 0 2 7 5 6 8 0 9 9 10 8 9 10 1 3 0 1 1 7 10 9 10 2 5 8 2 9 8 9 9 5 0 0 0 1 6 5 1 1 1 2 8 7 4 4 8 6 5 7 9 8 0 8 10 4 10 4 3 4 0 5 6 0 4 2 6 5 3 7 2 0 6 6 6 6 0 0 4 8 6 8 0 8 10 3 7 10 1 8  [...]
+6 9 7 5 4 10 8 5 5 10 8 9 8 6 3 10 9 0 5 9 9 8 3 2 0 2 10 4 7 10 4 4 2 6 1 0 2 7 7 10 5 9 0 2 1 9 1 1 4 1 3 3 4 8 9 4 4 6 3 10 0 4 9 3 0 2 8 3 2 9 5 8 6 6 2 10 3 9 8 5 3 8 4 0 10 7 2 8 4 0 0 6 0 4 0 6 7 2 7 4 9 8 0 8 9 9 5 3 6 7 6 4 9 1 0 8 10 1 1 3 1 7 0 8 9 5 8 10 4 4 4 8 1 8 1 6 7 3 4 1 0 10 7 2 1 9 5 3 0 0 8 10 9 9 2 8 0 1 9 2 0 5 1 3 1 0 3 7 1 1 1 4 6 10 5 7 10 10 7 3 7 3 1 5 10 8 5 1 0 1 7 4 1 7 7 4 6 5 8 6 6 7 8 8 8 0 7 5 3 4 10 10 0 7 5 8 8 1 2 9 6 1 2 1 3 4 10 0 8 1 2 1 5 8 1 9  [...]
+2 9 3 6 3 0 4 1 7 10 8 1 9 7 2 6 0 10 9 0 6 7 9 4 6 8 0 10 10 1 1 4 4 6 4 4 2 2 1 6 1 0 2 1 10 0 0 6 4 3 3 3 6 1 6 2 8 1 7 5 6 5 2 9 1 10 9 4 0 6 6 3 9 7 2 3 6 7 3 3 4 8 9 10 7 7 1 5 10 8 5 8 1 3 3 6 8 4 2 3 5 4 10 6 9 0 8 6 6 7 6 1 9 4 10 2 8 4 3 4 5 2 10 9 7 4 8 1 0 4 4 2 6 6 8 9 6 1 6 10 8 3 9 7 4 2 1 8 0 1 3 7 8 2 9 10 0 4 7 4 1 10 4 2 0 0 6 0 1 1 3 2 1 6 1 2 4 1 8 0 6 2 6 6 3 5 6 4 2 7 6 6 10 1 10 8 6 2 8 8 0 6 6 3 4 3 5 2 10 6 8 5 5 2 0 5 10 3 8 5 4 7 9 1 3 6 0 10 6 7 2 7 9 2 3 6 4 [...]
+1 9 8 9 1 5 8 3 4 8 9 2 7 3 5 8 5 4 0 8 3 6 3 2 2 10 5 5 7 3 3 3 6 5 4 1 2 6 5 8 4 4 6 4 4 3 3 3 2 4 6 10 8 7 3 2 8 1 7 8 7 9 1 2 10 5 2 1 3 3 8 5 10 5 8 9 10 5 8 0 7 0 1 10 2 1 6 7 9 7 1 5 2 6 10 6 9 10 9 2 4 8 9 4 9 7 7 3 7 3 9 4 9 9 10 2 4 8 4 6 4 8 3 8 6 3 3 0 8 7 6 1 8 9 5 8 4 10 10 3 10 10 4 5 0 3 6 7 6 0 2 6 2 3 0 7 10 5 8 5 9 10 6 6 3 8 3 10 9 1 4 0 1 1 6 6 5 0 5 9 2 7 2 4 8 10 6 9 5 8 8 9 7 4 9 10 5 5 9 2 3 4 3 0 10 5 0 5 9 0 7 2 7 1 7 8 3 0 5 8 1 7 0 10 0 3 3 1 3 3 2 6 9 2 4 1  [...]
+10 10 0 5 0 8 9 5 8 4 10 2 2 9 10 10 2 3 7 10 4 1 5 8 8 5 4 3 3 2 5 3 7 5 5 7 7 8 4 6 6 2 9 1 5 10 4 8 6 4 8 9 7 3 0 3 5 10 5 2 6 8 2 6 8 10 4 1 3 4 7 7 10 10 7 7 1 8 3 9 8 6 7 1 8 5 7 3 6 10 10 3 1 10 7 10 3 10 5 8 5 3 5 10 0 8 3 5 0 0 9 5 10 4 0 7 4 3 7 3 9 4 9 6 3 2 3 4 2 9 6 4 6 9 7 8 9 0 7 5 7 5 9 1 1 6 10 8 1 10 4 3 8 7 10 1 2 10 7 7 5 6 9 9 3 8 8 8 9 10 4 2 0 9 2 1 8 0 3 10 2 7 1 7 5 10 1 5 8 7 0 10 10 9 3 2 4 4 10 10 4 8 9 10 3 10 2 8 6 4 1 8 8 10 1 6 3 9 0 7 8 10 10 7 5 4 5 1 6  [...]
+3 0 9 0 3 10 3 7 3 2 1 7 2 2 8 7 10 6 3 0 4 9 1 9 5 9 7 1 9 1 1 1 2 7 9 4 7 5 5 3 9 9 7 4 9 8 6 10 1 6 4 6 2 2 9 2 1 4 6 5 5 6 0 1 3 9 5 4 0 9 1 8 6 2 9 7 8 6 3 1 2 2 7 9 4 10 9 6 4 10 1 1 9 7 5 7 6 3 2 2 2 9 2 0 5 3 7 7 10 6 8 5 9 9 4 4 1 10 4 3 5 4 2 7 10 8 0 1 3 9 7 6 1 2 9 2 9 10 9 3 7 3 3 7 0 7 9 3 10 8 10 7 1 4 2 10 4 2 2 10 0 2 2 7 5 3 6 7 7 6 3 6 4 4 0 0 4 2 3 10 2 6 7 3 5 7 6 3 6 9 0 8 1 3 3 2 10 5 10 2 3 8 4 6 1 0 3 2 5 10 10 7 1 8 9 6 9 8 9 1 5 10 10 5 3 8 6 10 4 0 9 9 7 5 1 1 [...]
+4 7 8 0 9 4 9 10 1 3 8 9 5 8 4 5 0 10 4 4 4 4 7 10 1 6 8 3 4 8 7 2 9 9 8 0 6 10 9 0 7 9 7 4 7 8 2 0 1 2 4 2 3 8 7 7 2 8 6 3 10 1 0 8 5 6 2 5 5 2 7 1 2 8 3 0 1 1 2 8 7 5 3 1 0 8 7 10 10 7 3 0 0 7 0 7 0 2 9 3 8 3 1 2 2 6 7 0 6 9 3 5 5 3 3 9 6 8 7 3 5 6 10 1 7 4 10 4 0 10 9 9 8 9 3 3 8 6 4 3 8 0 7 7 1 7 1 1 4 6 8 7 9 4 8 6 0 3 4 0 0 0 2 4 5 8 3 8 0 4 9 10 1 6 0 4 2 0 1 9 1 5 9 9 7 7 6 3 4 4 10 9 3 7 10 8 2 4 9 6 5 7 5 4 10 7 0 2 7 10 5 0 7 8 6 10 4 3 7 8 8 9 7 7 1 5 3 1 7 4 10 3 2 10 6 1 3  [...]
+5 2 4 10 9 1 2 1 4 7 6 0 6 6 10 4 5 8 2 4 10 3 0 5 6 3 10 4 7 0 4 1 9 10 3 5 7 9 1 1 0 9 5 2 3 8 0 8 0 0 0 10 9 5 4 6 2 4 2 5 5 3 2 0 0 8 10 0 1 9 1 7 3 8 2 6 0 9 7 10 3 1 10 5 0 5 1 10 1 3 4 5 9 6 6 4 4 9 2 7 3 3 10 10 2 8 6 0 0 10 7 2 9 1 1 6 5 0 9 1 8 5 0 5 7 4 2 8 9 9 4 4 7 4 3 7 8 0 4 0 4 5 1 8 2 3 2 2 4 4 9 7 0 4 10 10 9 10 1 4 7 9 0 1 3 6 6 6 9 3 9 1 5 0 9 3 4 10 10 3 10 8 0 0 1 4 1 7 3 4 2 8 10 6 8 6 0 6 4 8 5 10 8 8 4 9 10 10 5 2 3 0 3 3 2 5 1 10 10 3 4 1 0 1 9 6 10 4 4 4 9 10 6 [...]
+2 9 5 8 6 6 9 1 2 0 6 9 4 4 6 1 2 2 3 0 5 2 5 2 0 0 2 10 5 6 2 10 10 6 0 0 3 10 8 7 9 9 0 5 2 3 0 8 0 10 1 9 8 1 8 0 2 9 0 5 5 6 7 9 7 8 3 2 5 9 6 0 9 2 0 7 3 1 10 6 2 1 7 10 6 8 5 3 1 0 5 1 7 5 3 1 9 4 5 7 10 1 4 8 2 8 9 9 3 3 10 6 6 6 4 9 1 6 2 0 4 8 10 8 4 1 0 8 4 8 8 7 9 7 8 3 8 3 4 9 5 9 8 9 3 7 0 7 1 10 6 7 4 1 4 8 5 8 5 1 5 3 1 9 8 2 9 7 2 2 9 2 10 2 2 10 10 5 0 10 8 10 2 2 7 5 3 2 5 1 2 1 0 5 5 7 3 0 0 4 1 9 9 1 7 9 9 1 3 7 1 4 3 7 9 0 0 2 7 8 4 8 3 1 6 8 2 8 1 3 4 7 0 7 2 7 2 4  [...]
+9 6 4 8 9 2 7 4 7 8 9 8 2 4 3 2 6 7 0 9 5 10 7 1 2 1 3 4 2 4 1 8 10 4 0 2 2 4 3 0 7 8 1 3 10 0 1 5 8 8 0 7 0 1 2 4 8 1 2 9 8 2 8 8 1 10 3 8 8 3 7 0 0 7 4 10 8 8 7 1 7 1 5 7 0 1 2 1 9 0 9 7 0 1 1 7 8 3 7 8 6 8 10 6 10 9 6 7 10 2 4 5 5 4 9 2 1 3 3 6 10 10 6 3 0 9 8 8 8 0 5 5 1 9 0 9 0 5 1 2 6 6 7 4 6 6 7 0 8 10 0 10 10 5 8 5 10 4 6 0 9 7 7 9 2 2 1 8 8 0 6 1 7 8 7 0 4 8 0 3 0 2 8 4 6 0 10 2 5 1 5 4 8 3 5 4 8 3 7 0 8 0 5 1 6 9 8 7 2 10 5 4 0 5 6 5 8 7 9 5 4 8 1 2 0 1 0 6 5 1 1 0 6 8 6 1 7 9  [...]
+1 7 2 8 10 7 9 9 1 3 9 8 6 10 3 5 3 7 9 5 1 6 4 9 8 9 6 3 3 10 7 6 1 10 6 3 6 4 2 2 5 3 4 4 4 9 7 4 9 5 7 5 7 3 5 2 0 6 6 0 6 2 9 6 10 8 5 9 4 10 0 5 3 4 3 6 6 4 8 2 5 9 4 10 4 3 6 10 7 7 6 5 1 2 8 3 1 1 10 10 7 4 0 9 6 7 4 10 3 10 4 3 6 4 6 6 7 1 1 4 8 9 7 9 9 4 6 8 1 3 2 10 7 7 0 2 8 10 6 8 4 6 0 4 0 5 9 8 5 1 4 5 1 2 1 3 4 6 6 6 8 8 7 9 4 9 8 6 9 3 10 3 7 2 4 1 7 3 8 1 8 10 9 4 8 5 8 7 8 5 0 5 0 6 6 0 2 0 2 0 8 7 8 8 4 4 6 5 10 7 3 9 2 2 3 10 1 9 9 6 9 6 10 5 9 7 8 3 9 8 6 6 2 6 1 6 6 [...]
+1 2 5 1 4 4 5 3 0 8 5 1 10 8 8 9 7 4 10 6 4 8 1 10 1 5 0 7 2 5 3 1 4 6 8 6 8 8 1 1 0 4 10 4 8 0 9 0 0 8 2 8 5 3 10 4 3 5 7 8 1 5 3 10 3 6 7 1 5 9 8 9 7 7 2 9 4 2 6 5 5 4 8 10 0 3 10 7 5 0 6 3 1 8 3 3 5 3 6 7 9 1 10 10 0 9 5 7 6 9 8 10 1 6 8 9 10 5 7 8 9 7 1 7 0 3 9 9 0 3 4 1 1 3 0 5 1 3 2 10 1 2 10 1 8 4 9 0 1 0 0 7 9 4 1 3 7 7 9 8 5 7 6 9 5 6 3 3 3 1 4 10 10 1 3 5 2 8 1 6 3 3 4 8 3 8 4 5 4 9 10 4 6 7 10 9 3 0 0 3 5 3 2 1 0 5 0 10 10 0 3 5 2 8 9 10 3 4 8 6 9 5 10 7 10 9 10 2 8 1 10 10 9  [...]
+3 8 9 6 4 0 10 4 5 8 2 10 2 5 4 9 2 8 1 9 5 8 6 2 0 5 7 8 3 7 4 4 4 9 6 10 2 6 6 10 10 6 3 1 7 0 4 9 8 2 10 6 9 8 0 6 5 7 1 6 5 5 10 1 1 7 2 10 9 0 8 8 10 1 5 8 5 6 7 7 5 9 5 5 6 6 6 10 7 7 4 1 7 3 3 8 2 3 0 9 6 3 0 2 2 2 2 0 3 9 10 4 9 10 8 2 0 7 3 3 4 8 7 8 3 6 1 6 10 6 6 6 7 6 0 9 4 6 5 2 1 7 8 9 4 3 8 6 1 5 7 5 5 5 1 9 8 3 2 7 4 8 10 2 10 10 9 9 2 1 3 7 7 5 5 9 1 0 0 3 6 10 3 6 2 5 2 3 8 8 1 7 7 4 1 6 2 8 5 0 10 6 0 0 3 3 10 1 1 10 0 6 1 8 6 8 10 7 0 6 7 0 6 10 3 6 7 0 8 1 4 9 1 0 9  [...]
+9 5 3 5 4 9 5 8 1 9 4 5 10 3 10 7 4 5 9 7 8 9 0 5 9 3 10 10 1 3 8 4 10 4 5 1 8 7 5 0 10 8 3 2 8 7 10 6 10 9 6 1 3 7 8 7 10 1 2 2 10 5 0 1 6 8 6 2 8 1 8 6 1 8 6 0 0 3 8 8 2 1 4 5 7 8 9 3 6 0 4 9 8 1 5 2 8 9 3 4 6 7 6 8 3 8 6 5 0 3 0 3 1 10 1 2 7 6 1 9 9 1 6 8 10 5 4 10 7 4 6 0 3 7 7 0 6 0 6 5 6 9 0 9 9 7 9 6 2 7 0 5 1 5 1 9 4 8 4 1 6 1 5 1 5 0 10 3 6 0 8 8 8 4 8 4 1 5 7 3 6 4 8 4 6 5 4 0 9 1 2 5 8 4 1 3 0 6 0 3 7 3 3 0 8 9 4 8 9 6 1 4 8 10 1 9 7 4 2 10 10 0 0 5 5 10 4 9 0 2 10 3 9 10 2 6  [...]
+9 6 3 2 8 8 6 9 2 2 3 6 6 9 1 1 6 4 1 7 7 10 9 5 5 6 1 9 7 7 7 5 0 2 0 3 6 6 6 5 9 8 1 8 1 9 4 3 6 0 7 2 3 10 7 1 7 4 2 0 5 6 1 6 7 9 10 5 8 9 2 9 2 3 6 3 1 2 0 9 2 9 6 5 2 4 10 10 3 2 9 4 3 1 3 3 9 7 4 10 8 9 6 0 3 2 9 0 8 5 0 5 1 2 0 9 0 5 0 10 10 0 6 9 9 10 5 10 5 8 9 10 5 5 5 2 5 10 8 10 6 2 5 5 8 4 7 3 0 8 0 5 3 6 8 10 9 7 4 10 9 4 8 4 9 6 1 10 9 10 3 5 7 7 2 3 1 0 8 1 3 1 4 4 8 10 6 3 7 6 9 10 5 9 10 5 9 7 5 5 9 4 10 10 10 4 6 7 9 10 5 8 10 7 1 1 0 7 7 0 3 3 0 8 4 8 7 5 5 2 8 10 8  [...]
+9 9 0 4 6 7 1 5 4 2 1 5 4 7 8 4 5 0 2 10 0 9 0 10 7 5 5 4 2 9 3 7 7 8 8 8 8 9 7 7 6 0 1 3 10 0 1 7 1 10 5 10 1 2 8 9 3 6 5 1 0 8 9 8 2 7 6 6 2 3 6 4 9 6 6 3 10 10 8 8 3 7 2 10 7 2 9 5 2 9 2 5 7 10 6 0 3 4 5 7 8 8 9 4 4 2 10 6 8 8 0 5 2 6 3 0 5 7 1 10 10 9 1 7 6 6 5 7 4 5 5 9 9 8 10 0 10 6 0 3 7 6 8 2 7 5 2 10 5 3 4 9 5 7 7 3 1 3 4 1 10 4 4 9 4 2 5 9 0 1 8 9 10 9 9 5 10 1 7 4 2 3 7 6 6 10 6 0 0 4 1 0 3 3 7 7 4 7 7 0 0 10 0 2 2 2 0 7 2 9 7 0 5 5 8 5 5 6 4 10 0 8 0 5 0 7 10 4 10 7 7 5 7 8 0 [...]
+1 8 6 2 7 7 10 3 7 8 10 10 2 10 4 0 2 3 2 5 8 8 7 5 2 6 7 9 4 2 8 1 10 0 9 1 9 3 8 9 0 3 2 5 3 6 1 4 8 6 3 9 1 4 8 4 10 9 4 7 3 6 10 5 9 7 7 4 5 6 4 9 10 4 10 3 7 2 8 1 8 8 10 8 4 3 2 8 2 3 7 7 8 10 7 9 4 10 7 7 3 8 3 0 5 8 9 7 3 6 6 7 6 2 8 5 2 2 5 9 9 0 3 6 4 5 0 8 7 1 8 8 6 10 1 0 10 2 3 0 1 3 2 5 5 1 5 9 4 7 0 2 2 5 3 2 6 3 10 1 6 6 7 8 5 3 6 0 3 0 10 9 2 4 8 2 3 6 6 2 7 5 10 2 10 6 2 8 10 8 1 3 6 2 2 10 1 0 5 9 6 3 7 10 3 1 9 9 0 10 1 0 10 6 1 0 6 2 10 3 0 1 8 1 6 6 6 3 3 8 6 1 2 7  [...]
+2 2 7 6 0 4 1 8 9 4 7 6 8 4 0 2 6 1 2 8 3 6 1 6 9 6 4 6 6 2 10 3 10 2 9 10 5 8 10 6 5 4 10 4 10 4 3 8 2 1 0 3 2 6 8 9 9 10 10 6 6 7 1 8 2 6 2 4 6 8 5 4 8 8 1 7 1 6 3 2 4 2 6 7 9 5 4 5 2 8 3 10 6 7 9 6 6 8 8 3 5 9 2 7 7 5 3 5 3 8 3 6 3 7 1 7 8 4 9 7 9 10 1 6 2 10 7 0 9 1 9 4 0 7 6 10 7 10 7 10 1 3 9 2 9 4 10 10 9 1 8 5 9 10 7 9 1 2 5 10 1 0 7 5 2 6 7 7 9 3 1 10 10 6 4 3 6 7 3 1 9 8 7 9 7 10 6 8 8 5 1 1 0 6 3 10 3 3 9 4 3 3 6 5 2 7 7 8 3 3 4 5 8 8 8 2 9 8 0 10 2 1 0 8 2 8 10 2 5 1 10 9 10  [...]
+3 8 1 7 6 8 5 6 9 2 2 8 5 8 4 1 8 6 10 2 7 1 4 8 0 10 5 3 3 4 10 0 10 7 0 8 5 7 9 6 3 10 0 10 3 7 3 7 10 6 7 7 4 6 3 2 6 10 3 9 4 0 8 2 9 3 5 2 3 5 1 6 3 8 8 5 5 4 1 7 6 8 7 7 8 4 8 3 9 3 3 1 8 4 1 3 7 9 8 8 9 1 2 4 3 3 7 8 0 9 5 0 5 3 5 3 8 5 2 5 1 6 9 2 2 10 7 8 1 3 6 1 6 5 6 0 5 1 6 5 5 2 0 7 9 6 0 9 6 3 8 7 2 8 6 1 9 3 10 3 6 6 4 9 0 2 0 4 7 8 3 8 3 10 1 9 2 3 0 3 4 3 8 6 6 3 9 8 3 3 1 7 6 1 8 1 4 3 5 5 6 6 7 2 0 5 7 7 8 3 3 7 1 8 9 4 9 8 6 9 6 8 1 7 10 0 10 9 10 5 9 10 9 8 2 8 0 8 0 [...]
+4 2 9 5 5 7 3 4 7 6 5 10 4 3 4 2 2 10 9 7 4 9 4 3 10 7 9 4 10 5 0 2 7 6 3 6 3 0 10 7 3 1 10 7 8 0 7 5 10 5 5 9 7 4 10 7 10 1 0 2 8 2 3 2 10 10 2 8 7 8 9 7 4 7 8 4 7 6 5 9 3 4 7 10 0 10 9 5 2 1 10 4 10 0 10 0 6 6 9 3 2 2 6 4 9 10 10 7 6 6 3 9 2 10 1 2 0 5 9 7 5 7 8 8 8 1 7 5 10 7 6 2 2 8 10 5 1 0 4 2 5 10 7 10 1 5 6 6 9 2 1 5 10 9 0 2 1 5 9 4 7 3 0 4 10 8 1 6 8 8 5 10 1 7 4 7 5 5 5 4 8 0 4 7 10 1 0 9 10 5 1 1 9 10 5 5 9 0 10 2 4 3 1 8 2 6 7 3 9 6 0 5 9 9 2 9 4 3 10 10 6 8 8 2 9 3 0 9 8 10 [...]
+9 5 1 1 4 0 10 10 5 8 8 10 0 4 5 9 1 9 10 0 7 10 0 7 7 3 1 4 9 1 4 4 5 7 0 6 1 8 9 9 5 7 1 8 3 0 1 2 6 9 10 4 8 3 4 1 3 2 3 6 6 4 7 5 4 5 10 3 10 0 6 6 3 2 7 5 0 0 10 0 1 4 0 1 8 6 4 10 0 2 10 8 6 9 10 2 8 0 4 0 7 4 4 7 3 10 4 9 9 0 5 5 4 9 8 8 2 10 5 1 4 9 4 1 1 6 8 1 0 9 0 4 8 10 6 5 8 4 2 1 6 4 10 4 5 4 1 8 5 10 9 0 1 9 6 4 9 6 5 4 8 0 7 9 1 4 7 1 3 7 8 5 8 9 4 2 5 6 2 1 7 7 7 7 4 1 9 6 3 1 2 8 6 1 8 10 8 9 7 1 0 9 3 10 6 1 6 0 2 2 0 5 2 7 2 8 4 4 10 3 0 0 1 2 9 6 8 4 9 2 2 1 7 0 9 9  [...]
+1 1 3 3 2 0 9 2 6 0 8 5 3 3 0 5 1 6 5 10 8 1 4 8 7 0 4 6 9 4 7 4 4 6 5 7 9 7 4 9 8 5 10 8 6 1 7 10 8 4 2 8 6 4 1 10 9 2 4 5 9 3 1 1 4 9 5 0 7 0 3 7 0 9 3 4 5 1 3 1 3 0 3 1 9 9 9 0 5 2 2 2 10 6 0 2 3 3 4 5 7 3 9 3 6 7 8 4 10 3 0 0 4 9 7 0 6 0 6 0 8 0 5 4 6 5 9 10 7 0 2 10 8 5 8 9 6 1 8 4 7 5 0 1 8 2 9 7 2 3 5 3 8 10 2 5 1 1 1 5 4 1 4 10 2 8 7 7 8 7 1 7 10 1 7 1 8 0 4 2 4 2 3 5 10 6 9 4 1 9 4 10 5 10 4 2 5 0 4 2 6 5 1 9 0 5 10 2 1 4 4 7 9 6 5 6 7 2 6 7 9 0 6 3 7 6 3 2 5 8 9 9 8 8 4 6 4 3 3 [...]
+5 9 2 8 10 5 8 9 1 7 3 2 10 10 4 7 6 10 7 0 2 5 3 6 6 8 6 1 10 7 2 2 9 1 2 4 8 10 10 3 0 7 3 8 3 2 0 6 4 3 3 8 0 9 6 2 6 7 2 0 8 10 6 6 1 1 6 2 7 7 2 2 6 3 9 7 2 0 0 6 2 7 7 9 9 0 7 8 3 6 8 8 9 7 9 2 8 2 2 8 5 3 10 10 7 2 6 4 7 5 9 1 4 3 9 3 6 6 6 10 2 5 10 2 5 1 6 4 7 8 8 7 5 7 1 10 1 9 4 9 0 7 0 9 0 2 4 3 7 9 4 8 9 9 1 4 4 9 2 0 2 5 4 10 1 10 5 3 9 1 5 8 9 3 5 4 5 0 7 1 0 4 10 5 9 8 0 9 10 7 0 8 2 4 4 2 2 3 5 2 5 7 7 4 5 9 5 5 5 10 4 8 8 2 4 5 8 2 9 9 8 7 6 5 7 6 6 8 9 10 5 10 9 0 8 9  [...]
+3 2 7 9 7 4 9 2 0 7 8 9 6 9 6 7 2 5 8 5 6 1 4 9 1 10 5 6 8 1 8 0 7 8 2 1 5 0 8 0 4 9 5 4 3 2 3 10 0 3 5 1 2 10 5 7 1 10 6 9 9 5 6 7 6 8 0 2 9 2 7 6 7 8 9 5 6 5 8 1 7 1 1 10 5 8 7 3 7 7 6 9 6 4 5 2 6 9 10 9 3 7 10 4 8 6 0 3 9 4 6 8 7 3 0 5 3 3 8 6 9 0 9 1 8 9 5 1 4 6 3 3 7 5 2 7 2 7 2 2 6 2 4 9 9 5 4 3 3 2 0 10 1 3 0 3 1 9 4 1 10 8 8 5 6 2 2 9 10 0 6 5 6 3 7 1 4 9 1 5 4 7 10 7 4 5 3 5 9 4 10 10 3 0 4 1 0 6 6 0 0 2 5 5 4 4 2 1 6 7 10 6 7 10 8 8 2 6 3 8 4 1 10 10 4 5 1 7 9 0 4 8 9 6 9 10 8  [...]
+8 7 5 0 4 3 7 10 6 10 9 10 4 1 0 2 6 9 9 6 5 7 10 4 9 5 8 4 10 4 8 3 4 8 3 9 8 6 0 6 8 7 8 5 5 0 8 1 8 2 8 7 1 8 7 2 3 8 4 1 5 2 2 9 10 7 0 7 6 10 2 2 2 6 5 3 2 7 7 6 2 0 4 0 3 6 3 5 7 0 0 9 7 10 7 0 5 0 6 7 5 8 7 9 4 4 3 6 0 9 4 7 8 2 5 5 4 4 9 5 5 9 6 10 10 0 9 2 4 1 2 8 0 10 6 1 4 5 5 2 2 6 9 6 8 6 6 4 3 8 5 9 2 4 0 1 0 4 2 4 3 10 5 0 6 9 8 3 8 6 0 3 0 9 5 7 5 9 7 4 2 4 3 10 10 3 1 3 5 10 7 7 5 5 0 0 8 0 2 6 3 5 1 0 9 7 9 5 6 3 5 7 9 5 9 3 9 0 9 4 3 1 2 8 1 5 7 7 8 7 0 3 5 7 0 7 2 7 9 [...]
+9 6 9 5 0 4 7 3 10 8 7 3 4 0 4 8 7 5 6 3 1 7 4 3 5 7 8 7 2 1 2 4 5 3 0 3 8 10 4 5 5 8 7 5 4 1 1 4 0 5 10 3 4 1 8 6 10 10 5 3 5 0 3 10 4 7 4 7 5 4 7 8 1 8 4 1 2 5 2 5 2 2 3 0 6 6 7 0 10 6 0 3 10 3 6 3 0 8 8 3 3 0 6 9 10 7 9 0 4 5 2 0 8 3 2 3 6 1 8 6 9 5 4 2 3 0 6 6 3 5 0 4 6 3 4 9 1 2 5 5 2 10 1 2 7 6 4 5 2 0 3 3 5 3 6 0 3 6 3 4 10 2 9 4 2 7 6 10 9 10 8 3 3 0 2 3 5 2 10 4 8 1 4 2 5 6 8 9 5 9 8 8 5 7 6 10 1 6 9 10 4 4 10 10 4 3 8 8 4 1 7 8 3 8 4 10 8 9 5 9 7 6 6 0 1 0 1 5 2 4 9 1 10 9 4 3  [...]
+6 3 7 4 1 8 0 1 0 3 7 10 2 8 6 3 7 1 3 4 6 0 5 5 8 3 4 6 8 5 7 2 4 0 4 2 0 6 4 8 0 6 0 5 6 2 8 2 3 2 5 2 1 6 1 4 1 1 2 7 1 9 1 8 2 5 6 0 7 0 4 0 4 4 1 10 7 10 1 7 5 2 5 1 7 7 3 0 6 9 10 3 2 10 9 9 3 5 8 6 8 9 6 0 3 5 4 9 5 8 6 6 10 10 7 7 10 0 5 0 1 9 7 10 6 2 7 3 9 5 8 10 7 10 0 6 8 0 0 9 6 5 4 10 5 10 2 6 10 0 6 5 2 7 10 3 3 8 2 6 3 1 7 4 5 10 2 7 5 10 2 4 3 9 7 2 8 6 3 5 9 10 8 9 5 6 9 3 7 10 3 5 5 3 10 10 1 4 8 2 1 2 10 9 0 0 9 5 3 10 10 9 9 3 2 1 6 0 9 7 2 8 0 5 5 3 3 1 9 1 7 5 1 7  [...]
+2 6 7 0 2 5 10 5 6 6 4 6 7 6 1 0 3 3 2 4 6 8 3 6 5 0 7 5 9 3 9 5 9 10 1 8 5 4 0 6 10 10 9 7 5 5 1 3 1 6 10 6 2 3 8 9 5 6 2 6 4 0 5 7 10 4 10 7 9 9 6 1 3 3 3 10 9 4 4 2 0 6 0 3 6 6 9 9 8 3 7 10 4 4 8 10 0 0 0 6 3 5 3 5 0 9 6 4 2 6 4 0 10 7 6 1 8 7 7 8 6 5 4 3 4 10 8 9 5 5 7 8 8 6 8 5 3 0 0 3 6 5 3 7 9 2 4 6 9 5 1 10 6 0 10 0 3 8 6 3 6 4 8 10 9 2 1 4 1 4 8 9 6 1 6 9 9 2 10 6 6 0 5 0 1 1 8 1 3 10 1 5 7 0 5 2 9 2 5 10 10 5 9 0 3 4 7 10 1 5 10 8 9 1 3 10 4 7 2 0 4 7 3 6 7 3 1 3 7 4 3 0 3 9 8  [...]
+10 10 5 7 9 6 7 6 2 1 4 8 1 4 2 2 6 3 0 5 1 2 9 3 8 0 6 7 0 2 1 6 2 7 10 2 7 6 9 1 3 2 6 1 7 5 8 10 7 3 8 2 7 7 8 7 2 5 6 1 4 1 8 0 5 9 9 0 2 4 10 3 2 8 0 0 10 0 2 9 6 2 0 1 4 1 4 5 5 5 2 6 5 3 10 4 3 5 5 7 6 0 9 5 0 1 8 7 0 6 0 2 0 9 6 10 10 6 5 6 4 1 9 7 6 9 8 6 0 5 0 0 0 9 8 3 0 7 4 5 10 10 4 9 0 6 7 6 9 6 6 7 4 4 10 9 8 10 6 2 0 1 6 8 2 4 7 2 7 2 8 1 4 2 4 0 3 0 4 7 4 5 0 7 5 1 3 9 7 2 8 3 2 6 0 9 9 4 6 9 3 3 1 3 0 2 4 0 2 8 0 7 5 8 1 8 10 2 3 7 10 2 10 0 7 9 1 10 3 6 10 7 6 4 3 5 8  [...]
+0 0 4 8 7 6 8 1 7 7 5 0 3 0 5 2 8 10 0 10 3 10 5 4 1 4 0 0 3 2 6 8 8 1 1 6 10 5 5 0 7 8 0 5 2 5 6 6 10 8 9 9 0 0 5 4 7 9 8 2 4 2 1 4 0 9 6 6 1 0 8 3 3 8 10 3 1 2 10 10 9 3 8 1 9 9 9 10 9 5 9 6 5 6 1 9 1 1 2 6 3 5 2 4 4 4 7 8 8 10 10 0 2 9 9 5 9 5 9 4 3 10 3 10 8 7 9 3 7 1 8 3 10 2 0 5 0 7 8 10 1 8 8 1 9 7 7 2 1 5 6 8 3 7 8 2 3 1 5 3 2 0 4 10 0 1 5 5 8 2 8 9 7 3 1 3 7 10 5 8 2 5 1 3 3 7 1 4 3 10 7 0 4 3 0 10 3 0 4 5 6 8 3 8 1 8 6 6 8 4 4 0 10 6 7 3 8 10 0 6 8 6 1 9 6 4 10 10 7 5 3 4 9 4 3 [...]
+6 10 10 5 4 4 8 2 2 10 1 3 5 1 5 6 10 8 8 7 6 0 0 10 6 4 1 1 9 9 7 0 3 7 8 4 0 3 6 9 10 4 2 0 4 7 3 2 0 6 6 9 5 5 1 5 10 0 8 6 10 10 6 8 6 2 1 3 8 6 3 4 5 6 5 1 0 8 8 9 8 6 0 6 1 4 9 4 9 3 1 6 4 4 6 10 10 2 6 2 7 3 10 9 9 3 3 2 3 10 7 9 9 4 3 3 4 0 10 5 6 10 3 3 1 3 1 0 7 2 10 5 9 9 6 6 9 10 6 1 3 5 2 3 4 7 3 2 7 9 1 4 3 1 2 1 8 4 8 9 1 7 4 0 10 1 0 0 3 2 6 5 1 2 6 8 8 6 3 7 8 6 5 1 6 1 8 2 5 5 1 7 10 3 6 9 8 2 7 5 9 0 2 4 10 2 0 9 4 1 0 8 8 1 9 4 5 3 2 9 5 4 3 8 10 5 4 8 2 10 3 8 0 2 8  [...]
+2 9 7 7 10 8 3 1 6 0 9 1 2 6 1 3 9 2 9 3 5 1 2 10 7 2 3 0 8 0 10 8 6 10 8 10 6 8 6 8 8 4 3 6 0 6 2 1 10 8 1 0 4 5 5 0 8 1 7 8 6 2 10 10 0 9 2 2 5 0 6 1 6 8 7 7 10 7 9 10 2 0 6 5 0 10 10 0 4 3 1 4 9 6 8 8 1 0 3 8 1 5 1 8 7 9 0 5 2 7 6 6 9 3 8 4 10 6 8 7 0 5 5 0 1 3 1 1 8 6 2 3 0 7 7 0 0 6 4 5 10 10 7 0 6 8 6 7 4 9 0 2 1 9 9 8 4 5 4 2 10 6 7 3 6 4 9 3 9 6 9 7 6 7 7 7 3 7 0 1 9 10 0 1 9 1 5 5 0 0 3 6 5 9 4 1 8 10 8 8 5 8 10 9 1 9 10 5 7 9 8 5 1 6 6 3 0 1 5 1 9 5 8 7 5 6 10 1 0 8 9 5 9 4 5 5 [...]
+4 9 7 6 3 6 2 7 1 7 8 9 8 0 7 7 5 6 2 2 6 4 0 7 5 4 7 3 0 0 8 4 3 7 1 1 7 2 7 0 9 2 6 4 0 6 7 2 2 8 5 5 10 2 1 7 5 10 9 7 3 5 3 2 2 8 7 1 8 1 4 10 10 8 8 3 7 10 0 9 1 0 9 4 8 2 2 7 4 9 6 9 2 6 9 2 2 9 0 5 7 3 2 8 0 3 5 5 3 7 1 10 5 3 4 6 8 3 6 3 0 8 1 2 3 4 2 5 10 9 2 1 0 7 2 0 2 8 8 6 6 2 4 4 9 8 2 9 10 7 1 3 3 5 2 5 7 4 0 6 6 6 0 4 3 6 7 10 8 5 3 1 6 6 9 0 2 7 5 6 5 4 9 3 0 7 10 3 2 1 5 0 10 3 10 10 5 1 3 3 1 3 3 4 6 10 4 1 2 8 3 6 3 2 5 7 8 1 0 2 10 8 6 0 4 4 2 2 1 3 2 4 3 2 2 0 10 8  [...]
+6 9 6 6 3 3 0 5 2 3 9 0 0 7 8 4 0 2 7 2 0 2 3 3 2 8 1 3 10 10 1 10 0 3 4 0 4 2 7 9 9 3 10 2 5 8 1 6 9 10 4 8 1 8 3 9 1 0 5 9 9 4 7 10 9 4 5 8 9 4 10 9 5 8 0 9 4 8 3 1 3 9 10 6 10 8 4 4 1 6 5 1 1 7 10 0 7 0 5 2 8 8 0 2 6 6 3 4 6 10 8 10 9 4 10 8 0 1 2 0 8 0 1 8 2 1 9 0 0 7 5 10 5 5 6 7 2 1 7 8 6 6 8 10 7 0 3 0 10 3 6 6 5 3 8 6 3 9 5 2 5 5 5 7 0 2 3 3 8 10 9 3 3 5 4 7 3 6 0 5 8 9 7 7 5 4 8 5 0 3 2 7 7 2 9 3 6 6 4 8 1 5 8 7 8 5 1 8 10 8 7 2 9 5 8 3 6 6 4 9 10 8 10 3 7 10 4 6 6 9 1 4 2 10 9  [...]
+4 7 4 10 0 0 3 5 4 2 9 4 5 0 2 0 3 1 4 10 3 10 2 7 4 5 5 4 2 5 3 10 4 5 4 3 2 4 6 6 10 6 7 9 10 10 3 0 7 7 8 1 2 0 2 4 4 5 6 7 5 3 1 6 1 0 5 0 9 9 4 8 0 9 3 2 8 7 8 7 10 0 4 2 1 4 8 3 10 8 0 6 3 1 5 7 6 5 7 2 4 5 5 10 0 6 3 4 1 0 7 6 5 0 6 3 1 10 7 10 2 2 3 10 5 1 8 0 7 8 8 9 2 0 9 8 8 8 2 7 5 8 9 1 0 3 0 4 1 8 9 7 10 9 8 9 0 4 8 9 3 7 3 4 2 8 9 6 1 7 7 8 8 8 4 3 9 9 2 5 0 7 9 10 7 4 7 8 7 7 5 1 4 1 6 3 0 7 5 6 4 0 10 4 0 8 7 2 3 0 10 6 6 6 1 3 8 7 0 3 0 10 3 1 1 3 7 8 3 10 9 9 0 8 0 7 8 [...]
+0 7 6 2 0 3 3 1 1 0 1 0 10 6 10 1 5 5 2 2 9 0 2 9 7 3 1 10 9 10 10 10 0 6 7 2 8 7 8 3 4 2 7 0 3 8 6 1 5 7 6 1 7 2 5 4 3 8 7 5 10 3 3 9 2 0 9 9 0 8 0 2 9 5 3 9 9 3 0 2 2 3 4 8 1 3 4 7 4 7 9 2 3 1 7 3 1 0 1 5 9 2 9 5 6 6 8 9 8 5 7 5 9 10 8 6 2 7 3 2 8 3 8 10 1 0 8 10 5 0 2 2 4 6 9 3 0 5 8 1 2 7 0 8 7 6 6 6 3 5 9 1 5 1 9 9 2 6 10 0 0 0 9 9 5 3 0 3 9 7 3 9 10 5 10 7 0 6 9 5 9 6 4 10 8 8 6 6 10 9 4 1 6 10 3 1 9 6 4 4 2 4 0 9 6 5 3 10 4 5 10 3 7 7 6 4 6 2 2 1 6 9 3 8 9 10 0 7 0 10 8 3 9 10 8 4 [...]
+7 3 5 0 3 7 3 7 7 5 5 6 8 5 0 0 0 7 7 4 2 5 8 1 6 2 5 8 4 10 7 9 5 6 9 10 4 0 7 8 9 7 5 1 10 4 7 10 0 8 3 0 7 6 8 0 7 2 0 7 9 9 3 5 8 0 8 5 9 7 3 2 1 9 1 1 2 8 1 7 5 1 7 10 3 4 6 3 3 3 1 0 6 4 8 3 9 10 10 9 7 5 7 1 0 10 6 5 6 0 8 6 4 5 7 3 0 10 2 3 4 4 7 5 2 2 1 1 6 1 9 10 10 3 7 2 8 3 4 1 2 6 4 4 10 4 4 7 2 6 1 1 9 0 1 8 0 6 1 2 2 4 9 9 7 0 8 1 8 8 10 8 5 4 7 6 6 9 3 3 8 5 1 4 5 5 2 2 4 6 0 8 8 2 7 10 3 9 2 4 0 9 8 5 0 6 1 6 1 7 0 8 9 2 7 1 7 0 4 10 9 8 0 1 0 2 0 4 3 0 0 1 4 0 4 0 3 0 8 [...]
+3 0 1 0 5 5 7 0 6 4 1 3 8 7 10 5 5 8 10 1 9 7 4 6 6 7 2 6 1 6 5 6 4 9 5 3 1 2 4 3 9 8 2 4 3 6 3 2 9 10 2 9 8 7 6 8 5 9 4 4 0 8 3 4 0 5 0 2 5 4 2 9 9 9 4 0 0 10 4 4 2 3 10 2 1 5 4 9 5 5 8 4 10 8 9 4 9 10 10 9 9 1 5 4 8 2 9 5 9 1 6 4 3 6 0 0 8 2 8 4 1 10 7 8 10 10 9 1 6 6 6 9 10 2 2 10 8 9 2 6 7 3 9 6 8 9 8 2 3 2 1 10 3 5 10 8 10 9 4 0 5 2 8 8 8 8 8 10 8 9 9 1 8 6 9 7 3 4 5 9 2 4 3 6 10 4 8 1 9 3 3 6 8 10 3 10 6 2 5 10 2 8 1 0 7 9 8 8 4 1 8 6 5 0 6 1 0 6 1 0 1 5 1 0 7 1 0 8 4 5 3 10 0 9 5  [...]
+3 3 9 4 10 2 7 8 9 10 3 6 3 0 7 0 6 9 6 0 4 3 0 6 3 6 10 6 10 8 5 5 0 3 9 10 9 7 4 7 2 2 8 9 7 5 0 9 5 1 2 10 6 0 3 6 0 1 4 8 7 3 3 5 3 8 2 10 6 2 7 2 1 10 7 7 3 0 7 1 1 2 9 0 7 2 8 4 4 6 5 8 10 9 7 8 1 0 9 0 10 6 7 9 0 5 0 9 5 9 5 1 10 10 5 8 0 2 3 4 10 3 7 3 8 0 3 2 9 2 1 8 10 1 10 4 10 4 6 9 0 3 6 4 1 8 5 1 0 1 4 0 8 0 9 8 0 0 3 6 1 0 3 5 2 9 9 6 5 7 10 9 9 2 8 10 7 3 1 10 8 8 1 1 6 0 8 5 0 9 2 5 6 8 5 5 7 8 0 7 3 1 4 2 10 10 5 2 10 7 10 9 8 10 6 0 5 0 2 0 7 6 1 2 8 5 5 6 4 5 4 10 5 8 [...]
+4 2 1 3 7 7 7 5 4 10 8 3 2 1 10 8 2 5 0 0 5 9 5 10 6 10 8 2 8 9 2 8 6 8 8 0 7 4 10 4 9 4 7 8 6 5 6 4 10 3 8 3 3 2 2 6 7 3 6 1 7 6 9 6 0 4 0 1 8 7 2 8 4 2 8 8 8 3 5 2 0 0 0 8 0 10 3 9 2 10 0 10 6 7 1 7 6 6 1 3 1 9 8 4 5 10 9 6 4 7 4 9 3 3 5 1 1 3 8 7 7 7 1 2 2 10 5 8 5 10 10 7 8 4 9 10 10 3 10 3 3 0 3 9 10 7 6 7 3 3 3 6 4 9 7 10 9 4 3 1 8 7 6 5 2 5 10 3 5 5 2 10 4 9 1 4 2 0 8 1 1 4 1 3 7 3 3 7 5 4 3 3 7 10 2 2 1 5 6 3 2 5 7 1 1 9 8 3 10 9 4 3 9 2 0 9 5 10 3 3 5 6 4 10 0 8 4 4 3 4 6 6 3 1  [...]
+1 10 3 10 8 7 10 8 6 5 9 1 7 8 5 7 10 2 0 6 4 7 6 2 5 7 8 4 6 2 0 4 4 8 4 0 3 8 4 7 9 2 9 2 4 9 1 4 5 9 4 1 8 5 0 6 9 0 6 6 3 3 9 6 3 0 0 0 2 10 5 8 6 8 10 6 6 9 8 8 6 9 6 4 8 9 8 6 7 1 5 8 3 9 7 8 8 9 7 6 6 2 8 1 4 2 1 10 2 10 3 7 10 3 0 2 8 10 10 5 8 3 2 9 1 5 8 10 8 4 2 6 4 9 6 0 0 7 10 6 9 7 1 7 7 1 2 1 2 3 3 4 4 4 3 10 4 6 6 7 0 1 9 0 5 8 9 8 1 10 5 0 3 8 10 10 0 10 1 4 10 10 4 3 4 3 7 4 4 8 4 4 7 5 9 9 3 4 10 5 7 7 5 5 3 8 10 3 10 1 0 9 0 7 0 10 9 3 4 0 2 0 8 9 8 8 2 2 2 8 4 7 1 6  [...]
+7 6 9 1 2 6 0 1 4 1 6 10 7 9 6 6 10 4 8 1 0 10 5 6 2 5 9 2 3 8 5 7 0 3 10 8 2 7 4 1 0 1 8 1 3 2 5 3 4 3 8 8 2 0 5 7 3 7 6 2 2 6 9 6 10 6 5 9 4 0 9 0 3 4 6 6 9 1 1 7 7 0 1 8 9 7 6 3 10 7 7 4 0 9 0 3 10 9 6 4 2 2 8 1 6 5 8 10 3 6 4 6 8 1 1 3 2 5 1 2 10 8 2 1 2 3 2 3 2 7 7 7 9 8 0 10 3 9 0 0 0 4 10 5 3 1 1 6 3 9 4 8 4 0 7 5 6 5 3 2 2 5 9 5 1 6 10 9 6 1 0 0 10 1 6 9 5 8 8 10 1 5 1 8 2 5 8 10 4 4 8 5 7 0 10 2 8 3 7 3 8 10 0 7 7 4 6 1 6 1 9 0 3 4 5 7 6 4 9 4 9 3 10 6 9 0 3 9 6 8 7 10 7 9 0 6 4 [...]
+7 10 2 0 9 4 6 9 10 0 3 5 2 9 7 5 9 1 1 7 3 8 8 7 7 1 4 5 7 1 6 8 1 2 0 3 10 2 3 10 8 2 10 10 9 2 1 7 5 5 5 8 4 0 6 0 2 7 4 6 9 7 0 10 7 7 4 10 0 1 9 10 3 7 9 1 10 9 4 10 4 5 4 10 5 4 3 2 8 4 4 1 9 4 3 6 5 10 6 5 10 10 1 6 5 1 9 0 0 2 10 9 7 3 3 4 1 1 0 2 10 0 10 5 3 4 4 7 10 7 7 9 10 4 0 6 6 2 10 9 7 5 0 8 8 4 10 3 8 9 9 0 6 9 2 5 3 2 8 1 10 2 5 7 1 4 10 4 6 0 9 6 8 2 8 5 5 9 1 10 4 10 3 2 3 5 1 0 10 10 8 7 0 3 10 1 6 1 5 0 6 7 4 8 7 1 5 10 8 3 2 1 3 5 4 3 1 9 1 9 5 10 10 4 3 10 1 9 4 5 [...]
+1 10 0 4 7 4 0 7 10 0 0 0 0 8 9 7 8 5 3 5 8 2 5 9 5 6 0 1 6 7 2 10 0 9 4 8 4 8 5 0 2 3 8 6 4 9 7 1 7 9 10 4 10 4 9 4 9 5 0 7 6 6 10 0 8 10 6 9 3 2 7 5 4 1 3 4 2 10 8 9 6 6 9 8 8 8 3 3 0 10 2 0 8 6 9 1 4 5 5 0 7 7 7 8 9 6 5 9 6 7 8 8 9 1 1 2 9 8 8 1 9 3 5 8 2 4 3 4 0 3 7 5 5 5 3 5 1 5 1 5 6 10 10 5 9 10 7 5 2 9 8 0 8 7 3 8 4 0 2 0 3 1 0 0 7 5 7 6 7 1 9 8 10 6 8 1 4 2 9 10 2 5 5 10 9 7 5 10 7 5 7 3 9 9 10 1 4 0 8 9 5 4 9 5 0 2 10 7 5 10 6 9 5 2 7 2 8 10 0 3 7 4 7 4 2 10 6 5 4 4 0 6 5 3 2 6 [...]
+7 3 2 4 4 4 8 2 0 4 0 4 8 1 6 6 6 8 6 5 6 2 4 1 5 9 4 2 2 1 1 8 3 8 8 3 4 4 2 3 10 9 2 10 10 10 4 6 2 6 4 8 0 7 5 8 9 10 0 8 1 5 9 8 2 10 10 7 10 9 5 3 3 6 7 10 5 5 5 4 10 3 7 7 10 1 0 4 1 4 8 8 8 8 9 6 10 10 7 8 0 8 5 1 10 4 1 2 6 3 9 6 3 0 1 5 0 0 10 1 1 2 8 10 3 7 4 3 8 0 0 5 10 0 5 5 3 2 2 0 0 7 3 2 6 9 3 4 6 1 8 6 1 2 1 2 8 8 1 3 10 2 6 5 8 2 9 1 5 9 1 0 1 1 4 8 8 8 2 1 8 9 3 5 9 7 0 6 6 5 0 3 5 4 7 2 1 10 9 2 0 1 2 0 10 1 10 9 3 7 8 10 9 3 7 10 6 5 6 2 4 9 10 9 4 0 1 8 0 6 9 2 3 10 [...]
+4 7 6 1 5 7 9 0 2 8 5 6 8 4 5 4 1 2 2 4 4 5 7 1 8 6 2 4 8 5 3 5 10 3 1 9 0 2 7 9 0 8 4 9 0 9 2 9 1 6 6 8 9 9 6 6 8 5 5 3 6 10 2 2 4 2 9 2 7 0 3 4 3 0 3 9 1 1 8 5 3 10 8 6 4 0 8 2 3 0 9 4 8 1 0 10 7 5 4 7 9 1 10 0 7 7 4 4 3 9 5 0 7 7 8 10 0 7 5 6 6 9 5 8 2 10 0 8 4 10 4 10 4 6 1 3 4 8 0 4 7 7 5 0 9 9 5 1 5 4 0 4 7 7 9 8 8 8 3 3 1 8 6 7 3 2 9 3 1 5 6 4 1 9 9 3 6 10 4 5 3 9 6 2 0 4 2 4 2 7 1 4 0 1 4 8 7 2 10 0 3 2 10 3 1 4 0 5 4 6 8 0 2 9 4 8 0 8 9 5 7 7 4 9 1 7 10 6 8 2 5 5 8 1 1 9 7 7 4 4 [...]
+6 7 8 9 10 10 9 8 10 8 9 4 3 6 8 1 4 3 10 9 9 3 3 4 0 4 4 7 4 8 3 5 0 0 8 6 2 8 10 5 7 0 5 8 0 4 7 5 5 7 10 7 7 0 8 10 8 2 7 6 4 5 6 2 7 5 9 4 3 6 9 4 5 7 6 10 3 8 8 9 3 5 10 4 4 4 7 9 9 10 9 8 0 3 3 6 4 8 5 1 1 3 7 9 0 8 6 1 5 5 5 5 1 2 6 9 7 8 0 1 6 2 10 4 7 7 3 0 10 5 4 7 2 8 1 8 6 7 0 5 7 8 4 0 9 7 6 3 7 9 9 9 0 7 8 1 1 0 0 0 4 8 3 4 5 10 10 4 10 3 3 0 4 1 4 8 0 3 9 9 5 9 8 9 5 5 7 1 5 2 1 8 9 3 2 4 9 4 5 10 4 0 10 0 10 4 7 3 7 0 4 10 6 1 8 10 3 4 5 7 7 9 7 1 10 10 5 1 6 5 9 6 9 4 0  [...]
+5 10 9 4 1 9 9 1 1 3 2 6 7 8 6 0 7 7 2 8 1 10 4 1 0 0 10 10 5 10 4 9 1 4 9 10 4 8 5 10 6 1 5 2 4 3 9 9 6 10 0 9 0 1 3 9 0 1 3 9 8 7 4 10 6 10 10 3 10 9 6 4 5 8 8 6 4 8 4 7 1 1 7 7 4 4 6 6 7 9 4 8 5 2 4 6 6 9 5 8 9 6 6 0 2 8 3 2 0 8 7 0 8 10 2 2 5 8 2 9 6 1 10 2 7 7 7 0 4 4 4 7 10 5 6 6 0 7 1 2 4 4 7 9 1 8 8 5 1 5 6 3 10 5 2 2 9 2 7 2 6 5 7 0 10 7 3 8 4 3 8 2 6 7 8 6 2 4 6 4 0 6 9 6 10 1 5 7 3 5 1 9 6 5 1 1 9 2 4 2 3 2 10 3 6 6 6 6 1 8 9 5 5 3 4 9 5 6 2 0 8 7 6 0 8 5 1 4 5 2 3 3 9 3 3 5 2 [...]
+6 5 9 3 1 9 1 3 8 10 9 9 5 0 5 3 1 2 6 6 9 10 1 0 5 3 3 1 0 3 2 2 3 10 0 5 3 2 2 4 10 10 0 8 3 0 6 7 0 10 8 4 4 1 8 7 7 0 5 0 8 9 6 9 2 7 10 3 7 9 6 3 10 4 10 3 6 10 4 9 7 9 9 5 8 2 1 4 6 9 4 7 5 10 1 6 0 10 9 8 4 1 2 4 10 7 3 8 4 10 8 0 3 5 3 1 10 9 2 10 9 8 7 6 7 9 5 5 10 7 7 0 9 0 2 10 6 0 4 4 5 7 10 6 2 3 5 6 0 5 3 6 2 10 1 7 4 6 6 10 7 6 2 4 3 5 2 1 0 8 6 3 5 3 1 1 1 5 5 0 1 10 7 3 3 9 5 3 1 3 10 1 0 2 6 0 8 7 10 6 3 10 1 9 0 2 10 0 3 10 2 3 5 10 0 7 10 8 10 7 7 1 9 2 4 4 10 3 1 8 6 [...]
+4 9 5 2 9 2 2 6 3 3 4 2 4 1 6 8 4 0 6 6 10 5 7 5 1 5 2 1 4 5 6 2 1 7 7 4 0 7 0 5 6 3 9 8 6 5 5 1 8 1 0 0 10 8 9 2 1 5 10 4 1 3 0 0 7 6 8 5 7 5 3 7 3 0 9 10 1 5 0 4 9 7 6 0 1 6 8 6 1 8 7 9 1 7 1 7 3 5 9 6 0 6 2 8 8 5 7 0 6 7 8 0 10 9 0 0 5 2 4 10 10 10 8 4 9 10 5 10 4 2 10 9 1 6 8 5 0 3 7 5 5 6 7 4 0 7 10 0 6 10 5 6 7 9 10 7 6 8 9 2 6 1 10 9 5 7 0 7 3 2 1 3 5 2 4 1 1 1 9 1 8 1 9 5 8 2 9 2 2 7 1 0 10 7 3 3 7 6 2 2 8 10 4 7 10 7 7 0 4 2 9 7 5 0 3 4 2 5 3 2 5 0 1 7 1 8 4 8 2 5 4 7 8 6 10 5 0 [...]
+6 9 9 5 1 4 4 5 0 8 7 7 5 5 5 2 0 5 0 8 9 6 7 7 10 6 10 6 2 0 6 10 5 6 3 0 6 9 3 3 5 8 0 0 3 2 10 2 2 10 6 8 2 5 6 9 1 1 6 9 10 4 3 9 10 2 4 1 4 10 0 1 1 3 10 1 0 10 10 4 8 2 9 10 8 6 9 9 2 9 4 7 9 7 3 6 7 5 8 10 7 5 8 10 5 7 5 10 3 10 6 9 2 9 7 7 5 0 6 3 2 1 4 9 8 4 5 0 6 8 5 0 6 8 5 10 9 1 0 7 2 3 8 8 3 10 4 0 3 0 4 1 0 10 7 4 10 2 0 4 7 5 6 7 5 6 8 10 2 9 5 8 7 7 0 8 2 9 10 1 8 5 7 6 4 0 7 4 3 7 3 2 8 0 4 10 10 8 9 4 3 4 2 7 8 4 8 10 0 6 4 3 0 0 9 2 4 7 2 10 6 8 7 5 7 7 0 2 5 6 9 3 7  [...]
+9 1 5 2 8 8 2 4 9 8 4 8 2 0 7 1 0 6 2 3 6 0 7 8 0 6 9 7 2 6 1 5 1 10 8 7 9 10 0 8 2 6 1 9 10 10 6 10 0 4 5 1 1 1 7 7 6 8 0 1 4 2 0 1 4 4 8 2 4 7 9 6 2 3 6 7 0 5 3 6 0 4 2 7 1 8 5 7 9 3 3 8 0 0 0 0 7 5 10 1 6 6 10 6 8 1 2 6 4 0 8 8 0 6 7 2 6 9 9 7 2 10 9 6 0 0 3 5 1 1 6 2 8 5 6 2 3 4 7 0 5 2 8 3 1 4 6 6 1 9 6 3 4 5 8 9 9 0 6 0 5 3 5 3 6 2 3 9 0 0 3 9 4 9 10 3 4 2 8 0 0 1 1 8 3 3 3 0 1 0 3 3 9 4 10 0 4 10 10 10 1 2 6 0 6 7 8 0 6 8 6 6 2 2 3 5 3 4 1 7 1 6 7 1 6 1 7 2 1 0 8 7 2 1 10 10 5 9 5 [...]
+9 5 10 2 8 1 7 8 0 2 0 9 9 5 2 6 7 10 0 0 10 0 6 5 9 3 10 8 8 3 10 8 9 9 1 1 1 3 2 2 7 4 5 9 1 6 8 2 6 7 3 3 10 1 5 5 0 8 2 8 8 10 7 9 3 0 10 7 6 0 2 4 6 0 9 5 7 8 1 4 8 9 8 1 6 9 8 8 2 3 7 3 9 2 5 9 0 2 0 0 10 0 5 5 9 10 7 1 9 6 5 3 5 6 0 0 7 8 3 2 10 4 10 0 10 6 5 5 3 8 6 10 5 10 2 6 6 3 2 1 0 6 3 1 3 0 1 6 3 9 9 6 4 0 3 1 2 7 2 3 4 1 4 3 4 8 3 2 3 8 0 7 4 9 7 4 8 8 1 10 1 3 5 10 2 5 2 7 6 5 10 7 5 4 0 8 3 5 5 9 7 2 2 3 8 4 0 0 9 7 7 2 8 9 9 9 0 4 9 4 3 7 7 2 2 3 8 7 1 9 4 7 1 3 10 2 5 [...]
+2 6 3 2 1 3 5 9 8 1 10 5 8 5 3 10 6 5 2 0 8 10 0 1 1 7 3 9 3 2 7 6 1 9 9 3 10 2 5 1 8 6 6 0 1 8 9 0 7 3 2 6 10 9 5 4 9 6 3 6 5 1 10 2 1 1 4 7 4 10 5 1 9 5 4 10 9 2 1 5 2 3 1 0 7 6 4 9 9 8 8 8 0 8 1 4 6 5 10 10 7 5 1 8 6 1 2 4 1 8 3 0 0 9 0 2 2 8 0 8 8 10 10 0 3 6 1 3 4 6 0 6 0 9 3 6 7 4 7 2 10 1 0 5 0 8 9 0 9 5 4 5 0 7 6 7 4 7 6 4 3 2 10 6 4 9 5 0 8 4 1 5 0 1 10 10 3 8 3 4 4 9 3 10 10 2 3 4 1 3 6 5 7 4 4 6 1 6 1 7 5 10 3 8 5 10 0 3 10 7 8 6 2 8 4 10 3 6 6 1 0 10 10 8 7 8 7 9 8 6 2 6 6 9  [...]
+9 8 2 6 4 7 10 10 7 8 3 6 10 4 3 10 7 7 9 4 7 5 4 4 7 5 1 2 0 9 4 3 1 10 1 2 5 7 8 5 9 0 5 7 3 7 0 4 0 0 3 4 6 2 1 9 0 10 4 5 6 3 3 3 2 6 9 10 10 10 1 9 9 7 2 8 4 10 5 0 1 6 10 0 3 1 10 0 10 6 0 3 5 10 3 6 0 10 6 2 10 1 0 4 1 9 7 5 2 0 0 7 7 10 6 8 1 8 5 3 9 4 3 7 9 2 5 2 8 7 7 0 3 1 4 3 4 6 10 7 7 10 5 4 6 7 6 3 5 3 8 6 5 10 2 4 4 6 9 0 7 1 8 3 3 1 7 7 1 1 2 10 2 3 7 4 2 7 8 6 3 6 10 7 5 10 10 2 3 5 6 5 9 1 8 5 8 10 9 6 1 2 9 7 0 9 9 8 3 7 3 2 6 1 6 3 9 6 4 6 7 8 10 4 3 0 8 3 9 8 1 0 8  [...]
+3 3 9 9 2 1 4 2 3 4 9 4 5 7 0 0 6 1 9 0 5 0 7 7 0 7 5 10 10 5 10 6 5 6 0 10 5 9 5 4 8 10 2 7 2 10 7 4 6 7 6 0 3 3 5 8 0 3 3 1 7 1 4 8 3 8 5 8 3 7 7 10 5 7 10 9 5 1 0 3 4 6 10 5 4 6 10 7 9 4 4 4 1 6 2 5 6 8 0 3 7 6 4 2 8 4 2 5 5 0 0 4 8 6 4 8 7 2 8 4 2 6 10 10 0 10 1 2 0 0 1 4 7 0 7 2 7 10 7 6 9 6 5 9 2 2 8 5 10 2 5 8 4 4 5 8 5 6 0 8 10 2 5 6 10 10 0 6 3 2 3 3 4 2 6 1 1 7 8 9 4 5 9 4 9 4 8 7 6 8 2 0 3 0 8 0 4 2 1 9 2 0 7 3 4 4 9 4 3 10 5 6 7 2 9 9 1 4 1 10 9 1 5 1 2 2 5 8 1 9 8 5 0 3 9 3  [...]
+4 1 1 8 5 2 9 4 1 0 1 0 2 10 4 2 8 10 1 5 1 8 2 1 6 2 0 6 1 6 4 7 4 4 2 0 0 10 10 4 4 9 6 5 4 5 7 5 4 5 7 6 1 2 0 3 1 2 2 1 9 2 9 6 5 9 2 10 5 1 0 7 2 0 6 7 6 0 8 8 6 9 7 10 1 10 3 5 3 3 0 9 2 8 10 2 5 10 3 0 10 5 2 1 4 9 2 2 5 9 3 5 10 1 5 6 8 7 1 1 6 9 3 6 0 9 5 0 4 5 2 4 5 7 0 1 2 7 6 5 10 9 10 8 2 1 3 10 0 6 10 4 10 7 1 10 4 5 2 3 1 10 6 9 3 5 9 4 1 0 6 1 8 1 3 5 1 9 6 9 7 7 6 8 2 0 1 3 6 0 5 9 9 1 10 10 6 0 1 3 0 8 6 10 10 8 8 9 2 3 7 4 8 9 2 2 3 6 6 0 9 6 4 1 4 7 10 8 9 6 10 7 9 1  [...]
+3 9 3 3 8 3 5 2 4 5 7 10 8 2 9 8 3 2 3 10 0 5 8 3 0 0 10 5 2 6 3 1 10 9 1 0 0 0 7 1 6 9 2 10 2 8 9 5 9 10 5 0 4 10 9 9 1 2 10 7 9 10 1 10 1 5 10 8 0 6 5 5 9 8 6 2 1 5 10 10 6 1 0 9 0 8 10 7 7 8 4 5 6 4 1 1 6 6 7 1 1 5 5 6 7 0 6 2 2 6 6 10 1 3 10 6 6 5 2 2 7 0 3 9 0 7 6 0 7 7 4 5 5 2 5 0 10 7 9 10 4 2 7 5 7 8 5 0 8 2 8 3 9 1 3 7 8 9 10 1 6 4 1 8 2 0 2 5 6 1 1 1 3 1 2 9 7 1 2 5 2 8 3 10 5 5 3 8 7 8 1 5 2 5 7 10 3 10 10 0 6 3 0 6 0 4 5 0 2 2 10 2 7 4 3 2 9 7 2 4 10 1 5 5 8 2 6 6 1 0 9 9 10  [...]
+10 3 10 7 4 8 7 4 9 0 5 5 5 10 9 0 0 9 6 0 3 8 10 0 4 10 7 0 9 9 1 5 4 6 8 9 9 2 9 4 6 8 3 9 2 6 5 7 10 7 10 8 7 2 1 10 8 5 9 7 5 4 8 2 7 5 8 6 2 0 3 2 5 0 7 4 0 7 1 10 2 5 4 10 10 3 10 5 3 0 5 9 6 6 2 7 9 2 6 3 3 2 10 4 5 7 2 6 10 5 5 10 8 5 10 2 6 8 7 6 3 3 0 7 4 5 10 7 7 8 4 7 6 3 2 6 1 10 6 9 2 2 2 0 0 1 2 10 5 5 2 1 2 1 8 10 6 8 8 8 1 6 3 0 5 4 10 1 1 5 10 2 9 3 3 3 4 6 3 0 7 0 8 2 5 4 8 10 2 6 2 4 6 5 0 1 3 2 9 9 3 10 9 7 5 5 7 8 10 4 10 5 8 9 5 9 2 0 7 7 8 4 4 10 8 3 3 9 9 2 4 1 1 [...]
+0 0 9 10 1 8 0 7 10 0 8 5 0 5 1 2 5 1 8 8 10 9 8 0 2 0 3 8 2 7 7 0 6 6 2 3 5 10 10 7 7 8 1 7 4 1 5 4 5 1 9 2 2 5 10 6 2 2 4 2 7 7 8 9 0 10 0 1 3 5 10 6 4 3 9 7 7 0 1 8 3 6 10 0 7 9 4 9 3 7 1 8 7 0 10 8 0 5 8 2 9 0 7 10 0 0 4 9 1 4 3 4 8 2 7 4 8 9 0 4 9 6 3 1 10 0 10 3 5 4 7 10 7 8 8 3 0 5 5 10 7 2 6 0 1 8 0 10 4 10 7 8 9 0 7 1 3 0 7 1 10 8 9 7 4 6 3 6 9 2 4 1 3 5 4 6 7 5 10 6 6 0 7 7 2 6 7 1 5 9 5 1 2 6 5 3 0 0 2 8 6 6 4 7 0 10 2 1 8 4 3 10 2 8 1 7 6 0 4 10 9 10 6 3 10 10 4 1 5 6 5 4 8 0 [...]
+6 10 8 3 7 8 2 2 6 6 3 2 10 8 2 8 10 4 10 4 7 5 9 4 10 10 5 1 0 5 2 6 5 6 5 2 1 7 9 6 7 5 7 10 10 8 4 9 8 8 2 5 5 4 2 8 7 5 9 2 1 0 1 10 6 5 2 7 9 4 9 4 5 0 3 9 2 4 8 2 9 3 7 2 2 7 5 1 7 7 8 9 4 4 1 9 9 5 2 5 2 7 10 4 1 9 9 6 2 10 7 9 1 3 0 4 10 1 8 4 6 4 9 7 5 8 5 2 0 0 9 6 4 8 3 1 5 4 9 8 5 0 0 2 8 3 10 8 9 5 2 3 3 10 10 4 1 9 8 0 3 1 5 10 1 1 7 3 6 8 7 10 9 4 1 6 0 3 0 5 9 3 4 0 6 0 9 1 1 9 6 0 10 3 5 2 7 4 4 5 7 3 5 6 3 0 5 7 5 1 2 0 2 3 9 9 2 5 2 7 9 9 7 0 2 0 1 9 10 3 8 10 10 4 10  [...]
+5 6 6 7 4 10 1 2 5 9 6 6 9 7 1 5 3 6 0 8 6 0 9 4 10 7 8 4 5 1 0 6 7 6 5 1 2 8 1 8 9 10 0 6 2 5 7 7 5 9 2 8 5 10 0 0 8 10 3 4 10 7 8 0 10 7 8 10 1 5 5 6 0 7 3 0 1 2 4 0 8 9 6 7 1 4 0 1 0 1 2 3 6 0 8 4 9 8 7 5 9 9 1 5 8 0 9 8 7 5 9 0 9 4 6 9 7 0 7 10 8 1 10 7 3 8 9 6 9 9 1 2 0 3 1 3 0 6 2 2 7 0 1 8 6 10 1 7 8 3 9 2 7 10 10 5 7 0 7 0 8 3 6 6 2 4 4 3 10 3 7 5 2 6 6 3 9 4 9 10 1 8 7 8 8 8 10 1 6 7 2 6 2 7 1 6 10 9 0 3 3 9 0 9 3 7 7 10 6 5 1 10 5 6 3 4 4 5 10 0 6 6 10 10 6 5 1 4 0 8 3 8 6 6 5  [...]
+1 1 1 9 3 5 3 3 7 8 8 6 2 2 1 1 5 9 4 3 9 4 0 0 9 4 0 10 7 6 1 0 7 10 0 3 5 7 7 9 7 3 8 7 6 6 7 5 0 4 2 1 7 6 2 6 4 4 2 1 0 5 9 9 10 6 4 4 10 2 9 8 3 1 8 4 7 0 7 10 7 5 1 5 9 2 3 6 7 7 3 7 6 5 4 3 7 4 5 1 4 6 5 4 2 2 5 6 6 10 2 6 7 7 8 8 0 3 6 4 9 8 7 1 1 4 10 9 9 7 8 3 7 10 4 0 8 7 2 1 8 1 8 3 10 5 5 3 10 8 1 9 6 4 0 3 2 9 2 10 4 7 4 5 1 10 0 1 8 7 9 7 2 1 2 5 9 0 1 10 2 7 10 3 5 8 9 6 7 0 4 6 4 8 4 10 7 4 1 1 10 5 6 2 9 3 0 1 9 4 7 0 4 5 0 5 2 7 9 4 5 8 8 7 2 4 7 5 4 7 2 8 4 0 6 10 9 8 [...]
+0 3 9 5 9 1 7 0 10 10 9 8 6 2 8 8 9 6 7 2 1 0 10 8 4 5 3 10 0 5 3 8 4 7 9 4 3 0 5 6 7 6 1 1 0 5 6 3 3 5 10 1 3 2 6 0 6 3 5 7 2 1 1 5 9 1 1 2 7 2 10 6 9 0 4 5 2 3 2 4 6 5 1 6 5 1 6 10 9 1 2 0 7 9 9 8 9 7 2 5 10 0 6 1 9 4 4 6 4 2 4 7 7 3 6 2 3 0 6 4 8 5 6 2 1 10 7 3 5 6 8 5 10 4 6 1 9 6 8 3 5 2 0 10 1 5 7 4 4 3 3 0 2 3 6 2 9 8 5 5 0 10 8 9 8 4 7 0 6 9 1 6 5 3 7 3 3 0 0 1 6 0 3 8 2 8 10 2 2 6 7 1 8 3 7 2 10 3 9 6 3 0 7 4 10 0 10 6 0 6 8 4 3 8 1 2 0 5 10 5 3 1 3 7 7 1 5 5 6 10 3 2 0 6 1 5 4  [...]
+4 7 1 5 8 6 3 3 2 0 2 9 2 6 2 3 7 6 4 7 1 7 1 3 8 3 3 8 9 3 0 10 2 2 5 9 1 0 4 7 4 0 4 8 6 1 4 3 8 3 7 9 3 1 9 8 1 3 10 0 7 7 6 7 10 4 2 10 0 9 10 6 9 10 4 3 3 8 8 2 8 4 0 8 3 8 7 8 3 1 2 7 0 8 0 6 6 3 10 9 10 9 6 8 7 3 3 4 5 6 6 6 2 2 7 9 0 4 6 7 2 1 7 1 6 3 5 4 1 3 5 6 0 0 6 6 0 3 5 10 0 10 10 8 7 3 4 9 5 7 0 2 3 2 4 9 8 3 5 4 5 7 9 3 10 1 9 1 10 9 1 1 10 7 2 2 8 9 9 5 0 6 6 5 8 2 1 7 10 5 9 10 10 5 9 7 9 6 4 7 10 5 1 4 0 3 10 2 7 10 4 4 10 3 7 5 7 9 1 2 9 3 0 4 10 7 5 2 2 3 4 2 5 4 0  [...]
+4 1 3 6 8 10 2 1 5 4 3 7 1 8 0 1 0 2 2 3 1 2 1 6 9 1 1 8 10 4 6 4 4 7 4 1 3 8 7 10 6 3 5 2 10 10 0 9 6 0 6 10 2 7 2 3 6 4 10 0 7 2 1 9 7 9 5 6 6 2 1 6 1 1 4 9 6 9 7 9 3 1 5 0 0 0 6 1 10 4 4 1 8 5 1 1 2 0 5 4 9 6 9 5 2 7 3 3 2 10 9 9 1 7 4 10 10 1 3 2 3 0 3 2 3 8 2 6 5 7 1 0 6 3 1 3 9 5 10 0 8 5 4 9 8 10 4 5 5 6 4 10 7 5 2 3 3 9 1 8 7 4 2 0 3 2 8 3 0 9 7 2 2 4 1 3 0 0 8 0 9 5 6 0 0 2 5 8 2 8 4 3 7 6 1 1 4 3 2 0 7 6 0 7 5 0 2 5 8 6 9 10 8 0 2 6 6 0 5 4 0 2 2 3 10 8 7 8 9 9 7 5 4 6 8 6 5 2  [...]
+2 0 9 7 7 1 6 0 10 3 1 1 4 7 6 5 2 4 3 6 10 6 3 1 6 8 2 7 6 5 8 1 4 8 2 10 8 0 1 6 5 0 0 7 10 8 10 0 0 3 2 10 5 7 8 4 3 9 1 9 2 6 0 9 6 0 6 3 5 5 2 8 8 3 8 10 10 0 10 6 0 2 5 9 0 1 0 10 8 6 4 10 10 1 0 5 7 8 3 7 1 3 2 10 3 9 2 8 0 0 5 6 5 9 9 7 3 8 5 6 2 1 3 3 9 9 9 1 4 9 5 5 10 0 9 9 5 3 2 6 2 5 10 5 3 8 3 7 6 1 4 3 8 0 3 3 0 5 7 9 5 7 5 9 2 10 0 4 3 10 0 4 8 9 8 10 6 3 6 7 0 2 9 4 4 7 4 5 5 7 6 2 8 4 7 2 4 1 1 7 7 7 0 0 7 4 10 9 1 3 5 4 4 6 5 6 4 10 7 3 7 3 10 3 7 6 5 5 6 4 3 3 1 1 7 7 [...]
+3 6 2 0 8 6 6 4 9 0 0 9 5 1 6 8 9 7 6 7 10 8 7 5 0 3 3 9 7 10 8 2 5 0 2 3 5 2 10 6 4 0 5 9 3 10 1 9 0 1 5 9 6 4 5 4 6 2 2 2 1 6 5 6 4 1 5 3 10 2 0 2 6 8 5 3 1 4 3 0 5 0 9 9 8 3 2 4 10 8 9 6 0 4 0 7 0 0 4 0 8 8 1 9 5 4 0 5 3 6 1 6 6 1 0 1 6 9 9 1 1 4 3 3 4 9 8 8 0 9 6 3 5 4 6 9 4 8 5 10 8 2 7 2 9 1 3 0 4 4 8 9 6 8 4 6 8 4 2 4 4 2 2 6 0 3 1 0 3 1 4 2 3 10 8 9 3 5 10 4 4 5 2 5 2 6 8 8 0 0 5 2 5 6 4 4 3 9 10 2 5 4 2 6 4 3 6 7 2 7 2 8 9 3 9 7 0 0 8 3 7 1 0 10 5 0 6 8 6 8 7 9 1 1 8 0 6 1 10 1  [...]
+0 4 1 5 8 7 2 9 4 2 4 3 2 4 5 10 3 9 8 10 4 6 6 4 3 5 9 9 8 2 10 8 5 10 3 4 3 10 8 0 0 8 9 4 5 8 3 2 5 4 3 10 1 6 7 4 1 9 4 9 2 0 4 10 10 10 6 2 1 5 2 3 0 10 7 9 2 1 1 7 9 5 2 10 10 7 6 3 7 7 7 7 10 5 7 5 5 5 9 2 5 7 5 0 3 8 10 9 3 5 10 7 8 4 2 3 6 7 1 1 10 0 6 4 8 0 0 6 4 9 3 6 6 1 3 6 5 6 8 4 3 1 8 3 2 9 0 2 10 3 9 1 5 1 1 7 9 9 0 6 2 7 2 4 5 4 6 7 2 3 1 0 0 9 9 9 9 2 2 7 2 10 4 2 4 6 6 10 9 8 9 8 9 4 1 4 10 5 3 0 10 3 6 2 6 4 3 3 2 4 5 10 6 2 3 0 7 2 4 5 3 3 10 2 9 2 7 8 8 10 2 3 5 3  [...]
+8 8 6 7 6 2 0 4 2 9 6 2 7 2 0 8 8 3 9 10 0 1 6 1 3 0 10 9 0 3 0 0 4 0 4 4 3 7 10 5 1 4 4 10 4 6 1 0 10 6 2 6 8 4 3 5 2 2 3 10 10 1 0 1 6 7 3 5 2 5 0 10 10 8 6 9 8 1 9 0 9 7 4 8 9 8 2 8 1 9 0 4 10 1 9 5 1 3 5 2 9 2 10 1 8 9 1 2 5 2 6 4 3 3 1 3 5 10 5 9 3 8 4 4 4 6 8 1 7 0 10 2 7 0 4 4 0 5 4 10 9 2 9 3 7 6 2 3 7 8 6 5 7 5 4 9 8 7 3 4 2 7 1 4 8 4 10 6 4 6 6 7 10 1 2 10 1 9 6 1 8 7 7 3 1 5 10 5 10 6 5 4 3 6 3 2 0 2 4 0 9 0 8 7 5 9 1 10 6 6 9 10 5 3 7 3 9 10 9 1 3 10 0 2 6 4 5 10 3 4 7 7 6 2  [...]
+1 9 7 10 1 4 6 2 7 4 1 5 9 6 7 0 6 4 0 9 8 10 6 3 2 5 0 2 2 6 0 3 7 3 3 0 4 9 9 0 10 1 2 5 3 2 1 1 6 6 6 9 7 4 4 7 6 1 1 3 9 2 5 3 6 1 0 1 9 5 3 3 6 1 7 5 8 0 6 10 0 9 5 9 9 2 3 7 4 4 4 1 5 2 4 5 5 6 7 5 10 8 9 9 10 9 3 4 3 7 0 4 7 1 9 5 3 9 1 0 5 0 9 4 0 9 7 2 7 7 5 10 9 10 5 0 7 10 4 7 1 1 0 10 10 4 3 2 5 0 8 1 5 1 0 9 0 7 0 0 2 7 1 6 9 7 0 8 5 8 3 1 8 0 3 2 10 1 5 5 1 8 2 5 8 0 9 9 2 6 2 4 9 4 9 9 9 4 2 10 0 8 0 8 4 10 7 3 7 10 7 5 6 3 6 4 3 0 10 1 8 10 0 7 1 2 5 1 3 6 1 3 2 5 0 6 3 9 [...]
+10 8 4 7 7 10 8 5 0 3 9 0 2 10 1 0 9 5 8 6 0 4 1 0 3 0 5 3 9 10 0 7 4 3 2 7 9 9 6 1 5 10 0 6 2 2 10 7 7 7 7 0 9 1 1 3 0 9 3 1 4 1 10 2 0 0 0 5 7 6 7 6 2 2 4 4 6 5 9 1 7 2 10 10 2 9 0 10 2 8 10 4 8 1 1 1 1 8 7 3 7 6 0 6 2 1 0 0 6 10 10 10 8 2 9 2 8 8 9 1 10 9 7 3 7 2 3 1 9 8 2 1 10 7 8 3 6 2 5 1 3 8 7 10 10 0 6 4 2 3 4 1 4 0 1 7 0 8 7 6 8 10 2 0 4 6 3 4 2 3 9 5 7 4 0 1 2 4 6 9 7 7 0 2 10 5 0 3 7 3 6 7 1 3 2 0 3 9 3 0 7 0 0 7 9 3 5 1 0 0 6 8 4 7 0 9 7 1 4 8 8 4 0 3 9 0 1 3 9 3 10 4 6 4 2 8 [...]
+1 9 10 0 1 0 8 4 4 4 7 7 8 6 0 1 7 0 5 4 5 10 1 9 7 5 1 4 2 2 6 4 1 0 1 7 1 6 5 3 9 7 6 9 0 4 8 0 8 7 6 2 2 4 4 2 6 2 6 7 7 10 8 10 8 3 3 4 0 10 8 5 8 1 5 4 6 10 4 9 3 10 9 1 8 10 7 1 10 6 10 8 5 3 3 9 9 10 3 2 9 1 2 4 5 4 3 1 0 3 0 9 2 1 8 5 6 9 9 8 8 8 10 10 5 5 8 2 8 1 2 7 7 4 9 1 7 4 8 10 7 10 1 4 5 3 2 9 3 3 6 7 9 9 7 10 1 2 3 9 9 1 3 7 3 3 4 3 2 1 1 1 0 4 6 8 8 7 4 9 7 1 8 4 4 5 3 8 4 0 5 1 10 10 4 5 4 8 8 0 3 4 9 8 0 4 5 1 8 9 4 2 1 7 6 7 10 3 8 1 9 6 9 0 1 3 1 3 9 1 7 0 5 10 6 2  [...]
+8 6 8 3 7 9 9 8 4 3 5 10 1 1 10 6 5 0 0 5 3 9 2 2 1 2 8 7 4 10 5 0 9 7 8 10 7 9 5 4 5 5 1 3 2 8 1 1 5 1 6 3 3 8 9 9 3 2 0 3 9 3 5 4 2 2 3 6 3 1 4 2 6 4 3 3 2 6 7 0 8 5 9 3 0 0 10 2 4 0 9 5 10 4 7 8 4 9 1 6 1 9 1 9 1 4 1 0 3 3 3 8 3 8 4 7 3 0 2 0 10 0 9 4 6 3 4 8 8 1 9 4 6 10 9 2 4 4 5 4 8 6 3 1 7 2 9 8 3 0 8 9 8 0 0 1 1 8 1 5 6 8 5 5 7 8 5 10 9 0 5 7 7 0 3 1 2 6 8 0 9 3 9 0 7 1 6 4 1 0 5 4 7 4 1 8 8 3 5 3 1 0 6 6 6 4 7 4 4 1 4 5 6 8 8 4 4 8 0 4 2 9 0 4 4 8 4 9 7 1 2 0 5 0 2 4 10 7 2 4 7  [...]
+1 0 4 1 9 10 1 1 10 4 6 9 7 2 1 3 9 10 6 3 0 6 8 3 5 9 1 3 10 10 10 10 0 7 7 4 5 0 7 8 3 8 9 10 1 4 4 10 8 3 6 10 7 7 8 4 7 1 0 5 9 1 1 3 5 0 8 8 0 1 0 0 8 6 4 10 6 1 1 10 6 6 0 0 1 4 5 10 2 3 10 8 4 6 7 3 8 7 3 3 2 3 5 6 1 9 9 4 6 8 8 10 1 4 5 4 9 4 6 6 9 0 4 6 5 2 0 8 8 0 2 6 8 2 0 7 7 6 6 5 9 8 0 9 0 9 8 3 9 5 4 9 7 7 8 2 8 0 9 9 8 4 10 3 5 1 1 4 4 7 9 9 0 9 3 2 9 0 4 9 1 0 7 10 10 8 0 1 1 1 1 9 3 6 3 9 1 5 0 0 7 2 5 4 2 5 0 9 5 7 2 6 8 10 10 1 2 2 2 1 8 4 5 1 8 10 7 8 2 10 0 9 4 8 1  [...]
+4 3 10 5 4 6 8 4 9 7 2 8 8 8 3 1 4 5 9 8 0 7 3 7 8 9 3 2 2 7 4 4 10 4 9 9 1 8 1 4 7 0 10 2 6 6 2 0 4 1 9 4 7 4 8 6 8 7 2 10 0 9 4 6 2 9 1 6 1 2 7 4 2 9 8 2 0 4 10 1 10 9 2 1 9 7 8 7 2 4 1 5 6 6 9 2 4 4 4 10 5 10 9 5 1 2 4 2 3 2 6 6 5 8 4 0 5 8 10 1 4 3 3 6 3 8 2 1 3 0 7 6 9 0 6 2 6 9 0 2 10 8 7 8 7 2 5 5 1 7 2 0 3 0 7 3 1 9 8 8 1 4 0 9 7 5 6 4 1 9 0 5 10 4 5 3 2 8 0 10 5 7 1 6 5 1 4 4 5 3 1 7 5 8 6 1 8 2 6 3 3 4 3 2 8 0 6 0 2 2 6 6 7 3 9 5 4 1 3 9 8 5 4 4 9 5 2 10 6 4 5 5 4 5 5 10 10 7 9 [...]
+4 4 0 3 7 5 7 0 5 6 1 5 7 3 2 8 0 5 9 1 1 8 3 3 5 5 4 10 7 2 5 2 0 9 9 10 8 6 4 3 9 7 8 1 2 1 0 1 6 3 3 2 9 6 10 3 9 9 7 8 2 8 1 8 3 6 1 7 10 7 7 9 8 10 0 8 6 2 8 1 10 0 0 1 2 4 5 7 10 0 4 1 0 6 8 7 1 10 1 1 10 3 4 4 5 0 1 4 4 6 3 2 3 9 2 6 2 9 0 2 0 6 1 7 2 1 0 3 3 0 2 10 2 4 0 3 4 0 8 3 1 8 1 0 0 10 5 4 8 9 4 6 4 0 6 9 5 9 8 5 7 6 1 0 1 0 7 10 6 9 8 4 6 0 4 7 1 10 0 7 6 9 7 4 7 10 8 3 0 7 8 10 2 8 1 7 8 10 8 9 7 9 0 5 6 7 2 2 4 5 5 5 7 10 0 0 1 5 7 2 10 5 0 0 7 0 8 6 7 1 6 9 7 10 8 9 9 [...]
+7 5 9 0 10 6 7 8 6 5 0 5 3 6 0 9 7 10 2 0 2 6 6 10 3 1 5 8 9 3 2 2 6 7 0 8 4 5 8 5 5 7 10 9 4 5 3 9 8 4 3 7 6 10 1 3 2 2 2 6 8 10 0 9 5 10 8 1 1 7 2 2 2 9 6 8 6 7 4 2 4 3 0 7 0 1 9 5 0 0 7 5 9 8 5 4 6 9 10 6 1 6 7 8 4 2 10 3 10 3 6 0 2 1 10 6 5 3 0 2 5 1 8 0 2 10 3 10 1 4 6 7 6 7 5 4 3 7 10 1 6 1 10 3 8 4 0 5 7 5 4 2 2 2 8 9 6 8 1 7 3 9 1 2 5 6 8 4 3 9 2 10 3 1 8 3 5 6 6 10 7 4 0 10 0 6 7 4 10 4 7 0 0 6 9 7 10 7 4 2 6 1 2 6 1 6 3 0 3 3 0 3 3 5 1 4 4 5 5 10 9 4 9 9 5 10 4 10 2 2 0 7 1 1 8 [...]
+4 5 4 5 3 10 6 5 5 2 10 7 4 8 10 6 9 3 6 1 7 1 8 4 5 5 3 3 10 6 4 2 4 1 8 1 2 5 2 6 4 7 2 10 10 0 1 2 9 7 9 7 7 8 1 8 3 7 5 2 4 0 0 5 8 8 9 5 1 7 1 1 8 5 8 1 2 3 4 10 7 0 3 10 5 10 10 6 2 0 9 3 7 8 7 8 1 4 4 6 0 6 3 0 1 5 8 10 2 1 4 1 0 7 6 6 5 7 0 4 2 0 7 1 5 2 1 2 8 8 5 6 1 5 3 1 2 1 2 10 4 6 8 1 3 7 10 7 4 7 2 7 7 7 10 1 8 4 4 6 9 1 3 9 6 8 8 1 0 5 8 5 7 7 1 1 9 5 5 4 5 0 8 4 1 0 0 9 10 1 10 7 4 9 7 3 5 3 3 1 4 7 8 4 7 8 4 3 2 9 10 1 8 0 6 1 9 7 1 3 3 10 3 7 3 4 4 1 8 5 4 4 5 10 3 0 8 [...]
+3 7 0 9 2 1 5 0 3 8 5 1 0 6 6 9 6 6 3 4 9 0 8 0 10 3 4 4 0 8 6 7 8 9 8 10 7 6 4 4 8 3 5 5 6 6 1 2 9 0 1 0 9 1 0 7 7 9 7 0 4 3 10 7 4 8 6 8 5 2 4 4 8 3 2 0 8 8 9 4 3 10 5 9 1 4 6 9 2 2 0 1 7 7 6 10 5 8 4 7 7 4 0 7 4 4 8 5 9 7 8 6 4 4 7 0 5 3 9 9 8 3 6 9 9 5 0 6 6 3 7 2 10 6 5 4 10 8 5 6 1 3 9 5 6 4 8 9 2 5 2 2 1 1 2 3 5 10 0 1 1 3 3 3 10 1 1 8 6 8 8 5 1 10 7 5 10 0 8 8 5 9 5 7 1 10 7 9 5 2 0 4 4 3 5 1 4 8 8 6 5 6 6 9 3 7 6 5 5 2 6 6 0 10 2 9 8 10 8 6 7 10 10 4 4 5 7 7 8 3 10 5 6 1 2 3 5 3 [...]
+2 4 6 5 5 2 1 0 2 2 5 3 8 1 3 9 1 10 8 3 5 10 1 9 4 3 9 6 9 10 6 1 0 10 6 2 4 0 8 6 6 6 2 9 2 7 0 1 2 7 4 10 8 1 2 5 0 10 2 5 9 7 10 2 1 10 3 2 5 9 2 1 10 3 5 3 0 5 10 4 8 8 3 10 3 2 6 9 7 4 1 9 9 9 9 2 1 4 9 8 5 9 4 9 10 10 5 0 4 4 10 8 10 0 3 4 7 4 9 9 8 5 5 0 5 7 0 5 7 4 0 4 9 7 3 3 6 3 6 9 8 0 7 10 6 5 9 3 0 9 6 4 4 6 0 7 5 3 9 0 2 9 3 10 6 5 0 2 3 10 4 8 0 2 5 2 0 7 3 0 4 9 0 8 0 10 9 3 1 3 3 5 6 0 1 3 8 7 9 3 9 5 5 1 5 2 1 1 6 8 8 10 0 10 3 10 9 5 8 7 8 7 10 6 1 9 4 5 3 10 10 4 6 6 [...]
+8 9 0 2 7 7 2 9 2 4 8 4 8 2 1 8 7 4 3 6 3 8 9 0 4 9 10 4 10 8 6 3 1 4 1 10 7 2 1 7 6 7 6 4 8 2 8 1 7 7 4 6 7 6 0 8 4 6 4 0 8 1 2 4 5 4 2 7 5 3 5 4 9 10 3 4 5 1 7 6 9 4 7 2 1 0 3 3 5 5 3 8 2 4 5 10 5 9 4 5 2 0 10 1 2 1 6 6 2 5 9 2 0 10 6 3 0 2 5 7 8 2 2 5 1 1 7 8 3 5 10 3 5 5 6 4 4 3 10 3 2 5 4 1 2 1 8 10 3 2 0 5 8 9 4 3 7 5 7 1 5 6 10 6 2 6 3 1 9 5 2 3 10 8 4 10 8 6 1 1 8 8 9 6 5 2 7 4 8 0 10 6 6 7 10 4 5 10 8 7 7 0 9 9 5 10 9 8 0 2 8 3 7 8 3 10 3 7 8 3 2 0 10 3 7 1 8 10 8 10 9 9 4 10 7  [...]
+6 2 8 5 9 6 0 5 0 8 5 8 5 3 8 9 4 7 1 6 7 9 9 1 10 8 3 1 5 5 3 10 3 2 1 4 6 10 10 5 3 0 6 5 1 1 2 10 3 10 1 3 10 9 5 10 3 3 2 7 2 0 3 9 10 9 9 6 2 1 9 4 1 6 10 4 6 7 10 5 0 0 7 5 0 0 8 8 3 8 6 2 2 9 6 10 0 6 2 3 6 2 8 4 8 10 2 9 7 10 2 4 1 1 4 6 2 8 3 0 8 9 5 8 6 4 5 2 2 10 7 5 7 6 2 8 9 1 10 5 0 4 6 1 4 10 4 1 2 10 3 4 4 7 6 3 8 1 5 0 6 8 1 5 6 5 8 4 9 1 5 10 10 3 4 10 7 0 7 2 0 2 3 10 9 4 8 7 8 7 7 0 0 7 8 7 0 3 0 3 7 4 5 0 5 4 9 6 8 1 1 9 2 3 7 2 2 9 7 5 2 2 0 0 0 4 5 1 9 4 7 5 10 3 0 [...]
+7 0 7 10 9 4 6 9 9 5 8 6 10 2 2 6 10 9 2 6 5 7 2 0 9 4 8 1 0 4 7 2 9 4 10 1 8 3 8 9 8 0 4 5 7 9 6 3 3 4 9 2 7 8 0 2 4 2 1 2 8 7 0 9 4 2 4 1 5 10 10 3 2 5 7 10 10 1 0 0 8 9 3 8 3 6 3 8 6 3 6 9 1 8 3 9 10 1 5 6 7 0 1 7 7 5 2 8 6 2 1 7 5 0 9 7 3 9 0 8 8 9 10 9 10 10 7 9 9 10 3 2 0 8 3 3 5 7 0 9 2 10 3 7 4 0 7 9 8 0 1 6 10 0 3 4 5 1 6 7 8 8 1 0 3 8 2 1 0 1 3 10 7 8 3 7 8 1 4 8 5 0 9 9 1 1 10 8 2 1 10 3 4 7 4 6 0 10 10 3 6 1 9 0 0 4 9 4 10 5 0 4 8 10 0 9 3 8 6 0 6 5 5 1 9 2 7 1 3 1 7 7 1 6 10 [...]
+10 0 1 5 5 9 5 9 5 7 4 7 5 6 0 5 5 4 3 8 2 10 6 1 8 10 7 8 8 6 7 5 8 2 5 6 3 5 5 0 4 7 8 1 4 6 9 6 6 6 5 5 3 5 6 6 5 7 8 6 6 1 8 0 0 9 3 4 1 1 4 10 4 0 5 7 0 4 1 6 3 10 7 2 7 0 10 8 7 8 6 7 8 6 3 9 3 10 9 0 7 4 6 1 7 3 9 3 10 7 4 2 5 0 3 10 3 6 3 1 10 6 3 8 9 4 6 1 3 4 8 1 7 10 9 2 0 7 6 4 5 1 6 3 3 1 0 1 3 4 0 7 6 8 4 3 10 4 4 10 4 1 8 2 1 4 1 0 0 4 10 8 2 3 2 4 5 6 10 9 4 6 5 8 3 1 9 9 10 6 5 1 3 1 0 0 8 3 2 2 5 10 9 8 8 6 8 8 6 4 2 4 1 2 10 6 8 4 9 7 2 8 2 1 2 2 2 4 0 0 0 6 6 0 7 8 0  [...]
+5 10 8 8 3 10 5 0 10 4 8 0 6 4 10 1 9 10 10 7 7 0 9 0 6 4 7 7 6 3 10 4 10 7 9 8 5 2 8 1 10 9 10 8 0 6 10 0 2 0 5 8 10 5 10 9 4 10 5 8 6 10 8 3 10 10 2 10 10 0 4 8 5 5 6 4 1 2 3 6 1 1 9 9 1 1 9 9 7 0 9 2 8 9 3 2 6 9 0 3 9 7 8 3 8 9 10 8 9 5 1 7 8 3 9 8 8 1 9 8 8 5 5 1 6 6 7 5 1 10 4 0 3 2 0 3 8 1 3 1 6 1 9 6 10 8 6 10 0 3 3 2 4 5 2 2 9 4 1 6 3 0 6 3 9 9 0 0 6 10 7 9 5 0 9 6 7 8 2 2 1 10 10 1 10 7 9 1 6 8 7 10 6 7 6 4 10 10 10 10 5 3 9 4 4 10 10 0 9 1 10 10 6 0 5 7 2 7 7 8 2 4 7 2 4 3 9 0  [...]
+9 7 1 5 2 1 2 6 8 9 3 6 2 2 6 9 8 0 5 1 7 0 9 5 9 3 1 6 3 4 5 1 1 8 2 4 6 3 3 5 6 3 4 5 7 5 7 6 3 4 8 2 6 1 3 10 6 5 5 5 6 4 0 8 9 4 5 2 7 2 4 5 4 9 10 1 2 3 9 1 9 5 8 8 1 1 1 10 6 4 3 2 0 3 1 1 0 10 1 0 3 8 8 7 7 1 0 6 10 10 4 2 2 2 6 5 10 7 7 8 9 7 5 10 1 0 1 9 5 3 3 8 2 9 0 8 4 1 2 0 1 5 2 8 3 7 9 9 9 10 1 3 6 8 3 1 0 7 9 9 9 5 8 10 5 1 1 4 10 6 6 7 6 10 0 5 10 9 5 0 8 6 0 4 1 1 3 2 6 4 2 3 0 10 5 7 1 3 7 2 2 10 7 10 5 6 7 3 7 5 1 4 0 0 0 3 6 3 10 0 3 8 5 10 2 3 7 3 0 3 9 7 3 5 9 6 2  [...]
+2 5 3 8 5 4 7 9 6 2 2 8 0 2 1 10 2 2 9 3 8 6 4 2 1 5 5 8 0 7 4 9 6 10 4 2 0 4 6 2 5 4 9 8 8 10 7 8 5 5 1 0 5 6 1 0 1 9 3 3 10 7 2 7 0 8 0 10 8 0 4 4 1 6 10 8 5 3 2 7 7 5 6 9 1 3 4 4 3 9 9 9 6 10 4 3 8 3 8 10 1 3 5 8 1 5 6 0 8 2 8 8 8 6 9 3 9 7 3 10 9 10 2 5 0 9 9 1 6 3 8 6 1 5 3 0 1 2 3 6 1 10 4 7 7 9 1 1 7 0 4 1 6 3 9 1 5 9 0 8 2 8 8 4 2 5 3 10 7 6 0 3 6 1 1 5 4 9 7 0 7 5 2 7 0 1 1 9 10 4 0 5 7 5 5 3 5 4 1 7 2 1 7 9 7 9 4 6 3 3 8 10 10 3 9 3 0 2 2 7 10 1 1 1 0 6 9 5 9 3 9 7 10 6 10 1 10 [...]
+7 6 9 0 4 4 6 1 2 9 10 6 0 1 2 4 2 0 8 9 8 0 10 7 10 2 7 5 0 4 9 9 7 5 4 2 2 5 9 1 5 8 9 2 3 8 4 1 8 5 5 6 7 9 2 9 10 6 10 7 7 9 5 7 8 0 10 1 4 1 0 5 8 10 10 10 9 3 2 7 7 4 6 9 4 2 0 2 6 10 1 2 1 3 10 7 8 5 9 9 10 8 10 5 7 8 5 3 3 4 9 1 9 5 7 1 3 8 9 8 1 9 10 0 0 6 8 0 4 0 2 3 3 8 9 7 5 10 3 1 1 2 1 10 5 7 6 1 2 4 2 0 7 1 5 1 7 9 5 4 10 5 5 8 3 10 10 8 9 7 3 10 6 9 6 4 8 9 3 10 8 4 1 8 2 10 7 2 2 8 3 5 4 6 5 6 10 7 10 0 7 6 10 9 4 6 10 1 3 8 6 6 1 0 8 4 5 8 1 6 2 6 9 6 2 7 7 7 7 4 6 3 6  [...]
+9 6 10 8 10 2 1 1 0 10 7 1 7 8 8 4 5 9 9 5 4 5 4 5 3 1 7 5 1 2 6 8 10 5 8 1 10 10 7 10 7 9 9 6 2 7 2 6 6 8 7 1 1 3 7 3 7 7 1 3 9 3 7 7 4 4 0 1 0 1 10 5 1 1 10 4 4 1 7 5 1 6 7 5 6 1 1 0 4 1 5 10 7 7 10 7 8 7 10 2 10 3 1 8 10 9 6 0 3 3 6 4 7 1 6 6 3 8 10 5 8 9 4 3 10 8 10 1 1 4 6 5 5 7 4 4 0 2 4 8 3 9 3 9 2 2 5 10 2 8 3 0 10 8 7 0 7 0 10 6 3 7 1 0 10 10 7 2 5 6 6 6 8 8 10 8 5 5 10 6 5 0 1 0 3 7 9 0 7 10 1 0 2 10 5 9 8 4 4 10 10 1 4 10 7 10 2 6 10 7 2 6 8 6 0 2 10 2 1 5 10 3 2 1 1 10 8 0 4  [...]
+9 0 8 7 10 0 5 6 7 1 6 9 1 2 0 2 5 4 10 0 7 10 10 0 6 1 4 5 0 0 8 3 1 4 4 9 10 2 5 10 1 6 1 1 6 3 6 7 7 8 10 1 4 0 8 5 10 7 0 2 1 8 4 7 1 4 8 0 9 1 3 10 5 7 9 4 3 2 6 10 8 10 3 0 10 6 1 9 4 6 0 2 8 2 1 5 4 7 9 4 6 10 7 9 1 3 3 9 0 9 1 2 3 2 7 6 5 8 5 4 6 2 9 8 1 2 6 1 9 1 7 3 2 7 9 9 1 3 2 1 1 3 5 4 6 1 3 7 4 9 10 9 1 0 4 4 0 9 4 0 1 7 10 2 8 3 8 3 0 10 1 9 8 9 6 4 10 9 5 9 2 3 5 0 1 0 0 8 5 5 3 8 2 4 8 1 1 1 2 3 3 9 5 6 3 2 3 10 9 2 10 9 1 10 9 6 8 1 4 9 10 7 10 1 10 3 5 10 10 6 9 8 6 5 [...]
+7 3 9 10 3 9 0 3 9 1 8 9 10 5 4 5 8 7 5 5 8 6 7 8 4 4 8 1 9 6 2 8 0 4 2 4 9 1 10 4 4 9 6 5 0 2 6 10 9 9 7 10 6 8 9 6 2 6 8 9 3 4 5 1 3 5 6 0 9 5 0 6 7 5 5 2 0 6 3 9 5 2 3 0 5 1 6 2 0 0 9 2 7 4 3 5 8 2 9 8 4 1 4 4 7 5 7 5 2 7 6 4 1 6 7 9 4 4 10 7 9 8 1 10 6 8 8 4 9 4 8 1 4 0 10 7 10 7 6 3 2 3 4 1 6 2 7 2 1 8 8 9 2 3 0 6 3 3 5 4 8 1 7 5 4 4 3 2 9 0 1 6 10 6 5 4 4 2 4 6 6 5 8 1 6 4 6 8 1 10 6 9 4 8 9 0 4 9 3 10 7 9 6 1 10 3 5 7 9 5 7 6 8 10 7 9 6 10 10 1 3 0 9 1 5 7 9 10 7 8 2 1 7 10 9 3 1  [...]
+7 9 9 0 1 9 5 1 7 1 9 10 2 1 1 9 1 6 9 10 3 1 4 6 0 10 6 5 3 2 3 10 1 4 6 5 9 4 9 0 0 0 7 4 9 8 0 9 6 2 9 8 1 5 5 6 4 2 1 5 5 10 7 4 1 1 0 3 1 2 2 9 4 2 0 3 1 10 8 4 5 7 6 0 7 2 6 1 0 4 3 10 0 1 5 9 4 3 5 2 3 9 7 2 1 4 1 7 3 2 9 0 3 8 7 0 7 3 7 6 7 10 4 10 9 5 1 8 3 8 6 1 1 8 3 4 10 4 1 5 9 3 6 6 3 5 3 2 3 10 9 9 10 2 1 8 1 2 3 7 3 1 5 2 10 10 7 4 6 5 5 4 8 4 4 6 0 3 5 1 5 10 10 0 9 0 1 8 8 4 7 1 1 0 7 3 4 4 8 2 7 5 3 3 0 7 9 5 4 7 5 10 7 5 1 6 9 10 10 0 0 10 10 2 10 10 8 9 0 5 2 4 5 7 8 [...]
+0 6 3 2 10 4 0 4 10 6 2 4 6 3 6 8 3 9 10 10 7 0 5 2 0 5 0 9 2 3 2 1 10 10 9 3 6 8 7 1 4 3 1 5 9 5 0 10 8 6 2 0 6 9 0 5 10 5 5 3 0 7 10 7 2 3 8 10 5 7 9 2 0 7 1 5 8 10 4 2 3 9 3 1 10 4 5 8 4 2 4 10 4 7 2 1 1 1 10 3 6 2 5 5 8 5 4 5 5 8 9 3 7 8 3 2 3 9 0 6 3 7 2 0 5 5 4 8 6 0 2 1 1 2 4 8 3 2 1 1 7 4 7 0 9 9 3 8 0 6 3 9 1 10 2 8 2 9 2 1 5 9 10 0 8 7 8 5 7 10 4 8 5 10 4 0 1 5 5 8 7 9 9 6 1 6 1 7 8 9 1 2 0 5 8 4 9 10 3 9 0 6 0 8 10 0 6 4 3 6 1 7 2 4 0 6 3 4 1 5 3 2 5 7 0 7 0 6 0 3 1 3 3 4 6 0  [...]
+9 5 9 0 7 4 5 4 4 1 10 7 9 3 9 9 5 2 9 7 0 9 2 8 5 0 10 4 10 9 1 8 1 3 10 7 2 2 2 6 5 8 9 8 6 5 3 0 5 8 1 8 10 5 2 3 9 8 7 4 5 3 10 2 2 0 9 5 8 5 10 8 9 8 2 3 4 3 2 3 2 0 10 6 10 1 0 0 2 0 10 5 8 3 8 0 5 0 1 6 7 3 5 6 0 0 9 9 2 6 2 0 10 6 7 7 6 9 6 9 4 3 0 10 8 9 2 8 10 3 4 5 1 9 2 8 8 7 7 9 1 6 5 3 4 0 0 9 0 3 5 2 9 1 4 7 6 0 8 10 6 5 0 7 6 0 1 6 8 9 9 9 5 9 2 3 5 1 8 1 2 9 1 0 9 2 7 0 8 0 1 4 0 9 2 0 4 9 10 7 7 3 6 10 2 5 7 3 5 8 2 9 2 6 7 3 9 8 1 9 0 10 6 3 5 6 1 6 8 5 1 3 8 5 8 4 7 7 [...]
+9 3 4 1 4 7 9 4 5 8 7 10 3 2 7 4 3 2 0 0 6 0 5 8 7 8 3 5 0 10 3 7 6 9 2 8 4 1 1 10 10 5 2 8 7 3 8 9 7 8 1 4 8 4 2 2 3 2 3 2 5 10 7 1 2 3 9 6 4 5 7 10 5 2 9 10 4 0 3 9 7 10 4 5 7 8 5 3 3 1 10 0 0 8 1 0 7 8 7 6 3 0 5 2 4 7 1 7 10 3 10 4 5 3 0 5 0 2 9 3 5 1 8 1 8 1 9 2 4 9 6 5 9 10 8 1 3 8 8 6 2 3 10 0 9 1 0 9 8 6 10 2 6 5 6 4 8 4 0 0 6 5 8 3 1 7 4 10 2 5 7 3 10 5 4 5 5 8 3 2 8 4 10 7 8 1 1 6 3 9 4 5 0 2 7 10 1 6 9 10 3 10 1 4 2 5 5 1 0 6 8 8 4 0 10 10 5 8 4 6 7 1 8 2 8 6 9 8 6 7 10 1 8 5 6 [...]
+1 4 2 5 10 5 4 2 5 2 5 1 9 1 1 8 4 6 4 9 10 5 1 5 7 3 1 9 8 6 10 8 10 7 6 8 8 4 9 3 5 4 1 5 5 4 0 4 2 6 0 10 2 0 9 8 0 4 3 4 3 3 6 6 10 6 0 0 7 6 5 1 3 10 6 6 6 5 1 2 8 1 2 1 3 0 9 10 10 9 3 10 3 1 6 3 3 1 4 2 1 3 4 1 2 1 5 4 4 5 6 7 8 7 1 6 5 5 5 3 9 10 1 4 10 10 10 1 5 9 4 0 1 1 0 10 4 6 9 9 5 0 6 5 10 10 1 5 0 7 4 7 1 5 9 9 0 9 10 7 10 4 8 0 1 2 8 9 2 0 0 10 6 9 6 1 8 1 2 4 0 3 8 3 1 7 0 6 2 6 8 1 9 2 7 2 10 9 7 7 3 3 4 2 4 6 6 9 2 0 2 1 5 5 8 6 5 3 4 4 5 5 5 0 9 1 10 5 7 9 2 4 9 3 0  [...]
+6 1 10 8 8 3 5 4 8 1 3 10 5 5 8 1 10 2 3 4 6 0 7 9 8 8 5 4 8 1 4 9 6 0 4 4 1 0 7 10 0 3 5 6 6 5 4 0 0 9 5 5 7 10 1 8 9 9 2 6 9 5 3 4 9 6 7 4 1 10 8 0 8 10 9 3 2 7 2 0 0 3 2 1 7 6 2 2 2 1 10 8 0 8 3 6 6 3 2 7 9 4 8 7 4 0 4 2 8 9 6 4 2 8 9 5 8 0 9 8 2 8 1 10 1 4 2 8 3 5 0 1 8 8 0 1 4 3 4 7 2 7 6 9 1 2 6 0 9 0 4 2 8 1 7 7 9 1 5 6 7 5 5 4 8 0 5 4 1 8 7 8 5 2 6 5 7 2 8 0 0 3 5 7 6 0 7 1 7 6 10 6 9 0 0 2 6 2 4 4 7 2 3 7 8 3 7 2 8 5 2 9 3 2 6 9 1 1 9 9 6 9 8 1 1 10 2 7 3 6 8 2 0 2 9 7 3 6 10 2  [...]
+2 6 9 1 5 4 7 8 5 10 6 10 9 3 0 1 8 2 2 2 4 0 9 9 3 2 3 5 4 6 8 5 8 10 9 3 3 8 6 10 4 0 4 5 8 0 6 10 8 4 0 0 3 9 8 3 1 2 10 0 4 3 8 0 1 3 2 4 0 4 4 6 1 10 7 7 5 3 7 3 6 4 5 1 3 2 7 7 6 5 8 10 2 10 3 10 4 1 3 5 8 3 0 7 2 4 7 4 1 10 0 5 5 6 10 4 9 7 6 5 4 8 4 10 5 10 5 9 9 9 6 10 1 5 1 3 9 6 5 7 8 0 0 3 8 7 0 6 5 2 5 10 7 8 7 1 3 0 7 9 4 0 9 9 8 2 0 6 10 6 5 4 0 3 5 10 9 9 4 3 3 5 7 10 5 6 0 2 1 3 7 0 8 6 10 2 7 9 6 0 3 8 10 2 2 3 7 3 5 6 0 4 1 4 10 3 4 10 8 6 10 1 2 7 7 6 8 5 3 8 0 9 8 2  [...]
+5 5 8 0 10 7 6 10 2 6 10 8 0 10 0 10 1 9 10 2 10 2 4 9 8 2 5 7 3 9 0 3 3 10 5 7 3 2 8 10 0 7 7 0 1 6 1 2 3 3 0 5 9 10 7 0 3 10 4 6 8 5 5 6 0 8 4 7 6 1 0 2 7 10 7 9 1 0 4 5 4 8 2 2 1 6 2 1 7 9 0 2 8 9 4 3 8 4 7 10 7 8 4 7 3 10 10 3 4 8 5 0 1 5 0 9 7 9 4 1 3 1 1 4 1 3 5 3 6 8 9 7 9 7 1 5 3 10 9 6 3 8 7 4 7 10 3 6 6 9 2 4 7 2 0 8 9 7 1 7 0 8 5 0 9 3 9 5 10 0 2 1 1 5 8 10 9 1 10 8 5 7 3 3 0 7 6 10 1 8 7 5 6 8 1 2 4 4 3 10 9 10 6 10 1 5 7 3 2 3 10 4 7 5 3 6 4 6 8 9 9 1 3 1 0 10 0 4 3 10 0 9 5 [...]
+1 4 5 10 5 8 10 9 4 4 2 5 6 6 6 10 9 6 6 3 1 4 10 3 5 2 6 7 7 7 2 10 6 5 10 8 1 4 6 3 1 6 10 3 4 10 2 8 8 4 8 5 3 2 0 1 0 3 0 6 3 3 9 6 5 5 10 3 6 4 5 1 6 3 2 9 5 5 5 4 8 3 10 8 5 6 4 7 3 9 0 2 4 4 8 6 4 4 6 2 9 6 7 7 5 5 7 10 2 9 0 7 3 1 1 3 7 10 10 4 6 1 3 3 8 2 10 5 2 9 6 3 5 6 8 10 10 7 4 8 10 2 7 0 8 3 8 5 0 4 5 2 3 5 2 4 0 10 6 4 10 4 7 8 6 10 9 8 1 4 7 8 0 0 1 0 6 0 7 3 7 10 3 2 0 5 1 10 3 2 1 2 2 6 8 6 9 0 9 8 9 7 6 9 9 7 8 2 0 6 0 0 10 1 8 3 4 9 9 4 2 10 8 10 3 7 1 2 3 10 4 2 1  [...]
+9 2 3 7 8 6 6 4 2 7 3 3 10 8 1 2 2 0 6 3 5 6 6 1 9 8 5 10 0 4 7 9 6 2 3 9 3 0 6 0 7 1 6 2 10 4 2 8 6 5 2 0 3 0 7 4 1 2 6 5 2 10 10 3 1 0 0 5 6 10 1 6 4 1 7 2 2 3 3 5 7 10 0 7 10 3 8 5 5 7 1 7 6 8 3 2 6 8 6 10 7 6 10 4 10 9 4 10 5 5 1 5 3 6 9 0 5 7 8 5 10 6 7 4 2 3 9 9 0 5 10 3 8 4 3 2 10 5 9 2 5 5 2 6 8 5 2 8 0 4 2 4 1 8 10 4 7 10 5 7 0 1 1 0 5 9 1 3 10 1 0 2 3 1 6 9 1 9 8 1 6 5 0 0 4 3 6 4 0 8 6 0 6 1 4 4 6 7 9 2 3 0 8 8 7 3 9 9 8 7 1 6 0 6 10 10 4 8 1 10 8 1 8 1 0 6 9 4 3 10 4 1 9 10 1 [...]
+7 9 8 6 5 10 1 9 10 4 7 6 0 1 3 4 0 1 5 5 1 7 7 0 3 8 7 2 5 5 6 4 10 1 2 7 6 1 7 9 4 2 3 5 5 6 2 5 1 1 1 4 10 9 9 4 4 7 6 2 1 7 7 2 10 3 7 1 9 7 5 3 2 4 2 8 1 9 10 10 1 7 10 4 8 5 10 10 7 7 4 5 0 5 8 5 5 10 10 6 10 3 1 1 6 8 5 3 6 6 1 1 7 1 1 7 4 4 8 5 4 0 3 3 5 5 7 7 4 5 1 9 2 0 3 5 2 10 4 0 5 4 6 0 8 8 8 2 2 7 6 6 5 0 5 5 1 8 1 10 6 0 10 10 2 4 1 1 7 1 1 10 7 0 4 0 6 2 1 7 0 6 7 5 3 3 3 6 6 8 10 6 10 9 4 0 3 7 5 5 0 5 0 2 1 6 3 8 7 10 1 8 10 4 3 2 5 9 8 0 9 4 6 3 2 8 1 0 0 5 2 1 1 10 0 [...]
+1 3 6 1 1 3 2 3 9 2 0 5 6 2 7 2 6 1 7 10 7 6 9 9 5 2 8 8 9 6 10 1 9 5 8 2 6 9 10 1 3 10 1 7 7 2 6 9 10 0 4 8 2 10 6 9 5 4 5 9 3 2 1 8 0 9 3 6 1 9 10 9 0 2 10 5 7 8 1 8 2 7 8 4 5 6 6 1 6 6 1 3 10 8 4 2 4 6 6 2 6 9 0 9 9 0 3 1 8 9 8 3 1 8 5 10 8 7 6 2 8 8 3 7 8 1 5 0 2 7 5 5 9 7 3 2 3 10 8 1 3 8 7 9 10 9 10 1 5 4 8 6 10 9 0 3 6 0 4 8 7 0 7 9 7 8 10 9 9 3 1 7 9 0 6 1 1 7 1 0 7 0 6 1 6 3 9 0 5 10 10 2 4 3 7 0 10 7 8 2 8 0 0 0 7 8 7 6 0 5 5 2 10 3 10 1 9 5 5 7 10 3 10 4 2 6 4 7 8 6 1 6 0 7 6  [...]
+2 3 4 1 1 0 9 9 5 8 8 10 8 4 8 5 4 9 1 6 9 4 6 3 8 7 2 3 5 8 8 9 3 4 2 10 5 4 2 6 7 7 5 0 9 5 7 1 4 7 4 10 2 7 0 1 5 8 8 1 9 4 9 8 6 4 6 8 4 7 6 7 4 4 10 10 10 9 7 10 9 4 10 8 1 4 9 5 3 3 7 2 2 7 5 3 0 3 0 8 8 8 9 6 4 9 5 3 9 3 1 9 4 2 5 3 5 8 6 2 1 5 7 4 10 0 10 6 4 1 7 3 2 4 0 7 4 2 7 3 8 10 1 1 1 6 2 7 8 5 8 10 5 9 3 8 4 1 4 3 4 9 9 3 4 7 8 9 1 2 9 2 0 8 8 8 7 0 6 6 1 2 10 10 4 6 0 4 3 3 5 0 8 2 6 5 4 10 3 2 2 0 2 8 0 6 10 8 6 3 2 2 4 2 2 5 5 3 6 9 3 9 9 2 3 5 10 3 3 0 4 10 3 1 3 5 5  [...]
+2 10 0 7 6 3 10 4 6 6 1 10 0 3 8 2 3 9 10 4 4 8 9 3 2 6 7 5 2 6 6 4 2 6 9 8 2 10 0 2 6 7 2 0 5 4 1 10 4 8 4 1 2 4 2 7 8 0 1 6 1 2 2 2 8 1 3 0 4 7 4 6 6 10 2 1 6 2 8 7 5 3 10 6 0 2 5 8 6 4 3 5 5 8 4 8 9 7 0 10 8 3 10 3 1 10 9 5 5 7 3 7 3 5 0 2 0 7 10 4 9 0 5 4 4 10 7 6 2 5 3 1 2 3 1 6 1 3 6 6 7 2 5 1 6 7 2 10 2 5 0 9 10 2 7 4 1 1 5 0 10 10 5 1 8 8 2 10 6 6 10 2 7 5 1 8 8 8 3 2 7 1 1 3 0 6 5 3 10 4 7 6 0 5 1 7 7 7 5 6 8 10 8 5 9 6 7 0 6 8 0 7 7 6 7 1 2 0 4 3 0 5 4 4 4 6 0 5 0 8 1 2 3 7 8 9 [...]
+9 9 7 2 7 9 9 9 1 6 7 0 7 0 6 9 10 2 9 7 10 1 8 1 4 0 5 7 1 6 9 1 2 0 9 1 9 9 2 5 5 2 10 3 2 8 7 2 10 2 10 9 9 7 3 9 0 3 7 4 7 6 3 0 7 3 2 6 0 2 3 0 2 10 0 2 6 3 5 10 6 10 6 5 1 7 8 7 9 4 4 1 1 5 9 8 1 6 4 8 5 3 5 1 9 7 4 10 9 10 10 2 10 5 0 1 3 7 9 6 3 9 4 8 4 10 5 5 7 2 5 3 7 7 0 5 3 5 8 0 8 6 9 9 7 0 6 5 8 7 0 8 6 6 3 0 0 1 3 7 4 1 2 3 0 7 1 6 8 4 4 8 10 5 10 4 9 0 9 6 6 3 8 2 8 8 10 3 4 10 5 3 7 2 3 7 9 2 0 6 2 8 6 6 0 0 7 8 6 10 0 6 9 0 6 6 7 5 8 7 2 5 3 8 8 9 7 3 5 4 1 2 5 10 5 3 6 [...]
+1 0 6 6 0 8 1 2 1 5 10 8 9 0 9 6 5 1 1 7 5 9 4 0 2 8 8 7 0 8 3 7 3 10 6 4 7 8 3 5 5 7 3 4 1 7 5 0 8 0 3 2 0 8 4 6 10 0 10 6 10 1 7 4 9 2 0 10 6 1 6 7 1 8 0 10 1 6 1 8 3 5 5 7 7 6 8 0 0 10 2 2 5 0 10 1 2 9 1 1 1 1 1 2 4 0 10 7 8 3 1 8 7 1 3 5 2 5 4 0 3 2 3 4 8 8 7 0 1 1 1 10 1 3 3 10 6 6 2 5 1 2 5 6 2 4 7 9 4 6 5 9 5 2 6 5 5 1 1 8 7 10 0 3 10 7 10 5 7 2 0 3 1 3 6 7 1 1 4 8 10 7 7 1 6 10 10 5 7 5 5 9 6 1 5 1 8 7 7 3 3 2 4 2 3 3 1 0 0 3 3 10 7 1 3 2 9 3 10 6 6 4 5 7 1 5 8 3 9 0 5 9 0 5 7 8  [...]
+1 10 4 9 3 3 5 3 1 5 5 0 5 1 10 8 10 4 3 5 0 5 8 5 8 0 9 6 7 4 2 6 2 8 8 4 2 10 8 9 0 7 5 1 8 3 0 6 6 8 4 7 4 0 8 8 3 3 8 1 3 0 4 5 3 5 4 1 6 1 0 6 9 5 2 10 3 0 1 4 6 6 3 9 4 6 9 10 8 9 8 1 4 5 4 7 5 10 0 3 1 7 1 8 10 8 10 7 8 5 5 3 0 7 6 0 7 1 1 8 6 5 3 4 5 7 3 3 4 8 9 1 0 2 3 7 5 5 0 9 5 4 1 0 6 6 1 10 3 2 2 8 4 4 7 1 4 9 7 10 0 7 4 5 6 0 3 1 10 3 5 5 0 8 9 1 2 4 5 1 6 0 5 2 0 6 2 0 6 3 9 6 2 7 9 9 5 2 6 8 3 5 2 10 1 9 4 7 10 0 10 1 1 3 0 4 0 7 10 6 5 6 8 9 9 7 2 3 8 5 4 6 0 0 4 0 6 1  [...]
+6 1 1 8 6 7 7 0 3 0 5 5 10 2 3 4 8 10 7 6 9 7 2 5 10 6 3 10 2 1 7 10 7 4 0 4 8 6 4 6 2 5 1 4 3 2 1 8 7 0 5 3 0 8 9 2 3 7 8 10 9 3 2 4 1 6 5 8 10 0 9 10 6 8 0 0 5 7 6 5 5 9 4 8 1 10 7 10 1 3 10 0 8 0 6 7 8 5 3 6 0 6 9 6 5 2 3 5 5 5 7 2 4 10 5 10 7 4 6 8 8 6 7 7 7 0 9 5 0 9 6 0 10 8 9 5 7 9 3 9 2 6 2 10 2 4 8 10 7 8 5 9 3 9 1 10 7 3 5 2 0 4 4 6 6 0 4 3 6 0 7 4 2 2 4 1 0 0 0 9 9 8 6 2 6 10 4 8 0 7 6 2 1 3 9 7 3 7 2 7 0 4 8 8 8 5 0 4 7 4 6 1 4 1 8 2 8 4 3 6 1 1 7 10 0 7 5 2 9 0 9 7 0 7 4 7 0 [...]
+10 6 6 4 8 0 6 0 8 5 10 1 7 9 3 9 7 0 4 9 6 10 7 9 7 1 0 5 5 9 8 3 8 8 2 1 0 9 4 8 10 8 1 6 8 6 1 6 9 10 3 2 3 3 4 9 3 1 9 10 5 7 9 9 0 8 4 5 9 5 3 1 9 4 4 0 3 0 5 2 9 5 3 0 7 1 4 9 3 7 6 5 3 6 7 4 4 9 3 2 6 1 10 1 10 4 0 5 7 3 0 6 3 3 9 3 2 1 10 2 10 8 10 5 0 3 9 4 1 3 5 2 7 0 4 4 4 1 0 1 5 5 6 3 7 8 3 5 6 4 6 4 6 4 3 10 4 6 8 8 6 9 3 0 7 7 7 1 7 10 5 6 10 4 8 0 5 9 2 2 0 6 8 6 1 9 3 2 5 2 1 7 3 7 6 7 5 2 6 7 6 1 5 9 0 4 6 6 7 2 5 1 4 7 2 10 7 10 0 2 3 6 5 1 8 8 6 3 5 1 4 0 9 2 3 8 2 5  [...]
+7 10 4 2 1 7 4 7 1 5 6 7 0 4 9 10 2 1 5 8 8 0 7 9 7 5 8 2 1 9 3 2 10 0 8 9 10 3 9 2 3 3 4 7 4 1 0 3 7 5 9 10 3 10 3 2 9 4 0 2 10 8 9 7 3 4 3 4 9 0 10 10 4 1 8 0 1 8 2 9 8 6 7 8 7 10 9 3 3 10 8 5 5 6 10 1 1 10 7 2 3 8 5 8 4 4 4 0 6 10 5 0 0 6 4 10 9 6 3 7 3 5 2 9 10 10 10 7 4 2 1 6 3 4 10 0 4 8 8 10 7 0 5 5 1 4 7 9 1 7 4 1 3 9 2 4 1 3 6 1 9 7 5 1 10 8 4 0 6 5 5 5 5 8 9 7 8 2 7 7 0 10 1 8 4 4 7 6 1 3 5 9 9 6 10 8 10 2 4 5 5 8 10 10 7 6 5 6 8 9 0 5 7 1 10 5 3 3 3 1 10 0 9 6 1 10 8 9 8 8 8 1 [...]
+3 0 9 4 8 5 8 10 1 8 5 3 8 5 7 0 3 4 5 6 10 3 1 9 9 3 3 0 4 0 6 6 6 3 3 10 9 10 1 7 6 2 4 3 2 8 2 9 8 1 8 3 6 6 6 10 4 3 7 10 1 6 7 2 2 4 2 9 6 6 9 6 3 3 2 6 9 5 8 2 10 6 2 10 4 8 10 2 2 8 6 7 4 0 9 6 3 5 7 10 10 1 5 10 2 2 1 0 10 10 2 9 10 9 1 10 3 3 1 10 10 5 2 9 2 1 1 2 8 4 1 0 3 10 0 8 6 5 0 10 10 1 3 2 6 10 2 7 1 2 4 4 2 9 0 5 8 9 1 7 7 3 1 3 4 7 6 0 4 7 3 6 9 1 0 3 4 0 10 2 2 2 2 6 4 0 0 5 7 7 1 9 7 7 8 2 6 3 7 0 8 1 9 9 10 8 8 6 6 5 1 5 9 1 4 8 5 3 9 2 4 3 7 3 4 4 4 4 4 4 10 0 8 4 [...]
+3 0 10 1 5 10 9 1 2 2 6 9 6 5 10 9 8 6 6 3 4 4 10 1 1 4 5 5 10 4 2 1 3 4 3 3 6 5 6 2 8 8 9 7 1 5 10 4 9 6 2 2 7 4 0 3 10 5 8 5 10 0 7 5 3 6 7 7 8 2 7 10 0 10 1 4 1 10 5 6 5 0 4 4 6 6 5 4 10 2 0 3 8 2 3 2 0 9 4 1 3 4 9 8 5 2 0 8 1 1 9 9 3 8 8 5 6 0 9 3 6 7 1 3 8 6 4 1 7 9 9 8 7 10 4 3 1 8 4 6 3 3 0 7 10 7 10 9 4 4 3 10 0 9 3 4 8 1 8 8 10 5 4 8 5 0 0 8 10 1 5 10 8 1 4 4 9 4 1 10 6 7 4 9 6 3 10 4 7 9 8 5 3 5 10 6 7 4 2 3 7 8 2 0 6 6 1 3 6 5 10 8 2 9 1 8 8 9 3 6 8 8 8 4 4 2 0 0 6 0 1 6 9 1 1 [...]
+4 0 9 8 2 1 9 0 9 1 1 0 5 6 1 2 4 4 2 8 3 8 10 10 1 4 4 0 8 7 3 4 8 6 8 6 3 5 7 8 3 10 4 4 5 3 6 6 6 5 10 5 7 6 7 0 4 6 10 10 4 0 9 8 4 2 10 5 3 5 8 4 4 2 10 9 0 4 0 9 1 7 10 10 5 2 5 4 5 4 6 0 3 0 0 8 6 7 7 0 3 3 0 9 0 3 6 0 6 4 2 4 0 2 10 1 6 7 6 9 2 8 5 4 3 1 0 10 2 4 2 7 6 8 10 5 6 10 2 0 10 2 0 4 9 4 3 6 3 1 9 5 2 4 9 8 3 5 2 10 7 3 8 7 10 3 10 3 10 3 4 7 4 1 9 4 10 9 6 9 2 9 4 0 7 7 7 10 7 2 7 2 7 2 2 3 4 1 8 2 10 0 1 3 9 2 8 2 2 1 1 8 4 0 6 3 2 9 6 3 9 9 6 9 10 4 3 1 5 4 9 1 7 8 7 [...]
+7 9 6 9 5 8 10 7 1 5 5 10 1 8 10 6 8 6 0 7 2 8 3 7 6 4 2 10 3 3 4 4 5 3 3 10 10 9 8 3 2 1 7 7 5 6 2 2 9 6 10 4 1 6 0 2 7 2 9 1 8 3 7 9 5 5 5 6 3 7 8 5 10 1 4 7 8 6 0 10 5 1 6 4 8 8 9 10 10 9 4 0 7 2 0 5 0 5 7 5 9 5 10 7 4 5 6 2 9 8 5 6 4 7 8 3 4 5 4 2 5 0 6 10 7 5 8 9 10 5 4 10 2 9 10 5 1 9 1 1 10 9 2 4 0 1 9 4 2 1 10 3 4 2 3 8 5 0 6 8 4 3 8 9 8 0 10 2 10 7 1 10 4 2 3 8 3 10 9 7 4 6 3 9 9 5 8 3 5 0 7 6 2 6 4 7 2 8 9 0 6 8 9 10 5 9 2 2 4 9 10 2 10 2 3 6 6 2 10 3 10 1 7 5 5 1 2 3 4 6 3 6 5 [...]
+8 3 4 9 4 10 0 6 2 5 10 3 1 0 1 7 7 7 8 5 1 2 6 6 5 4 2 3 5 4 8 4 9 4 7 10 6 0 2 3 5 8 3 5 3 10 5 10 7 4 5 0 9 10 5 5 5 1 3 10 9 0 8 4 10 6 3 0 6 4 8 0 7 9 3 8 10 6 10 0 1 6 0 3 6 8 2 10 2 6 8 4 0 0 3 0 3 5 0 9 4 5 2 1 2 2 0 6 10 2 10 9 5 5 8 3 8 6 4 4 10 7 3 5 3 0 9 2 0 10 2 6 2 4 10 9 7 6 9 3 0 9 8 5 3 1 6 7 0 7 2 0 0 10 0 0 3 0 3 9 2 1 6 3 7 3 3 2 10 2 7 0 10 3 10 2 9 7 10 7 4 0 10 8 2 5 2 2 6 6 3 8 7 7 5 0 3 2 0 6 5 1 0 6 6 8 10 7 8 8 2 6 8 9 0 0 8 4 9 4 1 3 5 4 2 3 3 4 4 4 5 4 3 4 2 [...]
+9 1 2 3 3 1 3 4 2 9 8 0 8 8 0 6 6 1 0 6 7 7 0 4 6 4 1 2 1 10 9 10 8 4 10 2 0 4 10 7 9 10 5 3 10 9 9 10 3 1 2 3 8 2 2 6 9 5 0 3 3 9 9 7 2 8 9 3 10 7 5 2 3 4 8 3 2 5 6 0 9 2 7 4 6 0 3 4 3 5 5 1 8 8 5 10 7 9 0 6 3 6 4 0 6 5 0 3 4 1 10 3 6 9 5 1 1 4 2 0 2 3 2 7 8 6 5 0 3 4 3 3 8 3 1 7 5 1 2 0 6 3 6 7 3 10 3 10 7 7 5 10 9 0 7 3 1 7 3 7 6 4 5 9 7 7 6 5 5 2 2 7 10 3 7 7 0 7 3 3 7 3 9 9 3 0 8 2 8 2 3 3 0 6 5 8 7 10 6 5 9 10 5 6 0 1 10 1 3 7 10 6 8 8 5 2 10 8 8 7 10 6 2 1 9 6 9 4 1 0 6 0 6 0 8 4  [...]
+6 5 4 4 1 8 5 2 9 6 2 9 5 6 2 2 1 1 7 6 3 7 0 6 10 8 7 9 1 2 1 3 2 4 9 3 4 0 7 0 3 1 4 7 5 4 9 3 4 5 1 7 5 8 5 0 2 4 3 0 4 4 3 7 5 3 2 7 4 8 2 0 0 5 2 5 3 5 1 3 10 6 7 0 7 6 4 8 8 7 10 9 0 7 3 6 3 4 10 3 3 6 3 9 5 7 2 5 4 2 5 3 8 0 8 3 5 0 4 9 8 1 5 7 10 7 4 1 2 0 2 10 10 4 2 2 10 2 0 8 3 10 6 3 9 5 0 5 6 7 4 3 0 8 0 4 0 2 4 9 1 7 6 5 0 4 1 10 3 10 1 6 1 9 9 5 6 3 10 3 2 8 10 7 0 5 0 0 10 8 2 5 4 1 8 1 8 0 8 1 10 10 5 2 0 9 3 6 10 2 8 3 2 7 3 1 0 8 0 8 0 4 4 6 6 10 9 4 9 0 7 4 4 2 1 7 5  [...]
+7 7 3 0 10 8 1 8 10 8 0 8 3 7 7 9 1 6 6 3 1 7 6 1 9 0 10 3 5 2 3 7 10 7 9 2 1 5 0 7 2 9 2 2 3 1 0 0 7 7 1 8 0 1 8 6 4 0 4 6 6 7 5 8 3 3 8 6 10 6 10 6 2 10 5 5 7 10 4 0 2 2 8 5 8 6 3 9 4 5 8 9 5 8 2 0 3 10 0 7 6 7 1 3 1 7 5 1 10 10 9 1 3 2 10 3 8 10 0 8 1 4 7 5 0 4 9 2 10 10 10 1 4 2 6 2 3 9 7 6 0 9 4 5 0 6 7 1 0 0 2 6 8 0 0 9 6 5 3 0 2 8 6 5 9 5 9 9 9 7 7 10 4 5 6 3 0 6 1 4 3 10 6 6 8 0 0 2 8 6 4 2 8 5 10 9 2 1 9 0 5 5 0 0 1 9 2 8 8 8 7 2 6 6 1 2 2 0 3 6 3 6 2 4 5 5 1 9 8 10 5 0 6 1 3 7  [...]
+3 4 9 7 7 0 8 7 2 8 2 10 6 2 3 9 10 10 3 8 8 6 1 0 4 10 4 9 6 8 2 0 7 7 3 8 8 4 3 0 2 6 2 3 6 1 3 1 2 4 8 0 9 7 9 3 7 3 10 10 4 6 1 1 1 2 4 3 8 6 0 9 2 0 4 5 5 8 7 6 1 2 0 6 8 3 0 10 6 4 8 1 1 8 1 0 0 10 2 10 5 2 8 10 9 4 5 0 10 9 0 3 8 4 9 4 7 4 9 5 0 2 1 5 7 8 5 0 3 0 3 9 7 7 0 3 8 7 3 10 6 7 7 6 9 2 4 4 7 0 6 2 7 8 7 8 6 4 2 5 6 4 9 1 9 0 0 8 4 10 9 9 10 6 1 3 8 10 4 2 7 2 10 9 8 2 0 8 5 6 2 10 8 9 10 9 0 3 10 6 6 2 0 5 2 5 4 8 6 4 9 9 5 5 4 2 2 0 7 4 4 5 5 8 8 8 4 2 1 9 4 8 0 1 7 10  [...]
+2 7 7 1 6 10 7 8 9 1 7 6 8 6 8 7 3 6 5 7 10 9 0 8 4 9 6 8 3 6 10 7 3 2 10 8 9 5 5 2 10 2 5 6 9 3 7 0 6 1 7 0 9 0 0 6 0 6 5 5 4 2 0 4 6 8 6 0 7 2 2 6 4 8 6 6 6 7 8 6 2 5 6 0 8 4 10 3 1 10 4 7 6 4 0 3 8 10 2 3 2 0 2 6 0 5 4 9 4 0 3 0 1 10 1 6 8 0 6 0 7 10 1 3 7 7 0 10 1 10 6 0 10 8 6 8 5 0 4 10 0 5 0 3 3 9 10 0 5 7 4 6 5 4 9 0 6 2 8 6 1 7 3 5 5 5 0 8 4 5 7 4 7 3 10 0 10 2 2 4 8 2 1 9 7 1 7 0 9 0 3 1 8 0 7 10 6 9 5 6 0 0 1 2 8 4 4 8 0 6 10 1 6 1 8 7 2 6 2 5 5 3 4 1 5 2 0 6 7 9 8 0 9 4 0 7 1 [...]
+10 4 3 8 8 0 10 5 7 2 7 1 0 5 2 4 8 9 10 10 7 1 1 6 8 0 1 8 0 0 10 5 6 1 1 6 4 3 8 5 6 1 0 0 8 1 8 3 6 2 8 1 5 5 10 8 3 1 5 6 10 10 8 3 9 1 3 10 10 1 6 6 3 10 7 2 4 4 1 6 2 4 1 4 0 3 5 10 0 7 9 9 9 4 5 3 9 4 2 1 7 5 6 1 5 9 4 7 3 0 4 2 8 2 9 10 5 8 5 6 9 10 9 8 4 1 2 1 4 4 6 2 7 4 7 4 10 9 4 8 4 5 10 0 7 3 0 3 0 4 3 4 9 10 3 4 4 5 0 7 6 8 7 2 10 4 4 6 10 6 9 6 10 2 3 7 0 5 5 6 3 3 5 7 3 6 8 6 6 0 3 9 5 4 6 8 6 2 9 8 1 9 9 5 4 2 7 0 0 9 6 1 9 4 7 0 3 9 0 4 9 2 9 5 5 10 3 6 7 9 2 6 6 10 10 [...]
+2 4 6 6 4 0 3 7 0 6 6 9 0 3 4 8 5 3 8 7 6 4 7 10 7 6 6 9 10 5 10 3 1 0 7 0 9 1 1 0 4 2 2 4 2 9 3 10 0 6 9 10 7 4 0 4 4 8 10 5 7 4 2 1 2 5 3 1 5 10 6 9 3 0 4 8 10 5 0 2 2 7 3 2 7 5 0 2 3 0 10 9 8 6 0 10 4 6 0 9 7 7 7 6 10 5 2 9 8 9 7 4 6 0 10 4 1 7 3 1 8 5 8 10 6 5 10 10 0 7 6 10 7 8 8 0 0 9 9 8 5 7 2 5 7 0 10 10 2 10 6 3 1 7 5 5 7 0 6 2 3 4 6 1 9 9 9 8 10 7 4 10 2 3 5 9 6 7 5 6 7 7 7 8 9 3 7 10 7 7 1 4 0 6 5 8 1 8 2 10 10 3 2 4 4 7 1 6 6 5 0 6 7 5 3 8 5 6 9 2 4 5 8 2 9 7 8 0 1 4 10 2 4 6 [...]
+7 0 0 7 8 10 8 5 2 4 2 7 8 7 10 0 4 4 2 2 3 8 5 0 9 5 4 4 6 2 2 9 6 2 7 4 10 7 2 8 5 0 2 7 7 9 3 9 5 5 1 0 2 1 7 7 5 7 1 2 0 1 9 8 7 8 3 7 0 4 2 10 4 1 6 6 1 2 10 4 5 6 2 6 5 2 5 8 2 2 10 1 7 5 10 1 8 7 7 9 8 7 3 0 2 5 1 5 8 5 10 0 6 9 5 1 8 2 10 6 8 2 3 3 1 10 7 9 5 3 6 0 0 10 0 0 3 8 4 9 0 8 1 7 10 1 6 2 5 9 7 3 10 3 1 1 4 9 0 2 0 3 5 1 8 6 4 8 0 3 6 1 6 10 7 5 4 3 1 9 2 3 3 2 9 8 6 8 0 8 0 7 4 6 6 6 3 2 2 7 1 6 8 8 8 0 6 2 0 1 6 2 7 3 1 3 5 7 7 3 0 0 10 10 4 9 9 10 8 2 4 0 7 1 10 4 10 [...]
+6 4 4 8 3 1 0 6 8 10 8 7 5 7 10 8 10 1 4 6 10 4 1 1 0 3 7 2 9 6 7 8 3 1 9 6 2 10 8 0 7 4 8 8 4 4 5 6 10 7 8 6 10 4 7 5 10 1 6 9 8 9 3 5 6 7 3 4 7 9 1 10 0 5 0 7 1 9 6 6 4 3 6 5 3 10 9 6 5 7 5 3 2 10 10 1 9 9 2 0 3 8 9 2 1 5 6 6 4 3 1 2 8 4 7 2 10 3 4 1 4 10 10 10 6 8 5 3 4 7 1 3 4 0 6 9 10 3 3 2 2 10 8 5 6 9 0 7 8 1 2 2 6 7 0 8 4 5 2 4 3 2 5 6 0 8 10 6 9 7 3 3 10 9 9 4 3 5 0 2 8 5 2 0 8 7 0 6 6 6 0 10 7 7 8 6 6 2 4 8 3 10 8 6 0 0 6 0 1 10 10 8 9 4 4 0 4 10 5 9 0 1 1 3 5 5 4 6 10 4 7 2 3  [...]
+10 3 1 0 6 10 8 5 3 3 6 2 9 1 8 7 5 10 5 4 2 5 0 2 5 1 5 10 6 0 8 7 3 6 6 8 6 4 3 8 9 5 0 5 9 2 3 6 4 7 7 6 6 9 1 5 9 5 5 4 0 1 4 7 2 1 4 4 0 10 9 0 5 10 5 0 6 9 10 5 9 8 9 3 4 6 0 4 10 10 2 7 10 7 0 10 6 10 9 10 3 3 6 2 4 10 7 3 1 10 5 8 4 1 9 8 8 0 5 7 1 7 9 1 1 3 10 7 6 0 8 10 2 6 8 7 7 7 10 3 2 4 4 1 5 4 6 10 6 3 7 4 6 10 0 9 1 7 2 5 3 9 2 8 2 1 5 4 1 5 3 8 7 2 5 7 6 8 6 0 5 6 3 6 7 10 5 3 8 10 3 0 4 2 9 5 5 1 9 8 7 2 8 4 6 9 5 1 10 8 6 3 9 8 3 2 0 0 5 10 5 4 9 2 0 9 3 1 10 2 5 3 8 4 [...]
+3 10 0 10 2 10 8 8 2 5 8 3 1 9 1 4 5 0 7 7 8 6 10 2 5 10 1 8 4 4 7 6 1 4 1 3 8 2 0 7 6 4 5 10 9 2 9 6 2 0 4 6 6 2 7 4 1 5 7 0 5 7 3 8 9 9 9 9 0 6 0 7 8 2 7 5 8 2 6 2 4 9 7 7 10 8 7 1 10 5 3 1 5 3 3 0 1 5 7 1 4 3 0 0 4 4 10 5 7 10 10 0 4 0 8 3 7 10 10 0 8 10 4 0 9 2 6 8 8 4 9 2 9 4 6 7 8 3 5 7 10 10 3 2 1 7 8 4 9 5 2 5 9 7 6 4 4 2 6 5 0 10 6 8 8 1 0 2 10 1 8 6 2 10 7 5 1 2 8 2 2 0 10 0 1 1 5 2 0 0 3 10 6 5 0 1 10 1 10 2 10 8 7 9 10 6 0 6 10 6 7 1 8 6 8 0 9 8 8 7 7 7 5 2 1 7 8 6 9 5 8 2 1  [...]
+7 0 2 9 3 2 7 2 1 6 0 0 7 4 6 7 2 7 2 0 9 5 3 10 0 1 8 10 9 6 2 5 7 3 9 3 4 6 5 1 6 4 5 3 4 6 10 2 8 5 5 1 5 3 6 3 1 2 3 1 4 9 10 0 1 1 1 6 5 2 10 0 8 6 10 9 5 9 8 10 10 6 6 2 10 5 2 8 7 0 0 4 4 9 5 6 4 8 0 0 5 7 5 5 7 8 6 7 8 9 8 7 0 8 5 6 2 6 3 8 6 9 3 9 0 1 9 10 0 10 5 3 7 3 6 2 6 2 6 4 2 5 10 0 10 0 4 4 10 4 8 10 10 5 8 2 5 6 6 5 3 1 6 10 3 9 4 4 10 4 10 6 6 9 6 10 7 9 8 1 9 4 4 0 6 8 2 2 7 6 10 1 1 1 5 5 8 8 2 6 6 3 2 9 1 1 6 2 8 3 1 8 4 1 3 5 4 10 1 8 3 4 7 6 5 1 4 2 7 2 8 10 5 8 1 [...]
+8 9 9 6 8 2 8 0 5 0 1 3 5 4 10 10 5 0 2 5 10 3 5 5 0 7 7 5 3 9 4 9 1 3 5 8 4 5 4 3 3 3 0 3 1 5 9 9 5 1 6 1 9 6 8 6 8 7 4 6 8 7 6 3 1 8 4 6 8 5 8 8 0 6 0 3 10 7 10 0 3 7 6 0 0 5 7 8 1 7 3 3 0 2 2 1 8 10 5 10 8 1 9 2 9 8 7 9 3 9 9 5 10 8 1 3 9 6 7 9 5 8 9 0 3 3 5 9 2 6 5 5 5 0 2 8 2 6 7 8 7 8 2 2 5 4 3 10 8 4 9 6 5 6 4 1 6 8 7 7 3 8 0 0 7 2 8 1 6 0 10 1 8 1 8 7 2 2 4 5 3 1 7 10 0 10 10 4 7 9 7 1 4 4 7 9 2 4 10 8 8 2 6 5 6 9 7 7 9 9 8 4 9 4 5 4 6 5 6 8 10 7 6 7 2 6 9 8 10 2 8 6 5 7 7 7 1 1  [...]
+4 2 1 3 2 2 8 0 8 1 4 6 7 7 5 3 1 4 3 6 1 5 3 4 6 7 2 0 0 6 0 8 2 3 5 7 2 3 7 4 1 10 7 6 9 2 3 5 2 7 3 9 8 7 2 6 3 8 5 5 0 0 9 5 1 7 0 0 10 4 8 10 8 5 8 1 8 3 9 10 2 0 1 6 7 2 1 10 8 8 3 7 5 2 8 8 10 7 5 6 8 8 9 7 6 2 10 5 8 3 1 8 6 1 6 2 5 6 7 2 0 10 2 7 4 5 2 3 9 1 3 10 0 0 2 9 8 1 2 9 4 9 8 3 6 5 8 3 0 6 3 2 3 3 7 7 8 10 3 10 5 0 10 10 8 3 10 2 0 0 3 0 6 10 10 5 4 6 0 5 2 4 5 1 6 4 3 2 6 0 1 3 1 7 1 1 9 0 2 9 8 7 6 5 1 4 5 3 4 8 6 1 4 2 2 3 3 1 4 1 1 1 6 2 3 7 6 5 4 6 2 6 7 6 2 5 6 1  [...]
+6 3 4 1 10 4 2 10 0 7 6 3 9 5 0 10 0 1 0 7 3 6 8 7 9 6 9 9 9 9 10 9 5 10 4 1 2 1 3 4 9 7 6 0 1 1 3 4 7 10 9 4 6 0 6 9 3 1 6 4 7 2 10 5 9 10 4 5 8 9 0 10 10 2 2 2 1 1 9 5 1 4 7 6 5 0 7 1 9 6 2 9 6 3 9 6 1 1 8 1 8 9 2 2 5 4 8 9 4 2 3 1 7 9 6 5 8 1 6 7 6 6 0 0 5 4 7 7 4 8 4 0 3 9 3 9 4 6 9 2 3 5 1 9 3 0 1 7 2 5 9 3 7 6 0 6 8 10 9 0 1 6 2 3 3 7 0 10 5 0 9 8 0 9 7 1 0 7 10 8 6 10 3 8 10 3 10 2 7 6 3 0 8 2 10 4 1 6 3 0 5 3 1 4 8 7 3 1 4 3 8 3 5 2 2 1 10 2 8 5 9 7 9 7 1 8 4 8 3 0 3 7 2 4 7 6 8  [...]
+1 10 2 7 6 10 8 6 9 8 0 3 8 6 1 8 0 9 0 3 4 5 8 7 1 4 9 10 1 7 1 4 2 9 6 2 9 5 1 5 4 4 0 10 10 4 9 10 3 10 3 4 2 3 6 4 3 0 8 5 0 1 10 7 2 0 5 8 1 4 6 5 9 0 5 7 1 0 0 4 9 8 8 6 8 5 8 6 6 10 5 0 5 10 10 8 9 1 5 0 2 4 9 8 6 5 8 3 10 1 10 8 3 4 3 7 3 3 4 1 1 0 9 10 7 3 2 10 8 10 9 7 4 2 5 9 10 3 9 2 2 9 7 9 4 5 2 5 6 6 7 0 3 4 9 7 4 1 5 0 4 1 9 10 10 2 9 6 7 3 3 4 2 6 9 6 6 3 10 6 9 3 6 3 10 2 1 5 1 6 2 4 2 10 5 8 8 3 6 0 2 1 2 2 7 0 8 3 0 2 0 5 4 3 1 8 0 5 5 6 4 5 3 2 8 7 9 10 6 8 0 10 3 9  [...]
+10 6 6 3 4 1 5 4 0 4 6 9 6 0 2 7 3 2 9 1 4 10 6 6 2 1 9 6 9 5 1 2 9 4 3 0 9 1 2 0 1 7 7 7 0 10 7 7 10 9 9 1 9 7 0 3 0 3 5 8 5 3 2 4 0 4 2 9 6 3 0 10 1 3 0 5 6 2 0 7 10 1 4 6 8 10 3 3 9 4 1 0 8 1 3 10 7 0 5 6 9 9 4 4 7 8 2 9 8 6 4 10 8 5 0 8 6 8 6 7 0 5 5 9 6 1 8 6 0 3 5 10 2 4 3 0 3 9 1 3 9 2 6 2 4 2 0 7 6 4 0 7 9 3 7 10 4 2 4 1 4 9 8 9 3 2 3 8 2 6 0 5 10 8 6 9 3 9 5 5 1 4 4 4 4 5 9 5 2 7 9 0 8 7 3 5 8 2 6 8 10 0 2 2 2 3 5 5 4 0 9 4 6 3 10 0 3 0 4 4 2 9 9 6 9 5 4 5 1 9 0 8 6 1 10 0 5 1 7 [...]
+1 1 5 1 5 8 2 4 5 5 4 5 5 8 1 2 1 0 5 0 7 4 7 3 7 0 6 9 9 10 3 1 3 0 9 6 3 6 5 10 4 4 9 6 6 4 4 2 6 2 8 6 9 5 8 1 3 3 2 5 4 1 1 1 9 0 10 9 1 9 7 4 4 1 3 2 10 7 5 0 4 3 10 0 1 3 0 9 8 3 10 2 7 9 4 7 4 7 6 2 8 8 3 4 2 2 6 0 9 0 7 2 10 3 1 1 6 7 2 0 6 0 10 3 4 9 10 5 8 4 6 8 5 7 4 4 6 6 4 6 7 0 0 5 5 7 5 9 0 10 0 2 10 0 1 3 8 2 1 5 5 7 10 5 7 9 8 4 7 3 2 8 2 1 10 10 4 8 8 3 1 10 6 9 8 9 5 3 0 5 3 7 4 10 3 3 9 9 2 4 8 8 0 0 6 9 6 7 10 0 8 10 5 1 2 5 0 8 2 10 1 10 3 7 5 8 4 3 6 9 10 10 8 7 5  [...]
+0 5 10 6 4 7 2 4 9 10 4 3 9 5 5 1 2 2 9 4 10 6 3 10 10 5 1 8 1 9 4 9 6 3 3 10 2 8 8 1 5 9 2 2 7 6 9 3 7 1 5 2 9 0 7 9 3 4 2 10 10 6 10 0 0 8 6 4 0 1 9 6 8 7 7 4 8 4 1 5 1 4 6 0 2 8 5 0 9 7 10 7 9 9 8 0 2 7 4 10 0 6 10 10 6 9 3 9 5 9 2 3 0 2 5 0 4 0 2 4 1 2 3 4 9 1 0 6 9 5 2 10 3 2 1 9 6 4 1 6 2 7 4 2 10 9 10 1 5 7 8 4 9 10 4 8 0 2 3 7 5 1 8 8 8 7 6 9 7 10 2 1 1 4 10 10 1 1 4 4 9 8 8 9 1 8 5 6 4 8 10 10 3 0 3 9 3 6 10 9 8 0 3 7 2 0 9 7 5 5 8 1 6 4 8 5 4 2 8 7 1 0 8 0 10 1 4 3 3 1 3 9 10 4 [...]
+0 2 0 8 9 10 1 0 4 4 2 7 8 4 6 2 3 4 9 6 8 2 2 3 0 2 10 4 1 2 10 10 7 8 4 10 7 9 4 0 9 6 6 5 2 5 10 2 2 7 0 0 8 6 5 3 7 5 4 8 2 6 1 10 2 9 2 9 1 9 1 5 7 4 4 3 9 4 1 3 4 1 5 4 9 5 8 7 1 8 3 8 5 6 6 6 10 0 10 3 10 9 10 3 7 9 8 4 9 7 0 7 6 9 1 2 9 3 1 10 0 0 3 0 4 9 3 2 5 7 0 7 2 10 7 4 1 6 10 0 4 7 10 7 0 4 5 2 4 3 3 5 1 1 8 1 4 1 9 2 10 3 6 7 3 5 0 7 8 0 7 6 5 9 1 0 5 9 6 4 9 6 9 4 10 0 7 9 3 7 6 6 6 2 1 2 6 3 3 4 7 8 6 4 5 4 2 3 9 10 7 8 8 1 5 6 4 10 7 2 6 3 4 9 2 0 10 6 1 10 7 4 2 4 1 6 [...]
+5 2 3 10 3 2 5 6 2 8 0 10 3 4 0 10 7 1 1 7 2 8 7 6 4 2 2 0 10 3 10 9 5 6 0 4 6 10 9 8 5 4 5 2 7 2 2 2 10 6 10 7 1 3 9 9 10 1 7 3 9 2 3 0 0 5 2 1 8 9 8 1 1 4 8 2 5 8 5 5 3 6 4 8 6 4 1 8 5 8 4 4 4 9 9 7 10 3 0 0 2 6 6 0 9 8 4 3 7 2 4 10 8 5 9 10 10 0 6 9 1 7 5 2 9 5 7 4 7 7 8 7 5 6 1 3 0 7 9 9 10 8 9 9 6 5 3 9 0 7 7 3 3 8 8 9 2 8 8 10 9 3 7 8 2 2 6 2 2 3 10 2 10 10 3 0 3 0 7 5 6 10 9 2 8 9 10 3 0 2 6 5 1 3 10 4 8 9 10 1 10 0 3 4 5 9 4 0 0 5 7 3 10 1 9 4 2 1 1 6 3 8 6 8 2 7 10 0 6 3 8 2 0 8 [...]
+1 2 9 4 3 9 6 4 7 8 9 8 2 7 6 0 9 5 10 10 4 1 4 10 5 5 1 8 0 6 7 10 2 4 4 2 4 5 5 5 1 2 3 8 4 4 2 6 7 6 10 4 3 9 2 8 1 0 4 9 7 0 0 2 0 4 8 3 10 1 5 1 4 1 4 7 7 1 6 5 9 6 4 6 1 7 7 10 10 9 1 8 8 6 5 0 0 3 8 1 1 8 8 0 0 7 1 1 6 8 7 2 6 4 5 8 8 10 9 8 5 6 6 0 3 10 4 9 7 9 7 1 0 8 6 7 10 8 1 6 1 4 10 9 6 8 1 3 2 3 1 3 10 10 4 6 9 4 5 5 4 10 10 7 0 2 5 8 4 8 1 6 3 7 0 10 2 10 1 5 9 1 1 10 1 4 3 9 5 5 4 6 1 6 0 3 8 9 1 3 6 4 6 5 3 2 5 0 1 4 7 4 7 1 7 5 9 10 0 3 8 4 7 1 6 9 5 2 7 5 0 3 10 6 5 2 [...]
+7 9 4 1 3 4 6 6 0 3 6 2 4 8 9 8 6 10 2 4 7 2 4 10 3 0 8 3 6 6 6 4 1 3 3 0 7 10 9 8 7 6 7 9 8 8 7 1 5 0 2 3 0 5 9 6 7 2 9 3 7 10 7 7 0 5 9 7 7 1 8 4 3 4 8 6 8 7 5 8 1 0 1 8 2 9 0 10 10 7 0 3 1 10 10 6 0 0 0 8 0 1 8 1 6 6 10 2 6 0 3 10 1 3 8 6 10 3 3 0 1 1 4 8 10 10 7 10 4 7 6 9 1 7 7 7 10 10 5 4 5 4 6 7 10 8 3 0 2 10 0 10 0 0 0 9 0 5 4 0 9 3 10 10 10 0 5 2 7 5 1 9 2 10 9 10 5 6 8 5 8 2 3 8 8 6 10 9 4 3 5 8 7 5 9 7 8 6 7 5 2 4 2 5 4 9 8 10 1 9 5 7 6 6 9 3 9 3 8 4 6 7 10 7 2 3 3 6 3 9 4 9 3 [...]
+2 8 10 1 3 2 6 10 8 7 10 10 7 8 10 6 1 8 3 6 4 3 0 9 9 6 4 4 0 9 2 4 7 6 0 4 3 1 0 8 9 5 0 5 6 3 4 0 3 8 4 6 3 5 0 2 1 2 5 9 7 1 3 8 0 6 3 8 5 4 7 8 10 5 6 7 8 10 2 2 8 5 2 0 1 1 10 7 5 7 1 1 2 5 7 10 5 8 5 1 8 7 2 2 3 5 10 8 0 5 6 0 9 8 7 0 3 1 8 1 0 2 10 9 6 4 4 9 9 1 0 2 7 5 1 2 8 4 1 1 5 10 3 0 10 7 7 4 3 3 10 2 2 7 3 0 4 1 4 0 9 7 4 6 3 0 1 1 2 10 1 6 1 1 1 2 7 3 2 0 1 7 1 10 1 3 4 2 4 8 7 9 10 1 6 3 4 9 5 4 1 6 4 8 1 5 2 10 3 2 2 6 7 2 7 2 6 7 0 9 8 1 7 6 3 3 6 10 2 1 1 4 2 3 5 7 5 [...]
+0 6 6 9 0 4 8 2 10 9 4 0 6 5 4 7 1 6 0 0 1 4 2 10 0 7 10 0 4 10 1 1 9 1 9 0 4 5 1 6 4 9 3 2 8 9 2 2 6 4 1 4 5 1 8 1 2 10 7 4 2 5 0 7 1 6 4 3 3 3 9 8 3 7 0 3 8 7 10 7 3 4 9 8 3 3 0 5 3 1 9 6 0 9 10 3 5 4 0 6 2 10 9 9 7 7 4 1 1 5 7 7 6 10 5 4 7 1 9 0 7 8 0 8 9 3 2 4 1 10 8 4 1 7 6 2 7 3 2 4 8 8 10 2 6 2 7 10 10 7 5 9 6 1 9 1 4 8 9 7 5 3 4 1 7 0 9 2 4 2 0 2 1 10 5 10 0 5 8 9 1 7 0 3 0 2 2 3 4 8 1 9 0 8 1 4 4 10 5 2 9 6 5 8 5 5 9 4 5 0 1 5 2 3 1 8 9 5 10 2 2 6 2 3 2 5 4 4 3 5 8 7 10 6 3 0 2  [...]
+7 2 8 7 2 5 5 3 4 3 10 1 1 6 4 7 5 7 8 0 10 6 7 3 0 5 1 3 4 7 0 9 6 6 6 6 9 10 6 3 6 2 6 1 8 7 1 6 3 9 4 1 5 6 8 0 1 5 1 6 2 10 5 8 9 9 3 2 3 9 7 6 4 10 4 1 1 6 10 1 9 8 1 10 4 6 3 4 8 5 6 8 1 7 9 6 10 8 7 6 8 0 3 6 6 6 5 5 0 3 3 9 4 5 6 4 10 3 10 2 1 5 8 8 0 10 10 0 7 10 5 4 8 7 1 3 8 0 2 7 2 3 10 0 5 4 1 7 8 6 0 2 4 2 0 3 8 10 5 0 10 4 5 9 1 4 2 2 3 3 6 7 10 3 7 10 7 6 6 1 10 6 10 7 6 8 6 5 6 9 5 7 7 8 10 4 2 4 1 3 6 10 5 8 2 10 7 9 5 9 7 1 6 7 8 7 5 8 1 0 7 9 3 3 5 4 8 6 0 10 9 5 10 0 [...]
+2 1 10 7 5 8 1 9 6 4 6 2 2 2 7 1 0 1 9 1 9 7 5 3 0 9 5 6 6 0 5 3 3 3 0 8 3 0 6 2 6 1 5 4 8 5 2 8 9 8 7 3 6 9 6 9 1 4 8 9 7 9 6 10 10 7 9 0 5 9 4 9 0 9 9 4 8 7 6 2 6 10 4 3 10 0 3 5 6 3 10 2 0 8 10 7 4 7 10 6 10 10 0 8 3 6 1 7 1 3 1 1 10 4 10 2 4 7 0 0 0 5 4 2 3 1 6 1 10 5 1 10 4 0 7 8 7 8 5 10 6 3 4 1 1 7 5 5 7 5 6 7 5 5 1 1 4 7 2 8 4 4 7 4 8 5 7 2 9 7 9 7 8 5 9 8 0 1 9 10 2 8 0 1 9 1 4 10 0 7 8 2 5 6 8 4 0 3 2 3 9 9 1 4 0 3 8 9 4 3 8 4 9 6 4 9 4 4 4 10 10 5 3 6 4 8 1 10 4 10 0 8 8 10 9  [...]
+1 2 5 3 5 3 10 4 1 10 7 2 1 1 4 2 6 1 0 0 5 1 5 3 6 10 3 4 2 4 3 2 9 6 2 10 2 0 8 8 6 8 4 7 4 2 8 3 5 3 10 9 4 2 8 1 7 8 10 7 1 0 3 6 6 0 0 8 10 3 7 3 4 3 5 10 3 10 5 8 1 0 3 7 0 10 1 1 10 1 4 8 6 3 2 0 9 5 4 3 2 9 9 4 2 8 7 7 5 1 3 10 10 8 1 2 6 8 7 4 8 6 4 2 9 9 5 8 6 1 4 3 8 2 2 7 9 3 8 8 6 9 1 9 4 5 5 2 8 5 8 8 10 1 3 3 9 8 10 6 10 1 7 5 10 1 1 4 0 3 5 7 0 10 10 9 6 6 8 5 7 5 4 2 4 0 7 3 5 3 3 3 2 3 9 4 0 1 3 4 4 4 9 2 6 6 0 10 6 1 6 2 8 5 8 10 5 3 3 6 0 2 6 10 4 6 10 5 7 9 8 1 9 2 0 [...]
+7 3 1 8 1 2 0 0 6 7 7 8 6 8 0 3 10 8 0 8 10 1 5 1 7 8 2 0 8 7 1 4 9 7 8 5 5 8 6 9 4 2 8 5 10 7 0 6 7 9 6 5 8 1 3 7 5 7 10 7 7 6 6 0 1 4 5 0 5 1 10 1 9 2 0 4 0 8 9 7 4 7 4 10 8 6 3 6 4 7 4 1 5 9 8 8 0 5 10 3 5 10 8 9 5 9 9 8 2 9 7 3 7 2 0 1 6 3 0 3 4 4 3 2 6 4 4 0 8 6 5 1 2 5 1 1 8 9 1 9 1 9 5 10 0 4 6 2 9 6 6 8 3 3 9 9 7 2 6 7 2 8 8 3 8 9 3 0 3 0 9 7 4 10 7 0 0 8 6 7 3 3 3 3 3 0 8 3 0 5 9 8 9 5 8 4 6 5 4 4 1 9 8 1 7 7 1 8 10 10 9 7 10 4 3 6 1 6 3 1 6 7 3 2 3 0 4 7 9 9 4 1 3 5 10 1 9 5 10 [...]
+6 4 9 2 8 6 1 6 5 7 3 6 5 7 3 4 1 4 2 6 7 1 3 3 3 7 3 2 8 10 9 0 3 0 3 9 2 6 10 10 1 3 9 7 3 10 9 9 0 7 3 2 10 1 2 4 1 10 7 1 0 8 8 2 2 9 9 7 1 4 10 1 6 9 7 1 3 0 1 4 8 10 6 1 1 7 3 8 0 4 4 3 1 8 1 1 2 2 7 8 1 5 10 2 8 10 0 7 4 1 4 6 5 10 8 3 9 10 10 0 5 5 4 8 8 5 4 6 2 5 2 8 5 6 0 9 6 8 9 6 0 2 6 3 8 7 10 3 9 4 8 10 4 6 3 1 1 10 4 1 2 5 3 10 10 4 1 0 10 3 3 4 9 6 8 5 3 5 7 9 6 8 10 10 10 5 6 0 8 4 0 5 0 1 8 4 0 4 6 4 0 0 10 9 9 9 4 2 4 2 5 2 2 6 0 8 9 9 1 10 3 5 9 1 5 10 10 9 9 1 10 6 8 [...]
+0 4 3 2 10 2 3 8 9 5 2 7 1 3 10 1 1 1 7 3 10 10 10 9 4 1 4 4 10 5 5 5 7 10 2 2 7 0 5 9 6 3 8 2 4 9 9 4 6 5 7 1 9 3 3 0 4 5 2 6 1 2 6 3 2 4 3 7 4 6 2 4 0 7 8 9 10 9 6 1 5 9 1 8 2 2 2 7 9 10 9 4 1 6 10 4 10 8 7 6 10 9 4 2 9 5 4 0 9 7 3 10 3 3 9 2 1 9 8 0 3 4 6 4 7 10 0 9 5 9 3 6 4 9 8 4 2 1 8 10 9 4 5 8 3 2 3 4 7 3 7 2 7 1 10 7 2 0 5 6 0 4 10 10 9 7 7 0 7 9 9 6 6 9 2 2 1 1 10 2 8 0 0 4 4 2 1 4 1 1 5 4 9 4 8 10 1 9 3 5 6 7 5 4 8 2 10 4 7 5 8 8 8 2 3 0 0 10 3 7 8 0 4 1 8 9 0 8 4 10 7 4 9 6 7 [...]
+4 2 1 5 1 5 7 1 1 6 3 9 9 5 8 2 6 5 2 2 8 2 3 5 9 3 5 1 4 0 7 0 7 5 5 8 3 6 7 1 10 9 0 1 6 1 10 0 3 3 6 5 9 0 3 1 6 1 0 8 10 10 6 0 2 0 10 2 9 4 2 1 9 9 10 7 2 10 4 0 0 9 5 8 1 7 6 8 9 3 10 7 0 6 10 9 3 10 5 1 9 10 4 1 9 10 8 3 1 5 0 7 4 8 5 4 4 9 3 1 10 10 6 7 10 2 6 2 5 6 3 0 7 3 7 7 6 0 5 7 9 1 4 10 8 8 2 2 5 3 9 3 7 3 4 0 0 9 5 9 7 4 5 4 9 5 7 6 6 7 7 3 0 8 1 4 3 3 1 4 2 9 9 2 1 1 6 10 2 2 9 6 7 0 10 0 6 1 6 4 0 4 0 8 0 6 2 0 2 3 2 10 6 6 7 10 8 10 9 4 2 3 9 4 7 5 8 1 1 6 4 1 1 2 2 1 [...]
+1 8 1 9 1 10 3 10 0 5 0 8 9 0 10 7 9 0 9 7 9 5 3 4 1 8 10 1 4 1 7 10 2 7 10 7 6 2 9 2 8 2 3 8 3 0 6 10 3 3 1 7 9 1 5 4 0 0 5 8 0 0 5 0 6 7 2 9 3 1 0 10 10 7 6 7 4 6 0 8 3 7 3 5 5 9 9 1 7 2 6 0 6 2 9 1 9 8 7 2 10 9 10 7 8 9 9 5 10 1 3 6 9 0 9 3 2 0 9 4 0 5 3 1 7 4 5 3 8 9 9 2 8 7 2 1 6 8 0 9 10 10 8 10 6 4 0 5 4 1 1 5 6 2 6 5 5 4 4 7 1 5 6 6 4 10 1 6 3 3 10 6 7 6 7 5 3 8 8 9 0 8 5 1 9 1 0 3 3 5 0 0 0 9 3 8 9 7 5 3 6 4 6 0 9 6 4 7 3 8 1 5 5 7 2 3 6 8 10 2 0 10 3 2 10 10 8 9 3 8 9 1 3 9 7 2 [...]
+3 3 1 7 0 10 10 4 0 2 4 10 3 2 3 2 9 10 5 3 4 9 2 0 4 4 10 5 6 4 0 7 4 3 2 6 3 7 5 1 6 5 8 0 5 2 6 9 3 3 6 1 2 5 3 0 7 5 4 3 7 0 2 7 5 0 9 4 3 5 8 2 5 4 3 10 10 5 0 1 0 4 8 10 8 2 2 3 5 10 3 7 8 6 0 0 10 8 8 7 2 8 9 0 8 0 6 7 9 7 9 4 4 2 4 2 10 8 10 0 9 8 1 6 9 6 3 10 1 4 3 9 6 1 1 3 9 6 3 1 3 6 0 4 6 0 6 4 6 1 1 3 2 5 4 10 0 8 5 10 1 10 1 6 10 3 6 0 7 5 6 1 1 5 10 7 4 5 5 3 2 2 9 8 10 10 7 1 7 10 5 0 7 2 0 4 1 8 9 4 0 2 1 1 4 10 6 1 8 4 3 8 7 6 4 9 1 7 4 4 8 6 2 3 6 0 10 2 5 2 6 2 9 4 1 [...]
+10 10 9 0 5 2 3 5 0 8 7 5 1 6 0 10 6 7 10 9 10 0 1 1 7 2 8 9 2 6 2 10 1 2 5 1 10 5 10 0 4 0 1 5 6 6 4 3 0 10 3 6 5 9 5 8 7 9 10 1 6 3 3 8 5 2 1 8 6 4 0 10 2 8 2 5 2 7 2 8 5 9 5 10 1 3 4 4 1 6 5 2 6 1 10 9 2 10 0 0 4 2 3 9 4 8 9 0 4 2 8 3 3 8 5 2 9 0 8 4 5 9 5 9 0 8 2 10 1 0 3 7 10 5 10 10 8 4 9 7 1 1 4 10 5 6 1 0 8 7 8 2 3 0 2 6 2 1 5 8 8 3 2 7 7 0 10 7 8 9 10 2 2 7 9 8 6 8 6 0 9 9 5 5 1 0 6 3 8 5 7 4 4 3 0 7 10 7 7 2 8 1 3 7 0 10 6 10 7 5 0 1 7 2 7 3 10 5 1 1 4 5 9 5 3 8 9 0 10 6 7 2 7  [...]
+0 3 1 2 10 3 8 7 0 7 7 7 7 6 4 8 4 9 2 0 0 6 5 9 0 2 2 5 3 7 5 10 5 2 3 3 4 2 7 7 4 10 5 7 2 3 7 7 7 5 10 4 4 2 10 6 2 8 5 10 8 8 5 9 10 5 3 3 4 6 8 8 1 4 6 7 8 8 2 8 5 4 3 5 9 6 1 0 1 1 6 7 1 6 7 8 6 8 2 1 2 8 9 9 6 9 10 7 3 9 2 5 2 5 5 0 1 7 4 0 8 2 2 5 6 7 6 1 9 0 5 4 5 8 10 2 7 7 3 0 3 0 7 4 4 10 5 6 6 7 0 6 10 6 8 2 4 9 10 9 9 0 3 8 8 9 8 8 4 6 1 1 10 5 3 3 0 4 7 7 7 6 6 7 8 2 4 5 9 3 8 0 7 8 6 4 0 1 7 2 4 7 7 10 7 2 7 4 8 9 4 0 7 5 6 7 0 4 6 6 8 2 3 7 0 5 1 5 1 6 1 10 2 6 10 8 6 3  [...]
+4 7 7 3 7 6 2 2 4 7 2 10 1 3 1 9 8 4 8 5 5 2 6 9 4 10 4 3 2 3 2 10 0 6 5 5 2 8 9 10 10 8 8 3 10 9 8 0 10 1 5 5 2 7 7 8 10 0 2 5 10 6 8 3 8 4 2 6 5 6 5 0 4 0 10 0 1 4 2 4 5 9 2 9 0 4 7 0 10 3 4 2 7 6 9 8 5 3 0 8 6 0 0 4 4 3 8 2 0 2 7 8 9 8 9 10 10 8 10 3 8 9 4 9 4 2 1 5 7 4 2 0 0 5 5 5 5 2 0 6 1 6 1 5 10 9 4 8 5 4 6 3 6 3 3 0 10 2 10 9 7 8 5 1 5 9 9 0 3 2 10 4 3 10 1 1 10 7 2 4 3 2 3 4 4 4 10 5 6 6 0 6 0 2 6 6 7 5 8 9 9 5 6 1 1 10 10 10 5 0 5 10 1 6 8 7 5 9 0 0 4 3 10 5 2 6 2 2 9 1 0 4 2  [...]
+5 8 0 1 9 7 3 7 10 3 9 10 7 1 9 8 7 10 3 2 10 5 5 1 7 1 6 9 4 4 3 2 3 0 2 10 4 1 6 5 8 10 6 9 5 3 8 5 7 10 0 2 4 8 5 4 2 5 8 4 5 6 4 1 10 10 7 0 1 10 3 0 8 0 1 5 10 0 4 2 0 0 9 3 7 0 4 9 1 7 2 4 9 1 4 2 0 3 2 6 0 8 1 4 3 0 1 7 7 1 9 9 8 7 4 8 1 4 7 0 4 9 9 6 2 0 8 9 7 4 6 7 4 10 9 2 7 1 0 5 6 3 8 10 5 8 8 4 2 5 1 1 10 9 4 4 7 6 7 9 0 4 3 4 6 5 3 0 0 9 0 9 10 0 9 9 10 7 7 2 6 9 9 4 8 0 7 9 7 2 7 1 2 5 5 6 3 2 4 8 0 5 4 6 4 5 2 6 6 2 3 3 1 1 9 7 1 4 6 7 5 4 1 5 3 7 8 3 5 4 8 8 4 6 1 3 4 1  [...]
+9 5 3 10 10 6 2 2 5 7 4 9 10 7 8 0 6 7 5 8 9 7 2 1 2 8 7 7 10 10 0 8 3 2 5 5 7 8 8 3 1 1 9 5 10 3 6 5 5 7 0 8 2 1 8 2 1 10 4 2 10 0 10 9 2 6 0 7 10 0 4 4 10 9 8 4 6 3 8 4 8 0 7 3 9 5 9 7 6 5 0 9 4 5 9 2 10 9 9 0 9 0 6 4 3 10 4 9 3 0 9 7 9 8 1 1 8 3 0 8 8 2 4 7 9 9 7 9 4 9 9 0 4 8 5 8 0 0 2 10 6 4 3 1 0 4 5 2 7 7 10 0 2 2 0 7 2 1 9 6 1 10 1 1 7 9 0 1 8 5 8 2 3 2 6 4 10 0 4 6 10 4 7 0 1 1 1 5 10 0 0 6 0 7 7 8 5 8 0 7 7 2 10 0 5 1 9 2 4 7 7 2 3 0 0 2 7 2 3 0 4 8 1 0 8 4 1 0 10 4 4 6 8 4 8 2 [...]
+8 9 3 4 6 6 9 1 9 9 8 5 10 9 8 8 2 5 4 10 6 1 4 8 10 3 5 3 8 1 4 1 2 10 6 5 3 0 7 4 4 1 7 3 2 3 3 9 10 10 1 10 6 4 7 5 6 4 0 3 5 10 2 10 4 2 1 5 8 6 1 7 4 10 2 9 7 3 0 1 9 4 8 1 6 4 5 6 7 5 3 1 0 6 5 4 8 9 9 2 7 1 8 7 4 0 10 4 7 9 2 4 10 2 5 2 0 2 1 4 8 6 2 0 4 7 9 0 4 7 4 3 6 6 6 5 1 3 2 3 9 7 3 5 7 1 2 8 1 8 1 10 2 3 5 7 4 9 3 4 8 1 10 1 0 6 4 8 1 4 5 3 10 0 8 5 7 10 3 0 2 4 3 2 5 3 1 3 7 9 9 3 0 6 6 1 7 2 8 4 0 7 5 1 3 7 9 8 3 0 9 5 3 6 8 10 10 7 0 9 6 6 3 7 1 0 9 4 0 10 9 9 9 3 3 8 3 [...]
+6 4 7 8 6 4 9 9 6 4 3 6 3 7 9 10 2 6 4 2 9 5 8 1 1 6 4 2 1 8 6 5 8 10 8 2 1 5 4 10 2 10 7 3 6 10 1 4 10 4 0 1 1 7 9 6 5 1 7 0 8 6 2 10 0 4 10 8 2 6 10 8 8 9 5 1 2 7 10 2 6 7 8 5 7 3 0 4 7 5 1 10 3 4 2 1 0 0 0 1 3 2 7 6 6 5 3 8 1 4 3 2 1 4 3 5 1 10 6 3 4 5 1 3 7 10 3 1 10 4 1 10 4 0 7 0 6 0 1 4 2 5 8 0 1 2 7 5 1 9 0 3 8 0 0 0 7 9 1 0 3 1 4 2 8 6 6 3 2 3 7 1 4 5 7 4 6 2 10 9 3 4 5 7 5 0 10 4 4 5 1 5 4 2 4 1 6 6 1 5 1 10 7 7 6 7 7 9 8 8 3 4 1 10 4 9 6 1 1 4 1 7 5 1 2 2 10 5 5 9 5 7 2 6 8 9  [...]
+0 2 1 9 1 2 3 7 6 3 10 2 10 0 0 2 8 9 10 6 5 0 6 5 2 0 6 8 2 2 8 8 7 0 8 10 10 3 6 7 7 10 3 8 0 5 3 8 4 0 2 7 5 10 2 3 0 4 9 6 1 5 10 9 1 3 9 1 4 6 10 1 0 10 7 4 8 10 1 10 0 8 3 1 8 10 1 5 3 1 8 2 7 8 0 4 0 7 10 5 5 9 7 0 2 6 4 4 8 4 1 10 0 3 5 7 5 2 7 5 1 6 4 1 9 8 10 8 10 10 6 2 9 1 3 0 3 4 3 3 1 10 10 5 6 8 10 4 6 10 8 1 3 9 8 0 1 7 7 6 4 9 9 9 9 7 3 0 5 2 2 0 2 7 6 10 0 0 7 10 5 9 5 9 8 3 10 10 8 2 9 10 1 5 5 0 8 2 6 9 2 5 5 7 10 9 0 5 4 4 2 0 3 7 0 2 3 8 0 9 5 6 7 8 3 3 4 3 7 1 0 5  [...]
+8 5 2 3 7 0 9 3 1 2 8 4 2 4 3 9 3 2 8 9 0 0 9 3 4 4 9 4 8 2 2 5 0 0 1 10 9 9 6 5 0 4 5 0 1 1 2 10 8 8 6 8 7 1 2 2 4 1 0 5 2 6 7 8 6 7 10 9 1 10 1 0 5 2 6 9 6 7 1 7 2 2 7 2 9 2 4 1 4 1 10 9 10 0 2 1 6 4 3 7 9 9 6 8 8 2 8 5 10 4 6 5 7 8 7 7 2 4 7 2 0 5 0 5 0 4 4 6 9 2 8 1 7 8 8 1 5 3 7 4 7 4 2 3 6 10 2 5 10 8 2 6 7 9 9 2 4 7 6 8 0 2 0 7 0 0 2 9 2 5 6 4 9 1 1 2 10 7 7 9 0 2 0 1 2 4 5 7 2 0 10 6 8 0 2 5 7 3 8 5 1 3 9 5 10 0 8 5 0 6 10 9 5 1 4 2 1 1 4 8 5 5 10 7 8 7 2 1 6 1 9 2 9 2 3 5 1 5 1  [...]
+9 8 0 3 8 6 8 3 3 4 0 4 0 10 3 2 3 5 10 10 1 9 2 7 5 7 0 0 3 2 5 5 9 1 10 3 9 8 10 2 9 4 8 8 1 4 1 0 10 3 7 5 3 10 4 7 4 1 4 5 0 6 9 0 7 0 6 3 3 8 7 8 6 8 0 1 3 2 0 1 0 9 0 3 4 2 1 6 4 8 4 9 1 4 3 1 10 1 10 6 10 4 3 5 9 4 10 9 10 3 6 5 4 7 7 4 5 3 10 9 10 3 1 9 3 2 5 0 5 7 6 6 10 1 4 5 1 0 5 10 8 1 8 9 9 8 1 0 8 10 10 2 8 7 3 2 5 7 5 3 3 8 6 7 4 6 10 8 8 3 6 9 10 5 0 1 3 9 3 5 7 7 0 5 8 7 8 4 0 0 8 8 3 0 9 0 3 8 9 2 5 7 8 2 5 10 10 10 0 6 10 3 0 0 4 6 5 7 10 0 8 1 1 2 9 2 1 9 7 0 10 10 0 [...]
+0 4 3 10 0 7 3 5 10 3 7 8 3 10 5 8 0 2 1 0 6 10 5 6 2 7 6 1 9 6 3 2 2 6 0 4 9 8 8 4 3 7 6 0 5 6 6 0 4 6 10 3 1 3 9 0 6 1 2 9 8 4 0 0 9 8 3 9 5 5 10 10 3 8 3 0 6 0 8 5 8 10 1 8 9 1 4 4 3 0 3 9 6 4 10 9 1 0 10 10 2 7 6 4 8 6 0 9 1 8 8 6 1 7 9 6 9 0 0 0 2 3 6 5 2 5 1 10 1 5 4 6 0 4 4 3 4 0 0 4 10 4 6 4 1 10 8 9 1 4 3 5 5 2 4 4 0 10 8 5 5 3 8 0 9 10 9 1 1 5 1 7 8 2 7 3 8 6 10 3 7 6 1 9 10 2 1 0 4 0 6 7 9 7 7 0 0 2 3 9 3 5 4 10 9 0 0 10 4 1 2 5 0 5 6 1 8 8 1 8 10 3 1 9 0 8 0 10 6 0 6 6 3 1 10 [...]
+3 4 8 6 6 6 10 1 2 0 9 0 9 8 0 6 9 3 1 7 5 9 0 9 5 10 5 7 4 0 9 7 0 3 4 5 10 6 6 6 9 5 3 3 3 7 5 10 4 8 6 8 0 6 3 7 5 2 7 10 9 1 9 8 3 0 6 5 3 0 9 9 4 5 4 3 5 6 0 6 0 6 4 9 2 8 0 2 2 5 0 2 10 8 8 4 2 3 8 1 6 2 4 1 5 7 9 7 5 6 5 7 10 3 2 10 2 4 0 8 2 1 7 1 1 9 1 3 2 1 9 0 3 5 8 6 4 6 0 0 9 9 7 10 10 1 1 10 7 9 7 8 8 9 8 1 2 6 6 7 6 5 7 7 4 3 4 3 7 5 0 7 0 4 4 2 4 0 0 0 8 5 4 9 3 3 7 6 2 4 6 4 10 4 10 10 5 6 9 0 8 2 6 8 7 7 8 4 9 7 8 9 8 2 3 5 8 3 1 10 5 0 10 6 0 1 6 9 6 4 8 3 6 0 1 0 3 3  [...]
+7 6 4 6 4 5 2 10 7 8 0 5 9 4 7 9 0 6 5 6 1 3 6 9 0 3 10 8 1 5 1 6 5 5 4 2 3 9 6 9 2 10 8 9 7 4 2 3 8 5 3 7 7 3 3 10 3 10 10 8 6 2 9 1 9 7 2 5 3 5 10 4 5 8 10 1 6 1 2 9 1 10 5 0 0 5 2 9 7 1 6 6 8 6 10 9 0 8 2 8 6 0 10 6 6 1 2 9 1 0 9 1 1 0 8 3 5 0 6 0 9 7 2 2 4 2 9 8 5 1 2 2 8 1 7 4 6 0 6 7 10 4 10 7 9 2 8 8 0 8 2 0 10 5 6 0 9 1 3 9 4 3 7 2 9 2 7 3 8 5 7 6 5 10 1 9 6 4 10 5 9 2 3 7 9 9 1 3 3 6 6 4 8 7 5 1 0 2 0 1 3 4 5 1 6 0 9 9 0 5 6 3 6 9 10 7 8 0 9 2 9 9 4 10 7 5 10 1 0 0 8 10 0 1 9 9  [...]
+4 5 0 6 9 4 6 0 10 8 0 5 5 2 9 5 7 2 5 5 10 0 0 7 4 9 0 9 6 9 3 8 7 2 6 0 7 7 4 10 3 3 7 10 5 5 9 0 3 8 9 4 2 3 0 9 0 6 8 1 1 9 0 8 2 7 4 3 9 9 4 8 0 1 4 3 4 3 6 0 6 9 10 7 7 2 1 1 0 9 7 5 8 10 3 0 7 5 6 6 1 2 1 9 0 6 2 9 2 1 2 2 7 2 8 2 2 2 0 9 9 0 8 1 6 7 10 1 6 8 8 5 8 10 8 0 4 4 9 10 6 0 10 0 7 6 1 10 10 8 8 2 5 5 7 1 5 10 9 0 0 6 7 7 8 1 5 8 1 5 6 9 2 8 2 9 7 2 3 6 6 10 6 0 4 10 1 8 4 0 7 6 4 9 2 4 2 9 2 10 1 9 4 2 3 9 5 9 7 8 2 8 8 9 6 1 8 2 7 1 7 7 10 4 1 9 8 3 2 5 3 0 6 9 0 6 3 1 [...]
+7 2 6 10 9 3 10 8 7 0 2 7 7 3 0 3 7 10 7 1 1 4 6 2 6 0 3 2 7 10 7 6 7 3 5 9 6 3 10 9 8 8 9 3 4 3 10 6 0 10 6 7 8 7 7 10 6 10 5 4 3 7 7 3 2 4 1 7 8 9 0 5 10 9 6 3 7 1 2 10 6 2 6 2 8 10 7 3 5 5 5 7 4 8 9 4 5 9 3 3 9 10 9 8 10 3 3 10 0 6 3 1 6 6 1 9 7 0 5 6 2 4 0 2 4 9 1 9 2 3 2 4 9 3 1 8 6 9 10 10 1 3 2 7 9 8 1 8 9 5 1 4 8 8 6 8 4 10 0 4 7 5 7 2 0 9 7 10 6 4 3 9 0 6 9 5 0 8 3 9 6 0 5 0 4 5 6 8 9 5 1 2 3 5 3 5 8 2 4 5 2 0 5 9 2 3 5 9 1 8 2 8 0 0 4 0 9 2 4 3 7 2 0 7 9 3 8 5 2 6 8 6 3 1 6 0 1 [...]
+0 10 7 10 6 2 3 9 10 4 5 6 4 5 9 9 9 2 3 9 2 10 3 3 4 4 7 10 7 9 10 9 6 5 2 1 3 7 5 3 4 4 5 8 1 4 10 10 3 3 2 1 4 8 10 10 5 9 5 5 0 4 0 1 8 5 4 3 1 6 4 7 10 1 4 3 3 3 10 9 10 0 3 5 9 1 3 9 7 0 0 3 0 10 9 4 5 9 5 2 4 0 7 5 10 8 10 2 10 1 8 2 0 10 10 9 3 3 7 4 0 6 6 1 6 10 8 8 5 10 8 6 5 3 0 10 5 8 3 2 7 5 4 9 6 5 5 5 10 9 8 7 6 5 8 9 6 7 9 0 10 10 7 7 4 5 6 9 0 3 4 10 3 5 8 10 9 7 10 5 9 8 7 2 3 4 2 6 5 9 6 5 8 0 10 0 7 9 0 5 0 7 4 9 3 10 4 7 0 3 10 8 9 10 9 2 8 1 0 7 0 4 2 5 2 5 1 5 4 6  [...]
+3 9 1 1 7 10 0 3 1 0 8 9 1 8 3 6 7 0 8 0 5 3 8 2 5 9 4 2 8 2 9 6 7 2 0 8 6 2 2 1 9 2 0 8 7 7 2 5 1 7 4 10 7 1 3 2 8 2 5 9 10 0 1 4 6 0 9 1 0 4 10 9 0 7 7 1 10 7 0 2 8 9 0 4 2 0 2 7 2 10 4 6 0 10 4 8 6 9 2 3 2 9 1 3 4 7 8 0 0 4 1 6 6 4 8 10 0 1 4 6 10 2 6 10 1 0 3 3 9 1 4 8 2 9 9 1 10 10 5 1 10 8 7 9 7 9 3 1 5 7 4 5 2 8 5 9 7 4 0 3 10 1 7 0 5 2 3 5 7 1 2 1 0 5 0 3 1 4 7 4 8 5 0 4 10 9 10 8 1 1 3 8 7 6 4 10 1 6 2 8 0 10 9 10 8 9 1 0 5 8 1 10 10 6 0 4 8 4 5 3 4 0 8 7 8 3 6 7 4 5 1 8 4 6 5 2 [...]
+5 0 5 5 5 9 2 1 7 0 4 5 1 7 5 10 10 7 7 0 7 3 9 2 5 6 5 3 6 6 1 9 8 0 7 4 9 7 6 10 0 8 3 8 8 1 6 4 10 10 1 2 0 4 8 6 6 2 3 10 3 10 8 2 4 1 10 2 5 6 10 1 4 8 3 8 10 10 6 6 0 6 10 9 4 3 2 8 5 0 5 3 2 9 3 1 1 7 1 2 1 4 0 9 6 6 4 7 10 10 6 7 10 6 0 0 9 5 2 2 5 4 2 3 1 4 6 3 6 10 0 9 5 1 9 5 4 8 1 5 3 10 8 0 2 8 3 3 7 8 8 8 6 1 4 7 2 3 7 3 0 6 9 5 2 4 1 5 4 0 1 5 4 0 10 10 8 10 10 6 0 8 9 6 8 5 6 5 5 5 3 3 7 2 0 10 0 10 0 7 9 1 5 1 3 2 0 8 8 3 4 10 1 2 2 9 2 6 7 7 10 6 7 0 8 7 3 3 9 9 6 4 2 1 [...]
+8 3 7 7 1 7 6 7 6 3 10 7 10 10 9 7 0 6 1 3 4 3 5 0 3 8 4 6 6 8 7 3 10 3 5 10 9 4 6 9 1 5 5 0 5 1 3 7 0 6 4 1 8 2 1 3 0 3 9 3 3 3 7 9 6 7 3 7 10 1 0 7 0 6 10 7 4 4 7 10 8 9 3 7 3 6 5 2 6 9 7 9 7 7 5 5 6 2 6 5 0 10 2 7 2 2 2 10 2 7 6 10 9 2 7 10 3 2 7 9 10 10 6 4 1 0 6 4 6 3 2 7 3 10 9 5 5 4 1 5 5 10 10 2 1 7 1 2 6 10 6 1 1 7 9 10 4 9 3 8 7 3 0 9 8 9 10 2 1 4 10 1 2 4 7 4 10 10 3 5 3 0 5 6 10 10 10 4 0 8 4 7 3 0 8 1 4 4 0 1 6 10 3 8 5 0 7 1 10 9 9 2 8 2 5 10 8 0 0 4 7 4 2 2 4 5 9 8 3 5 7 0 [...]
+9 4 5 6 0 1 7 2 10 6 3 8 10 4 1 5 2 3 2 7 2 7 10 0 6 5 3 7 5 2 5 10 6 7 2 0 1 7 2 9 0 6 8 7 3 4 9 7 6 2 7 8 7 0 7 6 8 6 5 7 9 3 7 6 5 3 5 7 3 9 10 0 1 4 1 8 1 1 9 2 10 8 8 2 3 4 6 6 7 10 9 3 4 8 2 3 3 4 10 10 1 9 3 2 3 3 1 4 2 7 7 4 5 0 6 3 9 3 5 8 9 0 0 3 10 3 9 9 5 8 6 9 3 10 1 5 0 6 7 8 8 6 5 0 6 9 10 10 8 5 7 9 7 2 4 4 7 5 3 2 10 7 7 0 2 0 6 8 7 2 10 10 1 8 1 5 2 5 3 4 7 4 3 7 6 0 3 4 6 0 2 6 8 8 3 6 10 10 3 5 2 3 1 1 4 7 1 2 9 4 1 6 4 10 5 7 7 7 1 3 6 6 2 10 5 9 9 10 10 8 3 1 9 7 5  [...]
+9 7 3 6 10 10 3 0 3 4 0 6 3 5 10 3 1 0 4 0 9 6 4 0 5 9 10 7 5 1 1 2 3 2 7 4 5 0 1 0 7 7 9 2 0 10 10 0 8 0 10 4 3 6 0 8 5 2 4 8 8 5 1 7 1 0 8 9 2 4 5 9 4 6 2 4 4 0 0 9 1 10 9 2 2 0 4 7 6 5 10 10 5 7 2 1 0 9 6 1 3 6 10 8 9 3 8 8 7 1 1 3 0 6 1 8 8 9 7 0 3 5 10 7 4 6 10 9 3 9 1 3 3 8 7 5 1 9 5 6 6 2 7 8 6 2 4 0 5 7 1 5 1 7 3 4 2 1 7 0 9 0 7 2 6 8 1 0 6 10 4 1 5 4 5 2 2 10 6 1 6 1 0 5 1 1 0 8 10 8 3 0 8 7 4 4 2 5 9 7 9 0 2 2 4 4 3 0 3 0 0 0 2 6 3 0 0 5 7 2 3 5 9 8 1 1 4 0 7 9 5 6 10 4 6 1 10  [...]
+0 2 3 0 7 9 8 2 10 10 5 1 7 6 10 4 2 5 8 10 7 6 7 2 2 1 1 4 6 7 6 8 9 2 1 0 1 9 5 3 9 7 8 9 1 4 10 4 6 2 0 3 2 9 5 9 10 9 9 0 0 10 0 2 3 9 8 3 1 9 0 2 2 6 6 5 9 2 9 8 9 10 6 4 1 8 5 5 0 10 10 1 2 4 5 4 9 0 7 2 6 10 2 6 5 6 9 2 8 1 3 4 7 10 9 2 1 0 4 1 0 5 8 9 7 3 9 8 8 3 4 7 9 10 5 10 4 3 8 3 4 4 5 4 1 5 9 9 10 4 8 1 1 9 5 10 2 0 5 9 1 6 9 6 5 0 5 4 9 9 5 2 9 2 10 2 1 10 4 4 0 8 5 0 8 9 9 7 4 2 2 3 5 7 1 8 0 7 9 6 5 9 3 10 0 7 4 5 2 9 4 7 8 10 5 10 3 7 9 10 8 3 8 3 6 10 2 8 10 0 8 7 4 5  [...]
+4 0 2 1 7 6 1 3 6 7 6 4 0 8 1 3 0 5 8 6 2 10 8 10 0 2 9 10 6 0 0 8 8 0 0 8 4 5 2 6 9 0 7 3 8 5 10 3 2 2 2 9 10 5 6 0 4 0 3 5 2 3 5 8 9 0 0 1 4 5 10 10 2 10 0 9 2 3 1 10 2 2 6 3 1 10 5 1 8 0 3 2 7 2 9 3 3 10 5 0 0 7 1 4 8 6 9 1 8 5 6 2 2 8 9 8 4 2 3 3 7 7 5 1 1 3 10 0 4 5 8 2 4 3 0 3 4 10 6 3 1 1 9 6 2 6 8 1 6 2 0 6 8 5 2 7 3 0 5 7 3 0 4 2 7 8 10 9 8 5 10 5 0 7 0 8 10 8 0 6 9 1 7 2 5 10 4 1 2 3 6 6 8 2 1 2 3 1 7 5 5 2 0 8 3 5 1 0 3 3 4 10 5 7 3 3 10 1 3 10 1 8 6 10 8 10 4 1 10 8 10 1 6 2  [...]
+2 8 8 2 0 8 1 3 5 5 2 8 10 4 3 0 7 5 0 1 3 5 1 7 5 1 8 10 0 2 8 8 5 2 2 3 8 1 5 7 5 2 0 8 0 9 6 6 2 5 4 1 2 5 7 9 7 3 0 2 6 8 2 1 2 5 5 0 0 6 3 1 8 10 4 2 10 9 10 5 8 10 2 0 0 2 9 7 8 6 4 9 8 2 7 6 7 6 0 3 7 6 10 3 9 8 8 3 7 6 1 6 5 9 1 10 7 9 3 5 6 6 3 9 0 10 4 2 3 4 1 3 5 0 4 6 7 8 4 8 9 4 6 4 3 10 4 5 2 9 9 1 7 6 0 8 10 1 9 1 1 7 5 8 10 9 8 2 6 10 10 7 5 1 2 4 3 8 3 1 7 1 8 3 8 2 7 9 6 9 7 1 7 3 8 1 6 1 1 1 2 2 6 9 1 0 1 3 6 3 1 2 1 3 5 5 2 9 6 3 8 0 6 7 6 3 3 2 2 10 10 5 10 0 6 7 0 2 [...]
+1 8 9 2 1 2 7 10 5 0 0 1 5 0 7 6 4 3 10 9 4 5 5 2 1 7 10 1 8 8 6 2 9 2 9 5 1 9 8 0 6 2 0 1 7 9 7 5 8 6 10 3 5 9 3 0 7 8 9 3 4 2 1 9 0 9 3 0 8 6 8 3 0 0 4 10 4 5 7 9 7 5 1 9 7 3 9 8 1 8 9 10 8 10 6 10 1 2 10 5 0 1 1 4 2 4 7 3 3 8 4 1 3 0 0 7 5 0 10 3 6 1 7 0 2 10 2 4 2 9 7 5 10 5 2 5 2 1 10 1 3 7 0 6 7 1 7 2 2 9 4 3 4 2 1 3 9 9 4 4 9 9 5 7 6 10 8 9 2 3 2 7 8 3 4 0 4 9 5 10 10 2 5 0 10 0 9 1 5 10 3 2 9 4 10 5 7 9 3 4 3 4 1 7 2 10 3 0 2 7 1 8 0 2 2 2 8 10 8 0 7 1 8 6 2 4 9 10 6 8 7 0 10 2 1 [...]
+10 7 9 6 10 10 4 4 7 1 9 5 5 5 0 4 10 6 5 5 6 3 0 5 7 3 2 8 3 9 0 4 7 10 10 1 5 0 0 7 7 5 5 9 9 5 3 10 6 8 8 9 2 1 7 8 7 7 1 4 7 8 1 4 4 6 3 9 3 5 4 0 9 7 10 0 4 10 8 8 3 5 0 9 0 4 5 5 7 9 0 4 5 7 6 5 10 9 9 9 1 2 10 7 9 7 1 5 7 9 0 5 1 1 8 4 1 3 3 7 6 9 8 0 0 5 0 7 1 4 6 7 0 4 8 6 7 2 0 3 5 5 2 4 8 2 0 8 5 9 8 3 8 6 9 2 0 10 0 10 7 1 3 2 6 5 2 4 1 2 8 7 0 10 0 2 5 1 4 8 5 2 5 9 1 10 7 1 1 8 3 4 1 9 4 9 7 6 9 1 6 7 5 9 1 10 2 3 8 9 0 8 10 6 1 6 7 0 10 3 5 2 5 9 10 4 4 9 5 7 9 3 1 1 1 3 8 [...]
+3 7 9 6 3 4 8 6 0 7 2 2 0 2 1 10 1 7 5 9 10 3 8 7 2 9 10 5 10 5 0 7 4 6 3 1 1 2 1 2 6 7 5 4 10 8 4 5 10 7 6 3 0 5 6 3 2 1 0 5 2 0 6 8 5 3 8 6 6 2 6 6 9 2 5 7 1 9 7 3 1 1 9 6 1 5 7 10 1 9 5 6 10 3 10 6 1 1 10 0 10 4 0 6 1 8 1 2 5 8 5 7 9 10 2 4 0 1 9 5 3 7 4 1 7 5 1 6 7 7 0 9 10 10 3 9 5 3 6 8 7 2 4 5 1 9 4 1 10 10 8 10 8 7 10 4 1 5 0 7 1 10 7 0 10 2 5 5 3 0 8 1 4 1 1 10 7 6 1 3 10 1 2 3 7 4 6 1 5 10 6 6 3 7 2 6 4 10 10 0 7 1 7 1 10 5 6 2 10 3 6 3 7 2 8 10 7 1 4 5 3 3 4 4 5 7 0 4 5 10 6 4 [...]
+7 0 10 1 6 1 6 10 5 0 0 2 5 7 8 3 8 0 7 3 10 8 1 7 4 5 0 2 10 10 2 3 10 4 2 5 4 7 7 1 2 2 9 6 4 2 8 4 8 9 2 3 4 4 5 6 2 0 5 1 5 0 10 9 0 5 4 2 7 8 7 0 10 1 7 0 5 9 4 5 0 5 1 6 4 6 5 2 5 6 10 7 9 7 5 2 6 5 7 9 8 3 8 5 3 4 4 1 5 9 4 8 1 5 0 10 2 2 4 2 9 2 6 9 1 6 0 0 2 9 2 7 1 5 10 1 6 7 10 1 1 8 0 10 2 6 5 8 8 7 7 6 2 9 6 6 7 1 2 10 6 9 9 0 9 1 8 5 0 3 8 8 0 7 8 1 0 10 0 8 10 4 9 8 5 4 5 8 0 2 7 9 5 0 10 10 9 7 0 3 8 6 4 2 1 7 9 7 9 8 10 8 3 5 1 7 10 1 10 10 8 2 4 0 9 1 1 7 7 9 9 6 9 1 8  [...]
+3 8 7 7 2 4 0 5 8 7 9 9 8 6 5 3 9 1 7 2 10 0 0 6 5 6 4 7 10 0 1 6 7 6 1 1 5 8 10 10 1 3 9 9 1 6 10 2 8 4 2 2 1 1 1 9 8 10 0 9 7 3 2 10 3 10 3 10 3 5 5 7 9 4 1 9 7 0 1 8 8 8 3 5 3 1 0 4 10 10 9 10 10 1 3 9 1 2 1 9 6 0 0 9 3 2 7 3 3 0 1 2 4 3 1 10 6 3 9 3 1 0 7 9 9 6 5 10 7 0 9 6 2 6 2 7 8 0 7 5 8 1 1 10 6 0 7 2 1 2 7 5 8 7 9 8 0 1 3 2 5 9 6 2 3 8 1 6 9 10 2 2 3 1 5 3 4 1 7 7 6 5 7 1 8 5 0 0 9 1 6 5 1 4 7 2 4 1 5 3 2 8 5 6 6 5 6 10 4 4 4 2 6 0 10 1 3 5 10 10 8 6 10 3 5 0 9 8 1 0 1 2 10 9 5 [...]
+3 4 6 3 10 6 3 6 4 0 10 6 4 10 10 7 1 3 1 0 1 1 2 4 10 3 1 6 3 6 10 5 0 10 6 7 1 9 7 3 1 0 4 9 0 1 4 10 1 6 7 1 5 10 3 0 6 9 7 2 10 5 3 5 1 5 2 4 3 1 0 4 9 0 2 10 6 3 4 5 7 7 7 6 9 8 10 9 4 7 3 9 5 5 3 4 5 10 7 10 6 8 4 0 0 2 5 3 5 9 7 7 10 5 6 0 6 4 8 5 6 6 3 4 8 5 3 10 9 3 1 0 2 10 7 8 10 3 1 2 2 6 10 5 0 2 3 8 2 6 8 5 9 7 2 9 3 2 2 5 10 7 6 1 7 5 9 6 2 10 7 6 10 9 5 3 8 7 9 8 0 6 1 8 0 4 1 8 5 9 10 1 7 5 2 5 2 7 9 9 6 7 7 5 9 6 4 8 10 7 2 10 2 10 10 10 6 7 3 9 6 10 4 7 9 2 6 7 10 5 6  [...]
+7 1 6 1 1 4 2 8 1 0 9 5 10 6 1 0 8 0 10 0 2 0 0 5 2 0 0 1 2 6 0 4 6 1 8 0 3 8 1 1 4 0 3 5 3 1 0 5 0 3 4 5 3 10 8 2 3 9 8 3 9 10 3 3 3 10 3 7 2 2 0 0 0 10 6 0 4 2 7 3 5 6 1 1 4 9 9 2 3 1 7 9 9 10 2 5 7 4 8 1 9 9 4 9 3 9 10 5 10 4 1 9 4 6 10 0 1 9 2 5 5 10 9 2 7 2 4 9 7 10 4 3 0 9 8 5 1 6 9 8 4 0 8 6 10 2 2 1 10 0 8 3 6 4 3 3 6 10 7 6 7 8 7 3 4 1 6 4 8 2 1 7 3 8 4 4 2 2 2 3 1 8 1 1 10 0 5 10 0 8 5 1 9 6 1 9 1 4 3 5 6 8 10 0 8 4 1 7 7 7 9 9 7 3 6 3 9 10 9 0 3 3 5 3 2 1 2 7 3 6 7 6 6 7 2 2 8 [...]
+7 10 7 6 0 2 1 9 1 4 5 1 5 8 0 4 4 2 10 0 10 9 9 0 3 4 10 3 8 2 3 2 6 8 6 3 0 0 5 4 6 9 0 1 4 9 1 8 0 2 9 8 5 1 6 6 8 2 6 10 6 9 2 9 4 1 6 10 3 3 8 1 7 8 2 8 10 8 3 8 4 9 7 1 5 8 4 10 8 8 10 9 4 2 0 4 9 1 5 5 3 0 2 2 10 8 10 5 3 4 7 9 9 3 9 2 5 5 5 9 9 6 4 4 4 4 10 10 4 6 9 2 6 6 7 6 3 7 5 5 2 5 9 8 4 7 7 1 7 7 6 2 8 8 5 6 0 8 4 3 6 2 0 1 10 2 10 5 9 5 7 3 9 7 4 5 10 1 7 9 4 4 4 5 0 0 3 3 4 8 8 5 2 6 8 6 2 10 5 0 6 1 5 8 5 5 6 3 1 5 6 1 8 10 5 8 3 5 10 3 10 5 1 8 6 0 2 9 4 5 7 4 8 6 8 3  [...]
+4 1 6 0 2 0 1 2 8 8 8 10 10 1 8 8 8 10 10 4 5 3 0 10 2 2 2 0 5 1 0 5 5 7 0 9 0 5 1 9 10 8 5 3 10 1 1 3 10 4 0 9 1 3 9 9 3 2 10 5 5 6 5 1 4 9 6 10 7 8 1 4 0 1 7 4 0 8 3 10 6 9 10 1 7 5 8 10 0 3 9 3 6 0 7 5 5 10 9 8 1 0 10 1 7 6 5 6 5 6 8 7 9 0 8 2 4 1 1 10 9 2 6 3 8 8 5 1 9 9 5 1 1 4 5 3 0 8 7 10 4 6 1 4 4 4 9 7 7 8 7 10 10 2 3 8 6 2 3 10 2 6 5 4 3 8 10 8 4 7 7 10 3 8 4 10 6 10 0 0 3 1 5 3 2 4 9 10 10 4 4 3 10 2 9 4 9 6 8 6 0 0 0 1 8 8 9 3 4 3 3 8 9 3 8 7 6 10 5 2 0 4 4 1 4 7 7 9 10 0 7 2 [...]
+10 6 5 8 10 3 9 10 3 8 7 6 4 3 10 5 2 5 3 4 1 7 0 2 2 1 8 8 6 9 9 1 10 3 7 4 6 0 6 2 5 7 1 7 7 0 4 9 4 2 4 6 0 7 9 9 4 7 0 4 7 5 5 8 2 2 10 8 1 0 0 0 9 7 1 5 1 10 1 9 7 6 5 7 9 8 8 0 5 9 2 1 3 9 7 0 10 3 4 7 6 0 5 2 6 2 0 9 0 10 5 6 0 10 10 10 5 2 3 10 6 1 7 4 2 4 9 9 8 3 8 1 7 5 6 6 1 5 9 10 4 10 10 2 0 9 0 2 3 7 8 6 1 7 1 4 1 5 4 7 3 9 6 5 3 5 7 3 10 6 0 5 4 9 5 10 6 2 1 1 5 7 5 4 3 7 3 2 1 3 8 3 6 3 1 10 7 1 2 4 4 1 6 9 3 9 7 4 0 7 5 0 1 9 6 4 4 7 5 1 0 8 8 2 3 1 6 8 6 9 0 4 10 3 10 3 [...]
+0 10 5 8 7 10 8 4 10 7 1 2 7 7 6 5 4 2 9 3 1 3 8 8 7 6 6 5 4 9 6 3 4 5 10 5 0 1 5 2 9 1 9 4 0 5 10 4 6 0 3 8 4 6 5 10 1 0 1 5 5 10 8 5 8 0 0 5 8 6 9 9 7 0 1 4 4 7 9 4 9 0 7 9 7 1 0 10 2 4 5 2 4 2 5 9 8 9 2 6 3 10 8 0 8 10 3 6 6 6 7 10 5 6 2 8 8 9 3 6 6 7 1 3 4 9 9 2 7 3 5 7 8 7 10 7 5 4 8 3 1 0 8 8 3 8 1 6 0 7 3 7 9 6 2 4 2 4 0 4 7 2 9 8 6 9 8 2 8 0 9 1 0 1 6 6 2 4 5 0 5 1 9 6 10 4 6 2 9 5 0 4 1 6 6 2 5 3 9 1 3 8 2 1 9 6 2 2 10 10 6 2 10 6 9 8 7 10 10 9 3 7 6 7 10 3 9 9 9 8 5 7 7 4 8 8 6 [...]
+2 4 1 1 10 2 8 10 9 0 10 4 9 1 0 2 7 2 5 4 10 0 10 3 7 7 3 1 7 1 1 10 5 1 6 2 0 7 2 5 5 0 3 4 3 1 2 6 3 1 2 3 3 8 2 2 0 2 4 6 5 0 8 3 2 4 0 4 5 3 5 8 9 2 6 4 8 0 3 0 5 2 3 0 1 5 9 5 0 7 7 5 3 3 4 8 6 3 10 6 1 9 5 5 6 0 5 3 9 5 9 9 8 9 2 3 4 2 8 10 8 5 3 1 8 7 0 0 7 6 6 8 6 3 6 10 4 4 0 6 6 6 2 8 1 7 7 5 3 2 3 10 10 2 8 2 0 8 1 0 4 5 2 4 3 5 5 4 3 9 1 7 5 4 3 4 6 10 8 2 0 5 6 1 10 4 6 0 8 8 1 10 5 6 10 3 10 9 6 10 4 8 6 3 3 0 1 7 4 7 9 6 9 7 5 6 2 7 0 8 4 8 6 7 10 10 6 8 3 1 3 9 5 6 7 5 1 [...]
+0 3 7 6 7 9 2 4 6 5 8 9 0 6 8 5 7 4 4 5 7 1 9 1 4 7 7 6 2 6 2 1 8 2 4 9 5 10 2 6 3 6 3 8 9 4 1 10 0 8 10 10 3 3 5 9 8 5 3 6 9 2 2 1 6 8 0 5 0 7 1 10 2 10 7 1 9 1 9 2 0 4 0 4 7 5 1 0 6 10 1 4 8 0 5 0 4 0 5 4 10 10 6 10 0 4 10 5 4 7 10 5 5 4 4 0 4 0 4 7 2 3 0 2 9 3 10 9 10 2 6 1 3 0 1 10 9 10 3 3 0 10 2 9 7 0 10 9 7 0 1 8 9 2 8 3 7 8 8 10 0 10 4 9 6 4 5 8 9 7 10 6 5 9 10 8 1 8 1 9 10 6 2 3 10 2 9 0 1 3 0 4 8 6 2 1 9 3 2 6 3 9 4 10 2 8 8 3 0 0 6 5 9 9 9 4 0 6 4 8 3 10 9 1 4 3 6 8 6 7 6 10 7 [...]
+0 9 0 5 3 3 1 5 7 2 6 1 8 1 9 6 9 2 4 6 2 7 0 3 0 7 8 3 8 0 4 1 5 1 0 7 9 1 3 6 9 2 3 6 0 2 3 3 7 7 6 1 1 9 8 10 8 4 10 3 8 2 8 8 8 8 9 3 8 4 4 9 9 9 6 2 2 10 8 7 3 1 8 9 2 5 8 7 0 3 7 9 5 9 8 4 8 1 3 1 10 3 5 1 9 0 1 5 9 0 4 1 9 7 4 6 8 6 4 10 4 7 3 0 0 2 7 9 8 5 5 1 4 8 10 6 7 4 6 1 1 7 5 1 5 0 10 0 10 2 9 10 7 10 0 5 6 7 4 2 2 1 7 9 1 6 10 9 1 7 0 4 1 2 2 1 10 10 1 7 9 1 0 9 7 5 10 7 2 3 6 7 10 2 6 6 1 10 4 8 8 3 4 8 0 2 3 10 5 10 9 1 3 1 5 7 0 4 9 6 7 7 4 2 1 5 7 5 3 6 8 7 9 3 8 1 3  [...]
+5 7 4 4 1 9 6 9 0 3 10 0 8 4 4 0 6 3 0 0 3 0 0 6 4 4 10 7 5 4 3 3 3 9 5 2 3 9 4 6 2 3 5 7 7 7 4 1 0 3 2 0 0 6 7 7 2 1 10 4 5 8 3 7 3 9 0 10 0 6 7 6 9 2 1 1 3 3 6 1 7 8 1 1 5 3 3 2 7 8 4 5 3 8 2 5 6 9 4 2 4 2 10 3 8 5 6 8 5 5 10 0 6 5 9 7 7 4 2 1 2 6 10 1 8 6 4 4 10 3 0 9 0 2 6 4 7 10 6 8 2 2 0 2 2 4 4 5 3 9 8 1 1 7 1 7 10 7 1 3 7 3 3 0 0 9 3 4 4 9 2 8 8 1 1 1 3 9 4 6 7 2 9 1 6 3 9 4 7 3 10 4 7 7 3 6 0 6 7 10 3 0 0 6 2 2 0 1 8 8 9 6 3 8 1 2 10 9 2 0 8 8 10 0 0 2 4 9 5 0 4 7 10 1 2 3 8 9 7 [...]
+3 10 7 10 10 3 7 4 0 10 4 2 7 6 9 9 6 1 9 0 8 8 9 0 0 6 5 3 0 5 6 9 3 10 7 4 6 0 4 10 9 4 7 2 3 4 9 1 0 9 4 1 1 10 3 10 7 1 9 7 2 4 2 8 2 9 1 8 9 3 1 6 7 3 5 0 0 5 4 3 4 10 2 4 7 8 4 3 3 7 5 2 2 8 7 8 3 5 3 1 9 7 4 0 9 3 2 7 3 0 4 3 6 3 5 5 4 8 10 0 8 8 4 3 4 4 0 6 7 1 10 4 6 3 3 2 0 10 9 6 6 3 8 7 9 3 0 8 7 10 1 7 8 5 6 7 6 1 8 10 6 10 3 0 4 6 3 10 7 8 10 2 1 3 4 9 2 9 2 8 4 4 1 6 0 10 5 10 6 5 3 10 4 5 5 1 5 6 9 9 7 2 0 7 7 6 8 7 6 7 6 7 0 9 3 10 8 7 7 9 2 9 3 10 3 2 8 6 2 2 10 3 10 9  [...]
+9 0 10 4 0 5 9 10 1 2 7 2 7 0 4 5 7 8 3 10 10 2 5 3 5 5 2 2 2 2 3 1 5 7 5 5 6 9 8 2 7 4 1 10 7 8 10 3 1 6 0 3 2 10 3 3 8 8 4 7 3 1 8 10 6 5 0 5 3 5 2 9 3 1 1 6 3 2 3 8 3 3 5 9 5 3 8 3 0 8 9 1 7 10 7 1 3 10 6 5 1 10 6 0 9 0 9 6 5 7 3 10 5 8 6 2 8 2 2 8 10 9 2 5 1 1 1 8 8 0 6 2 7 6 4 5 2 6 0 3 9 6 4 9 0 5 3 9 9 0 4 6 2 5 1 8 8 8 6 7 3 9 10 9 6 1 3 0 3 1 9 5 0 10 1 6 10 8 5 0 2 8 9 7 2 9 7 8 2 3 9 4 6 7 1 9 7 10 9 3 1 7 4 9 7 5 0 0 10 7 2 10 9 5 1 6 9 9 10 6 10 8 0 9 8 1 5 2 1 4 0 10 2 6 0  [...]
+8 3 7 7 5 3 9 2 2 6 9 10 1 9 5 0 2 9 1 10 0 5 9 9 1 2 7 0 6 4 6 6 7 4 9 1 7 2 6 7 3 4 1 7 8 5 0 3 7 0 8 0 4 9 2 2 2 3 6 8 0 6 10 4 1 8 1 7 2 2 5 10 5 10 3 2 8 2 10 4 5 2 3 1 8 9 8 3 10 9 10 0 1 2 8 4 1 8 10 0 4 9 10 4 1 5 5 3 1 4 5 5 4 3 6 5 1 0 5 1 6 7 0 5 5 7 9 3 8 3 1 9 6 7 0 5 5 9 1 6 8 9 2 10 8 6 2 9 6 4 7 3 6 0 10 8 5 4 10 1 2 10 10 1 9 10 10 10 5 6 5 1 6 10 10 6 1 8 4 8 6 5 7 8 5 9 6 6 3 3 2 9 5 7 10 1 8 7 5 9 9 10 0 10 8 5 5 0 6 1 5 8 0 6 4 7 8 2 7 7 9 6 10 1 7 8 10 1 7 6 0 10 8  [...]
+1 1 6 9 2 5 6 2 5 0 7 3 0 3 6 6 8 6 0 4 4 1 2 2 0 6 7 4 3 2 2 8 2 1 10 6 9 8 10 1 10 7 7 1 8 4 5 2 5 5 1 9 8 3 6 10 6 0 4 6 6 7 4 5 9 0 4 9 7 5 8 5 3 8 8 7 0 3 4 4 3 5 9 7 3 7 6 1 2 5 3 8 0 3 10 9 9 8 3 2 7 2 2 1 1 9 2 4 1 2 8 6 6 7 8 4 9 2 2 0 4 0 5 0 0 7 9 5 0 2 10 1 2 5 9 1 7 3 2 3 7 6 1 7 10 0 1 1 1 8 9 10 0 1 4 7 2 4 1 2 10 2 1 0 5 5 0 7 6 7 2 8 7 1 2 8 9 0 8 0 7 6 1 0 10 7 3 8 6 10 8 7 7 8 3 10 1 4 6 3 4 8 7 1 10 4 8 9 4 0 7 6 8 6 5 8 7 1 3 3 7 3 4 5 2 4 0 1 2 10 9 0 2 6 1 7 5 8 9  [...]
+4 3 5 4 0 5 9 5 3 5 6 7 3 9 5 1 1 6 6 4 2 6 4 2 6 5 0 8 3 9 1 0 3 0 1 1 1 7 7 1 1 3 2 8 5 7 1 2 1 7 8 6 6 6 9 4 3 4 4 2 2 8 5 3 5 5 7 5 5 7 8 5 3 9 2 4 5 9 10 10 1 2 0 7 1 4 5 2 4 1 5 6 1 8 8 10 4 6 10 5 10 8 8 4 7 3 4 6 5 0 6 6 7 9 6 0 9 5 0 7 10 6 6 7 8 6 8 1 2 7 4 10 5 8 0 9 6 3 6 5 3 9 3 0 0 1 0 9 10 2 8 5 4 9 6 5 8 10 0 0 1 0 7 0 5 5 1 6 6 10 8 4 8 9 10 9 10 4 9 6 8 7 5 7 9 4 5 1 0 4 10 5 7 6 1 10 2 6 4 1 9 5 9 6 4 1 3 8 4 5 4 2 10 6 8 1 9 0 1 6 4 9 5 1 9 1 1 1 1 7 7 1 2 6 9 8 2 4 7 [...]
+2 5 3 6 9 6 3 7 2 3 1 9 5 5 5 2 8 2 7 6 8 4 0 3 1 10 8 2 7 10 2 4 10 4 7 9 8 7 5 9 0 10 4 1 0 2 5 7 1 7 0 7 10 9 5 6 9 8 2 5 5 0 10 1 2 7 6 5 4 9 2 3 6 3 7 6 10 9 6 7 5 4 4 7 5 7 8 2 6 10 9 2 3 9 6 3 2 0 2 1 8 2 2 5 4 5 7 7 1 7 0 7 8 6 10 8 0 5 0 8 0 5 7 10 8 4 6 4 4 9 6 6 8 3 6 3 8 1 9 9 3 1 3 7 4 5 5 8 8 5 5 1 2 0 0 9 3 0 3 4 8 7 3 5 6 5 10 8 10 6 2 7 7 9 0 5 8 0 5 10 1 8 10 6 1 10 8 1 6 4 2 7 6 9 9 2 5 2 6 2 8 8 1 7 9 1 7 2 9 6 8 9 2 6 3 9 6 0 3 0 9 6 1 0 5 9 4 2 5 4 10 10 2 7 8 7 10  [...]
+0 0 6 2 2 3 5 8 10 5 1 10 1 10 5 9 10 6 3 2 5 1 1 0 7 9 7 8 4 8 1 6 7 7 8 7 4 9 1 9 8 8 0 6 7 0 9 8 7 10 3 8 1 4 7 1 8 1 5 3 8 9 2 0 2 1 1 2 8 6 8 0 2 10 8 6 8 10 2 1 1 5 10 0 7 1 3 2 0 1 3 0 0 4 10 3 6 10 0 7 3 7 3 7 10 2 9 0 5 9 10 3 10 5 6 10 8 3 5 7 7 3 10 10 10 8 6 3 1 0 4 6 10 1 5 8 10 10 7 4 9 8 6 9 2 4 7 4 4 3 9 10 4 10 6 9 0 8 7 2 4 0 4 4 6 5 9 5 6 3 0 6 2 2 7 8 4 1 3 7 0 0 7 9 6 6 3 8 7 5 9 3 10 0 0 1 2 2 8 5 7 4 2 5 9 3 2 9 1 7 10 6 8 3 2 8 3 4 2 9 10 4 5 8 2 8 7 9 1 2 5 7 8 7 [...]
+10 1 3 6 2 1 1 8 8 9 4 0 1 9 0 4 3 6 1 6 5 10 0 9 7 4 3 4 4 7 7 9 1 2 6 0 3 10 9 9 10 8 8 8 7 8 8 4 1 6 4 1 7 2 8 5 3 3 1 5 7 8 0 6 9 4 7 10 3 4 8 3 2 1 1 7 1 0 10 7 8 9 3 2 10 2 10 6 4 0 1 0 3 3 4 5 5 0 2 2 2 2 1 10 0 6 10 5 7 7 0 3 8 10 0 9 8 9 0 6 6 4 8 2 0 7 10 10 2 5 4 3 8 2 5 4 7 3 10 2 9 4 0 5 5 6 7 7 4 10 8 0 10 5 8 9 9 1 7 10 2 7 1 8 9 1 2 5 7 7 3 7 6 3 8 4 8 7 3 9 4 2 6 10 6 7 4 7 6 9 2 10 0 4 0 3 2 3 2 0 5 3 7 10 5 2 8 3 2 3 5 1 3 4 10 6 10 6 3 0 3 1 3 2 0 5 1 9 1 4 8 6 7 5 9  [...]
+4 6 1 9 4 10 8 1 9 9 10 7 2 8 8 1 7 4 5 2 5 8 0 7 5 2 7 6 0 7 5 7 8 3 5 9 5 9 7 4 1 3 2 7 9 0 4 8 0 9 9 6 8 5 6 6 0 4 10 1 1 7 2 7 9 8 5 2 1 0 9 0 10 2 1 9 3 5 10 2 4 0 2 1 6 0 3 6 9 5 7 2 3 10 7 9 2 2 8 0 10 8 0 8 9 9 5 5 4 7 3 0 0 5 10 4 5 6 4 4 8 5 2 2 5 5 2 1 4 0 8 4 10 2 9 9 6 7 2 9 5 6 6 7 3 7 0 5 4 0 6 0 2 0 8 1 8 10 2 8 5 9 3 9 2 1 9 0 5 2 4 4 10 5 4 8 8 9 6 5 8 7 0 9 9 0 4 1 0 8 5 9 2 8 10 4 5 6 10 10 2 4 5 6 1 0 7 7 2 10 1 5 9 4 5 0 2 6 3 1 10 1 8 2 7 4 7 1 6 7 7 3 3 4 8 8 1 7  [...]
+8 5 9 1 2 0 2 1 2 9 3 2 0 4 5 3 2 7 3 6 8 7 8 4 5 2 1 7 7 10 0 10 10 8 8 0 0 3 6 8 10 8 5 1 10 2 1 5 2 9 5 9 3 7 1 10 8 0 10 7 9 9 3 10 6 7 6 5 0 5 10 9 6 10 3 5 5 2 1 0 3 9 7 7 9 3 10 3 0 10 7 10 6 0 4 2 9 9 9 3 2 7 0 5 2 10 10 1 9 4 5 1 8 1 3 2 6 4 1 10 0 4 3 7 5 7 9 3 5 6 0 3 5 7 6 10 5 3 4 7 6 3 10 4 3 7 6 4 5 9 5 10 1 7 5 1 3 0 6 7 2 2 1 7 3 3 5 6 2 10 3 10 8 10 4 10 2 9 8 0 9 3 2 6 2 6 10 5 9 8 3 9 5 2 2 1 5 8 2 10 0 3 7 1 2 2 1 0 4 0 10 1 8 2 10 2 3 9 0 5 4 2 4 7 0 5 7 8 7 6 7 7 5 [...]
+8 7 1 1 3 4 3 4 1 7 10 4 2 7 4 5 2 2 4 7 3 0 5 7 4 7 9 5 0 0 2 5 10 6 8 10 3 5 7 9 7 7 1 4 0 3 3 0 4 6 3 2 7 3 4 5 1 8 2 8 6 2 8 6 9 9 8 2 8 2 7 1 4 2 1 3 10 3 9 10 10 5 0 8 0 10 8 9 8 0 9 1 5 4 1 1 0 7 5 0 10 2 2 5 4 3 5 1 1 1 4 2 7 10 5 2 6 3 6 3 2 0 10 1 6 9 9 1 4 9 4 2 9 8 3 4 8 0 8 2 2 9 3 8 3 5 3 9 7 0 8 8 1 6 9 0 5 5 3 5 7 10 2 5 3 7 6 4 5 1 9 2 0 5 6 1 1 4 8 3 5 5 7 9 3 4 2 3 9 10 2 4 4 8 3 4 8 3 10 1 0 7 10 6 0 6 6 10 6 9 9 3 8 10 10 5 8 6 2 4 0 3 0 5 10 9 5 3 3 10 0 0 0 10 2 7  [...]
+6 10 5 8 9 5 3 3 10 7 7 4 3 6 6 4 7 0 4 5 8 9 5 3 9 9 5 8 0 10 2 3 2 9 6 10 7 0 7 5 3 3 0 6 9 7 10 0 4 5 4 1 7 6 9 4 0 7 8 9 9 8 6 1 8 8 0 10 5 6 5 3 10 0 10 10 6 9 5 10 8 6 6 9 0 6 4 0 1 4 2 9 8 7 9 2 0 2 1 6 2 0 6 5 0 1 8 5 8 10 8 9 1 0 2 3 2 5 9 2 5 7 5 2 8 3 8 9 1 3 4 8 1 9 4 4 5 1 3 0 9 2 3 8 4 4 5 3 0 1 4 0 0 6 5 4 10 0 7 10 3 0 7 9 3 8 1 9 1 5 6 1 4 0 7 5 3 8 7 6 0 8 10 9 5 9 9 6 3 0 3 2 8 1 10 3 3 3 0 8 8 1 2 7 10 10 2 3 2 1 1 8 2 6 2 4 2 5 5 10 5 5 6 0 9 10 10 10 0 0 1 7 8 1 7 7 [...]
+7 10 10 9 7 10 1 9 5 4 8 6 6 10 4 8 4 5 9 5 10 0 9 3 4 8 9 6 4 6 10 1 7 9 8 7 10 10 1 8 4 6 8 10 3 10 7 1 4 3 7 7 10 5 0 9 1 5 7 10 9 0 4 10 10 2 3 2 7 9 2 4 2 3 5 8 2 1 9 4 2 2 10 7 1 7 1 7 10 5 6 10 8 9 4 0 1 10 5 2 7 4 0 2 1 2 3 1 4 0 2 2 2 7 10 5 9 10 5 5 8 9 10 8 7 0 9 10 2 10 2 0 2 6 10 3 8 0 8 7 6 8 1 6 4 6 2 8 2 0 10 4 5 9 10 7 5 9 5 5 3 9 4 0 6 10 4 5 0 0 9 0 9 6 3 2 10 9 5 9 9 10 6 3 5 5 4 0 5 6 9 9 4 6 8 1 0 9 7 8 9 9 0 9 0 0 5 1 1 8 5 7 3 2 4 7 7 4 2 4 6 2 7 2 6 4 10 1 4 0 5  [...]
+10 6 4 5 10 4 4 7 1 7 5 9 9 7 9 4 8 7 5 1 6 1 8 8 0 9 5 10 0 7 7 1 4 7 7 8 4 0 5 6 7 0 9 7 0 4 7 8 0 1 0 2 0 6 6 10 8 6 3 9 5 0 4 7 7 5 1 6 3 2 8 5 0 8 10 3 4 1 6 5 5 0 6 2 2 4 3 7 7 5 8 7 4 7 1 7 9 5 0 9 2 7 7 1 2 5 4 8 5 7 2 9 10 4 8 6 7 9 1 1 8 10 8 1 1 8 6 9 3 5 2 3 8 9 5 7 0 1 7 2 8 3 9 10 1 7 0 1 6 0 5 6 1 8 3 9 8 2 4 5 1 3 5 7 8 1 6 2 8 3 1 0 5 1 3 0 0 0 3 8 7 5 0 5 4 9 4 9 5 3 5 9 10 2 8 4 9 3 10 7 7 7 7 9 7 3 6 6 8 8 7 4 1 5 7 9 0 5 2 6 1 9 2 6 4 3 8 10 0 10 4 9 3 2 4 4 9 7 4 4  [...]
+5 8 10 0 2 9 8 5 9 3 3 0 10 9 9 8 6 3 8 10 8 9 3 9 5 7 7 8 1 7 1 0 10 6 2 2 6 5 3 7 8 5 9 1 2 8 3 4 8 9 1 0 6 1 0 9 1 9 3 5 0 9 0 9 6 1 0 10 8 9 0 7 1 2 4 3 1 2 9 5 0 10 8 6 10 5 4 5 9 8 5 1 4 3 10 9 6 10 9 5 2 4 0 6 10 1 3 2 8 0 1 7 4 5 7 3 4 6 7 5 2 2 4 0 7 0 1 9 6 8 4 6 3 3 4 2 4 3 1 0 8 2 10 1 8 9 2 6 10 5 6 9 5 5 6 3 4 7 4 3 9 1 7 1 2 7 9 2 4 7 2 1 1 1 3 10 1 8 4 1 3 4 8 3 7 4 6 8 10 2 3 9 3 3 9 2 7 9 1 0 2 0 9 7 2 7 2 9 7 2 2 6 6 1 2 4 8 10 5 2 6 7 9 8 8 2 5 0 1 8 6 5 0 0 1 5 10 3  [...]
+9 8 7 7 0 2 5 2 9 7 7 4 6 0 1 4 4 3 0 8 0 2 8 5 7 6 7 6 5 7 8 0 7 6 0 8 1 10 10 8 0 4 9 1 8 7 4 1 2 10 8 1 0 1 7 9 3 5 1 6 3 3 5 8 1 5 7 5 3 7 3 6 2 0 5 10 2 9 6 1 7 5 6 2 1 0 10 6 6 9 10 3 9 1 8 9 7 7 10 4 7 6 8 1 2 6 10 10 5 5 5 1 5 8 8 7 3 1 0 6 4 7 2 4 9 0 2 5 9 3 5 8 10 10 6 1 8 5 4 10 7 8 0 8 9 9 0 6 4 10 1 3 8 8 5 6 7 5 10 8 9 3 5 6 2 4 1 10 6 8 2 0 6 1 4 9 1 8 7 7 4 5 9 10 5 1 2 10 8 0 4 1 5 3 5 5 1 5 9 0 0 1 3 7 4 8 1 2 2 3 6 7 4 7 1 8 2 6 2 8 2 6 4 3 0 5 8 3 3 0 1 10 0 7 6 0 10 [...]
+8 6 2 10 8 2 9 8 7 2 9 9 5 0 5 10 1 8 9 2 10 4 9 4 6 6 10 10 6 5 5 4 2 0 6 4 1 2 5 0 8 2 5 0 9 3 2 0 5 5 4 3 4 10 6 4 5 7 4 7 5 5 5 5 4 3 2 1 0 3 10 3 6 8 7 10 5 2 2 6 2 10 2 5 4 9 5 0 9 7 7 10 2 6 10 6 7 9 7 6 5 6 10 5 8 10 8 4 3 3 8 10 9 9 8 4 2 8 7 3 2 5 5 7 7 0 5 6 7 10 9 4 6 7 10 9 1 2 9 1 2 0 9 8 2 2 6 4 7 3 3 2 5 10 3 1 2 9 8 1 9 5 2 9 3 6 8 2 5 9 1 4 8 0 7 10 1 9 4 9 10 6 1 9 7 9 8 3 4 3 0 0 1 7 4 6 1 2 4 5 10 1 6 6 7 2 9 0 9 7 10 8 3 6 6 3 0 3 7 4 1 10 3 5 2 0 4 0 2 0 6 3 3 4 2  [...]
+5 6 7 9 6 9 5 1 2 2 7 10 7 3 9 4 10 0 9 3 4 7 6 7 7 1 8 8 9 7 8 10 7 4 4 3 1 3 5 10 4 7 7 7 6 4 10 0 10 8 6 7 10 5 1 6 3 2 9 0 2 5 7 5 4 4 0 10 8 7 3 10 10 0 4 0 8 7 8 5 10 0 10 3 1 7 8 0 10 8 7 1 0 10 6 1 6 4 5 2 3 1 0 6 9 7 0 5 1 6 5 4 3 2 7 4 4 2 5 6 3 1 10 10 0 2 2 3 6 9 2 2 0 10 8 9 8 2 5 5 1 10 7 8 5 3 10 7 7 1 10 5 4 4 10 2 10 0 7 8 2 9 8 9 9 7 3 9 1 7 9 7 7 7 6 1 3 10 0 3 8 4 4 10 6 7 4 7 4 6 6 1 3 4 0 5 6 9 9 1 7 0 4 3 9 10 5 7 5 2 3 2 5 4 7 4 8 1 2 0 5 3 6 4 10 9 9 10 7 5 9 1 7 [...]
+0 2 9 1 2 4 6 0 8 8 1 5 10 0 4 8 9 4 3 4 6 2 2 2 1 2 6 1 2 4 9 1 5 5 3 6 7 6 1 7 4 1 6 6 5 5 6 0 4 8 3 9 3 4 0 2 0 9 6 4 8 7 4 1 3 0 1 5 3 4 8 2 5 9 8 5 7 7 5 5 2 7 3 2 7 2 3 10 7 10 4 10 6 3 4 1 8 6 10 5 10 4 10 3 10 7 7 7 4 1 6 7 7 0 8 0 2 0 8 4 7 1 1 7 0 6 0 7 4 6 0 10 3 1 3 3 6 8 6 1 9 1 8 5 2 8 7 8 3 2 4 7 10 5 1 9 7 6 9 10 1 2 3 4 5 9 8 8 9 6 7 8 4 3 2 9 3 0 0 4 2 3 7 7 3 0 9 0 5 2 7 1 2 4 10 6 8 8 8 9 10 6 10 0 8 1 7 2 4 5 4 10 6 1 0 5 6 3 2 4 9 10 5 1 9 9 0 1 1 8 6 0 3 10 0 7 4 5 [...]
+2 6 7 9 6 2 10 2 4 7 2 10 8 10 10 9 3 1 4 8 8 10 8 3 1 10 8 9 1 10 5 6 1 7 2 7 2 4 0 3 2 7 6 7 6 8 1 1 7 0 5 7 3 8 10 2 3 7 1 8 2 6 3 8 4 1 6 5 10 10 0 7 1 6 9 2 5 3 6 9 9 8 5 5 5 5 10 4 6 8 6 3 9 4 9 3 6 2 3 6 0 0 10 2 8 6 5 1 9 6 2 0 10 6 2 0 2 7 2 1 6 5 2 4 1 2 3 10 7 1 8 3 1 10 6 10 10 9 8 7 9 7 9 8 10 7 3 9 2 2 1 2 7 2 1 3 0 2 5 10 5 0 7 1 7 2 1 8 2 7 9 7 2 1 7 6 6 6 0 2 4 2 10 6 9 10 1 10 0 0 0 3 6 3 0 6 10 5 1 3 8 2 3 8 7 6 9 8 9 5 0 5 5 10 9 7 5 0 6 2 0 4 1 8 2 6 10 6 1 4 10 6 1  [...]
+8 8 1 8 9 10 3 0 4 4 3 9 10 8 6 3 1 4 3 3 5 6 4 5 6 9 4 7 1 7 3 1 5 6 2 0 5 4 6 5 4 1 3 5 10 0 8 3 7 8 2 3 0 9 5 5 8 10 7 10 7 5 9 6 3 7 5 3 10 8 9 3 8 8 6 7 7 1 6 2 5 8 5 2 6 4 7 4 3 1 8 1 8 8 7 4 0 6 7 3 1 8 7 1 8 7 5 4 1 6 9 3 10 3 2 5 2 9 9 8 4 8 8 7 9 6 5 4 7 6 9 7 7 3 2 2 1 8 4 7 10 0 7 10 4 3 1 10 1 6 5 1 10 8 1 3 4 2 8 7 0 2 10 5 5 1 4 6 10 1 9 8 8 9 3 6 9 7 5 4 7 0 6 0 10 10 0 5 1 5 5 0 6 1 9 4 4 9 8 2 2 8 5 9 4 0 7 0 5 7 8 9 1 0 0 7 5 6 2 1 1 0 10 1 4 4 4 3 4 10 1 2 3 3 1 2 2 1 [...]
+10 6 1 3 2 0 6 3 6 3 2 5 1 0 0 10 2 5 0 7 10 2 2 9 6 4 3 10 2 3 0 1 1 10 2 1 5 9 5 2 6 9 8 4 2 0 0 3 8 0 6 7 6 2 10 1 2 10 7 4 4 5 6 5 4 0 9 9 1 8 0 10 1 5 6 7 4 2 3 1 10 3 0 2 5 3 8 10 9 4 4 4 1 3 3 1 4 9 7 8 5 5 0 1 1 3 8 6 2 2 5 7 4 8 9 5 7 0 0 6 10 10 3 1 9 5 1 9 10 4 8 5 8 7 7 7 10 7 4 10 9 10 3 7 5 6 4 3 8 0 4 6 8 4 2 3 2 6 1 6 6 4 7 10 5 2 2 5 8 4 0 4 2 7 8 4 0 1 3 4 1 10 9 3 7 3 4 1 4 6 5 10 1 9 8 2 1 7 10 1 10 1 5 10 6 3 2 3 4 4 3 4 4 4 10 7 0 5 7 0 1 10 7 9 0 9 10 3 8 1 4 3 1 2 [...]
+4 0 0 5 6 7 8 0 6 4 7 6 5 2 7 2 2 8 0 7 3 8 9 9 4 5 4 10 6 4 2 2 9 0 7 5 6 0 9 0 4 5 0 9 7 1 2 3 10 9 0 5 9 0 9 3 5 10 1 0 2 6 0 3 9 5 8 2 2 7 5 7 2 1 3 3 5 2 2 6 2 3 1 10 3 10 4 5 2 9 4 6 6 7 0 9 5 6 9 9 6 5 4 2 2 6 10 9 3 1 7 10 7 3 2 3 4 2 8 2 4 5 3 8 4 10 0 0 0 7 8 2 4 2 4 3 9 10 5 4 0 9 3 2 5 10 9 6 3 1 1 7 8 2 8 1 2 0 8 10 3 9 10 0 5 7 6 3 10 9 3 5 5 0 10 3 0 4 9 4 6 0 10 3 8 9 0 3 5 4 5 9 9 3 1 0 4 5 3 1 6 9 5 3 5 10 1 10 1 6 8 6 10 9 8 7 10 7 4 2 2 4 6 6 7 6 1 8 5 0 8 9 0 5 6 1 0 [...]
+3 10 10 4 3 9 6 5 9 2 5 7 1 9 3 2 7 9 8 4 6 0 3 3 9 1 7 0 9 2 9 4 8 0 6 4 0 6 5 3 0 3 5 8 9 0 7 9 2 2 7 10 4 1 7 6 7 2 1 7 1 5 10 7 1 0 10 9 3 9 9 8 6 7 8 4 8 9 1 2 0 7 3 1 2 2 9 4 6 3 1 10 2 6 10 9 9 9 4 9 4 5 8 6 3 1 3 5 6 6 7 1 10 1 4 1 2 8 5 2 8 5 7 3 1 5 10 2 3 5 10 10 1 10 4 9 3 6 3 6 9 2 3 9 2 5 2 5 3 3 2 3 3 4 1 2 1 6 5 1 3 9 4 2 1 3 6 6 5 3 5 8 7 1 9 6 4 9 2 9 9 2 7 8 4 6 4 9 1 10 0 3 5 3 5 10 2 8 4 3 2 0 1 9 3 2 6 8 0 2 2 8 10 10 6 3 8 4 5 7 1 4 5 9 8 5 3 6 10 5 10 6 10 3 5 4 5 [...]
+1 0 1 5 2 2 1 0 3 7 10 1 4 4 1 2 0 4 0 8 9 1 1 6 3 4 2 6 8 1 10 5 8 2 6 2 8 4 8 9 7 1 10 7 6 0 0 8 7 6 7 10 8 4 7 3 3 8 0 6 0 9 7 0 6 9 6 3 0 2 5 3 10 9 7 4 4 2 1 10 3 9 5 4 6 3 5 3 2 4 9 4 4 5 0 8 6 7 10 5 8 2 9 0 10 3 3 9 0 9 6 6 10 10 0 10 6 5 0 3 10 7 6 7 0 6 6 8 6 7 6 2 10 3 2 2 7 6 4 5 1 4 6 8 0 7 4 3 10 5 7 3 5 8 2 4 3 0 2 2 2 0 10 7 0 2 10 10 8 10 3 6 9 5 7 10 0 0 10 8 10 5 2 1 8 2 6 8 10 7 4 4 3 6 1 1 9 8 7 8 4 10 5 4 9 4 6 7 3 5 6 8 10 9 4 10 4 7 6 5 7 3 7 7 0 1 7 4 9 5 4 2 3 3 [...]
+2 7 3 10 1 4 9 5 2 9 8 7 3 5 9 0 0 3 10 10 9 6 4 3 10 3 4 9 9 2 4 5 8 9 8 5 10 6 3 6 0 7 8 7 5 8 2 10 0 5 6 3 7 8 1 4 0 9 6 0 2 1 7 8 2 7 0 4 0 4 9 2 8 3 6 4 7 3 2 1 4 7 3 5 6 4 8 5 4 1 2 6 1 0 7 6 0 5 3 4 4 9 8 6 5 8 8 8 10 2 3 4 3 8 5 9 10 5 10 8 2 6 5 2 8 3 8 6 8 8 4 8 4 6 3 3 8 5 5 8 5 10 4 3 0 7 8 7 10 2 9 3 7 0 10 4 10 2 8 4 10 0 8 2 3 10 2 2 1 10 6 8 2 0 4 5 4 0 9 2 8 6 1 8 4 6 9 4 4 7 3 9 5 9 5 6 9 4 3 7 5 10 0 8 1 0 2 0 3 1 8 0 2 6 4 9 4 4 1 10 6 3 6 2 6 3 3 0 4 4 3 2 9 2 3 2 1  [...]
+9 0 0 5 10 10 9 3 3 2 8 2 8 9 9 0 10 1 7 0 5 3 5 8 3 3 9 6 2 2 0 0 6 8 2 5 0 7 9 1 8 5 3 8 6 0 8 0 2 2 1 10 5 3 4 4 7 0 0 8 2 6 9 0 1 9 7 6 1 0 4 9 8 9 1 0 4 6 2 2 1 6 2 10 2 0 4 9 5 2 10 4 7 7 4 10 5 7 6 10 6 4 7 4 4 0 4 10 2 3 1 7 8 5 3 6 9 10 1 1 8 4 5 6 0 1 1 6 2 1 10 5 4 2 10 8 6 0 4 3 1 7 8 9 4 6 9 0 2 8 10 1 6 7 9 1 0 9 2 2 9 4 7 0 6 1 5 3 9 0 5 10 7 1 8 0 7 3 3 0 3 3 0 8 9 8 10 9 9 8 5 2 2 8 5 0 0 9 7 9 9 9 7 7 9 4 8 8 2 4 0 4 9 6 8 2 6 5 4 3 0 5 0 2 5 7 3 0 0 7 4 2 10 3 2 3 1 1  [...]
+1 9 4 7 3 1 4 2 3 4 4 9 9 6 9 6 9 6 10 2 4 0 5 2 8 1 5 1 2 9 0 6 5 4 0 8 3 6 0 5 10 1 2 0 8 0 8 7 10 2 9 8 6 10 10 1 2 10 2 3 5 8 0 8 2 9 5 0 8 2 0 6 8 1 5 8 2 0 8 8 2 0 10 7 5 3 4 5 3 6 8 0 0 0 4 5 5 2 7 2 6 5 10 8 6 5 10 3 3 8 8 9 8 9 5 0 3 3 10 1 6 5 3 6 1 3 2 5 1 9 10 1 5 3 6 0 1 10 8 8 7 6 7 0 9 7 6 5 0 3 8 3 0 5 5 2 2 3 8 1 7 10 0 2 9 2 2 4 8 0 8 9 2 10 9 7 4 6 10 1 0 6 10 10 0 3 4 3 5 5 0 9 7 7 7 7 7 1 4 8 4 7 2 0 0 4 8 4 10 10 7 1 8 7 6 7 9 8 0 9 0 9 0 5 10 0 2 10 0 6 3 1 8 5 8 3 [...]
+10 4 5 8 6 8 0 0 8 10 5 8 6 4 5 9 10 2 9 5 8 5 0 4 7 9 2 7 7 5 4 2 8 0 9 2 5 10 1 4 2 9 7 2 10 1 0 3 10 10 2 5 2 1 5 3 6 7 6 7 3 0 4 10 5 9 8 7 5 9 3 1 4 6 7 5 4 2 7 2 7 5 1 0 10 3 8 3 4 9 5 6 6 3 3 2 6 10 0 8 3 5 7 9 9 2 0 8 10 10 9 8 7 3 4 8 2 6 2 6 10 9 9 0 4 1 5 3 0 6 5 8 0 5 5 6 2 5 5 1 2 9 6 0 5 9 5 0 8 4 4 3 5 5 2 3 1 5 0 8 2 5 6 3 1 2 8 5 0 6 7 8 10 6 1 6 0 2 4 7 9 3 7 4 8 0 8 4 6 6 0 10 1 4 10 9 8 0 4 3 2 1 3 4 6 1 4 4 4 2 3 10 6 4 0 4 0 0 7 9 4 6 3 7 6 8 0 0 8 9 0 3 9 4 5 9 9 1 [...]
+8 5 3 2 4 6 0 6 0 7 0 2 9 1 9 3 2 0 1 9 9 6 9 10 2 5 5 10 6 6 8 1 3 8 0 8 6 5 5 2 9 5 3 10 6 3 6 3 6 9 2 8 2 8 9 4 1 8 10 8 7 4 4 8 7 6 0 3 6 2 0 0 6 3 2 5 8 8 6 10 5 8 5 3 8 7 2 2 0 9 8 3 10 6 3 2 7 4 4 7 7 0 10 4 1 6 5 10 9 2 4 7 0 0 2 3 0 8 5 0 8 3 2 3 10 1 4 0 3 1 6 2 1 1 3 8 7 3 7 2 1 4 0 8 8 7 6 5 1 8 4 8 8 7 2 8 10 5 9 0 0 3 3 5 7 8 5 1 2 6 8 3 10 9 7 4 8 5 10 9 10 0 6 1 4 6 3 4 1 3 10 3 6 2 7 3 7 1 2 6 9 4 3 1 4 10 6 6 7 5 9 3 0 7 5 5 1 2 6 0 2 6 7 1 4 2 6 9 0 6 9 2 10 1 5 7 1 4  [...]
+8 4 6 9 7 6 2 10 2 9 0 0 8 10 2 7 10 10 8 9 0 7 8 0 3 6 5 4 1 4 0 10 9 3 1 5 0 5 7 3 6 10 7 8 0 5 10 2 1 7 8 10 10 3 2 7 3 10 9 8 0 8 7 5 3 7 4 0 3 6 8 10 5 2 4 3 7 8 10 5 0 6 9 4 2 7 7 10 6 1 1 10 0 1 2 2 6 7 4 3 7 9 3 10 5 7 0 0 7 10 0 10 4 9 0 5 3 9 0 7 9 5 9 10 4 5 0 5 6 0 1 8 7 1 8 5 7 7 5 3 7 2 9 2 9 10 3 9 10 7 0 9 2 2 4 10 2 10 8 2 2 4 3 7 8 3 3 1 1 2 6 3 9 9 6 9 0 2 7 8 2 1 3 0 4 10 2 9 8 1 0 6 2 10 5 8 1 1 0 3 7 8 9 7 10 2 6 2 3 3 4 9 10 2 1 1 0 4 6 0 5 3 8 6 1 0 5 6 3 8 7 7 6  [...]
+7 10 7 7 7 4 10 8 4 2 3 9 1 7 0 3 3 7 1 9 6 1 0 2 10 7 8 10 10 0 0 4 1 1 2 5 10 6 10 9 0 1 6 2 7 8 2 6 4 3 10 6 9 9 3 4 7 5 4 8 1 1 2 3 6 7 2 0 1 9 4 10 8 4 6 10 4 1 4 0 5 4 8 1 7 9 4 6 3 10 0 6 5 9 7 0 7 5 1 6 8 1 10 4 3 3 3 0 8 7 9 9 8 4 1 1 7 6 9 3 8 5 0 8 5 4 7 8 7 8 9 2 9 4 8 8 8 0 6 5 2 9 1 5 0 0 9 9 5 10 9 2 7 4 7 2 4 7 0 5 2 3 7 3 1 1 1 5 10 7 5 1 1 4 5 3 7 6 4 9 6 1 6 5 6 2 10 6 6 5 5 10 10 3 5 0 5 8 5 9 10 4 8 1 0 8 8 7 4 0 1 4 4 8 1 1 2 8 6 6 7 5 3 6 4 7 10 1 2 8 2 4 10 10 5 9 [...]
+5 3 0 6 10 3 1 4 8 1 8 9 2 6 8 7 7 5 1 4 3 10 8 3 1 7 5 10 3 1 8 4 5 5 4 4 0 2 1 6 9 7 2 0 0 0 0 10 0 2 10 3 4 9 3 6 6 2 1 4 7 10 5 4 10 9 9 6 5 1 10 4 4 8 9 3 7 0 9 1 7 10 3 4 3 4 8 6 10 5 5 1 8 3 7 8 1 3 3 9 9 5 9 10 10 4 5 0 10 10 9 1 5 6 6 10 9 7 9 5 8 5 2 10 9 5 2 9 6 6 6 6 2 7 1 7 9 3 5 7 9 6 3 10 2 4 3 5 0 3 10 0 2 4 3 4 10 4 6 2 2 2 1 6 6 6 0 5 9 4 5 4 6 10 0 2 4 7 1 10 9 1 10 3 9 1 6 9 5 9 4 1 7 4 0 6 0 2 8 7 5 9 7 9 5 8 0 0 5 2 9 4 0 10 6 7 1 8 5 10 0 4 3 9 0 3 2 9 1 8 4 9 5 7  [...]
+2 1 2 0 5 3 7 3 7 0 10 3 0 2 9 5 2 8 5 4 4 10 10 4 1 6 6 10 0 6 4 1 6 2 4 7 4 8 10 7 10 2 6 3 3 6 7 6 8 9 2 7 1 10 5 8 5 0 3 10 6 1 6 4 10 6 10 1 9 10 0 10 10 0 0 9 4 7 4 2 2 4 9 5 6 0 6 2 10 7 4 1 9 4 0 10 5 4 7 9 3 9 10 6 4 3 4 1 2 1 5 10 6 3 4 9 6 5 10 6 9 10 8 2 3 5 2 5 10 6 9 4 5 7 10 2 6 3 5 3 3 1 6 1 2 2 1 0 9 5 7 8 1 1 4 4 0 9 6 7 6 5 9 9 9 1 4 8 9 3 4 4 2 10 6 10 7 10 0 1 0 7 2 2 8 7 4 9 4 4 0 7 4 6 8 3 5 4 6 1 7 10 9 5 5 1 4 10 4 4 3 10 2 2 8 8 1 0 0 4 10 2 2 8 9 1 0 0 1 4 3 4  [...]
+7 0 4 8 1 9 4 2 2 3 8 1 2 7 6 10 6 1 5 1 9 2 9 0 6 1 6 3 2 4 0 8 8 3 2 9 4 7 10 6 8 9 2 2 3 0 6 4 4 5 9 10 10 5 1 4 5 5 5 5 6 2 0 6 6 7 1 1 5 9 10 3 8 8 10 10 9 0 9 5 8 9 7 9 2 8 1 6 3 4 6 8 5 0 8 4 7 10 8 8 4 9 6 2 9 7 1 10 6 3 10 0 2 8 4 3 7 5 9 2 1 2 8 0 2 4 4 1 5 7 4 10 2 1 6 0 6 6 4 1 6 4 6 2 5 1 9 7 2 5 0 4 5 3 0 4 0 4 3 9 1 8 5 10 0 9 4 8 8 6 0 0 4 8 6 10 1 9 7 4 8 9 9 8 9 10 8 1 9 8 7 0 7 5 2 0 7 2 0 8 10 0 3 9 4 10 9 6 0 9 4 2 10 9 0 5 10 5 2 6 2 5 6 5 10 10 2 4 8 6 1 0 2 6 6 8  [...]
+0 9 4 5 0 8 5 8 5 7 8 1 7 3 6 2 4 10 6 8 2 8 9 10 0 10 10 0 8 6 0 10 4 1 0 2 4 6 4 8 7 9 0 5 10 4 2 7 9 0 0 7 5 4 3 4 3 5 8 9 1 7 4 2 1 0 9 1 4 7 10 10 10 9 7 6 6 0 9 2 9 7 6 0 3 9 0 7 9 3 2 8 10 6 3 4 9 0 5 2 7 10 6 0 7 1 0 6 0 8 1 9 9 0 1 8 9 8 7 10 6 9 4 7 5 0 10 1 6 3 3 6 2 6 0 7 9 8 10 1 10 2 5 1 1 2 8 0 6 5 5 6 1 5 5 0 10 9 4 9 0 2 5 1 9 6 2 7 9 4 0 8 7 7 9 4 7 0 6 9 10 9 7 5 3 2 4 4 4 4 10 4 2 4 6 3 10 0 1 7 3 5 9 5 9 6 5 2 2 6 8 2 4 6 3 3 7 8 6 4 5 9 8 7 6 8 6 10 3 10 0 8 8 7 0 1 [...]
+9 5 2 0 9 7 1 3 6 9 5 8 7 0 2 1 9 8 7 3 8 0 6 1 9 0 8 8 5 3 9 5 1 5 9 1 10 3 9 2 10 2 8 5 9 1 3 1 4 7 8 8 2 5 4 1 8 7 1 7 0 5 7 4 1 10 9 6 4 4 10 8 7 1 9 6 8 5 7 6 3 4 8 10 7 8 3 5 0 2 0 9 1 7 2 10 7 4 7 6 7 5 3 1 4 3 6 1 1 7 6 7 8 5 8 9 8 2 2 6 6 4 10 4 8 5 5 0 5 2 0 8 7 8 1 8 3 10 8 7 5 0 1 6 2 10 9 1 3 3 2 2 6 7 9 10 6 0 8 0 1 3 9 7 8 8 5 2 1 8 0 10 10 9 0 0 1 8 4 9 3 3 7 7 9 9 10 3 6 5 8 9 4 6 4 6 7 3 1 7 1 8 3 5 0 4 3 8 9 0 6 2 2 5 6 6 2 5 2 0 4 3 8 10 7 2 10 2 7 2 10 4 8 4 2 4 0 8  [...]
+9 10 9 9 3 2 3 0 1 9 9 7 4 8 2 7 8 4 5 8 4 9 6 4 8 1 7 8 3 2 6 2 9 2 9 0 1 0 2 6 1 2 10 0 3 1 9 8 2 6 2 9 2 6 3 7 2 2 6 1 6 0 7 2 2 2 5 5 10 2 2 10 9 2 7 0 10 8 1 1 6 5 4 4 7 6 0 0 0 2 9 5 4 6 2 7 0 5 10 9 8 0 6 0 6 2 4 8 1 9 1 10 8 3 3 2 9 6 10 2 6 0 10 7 6 1 5 10 1 10 2 8 3 9 2 2 1 5 9 0 9 5 8 2 4 9 0 10 3 5 5 0 9 3 7 2 3 2 7 8 0 1 1 1 5 8 2 9 4 2 8 9 6 9 3 3 9 7 8 4 4 10 3 9 10 4 7 2 9 10 4 10 1 8 9 8 8 6 5 7 4 1 3 1 9 10 4 0 9 4 8 1 2 7 1 6 5 7 8 10 3 0 5 6 1 6 3 0 4 6 7 8 4 8 4 3 9  [...]
+6 2 9 10 2 10 5 7 1 0 7 10 9 5 8 3 7 8 7 5 5 10 8 5 6 0 9 10 1 2 5 2 3 5 8 3 6 4 10 1 5 7 2 4 4 2 7 8 0 0 4 7 6 5 3 7 7 2 0 0 5 8 9 0 8 1 6 9 0 3 6 3 2 6 7 5 9 6 0 10 5 7 8 5 10 5 0 3 5 6 2 9 2 1 0 0 9 8 2 2 1 1 6 1 1 6 0 1 6 0 1 9 6 5 5 3 1 0 2 9 5 7 4 2 3 9 3 6 4 7 9 3 7 8 2 0 8 3 9 8 0 7 8 0 3 7 8 7 3 0 6 7 3 10 4 5 5 4 6 10 6 3 6 6 3 3 6 7 2 3 4 0 9 4 10 10 10 3 2 8 7 10 2 3 1 9 7 10 8 3 2 8 6 6 1 9 9 9 3 5 9 10 7 2 3 10 5 6 10 9 7 10 6 6 8 7 4 10 0 8 2 3 8 6 2 9 3 4 8 9 5 10 6 7 10  [...]
+10 6 2 6 6 2 4 6 4 5 3 6 6 8 1 10 10 6 5 1 9 8 4 2 4 0 4 5 7 1 8 2 5 8 7 10 9 10 2 9 2 1 10 6 3 3 8 2 3 5 4 7 6 6 2 4 4 5 5 8 3 9 10 9 5 0 1 1 4 2 4 4 5 0 9 6 7 3 5 7 7 2 8 4 1 6 2 4 10 6 4 1 8 2 7 8 1 3 7 5 5 9 10 4 8 4 4 9 2 7 5 7 7 6 0 1 8 7 1 1 9 8 10 10 2 6 2 5 1 10 9 6 3 4 10 1 5 0 3 0 10 6 10 9 1 10 5 6 0 1 0 4 1 2 0 4 0 2 0 9 4 7 8 1 8 9 8 5 7 7 6 2 10 3 10 0 9 6 9 9 9 2 6 9 10 5 0 4 0 10 2 1 3 10 1 8 6 5 8 3 4 9 6 1 1 3 6 9 0 9 8 5 2 4 2 4 9 7 7 1 3 9 7 5 4 4 1 10 10 9 4 8 2 0 2 [...]
+0 6 0 9 2 5 6 10 4 3 8 7 6 6 2 8 9 1 1 4 8 0 5 9 5 1 9 10 1 4 3 10 3 9 6 10 10 4 3 5 6 6 3 1 1 5 10 4 2 3 7 9 1 4 4 6 7 7 9 8 3 0 6 8 6 4 4 8 7 5 10 8 10 4 10 6 7 9 1 5 8 10 3 0 10 3 3 2 5 1 5 5 4 0 5 0 8 8 2 1 1 7 8 5 1 0 8 8 0 3 4 8 3 10 0 2 0 9 5 5 9 10 9 5 7 1 6 1 0 4 4 4 4 0 6 10 10 7 3 6 6 3 0 1 9 10 4 3 2 7 5 2 5 5 7 0 1 8 3 0 1 1 2 8 10 7 4 3 6 4 4 3 7 7 3 5 4 2 10 3 9 8 10 9 0 8 4 7 3 5 2 5 0 9 7 3 9 3 5 7 8 0 9 9 6 3 3 0 9 7 1 4 6 0 8 10 3 2 9 5 10 6 9 1 8 6 3 5 8 6 9 6 5 2 0 1 [...]
+1 7 2 4 1 6 1 0 10 4 8 9 4 5 10 6 1 4 6 10 8 7 7 6 7 3 5 1 5 0 0 9 4 9 1 3 5 2 0 4 4 2 10 8 9 4 4 1 3 5 1 4 9 1 2 6 6 4 2 9 3 2 1 4 6 7 9 2 7 0 8 6 7 4 4 2 10 10 0 7 1 10 2 5 4 3 7 8 10 9 2 1 8 9 7 3 2 5 7 0 3 0 10 4 6 2 0 8 3 5 2 1 2 6 5 8 1 10 10 7 5 2 6 9 6 1 2 9 2 9 0 9 2 5 3 9 9 6 10 3 7 0 3 7 4 9 7 10 0 1 10 9 1 0 8 4 2 0 0 1 9 6 10 0 0 0 5 3 9 6 8 1 0 1 10 4 3 8 10 1 9 5 8 9 7 10 2 9 6 10 7 10 4 2 10 8 5 4 7 6 5 8 3 1 4 4 1 5 4 8 7 10 8 4 9 5 3 7 1 8 2 6 4 4 3 9 2 0 10 4 5 9 9 4 6 [...]
+4 3 1 9 2 0 8 1 4 9 0 9 7 4 1 1 0 6 3 1 8 7 0 8 0 5 2 1 2 10 5 2 8 7 10 7 4 6 2 8 0 9 10 8 5 0 4 5 4 9 0 3 4 0 2 8 8 3 10 0 10 3 4 8 10 2 6 2 8 2 8 3 10 5 5 1 9 3 3 9 6 8 6 4 7 2 5 3 6 3 1 4 9 4 4 8 6 0 4 2 6 3 6 8 5 1 6 0 10 8 8 8 9 7 10 0 10 8 3 6 4 5 8 9 9 10 4 9 2 10 2 0 1 8 0 2 2 9 8 5 5 4 6 3 10 9 2 3 4 6 7 2 0 1 0 6 4 1 2 6 3 0 7 5 0 0 6 2 1 8 5 2 3 4 1 8 8 6 10 5 4 10 8 10 3 3 6 6 8 7 2 4 2 7 1 3 0 3 10 6 7 4 5 6 0 6 0 6 1 3 3 4 4 4 8 5 2 0 3 3 8 6 7 0 9 0 10 0 5 4 5 0 8 6 6 3 2  [...]
+8 5 5 5 8 2 4 8 6 5 10 7 5 1 8 10 10 6 4 10 5 1 4 3 0 5 1 2 5 5 5 4 4 6 7 9 3 10 9 3 3 0 7 8 8 2 2 8 5 2 10 0 10 5 1 1 7 6 1 7 8 6 6 8 0 1 9 3 0 9 7 2 10 4 6 2 0 5 9 1 10 7 2 8 5 2 5 1 4 5 3 9 3 5 3 10 10 7 1 4 6 4 5 9 3 6 7 10 3 6 4 8 5 4 0 6 3 9 5 5 8 5 0 10 0 3 7 4 9 4 2 7 10 5 2 0 7 4 9 0 8 5 9 3 9 3 5 0 2 4 3 1 5 4 1 1 4 4 0 1 1 0 2 6 8 8 6 4 9 5 4 1 2 7 9 9 10 5 0 8 8 7 5 10 1 9 7 10 8 4 9 0 4 8 5 8 4 8 2 6 3 0 1 0 0 5 3 9 8 0 10 8 4 6 4 0 6 3 2 5 8 0 5 9 2 7 5 1 8 4 2 2 1 9 7 8 9  [...]
+1 5 5 9 6 3 7 4 5 10 4 5 10 8 2 1 9 5 4 6 5 1 2 7 6 6 0 9 9 2 9 8 2 0 5 10 6 7 8 7 4 1 5 0 9 8 4 8 9 9 2 8 10 3 10 9 4 8 5 6 6 4 2 4 8 0 8 7 5 5 8 5 9 4 9 2 2 5 2 1 8 2 6 7 3 3 0 8 9 10 2 3 3 5 2 10 0 3 5 5 2 3 5 4 6 10 9 6 0 5 2 9 1 3 3 1 8 4 0 4 5 1 5 2 9 5 8 6 2 7 9 4 8 10 3 10 4 7 7 2 10 5 9 7 5 4 4 7 9 5 10 1 7 6 0 0 2 1 8 6 1 4 1 10 5 1 7 7 2 6 2 6 1 7 8 7 6 0 8 0 6 8 9 6 10 6 7 0 0 3 7 1 7 5 5 6 1 6 5 0 0 10 7 0 0 8 3 5 4 9 3 5 4 0 7 3 2 9 9 1 2 9 4 10 0 0 10 3 9 8 2 6 5 2 10 4 3  [...]
+7 0 7 1 2 10 6 6 3 1 10 1 7 4 6 8 3 7 2 0 1 3 7 9 5 8 10 8 9 5 2 4 3 9 10 0 1 10 4 0 8 10 7 3 7 5 3 2 9 5 1 5 7 5 6 5 6 10 2 5 1 2 6 4 4 1 9 1 4 3 2 2 0 10 3 9 7 1 4 0 8 1 2 6 3 5 1 0 5 4 2 5 4 2 8 3 7 8 1 1 7 10 9 1 8 6 9 3 2 1 9 4 0 10 6 6 0 2 9 4 4 6 7 8 2 0 3 2 2 7 8 7 9 5 1 3 10 2 0 1 9 6 9 2 2 4 6 6 2 1 1 0 1 1 5 10 8 0 7 0 10 8 8 6 5 7 10 6 0 2 9 7 6 4 3 6 3 8 7 10 2 5 10 9 10 2 8 1 10 8 4 9 5 3 7 3 0 10 4 1 2 2 0 4 2 5 4 1 6 3 9 2 9 1 4 7 9 10 4 5 10 10 6 1 9 10 8 6 9 3 7 7 4 0 2 [...]
+6 2 3 2 9 6 10 4 1 2 2 3 6 0 3 6 4 3 8 4 8 9 9 0 2 7 1 10 1 9 0 10 2 8 6 7 4 6 3 1 4 0 8 4 9 6 9 10 7 8 1 0 1 1 4 1 1 1 2 3 3 7 10 8 8 8 10 1 3 6 3 5 10 2 5 10 10 10 2 5 9 4 1 5 7 6 2 5 1 7 5 10 7 2 7 10 8 5 4 5 10 3 5 5 1 6 9 9 9 9 8 6 7 3 7 9 6 0 4 1 5 5 7 6 5 7 7 1 7 10 7 9 5 10 9 3 2 0 6 3 2 6 5 7 9 5 6 4 1 2 6 7 3 0 8 6 4 8 7 3 1 4 2 4 3 4 5 4 6 1 0 8 4 2 1 7 4 8 5 6 2 8 10 7 8 1 5 2 2 2 9 10 2 9 10 1 4 4 6 6 6 3 2 0 8 6 3 1 10 9 7 8 7 3 5 3 0 3 1 3 9 3 1 3 0 0 5 8 4 10 3 3 5 7 5 6  [...]
+7 2 4 10 4 4 8 1 8 2 9 3 3 0 9 10 2 6 9 7 6 2 9 0 0 9 0 8 4 0 0 2 9 1 8 0 10 1 4 2 8 4 6 8 2 2 6 8 9 1 4 4 3 10 5 7 0 5 2 1 10 0 2 9 2 6 6 4 1 4 10 8 3 6 1 10 3 10 10 7 8 4 8 8 0 8 10 3 0 6 4 6 9 4 8 8 7 3 7 2 10 0 9 7 4 3 9 2 6 7 4 9 0 10 4 5 3 5 4 2 2 7 3 3 0 7 6 8 1 6 8 1 9 1 5 7 7 9 10 6 7 2 6 9 7 6 4 1 2 5 8 1 9 4 9 0 2 1 10 6 3 8 10 10 6 0 9 5 5 5 6 10 10 9 10 8 2 0 5 7 9 3 4 7 7 9 1 6 10 9 6 10 8 10 6 8 3 0 4 3 10 10 4 6 5 9 4 6 4 4 4 2 1 8 5 4 6 5 9 1 8 9 7 10 1 8 10 9 0 10 7 2 6 [...]
+8 0 4 10 6 0 10 1 4 8 8 8 5 8 10 6 3 1 7 9 3 1 2 5 6 10 9 0 1 0 4 7 3 6 2 8 0 9 7 7 4 3 7 10 0 1 10 8 0 6 10 5 2 7 1 1 7 5 3 7 8 0 8 0 5 9 8 8 2 3 10 0 8 7 0 1 5 4 7 3 10 2 10 8 7 6 3 2 1 5 10 9 10 3 10 6 2 0 7 4 6 5 5 10 7 3 9 7 1 9 10 9 10 5 6 1 6 9 2 0 0 4 10 3 7 2 3 10 8 4 2 5 3 8 7 6 7 6 2 5 5 4 1 3 0 9 0 1 3 3 3 10 5 5 2 9 4 3 2 0 2 6 10 9 4 7 9 0 7 9 6 9 6 2 6 7 3 10 6 7 4 0 2 5 8 7 4 6 1 2 6 9 9 9 4 0 5 9 7 9 8 8 3 1 9 7 7 6 8 0 7 4 10 8 2 5 3 7 10 8 6 10 10 1 3 3 0 9 6 1 4 10 10 [...]
+5 0 4 10 0 1 0 4 5 1 8 0 7 0 6 10 2 8 5 0 2 2 7 3 4 2 7 2 1 6 7 7 5 8 4 0 5 4 5 9 9 2 4 4 3 6 7 4 2 3 4 6 10 9 5 2 10 1 8 0 3 6 3 7 10 3 4 8 8 9 1 9 9 0 9 3 2 0 7 9 1 0 5 2 2 10 5 3 10 3 5 7 7 9 10 9 7 1 9 4 7 3 5 5 7 9 8 2 9 1 6 3 0 1 3 3 9 1 2 10 7 3 8 7 2 7 3 2 7 10 5 8 3 3 6 10 7 7 10 4 9 2 1 8 1 2 3 1 0 10 2 10 1 7 4 0 8 5 3 2 8 9 1 7 5 10 6 6 0 8 0 9 0 10 5 4 6 2 10 6 7 6 3 0 6 8 1 0 5 9 1 6 2 3 6 3 2 5 0 8 10 10 2 1 5 6 9 4 5 9 8 0 6 10 5 0 1 4 10 5 8 7 4 4 10 3 4 1 3 7 4 8 2 9 6  [...]
+4 1 10 8 6 6 9 7 2 1 10 5 5 2 8 0 0 1 6 7 0 6 7 8 4 5 10 4 6 9 4 4 6 8 7 10 1 8 8 1 8 9 4 6 1 3 10 0 4 1 2 2 5 4 6 8 8 3 2 2 0 1 4 5 8 0 2 7 7 9 6 7 2 4 8 8 2 7 0 5 2 1 5 8 10 6 5 2 1 0 1 8 4 2 4 6 4 9 5 7 4 3 4 5 0 9 5 10 1 7 4 2 4 8 2 9 2 6 0 0 2 5 10 10 1 8 6 8 10 4 9 2 1 2 5 1 10 9 9 9 1 5 8 8 7 9 3 9 9 0 6 0 1 10 2 7 8 3 4 0 1 10 1 7 7 0 1 7 6 2 3 8 1 1 7 1 1 10 1 10 3 5 4 5 0 2 5 5 6 6 8 4 2 4 9 10 6 0 2 0 6 3 8 3 1 8 8 1 6 10 4 5 3 1 3 5 10 5 0 7 8 2 4 1 3 8 9 1 0 7 4 10 10 9 1 9  [...]
+8 2 4 8 6 5 10 7 10 4 7 9 4 8 1 2 10 0 3 4 10 4 4 6 2 8 3 9 6 4 4 4 5 8 2 9 9 3 5 9 2 0 1 9 3 7 3 3 2 1 5 4 3 8 0 10 6 1 6 6 4 9 4 0 3 10 0 3 10 6 0 5 7 5 7 4 3 5 4 2 2 4 6 8 4 7 8 5 4 5 3 7 5 3 2 9 1 4 0 7 2 1 8 8 9 7 0 8 4 5 5 0 7 9 6 0 2 4 8 6 5 3 5 9 2 0 2 4 2 8 5 9 5 8 9 2 1 3 7 1 7 2 2 10 4 0 0 7 5 5 2 3 7 2 0 8 10 1 4 4 5 3 8 8 3 10 0 3 5 6 1 8 8 5 7 2 4 9 2 6 9 3 7 0 9 7 10 1 4 7 9 10 10 4 9 1 7 0 6 10 1 5 9 2 4 7 3 1 1 5 4 0 2 0 2 0 8 5 5 6 8 6 9 5 8 9 4 6 1 5 9 4 8 8 1 6 4 2 4  [...]
+6 1 0 9 2 1 7 4 2 0 4 3 9 0 8 6 3 5 5 7 1 1 2 7 10 9 8 0 7 7 10 0 7 5 6 0 6 6 2 0 10 3 8 3 2 7 7 3 0 9 10 7 8 1 7 1 4 8 4 8 8 1 2 10 5 8 4 2 7 8 8 7 1 2 5 3 7 8 4 4 6 2 1 10 8 9 4 6 4 6 7 9 4 5 1 5 7 3 8 7 3 3 8 6 7 7 6 4 3 10 3 8 2 3 0 7 1 3 3 7 3 0 2 4 4 4 6 0 0 5 10 6 1 9 10 4 3 9 1 7 8 8 0 4 3 4 8 0 5 6 7 1 2 5 1 1 3 5 8 5 8 4 5 1 6 3 7 7 2 10 10 7 10 3 9 6 8 5 9 1 4 0 5 7 4 3 9 2 1 1 8 1 6 7 2 10 1 10 6 2 7 0 2 8 1 8 3 5 1 5 2 10 5 3 0 8 10 0 0 7 0 3 5 10 7 9 4 2 10 8 5 5 1 9 4 4 3  [...]
+9 5 1 7 10 8 2 7 5 10 8 5 7 4 0 0 1 10 4 4 9 7 4 8 6 1 10 0 8 8 10 0 9 7 8 10 10 6 5 9 9 7 6 9 4 2 7 2 2 6 7 7 3 7 3 7 10 7 6 2 10 1 1 8 1 5 5 2 8 0 6 1 7 1 9 8 6 3 1 10 1 10 2 0 1 8 0 9 7 10 2 5 1 3 1 6 5 4 10 1 8 7 4 2 0 8 7 10 8 2 3 10 5 9 6 4 10 6 4 10 7 8 6 2 2 2 2 9 1 5 0 2 0 7 4 8 9 4 7 1 3 10 9 4 0 4 7 8 9 7 1 9 7 4 9 2 0 0 2 8 3 4 7 3 10 8 7 5 6 3 5 2 5 10 10 3 5 0 2 9 3 0 4 10 10 10 2 2 5 6 5 2 10 5 7 3 0 1 6 4 2 2 5 7 10 8 3 6 9 0 7 1 6 1 3 2 10 7 4 3 3 4 9 3 2 1 4 6 4 10 7 2  [...]
+4 9 0 7 0 9 2 8 6 1 4 10 9 5 8 2 5 9 4 4 2 7 6 0 8 9 9 4 9 5 1 8 7 3 6 10 2 2 6 6 6 5 7 1 9 0 6 4 8 0 5 3 4 3 8 0 7 0 7 2 7 7 10 5 9 7 8 6 6 0 9 6 6 1 5 9 9 1 2 10 8 0 10 7 4 1 3 8 8 10 6 7 9 0 0 9 2 6 5 6 2 2 10 10 5 8 9 8 9 8 4 7 3 0 10 8 3 0 2 6 3 10 7 7 10 10 8 4 0 3 4 7 10 3 7 0 6 4 6 9 1 0 7 1 10 0 2 9 0 6 6 10 10 9 2 3 10 0 5 9 9 3 5 0 5 7 0 10 0 8 3 3 8 10 5 6 4 0 9 1 3 10 6 5 0 9 5 9 2 9 3 1 4 7 9 9 7 1 1 8 4 5 5 0 10 7 10 6 8 2 10 3 5 2 1 10 7 5 1 10 1 3 0 5 3 8 7 5 8 4 3 3 7 8 [...]
+0 7 7 0 0 0 8 8 5 0 7 1 0 0 7 0 9 0 1 1 4 5 10 5 10 9 4 9 2 4 4 5 5 9 2 4 0 5 0 4 8 0 2 3 2 6 5 9 10 7 1 2 7 7 1 3 7 3 6 0 3 2 10 5 3 3 6 1 3 3 2 8 6 1 1 8 2 10 10 8 6 3 3 6 4 6 8 10 10 2 0 8 3 3 2 0 4 10 9 4 8 3 8 4 2 10 3 8 6 10 8 0 3 2 2 8 3 9 10 7 10 9 10 4 5 7 6 0 2 1 7 1 5 9 0 9 4 2 7 4 8 6 3 9 3 4 9 7 8 6 7 3 3 5 9 3 2 10 3 4 0 6 3 6 6 10 10 6 10 6 1 6 7 6 3 1 6 7 7 2 10 6 9 2 0 3 0 4 9 8 6 2 6 10 5 6 8 4 6 4 4 4 1 8 8 7 5 10 4 4 10 10 1 4 6 3 0 1 9 7 4 1 1 4 6 1 2 2 7 8 9 1 2 7 6 [...]
+1 10 4 7 6 8 9 10 6 0 0 1 5 2 6 2 8 6 7 6 0 7 2 2 4 4 2 6 10 1 4 8 6 3 9 0 7 3 1 1 10 5 3 8 10 0 5 7 10 1 9 3 7 0 9 3 0 9 2 9 1 9 3 3 6 6 4 1 5 1 2 6 2 0 8 10 7 4 4 2 8 10 1 9 8 0 5 1 7 6 0 4 0 7 2 1 3 9 0 10 2 1 1 3 6 6 0 7 10 0 2 6 5 4 9 0 8 3 4 3 10 1 3 8 6 10 4 8 6 9 4 5 9 5 3 0 3 5 7 4 8 1 7 5 1 3 10 5 5 8 7 6 1 8 10 3 8 5 1 3 4 7 5 4 4 7 9 2 5 2 0 4 8 1 2 10 6 9 6 6 6 6 0 5 1 10 4 2 1 2 5 7 8 1 3 10 10 3 6 3 2 0 8 0 2 10 8 7 4 7 3 1 2 10 0 1 10 1 1 5 8 5 10 9 3 3 1 7 2 8 0 5 5 6 1  [...]
+10 5 0 2 5 2 9 10 10 8 0 4 0 4 8 10 9 2 4 0 9 5 10 8 0 4 4 10 5 3 0 9 1 8 3 5 8 2 7 1 8 1 8 6 10 2 7 4 7 2 9 2 4 10 8 3 7 9 9 2 4 4 10 6 9 5 8 4 3 3 5 0 3 4 8 8 3 10 7 10 2 8 2 10 6 0 1 0 5 0 1 0 0 10 10 1 0 3 1 0 5 8 10 1 8 0 0 6 6 7 2 6 5 5 0 0 7 6 4 6 4 3 4 8 3 7 9 5 0 7 0 0 8 4 4 3 4 10 7 2 7 6 7 8 8 6 6 8 5 5 6 9 8 0 3 5 3 6 0 9 7 2 10 6 10 1 4 8 8 6 3 10 5 9 1 3 6 6 4 4 1 9 7 4 10 9 4 0 1 8 8 10 0 3 0 9 6 6 9 0 4 10 3 1 8 8 6 2 2 4 6 10 10 3 8 1 6 9 8 8 4 0 4 0 9 8 5 7 7 1 2 10 1 3 [...]
+7 9 0 0 2 1 6 9 9 10 4 1 9 6 1 6 0 9 5 7 1 0 0 5 0 6 7 1 2 3 1 5 2 8 1 5 5 3 7 10 8 10 0 2 8 5 5 7 6 7 5 5 8 5 7 2 10 1 1 4 2 10 6 3 5 4 4 3 8 4 5 6 9 7 3 3 6 6 0 0 2 9 1 2 2 8 2 10 7 2 9 10 5 1 0 1 0 0 10 0 0 4 10 7 9 6 2 5 9 4 4 4 3 8 7 6 1 1 4 10 0 1 3 9 4 7 4 2 10 10 9 1 5 4 6 4 4 8 8 4 10 1 5 5 7 6 10 5 7 9 1 7 10 3 4 0 1 10 0 9 1 5 0 6 2 7 3 3 1 10 0 7 6 2 10 8 1 5 7 2 4 10 1 8 7 7 5 9 0 7 7 0 3 10 3 1 6 3 9 4 8 1 0 7 5 3 9 8 7 7 5 3 0 2 7 9 4 7 8 9 4 5 6 5 4 7 2 7 7 1 10 1 10 10 4 [...]
+9 3 10 3 6 2 2 5 10 5 2 4 7 6 9 7 0 4 0 10 9 5 1 5 5 1 2 5 7 8 3 9 1 1 0 8 10 7 2 5 2 4 9 2 10 0 0 0 7 2 3 0 3 0 9 7 8 3 7 9 7 5 2 7 0 5 9 7 7 5 6 10 6 6 7 10 0 8 6 0 3 2 8 1 4 7 0 9 3 8 0 6 7 5 7 5 3 7 9 7 4 10 1 8 4 5 6 8 0 3 1 7 3 10 10 1 6 7 4 0 6 10 1 1 4 9 2 1 6 8 10 5 8 8 8 0 3 6 4 6 9 4 5 9 0 4 4 7 6 2 0 8 0 8 10 3 4 6 0 10 5 4 0 7 7 8 4 7 8 3 1 7 2 3 8 2 2 2 10 1 1 9 9 0 5 7 8 4 2 3 5 10 4 7 4 0 2 2 5 3 1 6 7 1 6 5 6 10 5 8 6 3 8 6 3 8 8 0 0 3 0 4 4 4 4 1 2 6 9 9 9 10 1 6 5 8 6  [...]
+10 7 3 1 10 9 2 8 5 6 4 7 5 9 0 3 4 4 10 0 0 0 8 4 7 2 4 4 2 3 1 7 4 2 0 2 1 10 8 5 1 4 10 1 7 5 0 3 1 10 5 8 6 9 10 9 7 6 0 1 4 6 6 1 6 9 6 1 6 3 3 0 9 3 0 4 10 1 7 9 7 7 3 3 2 4 3 0 8 0 0 10 3 2 1 3 4 10 0 10 1 10 6 7 4 10 5 7 7 10 0 3 8 10 4 9 8 10 5 0 8 2 8 6 0 0 3 3 10 0 0 8 4 8 10 1 3 4 1 5 5 1 4 7 8 4 0 10 2 7 5 4 3 7 6 8 7 7 9 8 10 7 8 9 5 0 5 4 1 5 10 1 2 5 1 4 2 4 1 3 5 4 1 6 9 5 1 5 8 3 6 4 2 4 2 5 7 8 1 10 1 8 5 4 2 2 10 6 5 8 2 5 0 8 10 5 9 5 8 5 4 3 8 5 9 2 5 5 5 8 3 8 6 6  [...]
+3 2 0 10 1 4 9 6 5 4 6 10 5 6 6 2 3 7 4 3 5 4 3 9 0 2 7 9 1 9 9 3 5 4 6 8 3 9 2 7 10 8 5 8 2 7 4 3 3 10 1 9 0 4 0 4 7 0 4 1 9 0 7 10 9 10 7 10 9 8 3 1 4 0 10 3 3 2 3 6 5 10 10 0 5 9 4 3 8 8 6 9 9 0 5 0 0 0 1 5 10 1 4 8 1 3 9 4 5 0 9 3 8 1 6 3 7 8 1 8 4 0 10 7 3 5 3 10 6 1 4 10 3 10 5 4 0 3 10 9 9 0 2 7 0 3 8 1 4 4 0 5 10 10 7 10 3 3 10 7 4 10 7 9 5 4 2 6 0 10 7 3 0 4 7 0 2 9 0 0 2 6 8 1 1 3 6 6 7 2 4 4 2 2 10 3 9 5 4 0 9 1 4 3 3 4 4 3 8 8 1 7 7 0 0 3 3 2 3 2 1 1 10 2 2 2 9 2 0 5 1 9 1 9  [...]
+6 7 5 9 4 4 9 8 5 5 6 6 5 1 1 8 9 0 6 1 7 2 8 5 9 8 6 0 3 0 1 8 0 6 1 4 4 10 8 6 6 7 8 10 10 9 9 0 7 8 3 9 1 3 4 8 3 1 7 5 1 8 4 3 2 6 8 3 7 2 7 2 8 6 5 5 0 10 7 1 10 6 6 5 0 3 5 9 10 4 5 0 6 5 9 0 3 7 8 8 6 2 5 10 10 2 7 5 3 1 6 2 1 2 2 10 4 0 10 6 3 10 0 3 2 2 9 10 8 5 4 3 3 3 6 5 6 5 2 0 1 0 0 6 7 1 5 7 10 0 1 3 7 5 0 5 8 10 9 2 5 6 3 1 1 4 0 4 3 6 8 1 0 9 0 6 6 4 8 5 6 10 5 10 2 3 7 8 6 2 5 5 0 10 9 1 3 6 5 4 7 6 2 2 6 5 2 9 9 3 5 5 8 1 9 2 8 9 0 3 9 6 9 4 7 10 6 7 1 8 5 0 2 7 3 9 1  [...]
+2 6 6 5 3 6 3 7 7 9 2 7 7 0 9 1 5 5 9 5 7 0 2 7 6 5 3 4 4 1 5 10 6 1 10 1 0 7 4 4 6 4 2 2 10 1 2 4 8 8 4 9 8 1 4 6 4 6 3 1 9 9 6 4 4 2 2 0 9 7 3 9 4 3 7 2 6 2 6 6 9 9 8 5 2 0 10 6 3 10 2 0 6 0 4 6 0 2 6 7 2 0 9 9 9 6 0 0 3 8 3 7 0 7 6 5 3 1 10 10 4 7 7 10 2 1 2 8 8 7 9 10 2 8 8 8 9 10 5 8 6 5 9 2 8 7 10 2 9 5 5 0 5 5 0 9 10 3 5 3 8 7 8 9 10 10 8 8 7 10 1 8 9 6 6 6 8 10 0 2 4 1 7 6 10 5 9 5 10 1 10 0 9 9 8 0 6 6 9 10 2 7 7 6 2 6 5 1 7 7 1 3 1 7 7 8 3 3 9 8 1 2 3 3 1 1 4 6 0 4 9 7 3 3 5 3  [...]
+8 0 1 7 1 4 6 2 8 2 2 8 0 7 7 9 10 9 4 0 7 4 2 3 8 9 9 7 4 7 2 2 5 7 9 1 8 3 2 10 6 4 5 9 3 5 2 8 9 4 5 5 5 2 9 0 9 1 10 4 4 4 6 4 5 6 10 3 6 10 1 4 3 10 3 0 9 3 3 7 10 2 3 8 0 3 1 9 5 4 2 8 10 10 3 2 7 5 5 3 5 9 10 1 3 5 3 1 0 0 1 7 1 5 9 0 7 2 7 2 1 7 7 5 10 1 9 2 7 8 7 2 8 5 3 0 6 1 1 7 8 10 6 9 3 9 3 4 3 1 4 2 1 2 1 4 8 7 4 0 9 8 7 1 5 4 8 10 6 6 7 7 9 5 5 8 0 5 0 0 10 1 1 4 7 0 4 3 1 0 0 9 9 7 7 3 6 6 4 2 7 1 2 0 3 1 2 1 7 5 6 4 5 6 2 0 0 1 10 9 9 3 7 9 10 0 4 6 8 0 4 1 9 10 1 5 9 1 [...]
+3 2 5 6 3 3 9 7 0 0 2 8 0 5 0 9 8 2 3 2 3 0 7 5 5 1 4 3 2 10 9 10 1 0 7 9 2 10 7 4 3 7 10 4 1 8 4 6 4 7 7 2 2 2 0 0 10 5 6 9 4 6 2 10 3 4 0 2 4 8 5 0 1 2 0 7 5 9 5 0 1 9 3 5 4 8 10 3 0 6 3 3 0 9 4 1 2 7 10 4 8 6 3 10 7 4 0 5 4 1 0 5 0 4 1 0 8 1 5 1 8 6 8 10 1 10 9 8 0 5 3 0 9 6 2 9 7 10 1 6 2 8 2 6 9 3 0 9 4 4 5 5 8 8 4 5 4 4 7 1 4 9 6 2 5 4 3 5 3 7 10 5 5 8 8 4 8 4 7 3 10 5 4 7 9 6 7 6 6 3 1 6 10 10 3 5 2 6 3 3 7 6 7 9 6 4 2 9 9 5 4 10 10 7 3 2 0 3 0 0 2 9 7 1 3 5 0 0 10 5 8 1 3 3 5 8 5 [...]
+9 10 5 5 4 1 7 7 9 10 2 7 2 4 4 3 2 2 9 2 8 0 2 2 0 3 4 1 5 6 3 10 0 10 7 10 6 1 5 4 3 10 9 0 1 9 5 8 1 3 10 6 7 8 4 7 3 4 2 2 2 3 6 7 0 7 3 10 2 2 0 8 6 9 2 1 10 6 2 9 9 1 5 1 8 6 7 9 2 1 7 7 3 5 9 7 8 8 1 5 1 9 0 7 0 7 10 7 6 6 2 0 6 1 9 2 7 9 0 6 1 3 9 1 4 2 2 7 9 1 0 6 7 8 2 3 1 3 3 6 4 6 2 8 9 3 1 2 9 6 0 7 1 8 5 3 7 10 0 2 9 7 0 1 4 5 2 5 1 4 3 4 7 5 6 4 6 2 2 2 5 6 9 1 9 4 0 4 7 5 6 8 3 1 3 1 1 4 1 8 4 2 9 6 9 0 10 4 2 10 1 9 8 0 4 7 3 5 0 0 1 6 1 0 0 6 5 0 7 10 9 1 1 9 5 0 8 7 7  [...]
+1 10 1 4 5 6 4 6 1 8 6 2 2 9 4 2 9 10 4 6 5 10 6 5 4 4 8 3 3 6 0 9 9 8 10 9 4 2 4 0 2 9 4 4 7 1 0 8 0 1 0 1 8 2 3 5 10 6 6 3 4 9 6 7 7 2 4 1 2 7 6 8 4 6 3 1 2 1 0 5 7 7 7 1 6 10 4 3 5 2 6 0 2 6 1 7 5 7 0 7 9 6 5 1 10 10 6 10 8 9 5 9 7 5 1 7 7 9 6 8 2 3 8 0 10 2 3 4 5 3 2 3 3 3 6 7 6 6 10 8 4 5 4 9 3 7 5 8 7 5 7 6 7 8 8 2 0 6 2 2 1 7 2 7 8 2 9 0 9 3 9 9 8 3 9 5 9 8 8 3 3 8 3 7 1 4 10 6 5 0 2 5 8 5 10 10 2 0 6 10 0 2 3 7 2 6 3 8 1 1 6 8 5 0 6 7 9 6 7 5 6 5 2 0 0 2 7 1 9 5 10 4 8 4 4 2 8 2  [...]
+6 9 5 4 9 9 1 9 7 4 3 1 4 3 0 2 10 7 7 9 7 2 10 5 3 10 5 4 6 4 8 1 3 3 7 10 1 9 9 1 9 5 2 10 8 8 4 10 6 9 1 4 2 6 2 6 4 6 1 8 4 0 5 9 10 7 4 2 6 5 1 7 1 7 9 10 7 1 9 0 5 2 2 6 0 5 10 2 9 4 0 1 9 7 5 2 6 9 1 1 3 10 7 8 7 1 9 4 2 10 6 6 4 10 8 4 9 7 10 9 1 6 0 9 10 10 0 4 2 4 0 10 9 9 1 7 0 8 2 9 0 6 7 1 1 5 0 10 8 5 8 1 6 4 7 6 8 10 3 10 1 9 7 1 2 4 8 8 5 7 10 9 0 5 3 0 8 3 9 6 6 0 1 6 1 10 8 9 0 7 10 5 3 2 1 5 5 9 10 4 10 1 9 9 2 3 5 6 3 9 4 9 3 3 7 1 2 3 4 10 1 4 0 9 3 8 10 7 9 8 4 10 2 [...]
+5 7 10 7 10 2 8 9 10 2 8 8 0 9 6 6 5 3 9 3 2 9 0 3 10 10 9 6 6 6 5 4 8 7 9 5 3 2 3 7 6 0 6 2 3 3 9 4 7 10 1 9 3 0 3 9 7 4 1 5 5 9 4 1 9 1 4 2 8 7 4 5 1 4 7 2 5 8 8 6 1 6 10 6 4 5 0 10 5 2 2 4 10 6 9 0 6 4 4 2 4 0 3 3 6 2 7 4 1 8 9 4 0 3 3 10 0 9 9 2 2 9 1 5 7 8 3 8 5 0 2 6 9 2 9 0 9 9 1 1 2 8 10 0 10 4 7 4 10 3 3 0 6 7 8 0 9 2 6 6 2 6 4 4 4 3 3 3 9 2 2 5 7 9 3 9 1 7 9 3 1 8 9 3 9 8 5 4 7 9 1 4 9 4 6 0 2 3 8 0 4 5 4 6 8 5 8 8 9 8 6 2 7 10 0 4 6 7 8 2 8 3 8 9 8 4 10 7 2 4 3 6 1 4 7 6 6 3 1 [...]
+1 7 7 7 6 8 9 9 3 9 5 7 2 9 1 1 7 9 2 1 7 8 5 8 9 6 5 5 4 1 4 7 5 6 5 9 2 4 5 5 2 3 10 0 2 2 0 0 10 2 8 8 3 9 8 0 10 4 10 1 6 3 10 9 8 8 8 1 4 5 1 6 8 4 0 7 5 10 1 9 3 5 8 4 1 7 2 7 3 3 6 3 6 6 1 6 5 10 0 1 4 0 10 5 7 8 7 7 8 9 7 10 0 9 8 3 9 5 3 5 6 4 2 3 0 8 4 2 5 0 8 7 9 10 1 0 5 9 3 10 5 2 0 1 2 1 4 1 10 8 1 9 9 6 5 8 9 0 4 4 10 8 1 6 3 8 0 3 3 8 1 4 6 4 5 0 6 10 3 2 5 6 7 2 9 2 6 0 4 6 8 8 9 2 4 9 4 1 9 1 3 1 9 7 7 1 3 0 9 6 0 7 2 9 0 6 7 7 2 9 10 10 0 2 4 4 2 9 1 3 7 2 1 1 9 9 8 3  [...]
+10 7 9 2 5 8 4 10 5 1 9 6 6 4 2 7 9 7 1 6 6 4 8 7 4 8 1 7 1 7 2 9 8 0 9 10 7 6 9 9 2 8 9 0 0 2 7 5 0 5 9 8 4 2 10 7 2 9 3 4 5 8 3 5 10 9 1 7 0 2 8 10 10 10 1 4 0 8 4 9 10 1 9 0 9 8 9 5 1 8 2 7 0 1 7 3 10 9 5 6 1 3 5 4 4 1 9 1 0 1 1 10 4 5 6 10 7 4 4 0 4 7 6 10 9 5 9 0 6 10 2 6 8 8 9 2 4 8 10 5 8 6 6 2 8 1 7 0 1 2 10 4 10 8 9 6 1 7 9 4 3 1 0 8 0 1 5 7 8 4 2 6 6 2 1 9 2 2 6 6 5 3 3 4 7 2 5 6 3 10 0 10 9 3 4 2 6 2 10 5 4 10 9 6 10 8 5 3 0 0 8 1 10 3 8 2 4 6 0 6 1 3 1 8 3 3 6 7 8 2 2 4 4 7 1 [...]
+7 10 0 2 6 9 3 6 9 6 0 7 8 3 0 10 7 2 5 6 8 8 0 2 4 4 1 10 9 9 1 0 0 4 2 4 4 4 1 3 1 8 3 2 0 6 4 8 3 9 5 9 6 3 6 0 9 1 7 8 9 9 6 1 8 10 9 7 9 4 1 7 7 5 1 6 8 2 2 7 5 3 7 2 0 5 2 0 9 2 3 0 6 10 8 4 1 5 2 1 4 3 3 7 9 9 3 2 2 0 6 9 2 1 5 9 4 2 2 4 9 3 9 6 0 2 0 1 8 10 1 5 6 9 4 8 9 3 4 9 5 9 3 7 7 4 7 1 9 5 9 7 10 0 3 0 1 5 7 9 5 10 2 10 0 5 9 9 10 2 7 2 7 10 2 2 1 6 1 6 4 5 8 1 8 1 10 8 10 2 0 10 1 6 8 10 2 5 0 0 1 0 7 6 5 3 4 2 7 4 8 3 7 7 6 2 0 9 3 9 5 0 5 5 3 2 2 7 0 4 8 3 1 8 6 8 1 1 9 [...]
+7 2 4 3 7 1 9 7 1 9 8 6 4 0 7 5 10 4 4 1 1 3 9 1 5 1 10 0 8 6 3 5 9 2 9 3 3 7 2 0 10 9 2 4 2 2 1 10 6 5 8 1 6 5 6 9 3 4 10 8 9 4 8 9 2 2 5 2 2 7 10 0 8 4 9 2 1 7 6 5 4 2 4 10 8 5 8 1 2 7 8 5 10 3 2 3 10 9 5 0 2 8 5 0 2 1 1 9 1 4 10 7 1 1 6 1 3 0 3 10 5 6 7 9 0 3 2 5 9 5 6 8 9 4 2 2 3 1 10 9 1 4 1 4 6 7 5 6 8 0 0 8 3 0 0 6 10 3 10 10 3 2 3 9 5 7 5 10 4 4 4 3 7 3 6 3 4 8 6 7 5 0 4 6 8 9 7 5 2 0 3 3 5 1 0 6 1 4 7 9 7 0 8 10 2 4 5 6 8 4 8 8 7 7 5 2 0 3 0 2 0 10 2 1 6 2 7 10 1 4 5 4 1 2 2 10  [...]
+3 9 7 8 1 9 2 6 2 10 1 6 10 0 8 8 6 8 6 0 4 1 4 9 7 0 4 4 5 9 9 5 4 5 1 2 2 9 10 7 10 4 3 7 2 9 0 5 5 8 4 6 2 2 8 3 7 2 0 9 9 0 5 0 1 0 0 5 4 6 6 9 7 1 9 8 2 4 3 4 6 10 1 1 9 7 10 8 10 10 0 9 10 4 8 6 8 1 5 1 8 6 9 7 10 1 9 6 7 2 0 7 2 7 2 8 8 5 6 8 3 6 1 8 7 6 8 5 10 1 10 1 0 6 10 3 9 1 7 5 3 9 9 2 8 9 10 4 7 1 0 6 4 10 5 2 2 8 1 2 2 8 5 5 5 0 9 1 3 9 7 7 6 3 6 9 5 4 3 10 6 3 4 2 0 6 10 4 4 10 3 10 9 8 1 5 3 4 5 4 2 7 7 2 10 1 4 7 5 5 9 8 0 1 8 3 5 1 10 9 10 7 5 9 2 1 5 4 4 0 0 5 1 10 3 [...]
+1 0 0 4 9 9 2 8 5 10 6 6 10 10 8 1 4 3 6 8 1 10 5 5 0 10 5 6 2 4 5 5 0 5 7 8 3 5 7 9 6 10 0 2 4 4 8 10 9 4 7 5 4 4 3 8 6 4 6 5 10 10 4 6 7 4 5 10 6 6 2 6 8 0 0 9 6 6 8 9 3 0 3 3 9 3 4 2 6 7 9 8 1 3 5 0 7 9 2 7 8 9 0 0 5 1 2 5 0 6 6 7 10 0 6 0 8 9 3 0 0 8 3 8 8 10 8 3 4 6 8 0 10 3 5 8 1 2 2 10 3 0 7 5 4 7 7 7 2 1 2 6 8 3 0 8 2 5 8 4 0 9 0 10 7 9 6 7 4 1 7 6 5 10 0 3 5 10 2 7 10 1 10 8 7 3 0 2 8 2 9 4 10 0 9 10 6 3 5 2 1 8 0 3 7 0 6 3 10 7 6 0 1 4 9 6 2 10 10 5 6 10 2 2 8 4 8 4 4 2 6 2 5 2 [...]
+4 1 8 3 0 4 0 4 10 4 5 6 9 4 3 6 0 3 6 10 4 9 6 4 8 3 6 5 3 5 6 6 9 3 8 8 3 9 1 5 2 8 10 9 2 5 10 0 6 7 7 9 2 1 8 8 1 6 3 1 3 6 1 10 5 4 8 10 8 7 6 8 3 3 10 9 9 10 10 1 9 7 7 10 3 6 1 7 6 8 10 8 8 7 3 2 3 3 0 5 6 5 5 10 10 1 7 2 2 5 6 2 1 10 9 1 8 3 1 3 6 2 10 0 7 10 10 7 1 6 1 4 6 2 1 5 4 6 9 8 4 7 9 0 7 1 9 7 5 1 1 9 5 5 0 2 8 4 6 1 0 7 9 7 3 1 3 2 8 3 6 7 5 2 6 10 8 9 7 10 3 10 3 0 5 1 8 3 6 4 0 4 7 0 2 6 9 10 4 2 2 4 5 2 8 5 2 4 9 4 8 2 4 10 9 5 2 4 3 8 7 4 6 9 0 10 9 1 6 3 5 2 9 6 4 [...]
+6 7 5 6 5 2 4 6 10 10 5 2 1 4 8 2 0 6 5 7 1 10 3 10 5 1 0 5 4 8 6 1 4 4 5 10 5 9 9 0 6 0 0 3 10 7 7 3 2 0 8 0 4 9 0 7 3 5 0 9 8 7 8 5 9 3 3 0 6 8 10 3 3 6 4 9 1 5 1 4 7 10 4 10 10 8 5 5 10 7 2 10 9 5 4 6 0 2 1 10 3 3 10 3 3 9 1 8 1 10 9 10 1 2 8 1 7 8 1 10 7 6 9 1 4 2 2 3 7 5 4 6 3 7 10 0 4 6 10 2 6 9 8 7 5 0 4 0 5 4 0 10 9 1 9 3 5 0 6 6 7 8 4 3 4 3 10 3 6 6 0 1 2 4 5 9 6 4 1 0 9 2 2 8 2 1 7 0 10 0 0 10 2 8 10 10 4 9 6 0 1 9 2 0 9 1 10 1 8 3 1 0 2 0 3 7 8 9 9 2 1 3 2 4 3 2 8 5 0 7 1 4 5  [...]
+9 8 5 1 10 4 7 3 0 5 1 6 9 7 2 8 0 4 4 7 2 6 9 7 9 9 5 1 1 1 10 5 6 10 6 8 9 3 6 1 6 6 7 0 6 4 6 4 3 3 3 5 5 7 2 4 7 10 7 7 7 6 7 5 1 10 2 10 10 10 3 9 6 9 9 2 9 5 4 5 5 4 2 7 4 9 1 1 3 5 7 0 10 10 1 8 10 2 1 6 0 8 4 6 0 0 1 7 4 10 3 3 8 2 9 8 7 5 4 6 3 1 10 10 9 7 1 0 5 2 1 4 3 0 7 2 10 2 2 0 4 10 1 6 8 0 5 1 2 5 6 3 7 5 9 4 1 1 5 10 1 7 2 8 2 6 8 9 4 4 9 6 3 5 7 10 10 10 4 8 4 6 6 9 7 7 1 3 6 0 2 4 3 4 4 5 2 10 1 1 8 5 5 8 2 3 6 3 1 7 10 9 3 7 8 9 8 2 10 9 6 9 10 1 0 5 1 6 9 9 8 5 8 9  [...]
+3 5 10 4 7 10 2 7 9 4 6 3 5 3 7 8 0 2 4 9 4 5 6 6 10 5 9 9 5 4 2 0 0 1 2 8 10 6 3 1 8 8 9 5 6 3 2 8 9 0 2 2 2 5 8 3 8 5 6 5 3 3 9 5 1 3 6 5 10 0 10 4 6 5 2 5 10 2 7 3 5 3 7 7 5 3 3 1 3 6 10 5 4 0 1 9 6 9 8 5 7 4 2 6 10 8 7 9 5 7 2 9 1 1 5 5 9 1 6 8 6 2 3 7 8 7 5 8 7 9 8 9 1 4 1 3 4 2 4 10 10 5 5 7 0 0 4 2 2 6 3 3 2 8 5 3 9 3 8 7 7 7 0 6 8 0 8 0 5 5 6 0 6 0 1 3 6 4 2 1 7 0 8 9 4 5 7 8 4 10 8 9 8 4 7 9 5 0 8 10 9 5 3 8 10 8 0 6 2 0 0 1 0 2 0 7 4 2 0 4 1 9 10 4 6 10 10 8 3 3 9 1 4 6 3 2 1 6 [...]
+9 1 9 2 3 0 9 6 0 4 0 6 5 3 7 9 0 2 7 9 0 7 3 8 6 0 9 8 0 8 4 9 4 8 2 0 2 5 9 10 2 1 6 0 10 10 3 1 5 4 5 10 2 5 7 3 3 3 1 10 6 7 9 3 9 9 5 8 6 2 0 3 8 4 9 8 3 10 0 0 9 5 4 7 5 5 4 3 9 0 3 1 7 0 10 5 7 0 0 2 9 9 1 4 0 0 7 3 6 1 4 9 7 7 3 4 7 10 5 6 3 7 4 10 4 5 3 0 6 3 1 10 1 0 5 6 10 8 9 7 7 3 7 9 0 5 2 6 0 5 0 9 2 9 1 1 6 3 5 9 10 0 8 10 9 9 5 1 9 7 1 6 0 3 3 4 1 1 8 3 1 10 9 1 7 1 6 5 8 1 6 10 8 2 3 1 0 3 8 9 1 10 10 5 10 6 0 9 8 0 2 8 1 5 4 7 1 8 3 10 0 0 4 7 9 5 1 4 6 10 9 8 9 0 4 0  [...]
+10 6 2 2 2 2 2 3 5 4 9 7 2 3 2 4 1 10 3 1 6 8 1 1 4 6 7 5 10 10 3 10 5 4 8 7 9 2 0 6 1 8 3 6 7 5 7 7 9 9 5 8 7 3 9 0 8 6 2 6 4 9 0 2 8 3 0 3 4 2 0 1 5 5 8 6 0 9 8 3 1 6 5 2 6 1 10 4 8 10 1 3 5 0 10 3 9 2 9 10 0 5 6 2 4 1 6 3 9 1 9 1 4 5 1 7 2 2 9 6 0 6 9 2 1 8 2 5 4 9 1 10 9 5 2 3 6 9 7 2 1 2 1 6 7 0 8 4 8 4 9 9 10 2 1 9 6 0 10 2 5 5 2 3 0 10 1 9 10 0 2 10 10 10 4 8 9 5 6 7 2 10 10 10 8 4 2 7 9 0 6 10 7 4 0 0 2 9 3 4 3 6 4 5 4 8 9 4 2 9 4 9 7 2 7 5 4 10 2 5 10 3 4 6 10 10 5 3 4 10 0 2 7  [...]
+2 7 7 1 6 4 1 6 4 8 7 10 7 9 9 5 2 5 5 6 4 2 0 5 3 9 9 10 6 0 0 7 2 0 6 4 10 0 6 0 1 1 1 3 0 1 1 2 10 10 2 2 3 8 1 3 6 2 0 5 3 2 5 7 8 0 5 1 9 1 6 9 1 8 1 4 2 0 2 3 5 5 10 8 1 5 6 0 0 6 1 4 4 0 8 2 9 9 0 9 2 10 8 10 7 9 6 7 2 0 10 6 8 3 2 5 3 0 0 0 7 6 7 10 4 4 3 10 2 2 3 3 4 7 6 2 10 3 2 3 6 2 4 5 7 5 7 2 9 9 2 7 3 8 7 2 7 5 8 9 3 7 7 1 7 8 8 8 9 10 7 7 10 6 9 1 1 9 1 1 3 2 3 2 8 10 4 0 1 1 1 4 4 2 2 7 8 6 4 5 6 1 5 8 6 0 3 2 3 8 7 6 6 7 3 8 2 5 1 8 5 9 7 3 3 3 2 6 3 5 5 5 3 7 7 1 6 1 6 [...]
+2 3 1 10 7 5 10 2 10 3 0 10 1 4 1 0 6 7 7 4 8 9 0 7 4 7 5 6 9 9 5 8 0 8 1 8 9 3 0 9 0 4 6 7 7 0 6 8 0 8 8 1 6 7 9 5 7 10 6 5 7 9 8 7 4 5 1 10 3 9 1 0 3 2 5 5 8 10 4 8 3 9 1 6 6 0 7 0 5 3 7 4 2 6 9 10 9 7 2 3 10 7 4 9 9 9 5 0 1 5 10 8 10 3 10 8 2 7 9 6 1 5 10 9 8 1 9 4 7 10 4 5 2 6 1 1 2 8 4 10 8 2 9 7 9 2 4 10 0 2 1 5 8 7 6 2 4 10 9 7 2 8 0 7 9 10 10 6 10 5 5 3 6 7 4 5 9 9 4 8 9 9 3 7 8 10 3 3 5 4 9 1 4 0 2 1 2 5 5 2 5 3 3 1 6 0 5 2 10 5 4 5 0 5 9 8 5 7 8 7 8 8 1 5 10 4 3 9 2 8 3 1 8 7 2 [...]
+2 5 0 1 8 9 0 7 3 7 7 0 10 7 0 9 7 7 3 3 9 10 1 1 0 10 3 9 4 1 5 2 9 10 9 0 7 0 10 7 8 9 5 0 5 4 0 9 10 1 5 7 9 5 3 4 7 1 10 2 4 9 10 9 6 10 5 8 2 8 3 8 7 4 7 9 1 1 4 9 4 6 3 0 10 0 9 3 1 10 5 8 4 8 4 5 0 0 0 9 6 10 8 0 4 8 9 10 2 10 0 10 4 7 6 9 6 5 10 10 10 8 2 0 7 8 2 10 6 1 3 10 9 2 3 5 5 7 5 5 3 0 10 10 5 6 1 4 1 2 2 6 10 9 1 9 9 6 1 4 3 6 7 10 4 3 0 6 8 7 9 3 3 2 8 6 0 9 4 7 2 6 6 4 0 7 2 9 2 2 7 9 6 5 0 9 4 10 1 1 7 0 10 10 4 3 8 1 1 5 10 0 1 1 7 7 7 5 8 2 0 3 8 2 0 7 5 4 9 4 8 0  [...]
+9 0 8 6 7 9 7 7 9 6 9 9 6 2 9 1 2 10 8 4 3 3 9 2 7 10 4 4 7 7 5 6 1 9 0 1 2 5 9 4 10 2 9 5 10 5 4 6 4 4 2 8 6 7 2 0 1 9 9 9 6 10 10 5 6 0 10 6 4 9 7 8 2 1 10 5 8 0 9 9 7 0 0 6 2 4 3 2 2 3 10 2 4 3 3 0 7 6 5 4 10 0 9 5 4 3 6 9 9 2 10 0 1 6 7 6 7 9 7 3 9 6 0 10 6 10 4 5 1 2 0 5 0 5 10 10 5 8 6 0 1 9 8 9 7 10 9 7 3 5 9 2 4 5 10 1 7 8 10 10 5 2 2 5 7 10 6 10 9 0 5 8 2 10 6 0 9 3 6 8 7 4 9 10 3 1 1 5 1 5 6 1 2 4 9 8 9 9 4 7 5 9 4 10 1 5 8 2 2 4 9 6 8 0 4 5 4 10 7 3 10 10 7 5 5 8 6 4 2 2 5 9 1 [...]
+3 2 9 7 1 4 2 3 0 3 1 0 2 0 6 3 7 8 8 5 5 6 1 6 4 10 7 1 4 6 2 10 6 10 8 0 10 10 6 8 2 8 6 5 1 9 6 9 0 4 7 3 7 10 3 7 3 2 5 4 8 4 7 2 3 0 2 3 9 10 2 9 1 5 8 9 5 9 5 8 10 0 2 10 2 0 9 2 6 7 0 2 5 4 3 3 7 9 3 10 9 5 8 3 9 9 1 10 7 1 10 9 8 2 3 6 9 5 8 4 2 0 0 5 3 9 4 4 7 10 4 4 9 7 0 10 3 7 10 3 1 8 0 8 8 2 0 3 3 0 7 4 0 0 6 3 7 5 6 7 5 9 4 7 3 3 9 4 2 1 8 7 8 2 1 8 10 2 8 9 0 6 4 0 8 2 8 8 8 1 7 0 9 2 9 0 9 0 8 0 4 7 4 8 1 2 3 6 0 5 1 10 6 0 10 6 5 7 7 3 1 4 1 7 10 0 9 2 2 6 4 9 0 2 3 3 2 [...]
+8 8 7 7 1 4 7 10 9 8 0 0 6 0 6 3 3 6 0 3 4 6 6 2 10 1 8 7 0 3 3 4 7 1 2 0 4 0 2 9 8 4 5 1 3 10 0 9 9 1 3 0 9 8 2 0 8 3 3 10 7 1 10 9 8 1 6 5 4 1 1 8 6 2 2 0 2 3 10 2 0 3 2 0 0 8 6 4 2 0 7 6 1 5 9 2 0 3 6 10 8 9 10 10 3 2 8 5 7 6 0 1 5 2 1 7 1 6 4 5 3 0 5 5 10 10 1 7 6 10 4 1 3 6 2 8 7 7 4 9 8 4 2 6 5 2 10 10 8 2 4 3 8 6 10 9 10 2 8 1 5 7 3 10 4 7 6 3 2 2 2 4 7 6 5 6 5 0 8 2 9 9 7 5 9 4 4 0 2 1 0 9 8 0 9 3 6 10 6 10 10 8 0 5 0 2 6 2 0 2 10 3 6 9 0 6 10 7 0 9 7 3 2 2 4 7 0 5 6 5 2 6 3 9 0  [...]
+0 1 1 4 6 8 7 1 4 7 4 4 1 9 7 5 1 8 7 9 7 3 0 9 5 5 3 2 1 0 0 5 3 6 2 2 1 2 8 8 2 9 2 4 1 1 3 9 1 1 1 5 0 9 7 4 8 2 7 7 4 0 10 7 3 8 8 10 0 2 4 10 10 5 1 9 0 5 5 8 1 3 0 4 4 9 9 5 10 3 7 5 0 8 10 3 6 4 5 1 7 9 5 4 1 3 7 9 9 10 6 6 4 9 9 3 7 1 10 5 7 10 1 9 1 3 7 6 6 0 5 4 4 5 1 8 3 10 1 3 8 1 0 10 6 7 5 9 1 7 8 10 2 0 2 8 4 6 7 5 7 5 5 10 3 3 1 9 1 0 7 0 0 6 9 8 5 6 1 4 6 1 4 1 8 4 2 10 3 3 8 2 7 0 2 7 3 2 8 6 3 8 8 5 7 9 5 4 6 9 3 9 8 8 0 10 5 2 9 2 0 7 7 2 0 8 9 3 3 2 1 3 4 5 9 4 1 6 9 [...]
+4 9 0 0 10 3 9 6 5 1 2 3 4 0 9 2 8 0 4 5 8 0 4 8 9 8 5 3 8 3 4 8 5 2 0 5 8 10 7 6 8 10 2 10 0 5 5 10 1 6 7 1 3 5 1 2 4 4 3 7 4 6 5 10 0 7 6 9 8 9 0 5 4 7 8 9 0 1 4 5 6 7 6 9 0 3 9 3 2 7 3 8 2 6 7 10 4 6 6 4 1 2 9 4 4 4 10 6 7 3 2 8 1 0 5 2 0 8 2 7 0 5 7 5 2 0 7 8 0 1 9 6 9 7 7 3 1 10 9 8 10 5 10 10 7 3 7 6 8 10 8 10 10 1 6 10 7 10 4 9 8 7 4 8 2 1 10 10 4 10 7 1 5 8 8 1 5 4 9 9 0 8 0 6 5 0 7 0 9 7 1 10 10 5 4 2 5 6 1 10 10 1 8 10 10 8 0 0 4 5 5 9 2 7 3 1 1 0 7 4 2 0 5 6 0 5 9 2 6 1 8 2 9  [...]
+7 10 4 9 8 0 6 6 10 1 0 4 4 8 6 5 6 1 9 10 4 6 7 9 4 10 5 2 6 2 6 2 8 6 4 6 2 0 3 4 7 5 10 8 9 4 10 3 3 0 7 10 7 9 2 4 0 2 0 9 7 8 10 2 6 0 7 9 10 9 6 8 8 6 1 1 8 7 3 4 0 6 6 4 1 0 5 4 3 0 8 0 9 2 10 9 6 9 8 1 6 5 4 4 1 1 10 3 8 8 9 5 1 1 6 7 3 7 0 7 9 0 8 4 9 6 9 0 10 5 4 0 4 7 9 10 10 9 1 10 0 4 1 0 9 7 5 3 1 6 1 1 9 5 4 4 7 2 8 2 2 8 5 4 7 2 1 1 0 1 1 9 8 5 10 5 6 3 2 8 4 9 9 7 7 1 7 3 2 7 4 8 9 4 0 4 2 3 9 2 10 8 5 10 5 7 7 9 4 10 7 3 3 1 4 8 10 4 3 3 9 9 9 4 5 7 9 2 3 6 3 7 2 2 2 8  [...]
+7 6 6 7 7 9 5 1 0 3 4 4 0 10 4 2 3 3 7 5 2 1 8 8 7 6 7 8 5 4 7 0 7 3 1 7 8 8 10 10 8 4 2 6 6 7 7 1 3 5 9 3 2 0 1 2 8 5 1 3 10 10 4 2 10 1 6 4 7 8 6 2 2 9 3 7 3 0 7 6 4 3 7 1 2 3 10 10 9 4 5 8 10 9 0 2 6 10 5 0 4 7 6 5 0 8 7 7 2 7 6 5 10 5 6 3 9 7 2 1 10 3 10 0 3 9 2 5 1 8 4 4 1 7 1 4 9 0 8 10 7 9 10 1 2 0 2 7 6 5 7 2 6 2 4 1 5 9 8 10 3 10 6 9 6 6 8 4 3 6 6 6 3 5 5 6 7 1 9 10 8 2 0 4 9 5 7 10 3 10 7 5 1 1 7 10 1 2 9 10 9 0 3 4 10 0 7 10 0 5 10 0 4 4 4 10 1 0 4 10 10 4 5 2 5 1 8 4 1 1 1 5  [...]
+2 6 1 3 2 1 3 0 3 7 7 8 2 9 6 8 2 0 9 3 9 2 6 4 0 5 7 2 8 9 0 0 7 1 2 6 7 6 5 10 6 6 6 9 1 0 4 0 9 6 9 6 6 1 1 3 8 9 2 0 2 2 7 7 9 3 6 9 8 3 6 7 0 7 7 2 8 0 2 0 9 10 5 7 4 2 1 7 3 10 0 9 6 0 6 1 1 0 4 7 0 2 4 4 2 10 1 4 5 7 1 9 9 2 6 4 1 7 1 9 2 2 7 3 7 4 1 9 5 2 5 6 4 7 4 0 7 9 4 0 0 5 10 2 8 1 7 7 8 2 4 7 6 7 0 3 10 5 5 2 6 0 2 8 10 8 2 4 5 6 1 1 3 6 6 1 9 7 10 5 10 3 9 10 2 9 7 9 3 7 0 8 3 9 10 9 3 6 2 1 5 4 5 2 8 6 2 8 10 1 3 10 10 0 5 0 9 0 8 4 1 6 3 6 9 4 2 1 0 7 4 0 4 10 1 4 6 1 9 [...]
+6 5 10 1 7 0 3 0 8 5 10 3 2 4 1 0 7 6 3 8 7 0 8 1 3 6 2 3 9 0 10 8 3 6 10 5 10 3 10 0 9 8 4 4 4 10 3 2 2 10 4 0 6 5 0 7 4 0 4 8 8 0 7 7 3 5 0 6 9 2 7 10 9 1 7 1 5 2 10 10 3 0 3 7 6 5 6 8 8 8 0 1 2 4 3 5 2 6 9 1 3 5 4 6 4 10 1 9 0 4 4 6 0 7 4 1 0 10 8 0 1 1 2 8 9 2 3 0 8 3 2 3 8 0 2 2 5 10 4 10 1 2 4 0 4 9 2 0 6 0 5 9 3 4 5 10 10 9 8 9 6 6 8 8 7 10 10 6 0 3 9 5 7 1 7 10 3 6 9 6 9 6 3 2 8 1 4 10 5 1 3 10 0 9 9 4 10 4 6 0 0 3 8 0 9 6 2 8 2 4 8 10 5 4 4 3 8 5 9 10 5 0 4 9 9 3 1 2 5 3 3 8 8 8 [...]
+3 9 8 9 3 7 9 6 3 1 1 0 7 1 4 4 9 5 2 8 5 3 0 3 9 8 3 0 2 10 2 2 6 2 3 0 7 3 3 2 5 0 9 7 2 8 2 6 3 5 6 7 5 6 4 1 4 10 1 7 9 0 6 0 6 4 9 5 7 4 0 9 5 0 8 0 0 9 9 10 7 6 5 8 9 6 0 10 3 4 2 8 0 5 4 2 4 7 7 3 0 8 1 6 5 0 6 1 1 10 4 3 9 1 9 7 7 1 5 10 0 3 4 6 6 10 1 5 8 0 9 0 4 6 9 7 2 4 6 6 10 6 0 0 0 9 3 8 0 9 8 4 9 6 0 4 0 3 3 9 0 6 3 6 10 2 2 5 1 1 6 10 1 0 7 8 2 9 0 8 3 0 9 1 5 1 4 10 9 3 5 10 7 2 3 5 1 4 10 0 4 1 10 5 6 1 0 9 10 4 3 2 3 5 10 8 1 5 8 4 4 6 7 7 10 1 4 3 2 7 7 0 5 7 4 7 7 0 [...]
+0 5 6 1 4 1 10 3 1 6 9 5 6 6 1 5 6 3 7 10 10 5 4 7 4 3 0 6 5 4 0 0 10 7 3 5 10 3 0 0 8 7 8 9 5 4 8 5 5 3 3 0 2 0 9 10 1 10 3 10 7 0 3 9 5 0 6 10 10 7 7 6 9 7 4 1 3 7 6 6 9 2 10 9 3 4 0 1 9 9 6 7 3 2 9 7 6 1 2 0 2 6 9 2 1 4 7 6 0 8 4 5 5 2 1 1 8 8 1 4 9 2 2 10 9 4 4 2 3 1 1 5 6 3 8 10 7 9 2 9 10 10 3 4 10 3 5 8 10 2 10 2 5 0 4 3 8 3 7 5 4 5 4 5 5 0 10 3 4 2 0 8 5 8 1 0 0 4 5 6 8 2 7 2 6 6 10 4 9 7 4 0 5 8 7 5 9 5 6 0 7 3 1 4 2 9 5 5 1 4 7 5 6 2 3 7 5 8 9 9 1 3 3 5 7 0 3 6 9 7 0 8 9 0 1 3  [...]
+0 5 9 5 9 3 6 7 9 3 10 7 10 10 0 0 7 7 10 1 1 1 7 3 4 10 0 1 4 7 10 4 4 6 1 5 3 8 6 9 2 7 0 4 9 5 10 9 3 10 4 0 5 10 2 5 2 9 6 4 10 7 1 9 9 4 9 0 6 7 10 1 6 10 3 1 2 0 7 8 8 0 8 2 6 1 3 6 0 6 4 9 4 2 7 9 10 0 2 3 6 4 9 6 8 6 1 9 1 10 2 3 1 3 4 2 0 6 3 6 0 7 1 6 7 2 2 9 8 5 6 3 3 4 7 1 2 9 6 2 6 1 10 10 0 0 5 2 5 2 8 4 5 9 6 10 9 9 10 0 1 0 10 7 2 3 6 4 4 2 6 3 0 10 9 10 9 8 6 5 8 8 0 3 0 2 2 5 3 5 7 0 8 1 4 3 9 8 2 3 6 0 8 10 0 0 0 8 2 9 7 10 4 3 8 4 5 2 2 9 10 0 1 8 9 3 9 9 8 10 10 8 3  [...]
+1 2 4 8 7 8 10 1 10 1 4 6 7 6 2 2 9 7 9 10 4 2 9 0 0 3 0 2 6 1 7 10 9 5 9 4 0 2 0 3 9 10 5 0 6 3 6 3 1 8 10 9 3 6 1 8 4 2 1 7 7 3 1 7 8 0 7 9 9 10 7 3 5 5 5 3 2 8 4 7 1 4 10 0 7 8 9 5 4 8 5 1 6 5 7 1 9 0 10 6 0 0 7 5 8 5 9 3 8 1 4 3 9 5 1 2 4 5 8 4 7 6 0 7 0 4 4 4 6 6 0 7 9 9 5 8 0 2 9 3 6 10 4 3 1 2 6 6 1 6 9 8 0 4 9 5 3 4 3 6 4 10 2 0 3 2 8 10 9 0 5 3 5 9 6 5 8 9 1 1 4 9 9 4 6 8 3 2 8 0 10 0 3 7 3 2 8 2 2 2 7 7 0 1 1 2 10 7 7 5 7 8 10 3 5 6 7 10 9 8 0 3 5 4 10 10 2 2 9 8 8 0 3 9 5 0 3  [...]
+7 6 10 6 4 3 10 10 6 2 9 4 4 1 4 10 10 1 0 3 8 0 8 1 9 5 8 7 8 6 7 5 2 4 5 10 5 6 9 5 10 1 0 3 2 8 0 6 8 7 5 0 2 9 5 10 0 3 8 6 7 3 3 3 2 1 5 5 5 8 1 0 3 10 2 4 7 9 6 6 1 8 0 9 4 7 0 2 3 0 9 1 9 1 0 8 9 3 5 2 9 5 3 5 3 9 2 1 10 10 0 2 9 0 2 0 10 7 0 7 5 9 2 1 0 6 4 5 7 10 8 7 8 4 10 5 6 8 3 7 3 10 9 7 0 9 2 5 8 2 0 8 0 0 6 8 8 0 7 5 4 2 3 7 10 4 10 10 10 5 9 8 9 5 4 4 7 9 2 2 3 0 6 9 2 7 10 0 3 9 4 6 4 6 8 6 8 1 4 0 2 5 2 1 2 6 9 10 0 7 3 7 8 7 8 7 2 7 0 10 3 9 6 0 5 9 8 6 8 8 2 10 2 9 1 [...]
+10 0 4 0 6 4 6 5 4 3 1 3 4 8 4 5 8 3 8 10 3 10 0 9 6 2 7 9 6 1 0 7 9 7 1 10 7 10 4 10 5 10 2 9 8 10 10 1 6 0 0 0 5 0 4 10 9 5 9 10 4 1 0 0 0 10 6 5 5 8 4 0 10 0 4 5 4 0 0 4 9 5 10 10 1 1 1 0 5 4 6 8 10 1 6 2 6 5 6 4 5 4 4 3 4 1 1 7 3 7 1 4 1 5 5 2 9 0 6 10 6 4 8 9 3 8 4 5 9 9 7 5 8 6 2 0 3 4 0 3 7 3 9 9 5 8 1 8 3 9 0 0 5 1 0 3 2 2 8 1 9 7 7 0 8 3 6 6 7 8 6 0 0 1 0 3 9 2 3 3 6 5 9 10 2 10 4 1 5 8 9 4 0 9 9 3 7 9 9 1 8 3 8 7 6 3 9 0 7 9 8 6 4 5 6 10 8 9 7 2 4 6 0 1 6 2 9 9 6 0 3 3 10 9 10  [...]
+6 9 2 0 8 6 5 5 9 6 6 4 9 8 9 10 1 5 4 9 10 0 4 6 2 7 10 5 9 9 0 4 9 10 5 8 10 1 10 8 8 2 10 5 3 3 9 9 8 1 8 8 4 8 5 3 7 9 4 9 4 6 10 9 0 10 2 9 4 6 10 9 1 7 5 10 6 8 1 8 6 6 10 4 8 3 2 0 0 6 8 8 0 8 3 3 2 1 0 1 9 8 2 9 5 1 2 1 7 8 6 6 5 8 7 0 5 10 8 4 2 4 3 9 7 6 10 3 2 6 6 4 7 4 5 1 6 0 1 4 10 9 1 5 1 1 3 7 6 10 0 8 1 8 8 10 6 3 8 0 7 3 6 9 6 5 8 8 4 10 4 6 8 2 2 5 4 1 2 10 5 0 8 6 10 8 6 8 10 7 3 1 9 8 9 10 3 0 10 3 7 3 0 0 1 3 4 8 1 7 8 3 5 7 0 9 8 0 8 2 7 3 9 7 8 8 5 2 9 2 3 3 7 7 5 [...]
+10 1 5 8 7 5 5 10 7 10 6 9 10 4 4 10 6 8 10 9 6 6 0 3 3 1 4 6 5 5 0 9 9 0 10 9 5 3 3 4 2 5 1 0 4 1 10 5 1 6 2 9 4 2 0 3 5 1 3 10 9 0 6 5 4 3 10 5 4 9 2 10 10 1 1 8 5 3 4 2 4 4 10 3 3 3 4 4 10 3 4 0 8 4 4 4 3 4 6 0 8 6 7 2 4 9 5 0 3 9 6 3 1 2 0 2 2 10 2 5 4 5 1 7 1 6 0 2 2 8 2 5 9 8 9 3 10 8 0 9 2 3 8 4 1 6 8 8 0 7 10 1 7 10 4 10 10 7 10 2 3 6 2 3 8 1 3 10 9 2 5 8 3 7 3 5 3 2 1 9 6 9 9 0 4 1 5 10 10 10 10 6 1 8 10 10 8 1 2 7 4 2 7 1 0 9 5 1 1 2 9 10 3 7 4 8 0 1 4 9 6 5 7 5 1 7 9 7 8 3 2 7 [...]
+7 4 6 5 1 4 10 6 6 0 7 2 5 5 4 0 9 3 7 5 10 0 0 5 9 6 1 9 0 1 2 7 1 0 6 7 10 5 2 7 10 6 3 2 6 7 3 4 2 0 8 6 2 4 8 5 10 6 9 4 8 9 2 3 1 3 10 0 1 7 6 8 2 5 9 8 6 6 6 10 5 6 10 10 3 3 8 7 9 1 2 9 7 0 7 0 9 3 7 5 5 1 1 8 6 7 5 1 4 1 10 8 8 7 2 5 5 5 3 10 8 7 4 8 5 4 9 3 10 0 3 8 4 7 1 5 7 1 2 0 5 7 4 1 5 7 10 10 5 3 6 3 0 7 1 7 4 4 3 1 7 0 10 5 0 10 9 1 7 0 2 5 2 1 5 4 2 10 9 4 8 3 7 6 5 6 10 3 9 0 4 9 5 5 2 9 3 0 3 0 6 8 7 1 9 0 3 3 4 4 8 4 3 6 4 1 2 7 4 9 3 7 5 10 0 6 8 6 0 2 6 6 8 4 0 9 5 [...]
+3 8 5 3 2 3 6 6 9 7 6 6 1 5 1 1 8 9 1 9 3 2 5 8 8 5 10 3 10 6 9 5 7 5 9 1 5 3 4 10 8 5 4 1 10 5 9 2 6 4 8 6 7 4 0 0 1 1 2 2 1 9 0 5 4 1 7 6 10 4 8 2 6 2 7 3 3 9 9 4 3 10 4 9 7 2 10 2 7 9 9 5 1 10 9 3 5 4 6 1 9 4 5 9 4 4 9 5 8 9 1 3 9 1 9 10 2 0 7 0 3 10 4 1 1 10 7 1 5 9 1 7 7 2 10 5 3 5 8 2 10 9 1 9 9 10 3 5 9 8 5 6 3 4 7 7 8 4 5 6 2 10 4 5 1 1 5 1 0 0 0 6 0 1 5 8 4 0 6 4 1 3 9 5 0 8 0 1 1 2 8 4 9 7 9 1 6 4 0 7 1 10 3 7 9 9 10 8 1 2 3 8 10 8 5 2 0 1 7 7 2 0 9 8 8 6 6 10 3 0 5 5 6 3 4 9 3 [...]
+4 8 2 10 8 5 3 10 5 2 8 0 9 2 5 2 1 5 3 6 1 5 2 7 1 9 4 0 1 4 9 8 7 7 1 2 9 9 7 2 6 10 8 4 1 2 6 1 8 4 6 0 10 1 6 0 1 5 0 5 8 2 9 1 3 3 5 1 8 10 9 7 7 5 3 4 6 5 1 3 0 8 5 1 4 6 3 5 10 8 4 6 2 3 4 10 2 1 4 7 5 3 0 0 1 1 10 10 6 6 1 1 1 8 9 2 1 6 8 7 0 6 8 4 1 3 9 10 0 3 6 0 2 8 9 0 1 9 5 8 7 7 9 4 9 7 5 4 7 6 4 8 1 8 1 1 2 10 0 0 2 2 0 2 9 5 8 8 6 9 0 4 0 9 2 2 8 5 5 3 7 0 7 9 6 5 0 1 0 4 5 4 8 10 2 7 9 5 2 2 8 6 3 0 1 6 0 3 9 5 4 2 3 1 7 9 2 7 0 1 10 1 3 10 6 0 9 9 10 8 10 5 5 6 3 1 2 1  [...]
+8 7 8 7 3 8 3 6 0 4 0 10 0 2 2 4 2 1 3 7 2 5 9 1 1 9 9 5 7 5 3 1 5 1 4 2 5 8 5 7 9 6 1 7 6 3 4 3 10 4 3 10 8 2 1 10 1 2 8 3 5 7 2 6 2 8 4 0 9 7 6 7 3 0 2 9 2 7 1 9 6 3 5 10 10 3 5 0 0 3 4 9 1 4 1 4 0 2 2 0 8 8 3 6 8 6 2 4 10 8 3 4 5 0 4 5 6 9 0 1 3 9 10 0 0 7 8 8 2 10 0 7 8 0 0 1 2 4 4 5 3 10 0 2 4 3 4 1 0 7 3 10 9 6 4 7 10 10 0 5 5 10 6 5 5 4 7 5 9 8 4 10 0 8 1 2 10 10 6 7 1 6 10 0 9 1 5 6 9 8 9 9 8 3 0 3 9 5 6 0 2 8 3 1 7 5 4 4 4 5 9 10 0 7 3 0 9 2 9 1 7 4 8 4 6 2 0 8 7 10 9 3 1 6 8 8  [...]
+5 1 5 5 5 5 2 5 9 9 1 5 8 1 3 2 6 8 9 4 0 6 3 2 1 2 5 3 3 7 2 3 2 7 10 1 10 2 3 4 1 7 5 3 2 1 3 8 9 1 0 10 6 3 8 8 10 7 6 4 9 0 5 3 0 3 6 2 1 9 10 3 2 7 0 1 10 7 4 8 2 9 1 10 9 7 1 0 5 10 9 7 10 6 3 10 5 2 7 1 8 3 10 9 4 5 4 4 7 8 9 1 6 7 10 3 9 10 5 10 5 0 2 2 9 1 7 5 4 2 10 5 0 3 3 4 10 6 3 9 1 3 7 4 10 1 4 1 9 10 3 9 5 10 5 3 10 5 3 5 3 8 2 5 0 4 0 0 1 8 1 4 1 6 3 10 2 8 5 3 5 8 4 1 1 4 6 8 10 3 1 1 5 5 6 10 1 2 4 8 8 10 8 7 10 8 5 3 9 5 4 0 2 9 8 9 8 3 0 0 7 3 4 9 9 5 3 10 9 5 2 2 6  [...]
+6 3 4 8 0 10 2 9 5 1 2 7 1 8 5 7 9 0 10 9 0 0 7 0 7 3 3 3 8 5 8 4 6 4 1 9 1 7 10 0 8 10 6 6 8 0 10 1 6 0 10 2 2 9 10 5 9 2 1 1 10 10 10 1 5 6 2 9 9 0 0 0 0 5 8 9 7 6 5 8 2 3 1 3 5 9 4 8 8 8 8 0 9 0 6 3 6 3 6 7 9 2 10 1 5 2 5 3 9 10 4 2 7 5 10 6 3 10 2 7 6 1 9 9 3 5 2 4 7 0 5 7 1 1 6 6 0 3 2 4 3 2 5 3 1 9 6 7 5 10 7 3 5 5 8 5 5 7 8 9 4 8 2 5 6 10 6 4 10 4 4 2 1 4 7 0 8 2 2 7 1 0 10 6 9 8 6 3 0 4 10 5 8 3 1 10 5 10 8 1 5 9 0 6 7 10 1 2 0 7 10 3 10 5 8 3 4 2 10 4 6 6 5 6 0 5 4 6 1 0 6 1 9 8 [...]
+2 8 3 5 8 9 0 10 3 0 1 5 6 7 3 10 0 0 10 4 5 3 6 10 6 1 7 3 6 4 2 9 10 3 4 6 3 4 1 3 9 3 9 6 9 3 9 0 1 1 2 8 10 5 1 3 0 8 0 4 8 7 9 1 9 9 8 10 1 1 8 4 2 1 5 5 10 4 6 10 5 2 8 6 8 0 7 2 2 7 4 4 2 5 6 8 10 4 2 4 4 8 2 4 4 0 3 8 0 10 9 1 3 4 3 4 9 9 4 3 3 0 7 8 0 3 1 9 6 6 4 7 0 3 2 2 7 5 3 3 8 9 9 10 2 2 6 0 8 1 7 3 2 5 2 4 1 3 9 7 2 3 6 2 2 3 8 0 2 0 4 9 0 0 7 10 5 6 8 5 7 0 0 2 2 7 5 10 9 1 9 0 5 7 0 3 10 0 10 8 0 2 1 0 1 8 9 9 8 3 7 3 9 0 2 8 7 8 7 2 2 1 1 2 2 0 2 7 5 1 8 5 5 4 2 9 5 8  [...]
+0 4 6 6 5 6 0 4 8 0 4 8 10 6 1 1 10 2 1 0 4 3 4 4 6 2 2 3 8 8 10 6 10 6 5 9 2 4 6 5 8 6 10 6 7 2 2 0 7 6 7 0 5 8 4 6 9 8 1 9 2 8 8 10 4 10 7 8 2 3 8 5 10 10 2 2 6 8 8 0 4 3 6 7 3 0 9 1 6 6 10 2 5 1 10 3 8 10 4 3 0 2 4 7 7 6 7 0 9 0 2 0 9 0 0 7 1 6 2 5 5 1 9 5 10 10 6 6 6 4 5 9 3 1 9 7 9 7 6 4 1 0 0 0 4 7 8 7 8 0 1 8 9 9 10 2 4 7 7 0 3 4 9 7 1 8 0 10 0 2 5 9 7 4 2 1 6 4 9 0 4 7 3 7 1 8 10 3 3 6 4 5 4 4 3 2 9 2 0 4 3 4 3 1 6 5 5 1 0 7 3 1 6 6 8 5 6 2 6 3 8 1 0 6 8 8 2 6 6 10 10 9 1 5 3 0 5 [...]
+3 4 2 2 0 1 0 5 10 1 5 0 6 7 6 5 4 9 9 2 5 3 6 8 10 2 4 8 2 7 0 6 1 10 6 7 0 2 3 9 5 8 9 5 6 0 4 6 7 1 9 9 2 1 3 2 9 3 7 4 5 10 10 5 1 6 3 2 4 2 7 6 4 3 7 6 7 5 7 9 6 0 6 10 0 0 9 5 3 1 5 5 1 4 3 6 5 6 7 9 2 5 2 9 0 8 0 6 6 4 2 5 7 9 2 7 2 6 1 2 6 10 5 5 3 5 1 8 9 9 3 10 9 10 10 8 9 10 0 5 8 6 0 4 9 1 6 8 3 5 9 10 10 7 3 6 9 10 3 2 4 6 0 4 3 6 8 8 4 3 4 0 4 5 6 10 2 3 10 9 9 0 9 6 7 10 7 0 5 3 5 3 1 9 10 4 8 2 6 4 5 6 0 6 4 0 7 9 1 1 10 5 0 0 6 4 0 6 4 6 0 10 6 7 7 8 7 5 7 6 1 3 10 2 4 0 [...]
+5 10 4 7 4 3 8 0 4 0 0 5 2 8 0 9 7 2 0 0 0 4 0 0 9 1 8 7 6 0 7 3 9 5 9 10 1 5 8 3 2 4 2 1 6 8 10 10 2 2 2 0 7 10 8 3 2 6 7 5 2 4 1 0 7 4 3 6 9 5 6 1 7 5 2 5 3 6 3 7 9 6 2 0 10 6 0 2 2 6 6 10 6 8 0 6 5 1 0 7 6 10 1 9 3 4 6 7 0 1 4 6 3 1 0 9 3 6 7 8 8 3 10 1 6 4 9 2 5 8 8 2 1 0 5 4 5 9 7 0 2 1 5 6 4 4 3 4 9 2 6 10 8 6 4 3 1 2 3 4 6 7 0 10 3 0 5 1 0 2 6 6 7 3 4 2 9 8 9 8 4 0 8 1 7 2 4 9 7 3 8 1 7 8 6 8 6 10 7 4 10 5 9 9 6 3 9 1 6 4 10 4 9 3 9 9 3 7 9 8 9 0 8 3 4 6 10 10 5 1 2 4 7 2 3 9 10 6 [...]
+7 5 9 9 4 3 6 7 5 4 8 4 4 5 0 1 5 8 1 4 5 5 6 0 3 3 9 0 2 3 4 2 9 3 10 4 2 6 8 5 10 4 4 6 10 3 8 5 0 4 5 4 1 0 0 7 5 6 3 1 5 1 10 7 7 4 3 1 4 0 8 10 1 9 2 6 5 5 6 10 1 4 3 9 4 4 3 8 9 6 0 8 5 6 10 4 2 2 1 9 6 0 9 3 6 1 4 6 0 9 6 6 0 1 10 10 10 3 9 6 7 1 7 6 7 4 8 1 5 8 3 6 4 6 9 4 8 7 6 4 9 6 3 5 5 3 0 2 1 1 2 0 3 5 3 0 2 1 4 1 1 5 10 8 8 2 0 2 4 2 4 2 1 5 7 4 1 2 9 3 0 8 4 4 0 2 4 5 1 4 9 3 2 6 8 5 10 6 2 2 8 10 3 4 10 6 6 10 9 4 8 9 7 7 2 3 9 8 3 10 9 1 9 9 0 4 4 1 0 5 3 3 6 4 6 0 6 6  [...]
+4 6 0 2 8 1 2 4 5 8 4 4 1 2 7 1 3 0 7 5 6 10 10 0 2 6 1 7 2 1 10 2 0 0 1 4 1 3 6 7 7 8 7 0 1 2 10 8 1 4 9 10 3 3 3 3 4 4 7 4 3 1 1 3 6 3 1 4 0 2 6 10 7 5 2 7 9 2 7 1 10 8 2 8 5 1 10 1 2 1 10 4 4 10 2 9 10 8 8 0 5 10 9 4 3 0 4 0 3 0 1 3 2 2 2 6 3 10 10 2 5 0 6 7 10 5 9 9 1 8 6 9 2 3 3 0 0 2 10 6 10 10 3 0 5 1 3 2 4 1 6 0 1 7 1 1 3 2 6 3 5 5 0 9 2 2 9 6 0 7 8 1 7 0 3 6 4 6 3 5 3 1 2 3 6 8 2 9 0 3 6 8 2 0 8 10 2 4 7 6 1 3 9 8 6 4 2 9 2 8 2 1 7 10 6 8 9 1 0 4 10 7 1 0 2 7 7 10 4 10 10 10 7 1 [...]
+7 7 6 7 6 10 3 3 7 4 8 4 4 8 6 10 3 8 5 9 2 0 2 2 5 3 7 7 2 8 8 1 0 2 5 4 2 9 4 5 10 2 8 6 1 7 5 5 5 2 6 8 8 10 1 10 6 2 4 4 6 2 5 5 1 1 8 2 10 9 1 7 0 1 4 8 10 4 8 1 6 8 3 3 10 6 9 2 3 6 10 2 9 1 8 9 2 6 0 0 8 8 4 5 9 2 3 4 3 1 1 6 0 8 7 5 3 5 8 1 9 6 10 8 6 4 8 1 3 1 6 5 3 9 2 5 2 7 4 10 3 3 6 9 1 4 0 8 8 6 2 3 0 7 1 9 3 1 9 7 3 2 0 8 6 10 7 7 10 1 7 7 10 9 2 5 8 4 8 3 9 9 7 3 3 8 2 9 10 2 3 10 8 8 7 4 0 4 1 3 4 2 10 10 7 2 3 4 9 1 9 1 7 0 4 10 0 0 4 2 6 1 10 7 6 2 6 0 8 10 8 6 10 6 9  [...]
+0 0 2 4 10 7 5 6 3 1 10 7 0 0 10 4 8 10 0 9 10 5 5 10 2 1 7 7 9 10 9 1 9 3 1 2 10 7 1 3 2 7 8 4 7 8 0 9 0 4 1 6 2 7 7 7 5 9 1 7 8 0 9 10 6 9 7 5 7 9 8 1 4 10 1 9 9 7 10 7 3 3 10 10 0 0 2 1 8 7 9 8 6 4 7 8 10 8 4 9 7 4 3 7 1 0 7 4 2 6 7 9 3 1 2 3 9 8 3 7 4 8 6 9 5 7 2 6 8 10 3 3 1 1 0 0 10 10 5 6 5 8 8 5 6 6 5 10 4 9 4 2 2 6 5 8 4 9 5 0 7 8 5 4 1 8 2 8 9 3 6 2 3 4 7 5 3 0 5 5 7 0 6 6 10 10 5 6 5 3 6 0 1 5 7 4 10 0 9 2 3 8 10 9 10 10 1 9 8 10 9 9 8 7 0 10 2 1 4 1 8 4 6 1 7 8 5 4 9 4 10 2 9 [...]
+7 8 8 1 7 9 10 3 0 10 4 3 10 10 8 1 0 2 7 5 6 9 0 1 10 2 10 10 0 1 6 3 5 7 10 6 10 2 1 1 5 4 4 7 5 1 5 10 2 8 4 6 4 10 0 7 6 1 7 2 9 7 5 7 8 8 9 9 9 1 7 1 1 5 1 3 7 5 5 2 6 5 9 3 4 9 1 3 0 2 9 10 6 3 9 1 6 9 10 5 1 9 2 8 3 9 0 2 6 10 0 8 5 10 0 6 3 9 9 4 5 3 1 4 10 3 4 7 5 5 3 7 1 7 10 3 5 6 0 9 2 10 10 3 8 2 6 4 6 0 0 5 0 4 0 2 9 2 2 7 8 7 5 10 2 8 7 7 9 5 3 9 10 9 1 3 5 10 6 2 1 3 9 0 1 10 1 9 0 4 2 10 1 0 9 9 2 1 3 1 8 4 6 0 4 4 9 5 6 3 0 2 5 6 4 8 6 3 10 0 1 2 4 1 4 6 0 6 9 3 6 8 0 0 [...]
+2 6 2 5 6 4 3 2 9 6 8 0 2 1 0 10 9 1 4 4 8 2 3 3 9 1 10 0 7 6 2 6 3 1 1 2 0 10 3 8 9 10 4 7 3 0 9 4 10 10 4 6 1 5 7 7 6 1 1 8 9 6 8 2 3 9 0 6 9 0 6 7 10 4 2 10 7 2 3 9 0 1 8 9 1 6 10 7 2 7 0 4 1 4 9 0 0 6 3 6 3 7 1 5 2 2 10 10 7 7 0 5 10 10 1 8 5 6 10 5 10 0 6 4 8 6 6 10 10 9 8 9 4 10 6 5 4 7 9 4 4 0 4 5 8 1 4 9 5 2 1 6 7 6 6 6 9 9 8 1 5 6 3 8 3 6 2 5 2 8 6 2 9 4 1 6 9 1 0 6 5 1 5 3 5 2 1 5 1 10 4 2 8 2 1 5 6 6 10 2 9 7 6 0 0 8 5 4 6 8 8 3 9 10 0 0 1 4 6 9 1 10 10 8 7 3 8 3 2 1 9 5 1 5 7 [...]
+0 10 5 2 4 0 9 2 2 8 9 6 3 0 8 5 2 9 8 4 2 7 4 8 9 5 0 5 0 10 0 6 7 0 10 4 1 0 6 10 3 8 4 7 9 10 9 10 7 4 6 2 7 4 4 9 3 5 9 3 10 6 7 3 0 6 2 2 6 7 5 4 5 7 1 1 3 4 2 4 8 1 2 5 5 3 8 1 9 5 4 6 2 3 4 1 8 10 5 3 2 2 2 0 8 2 2 4 6 0 2 1 5 8 9 0 10 7 2 4 9 5 4 7 7 8 8 1 3 8 4 7 0 6 7 1 10 3 9 8 1 4 7 0 6 9 8 8 5 6 8 6 9 0 5 5 7 9 10 9 8 5 0 4 1 10 1 1 0 5 4 4 2 1 6 1 4 6 4 5 6 4 4 1 3 1 5 0 2 5 1 6 3 1 5 10 1 7 10 5 10 5 9 5 5 5 5 7 6 6 4 5 2 7 9 4 5 3 7 0 2 3 4 4 3 6 3 10 9 0 0 2 5 6 10 2 6 6 [...]
+5 10 0 2 7 10 5 10 4 5 7 1 3 9 6 5 7 0 0 6 1 10 3 3 0 2 10 6 4 1 6 1 3 4 7 8 6 0 10 1 4 1 4 4 8 7 4 4 6 0 2 7 3 10 8 6 7 1 0 1 6 7 1 5 7 10 9 1 2 1 5 5 1 1 9 1 5 7 6 6 5 10 5 9 6 2 0 9 1 3 10 2 10 2 10 2 9 6 6 0 6 9 4 1 0 7 0 7 2 0 5 4 1 5 5 2 6 0 7 10 8 5 1 9 1 5 3 8 9 4 6 9 6 5 4 5 2 3 8 10 2 0 1 9 8 0 10 7 1 4 2 9 10 2 2 2 3 8 0 1 9 9 3 10 1 6 3 6 8 1 0 2 2 4 3 9 0 8 8 4 4 0 10 8 7 10 8 1 6 5 8 1 5 6 10 3 9 5 5 7 5 0 9 10 0 7 9 2 1 3 7 4 9 9 4 8 2 5 1 5 3 0 4 6 8 7 3 0 9 3 6 2 3 2 9 2 [...]
+1 4 10 8 9 0 1 1 10 5 6 4 3 7 3 5 6 7 10 8 6 5 7 3 8 5 0 1 5 9 8 10 1 6 5 1 2 9 3 1 10 4 2 5 2 2 4 2 7 7 5 0 8 9 1 4 0 8 7 1 9 9 0 8 0 7 1 2 1 0 3 3 8 8 4 3 1 0 6 4 7 5 7 8 2 7 1 3 10 3 8 1 10 1 0 2 1 1 2 8 10 5 5 10 5 7 1 6 10 7 6 4 1 8 3 0 6 3 3 4 4 2 3 6 2 9 9 0 3 7 6 0 0 4 8 7 4 7 3 5 7 5 8 9 8 0 6 1 0 2 8 9 0 0 5 3 7 9 8 10 1 3 4 1 0 9 6 2 4 6 4 10 10 9 3 5 4 1 3 2 5 4 2 5 10 6 4 0 4 10 9 5 0 5 8 8 2 2 3 2 1 10 9 4 1 0 10 7 0 4 7 1 0 6 0 4 4 7 2 0 3 2 6 10 9 8 0 0 3 2 0 2 8 9 9 7 9  [...]
+4 6 7 9 7 7 6 2 1 9 6 8 8 5 6 10 0 8 8 10 8 1 9 10 4 9 4 8 2 6 2 2 3 1 6 5 9 10 0 6 5 0 3 5 8 4 10 1 5 0 1 7 7 10 3 7 7 5 4 6 6 9 4 9 2 6 8 2 5 2 9 8 6 0 5 9 6 7 10 9 8 7 0 1 4 3 0 8 5 5 5 7 6 4 6 3 4 7 1 2 3 4 9 4 0 6 10 5 3 3 0 4 4 7 8 9 10 7 2 4 10 0 6 0 8 10 1 3 3 0 4 6 5 4 9 3 9 8 5 4 9 1 8 6 2 3 5 1 6 5 7 4 8 1 6 6 3 2 1 8 5 2 7 0 2 2 5 1 5 1 0 7 1 6 7 4 8 9 5 6 7 8 4 8 1 4 7 10 0 8 10 4 7 3 2 5 0 6 2 10 6 7 3 8 9 1 2 6 3 10 5 3 5 5 9 8 2 4 5 9 5 7 9 3 10 10 0 7 2 3 8 10 10 5 7 0 9 [...]
+4 2 6 1 0 5 5 5 6 1 2 5 3 4 1 1 5 10 6 1 4 3 2 6 2 5 8 7 1 1 2 4 8 8 8 6 4 9 3 9 10 6 1 9 8 6 1 2 8 7 3 6 9 3 4 0 7 4 9 4 7 8 6 9 3 2 8 1 9 3 0 3 1 7 2 9 9 6 8 5 0 10 6 8 5 5 9 4 10 4 9 0 3 10 9 6 3 7 0 3 7 4 7 6 3 9 10 9 7 8 9 10 0 10 8 1 9 5 1 2 7 4 1 6 6 3 9 1 5 8 0 6 3 8 10 6 1 6 3 5 9 2 0 5 4 9 6 10 9 9 1 2 4 0 5 8 10 0 8 9 6 5 4 5 9 4 1 1 1 10 5 5 4 7 10 7 5 2 5 5 2 1 10 9 8 5 7 7 10 9 2 2 2 2 8 8 7 6 3 8 10 0 10 6 6 5 6 1 8 3 5 6 2 1 6 6 1 0 2 0 2 7 2 1 3 10 2 4 0 5 9 1 6 5 8 6 6  [...]
+4 2 5 5 7 4 0 5 0 10 2 6 1 6 5 9 10 9 9 5 10 1 6 1 5 3 7 9 1 7 1 1 0 9 3 2 3 6 1 3 3 2 7 3 10 0 4 3 7 2 10 3 7 2 1 1 1 2 0 10 10 9 0 9 7 1 7 9 7 1 2 9 3 8 9 7 3 0 3 8 0 2 10 10 4 3 0 1 5 3 7 7 2 8 9 3 10 4 3 5 4 0 5 7 7 1 6 2 7 6 1 10 7 7 4 7 10 5 10 1 2 8 6 8 8 6 7 8 2 10 8 1 3 9 1 2 10 10 0 1 8 10 4 4 1 6 6 0 7 10 4 4 0 10 1 2 4 6 6 7 9 7 5 7 5 2 3 2 2 7 7 7 8 1 9 2 8 2 0 4 1 4 2 3 8 0 3 1 0 6 5 5 3 6 2 4 0 0 5 5 6 10 7 2 10 0 8 7 9 9 8 6 5 8 2 8 8 3 0 5 10 6 0 1 7 1 8 0 3 0 8 4 5 4 1  [...]
+9 2 8 3 10 7 9 2 1 4 9 7 4 1 9 9 4 7 6 2 3 2 5 8 10 5 5 6 10 4 7 6 8 0 6 0 4 6 7 1 0 10 2 1 9 3 4 1 8 10 3 2 5 9 10 0 6 0 1 6 10 6 8 7 10 4 10 8 8 4 4 9 4 2 2 3 9 6 9 9 9 5 0 2 9 4 9 10 5 7 4 6 7 3 5 1 7 7 0 6 3 2 3 1 3 10 4 4 1 0 1 5 8 9 7 10 10 9 5 6 1 5 5 5 10 9 8 10 3 2 2 10 10 6 5 9 9 4 4 9 7 1 10 10 5 9 2 6 6 4 9 4 4 2 8 10 4 7 7 10 8 6 8 6 8 5 2 9 4 7 0 2 8 1 5 2 10 4 5 1 3 1 10 0 8 0 6 5 9 0 5 9 4 3 8 8 10 10 10 10 1 9 9 3 5 9 9 9 9 9 9 4 5 0 1 5 6 7 0 7 8 8 4 0 8 6 6 7 5 9 0 8 3 [...]
+4 7 2 3 2 4 1 4 0 0 2 1 3 9 6 9 6 8 8 0 5 6 0 1 5 10 3 0 7 1 1 5 10 1 9 4 7 9 6 4 10 6 3 10 1 10 8 0 5 10 5 8 5 9 7 3 6 0 8 6 6 1 1 10 8 1 1 3 2 3 6 10 3 10 5 10 9 0 10 10 10 7 2 7 10 1 3 0 1 5 6 3 5 10 3 4 3 9 9 10 5 2 2 3 3 3 8 3 0 0 4 10 10 3 9 2 9 2 10 8 10 3 10 5 2 7 10 3 10 4 8 6 6 8 2 9 4 2 8 6 8 5 3 10 6 2 6 1 9 9 4 6 4 0 3 1 8 5 10 0 5 9 2 9 1 10 2 3 8 8 6 10 1 1 3 3 4 9 0 2 1 2 5 6 0 0 7 6 4 1 9 2 1 1 10 9 10 5 9 6 7 6 0 3 4 4 6 8 1 7 0 10 2 5 9 5 8 3 1 8 4 8 3 5 7 9 3 8 3 10 9 [...]
+0 7 5 8 8 5 5 3 7 8 0 0 7 5 4 9 8 5 7 9 2 9 5 4 3 7 6 1 9 0 5 8 10 6 9 7 10 8 4 0 10 0 7 2 9 8 3 3 3 6 8 3 0 7 2 7 3 10 8 10 1 7 2 2 5 5 6 6 5 9 5 1 5 2 10 2 9 10 0 10 2 9 4 1 1 4 9 3 4 7 8 4 1 9 2 0 7 8 8 8 8 3 4 5 9 5 4 10 8 6 0 4 6 7 6 2 8 8 4 4 5 3 2 3 8 10 4 5 6 6 0 5 9 2 9 6 3 7 6 8 5 4 8 1 7 0 3 0 8 8 5 6 6 4 2 4 5 3 6 8 1 0 0 7 7 5 10 9 9 9 7 5 4 2 2 6 1 8 10 4 0 9 3 3 4 5 5 6 6 2 9 2 6 1 9 7 1 5 8 3 2 8 1 9 8 7 2 2 5 1 4 7 10 3 2 3 7 9 10 7 5 2 4 3 5 6 6 3 1 1 7 6 10 3 2 7 10 10 [...]
+0 0 10 5 7 8 4 0 4 3 1 9 3 3 4 4 10 10 2 10 8 6 3 8 4 0 7 5 1 3 2 0 8 4 3 10 6 1 5 3 9 7 7 4 9 5 6 10 8 5 4 5 9 9 4 3 4 6 6 0 7 10 1 0 7 8 9 10 2 1 7 3 10 1 6 4 1 2 9 0 6 3 7 1 9 5 0 9 0 2 5 8 0 2 7 6 5 9 9 8 9 0 0 4 7 10 2 8 3 1 8 0 6 8 1 2 7 8 2 1 10 6 0 0 7 3 8 8 0 10 1 1 4 3 8 3 6 0 9 9 7 2 7 6 2 10 2 1 10 10 1 2 3 5 4 10 6 7 0 0 7 2 3 0 3 3 0 6 0 1 0 2 9 10 5 0 2 10 4 1 1 3 6 5 10 5 10 6 8 3 6 7 10 4 3 6 10 9 8 6 5 2 6 0 3 6 5 0 6 5 1 7 2 10 10 8 6 8 4 7 4 0 10 2 5 7 0 0 9 6 6 8 1 1 [...]
+9 5 6 1 8 3 3 0 7 3 4 2 1 4 6 0 7 1 6 3 4 1 1 0 0 8 10 8 7 0 10 5 7 9 4 6 5 2 1 8 5 3 10 5 3 1 9 2 8 10 3 2 2 5 2 10 4 1 2 2 0 5 9 7 5 8 8 0 6 3 10 1 6 10 3 3 10 1 7 7 8 3 10 5 6 9 0 4 6 10 0 1 10 10 0 2 7 0 5 0 4 0 5 9 5 5 3 9 4 5 3 10 8 9 3 6 9 4 4 2 9 5 4 7 0 2 7 0 8 9 7 6 2 8 8 3 5 1 1 4 8 7 7 8 0 0 7 10 3 5 5 8 2 9 0 9 5 0 0 6 3 2 8 9 3 0 10 10 4 8 0 6 10 2 4 4 9 4 10 3 6 7 5 9 8 5 7 3 9 9 8 9 5 5 8 6 2 7 0 6 4 0 7 1 7 5 0 10 10 7 2 9 2 2 9 10 7 5 8 6 0 2 5 9 0 0 9 7 7 10 7 5 7 1 9  [...]
+1 3 4 0 7 0 7 0 6 8 3 3 8 8 5 6 8 2 5 0 1 7 5 3 2 4 4 8 6 5 5 0 2 9 3 6 6 8 9 5 7 10 0 0 7 6 5 4 0 6 3 1 10 2 10 6 6 7 7 4 3 8 7 7 4 0 2 7 6 8 1 0 0 7 2 8 9 8 1 8 9 9 8 3 0 6 7 9 4 9 6 2 1 3 5 0 6 0 10 10 1 2 9 2 10 8 4 0 8 1 0 6 10 7 4 1 7 7 7 8 9 1 5 7 6 8 1 2 1 3 0 10 2 8 0 7 0 5 7 10 9 8 3 3 10 7 3 8 1 7 2 2 0 6 3 7 1 6 4 2 7 2 5 8 3 7 10 7 10 2 5 6 4 2 10 4 3 6 4 1 1 3 0 1 2 10 7 5 3 6 6 1 5 8 8 4 8 4 0 5 0 3 3 9 10 3 1 7 0 10 5 2 1 2 3 10 9 2 7 1 9 5 9 2 4 3 3 2 9 2 8 10 3 6 7 8 10 [...]
+10 8 7 9 4 6 10 4 2 8 7 7 4 0 4 5 4 2 10 4 9 10 8 3 0 8 9 2 1 7 8 3 2 7 8 3 8 4 6 0 7 2 5 2 9 9 3 2 5 7 2 8 2 6 4 6 10 0 9 9 0 9 9 7 3 10 10 6 0 0 0 8 1 4 7 5 5 7 2 10 1 7 4 3 3 1 10 9 4 1 3 0 4 5 6 0 7 4 5 3 4 1 4 2 7 10 7 7 3 6 7 10 0 2 4 5 9 5 8 8 9 5 5 4 3 0 9 7 5 5 3 5 7 1 5 9 1 6 0 5 1 4 0 7 6 1 1 10 4 4 5 7 10 6 2 0 6 6 10 4 5 7 2 10 5 6 5 9 1 5 2 5 7 5 10 9 6 3 4 8 4 1 0 3 8 1 10 10 4 0 0 3 10 4 0 0 6 1 8 9 9 5 1 3 9 8 10 4 3 5 6 9 0 0 9 6 10 9 10 7 0 10 0 0 7 9 7 1 2 0 2 1 10 5  [...]
+3 0 7 1 10 1 5 3 8 1 1 5 6 6 2 2 1 1 7 2 7 7 4 9 5 2 9 6 0 9 4 4 2 4 7 4 6 2 5 4 8 7 9 0 9 10 1 2 0 4 9 4 3 1 2 4 9 4 5 1 6 10 10 9 1 1 10 8 5 4 8 8 0 9 3 8 1 5 3 5 8 7 7 0 4 9 0 1 3 7 10 3 1 6 7 8 10 1 5 3 1 6 8 3 3 5 10 2 4 9 2 8 1 2 4 8 6 6 8 7 5 10 4 9 6 7 4 1 2 10 7 7 4 4 4 9 7 7 0 0 10 2 2 9 10 8 1 4 7 1 9 3 6 2 4 8 4 7 0 0 8 2 3 5 2 1 6 10 10 8 10 7 4 10 1 6 0 7 5 0 5 1 5 10 1 7 7 7 8 7 1 9 9 7 1 6 9 7 4 2 0 7 8 4 3 6 3 7 6 7 5 10 1 5 9 0 10 3 10 8 4 2 5 9 8 9 9 0 2 5 8 9 7 4 8 4  [...]
+6 9 3 2 7 1 0 3 4 8 7 6 2 6 3 4 7 9 8 5 5 10 5 6 4 3 7 9 4 3 6 2 10 4 5 2 10 2 8 2 4 8 1 0 5 0 4 9 2 10 5 6 0 6 5 6 7 0 10 4 0 7 6 8 7 8 3 4 2 7 9 0 10 6 6 1 6 10 3 10 3 7 4 1 5 10 7 1 4 0 4 10 5 4 9 1 2 2 1 2 10 2 7 9 9 2 3 6 3 8 0 7 4 5 7 5 8 8 2 5 2 1 7 1 2 8 3 8 8 0 7 9 1 0 2 6 8 10 3 4 10 2 4 7 5 6 7 2 1 3 9 7 8 2 0 10 2 5 6 8 8 6 6 9 2 6 2 10 7 6 8 0 8 0 3 0 0 5 3 2 10 10 8 10 7 10 9 8 8 1 6 0 6 1 6 1 6 10 2 3 10 4 1 0 8 5 4 8 1 2 10 5 6 4 7 8 7 7 2 7 2 3 10 5 2 8 6 8 5 4 6 7 9 1 7 [...]
+4 9 1 6 6 2 8 9 8 3 4 1 8 4 2 10 8 4 4 7 3 7 10 10 10 1 9 3 7 1 6 6 9 7 9 8 0 6 2 3 1 9 9 2 8 1 7 7 6 8 5 0 4 6 3 9 5 3 2 1 1 10 2 1 0 3 7 8 5 4 10 5 0 7 1 9 5 5 7 9 7 10 8 5 10 0 6 2 7 4 10 1 8 5 1 2 2 4 4 3 7 3 4 5 4 0 6 9 1 2 2 6 1 3 4 0 0 2 1 6 5 4 10 4 10 3 6 10 6 4 3 3 2 1 6 5 5 10 5 4 1 6 2 4 6 1 9 6 9 1 2 0 9 7 2 4 7 3 5 6 6 8 8 9 8 5 2 6 2 10 0 5 5 0 10 0 2 6 10 2 5 4 10 9 9 9 5 6 2 0 9 4 10 9 4 3 1 8 2 7 1 4 8 6 4 7 0 3 7 3 1 5 6 9 3 7 6 5 7 7 2 1 6 7 7 2 7 2 0 8 2 5 2 7 6 0 5  [...]
+5 3 1 6 7 2 1 0 0 2 4 10 6 3 7 2 10 0 5 5 3 4 2 0 2 8 5 5 2 10 9 7 10 4 6 4 8 8 3 2 1 6 6 9 7 4 2 3 2 3 9 5 4 10 7 10 10 3 10 10 1 10 4 1 1 4 8 2 7 7 1 6 3 6 6 6 1 8 10 10 9 0 10 8 3 6 6 6 3 4 4 6 3 8 6 1 3 10 9 3 1 1 0 2 3 4 2 8 5 6 4 0 10 4 7 5 1 6 4 4 1 5 10 1 0 0 4 5 9 7 0 8 3 1 3 9 0 10 2 7 8 7 3 2 3 9 0 9 3 3 7 10 0 3 1 1 5 2 5 5 5 8 6 2 5 9 1 3 3 10 2 6 8 10 8 10 2 7 0 10 3 0 9 8 7 4 10 6 1 8 10 5 6 6 6 4 4 3 6 6 10 0 3 5 1 0 10 5 7 9 7 4 3 2 5 1 3 7 8 9 6 10 10 9 8 0 7 9 5 0 5 3  [...]
+5 4 0 9 4 4 10 2 9 8 0 3 1 4 6 4 10 9 7 4 1 3 9 8 7 3 9 8 7 3 2 8 8 7 5 9 7 7 7 1 0 3 0 1 4 4 7 8 8 5 0 4 10 8 9 10 9 10 2 7 0 6 4 6 3 5 0 5 7 9 10 4 4 8 0 9 5 3 7 8 0 8 9 4 1 7 9 10 0 5 9 0 1 0 10 10 3 5 6 10 9 9 1 9 2 6 10 2 4 8 3 0 4 0 10 8 6 3 10 6 9 1 6 8 6 1 2 6 10 7 5 3 4 5 1 1 5 0 2 1 9 4 5 6 2 7 5 9 6 3 3 8 4 6 10 1 6 1 3 2 7 7 2 5 3 7 10 5 7 6 2 9 7 0 2 3 3 10 0 2 7 5 2 8 6 6 0 3 6 4 3 3 6 7 10 0 6 1 3 8 9 4 1 9 8 3 8 5 0 2 8 10 7 4 3 4 8 1 4 9 8 3 9 6 3 9 8 8 9 5 10 7 1 9 7 8  [...]
+10 3 5 5 9 6 0 2 9 5 10 2 6 8 2 1 1 10 8 1 2 3 4 5 6 7 9 1 1 4 3 5 4 8 1 7 2 4 4 10 1 6 4 7 6 4 7 1 7 9 7 2 4 7 4 5 9 6 7 2 0 1 9 2 9 10 10 1 6 0 3 2 3 5 3 3 7 6 9 6 8 10 2 6 3 2 3 10 10 2 2 0 10 3 4 0 8 0 4 9 0 3 1 10 2 2 2 8 6 3 4 7 10 4 8 9 1 0 2 7 1 5 1 9 0 3 2 5 3 0 3 6 10 2 4 6 7 10 2 0 4 5 0 3 0 6 0 7 4 0 10 8 10 7 10 7 10 10 9 10 0 2 0 6 3 10 6 9 6 0 5 10 10 4 3 5 10 4 7 7 3 3 6 2 4 0 2 4 10 6 9 4 2 2 2 4 3 6 0 4 1 8 2 5 4 0 0 7 7 0 8 0 8 4 5 10 3 9 8 9 4 5 10 2 2 10 0 1 0 9 10 0 [...]
+6 6 7 0 3 9 0 1 10 4 9 10 9 3 6 7 6 0 0 6 3 3 2 10 7 8 9 1 8 0 9 7 6 3 8 3 6 6 10 4 9 10 2 5 8 0 0 6 3 5 6 6 10 5 3 5 5 1 10 8 9 6 5 7 2 7 1 7 8 10 10 7 8 4 6 9 7 4 6 9 0 2 8 1 5 5 6 0 3 3 1 2 9 2 1 9 5 5 2 6 3 0 2 8 9 3 8 6 6 5 8 3 5 8 4 2 0 10 1 9 9 6 9 9 3 5 7 8 1 5 5 2 6 6 5 1 6 8 8 9 6 8 8 6 8 8 0 2 3 3 4 1 8 6 3 6 3 8 1 8 10 4 4 8 4 2 0 5 10 3 2 10 5 8 0 3 8 5 4 4 9 7 5 1 7 9 8 8 2 4 9 7 7 4 10 7 0 5 7 4 4 10 7 1 4 0 9 10 7 9 2 8 7 7 3 8 8 8 9 6 9 1 6 4 1 5 0 3 9 6 6 1 4 7 1 2 8 2  [...]
+4 0 8 10 4 5 0 4 3 6 8 0 0 8 9 4 5 5 7 9 6 6 9 2 10 0 10 6 3 10 4 7 8 1 5 6 1 3 0 0 0 9 5 2 10 2 5 3 6 6 9 1 8 3 3 10 9 3 7 0 3 10 3 9 9 10 8 2 9 9 7 7 8 10 4 8 0 8 1 4 3 10 7 2 9 7 10 3 2 1 10 10 4 0 1 8 1 9 2 10 10 6 6 8 1 5 5 5 1 4 5 1 5 2 8 1 10 8 8 0 7 2 10 1 7 4 10 4 10 6 4 1 1 1 10 6 4 9 0 0 6 5 8 10 3 9 8 4 9 1 7 5 4 1 10 8 0 6 3 2 1 10 8 9 10 2 6 2 3 0 5 9 7 8 2 8 7 10 6 1 7 1 8 4 0 4 0 9 3 6 9 9 1 4 7 5 5 10 2 0 1 9 5 0 3 2 2 10 4 4 3 8 1 3 2 5 7 3 8 9 2 4 6 6 6 0 2 6 0 5 6 2 7 [...]
+0 5 5 2 4 6 8 9 7 9 9 1 2 2 4 1 2 7 10 5 10 1 7 5 1 10 2 8 4 2 4 0 1 1 8 10 1 5 9 1 1 9 2 5 1 0 8 9 1 6 4 9 2 10 0 5 1 0 5 5 3 8 0 7 7 4 1 10 10 10 2 10 8 8 9 4 0 7 6 6 5 5 3 10 6 4 9 2 6 0 10 5 8 6 8 10 4 0 8 5 8 3 0 0 7 3 7 6 8 6 5 3 7 5 0 8 4 1 2 4 4 7 0 10 3 7 3 1 5 1 8 0 7 8 3 1 3 9 9 3 4 4 2 10 4 4 1 6 4 6 8 10 1 4 9 2 1 1 9 5 8 6 10 10 7 4 1 3 9 9 7 8 10 3 0 9 8 7 0 6 4 4 5 7 7 5 3 5 10 3 6 4 4 9 5 1 5 5 7 1 8 9 9 2 6 7 8 10 1 1 7 10 2 3 9 1 3 10 7 8 4 10 7 0 1 1 5 10 9 1 9 3 0 2  [...]
+10 7 0 0 4 4 0 7 0 10 0 6 10 4 3 3 5 4 3 5 0 2 1 1 1 10 5 8 1 3 8 6 6 5 10 5 7 0 9 6 4 3 0 9 5 4 0 7 2 2 5 6 0 2 10 0 6 6 6 0 3 0 1 10 3 9 7 5 2 4 1 5 8 2 8 4 0 3 6 0 9 10 0 1 8 7 7 5 10 1 3 4 10 1 1 3 9 6 6 4 1 4 9 8 0 4 7 7 3 6 3 1 0 10 10 5 2 10 8 1 0 9 4 9 3 1 6 7 4 3 10 3 5 10 3 10 4 3 1 8 6 5 7 9 0 7 1 0 1 9 2 8 5 2 7 2 6 2 8 10 7 9 3 10 8 2 7 7 10 9 6 10 5 3 2 6 9 5 4 4 0 10 5 0 7 3 4 2 0 8 6 7 10 1 4 0 2 1 7 2 4 5 4 10 5 6 9 2 2 8 5 0 5 0 3 5 4 2 10 4 7 9 4 10 10 9 7 1 8 10 10 8  [...]
+4 9 10 4 3 5 9 1 4 2 9 7 7 8 5 0 1 10 6 3 4 5 10 0 5 4 3 5 8 0 1 3 7 7 5 4 3 4 9 5 7 9 2 7 7 6 9 4 5 1 9 6 8 1 0 1 4 7 5 2 4 8 6 3 0 4 7 8 1 10 0 1 1 4 9 0 7 4 2 4 1 3 4 6 4 3 4 1 10 0 6 9 8 7 6 9 0 0 10 2 8 6 1 2 1 10 9 0 7 5 2 3 8 1 5 8 3 1 2 0 9 3 5 8 2 7 5 9 8 4 3 10 5 4 5 4 1 3 4 10 10 2 2 8 9 0 10 1 3 10 7 9 2 10 0 4 4 4 1 6 1 2 2 6 1 1 9 5 4 4 2 0 4 7 3 4 10 9 5 9 7 4 0 0 10 5 4 10 5 3 8 2 9 7 2 2 0 9 2 4 7 1 5 10 6 8 6 2 1 2 9 0 9 5 2 8 0 9 8 5 6 4 0 10 2 3 7 6 2 4 2 5 3 8 3 1 6  [...]
+9 3 10 0 10 8 1 10 8 2 6 6 6 0 8 6 8 8 6 1 5 2 8 1 8 9 0 5 4 10 2 1 9 1 9 5 10 2 1 7 9 9 9 9 6 3 8 3 10 6 2 0 3 10 1 8 3 0 9 7 8 4 5 10 9 1 0 6 5 7 6 3 2 9 9 10 3 0 1 6 9 7 9 3 8 7 10 0 6 0 4 0 5 0 8 10 2 2 4 5 4 1 8 9 1 9 7 6 4 3 0 2 3 6 7 3 1 5 1 7 4 4 9 10 1 6 3 3 8 7 10 3 4 4 2 10 9 0 4 2 2 10 3 9 5 10 1 0 7 6 4 4 3 1 10 0 10 4 1 8 3 10 6 6 2 9 3 7 8 0 3 6 4 9 0 5 7 6 10 5 8 8 1 8 2 6 7 2 3 8 3 10 10 2 5 2 3 0 7 3 10 8 6 6 5 10 5 3 10 5 2 6 6 6 9 2 9 5 10 0 7 5 5 10 4 6 7 10 8 7 6 7  [...]
+0 2 5 3 0 9 5 2 10 2 8 2 7 7 10 10 8 6 0 10 8 1 10 1 3 1 5 2 7 10 5 4 5 1 7 8 4 10 3 10 4 4 3 6 6 0 10 3 8 2 8 7 5 3 4 8 8 8 6 4 7 6 6 6 1 1 6 8 4 8 8 2 5 10 2 7 5 6 4 8 1 1 0 1 10 3 0 1 3 3 9 1 10 9 8 2 8 3 5 4 2 6 2 5 5 3 7 5 1 3 6 2 5 6 4 1 3 4 7 9 0 2 4 9 10 6 1 10 4 1 10 2 0 1 7 8 9 1 6 1 5 7 0 3 6 0 4 8 3 1 0 1 7 6 8 10 4 1 0 1 7 0 2 3 3 9 0 9 5 0 1 10 6 7 3 2 2 0 3 9 3 6 7 10 10 3 8 9 7 3 5 5 7 1 2 5 2 0 0 8 10 0 4 3 4 4 0 5 8 7 1 9 8 10 3 10 1 4 4 1 0 9 9 6 8 3 2 6 5 5 10 6 10 10 [...]
+9 10 7 2 4 6 6 0 3 7 5 10 3 6 7 4 0 1 1 2 0 5 0 4 10 1 3 3 2 8 2 8 1 5 0 6 7 6 9 7 2 7 5 8 5 4 9 7 1 9 1 2 3 0 6 3 4 8 9 2 4 6 1 6 2 10 10 9 2 0 10 6 9 10 7 5 4 2 0 10 0 10 6 6 8 4 1 9 5 4 0 2 6 4 1 7 9 9 7 9 9 0 1 3 3 1 10 9 10 3 10 0 7 7 5 10 1 7 4 4 7 5 3 7 10 4 10 4 8 6 10 1 9 7 3 8 4 3 2 6 3 2 7 6 9 7 2 3 5 10 0 2 2 4 9 8 5 8 7 0 2 1 2 4 8 4 4 10 7 1 5 1 10 3 7 7 10 7 4 5 5 7 8 5 4 7 5 1 3 6 1 2 1 1 7 4 6 4 6 3 3 3 2 0 1 9 8 10 2 9 5 7 6 5 1 8 4 7 4 1 4 0 6 4 2 7 5 7 6 7 3 2 8 3 1 1 [...]
+7 0 10 5 3 8 6 4 0 6 0 2 5 8 7 2 3 1 5 7 8 9 9 5 5 7 10 10 5 5 3 10 6 1 4 0 3 10 6 7 1 3 4 2 1 6 1 1 1 10 5 8 4 4 1 3 8 5 5 1 4 6 3 5 7 2 10 8 3 7 2 3 6 5 1 1 0 7 3 5 1 3 8 7 5 8 5 8 5 7 1 6 10 3 2 5 3 7 10 5 4 0 3 0 8 0 7 8 8 1 9 1 9 3 5 5 4 7 6 10 10 3 3 6 7 6 7 2 1 4 9 3 5 10 0 6 0 1 5 5 8 6 3 10 5 3 3 7 5 1 5 1 9 10 10 3 5 4 5 9 1 2 1 3 9 10 7 6 6 0 3 3 5 3 4 2 2 2 8 0 9 4 0 3 6 3 2 5 9 9 6 1 4 7 0 10 4 1 5 10 5 4 0 0 2 4 6 5 5 9 10 8 4 0 9 0 7 5 0 10 2 9 4 10 4 1 8 6 1 3 2 1 5 3 5 5 [...]
+1 1 9 2 6 5 7 0 10 0 6 9 5 7 6 4 5 0 6 8 6 2 8 6 9 6 9 10 10 9 10 1 6 10 0 1 5 3 9 2 9 3 3 8 8 10 8 7 1 3 10 8 6 4 4 4 2 5 6 7 9 0 7 9 2 8 10 7 8 10 5 8 9 3 1 10 2 3 10 3 1 8 0 6 3 6 8 8 3 3 7 4 5 2 9 3 8 5 5 7 7 0 1 7 5 2 3 5 8 1 5 2 0 2 2 6 1 2 5 8 3 1 4 2 2 10 4 9 7 3 8 8 10 9 1 7 5 2 10 6 9 1 10 0 10 8 1 4 10 6 2 8 2 8 7 7 10 1 0 9 8 10 5 8 1 9 10 10 0 3 10 7 2 5 7 3 2 2 0 2 3 3 5 2 5 1 3 9 5 4 4 6 10 8 4 0 0 9 4 10 6 3 2 8 4 3 3 4 10 8 4 6 3 8 10 7 2 2 3 6 3 9 7 0 6 6 0 5 5 2 1 6 6  [...]
+0 5 5 8 8 0 2 10 8 0 4 1 9 2 2 5 0 6 3 8 0 4 7 1 8 2 2 3 9 7 9 10 5 6 6 5 7 10 8 10 0 4 10 2 7 4 0 1 1 8 8 10 6 5 1 2 2 3 10 4 4 3 3 6 9 7 4 1 4 8 4 1 10 7 9 6 0 0 9 4 8 7 4 4 1 5 1 8 9 6 4 4 6 10 10 3 4 4 3 6 2 6 10 10 6 1 2 0 0 8 9 0 5 8 7 8 1 9 3 8 7 5 1 2 9 1 5 4 1 9 5 4 5 2 2 9 2 6 5 3 1 5 6 7 6 7 4 4 6 5 4 9 8 8 7 7 4 2 9 5 8 0 8 7 4 1 9 0 5 8 9 8 1 2 9 7 1 9 0 9 7 10 10 0 8 2 8 3 3 6 8 9 8 8 2 3 4 7 1 1 1 6 1 5 0 7 10 2 4 10 2 8 1 4 7 0 5 6 3 1 5 4 9 8 7 10 7 8 3 10 0 7 0 6 0 0 2  [...]
+1 8 9 1 2 2 0 1 6 4 4 6 1 5 7 2 2 4 0 1 3 2 2 0 6 2 7 7 9 8 10 10 5 3 6 8 1 5 6 8 6 6 3 10 1 5 2 0 10 1 10 7 5 5 7 9 8 3 2 5 4 10 7 3 4 1 0 3 7 0 4 0 2 3 2 3 7 2 10 4 5 4 10 9 4 10 1 6 6 10 1 4 9 1 5 0 4 2 1 5 10 2 4 4 7 6 1 8 0 8 4 6 3 7 5 1 10 5 8 3 9 2 0 8 1 0 6 4 7 4 6 2 5 1 6 9 9 10 0 4 8 4 7 6 5 7 3 8 3 6 5 5 8 1 3 2 0 1 2 3 0 1 4 5 5 10 6 4 5 4 4 7 1 2 5 3 10 2 5 7 4 3 2 7 0 7 2 3 7 3 0 10 7 6 3 4 9 3 5 10 8 1 6 8 2 1 4 1 2 2 2 3 9 8 6 7 10 10 6 2 6 5 9 2 9 1 7 0 7 5 0 5 10 1 9 0  [...]
+6 5 1 7 1 0 10 3 5 4 8 3 2 1 8 2 6 1 8 10 10 5 8 6 8 6 3 7 4 10 3 7 4 9 0 5 4 3 4 9 3 0 9 10 8 4 4 9 3 0 8 10 8 5 2 10 10 2 0 6 6 7 3 2 9 1 3 8 10 5 8 5 7 6 0 6 3 2 7 0 3 10 7 0 3 1 10 9 0 5 4 4 0 9 4 10 1 6 1 10 3 0 2 7 9 4 9 8 5 10 3 8 6 8 9 5 1 7 2 4 9 6 2 8 8 0 7 2 5 0 1 5 7 6 10 5 8 10 5 4 7 3 9 9 5 1 4 0 3 6 7 8 6 3 9 4 9 5 5 6 1 5 1 6 9 5 8 4 3 3 0 6 10 2 5 9 10 6 2 5 10 0 4 7 1 6 10 1 5 9 8 4 8 8 1 6 8 5 3 10 9 8 10 5 4 7 2 1 1 9 4 4 2 6 6 7 1 9 10 10 8 1 10 9 5 4 4 6 8 8 4 1 10  [...]
+2 9 8 4 3 4 7 8 5 5 2 0 10 1 7 1 10 2 6 7 10 2 10 1 10 6 2 0 9 3 4 9 6 3 4 7 6 8 1 5 0 7 7 2 7 7 6 0 1 7 5 6 8 0 1 10 2 5 9 6 5 10 7 1 2 4 1 4 2 9 1 10 6 10 2 3 3 9 4 4 2 6 9 4 4 4 7 3 6 4 5 3 4 4 5 9 7 10 1 5 5 4 9 3 5 10 6 2 1 3 7 2 9 2 5 6 7 8 5 8 2 3 6 1 10 9 3 9 10 8 3 7 0 7 8 0 0 3 4 8 4 9 10 5 6 4 7 10 0 0 8 5 3 1 1 4 4 1 5 0 7 3 0 5 8 1 7 9 8 1 2 3 1 10 1 1 9 2 0 8 3 7 5 3 2 6 4 7 7 1 7 0 2 9 5 3 1 10 6 1 9 4 6 1 0 4 0 5 10 1 3 6 2 6 4 0 1 0 5 6 1 9 1 0 5 0 6 7 7 2 5 0 2 0 3 0 5  [...]
+5 0 7 8 4 2 4 2 7 5 8 9 2 2 10 3 7 3 9 4 10 8 1 5 6 4 10 2 4 5 2 5 5 2 8 5 4 1 10 8 10 2 0 4 8 2 3 10 5 3 6 1 3 7 3 5 5 8 1 4 4 4 3 9 1 7 8 2 10 0 10 8 5 8 1 9 9 8 9 10 8 6 5 9 4 2 9 5 0 10 5 9 10 9 4 0 1 2 8 10 9 6 5 4 6 8 2 4 3 7 7 5 9 9 7 10 4 6 7 5 1 9 8 3 3 0 4 0 8 7 0 10 3 10 9 5 7 9 4 5 4 8 3 10 4 2 9 2 9 5 2 9 10 10 6 10 4 3 7 5 7 4 6 1 3 6 0 3 2 1 6 7 9 0 10 10 1 0 6 10 10 9 8 10 10 1 10 5 1 1 3 10 4 8 1 3 1 6 1 7 0 6 7 4 8 10 4 8 10 7 1 3 3 2 0 4 9 2 7 6 4 5 6 0 2 9 9 2 0 10 2  [...]
+4 3 5 5 6 1 2 7 5 7 4 1 1 0 3 6 7 0 3 6 5 1 6 3 0 3 7 10 5 5 9 5 1 4 8 1 2 10 6 3 3 0 0 3 8 4 0 6 9 8 2 6 9 4 9 10 4 8 0 1 2 6 3 3 2 5 4 4 10 4 1 6 6 7 1 9 0 1 3 4 10 4 5 10 3 3 5 1 6 1 2 4 9 7 2 3 9 5 1 1 10 5 3 9 2 0 5 7 10 1 8 10 5 10 4 9 4 0 4 7 2 10 9 0 1 10 2 0 1 5 0 2 10 3 3 7 5 6 9 8 10 2 3 0 8 2 5 4 5 4 7 1 2 9 4 2 8 0 10 10 0 4 0 4 5 0 4 6 9 4 5 5 4 4 4 9 10 8 0 5 2 4 8 1 6 10 10 4 3 10 8 5 0 6 3 0 1 2 8 8 5 5 1 3 9 8 0 5 7 0 0 6 2 6 7 2 6 1 8 4 6 7 1 6 4 0 10 4 2 5 3 2 8 2 4 1 [...]
+8 7 8 3 1 6 10 0 0 6 5 1 10 9 4 2 9 1 6 2 10 0 9 10 2 6 10 5 6 6 9 7 10 2 1 4 4 0 6 1 1 8 9 4 5 5 9 5 0 3 3 5 10 9 4 1 3 9 8 6 5 6 4 8 4 5 1 6 2 0 2 6 8 1 0 6 1 10 7 7 6 8 7 4 7 3 0 3 2 0 7 7 2 8 10 2 10 9 10 0 10 9 0 8 0 9 2 7 5 2 2 8 10 3 10 4 6 4 0 6 0 4 5 5 6 9 0 10 3 2 6 8 8 2 7 1 9 4 1 6 7 10 3 1 6 6 9 0 3 5 4 7 6 0 7 8 6 5 0 10 4 1 9 8 8 2 5 10 2 2 3 1 10 4 6 7 2 5 9 5 5 3 9 0 0 7 2 8 10 0 1 6 8 3 3 3 4 1 5 7 1 1 9 7 6 3 8 9 4 9 4 10 8 9 10 3 0 6 5 7 3 3 0 1 8 6 6 3 3 6 0 10 4 4 4 [...]
+2 3 5 9 1 3 9 8 7 6 0 0 10 8 3 10 2 5 2 6 2 0 9 0 7 1 10 0 10 9 3 2 10 9 10 10 10 8 8 3 10 10 9 5 9 7 5 4 7 5 6 3 8 10 6 5 1 2 1 1 10 8 8 7 2 6 5 5 3 10 5 1 1 3 2 8 10 4 6 5 2 8 2 7 10 5 4 0 5 5 10 1 8 7 4 10 6 2 5 8 9 1 4 8 5 6 4 9 1 3 0 8 4 0 7 2 0 1 6 2 10 0 0 0 7 1 8 2 7 4 4 9 2 4 7 6 6 9 2 6 4 1 0 8 5 7 5 0 5 10 8 8 10 2 10 6 2 7 5 0 2 1 8 8 0 8 2 5 2 7 9 1 4 10 8 6 8 7 8 5 5 3 3 9 10 8 2 5 2 10 7 2 10 2 7 0 1 0 10 9 0 3 10 4 6 7 4 8 7 9 10 2 9 0 10 0 0 10 10 8 1 9 1 2 4 5 5 7 10 9  [...]
+1 6 1 3 9 4 10 8 9 9 7 6 5 4 0 4 5 9 6 6 10 7 6 9 7 5 9 6 2 1 4 9 9 8 3 9 10 10 10 9 10 4 6 2 5 1 5 3 4 9 8 1 4 6 4 8 2 0 1 5 6 7 9 2 4 6 8 3 0 8 7 6 7 8 7 7 4 5 5 6 10 6 0 0 10 6 1 8 0 8 5 6 10 6 5 10 0 0 6 5 0 2 0 0 4 4 5 8 5 2 10 10 3 5 1 9 2 9 4 8 1 8 6 10 6 2 9 9 1 7 3 9 8 4 0 9 1 5 10 2 8 1 3 10 9 3 5 6 5 2 9 2 2 2 7 3 1 3 4 5 5 2 1 0 4 10 2 7 3 0 0 8 1 7 1 1 3 0 7 6 8 8 2 6 7 9 0 0 7 9 5 2 8 10 0 1 4 0 0 8 10 2 10 3 4 6 0 4 6 1 5 1 5 5 8 8 6 5 1 1 2 3 0 2 3 2 2 9 9 5 3 1 8 6 1 1 3 [...]
+2 10 0 1 10 6 0 9 6 1 10 6 10 0 5 6 7 5 1 10 8 7 3 0 9 7 7 4 10 0 0 0 6 10 2 5 0 6 2 0 1 1 8 2 0 0 8 6 9 0 6 5 1 1 9 0 0 5 5 2 10 5 0 3 10 4 1 10 8 0 1 9 0 9 3 4 1 0 10 0 7 1 2 10 8 6 6 0 2 0 1 3 9 0 6 1 3 7 4 3 0 9 2 7 5 3 7 5 8 10 7 10 7 1 8 4 3 2 7 5 3 0 6 1 1 10 3 2 9 10 1 8 9 4 4 8 1 3 10 1 10 3 8 4 4 7 10 2 2 8 1 5 1 0 3 1 3 10 1 7 3 9 9 4 5 1 8 3 2 6 9 10 6 8 4 1 3 0 5 9 4 4 6 7 2 0 1 8 5 7 10 7 7 7 0 7 10 2 0 3 1 4 1 4 8 8 2 8 10 4 1 9 4 6 7 1 9 1 7 10 10 9 0 9 1 1 2 2 7 8 9 5 5  [...]
+7 8 4 1 4 10 6 2 2 4 9 7 0 2 9 1 2 1 9 7 5 10 8 5 10 3 3 7 5 5 2 9 7 2 7 3 8 1 9 2 2 2 8 2 4 9 3 0 5 10 2 1 5 0 10 3 6 4 2 0 7 0 7 8 0 4 5 9 10 1 8 10 1 7 7 0 5 10 0 0 9 7 0 0 8 1 0 5 2 10 8 0 5 10 8 4 10 4 7 3 8 7 9 10 9 4 0 0 7 5 7 5 3 6 5 7 2 3 2 8 1 0 10 2 2 9 7 7 7 7 10 8 6 1 0 4 10 3 2 10 5 10 7 5 0 1 6 1 8 2 2 10 0 2 4 6 6 9 2 8 6 0 6 7 6 1 0 4 1 3 1 6 4 5 8 5 9 4 5 4 4 8 2 3 3 1 2 4 2 6 3 10 7 3 3 3 9 0 7 8 2 3 3 1 6 9 6 0 6 7 3 0 4 5 5 7 9 6 5 2 2 9 10 4 1 10 6 7 0 3 6 2 0 5 0 1 [...]
+7 1 6 1 2 2 8 6 6 4 9 7 4 7 9 10 1 3 6 9 10 9 10 2 7 9 5 8 2 4 0 7 10 9 7 8 10 6 7 7 5 6 6 1 10 6 1 5 4 2 10 0 4 5 7 5 1 8 4 0 9 6 5 4 1 1 4 9 4 7 5 7 1 5 1 10 9 4 9 5 9 4 6 2 8 6 9 5 2 8 0 6 0 8 7 10 5 6 8 9 1 0 1 7 8 6 7 7 9 4 7 9 9 8 7 3 10 10 6 5 5 8 1 10 7 7 8 6 1 5 4 4 4 6 8 0 6 9 3 5 6 6 9 7 0 7 5 10 3 9 8 0 5 9 2 4 4 1 10 2 3 1 7 0 2 5 8 5 3 1 4 7 8 1 1 8 7 2 5 4 7 3 0 3 2 8 10 8 1 3 5 3 2 1 0 4 0 9 8 9 6 5 9 7 3 3 9 8 1 10 5 5 0 3 8 9 7 2 1 10 7 7 6 3 10 6 9 5 2 1 0 10 6 4 7 5 1 [...]
+0 0 5 2 0 5 8 10 9 0 0 4 3 1 0 4 5 9 2 4 8 8 8 9 5 4 6 4 7 6 10 1 0 10 0 2 7 7 9 4 8 10 2 4 9 2 1 3 10 10 10 8 5 7 0 9 2 5 8 7 6 2 10 5 6 5 1 2 7 6 9 5 2 1 10 3 2 7 3 7 4 2 5 7 3 3 10 9 6 4 1 9 0 7 6 3 10 8 3 1 8 5 3 6 7 1 9 7 5 6 8 2 9 3 1 9 10 2 5 2 2 6 0 9 1 8 8 0 8 2 9 7 8 4 4 1 5 9 7 5 1 2 4 6 9 1 10 10 3 6 4 2 4 0 1 10 1 6 5 1 0 4 9 5 0 10 6 4 9 2 5 9 1 3 10 7 8 1 5 3 4 4 10 0 9 6 2 5 3 3 0 2 7 1 8 2 4 6 6 2 5 6 5 10 10 9 1 2 1 2 7 10 2 8 7 6 4 0 0 3 3 9 4 10 0 5 6 5 7 9 5 0 8 7 1  [...]
+1 10 6 9 1 3 10 6 8 10 0 3 2 8 9 1 7 10 7 3 6 3 8 10 5 4 9 3 3 7 0 7 0 1 10 9 1 3 3 7 1 2 9 8 5 5 7 5 8 0 5 4 6 0 7 2 5 3 1 9 2 5 4 10 3 4 0 9 7 8 5 9 5 8 7 7 8 8 7 9 8 5 1 0 2 5 1 5 6 10 6 1 0 5 9 9 5 7 1 3 5 2 4 0 3 1 6 5 2 1 3 7 9 1 4 5 10 8 7 7 3 4 2 6 0 5 1 8 2 7 0 7 1 1 7 7 8 0 0 1 7 10 10 8 1 1 7 2 9 0 6 10 10 6 6 0 9 7 8 9 9 10 3 10 3 7 8 2 4 2 4 8 7 10 3 1 7 1 4 5 0 10 8 3 1 5 7 1 0 4 6 2 8 3 6 1 3 2 4 3 10 4 4 3 3 0 9 2 0 9 8 7 5 7 3 9 0 6 5 5 0 7 1 5 7 3 10 1 2 4 1 9 10 2 7 2  [...]
+1 6 2 0 4 7 8 8 1 6 4 8 9 4 0 8 0 4 10 9 6 10 1 2 3 2 2 3 5 8 2 9 5 9 4 7 3 10 9 5 1 7 8 0 10 9 3 6 9 2 10 2 2 7 4 4 4 5 7 10 10 6 4 1 6 7 8 7 9 10 4 3 0 0 8 5 4 2 8 7 0 8 6 10 8 4 5 7 3 0 2 9 3 7 4 9 5 5 10 0 4 1 8 2 7 0 4 9 9 3 9 6 3 5 10 6 10 7 4 3 3 8 6 3 8 8 6 2 6 1 9 7 7 4 10 9 6 9 10 0 0 7 5 7 10 2 9 1 8 0 5 4 1 3 7 6 0 9 6 7 6 8 3 2 9 2 0 8 6 6 5 9 6 6 3 7 8 3 6 0 0 0 2 8 9 9 3 6 1 9 6 3 0 8 2 1 9 0 1 4 0 7 6 0 5 5 3 8 2 2 5 10 6 3 8 3 10 8 5 2 5 1 9 0 0 2 6 10 3 4 5 2 0 3 4 3 4  [...]
+3 4 7 8 1 0 7 4 9 10 0 10 4 2 5 3 1 1 7 0 2 4 3 7 10 10 2 0 6 10 3 7 7 2 3 1 8 8 5 6 2 2 9 10 3 7 0 8 1 9 1 6 8 3 7 6 5 4 5 9 2 4 9 1 3 9 9 8 7 0 3 1 2 4 9 7 7 10 6 2 0 4 10 10 2 0 6 0 9 1 4 3 0 7 5 9 9 6 9 9 6 2 2 8 1 1 7 6 4 5 5 6 6 2 6 5 6 1 5 5 1 10 0 10 1 8 4 8 7 1 3 0 2 8 4 5 2 1 4 5 1 7 4 1 10 9 6 5 5 4 6 3 6 6 3 5 10 9 7 10 10 4 8 7 0 8 4 5 3 8 0 4 1 5 8 8 10 9 6 9 0 10 5 10 9 4 2 3 1 4 8 7 7 4 0 9 1 9 4 3 0 5 4 8 7 5 2 0 5 4 3 8 9 6 7 3 4 10 3 5 3 3 3 3 10 10 5 2 4 2 2 9 6 6 8 1 [...]
+0 6 2 0 6 1 7 3 2 3 4 3 8 10 10 9 5 9 8 0 10 5 8 3 1 1 9 7 8 6 8 9 10 7 6 0 2 5 3 9 10 0 9 3 8 6 3 8 1 9 10 8 9 1 8 8 1 7 1 6 0 2 5 2 4 8 8 0 10 9 7 3 6 9 1 3 3 6 1 0 9 8 8 5 8 4 8 3 0 5 8 6 7 2 5 6 7 9 6 6 6 0 9 6 8 0 2 0 6 1 10 3 1 5 4 9 0 0 9 5 5 2 2 10 0 6 10 0 3 3 10 3 4 8 4 10 1 4 4 6 9 1 5 6 8 3 10 7 10 6 3 0 5 1 10 6 7 5 7 1 4 9 1 3 1 10 2 9 2 9 3 7 8 8 0 3 6 6 3 5 2 2 3 2 2 0 0 10 3 7 9 1 3 4 2 7 3 5 6 8 5 7 0 2 1 9 4 3 10 6 2 8 1 7 6 6 3 1 9 7 0 2 10 4 10 1 6 6 0 3 3 2 6 3 6 9  [...]
+7 3 2 4 5 4 0 2 0 7 4 8 9 9 4 1 9 3 1 9 5 6 4 2 7 10 5 1 10 4 7 3 10 3 1 9 8 0 1 10 2 3 6 7 5 9 8 3 9 0 9 9 0 3 10 8 9 7 10 8 9 4 4 9 8 4 9 7 8 7 9 1 10 9 10 9 1 7 1 7 7 9 8 9 5 7 1 6 8 5 1 2 4 9 9 0 7 4 0 10 1 2 0 5 5 6 7 2 3 1 0 6 7 2 0 10 9 3 2 0 10 6 3 1 8 0 6 8 0 6 5 10 6 2 0 7 4 9 9 10 2 6 0 8 10 1 10 1 5 3 9 4 10 8 4 2 10 10 8 3 7 10 7 3 7 10 2 3 2 5 5 0 10 2 6 10 8 8 9 10 2 5 6 0 4 1 2 9 0 4 2 0 7 0 3 0 0 3 0 2 3 10 5 6 8 7 10 3 9 0 5 4 1 9 10 6 10 5 9 2 5 4 2 7 5 5 7 2 6 6 3 0 2 [...]
+10 5 3 9 3 5 4 4 3 1 4 3 8 3 9 9 2 6 0 8 7 8 3 10 8 1 1 6 7 2 8 5 4 8 10 0 2 10 10 7 8 2 8 5 5 4 9 10 8 6 10 2 5 6 6 3 6 8 5 1 0 9 0 1 9 4 2 10 7 9 8 9 3 3 2 5 3 9 5 3 3 5 7 0 9 3 4 4 1 9 9 7 10 4 1 8 2 5 0 10 0 6 5 0 5 7 4 6 9 3 5 2 9 7 7 0 2 6 7 9 0 0 2 10 4 3 4 1 3 1 6 9 0 1 4 6 6 1 3 5 6 1 7 0 2 3 0 7 6 9 9 0 1 2 8 4 9 2 6 3 2 6 5 2 10 6 3 8 7 1 3 5 5 2 6 6 3 3 7 6 3 8 1 5 3 4 4 2 1 0 5 4 3 4 4 7 9 4 3 8 4 0 8 9 10 8 8 6 6 0 6 4 3 6 1 6 10 5 8 0 1 0 9 2 9 6 2 2 4 9 1 3 1 2 1 10 3 0 1 [...]
+3 5 5 2 6 7 6 10 4 10 10 0 8 6 0 10 6 5 6 6 2 7 6 7 6 0 0 1 9 6 4 9 10 10 6 1 6 0 0 8 5 9 2 9 1 1 5 4 1 4 3 6 9 9 10 2 3 6 3 4 1 9 1 3 10 8 7 7 1 8 3 5 5 2 4 10 7 1 9 6 7 1 2 9 8 7 3 0 3 8 0 10 2 8 3 9 10 6 6 2 1 4 5 0 10 7 6 7 8 4 4 4 1 9 8 5 6 5 10 6 3 10 9 8 1 7 4 7 4 6 5 2 9 10 9 4 2 9 3 5 1 4 0 4 8 2 4 4 2 1 5 0 9 6 5 8 2 4 3 5 3 9 1 5 7 8 8 9 0 3 5 1 10 5 6 5 10 0 8 9 8 3 10 1 2 0 7 8 5 8 6 9 10 10 3 5 0 0 1 10 3 0 3 2 10 6 1 6 1 8 8 8 6 9 7 6 3 8 7 4 6 0 2 9 5 2 9 3 4 6 4 9 4 2 6  [...]
+4 9 5 0 7 1 6 5 9 10 3 5 4 6 2 6 0 1 5 5 1 7 4 10 5 5 6 8 2 3 9 2 6 5 1 1 5 1 0 3 2 8 1 1 6 8 8 2 1 8 5 3 3 9 0 8 9 8 6 10 4 4 2 1 6 2 9 7 3 3 4 6 0 9 3 7 9 2 6 4 1 8 5 5 8 2 8 9 1 3 9 10 5 2 0 2 9 10 3 10 3 1 5 0 6 2 10 5 3 7 6 9 2 2 8 7 0 0 3 5 5 2 2 9 8 3 4 2 9 8 7 5 6 10 5 7 3 0 2 6 0 5 6 8 10 7 10 10 4 8 9 9 3 7 3 9 7 9 5 6 10 9 8 6 9 0 9 7 10 8 9 9 5 4 5 7 10 1 1 0 9 5 2 8 2 4 0 2 3 8 6 2 10 5 2 3 9 0 4 6 1 3 5 4 6 0 10 6 5 3 7 0 2 1 8 10 3 7 1 5 7 0 1 4 4 0 5 9 5 7 3 10 10 2 7 0 8 [...]
+3 6 2 6 1 7 7 10 4 6 8 4 10 10 5 5 2 1 9 9 8 7 5 4 3 5 10 10 1 3 2 4 10 9 8 2 0 3 9 6 9 10 6 10 10 8 5 7 0 2 5 4 9 6 9 4 9 1 1 3 0 7 9 5 1 9 1 4 8 1 10 8 8 0 9 1 6 0 5 0 2 4 1 5 9 6 8 8 5 8 2 6 0 1 8 1 3 3 0 7 5 6 3 7 4 5 8 1 4 2 7 3 2 10 8 4 2 9 0 10 0 5 4 4 0 4 2 2 7 6 2 9 4 10 1 4 7 4 1 2 5 9 3 2 4 9 8 10 9 1 4 10 6 10 1 2 9 7 1 5 6 6 2 5 6 5 5 1 7 5 7 9 2 2 4 10 4 7 6 1 9 8 0 9 7 4 0 7 7 2 3 1 7 4 9 7 10 7 6 2 2 2 8 4 8 7 4 10 5 9 7 7 6 2 0 10 7 8 1 1 10 10 8 5 3 2 6 8 1 1 2 5 0 1 1  [...]
+9 4 1 6 6 5 1 4 7 7 4 5 1 3 2 0 8 10 0 3 1 10 2 2 8 7 1 0 1 5 6 10 2 7 6 9 10 4 5 2 6 10 8 6 7 4 10 8 4 3 3 0 2 5 0 10 1 7 9 9 3 3 10 3 4 1 4 0 1 2 0 8 5 9 10 5 4 9 5 7 5 6 5 8 2 7 0 7 5 2 5 10 4 8 1 0 0 6 3 10 5 3 5 4 2 7 3 1 8 4 6 1 6 7 3 4 7 9 10 10 7 5 5 0 9 0 8 10 0 9 2 7 0 6 2 0 2 10 6 1 2 8 7 0 9 5 2 7 5 8 10 4 7 9 10 1 3 0 2 2 4 2 7 6 0 8 8 5 5 1 2 4 9 4 0 6 0 3 5 6 5 7 1 6 5 7 9 2 10 2 7 5 3 8 6 8 5 3 7 8 10 8 3 2 4 8 9 5 3 6 1 1 9 0 2 2 10 2 5 1 4 0 8 3 7 9 1 8 8 4 4 2 8 1 6 1  [...]
+3 2 8 0 10 10 5 4 6 6 9 0 1 8 3 0 10 1 8 2 0 9 1 9 2 2 3 6 0 2 6 2 6 6 2 6 3 7 1 8 5 4 8 7 8 2 8 0 4 5 0 7 10 10 4 3 3 4 7 9 0 10 2 0 10 1 1 10 2 7 3 6 10 7 0 10 2 7 1 3 5 9 3 2 2 0 5 4 10 0 9 5 6 10 6 5 7 10 3 5 7 4 1 5 3 1 5 4 1 7 0 5 0 3 4 6 3 0 1 8 3 3 2 2 9 0 2 7 7 2 2 7 6 1 8 7 4 6 3 4 3 6 6 9 3 6 2 0 4 1 8 6 6 10 5 4 3 6 0 7 6 6 2 9 5 8 2 8 9 3 6 2 9 7 6 7 4 5 10 3 10 4 8 0 0 3 1 8 9 5 2 7 0 8 7 10 4 1 9 2 8 10 0 2 7 10 6 1 8 8 0 4 10 2 1 1 1 4 7 7 3 1 1 1 9 6 2 10 2 4 4 6 3 7 3 1 [...]
+7 8 5 6 4 8 5 0 2 7 0 9 3 6 6 0 4 5 10 10 4 2 8 1 8 3 6 10 5 3 6 8 1 0 3 3 8 8 2 7 9 10 3 7 10 3 8 7 5 1 7 8 7 2 1 1 0 9 8 0 6 1 3 5 5 2 7 2 5 4 10 3 10 5 2 8 5 5 0 7 3 2 10 6 9 2 0 1 3 4 4 8 7 2 6 10 0 3 9 8 6 10 0 2 7 4 5 0 10 6 8 4 1 5 5 7 5 1 1 6 9 8 10 10 3 4 9 0 8 7 1 2 2 4 5 1 6 9 7 7 2 3 10 2 10 4 6 3 3 7 1 9 10 10 8 3 9 4 6 6 1 5 10 4 8 1 1 6 4 8 4 2 10 5 6 9 5 2 1 9 1 0 0 4 1 3 7 8 0 0 10 4 0 6 3 1 7 0 4 0 10 1 10 7 8 10 8 7 5 0 8 0 1 10 7 2 10 0 8 0 8 2 5 6 3 6 0 0 3 7 5 5 0 4 [...]
+1 7 10 1 4 3 9 3 3 1 10 0 3 0 5 8 0 7 5 1 1 4 2 3 2 8 1 2 8 7 4 6 4 0 1 1 7 5 2 8 0 2 4 0 2 10 9 1 5 4 9 2 9 1 6 2 5 8 8 1 10 6 1 2 7 6 1 1 4 2 2 0 9 0 5 6 0 4 10 10 8 1 3 1 9 0 2 3 1 2 7 8 2 1 8 9 10 1 7 6 9 9 7 7 8 9 2 0 2 8 10 2 10 5 5 7 0 0 2 9 3 3 4 4 0 8 5 3 1 4 7 8 8 4 6 0 5 8 4 3 6 1 3 0 9 6 4 2 2 6 5 2 4 9 1 3 8 7 5 7 2 2 3 4 1 0 9 5 10 8 3 3 7 5 5 8 2 7 6 3 0 3 6 6 7 6 0 0 9 5 5 3 4 2 1 7 8 1 4 2 1 10 5 10 5 1 9 3 8 8 2 4 6 8 7 8 4 1 2 3 8 5 1 2 2 9 8 10 3 6 0 10 4 0 6 2 0 10 2 [...]
+4 8 7 2 10 9 1 4 2 10 10 2 1 10 5 8 1 7 3 1 4 1 2 2 3 2 9 5 10 6 9 9 5 5 4 2 6 4 10 0 5 6 7 9 2 3 1 8 0 7 9 5 8 4 2 5 2 3 6 8 8 1 7 0 8 0 8 10 3 0 7 3 2 0 9 3 7 5 7 8 5 10 8 0 9 2 9 4 9 10 7 0 10 2 8 5 7 0 8 8 8 10 0 1 1 5 2 2 8 9 4 7 3 6 7 0 4 4 8 0 8 7 9 1 5 8 10 6 10 4 7 2 4 0 0 5 10 4 3 1 8 3 1 3 9 3 6 8 10 0 8 0 3 2 2 1 3 7 5 0 3 5 6 3 5 7 6 8 0 4 6 6 9 9 4 10 8 6 2 9 3 3 0 8 4 1 6 10 9 8 3 7 7 7 6 10 9 0 0 4 5 3 8 9 9 8 8 8 5 4 5 5 2 9 8 7 6 4 8 5 6 9 9 5 8 9 5 9 2 4 2 9 6 7 5 8 3  [...]
+10 7 3 10 6 10 3 7 0 5 2 2 4 4 9 3 8 4 7 7 7 7 10 6 6 1 10 2 3 9 2 7 7 7 10 9 9 3 0 8 9 5 7 2 8 9 10 7 1 4 0 7 5 1 1 5 3 3 6 10 10 4 10 10 10 2 9 0 8 2 7 10 0 9 5 7 6 0 2 10 9 8 9 3 2 1 6 1 9 1 4 6 6 4 8 4 1 7 8 8 5 1 0 6 5 1 3 8 0 5 10 9 6 4 10 3 6 9 9 5 8 3 1 6 10 10 5 3 6 6 7 9 10 3 5 3 3 3 10 9 1 4 7 0 2 9 4 2 1 4 5 5 6 1 4 10 1 2 1 4 4 7 10 2 10 3 7 2 4 2 0 6 1 6 3 9 5 2 4 1 6 9 8 10 5 4 10 5 3 8 8 4 7 9 6 1 10 1 9 5 5 6 3 3 9 6 8 6 7 6 1 4 10 1 3 9 3 4 6 4 0 10 2 2 4 7 10 7 0 8 7 9 [...]
+1 3 9 6 9 5 7 4 9 3 9 9 2 5 10 8 3 8 5 10 2 3 5 1 9 3 1 3 4 2 2 2 6 5 2 0 2 10 10 10 1 4 4 4 0 2 10 4 2 5 5 0 0 0 9 0 1 4 1 4 10 2 3 3 10 5 0 3 4 9 1 2 0 7 1 7 7 9 2 5 6 7 6 3 9 3 3 0 8 7 6 2 5 7 9 8 4 9 7 1 4 10 1 9 2 7 5 7 4 8 8 1 7 6 8 4 6 2 1 2 5 6 5 7 9 1 8 4 0 10 10 10 4 2 2 3 0 1 8 0 7 1 7 10 6 9 7 8 6 7 5 1 5 4 1 6 7 1 6 2 10 7 7 1 6 0 3 0 6 6 7 10 0 3 0 0 8 3 9 8 8 4 7 0 5 9 0 10 2 0 8 7 3 7 6 5 9 0 7 2 6 9 3 7 2 0 10 4 7 3 9 1 4 4 2 1 2 6 5 5 6 2 5 3 4 9 10 7 8 7 8 3 0 4 9 5 0  [...]
+9 9 0 6 8 9 7 4 10 3 3 6 1 0 8 10 10 5 3 0 0 10 0 4 9 8 3 7 2 7 5 9 2 9 4 8 2 9 10 0 3 0 6 10 2 6 7 0 4 6 5 1 9 3 0 9 3 7 7 10 3 7 8 5 10 1 0 9 0 10 8 7 8 1 9 8 2 8 9 10 10 9 5 5 1 2 4 8 6 8 4 1 2 7 0 8 1 5 5 8 7 4 0 10 4 7 5 10 5 8 5 9 3 8 0 1 4 1 10 10 6 10 5 7 5 3 6 4 10 2 6 1 2 4 1 10 10 0 5 2 3 4 9 1 10 5 0 0 7 2 2 8 6 10 4 7 3 6 7 9 9 5 0 2 6 5 4 0 1 7 0 6 10 9 1 7 9 1 1 9 8 6 3 3 8 4 9 0 6 2 3 4 8 7 4 4 9 10 2 9 5 9 6 3 9 7 3 2 8 10 6 7 0 2 4 4 9 6 2 9 10 5 8 3 6 8 7 0 0 4 5 6 0 9 [...]
+10 6 10 5 4 6 10 8 0 1 1 9 9 9 0 1 8 6 0 8 6 4 1 4 8 5 3 8 6 7 0 6 5 0 0 0 5 0 2 0 1 5 7 1 7 1 7 1 2 4 9 6 9 1 3 0 5 4 1 3 3 3 10 6 6 8 7 1 9 5 8 0 9 9 1 1 10 9 9 5 9 7 9 0 10 9 3 0 5 2 6 2 9 5 4 6 0 5 6 1 4 1 0 8 2 3 6 5 7 4 3 8 1 2 7 0 10 4 4 6 8 0 8 8 4 7 9 2 3 7 2 4 6 5 6 5 5 1 5 8 2 0 0 3 8 0 4 6 0 1 6 4 2 2 4 6 2 9 7 3 10 2 8 0 10 1 8 4 2 10 9 8 7 9 0 9 5 7 4 5 8 6 5 10 9 4 1 8 4 10 2 1 1 1 1 9 9 1 5 9 2 10 6 5 8 5 6 6 1 9 4 1 10 1 2 5 2 1 0 10 7 9 9 10 7 10 8 5 1 9 4 7 6 10 6 1 5  [...]
+9 0 7 6 2 3 8 6 5 5 5 9 8 1 5 3 1 9 10 6 10 2 1 9 4 2 0 1 8 5 3 4 4 6 1 8 4 5 9 0 2 4 9 10 10 0 6 7 3 2 3 7 10 4 5 9 4 6 4 5 10 7 10 5 8 8 6 0 2 6 10 6 0 6 0 1 5 0 10 0 0 7 1 5 5 8 2 8 1 5 7 1 0 2 6 1 0 9 10 3 2 7 0 2 2 10 3 4 7 0 7 6 5 0 1 8 1 10 7 3 9 9 0 9 9 10 1 10 4 3 4 5 4 7 6 6 9 9 5 2 4 3 1 8 3 8 9 2 10 3 3 3 3 1 10 0 8 2 2 5 6 9 5 9 6 5 6 4 10 8 5 5 10 7 2 0 3 2 3 2 8 5 6 5 1 6 0 9 5 5 6 0 6 6 10 9 10 8 4 6 6 7 10 2 1 9 5 8 4 1 7 1 2 9 5 3 4 3 8 10 8 4 9 2 2 8 2 4 0 5 4 1 2 3 6  [...]
+5 7 1 4 6 3 9 10 9 0 4 4 7 9 5 9 2 10 10 2 10 5 5 0 6 7 8 10 10 1 1 4 5 9 4 4 5 2 5 6 8 0 10 8 6 10 4 6 8 2 2 1 6 10 10 1 8 3 3 5 7 6 0 5 4 4 1 4 10 1 8 0 4 9 3 0 4 6 0 7 10 2 2 0 0 3 7 1 8 5 8 7 1 1 6 4 5 2 1 5 7 3 7 7 2 5 8 3 7 3 8 1 1 10 0 8 7 5 1 8 4 6 7 5 6 3 7 5 4 2 6 0 3 4 10 5 3 1 8 5 10 9 10 9 4 6 8 10 8 5 3 3 10 1 5 7 6 8 7 7 1 1 10 6 2 5 3 9 8 8 4 1 10 8 4 4 9 3 2 7 10 5 2 0 7 2 3 2 10 6 1 9 0 8 7 10 9 1 5 10 5 9 3 9 6 3 5 7 6 0 0 7 7 8 3 5 2 1 3 8 5 7 7 9 7 5 8 7 2 9 9 3 10 4 [...]
+10 6 8 8 2 7 0 2 8 1 4 8 9 4 5 10 9 2 8 2 6 9 8 2 5 3 0 1 1 10 0 9 10 3 10 6 2 4 2 8 5 2 1 10 4 2 4 8 7 0 3 6 8 4 8 10 7 1 1 6 8 3 7 10 1 1 5 5 7 6 6 3 1 10 4 5 2 3 0 6 5 2 4 2 10 6 7 4 9 7 8 5 6 0 1 1 7 0 4 1 10 2 4 8 9 9 3 7 9 3 1 10 3 7 9 4 7 8 6 5 5 4 5 6 1 2 0 9 4 5 3 10 0 3 2 4 0 5 2 10 1 6 6 7 8 3 3 7 4 7 8 4 9 0 7 3 0 10 6 8 6 4 7 4 7 2 1 7 0 10 1 2 2 7 1 7 4 3 0 9 4 7 5 5 8 4 1 1 3 2 9 4 10 2 10 10 0 1 2 7 2 1 2 9 4 4 6 9 0 6 5 7 9 8 6 5 10 8 9 5 1 4 3 2 8 1 8 10 2 5 0 8 8 4 9 2 [...]
+0 4 4 1 6 3 2 4 0 1 9 5 1 7 10 0 0 2 0 7 0 1 5 1 4 5 5 9 8 0 1 1 3 8 4 2 9 1 1 8 5 7 10 10 1 2 2 1 8 6 2 0 7 1 3 9 0 6 0 4 2 1 1 4 7 5 7 6 10 4 2 8 7 3 1 9 0 10 2 9 4 4 6 8 8 1 8 3 4 9 10 10 1 6 10 3 4 6 5 10 8 3 6 3 3 2 5 6 0 3 0 3 8 9 7 9 1 4 5 3 7 1 2 4 1 0 7 9 6 2 2 8 8 6 4 5 8 10 0 10 5 4 8 1 9 8 6 6 7 4 7 1 10 5 3 6 4 8 8 2 6 9 4 8 7 2 5 7 8 4 3 2 3 1 9 4 10 6 7 4 6 4 10 1 2 0 9 8 10 10 4 5 2 5 3 1 6 7 4 7 8 9 6 5 3 7 6 8 3 9 6 2 3 8 6 9 2 10 8 9 0 10 5 4 10 10 5 7 7 2 8 2 10 7 5 3 [...]
+6 1 6 8 0 4 5 2 2 4 1 1 6 5 2 2 7 10 7 8 9 9 5 9 8 9 9 2 9 5 0 5 10 8 1 5 8 0 5 3 1 4 1 5 7 2 5 6 8 9 1 1 0 9 5 5 1 2 4 2 10 10 8 8 0 10 5 3 9 10 4 0 2 3 1 9 10 9 7 0 10 7 0 1 8 9 7 1 2 5 2 4 7 10 10 2 2 4 2 8 6 8 3 3 8 2 5 4 10 5 2 10 2 8 3 0 5 6 4 5 3 8 3 3 10 5 4 8 8 3 6 8 0 0 8 7 0 7 9 7 3 2 5 3 0 2 8 2 1 3 10 0 9 0 10 5 8 7 8 6 4 5 0 8 10 0 9 0 6 0 0 5 7 0 4 1 1 5 9 8 9 4 8 7 0 3 8 6 3 5 0 7 9 0 4 4 9 5 5 4 8 1 6 2 7 0 6 7 5 8 9 9 10 6 5 0 8 5 5 0 10 3 3 7 7 2 3 6 9 4 1 8 10 2 6 7 7 [...]
+10 6 7 6 9 8 7 4 3 6 2 4 10 6 10 5 5 10 9 3 8 6 8 10 5 2 2 8 7 2 3 5 5 5 7 9 10 2 9 10 7 1 9 7 7 7 5 7 4 9 9 1 8 7 5 0 9 5 4 6 2 5 2 10 10 8 9 7 10 6 3 7 9 9 0 5 3 0 2 6 6 6 7 0 4 9 7 10 5 10 10 10 4 9 1 3 9 5 7 8 0 1 8 1 3 9 3 5 2 5 2 8 6 0 5 9 7 6 7 1 2 4 0 6 8 8 1 4 3 10 3 7 3 1 5 0 10 4 7 4 9 4 9 1 0 3 9 10 0 3 5 10 7 4 5 6 9 7 0 2 6 2 3 6 2 2 4 0 8 10 2 10 4 2 5 1 3 6 1 8 3 10 10 5 0 2 9 8 9 2 5 2 0 0 5 0 6 7 8 7 10 1 9 3 8 9 2 9 7 3 5 4 0 4 2 2 10 4 8 8 9 5 5 1 2 1 3 7 10 2 10 8 8  [...]
+3 7 10 10 10 7 5 6 5 0 2 0 8 5 1 10 7 2 5 8 3 9 9 8 3 5 7 4 3 3 5 9 0 3 1 7 9 4 7 5 2 8 9 9 2 4 4 6 1 5 7 10 2 9 5 1 2 2 6 2 2 7 2 4 6 9 6 6 9 4 5 10 4 2 1 2 6 4 10 8 2 7 4 4 5 4 1 10 6 1 10 1 7 3 0 4 1 10 6 8 3 2 2 10 5 7 1 6 8 2 3 10 3 8 5 9 2 1 10 1 9 8 8 1 4 5 0 2 5 10 6 6 7 2 5 7 7 3 8 4 6 2 9 3 1 3 0 1 8 6 5 2 2 6 2 2 6 8 3 4 3 3 1 5 1 2 9 10 1 5 0 2 4 6 7 7 0 8 2 7 4 5 8 9 4 8 5 5 3 10 5 7 7 2 3 7 9 7 8 4 6 5 4 9 8 8 3 9 6 3 8 3 5 5 10 1 1 4 4 5 7 9 2 3 2 7 9 10 9 2 10 1 10 0 2 4  [...]
+5 3 0 1 3 9 10 9 5 8 2 10 10 10 3 3 8 5 4 3 9 6 8 9 10 4 1 10 10 4 2 2 3 10 1 6 9 5 10 9 0 5 6 6 6 9 7 0 5 0 0 10 2 1 4 3 10 4 6 4 6 5 7 10 5 1 1 5 10 5 9 4 8 7 7 9 10 6 6 7 10 9 9 1 9 4 4 2 6 5 7 9 2 9 3 2 5 8 10 8 9 4 2 0 10 7 8 4 5 7 9 8 2 2 4 5 1 2 9 2 9 3 7 3 10 9 10 2 8 6 8 5 0 10 9 7 7 4 7 3 0 8 10 9 10 10 8 6 3 9 2 0 5 3 10 4 7 5 6 1 3 10 7 3 6 2 8 4 6 1 2 6 6 1 7 6 1 3 5 10 0 2 5 10 8 4 9 5 1 0 0 7 9 10 6 4 7 0 5 1 1 1 2 10 10 7 7 10 1 9 7 0 6 0 2 2 6 0 0 9 2 10 5 9 8 7 6 6 9 3  [...]
+2 0 10 4 7 5 7 9 9 10 2 3 2 5 7 10 4 2 6 9 3 10 8 4 10 4 7 3 2 6 1 1 3 2 9 9 10 5 8 2 10 5 5 10 3 3 5 10 4 2 5 0 4 10 7 4 0 0 6 1 5 1 3 9 5 5 1 10 7 1 2 6 3 8 2 8 8 8 4 1 4 5 2 8 7 0 8 9 0 1 1 10 0 8 2 5 3 10 6 5 0 8 7 7 4 8 9 8 10 4 1 0 8 5 8 4 8 9 5 9 1 5 0 0 5 9 4 10 7 5 2 7 8 8 8 5 4 8 5 0 3 0 3 7 4 0 7 9 1 9 8 7 10 6 6 4 8 0 0 5 10 1 3 8 4 10 1 1 7 8 0 8 8 10 1 7 8 8 5 1 3 7 3 8 5 5 10 5 9 4 0 9 9 8 9 1 5 2 0 10 5 2 10 7 5 4 0 0 4 8 6 10 1 6 8 7 2 4 8 10 0 6 8 0 5 4 2 3 7 9 2 4 5 6  [...]
+2 10 10 5 10 2 9 9 4 3 8 1 4 5 2 0 0 1 8 0 10 7 5 2 1 10 3 2 5 8 10 9 9 9 0 10 10 8 1 2 1 1 2 1 9 2 7 10 1 3 10 5 3 4 3 6 6 8 7 1 1 2 5 1 10 8 0 6 6 6 2 1 10 9 2 5 4 8 10 7 5 8 6 3 6 1 3 9 7 5 6 4 5 6 1 3 8 9 10 4 8 6 4 9 4 3 6 7 3 2 1 1 8 4 6 8 9 2 3 5 5 0 3 2 2 8 6 9 8 2 4 7 10 8 5 0 1 5 5 7 2 9 8 9 7 0 9 6 1 10 3 10 1 8 6 10 1 0 10 0 6 1 2 2 0 8 8 6 2 3 5 3 0 0 0 10 7 0 4 4 3 3 4 2 3 10 4 9 0 6 7 3 10 8 10 7 7 4 3 2 4 7 2 10 10 6 0 7 7 9 10 3 0 1 5 5 0 6 6 6 1 5 9 2 2 0 2 5 3 10 10 8  [...]
+4 3 1 3 4 1 8 3 4 7 0 6 8 5 1 2 6 4 0 9 8 8 8 2 9 5 5 6 2 5 3 9 3 8 9 8 3 1 2 1 2 5 3 9 3 1 9 5 5 6 0 4 6 0 4 2 8 10 5 5 1 2 9 5 10 6 1 0 2 9 7 1 0 6 7 2 9 5 4 5 3 3 0 4 6 4 8 6 3 10 4 0 0 4 2 9 10 6 6 2 9 2 2 10 9 1 0 4 8 7 1 3 4 9 3 5 8 8 5 9 0 4 10 8 3 7 1 6 3 6 8 0 7 4 5 4 3 1 10 7 8 4 3 4 0 2 9 4 1 4 1 4 0 10 3 5 0 10 5 4 4 4 8 8 4 1 9 1 2 6 3 10 10 4 7 5 9 7 7 5 0 7 0 9 10 2 4 3 10 5 7 5 9 7 7 9 3 10 10 8 0 9 9 6 5 1 6 7 0 1 10 8 4 2 10 7 6 10 2 8 2 1 9 3 1 5 10 4 4 4 4 8 2 3 2 1 8 [...]
+3 7 7 9 3 9 0 7 5 9 9 6 0 7 6 7 4 8 3 10 7 9 5 5 6 10 1 8 0 7 3 3 6 5 10 7 7 10 2 7 6 5 1 5 1 0 4 4 3 4 5 9 3 1 0 10 2 3 6 10 4 7 9 2 3 1 8 9 0 7 2 9 1 3 4 2 1 5 8 3 0 5 10 4 3 6 5 4 10 7 3 6 4 8 7 7 0 2 8 0 0 4 3 1 0 1 9 9 4 8 9 2 4 7 10 10 6 8 0 4 4 7 5 5 8 9 2 10 6 8 5 1 8 4 7 8 10 3 5 4 6 1 10 9 1 7 0 2 3 1 4 8 9 7 6 10 7 0 5 5 10 9 8 4 5 6 4 6 5 10 6 5 4 9 3 9 9 7 5 4 2 5 0 6 6 8 8 0 6 1 4 6 7 3 2 10 9 9 9 7 8 7 6 4 8 5 8 9 9 0 9 0 8 6 1 5 10 1 1 8 9 10 0 8 9 0 3 4 9 9 0 9 10 8 3 9  [...]
+9 9 4 6 10 1 10 9 10 0 7 8 7 0 1 7 7 4 2 4 10 2 4 7 7 9 4 6 1 3 5 4 7 4 4 1 5 10 5 3 8 4 8 4 1 8 0 7 1 4 3 4 4 10 7 2 0 8 1 6 2 8 0 0 10 4 5 3 3 3 4 4 5 3 0 3 4 0 5 1 0 8 10 5 3 9 9 2 3 3 1 3 7 2 6 8 6 0 5 5 10 5 10 1 7 8 10 5 4 10 5 9 1 9 1 7 6 3 4 9 7 2 1 8 5 9 6 5 0 7 8 4 0 8 5 10 4 9 7 6 10 7 4 0 0 7 5 9 5 8 9 3 1 1 8 8 5 3 1 10 8 6 9 8 1 10 6 2 0 7 0 4 2 10 10 5 2 3 1 8 6 2 9 3 5 3 5 7 1 8 5 4 4 9 1 0 10 2 9 2 6 3 3 5 10 8 7 7 1 8 6 0 6 3 6 2 7 1 10 10 6 7 9 7 5 5 1 6 0 9 3 10 1 1 7 [...]
+3 2 7 2 0 1 9 7 2 10 0 6 0 5 9 8 9 2 7 5 5 0 5 3 9 6 1 5 0 0 5 9 0 7 3 5 10 5 2 10 9 0 9 3 8 7 0 9 0 0 10 6 3 0 9 4 3 0 2 10 2 5 0 10 4 5 7 10 5 7 9 9 1 6 5 1 10 10 10 9 10 9 9 9 5 5 8 9 4 4 2 2 6 5 3 3 7 6 1 1 7 1 0 1 2 2 8 3 3 10 8 4 10 5 3 1 9 0 6 3 2 1 3 3 1 4 3 7 3 5 4 3 0 4 4 9 3 2 3 0 2 1 8 6 5 9 7 3 8 7 8 5 6 10 10 0 2 6 10 1 1 10 10 1 8 4 5 10 9 2 6 1 4 2 2 10 7 2 1 9 2 7 7 3 9 5 9 4 7 2 3 4 7 7 8 9 7 10 9 9 6 8 5 10 1 3 3 10 6 0 9 0 7 6 2 0 3 10 3 10 9 8 0 9 1 0 4 8 1 0 9 2 5 5 [...]
+2 0 9 10 0 8 1 3 4 0 10 4 5 2 7 3 9 5 8 8 5 2 8 9 8 4 6 8 5 1 4 6 7 10 10 9 7 10 9 8 0 7 1 0 10 5 4 8 1 2 8 4 6 4 6 2 10 1 2 6 5 1 1 1 4 0 4 9 8 8 10 8 7 4 7 10 10 1 6 10 7 7 5 1 1 10 9 9 5 3 10 9 9 1 9 7 5 1 5 1 10 7 7 9 6 9 1 2 9 2 2 2 0 3 5 5 6 2 9 10 5 1 6 2 6 0 8 1 8 4 0 5 8 4 0 10 9 6 4 10 5 7 7 5 8 3 0 5 9 6 5 2 2 1 6 2 8 3 9 3 8 4 4 10 5 2 2 6 7 3 10 9 5 8 6 5 0 7 2 0 7 6 6 8 2 6 10 7 10 4 2 4 9 4 10 10 9 6 6 3 0 8 9 4 5 1 10 9 3 4 5 8 2 6 1 10 8 8 6 3 3 9 9 5 10 5 9 6 4 7 6 1 9  [...]
+0 8 10 8 6 1 8 10 1 5 5 10 7 6 4 0 8 7 0 3 10 4 1 7 5 1 7 6 4 6 9 6 4 10 3 0 7 8 2 6 4 0 2 3 4 8 0 4 1 3 6 6 5 10 4 1 5 0 0 6 1 0 7 4 2 4 7 6 3 1 9 9 4 4 9 9 0 8 6 10 10 8 6 2 9 6 4 6 0 5 9 0 4 9 9 2 1 5 3 3 0 3 7 4 5 5 2 4 7 1 0 5 0 10 8 3 7 5 8 8 5 9 10 10 4 8 1 9 7 10 10 7 0 0 0 10 7 6 10 7 4 1 2 9 8 8 1 8 6 5 1 1 0 0 8 0 4 7 2 0 8 1 8 10 7 1 6 4 7 8 9 5 8 2 10 3 2 2 3 4 4 9 1 8 5 6 6 6 2 1 2 7 0 3 5 9 1 4 3 4 6 8 5 8 4 1 4 2 1 9 4 0 3 1 3 8 10 2 0 0 10 6 1 6 5 9 6 10 6 7 1 2 6 10 0 8 [...]
+9 6 1 8 0 1 4 7 10 0 7 5 8 6 0 5 6 0 8 3 7 0 0 2 8 9 7 6 8 6 0 2 6 7 2 0 9 0 1 10 2 8 6 4 5 8 5 5 7 10 7 6 1 5 7 7 2 9 1 10 5 9 4 7 7 2 1 3 9 0 10 4 6 7 10 10 4 5 3 1 2 3 2 10 6 5 10 9 5 0 2 10 2 3 6 2 4 8 5 8 9 10 9 8 6 10 7 4 8 6 3 6 9 8 9 0 8 0 8 2 10 6 9 3 10 1 0 0 10 10 0 9 9 0 7 9 8 8 4 5 2 8 4 3 6 7 1 3 10 3 3 1 8 8 5 8 9 3 4 0 4 2 3 2 5 9 7 3 2 6 4 2 8 7 8 8 4 10 8 9 0 0 6 3 6 3 0 7 5 2 5 4 2 0 6 5 8 6 0 7 10 7 6 7 5 6 10 10 10 4 1 8 9 0 2 5 5 3 10 6 8 1 6 6 10 1 0 8 4 5 3 3 3 0  [...]
+3 9 7 7 6 6 8 8 10 9 9 2 5 3 8 1 3 7 7 10 7 5 6 10 1 5 0 7 6 1 5 1 0 4 7 1 3 7 4 4 7 8 9 7 7 2 1 7 2 10 4 1 0 10 10 5 2 6 4 5 6 4 1 2 3 9 2 0 5 6 0 8 8 4 9 2 7 0 4 6 6 4 1 0 1 10 0 8 2 5 6 4 8 10 3 3 9 10 10 8 6 1 6 1 6 2 2 4 1 7 10 3 6 3 3 0 3 9 7 0 2 4 0 3 6 10 4 6 0 2 4 4 10 0 3 4 2 7 0 2 3 5 6 6 9 9 0 3 2 0 4 4 5 4 6 5 6 3 7 3 6 0 10 9 10 2 4 1 0 7 5 8 0 1 3 4 9 9 3 0 8 0 1 0 2 5 3 4 8 8 5 0 2 6 5 1 5 2 10 7 9 8 0 3 7 8 0 9 1 6 1 9 9 5 8 6 8 2 3 1 5 9 4 0 6 9 4 3 1 2 8 7 0 7 3 2 9 7  [...]
+7 0 0 10 1 10 8 1 8 7 5 3 7 10 6 5 0 5 7 9 0 9 2 7 7 4 1 4 6 2 4 7 10 10 4 3 6 8 2 3 7 6 10 5 5 8 10 10 5 5 0 0 0 2 4 6 7 0 5 10 1 10 4 7 9 8 1 1 7 10 4 6 7 8 9 9 1 0 5 7 0 8 6 4 6 3 6 2 0 7 8 6 0 8 4 0 5 9 5 4 6 7 9 2 10 6 9 5 4 10 10 10 5 1 3 0 2 9 5 8 6 10 1 10 2 3 6 5 9 4 3 2 2 0 8 1 5 3 9 5 4 0 9 6 6 5 8 10 8 0 10 10 9 4 2 6 2 8 3 3 1 0 9 1 1 0 2 9 1 9 8 1 7 2 9 8 4 10 6 5 2 3 1 9 6 8 10 2 8 4 7 1 5 6 8 6 6 6 0 8 4 0 3 5 10 3 2 0 6 7 0 8 0 4 0 3 3 7 3 6 4 3 4 2 5 4 8 6 8 6 4 4 9 6 7 [...]
+4 5 4 7 4 8 2 4 7 3 6 4 3 6 1 8 2 4 9 7 5 8 9 1 4 3 0 7 2 6 5 3 1 6 2 2 0 1 8 6 3 8 6 0 3 1 9 2 8 7 8 4 7 10 2 6 6 9 4 9 4 7 2 0 4 5 10 3 1 6 8 4 7 10 4 8 10 7 5 9 2 1 0 8 0 0 6 0 5 0 0 4 6 7 3 4 0 5 2 9 9 9 9 5 4 0 10 6 1 9 0 10 6 6 10 5 8 2 0 5 10 7 9 5 0 1 7 0 4 2 1 0 2 8 6 10 0 3 5 4 4 10 7 2 3 4 9 8 3 4 10 4 2 9 9 0 8 10 9 8 1 5 9 10 0 7 3 2 2 8 9 2 7 2 0 5 9 8 5 2 7 9 2 9 3 10 4 9 4 8 10 10 1 3 8 10 8 9 2 9 6 6 6 5 10 8 9 1 9 6 2 4 3 4 4 6 10 7 3 7 3 1 4 8 7 9 3 6 10 1 2 2 7 10 10  [...]
+0 4 10 1 9 8 2 9 0 3 10 7 6 8 2 0 1 6 4 7 5 0 1 5 0 2 0 2 8 0 0 2 7 1 9 9 6 6 8 3 6 6 9 10 6 1 1 9 8 7 1 9 0 7 9 8 7 7 7 2 7 0 5 3 1 6 2 9 1 5 0 4 8 8 6 1 3 0 10 10 9 6 2 0 1 7 4 4 5 9 2 5 5 2 3 10 9 8 8 7 4 6 9 5 9 7 2 6 5 2 4 10 2 2 9 2 0 2 0 4 4 2 4 10 3 1 2 10 9 0 10 0 2 2 5 8 0 3 4 6 4 1 5 10 3 8 9 6 6 7 7 6 0 5 3 10 6 10 6 2 9 2 0 2 1 5 5 9 9 5 9 5 5 1 2 10 1 3 10 7 7 2 2 7 8 9 10 0 10 1 9 0 3 1 2 0 9 2 7 8 9 4 1 3 5 1 2 7 4 10 10 1 3 10 6 8 5 5 5 8 7 4 2 2 4 6 7 1 9 1 3 1 0 6 9 3  [...]
+3 1 3 1 5 9 6 9 9 7 2 3 0 9 3 0 3 10 1 6 9 6 5 9 2 5 5 6 7 3 4 10 7 10 4 8 6 9 10 5 6 0 9 2 4 6 9 2 2 7 3 10 3 8 3 7 10 8 4 10 9 9 1 9 9 8 2 1 0 4 3 10 9 8 3 2 10 8 6 8 0 3 7 6 3 10 8 1 2 3 10 1 7 2 6 10 3 10 4 7 6 3 6 8 2 0 0 5 7 9 8 7 10 0 8 3 7 2 9 3 0 3 6 2 4 7 8 5 6 0 1 0 9 10 7 3 1 6 10 4 2 9 4 3 6 6 5 4 6 1 0 4 10 5 3 5 4 7 4 4 3 10 5 4 3 3 3 7 6 3 0 1 2 9 0 5 8 1 3 9 3 10 8 4 5 1 10 4 8 2 0 10 3 5 1 5 7 7 7 8 1 7 2 10 0 5 5 1 4 1 8 2 0 1 8 3 6 2 10 0 2 3 0 6 4 5 6 7 5 9 9 7 7 0 1 [...]
+0 1 3 5 3 4 1 0 2 2 1 7 5 10 0 6 5 1 5 4 10 3 5 10 10 2 6 8 0 8 3 10 6 6 7 3 4 9 7 10 5 5 3 1 5 1 0 7 2 3 3 10 8 9 1 4 10 0 3 5 2 8 6 10 7 1 4 4 0 5 2 2 5 2 9 6 3 1 2 2 1 2 3 5 9 1 3 7 10 7 6 4 4 4 0 10 6 2 6 8 9 8 0 5 8 4 1 3 7 3 7 8 9 7 9 7 9 3 9 2 0 3 1 7 2 9 3 3 10 9 4 1 9 7 10 2 2 3 0 2 2 1 5 5 1 2 8 0 2 7 6 4 0 0 3 9 6 3 10 4 7 2 6 4 0 1 0 2 8 1 5 3 8 9 9 2 7 4 8 1 4 10 9 10 5 9 6 4 7 1 9 2 5 7 0 0 9 6 5 7 4 3 2 9 8 2 2 6 0 2 9 4 7 8 10 6 4 9 3 0 10 7 4 4 9 7 4 9 0 6 5 10 5 1 2 8 9 [...]
+2 3 0 6 4 4 4 5 6 7 9 2 0 6 8 7 10 9 10 9 2 5 10 1 7 6 0 5 6 4 10 9 0 3 2 3 1 5 8 2 0 4 2 2 3 9 10 9 0 3 1 9 0 1 8 10 1 9 10 10 8 9 3 3 10 0 0 6 0 1 10 5 4 3 9 0 2 2 7 4 3 4 3 4 1 9 2 9 10 0 3 10 6 3 7 10 6 7 8 8 8 1 4 4 5 9 0 5 6 4 9 9 4 3 4 7 9 1 0 10 4 8 10 9 0 0 6 4 4 5 5 5 1 7 2 4 10 4 3 4 4 8 8 7 0 4 10 3 10 1 3 0 6 5 3 2 10 8 5 6 0 4 3 7 3 5 9 3 9 8 6 3 1 3 2 5 8 1 4 6 4 10 7 10 10 2 9 7 10 7 8 1 10 3 3 5 0 6 2 7 4 3 8 0 1 9 0 9 7 10 0 4 2 1 2 10 9 5 1 5 3 7 4 9 2 3 3 1 7 8 3 2 8  [...]
+4 6 1 9 1 6 8 5 4 1 10 4 2 10 9 2 4 1 4 6 3 0 10 2 8 2 6 2 6 6 1 5 1 3 5 10 3 4 9 8 9 6 1 0 10 1 4 1 2 10 6 8 0 3 3 7 0 4 1 10 8 1 0 6 9 2 4 3 2 10 3 8 6 8 4 9 2 10 1 3 1 6 10 9 10 2 7 4 5 7 5 7 0 10 0 10 8 6 2 3 9 8 1 1 9 1 4 2 7 7 2 2 2 8 6 2 9 1 7 10 5 5 3 1 7 8 6 10 4 2 2 4 10 10 6 6 5 1 5 10 7 9 1 4 7 0 4 0 7 1 3 8 8 4 10 0 5 3 4 6 4 6 2 8 4 5 7 6 3 3 5 3 6 1 5 6 7 4 3 1 3 3 8 7 8 6 3 1 10 4 7 4 10 9 0 1 3 9 10 4 9 7 9 3 5 1 1 7 5 5 6 2 2 1 2 0 0 6 0 0 0 3 3 9 4 4 6 3 0 9 2 7 6 4 2  [...]
+2 7 9 2 8 10 8 1 1 6 8 5 4 3 8 4 4 3 3 2 2 1 1 8 4 8 5 5 3 5 10 0 10 4 0 4 0 9 10 10 3 0 9 7 0 4 9 4 2 6 5 0 3 5 7 1 7 7 9 0 5 5 7 4 0 4 2 8 8 9 7 8 4 5 10 4 3 0 8 7 6 2 3 8 9 4 7 6 5 7 5 7 9 1 8 10 3 1 4 3 0 9 6 2 6 5 1 5 5 8 2 10 0 9 3 5 6 9 1 10 8 0 0 10 2 2 0 10 9 2 7 1 2 4 6 6 9 6 1 6 10 4 9 7 9 4 0 9 6 8 6 4 1 6 10 2 3 10 9 0 7 3 4 4 0 10 7 7 9 9 2 9 4 9 4 5 1 7 3 6 3 7 3 7 1 4 1 0 6 3 0 0 0 0 1 3 7 0 3 10 7 3 6 5 8 10 8 0 4 7 0 3 2 2 6 2 0 4 7 5 2 3 6 0 4 10 2 8 4 1 6 6 9 4 10 10  [...]
+0 0 10 5 4 4 3 10 1 0 6 0 5 3 9 6 3 6 6 4 4 2 2 6 5 3 0 10 8 4 6 4 6 8 7 9 2 3 3 4 7 1 4 7 9 0 4 5 7 0 8 7 8 2 7 8 6 7 6 3 5 1 10 1 10 0 7 2 3 0 10 10 6 3 3 1 4 9 4 1 4 3 7 9 6 7 9 0 2 6 8 4 6 3 7 8 5 2 7 8 2 6 7 8 3 6 5 10 2 2 5 5 3 10 1 7 5 8 8 10 3 1 0 0 6 8 0 7 2 9 8 7 8 2 2 8 6 8 4 6 7 10 9 3 2 0 6 1 6 7 10 0 0 2 5 0 4 2 1 2 10 2 10 10 7 0 1 0 2 2 1 7 2 6 8 0 9 10 2 3 2 9 6 0 0 0 0 10 7 5 2 5 7 0 2 9 1 8 9 4 3 7 5 2 0 2 1 6 7 3 4 2 0 6 3 10 5 5 2 10 5 1 3 9 8 10 9 2 10 2 6 7 10 3 3  [...]
+1 10 10 4 3 4 8 8 10 9 7 1 9 10 7 3 4 6 3 4 8 6 4 8 1 2 9 7 2 4 2 1 6 7 9 2 7 3 5 0 2 0 0 7 7 6 8 6 3 0 1 4 3 1 2 2 7 3 7 5 6 1 2 2 5 2 3 5 4 2 10 0 8 1 6 0 8 9 9 8 9 7 10 1 10 1 7 7 8 4 5 4 2 4 9 0 7 7 5 7 7 7 0 2 0 1 4 0 7 1 2 1 9 1 9 8 10 10 7 1 7 8 2 10 4 5 2 6 7 8 10 5 10 10 8 8 9 7 0 3 1 9 1 3 5 2 10 9 0 9 1 7 1 0 0 6 7 10 9 3 8 6 6 4 2 2 6 2 6 6 6 9 4 10 1 10 2 9 5 5 8 5 8 7 9 1 9 5 9 4 9 10 9 0 7 5 8 9 8 5 4 5 5 6 7 0 10 4 9 3 0 4 9 0 6 5 4 8 3 10 8 3 1 10 4 9 2 4 9 9 1 9 9 8 3 8 [...]
+1 9 6 10 2 0 9 4 2 10 10 0 1 5 0 6 3 8 2 8 10 8 10 1 2 2 7 3 3 6 4 7 0 5 7 10 7 4 1 7 3 4 6 7 7 1 5 9 3 2 3 2 9 2 1 6 2 7 0 8 3 7 9 5 7 3 3 10 8 10 7 1 8 2 2 6 9 7 4 10 10 5 3 5 3 6 8 7 5 4 8 1 6 8 9 6 10 6 5 6 6 2 7 8 2 5 5 5 5 6 9 1 6 9 0 9 7 1 3 4 1 1 3 3 2 2 10 10 4 4 4 0 9 4 0 9 8 10 1 4 10 8 3 1 0 7 1 10 6 7 0 4 3 10 3 2 7 6 3 7 4 6 9 4 6 8 10 7 3 0 10 10 2 2 1 8 9 8 3 9 7 4 0 0 1 10 9 9 5 8 6 7 8 7 2 7 5 3 2 3 1 1 8 1 5 9 4 8 7 9 9 6 10 8 8 1 9 1 3 8 8 3 9 0 6 2 8 10 9 3 5 3 7 2 2 [...]
+7 2 10 5 7 10 1 0 5 6 9 4 5 5 10 7 6 10 10 3 2 5 0 4 4 3 10 1 5 10 2 6 8 3 3 8 9 0 6 1 0 8 10 3 2 0 1 8 3 6 2 2 2 1 3 9 3 8 3 7 5 7 6 3 4 7 3 6 2 2 8 7 9 5 1 0 2 5 2 10 10 8 2 0 7 4 3 3 10 10 0 3 8 10 9 4 9 3 8 5 3 1 0 2 9 7 4 9 7 8 5 8 8 6 3 6 6 7 1 4 1 2 9 8 5 3 3 7 8 8 7 10 5 8 3 9 2 2 3 5 1 9 0 9 10 8 1 0 10 7 5 4 2 6 1 8 6 2 1 3 2 0 1 10 0 5 5 4 8 7 4 2 9 2 6 10 9 4 10 3 2 3 5 10 0 7 4 2 6 4 0 10 8 8 5 8 3 2 2 1 6 5 1 7 6 6 7 8 1 2 7 8 10 3 4 9 2 8 9 6 3 10 8 9 8 6 8 3 1 10 7 3 0 5  [...]
+1 0 8 10 2 7 7 1 7 4 7 4 5 10 1 6 7 4 8 4 0 1 5 2 2 10 2 0 4 4 3 9 8 3 4 1 5 5 1 6 1 3 5 5 8 3 0 5 3 6 5 7 1 8 1 10 7 10 6 8 6 9 8 2 7 1 9 2 6 2 2 4 7 2 4 4 7 4 3 2 7 3 4 6 7 0 7 8 3 8 2 1 3 0 10 0 8 10 8 2 4 4 5 5 7 5 1 0 9 8 0 6 3 0 1 9 0 6 9 10 2 0 8 0 8 4 5 1 0 2 3 6 7 1 7 9 9 6 4 0 1 10 7 3 8 9 3 0 9 3 3 8 4 3 7 4 6 6 9 1 7 7 1 8 6 10 5 8 8 4 0 10 0 7 10 4 2 5 5 8 7 9 1 9 5 3 4 5 1 10 3 8 5 7 6 10 5 5 6 1 3 10 0 8 6 3 4 7 9 5 9 3 5 10 1 8 7 2 6 10 8 10 5 9 0 9 7 4 8 2 3 10 10 10 3 3 [...]
+3 7 7 9 10 2 9 0 0 4 0 5 10 1 0 3 8 9 8 10 2 3 10 8 2 3 8 9 6 2 0 6 3 2 8 6 5 10 0 5 6 1 9 5 10 6 3 10 5 7 3 5 9 4 6 3 5 3 5 1 0 7 8 3 1 10 6 2 6 7 1 7 3 3 3 7 4 7 3 10 0 10 9 10 9 9 1 3 5 6 1 0 5 1 8 8 1 9 6 4 9 4 2 5 3 1 5 4 1 4 10 5 1 10 10 0 5 2 0 3 5 10 7 10 0 9 0 8 0 10 0 8 1 8 10 1 5 1 3 3 10 7 9 9 3 3 4 7 2 2 2 2 0 10 5 6 10 3 8 9 7 0 6 6 7 10 0 3 4 10 9 10 0 10 3 3 9 7 7 10 3 3 5 7 2 6 4 6 7 7 7 8 4 10 0 1 1 7 6 6 8 1 2 8 4 6 6 3 1 8 5 9 2 1 7 1 2 4 10 5 3 2 2 9 1 3 0 7 7 8 4 0  [...]
+1 5 5 3 8 9 3 4 1 9 3 2 2 8 10 4 3 10 0 10 7 0 5 1 10 5 3 7 6 1 3 3 10 2 2 1 4 2 2 3 3 3 9 5 3 0 9 9 7 2 0 7 6 7 2 5 0 10 9 6 3 0 9 2 3 6 3 9 5 0 4 7 7 1 0 7 8 1 9 0 5 10 9 1 4 7 10 2 8 7 4 2 7 4 1 9 5 3 8 5 3 4 0 0 8 9 2 2 6 3 4 6 7 10 1 8 0 7 0 6 4 0 9 10 8 6 0 8 0 5 10 6 3 2 10 10 9 1 3 2 4 3 4 0 5 7 8 5 4 9 8 7 1 10 0 6 9 8 6 8 9 10 8 2 4 8 10 4 5 10 2 1 6 4 10 4 7 10 5 5 3 4 2 4 3 5 4 6 4 3 3 7 0 3 2 3 1 7 9 6 6 6 7 3 4 6 5 9 1 8 3 0 7 0 6 8 10 10 1 5 4 10 8 3 4 2 6 2 2 9 2 2 9 3 9  [...]
+8 2 7 1 3 2 7 0 2 8 5 7 7 3 2 3 9 1 2 0 1 0 9 2 7 5 3 9 2 6 4 4 6 2 7 8 2 4 7 6 6 0 3 3 10 5 0 9 9 5 4 6 5 7 2 9 8 6 4 0 1 5 10 9 6 8 1 7 3 10 1 0 6 4 8 5 10 5 4 7 3 5 2 2 5 1 5 1 7 7 6 5 6 10 6 0 8 5 10 9 3 2 8 10 3 5 6 2 2 2 1 2 9 5 8 0 4 0 4 8 1 10 6 9 1 3 1 3 3 9 9 8 3 5 7 7 3 9 6 3 10 6 2 2 10 8 0 8 1 8 7 0 10 10 2 8 2 10 2 2 3 2 7 10 10 1 7 6 5 4 8 8 1 10 1 10 5 6 0 10 0 1 6 5 4 7 8 2 4 8 3 6 10 7 3 6 4 4 5 3 3 9 2 4 5 4 4 0 8 3 4 10 3 3 3 9 3 6 8 1 10 5 5 8 0 0 1 1 10 7 0 9 1 1 5  [...]
+10 3 4 7 3 7 10 9 1 0 8 9 2 8 3 8 4 5 1 5 7 4 9 7 2 5 2 3 5 8 9 0 7 7 10 1 6 7 4 8 10 3 0 3 5 5 7 3 3 0 10 7 9 10 1 9 7 9 8 6 7 6 1 5 7 0 10 9 3 2 6 10 6 1 0 2 4 9 6 9 5 1 3 10 10 8 7 3 7 9 5 3 6 9 0 10 7 1 8 0 3 10 10 2 9 1 5 4 0 4 1 4 3 1 8 8 5 10 7 10 5 7 7 0 4 8 5 7 0 5 10 5 1 2 5 9 10 8 5 8 9 2 5 5 5 4 1 3 4 1 9 6 2 3 1 2 2 1 2 2 6 6 5 9 7 5 10 3 6 0 1 2 6 6 6 1 2 8 1 5 0 10 3 6 7 6 1 3 7 4 0 1 3 5 7 1 5 5 2 9 2 1 6 0 1 5 0 2 5 8 2 7 8 10 3 3 10 4 9 5 5 2 8 3 8 2 3 4 9 4 6 9 1 7 10  [...]
+5 2 10 1 8 5 5 6 5 7 7 6 5 1 10 10 6 3 3 0 1 9 3 0 10 10 1 6 6 8 0 0 5 8 0 7 0 4 1 3 7 1 1 3 7 6 4 10 10 0 7 5 3 10 4 7 0 10 9 6 0 10 10 8 7 5 4 1 8 3 7 10 8 10 4 9 3 5 4 8 2 10 10 5 8 1 0 4 2 4 3 10 1 1 9 6 4 7 0 1 0 10 10 0 5 4 3 10 5 9 3 2 6 10 8 8 0 0 6 5 4 8 1 6 3 6 7 0 8 2 3 5 8 8 5 1 6 6 7 9 3 3 2 9 6 0 8 3 6 2 7 3 4 1 7 2 8 5 3 1 4 0 0 9 2 5 3 7 6 7 10 3 3 8 9 10 6 7 5 4 6 2 7 1 10 7 3 1 7 4 3 10 4 5 4 10 9 0 2 3 0 5 0 3 10 7 5 9 2 2 8 9 3 3 7 2 0 2 0 2 0 3 8 4 5 9 1 7 3 1 7 9 6  [...]
+0 0 4 5 6 4 2 9 6 9 2 6 0 0 0 4 10 5 9 1 6 4 3 6 0 0 5 9 5 4 10 1 8 8 7 2 0 1 5 3 7 10 8 4 0 6 7 2 8 0 1 10 6 5 4 6 10 10 7 3 0 10 5 8 6 4 7 8 2 4 3 1 3 0 1 6 9 5 7 6 7 8 9 6 3 1 9 1 1 5 3 5 7 2 1 4 10 6 0 8 1 6 5 7 4 8 6 2 6 10 2 9 6 10 2 8 3 7 10 5 7 5 3 6 6 5 5 0 3 8 4 5 0 0 2 3 7 5 9 8 9 5 7 0 7 1 6 1 10 6 5 10 9 5 8 3 9 5 7 5 5 3 4 4 2 10 3 10 8 2 8 5 2 6 0 9 1 4 7 4 8 2 9 5 3 10 0 0 5 10 4 4 2 6 1 2 8 5 4 8 6 0 10 1 7 8 0 10 9 3 2 0 10 4 9 9 5 2 3 2 3 2 4 7 5 3 2 3 6 5 4 10 9 3 2 1 [...]
+2 8 5 1 2 7 1 6 2 10 9 2 6 10 2 5 8 5 5 5 6 2 5 9 6 0 5 3 8 4 10 8 0 6 10 0 9 3 3 2 0 2 6 7 7 9 10 6 1 8 6 1 0 0 6 9 4 9 2 5 2 5 2 7 9 6 4 7 7 6 1 1 10 2 5 5 8 0 0 3 2 1 5 0 3 7 4 1 8 2 10 4 10 8 10 10 2 5 7 2 10 10 4 4 7 3 1 4 9 8 4 6 10 2 4 8 9 2 4 10 10 0 8 6 3 6 5 7 6 10 0 4 4 0 7 8 8 8 4 7 4 10 5 9 7 0 3 10 9 0 5 4 0 8 8 1 6 2 9 4 4 3 0 2 7 8 0 7 5 4 7 10 3 6 9 6 0 10 2 2 3 8 4 10 8 9 8 10 7 6 2 10 3 2 5 6 9 2 9 0 1 10 10 5 7 4 1 10 5 1 1 10 8 9 0 10 0 6 9 0 2 9 1 5 3 2 1 2 4 2 7 10 [...]
+0 3 1 0 9 6 6 4 6 0 9 9 5 2 5 4 4 6 6 5 1 2 1 8 9 4 4 6 4 0 9 2 2 2 7 0 2 5 7 8 4 2 1 1 1 0 9 2 7 4 9 2 1 8 7 4 2 8 6 8 7 5 0 2 4 8 6 1 2 3 1 5 8 1 8 0 4 10 3 8 1 2 6 2 1 5 3 5 9 7 10 10 5 0 5 4 2 3 8 7 6 2 10 0 0 1 7 7 6 3 4 1 3 0 3 1 8 8 4 6 0 4 8 9 10 0 8 2 5 5 9 8 6 3 10 0 0 1 5 6 0 4 1 5 8 4 10 10 6 4 8 1 5 2 0 6 3 2 9 1 7 10 0 4 7 9 10 3 5 6 1 6 1 4 6 5 0 10 3 0 10 4 5 1 8 1 9 7 7 2 10 8 4 5 10 8 1 8 6 4 5 2 6 9 0 1 8 7 3 6 2 10 2 0 0 6 5 6 8 5 1 9 0 3 7 1 0 2 2 0 9 2 5 0 5 4 0 1 3 [...]
+9 4 5 1 2 5 0 8 6 0 2 10 5 9 2 9 3 1 8 10 1 6 5 2 10 9 8 5 6 5 10 10 0 4 10 5 6 10 4 5 10 10 8 1 5 9 4 8 5 0 10 8 5 9 7 5 8 1 4 8 7 9 6 1 8 6 9 4 3 9 2 1 6 2 1 7 1 6 2 3 5 9 3 1 6 7 3 7 6 1 3 1 9 8 1 0 7 9 2 5 9 10 7 6 0 3 10 7 0 1 10 4 2 9 5 0 9 10 5 10 1 8 8 9 2 8 5 3 5 6 10 6 6 0 7 9 10 6 6 10 1 8 7 7 6 1 9 2 6 5 2 6 2 10 10 8 10 8 7 9 9 4 8 9 5 1 6 1 2 9 3 9 10 3 8 2 4 5 10 2 3 5 0 1 6 1 9 5 2 10 9 1 4 9 0 8 8 4 4 7 4 10 6 4 4 3 1 4 4 7 8 3 3 3 8 9 9 10 8 5 5 7 10 2 2 0 5 7 9 9 5 5 1 [...]
+9 5 9 7 2 7 9 2 7 9 6 3 1 8 0 10 0 4 3 2 9 2 7 8 9 10 7 10 1 7 4 8 3 9 10 8 8 8 3 9 7 0 1 7 9 4 4 2 3 2 5 10 10 8 6 2 10 0 8 7 9 0 0 8 5 7 2 5 6 2 8 2 5 3 8 0 1 10 8 6 9 7 10 1 7 0 6 7 9 7 9 6 4 9 10 6 10 5 5 10 4 9 0 8 9 9 2 3 2 1 9 0 7 3 9 5 1 10 1 9 0 3 9 10 10 8 6 0 9 9 2 4 7 10 0 7 1 7 7 9 9 6 6 10 0 10 9 10 2 9 8 6 3 4 5 6 6 5 4 1 0 10 4 5 3 6 2 0 9 1 9 0 3 2 7 5 4 2 0 7 4 0 6 2 3 9 9 10 2 9 4 4 2 4 10 3 5 10 8 0 5 7 4 6 5 5 9 6 3 0 2 6 4 10 5 6 4 2 8 2 1 9 2 8 5 8 9 9 1 1 8 10 8 1 [...]
+6 9 10 1 1 4 3 4 3 7 3 6 4 1 2 4 7 0 10 4 10 3 9 1 2 10 9 7 7 0 1 6 5 4 9 5 4 3 10 6 7 0 1 6 7 3 2 2 10 3 4 9 0 5 10 10 3 2 1 7 7 1 2 4 3 5 8 10 6 3 4 2 1 2 0 7 1 7 4 7 5 8 7 5 5 4 6 3 8 3 8 3 2 0 2 2 9 9 5 0 4 8 1 0 3 6 10 1 3 9 4 1 4 2 1 3 6 8 6 4 6 0 8 4 8 3 1 6 6 4 2 7 4 6 9 7 3 8 1 5 5 6 4 8 8 10 3 0 7 0 7 10 5 10 2 10 9 0 3 2 7 9 7 7 10 7 8 6 3 4 10 5 3 6 7 9 2 6 6 3 6 5 6 7 1 2 5 0 4 1 0 8 6 4 8 5 3 10 7 4 6 6 7 5 4 6 1 3 2 10 10 10 6 4 6 0 8 0 2 1 10 4 2 9 9 0 1 5 2 7 8 0 9 0 4 2 [...]
+4 8 1 9 0 3 7 7 6 0 7 3 9 5 3 3 7 6 2 1 4 8 0 2 1 2 10 7 5 2 8 7 7 6 3 10 9 10 0 0 7 10 10 10 9 4 5 6 4 0 10 1 1 2 5 0 9 10 9 9 6 5 1 1 10 0 2 5 6 0 9 10 10 0 6 5 4 5 6 5 10 2 8 6 1 3 7 6 0 4 2 5 3 10 2 2 3 10 4 2 5 4 9 3 0 0 8 7 5 6 1 2 10 10 4 8 8 0 3 5 3 10 1 1 10 10 2 0 6 10 2 2 6 9 8 4 8 3 2 10 6 2 4 3 6 8 10 0 5 4 5 6 10 10 7 10 0 7 2 6 10 6 1 7 0 6 9 1 1 4 2 8 1 1 9 4 6 2 7 1 4 2 6 6 5 1 2 6 0 0 9 3 9 5 9 0 9 5 10 4 10 1 8 0 0 8 10 0 1 7 8 10 3 6 2 4 10 5 1 10 10 2 2 7 7 8 2 5 3 6 [...]
+9 8 1 6 4 3 2 7 5 4 2 9 9 10 1 6 4 5 6 4 1 9 6 7 1 1 4 8 3 4 1 8 7 10 6 7 2 8 3 6 6 4 10 7 1 3 2 4 5 6 9 0 1 7 4 8 10 9 3 10 7 5 5 0 8 9 6 10 5 5 5 4 2 2 6 8 4 6 2 6 0 1 0 10 7 8 9 2 3 6 0 0 2 8 5 7 9 2 3 6 8 4 4 1 4 6 0 9 4 3 9 2 4 5 3 8 8 2 9 4 1 3 7 8 7 8 10 5 9 7 8 6 10 9 5 9 10 10 5 0 7 2 10 6 1 10 8 5 0 3 5 9 10 3 1 0 8 0 0 0 3 10 5 4 1 1 6 2 7 10 0 1 6 8 4 5 5 5 8 6 1 7 10 9 9 4 4 1 6 0 7 0 3 0 2 7 5 8 0 2 6 9 7 3 2 7 3 1 1 7 0 9 0 5 4 7 8 0 9 1 7 4 2 1 1 3 5 9 3 10 7 3 4 4 10 3 0 [...]
+3 1 7 0 2 7 7 7 7 7 6 1 5 0 0 1 9 2 6 1 4 3 1 6 8 4 0 6 4 5 0 4 1 7 10 2 5 5 10 9 7 10 7 5 8 6 3 6 2 10 5 4 2 3 3 7 6 7 9 7 8 8 1 3 1 5 9 3 3 6 6 10 3 8 0 8 5 6 10 3 8 4 0 8 9 0 1 6 5 2 10 9 6 2 9 7 3 7 7 5 2 2 3 2 3 5 0 9 6 6 0 2 1 9 7 4 8 8 5 10 10 5 10 3 9 3 5 10 4 4 7 7 8 0 10 1 7 1 10 3 2 7 3 10 3 7 7 3 6 5 6 8 2 3 2 0 2 5 9 9 6 0 4 6 0 5 8 1 8 10 3 6 2 4 8 4 1 2 7 2 2 6 4 1 5 5 3 1 6 2 9 6 8 9 8 8 4 10 3 10 9 6 6 9 4 4 0 4 8 2 7 7 7 8 4 10 2 5 5 4 3 8 9 9 10 3 7 5 10 7 5 2 8 2 7 10 [...]
+4 2 3 8 5 4 0 7 3 3 3 0 5 4 8 4 10 5 0 2 9 2 8 9 6 1 4 4 2 3 1 5 8 9 1 8 2 3 8 4 2 2 3 10 10 5 1 1 6 8 9 7 9 6 7 0 0 6 8 3 3 9 0 5 2 1 7 6 8 10 8 1 9 6 8 0 10 9 3 10 8 9 8 6 5 7 3 6 6 6 9 10 7 0 3 6 6 1 3 3 9 9 2 3 0 1 2 5 5 1 9 4 0 0 7 5 9 6 9 9 9 0 4 6 3 7 0 4 4 5 8 0 6 3 10 10 6 3 9 0 8 10 4 4 7 1 4 10 0 6 10 1 0 5 7 7 6 1 4 5 6 0 1 3 6 5 3 4 5 10 8 3 7 9 10 10 1 0 10 7 4 9 10 7 4 5 6 8 5 5 6 3 1 2 0 6 10 4 2 0 4 2 8 10 9 3 4 3 1 4 5 10 4 2 0 1 3 2 7 10 5 6 10 7 6 3 8 8 6 2 1 1 8 1 2  [...]
+5 0 8 8 10 2 8 6 2 1 0 8 0 4 7 8 4 8 7 1 3 5 0 8 9 0 5 3 4 10 0 0 7 1 6 3 8 5 9 6 2 9 5 9 3 10 5 2 5 4 1 5 10 1 7 9 7 9 5 2 6 2 8 7 5 4 10 8 6 3 2 4 6 9 4 0 10 8 6 10 4 1 0 7 2 10 5 0 9 5 2 5 7 5 6 3 9 1 2 5 0 3 10 10 9 0 0 6 3 7 9 4 3 9 6 2 3 1 7 8 7 6 1 9 5 1 2 8 4 9 8 6 10 3 6 4 10 2 7 2 1 4 1 10 9 6 1 0 4 5 4 1 10 9 0 4 5 10 1 6 6 10 4 1 10 7 2 6 6 8 3 3 4 5 5 4 2 9 3 10 10 10 7 5 7 10 7 8 2 3 0 0 6 7 6 6 7 5 1 6 7 5 1 9 0 1 5 7 7 8 2 6 4 8 7 9 1 6 3 6 1 8 10 3 8 6 10 5 0 1 9 6 7 6 2 [...]
+10 1 5 8 5 10 7 10 4 6 0 6 1 8 7 3 2 7 0 0 9 5 1 1 8 8 2 10 7 5 1 0 9 6 2 3 1 2 6 6 2 1 3 6 4 0 2 3 1 2 10 7 7 9 4 9 8 0 4 4 2 3 4 2 2 10 4 0 9 10 1 2 7 6 10 2 6 2 4 3 1 8 5 3 9 3 2 10 7 4 2 9 5 5 4 9 8 0 10 1 5 3 3 7 7 0 6 6 1 8 2 1 0 1 5 6 9 1 8 8 1 2 6 1 10 6 7 3 0 9 4 7 2 6 0 0 4 1 3 2 3 7 7 4 1 5 5 8 0 2 2 3 7 3 4 10 7 6 9 2 10 9 6 2 5 4 8 1 4 2 0 2 4 5 4 8 4 5 3 2 9 6 7 8 1 9 5 7 5 9 2 7 10 7 7 9 2 3 9 3 8 8 10 3 4 1 5 0 2 4 4 4 5 2 4 5 8 8 6 7 0 4 7 0 8 6 3 0 0 5 2 8 0 6 8 3 0 7 1 [...]
+9 5 0 1 3 6 2 10 3 0 8 6 10 4 8 5 2 3 9 5 2 8 5 5 0 7 3 7 10 0 9 2 10 0 8 5 5 5 5 4 6 1 4 5 5 8 3 8 2 1 0 3 2 7 1 9 5 2 6 4 10 4 10 3 1 2 1 3 2 4 1 7 1 3 4 10 8 4 1 10 8 7 5 2 6 0 10 6 9 8 0 5 5 2 5 0 2 8 1 8 5 9 3 6 8 1 3 0 1 2 10 6 8 5 5 5 3 7 8 6 6 6 8 10 7 7 7 5 6 8 7 2 6 9 1 5 9 10 10 10 4 10 6 9 0 2 1 10 8 5 7 4 9 7 2 2 8 7 9 1 7 8 3 8 6 1 9 8 9 1 6 0 6 9 7 7 5 1 9 4 10 4 2 5 0 2 2 1 5 10 8 9 3 4 4 10 6 0 1 0 8 6 2 3 6 4 1 9 5 7 10 2 6 2 6 2 3 8 0 1 7 6 8 8 1 10 8 7 6 3 0 3 10 8 2  [...]
+5 4 1 2 1 4 6 7 10 1 10 3 3 2 8 7 6 5 8 6 9 5 4 2 10 9 10 9 5 5 4 4 6 6 1 1 7 6 9 5 7 9 2 4 1 7 4 2 5 2 1 1 6 2 9 0 10 5 10 4 7 2 4 9 9 2 5 5 9 2 6 9 3 3 4 10 8 8 0 0 8 7 8 6 8 5 0 3 7 1 4 9 3 8 9 8 0 5 2 1 7 5 5 4 7 6 2 7 3 7 5 1 5 3 1 3 2 6 3 2 1 4 5 1 5 7 0 0 2 5 1 10 9 3 4 9 6 3 7 7 0 5 4 8 9 1 2 9 5 7 8 10 7 8 10 0 5 4 2 4 4 8 8 0 3 1 6 10 4 5 3 3 7 9 0 3 7 0 4 4 8 4 4 1 1 8 9 9 5 5 10 2 2 4 8 7 10 8 3 6 1 10 8 6 10 2 10 3 8 4 7 9 1 2 8 6 2 6 6 4 2 1 5 3 6 6 0 5 8 8 3 5 10 2 9 3 6 9 [...]
+0 6 9 6 9 9 4 10 8 1 3 4 8 10 9 6 0 7 5 6 5 3 6 2 6 7 10 4 8 5 10 8 2 4 2 2 0 1 9 4 3 3 0 5 8 7 8 2 0 1 9 0 1 1 0 6 7 1 0 4 0 7 8 7 4 3 0 0 1 6 6 4 8 6 9 3 2 7 10 2 3 5 5 7 8 1 5 6 7 3 10 7 9 10 10 7 7 1 9 2 4 7 3 3 2 9 0 4 1 3 8 9 9 7 4 1 9 8 2 8 6 9 9 9 10 3 2 4 8 2 5 6 8 4 5 2 0 3 2 3 10 3 0 9 10 6 1 6 8 9 5 10 1 5 2 2 3 4 8 6 10 2 9 9 4 2 4 7 3 5 8 10 8 0 10 5 2 5 10 3 1 6 5 9 8 3 2 7 5 9 4 0 2 0 5 0 0 1 8 2 8 7 10 7 6 0 9 6 2 1 7 9 3 4 4 7 3 7 7 9 9 10 2 4 0 0 9 5 8 2 6 10 1 9 9 7 1 [...]
+5 6 1 10 3 5 2 4 2 0 7 10 8 9 9 10 0 9 6 3 5 1 7 3 10 1 8 2 0 9 2 0 5 7 10 4 4 8 5 7 8 9 10 9 9 2 0 0 0 10 2 1 2 10 5 2 8 0 7 7 6 4 1 2 4 0 1 10 3 8 3 7 0 9 0 6 7 4 7 0 4 0 7 8 1 3 0 2 2 7 4 0 5 10 1 8 7 8 6 4 7 2 2 8 1 9 5 3 6 0 9 0 7 9 8 0 8 2 6 3 5 8 8 10 8 3 3 7 6 7 5 0 6 9 3 10 7 4 3 8 8 9 7 10 3 10 4 2 7 10 9 3 0 3 10 4 6 8 7 9 10 5 10 4 10 3 10 9 5 9 10 1 6 4 0 8 5 8 6 0 4 0 5 6 1 7 2 4 4 1 7 5 9 0 6 1 0 7 0 7 5 10 8 6 7 1 10 9 10 2 5 8 6 8 2 8 7 9 2 1 9 2 6 0 10 0 3 1 5 5 9 2 8 9 [...]
+8 9 1 1 4 6 2 4 3 7 9 9 8 5 1 0 9 2 4 9 2 5 10 1 5 6 9 8 3 4 8 8 4 2 9 5 7 10 6 5 1 7 10 3 0 2 6 10 7 5 0 9 7 0 7 8 1 2 0 1 6 8 4 1 2 4 4 6 10 7 8 5 7 1 4 5 0 0 4 10 8 10 10 4 6 2 3 1 9 4 9 3 6 9 8 1 7 9 2 0 4 8 10 9 4 7 8 4 0 4 0 5 2 8 4 9 1 8 9 9 9 3 1 5 8 9 0 0 8 1 1 5 5 7 5 4 7 10 0 2 3 4 2 2 6 8 7 6 9 7 6 5 6 6 5 9 10 2 0 2 4 7 0 8 1 10 5 1 0 6 1 1 1 6 0 3 3 0 0 6 10 1 3 2 2 0 0 7 9 4 2 0 5 1 8 7 9 4 2 3 7 0 8 6 4 5 0 6 8 7 0 5 6 10 4 5 0 2 3 10 6 1 1 10 0 10 0 6 2 4 9 7 5 0 3 4 6 0 [...]
+10 8 6 0 4 10 2 0 4 2 4 7 7 6 7 6 1 3 10 3 0 8 9 5 7 4 0 9 9 1 7 6 9 10 0 5 0 2 8 4 5 10 9 10 6 9 3 2 10 4 3 6 3 8 0 10 10 5 6 5 2 3 4 8 6 10 1 7 0 1 3 4 6 3 3 4 5 7 3 3 9 3 9 3 10 10 9 10 3 6 3 7 3 5 5 7 0 1 3 7 6 0 6 9 2 4 1 0 8 3 6 4 1 3 3 10 1 0 2 5 9 7 4 0 4 6 4 5 2 6 5 2 3 3 8 3 10 9 10 7 8 0 9 1 3 4 3 4 4 0 1 4 2 5 5 3 2 4 0 5 4 5 6 6 10 7 0 10 10 1 0 4 7 3 4 6 5 2 7 3 3 8 6 3 1 7 2 1 0 1 4 5 8 8 8 0 6 7 3 2 4 3 7 3 2 7 8 10 2 7 8 2 8 4 8 2 9 2 4 8 10 7 1 2 6 9 8 9 4 1 6 0 6 3 0 1 [...]
+1 1 1 6 7 6 6 3 10 3 2 6 2 1 7 1 8 6 2 1 0 7 8 9 7 7 4 8 8 3 9 3 4 4 9 5 3 3 0 10 3 5 6 10 1 2 0 7 2 1 8 5 3 8 0 6 9 3 2 10 1 5 4 9 1 2 5 10 9 10 3 7 5 5 6 4 6 9 0 9 0 6 10 9 8 5 4 4 7 6 8 9 8 4 7 4 9 10 0 7 4 7 3 0 3 4 4 4 5 5 1 10 8 0 10 5 7 6 10 1 7 0 2 4 6 2 10 3 9 10 9 0 3 0 5 1 1 10 5 4 6 0 1 1 5 2 7 4 6 0 2 3 2 1 9 5 6 9 1 1 0 10 6 8 8 5 9 8 9 9 4 6 5 4 4 5 8 7 4 7 2 4 0 8 6 1 0 7 5 3 6 3 5 9 0 5 9 1 3 2 10 10 5 1 0 9 2 0 4 8 7 8 0 5 8 4 4 7 10 2 1 1 10 8 0 0 6 7 2 1 4 9 4 9 9 3 7 [...]
+10 9 0 5 8 8 8 10 4 5 0 0 7 0 10 8 0 4 9 7 1 3 1 0 3 2 6 0 2 6 7 0 0 1 4 10 10 2 5 9 1 0 2 4 1 9 9 8 10 10 3 3 7 0 3 5 4 8 9 10 6 8 7 3 8 10 1 10 2 5 4 8 6 6 10 6 9 5 8 3 2 9 2 3 3 7 6 2 1 8 9 6 2 3 10 7 1 6 2 2 8 2 0 10 2 7 3 7 5 10 4 8 1 1 10 1 0 7 1 9 0 3 7 5 7 6 1 9 0 6 9 7 1 5 6 10 3 3 1 10 5 9 4 4 2 9 2 6 7 9 1 10 7 10 5 1 9 8 8 2 10 1 10 3 9 2 8 8 2 0 3 3 10 8 7 10 7 6 1 8 1 8 0 5 6 7 10 5 3 1 8 7 7 1 5 9 4 1 4 5 0 9 5 9 2 4 6 10 9 5 0 5 1 8 6 8 5 10 3 3 0 5 0 3 5 8 5 5 2 9 2 0 10 [...]
+8 4 9 6 6 3 6 5 3 10 8 0 10 7 1 1 10 3 0 6 6 7 8 1 2 9 6 1 6 8 3 9 2 3 0 8 6 2 8 6 1 2 5 8 0 4 8 7 2 2 8 0 2 6 2 6 6 6 6 2 7 9 10 1 3 9 4 4 5 6 1 5 3 5 6 7 9 9 10 7 2 8 5 1 9 2 5 0 10 5 2 10 8 2 10 1 2 8 3 8 2 8 10 10 9 6 6 6 3 2 7 9 5 5 9 0 1 2 7 9 0 9 6 2 7 2 4 7 5 5 0 8 8 6 3 2 7 5 3 6 2 10 7 10 8 2 8 6 1 5 5 9 0 0 5 1 8 10 0 9 5 3 2 6 0 7 5 7 10 6 6 8 8 9 8 2 3 1 8 0 2 8 3 10 3 7 10 9 0 6 1 4 9 5 2 2 8 0 1 9 5 5 4 7 6 8 9 1 9 1 7 7 9 3 7 2 1 4 10 1 9 6 2 2 5 5 4 8 9 1 8 3 6 5 1 3 8 5 [...]
+3 5 2 5 0 5 9 0 9 3 9 10 0 9 8 3 2 1 10 4 6 9 5 9 7 2 7 9 10 9 6 6 10 0 5 8 8 9 3 8 4 1 8 9 0 4 5 5 5 3 3 10 6 0 6 9 4 9 7 4 6 7 2 0 2 5 2 6 5 10 8 6 4 3 10 4 0 5 6 9 1 8 6 1 1 1 4 4 5 7 3 0 4 5 2 3 5 1 10 10 9 4 0 7 10 3 9 10 5 2 4 7 6 10 10 1 2 7 3 1 2 2 7 3 3 2 8 8 10 10 0 4 9 5 9 5 1 4 3 3 5 3 1 5 3 9 7 3 4 2 0 4 10 5 3 3 0 10 1 5 4 6 10 0 2 6 10 6 3 0 2 2 1 5 0 0 1 10 10 4 8 3 10 2 8 4 3 8 3 4 5 8 9 1 8 2 3 6 2 5 0 10 4 5 1 2 4 9 5 9 0 10 3 2 10 0 6 2 7 7 3 10 8 8 3 5 8 6 8 9 3 1 10 [...]
+5 7 3 2 8 10 10 0 8 6 10 2 3 3 4 2 1 2 0 0 1 3 9 1 9 1 5 5 8 0 10 1 9 8 7 1 1 1 8 2 4 6 6 7 10 2 0 8 4 3 6 1 1 9 4 3 3 9 7 3 9 4 6 3 7 9 9 1 4 4 5 7 6 3 1 9 2 10 1 4 1 8 9 7 6 10 6 10 4 4 1 2 3 7 10 9 10 3 9 0 6 7 5 1 3 2 9 0 8 3 7 3 2 7 8 3 1 9 6 9 5 7 3 3 9 7 5 5 1 9 10 9 10 10 1 9 3 7 3 2 2 7 8 3 4 6 9 9 2 2 6 2 9 6 3 9 8 6 7 3 6 0 8 0 8 3 1 10 5 3 4 8 8 7 6 1 2 9 6 6 3 5 10 10 3 8 3 1 9 4 5 7 9 3 1 0 8 4 4 4 4 2 6 0 7 2 2 10 2 4 2 4 5 9 7 1 6 8 10 1 1 3 6 7 7 2 9 9 6 2 2 6 8 9 2 5 6  [...]
+6 7 0 1 6 1 1 4 7 1 1 4 5 6 5 8 6 8 2 10 10 10 2 6 4 9 2 1 0 9 4 6 7 9 0 7 1 1 0 9 1 4 8 8 0 3 0 5 8 1 1 0 4 2 6 2 4 4 0 9 10 8 1 4 5 8 4 8 2 10 0 6 0 8 4 0 0 3 4 10 9 5 5 8 5 6 3 6 1 0 7 7 10 10 5 1 4 5 8 1 10 4 5 8 7 9 3 4 3 2 10 1 10 6 8 7 3 9 0 4 0 0 0 4 6 2 4 8 8 9 8 7 10 8 6 4 8 1 6 5 5 1 7 5 0 5 6 2 8 8 1 8 2 8 8 8 8 8 9 0 7 7 10 10 7 6 5 10 0 2 5 8 0 2 2 4 3 8 3 0 8 7 6 6 5 1 3 4 3 6 7 10 4 3 5 1 1 8 5 9 10 5 2 6 4 3 7 10 2 4 10 1 7 5 9 9 2 6 6 9 10 1 5 10 3 7 1 1 9 3 9 8 5 10 9  [...]
+10 1 1 1 9 8 10 0 8 4 4 8 0 9 8 8 3 10 10 3 5 2 4 6 7 8 8 2 10 1 8 4 5 0 3 10 1 5 2 10 5 3 4 10 0 2 1 5 0 3 5 1 6 10 8 7 2 5 7 6 6 7 5 2 1 8 10 9 1 2 0 4 7 4 2 3 6 5 3 5 7 0 8 4 9 4 1 4 7 7 7 3 5 1 0 3 4 8 5 5 4 10 9 4 0 0 8 0 7 0 5 9 8 1 6 8 4 0 8 4 1 2 2 0 4 4 1 7 0 2 5 5 5 0 0 9 10 9 9 5 8 0 4 9 2 9 0 0 9 4 4 7 7 6 7 4 0 10 0 0 9 8 1 9 9 6 2 4 8 4 7 10 5 5 4 3 3 4 0 10 4 1 5 9 0 10 4 9 9 9 1 9 6 1 7 10 9 4 10 4 9 3 0 5 9 4 10 10 10 5 4 5 6 4 2 0 6 4 9 5 1 5 7 10 1 2 10 6 2 7 4 9 7 5 0 [...]
+9 10 7 9 1 9 2 8 2 0 2 9 6 1 9 1 9 9 4 7 3 9 10 6 6 2 8 2 8 4 7 8 10 3 0 4 2 9 2 2 0 10 6 3 4 1 1 10 10 5 2 10 2 7 3 10 2 8 8 8 1 1 6 1 1 5 10 3 0 6 8 3 3 10 7 4 9 5 6 6 3 0 8 10 2 4 7 7 1 10 6 3 8 6 4 7 8 1 1 10 1 10 8 7 2 10 0 7 10 2 6 0 3 1 0 0 5 5 4 9 7 2 4 5 3 10 7 0 2 6 10 2 7 3 1 1 6 4 8 0 5 10 7 6 9 8 8 7 5 5 1 1 8 6 0 1 9 7 3 7 4 5 6 2 3 1 10 9 4 1 9 6 1 6 4 1 5 3 3 2 2 9 3 7 2 8 0 1 3 3 10 9 1 4 1 6 5 10 10 3 3 3 9 4 0 0 6 4 8 0 10 6 3 4 3 6 2 10 6 2 10 6 7 0 3 3 10 8 9 3 1 3 4 [...]
+5 6 6 9 2 0 3 10 8 1 0 2 5 8 9 10 5 2 6 5 0 9 4 4 8 5 5 7 10 0 7 6 10 5 7 4 4 8 6 4 1 3 2 1 4 6 10 6 4 3 3 10 6 0 1 2 4 6 7 10 3 5 8 5 8 6 4 7 0 5 3 7 3 7 9 9 4 1 9 8 9 10 8 3 9 4 10 3 6 4 6 4 7 4 6 0 8 9 9 9 1 1 4 1 8 0 10 1 8 4 6 7 6 10 0 1 10 6 0 1 2 5 9 8 5 2 3 5 10 2 8 2 6 5 9 1 5 0 8 5 7 5 8 6 6 5 6 8 3 1 10 2 0 7 8 8 3 9 1 10 7 0 0 1 7 6 2 9 1 3 6 4 3 4 8 9 10 3 2 4 0 10 0 7 0 6 5 3 0 3 3 7 10 0 10 7 1 7 2 6 5 1 6 0 2 9 5 8 10 6 6 4 9 3 2 3 8 5 5 10 2 5 2 10 9 7 7 3 6 5 7 4 8 1 6  [...]
+8 5 6 0 10 3 4 2 6 2 0 3 2 3 0 6 0 1 0 0 4 10 10 1 0 4 1 3 4 9 4 0 3 9 8 0 5 8 0 7 5 6 3 9 3 4 3 0 2 0 0 9 10 8 9 9 4 0 3 2 0 9 9 0 0 8 6 8 2 7 8 2 0 4 7 4 6 7 6 2 2 0 6 6 1 4 8 1 1 3 7 4 0 1 7 10 1 7 3 0 3 9 10 5 4 1 7 3 3 0 9 9 3 2 3 5 0 10 3 8 4 3 0 9 8 9 2 1 10 5 7 3 8 2 9 4 10 6 7 5 2 1 7 2 7 2 7 10 1 10 2 8 0 0 6 5 3 5 0 4 0 1 0 8 0 0 0 7 3 1 8 7 0 4 9 3 5 1 6 3 4 1 9 0 2 4 4 10 10 6 3 0 7 1 1 7 7 1 7 3 4 0 8 6 10 2 6 6 6 8 4 4 7 5 9 8 4 4 1 5 3 5 4 10 5 6 1 3 1 0 6 3 2 6 4 10 10 7 [...]
+9 3 10 4 1 5 10 7 3 6 0 8 4 9 6 5 6 2 6 2 1 5 0 1 0 0 7 1 5 2 9 1 8 9 8 2 7 2 7 3 1 2 4 8 1 9 10 1 2 4 9 8 10 7 4 5 4 10 2 1 7 1 2 1 4 7 0 3 0 0 8 5 1 2 4 6 5 6 3 2 6 3 5 0 6 8 5 2 9 7 9 10 10 4 4 0 4 10 4 4 3 4 2 10 7 6 6 3 7 9 10 5 2 3 1 10 4 0 10 5 2 3 3 10 9 5 1 3 10 1 3 8 7 4 1 9 9 0 4 10 7 10 8 8 9 7 4 6 8 0 10 8 4 3 8 1 6 4 6 8 4 0 8 10 5 1 9 9 4 4 3 5 8 7 10 3 1 0 2 1 10 2 3 3 2 9 6 0 6 7 3 6 3 1 1 3 0 1 1 1 5 1 0 1 1 7 0 10 10 6 7 6 6 5 9 1 5 3 8 9 4 5 7 3 6 3 6 6 0 2 3 4 8 9 5  [...]
+1 3 4 0 7 10 1 6 1 7 6 7 9 10 3 2 7 5 0 5 5 5 5 5 0 7 4 0 7 10 8 7 3 7 5 4 5 6 1 1 9 7 5 3 1 3 4 5 5 6 1 9 2 6 3 7 6 5 0 8 8 4 8 5 1 7 2 10 1 9 5 7 5 5 4 9 1 2 7 7 4 7 10 5 0 3 8 7 9 10 1 1 2 3 10 4 2 1 7 0 10 2 6 4 9 0 6 7 2 0 5 9 9 10 9 0 5 8 9 8 3 8 4 6 9 6 4 9 10 10 1 0 1 7 6 9 5 1 3 1 2 6 5 8 3 6 1 8 6 6 3 7 9 5 9 5 10 0 2 6 0 10 10 6 0 7 9 8 0 9 9 0 9 10 1 8 0 6 3 3 2 4 3 7 2 0 8 10 7 7 10 9 10 6 6 2 4 4 10 9 3 1 9 5 3 1 5 3 4 6 3 4 2 8 4 3 1 7 0 3 8 1 7 9 10 7 9 1 1 7 1 6 4 8 1 0  [...]
+0 2 2 2 8 0 9 0 3 5 8 1 3 5 1 9 4 7 9 7 6 6 8 2 7 9 2 9 0 1 7 4 3 9 5 5 7 4 2 9 3 1 7 1 5 3 0 6 7 3 6 8 10 10 3 8 9 2 1 4 9 9 1 5 9 8 0 6 8 1 6 6 0 4 4 2 0 6 0 4 10 8 2 10 10 0 3 7 5 4 1 0 3 4 5 10 9 10 4 6 6 3 3 2 1 7 7 5 5 9 9 2 10 1 4 4 7 8 7 6 6 1 4 1 9 10 3 8 2 4 3 1 6 7 6 9 3 7 3 3 4 6 1 2 2 9 8 2 10 5 9 4 4 7 2 5 8 7 5 5 5 0 9 8 1 3 5 9 10 2 2 6 9 8 4 10 2 7 0 7 6 1 1 5 10 4 10 5 0 3 10 1 0 1 9 7 9 4 8 0 2 3 0 4 1 5 5 7 1 4 4 7 6 0 6 5 5 5 4 3 6 10 9 10 5 6 9 1 4 8 4 4 6 5 2 3 4 6 [...]
+1 10 6 9 1 1 10 2 5 6 2 5 2 2 0 9 8 10 4 2 9 10 8 7 8 8 9 7 2 1 6 3 10 6 0 6 6 9 9 9 0 1 2 7 8 8 3 0 1 6 5 0 4 5 0 8 2 1 1 6 9 6 6 8 0 5 4 2 1 2 8 7 3 9 2 2 8 5 9 10 8 5 5 2 1 8 5 9 10 1 6 0 10 5 7 10 8 9 6 4 10 10 2 6 6 3 10 1 10 5 10 5 5 6 6 4 8 5 7 8 1 9 5 1 6 2 2 10 4 0 7 0 4 2 3 9 8 0 0 1 2 2 5 9 4 1 3 4 7 9 4 8 7 4 8 3 10 2 9 8 7 5 2 5 10 3 2 8 7 0 6 9 6 4 10 4 9 6 2 10 10 6 7 6 7 10 4 7 3 3 6 5 1 6 3 1 1 3 2 7 2 8 5 3 4 4 10 2 5 6 4 7 2 3 9 5 5 0 3 3 9 1 3 3 7 0 0 6 7 6 6 3 10 8 4 [...]
+0 0 7 6 9 0 9 10 0 6 6 1 9 2 10 4 10 4 10 0 2 9 2 9 1 2 0 1 10 3 8 8 1 4 7 6 4 9 3 1 10 2 3 10 6 10 2 9 3 1 1 5 8 9 0 6 2 8 2 2 3 5 0 3 5 2 1 1 5 1 6 7 4 7 8 9 8 9 0 0 8 8 4 3 9 10 8 7 1 6 1 3 4 10 5 6 10 6 6 5 5 5 0 1 2 0 2 5 3 9 5 1 1 1 0 9 5 4 5 2 4 7 9 9 10 8 2 1 3 5 8 1 7 9 8 9 1 6 3 5 1 0 5 8 5 8 8 0 6 4 5 4 2 0 5 0 4 9 3 0 6 3 9 0 5 6 8 5 6 9 8 1 3 6 6 8 8 8 10 1 7 6 9 4 4 5 9 10 0 1 1 4 5 9 2 1 6 0 0 3 0 7 10 9 5 10 6 8 4 2 8 10 2 1 2 8 9 8 1 10 3 8 10 1 8 8 6 7 7 6 2 1 4 8 8 0 7 [...]
+2 2 4 9 7 10 3 1 1 8 3 0 6 2 7 3 6 4 5 2 9 6 1 0 6 5 3 2 7 9 9 4 8 5 8 5 9 4 1 7 7 7 3 10 3 5 10 1 5 9 6 8 1 8 9 6 2 5 9 4 5 8 1 2 2 5 5 10 4 6 5 3 6 3 6 7 2 4 0 3 6 3 0 3 5 10 5 10 7 2 2 3 5 1 0 3 9 1 5 2 3 10 4 6 10 6 4 0 0 3 0 5 3 4 6 10 7 10 8 9 2 3 2 2 8 2 4 2 10 0 1 6 9 2 6 10 10 1 10 7 0 4 10 9 9 2 1 9 10 8 10 7 10 0 9 6 10 0 9 6 9 8 2 9 5 2 5 1 0 9 6 7 3 4 4 3 10 0 10 7 9 3 5 3 2 5 6 6 5 9 2 0 9 10 6 2 5 0 4 1 7 8 2 0 8 8 7 9 4 7 10 0 2 3 10 5 3 5 2 2 5 3 1 5 2 5 2 6 6 3 3 9 2 6  [...]
+1 3 1 3 8 8 3 0 7 1 7 10 1 4 2 6 7 6 2 3 10 3 3 8 4 8 9 2 4 2 0 4 10 8 1 5 2 0 1 9 0 7 0 9 6 5 9 6 0 0 6 4 10 5 7 2 1 1 7 0 4 2 2 8 5 8 8 6 4 3 4 3 8 6 0 7 2 4 7 10 9 3 10 2 10 10 5 9 5 2 10 9 2 1 10 5 7 1 4 6 6 4 3 3 3 6 6 0 0 4 2 9 3 8 0 5 4 6 1 10 8 2 10 4 3 7 6 6 4 0 1 4 3 1 2 0 2 3 10 0 5 4 1 4 7 4 6 7 8 1 1 6 9 7 0 3 4 1 6 2 10 3 2 7 1 10 8 6 1 6 4 4 9 10 2 2 8 4 9 7 10 6 8 2 5 10 6 0 8 1 4 2 4 8 4 5 10 6 5 10 1 9 0 10 10 2 7 8 3 1 5 6 8 10 9 10 4 1 5 8 1 8 10 10 7 5 5 8 1 8 4 9 5  [...]
+7 7 9 3 6 6 8 6 6 4 8 2 1 5 5 10 3 0 7 7 0 3 10 3 2 3 10 9 4 3 1 7 4 2 4 4 0 7 1 10 2 0 6 5 5 10 3 10 4 4 4 7 0 7 2 4 10 8 3 4 9 5 5 10 9 0 6 3 4 6 8 7 0 4 5 8 1 10 3 5 8 8 6 7 2 5 7 4 10 7 5 5 9 3 10 3 0 0 2 8 7 7 0 9 7 8 5 3 8 8 6 3 1 5 2 8 5 9 3 5 2 3 1 7 0 1 2 4 6 9 3 10 1 2 0 1 6 10 1 8 9 8 7 0 10 5 2 1 3 5 10 6 1 8 5 0 0 9 5 3 0 0 3 3 8 1 9 10 9 10 6 4 9 5 8 6 5 9 3 6 4 10 5 2 0 8 4 6 8 6 6 7 1 9 6 8 4 10 2 1 9 4 10 5 3 9 9 4 9 1 1 6 8 7 6 0 5 10 8 8 3 3 4 9 4 0 3 10 3 1 8 3 7 9 3  [...]
+0 1 1 3 6 3 2 10 3 1 10 5 9 4 8 3 9 9 7 0 2 3 9 7 3 8 1 6 4 8 2 3 10 8 8 2 3 8 0 3 8 10 2 9 2 2 8 9 6 7 7 0 6 1 7 5 4 4 9 5 6 9 3 0 5 1 2 4 6 9 3 4 8 7 7 7 5 6 5 0 8 4 8 9 3 7 8 9 3 7 0 9 6 10 10 9 0 4 4 0 6 10 7 4 7 7 1 9 6 8 9 10 3 8 8 1 2 7 5 8 6 8 8 10 10 4 8 6 9 10 6 3 3 7 8 6 7 0 6 2 7 9 2 4 8 0 0 4 7 7 6 8 5 9 9 6 6 0 8 1 0 3 6 7 5 9 0 0 1 3 10 10 7 7 10 6 1 8 5 2 1 5 6 6 10 5 6 7 1 1 9 0 10 4 10 4 8 6 1 1 4 2 9 6 3 0 9 5 6 0 4 9 10 6 1 6 6 10 8 10 6 10 6 1 0 4 7 5 0 0 10 9 3 2 5  [...]
+1 8 8 10 0 8 9 6 2 8 7 5 5 9 4 8 9 1 4 3 8 0 8 7 4 7 3 2 9 5 10 0 8 4 8 7 6 10 0 2 2 5 9 6 10 4 7 5 7 8 0 10 4 5 8 8 3 1 8 4 9 10 4 9 10 1 8 4 9 4 2 5 7 8 2 8 10 2 10 8 6 3 0 8 2 1 5 0 10 5 10 3 5 9 9 3 9 1 4 1 10 7 5 3 8 9 4 4 1 2 4 8 1 0 7 4 8 2 9 6 5 0 0 1 9 2 5 3 4 6 10 7 2 9 2 5 2 10 1 5 3 6 2 3 10 6 10 5 0 4 1 7 2 8 1 9 1 2 5 7 6 4 4 10 7 9 2 8 8 1 4 0 6 3 8 1 9 6 10 9 7 5 0 9 9 3 2 10 10 3 4 3 10 1 2 7 5 6 0 10 3 9 1 8 2 10 6 2 10 2 3 5 3 6 8 4 10 4 10 8 4 4 8 5 0 3 6 6 7 8 9 2 7  [...]
+3 7 9 1 5 4 0 2 3 1 7 3 9 8 9 10 9 3 7 7 10 3 9 2 3 9 2 4 9 10 8 7 10 1 2 4 2 9 6 10 4 10 7 1 5 9 1 10 4 4 10 0 10 0 0 10 0 5 4 0 4 9 9 3 6 4 2 5 0 6 3 7 10 6 7 7 9 7 7 0 8 7 8 2 1 10 10 8 0 5 5 1 5 2 7 10 9 2 6 6 3 6 4 6 3 9 8 2 9 5 5 0 2 5 6 10 5 6 7 1 9 10 4 2 0 4 8 9 3 7 3 7 10 4 7 6 3 6 6 2 9 9 5 8 9 7 3 2 7 9 1 9 10 4 1 0 6 2 10 10 8 1 10 3 8 8 6 3 5 1 1 0 0 3 7 5 8 10 9 1 9 5 1 7 10 3 6 8 2 1 6 2 3 9 4 0 10 8 9 0 10 1 8 1 7 10 2 7 5 0 5 9 2 8 5 10 5 2 3 3 5 7 1 3 2 0 1 3 0 1 1 0 7 [...]
+9 3 4 4 4 1 0 1 7 8 3 1 5 5 7 8 5 10 1 3 4 9 2 6 4 6 10 10 4 8 5 4 5 0 7 6 1 1 5 7 0 1 7 5 10 7 2 6 0 6 5 1 1 4 6 1 6 5 10 7 7 7 2 9 5 0 7 8 0 8 10 4 7 0 2 8 7 4 0 10 3 4 7 0 3 6 1 6 1 2 7 8 4 5 2 6 2 0 9 0 10 10 6 0 6 5 3 4 9 5 0 2 8 4 0 3 9 6 1 8 4 2 2 8 1 4 9 6 0 6 6 7 0 8 5 0 2 3 5 8 0 3 2 0 9 0 4 2 3 0 3 4 8 8 6 1 8 6 9 10 8 7 5 10 8 7 5 4 6 5 0 6 9 2 7 9 10 2 9 9 7 1 9 3 10 4 0 6 1 9 10 9 8 6 7 10 6 3 7 4 0 8 3 6 9 0 5 9 5 2 5 6 9 9 5 6 6 2 7 6 7 4 9 4 7 2 4 8 6 8 10 2 2 9 8 6 8 0  [...]
+4 5 6 10 4 1 4 9 10 6 10 4 10 8 6 6 1 1 3 5 6 7 0 6 1 5 10 1 2 1 2 6 4 2 1 0 1 10 5 6 9 5 5 5 5 10 5 4 1 5 0 9 8 10 8 3 7 2 1 8 4 7 8 7 2 3 0 4 5 9 3 8 10 8 9 7 6 2 4 8 3 10 8 10 7 6 6 10 4 5 7 1 3 5 2 6 3 6 4 2 5 1 7 2 5 7 9 2 7 1 8 9 7 0 8 8 4 7 5 4 7 1 7 5 8 7 3 9 8 5 2 8 6 4 6 1 7 6 6 0 6 6 7 2 4 10 2 3 4 8 3 2 8 9 1 4 1 8 7 6 9 0 5 6 6 3 10 2 4 2 4 7 3 0 1 1 4 5 2 0 3 4 4 3 1 5 4 5 0 5 1 5 5 3 1 4 9 0 5 3 4 1 0 7 3 2 7 1 4 3 9 6 1 8 7 7 7 9 4 0 2 7 5 2 8 8 10 0 8 8 9 4 10 8 1 5 7 7  [...]
+4 2 3 9 7 3 1 10 4 3 5 5 7 6 7 8 4 10 10 4 3 7 9 8 10 5 8 5 2 0 9 2 10 6 1 2 10 9 9 9 0 8 5 5 7 0 8 6 5 3 3 0 4 7 3 3 5 7 4 4 7 7 5 1 2 7 4 0 9 3 10 9 1 1 5 8 8 5 2 1 9 4 4 3 7 2 1 6 6 2 2 5 6 0 6 10 0 1 2 8 1 4 6 4 3 9 3 10 5 1 0 2 2 1 3 8 6 0 5 2 1 6 7 5 1 10 1 8 0 8 7 7 5 1 6 4 5 5 1 10 1 6 3 5 3 4 7 4 5 2 3 9 4 1 3 5 5 9 5 8 3 0 3 0 0 9 10 8 3 4 0 3 4 7 9 9 9 1 1 3 9 6 4 10 0 2 2 3 1 6 9 10 6 8 1 3 7 9 5 9 7 6 6 1 6 4 8 9 7 1 7 1 2 3 5 3 4 6 6 6 1 0 0 5 9 9 5 7 1 2 8 7 9 5 6 9 2 5 1  [...]
+10 6 4 0 0 4 4 1 7 3 5 4 9 7 6 7 7 3 5 6 6 8 8 10 8 3 4 5 7 6 2 1 3 10 8 6 4 6 6 7 5 3 10 4 6 3 4 2 7 5 7 6 10 9 8 0 1 0 7 8 7 1 2 6 4 9 9 9 6 6 1 8 3 4 3 8 3 4 2 4 3 10 9 2 8 6 10 6 3 3 6 0 6 5 6 9 8 7 6 10 1 10 6 6 8 4 0 6 7 2 2 3 3 4 0 0 1 8 2 6 6 6 2 9 0 0 8 0 5 5 9 9 8 9 6 4 2 0 3 4 9 9 8 6 2 6 10 9 2 4 1 5 8 7 3 3 5 5 0 3 10 2 6 7 5 6 7 2 6 5 2 10 6 9 6 7 10 6 6 8 6 0 1 0 10 3 7 3 1 5 3 6 0 5 6 5 10 2 7 1 0 2 4 0 2 2 0 6 3 8 4 4 0 9 7 2 1 10 1 9 6 10 5 6 7 9 7 10 5 9 5 8 6 4 3 8 2  [...]
+9 7 5 6 0 7 3 8 6 3 8 8 10 6 7 10 7 1 1 3 7 1 3 10 0 6 5 1 2 10 7 2 4 8 0 4 6 9 6 7 0 10 6 3 1 8 1 10 0 10 6 6 6 10 6 5 2 2 8 6 3 4 6 6 8 10 8 9 7 5 5 8 8 0 5 0 1 7 5 9 2 0 9 9 7 5 6 1 5 2 9 10 8 10 0 7 4 0 9 5 9 6 8 9 2 5 4 10 5 8 7 2 10 2 2 5 6 4 2 7 2 10 3 5 6 10 4 10 2 8 0 4 4 5 0 6 9 5 8 0 4 3 4 0 7 3 2 2 2 3 2 0 10 9 9 10 3 6 0 7 8 4 10 5 9 10 4 6 9 9 4 9 3 4 6 1 10 6 7 0 2 7 7 6 0 2 10 1 10 4 4 4 6 4 8 0 4 1 8 1 7 4 1 9 5 8 8 8 8 5 1 4 0 3 5 4 10 10 3 0 5 4 9 0 9 1 6 9 2 8 4 8 1 1 [...]
+0 4 5 1 2 8 2 10 5 10 0 5 4 6 5 0 4 2 9 7 8 9 7 8 3 6 10 10 9 8 5 4 6 0 10 0 1 8 9 9 6 3 5 2 10 9 5 4 10 8 2 8 7 9 10 1 6 8 8 0 6 6 4 4 4 5 4 5 10 9 0 4 9 8 3 2 0 8 9 7 1 7 7 0 2 10 6 10 6 10 4 9 4 10 8 6 1 6 2 0 4 3 6 0 6 6 8 4 1 8 6 10 2 1 4 10 1 4 3 0 7 3 0 5 1 10 4 7 2 2 3 8 4 5 2 3 5 6 9 9 10 7 8 8 10 4 0 5 8 2 1 10 1 0 7 1 5 4 1 6 9 6 7 0 5 6 3 0 7 9 8 8 4 3 7 8 4 3 6 6 8 3 2 6 4 8 6 1 2 1 6 1 0 10 6 6 2 7 9 8 5 9 5 3 0 3 2 6 2 3 10 7 6 1 1 6 9 6 9 3 9 9 5 5 3 0 7 3 8 6 7 9 2 5 4 4 [...]
+7 6 10 0 0 2 7 8 9 3 2 4 10 0 5 0 8 9 5 7 6 7 10 1 6 9 2 3 4 3 7 1 2 3 0 2 9 8 8 7 9 0 3 6 7 9 3 6 5 9 6 2 7 0 7 8 0 7 8 2 7 4 2 6 0 8 8 8 5 9 8 2 0 0 7 0 5 6 5 8 9 8 8 1 7 5 6 4 0 5 8 6 8 0 6 9 4 9 9 5 1 3 8 0 4 0 0 8 4 8 1 7 6 10 1 10 5 5 10 7 8 2 9 0 10 7 7 3 5 9 7 8 2 9 5 9 4 2 10 10 1 3 10 5 5 0 6 9 8 10 0 6 9 5 8 5 9 4 5 9 2 3 8 9 7 5 4 9 6 6 1 5 3 10 4 4 1 0 0 2 2 4 7 8 4 3 7 1 1 2 8 0 2 8 8 3 1 8 6 10 1 7 5 1 4 2 5 10 7 7 7 2 6 2 9 7 0 1 10 1 4 9 4 9 9 6 0 4 0 3 4 6 10 5 5 10 5 5 [...]
+7 5 8 2 0 5 6 9 10 2 1 7 8 9 7 10 2 1 4 7 6 1 5 0 7 7 2 9 10 7 2 0 3 10 4 8 8 1 5 4 2 4 3 10 3 9 6 1 1 4 0 7 6 7 0 0 1 10 0 5 3 7 10 0 6 8 2 10 3 0 8 2 3 1 1 7 7 5 7 10 5 0 10 1 9 3 0 5 0 1 6 0 4 2 5 8 0 0 4 8 0 8 8 6 6 2 2 10 7 6 8 10 2 9 2 5 6 7 7 1 7 9 8 2 4 0 8 0 10 8 2 3 1 0 6 10 0 5 4 3 1 6 7 3 7 7 3 2 6 4 8 4 2 5 10 10 5 10 4 8 7 10 2 9 0 7 3 8 9 5 9 10 10 9 4 5 5 8 10 8 3 5 5 8 7 6 0 0 10 2 2 9 1 9 7 0 3 6 3 9 0 8 4 10 6 0 10 4 9 2 6 2 2 5 0 3 0 4 7 0 4 5 2 3 10 8 4 5 5 1 9 6 9 1 [...]
+2 0 5 3 0 4 9 8 10 3 4 0 3 7 7 7 1 2 6 0 3 10 5 7 5 6 2 2 1 2 6 7 4 7 5 2 1 9 6 5 9 4 0 4 6 6 7 2 6 7 7 0 10 6 10 2 1 3 6 8 6 9 5 1 3 1 1 10 9 9 3 4 5 4 8 5 1 7 8 8 0 10 9 10 9 8 7 1 8 3 4 7 9 3 10 1 9 6 0 8 5 1 3 9 1 10 9 8 3 1 3 8 6 0 9 0 7 6 9 0 6 1 3 5 8 2 2 0 8 5 3 3 7 9 0 5 9 9 5 10 3 9 9 3 3 1 1 4 3 10 2 7 3 4 6 7 2 9 0 4 1 4 6 10 9 6 2 9 7 6 5 2 1 3 6 4 3 7 2 5 9 0 1 5 9 1 2 6 5 5 8 2 7 6 9 3 10 5 0 2 7 3 10 1 0 10 0 2 6 0 3 9 7 0 0 0 0 5 7 5 5 7 10 0 5 1 7 7 3 4 5 8 8 8 4 1 10 2 [...]
+3 6 5 4 10 2 0 2 5 0 1 10 2 0 5 6 1 8 10 10 1 9 4 0 7 0 1 9 8 0 8 2 9 0 2 9 10 0 5 5 0 9 2 4 5 5 8 0 6 7 4 8 7 4 10 1 8 3 4 7 7 2 3 0 6 8 4 10 9 6 5 1 0 8 0 6 5 4 3 9 6 0 8 5 8 1 1 9 1 10 4 1 0 7 9 4 9 3 9 2 8 1 0 8 4 0 6 8 1 1 0 8 8 4 5 4 0 10 1 9 4 0 6 9 6 5 8 2 6 7 6 9 1 5 8 7 8 0 5 3 10 9 9 1 7 2 10 5 4 10 8 1 10 5 5 8 9 0 1 0 2 2 7 2 3 9 3 8 9 8 0 8 10 4 2 2 6 3 9 9 0 3 5 3 10 0 5 9 2 4 6 10 1 0 4 9 8 0 10 8 6 1 4 1 8 3 5 3 3 5 7 6 5 9 8 6 9 6 2 0 10 1 6 2 9 8 8 8 3 10 4 0 10 1 9 5  [...]
+9 0 10 0 6 9 1 9 7 5 7 10 6 2 6 9 3 10 1 4 10 10 10 9 7 0 6 6 0 10 2 0 8 3 9 7 7 3 5 10 9 7 4 4 5 8 3 10 2 0 3 6 0 7 10 6 2 6 1 10 1 2 10 6 1 3 8 0 7 1 8 8 8 0 10 4 3 10 0 10 5 6 1 4 2 6 1 3 8 0 9 1 8 3 0 3 5 3 3 7 1 6 4 5 3 9 3 7 4 0 4 5 8 10 5 0 1 9 0 8 0 4 9 7 2 0 4 5 3 10 10 4 2 3 6 8 1 6 9 3 10 4 1 6 9 0 6 7 9 1 5 3 8 7 2 9 0 10 3 10 9 9 4 3 10 9 3 3 8 6 0 8 5 4 2 3 4 3 9 9 0 5 9 2 6 3 10 8 10 2 3 10 9 9 10 1 5 8 9 8 8 10 0 0 6 2 3 0 8 2 5 4 8 3 10 2 4 7 0 6 10 5 10 5 8 7 3 7 8 2 7  [...]
+2 0 2 7 1 3 2 8 4 5 7 5 9 9 10 10 0 2 10 10 0 2 9 9 1 9 7 10 1 8 8 5 7 5 8 1 3 3 9 6 3 3 10 5 3 1 7 5 4 9 7 4 9 9 2 6 0 7 8 7 6 3 8 3 9 2 9 3 5 3 6 5 1 3 2 9 2 2 4 7 9 5 10 8 4 0 9 1 4 8 6 2 1 0 0 5 2 2 6 6 3 6 0 10 7 7 8 1 8 0 2 0 0 6 1 10 9 10 9 1 3 7 5 3 6 7 1 3 8 2 4 8 8 7 9 2 3 3 0 6 9 4 7 4 2 10 5 6 9 6 5 3 1 0 6 5 0 3 4 10 6 10 7 5 2 7 9 2 9 4 7 6 5 9 0 3 6 3 5 8 0 7 1 0 3 1 9 5 8 4 4 5 3 6 10 10 5 8 0 0 3 8 10 9 3 9 0 9 5 10 7 2 7 3 7 4 10 2 0 2 6 0 4 9 8 6 7 8 5 8 8 7 0 7 7 1 5  [...]
+1 9 3 4 9 3 2 3 8 7 1 1 5 9 1 6 3 6 3 1 3 2 4 3 5 4 9 4 7 5 5 5 9 7 8 10 5 0 9 6 4 3 8 2 0 9 9 5 1 3 5 7 1 8 5 6 6 2 1 10 10 5 5 2 8 0 9 1 8 0 0 7 1 9 4 1 2 5 6 0 4 2 9 0 6 0 4 7 2 9 4 4 7 9 5 8 6 4 9 2 8 2 3 7 2 2 6 5 5 6 10 5 4 9 5 1 4 9 9 9 1 7 3 5 10 3 1 1 3 10 1 6 10 7 9 5 0 4 5 7 4 10 2 3 3 0 1 0 10 9 0 0 3 3 6 3 1 6 10 7 4 0 4 9 0 1 7 1 10 1 4 9 0 0 1 2 1 9 5 6 4 6 8 2 0 1 8 3 2 9 3 7 2 8 1 8 0 8 5 2 0 1 5 10 3 1 9 4 3 4 2 4 0 1 4 3 9 8 7 1 4 4 7 2 10 10 10 5 10 6 3 9 4 7 2 8 1 10 [...]
+4 7 8 10 3 5 1 7 0 6 5 7 7 4 1 9 2 2 3 0 4 10 3 9 3 8 7 1 3 0 0 4 10 5 2 9 5 9 1 10 2 5 6 7 0 10 10 4 7 9 7 4 7 9 0 5 9 10 10 4 3 9 7 0 1 2 0 6 7 2 10 2 6 8 7 7 4 9 6 3 7 5 8 1 4 10 2 2 9 2 8 1 0 6 3 7 2 4 10 1 5 8 1 9 9 2 3 5 8 5 3 0 6 2 5 4 4 7 7 6 3 6 9 9 9 7 7 2 10 2 4 9 3 3 7 4 3 0 1 2 8 2 3 4 1 1 5 4 2 10 3 10 0 5 3 9 8 5 1 4 3 3 9 10 2 5 1 8 4 10 3 1 1 8 3 7 8 1 3 4 4 2 2 3 10 8 5 6 7 10 10 0 6 7 9 4 7 4 4 8 1 0 1 7 10 7 9 8 4 3 6 9 9 4 8 10 1 10 6 3 0 3 8 1 6 4 4 6 7 8 3 5 6 2 2  [...]
+3 6 4 5 6 9 5 6 10 0 6 2 3 2 8 6 9 5 8 1 8 1 7 3 1 5 7 5 7 3 10 5 5 3 1 10 5 10 8 4 10 10 4 1 8 5 2 1 2 6 7 9 9 6 8 2 10 10 0 6 5 9 5 0 2 8 8 10 7 9 5 0 6 8 4 4 2 9 4 8 1 6 3 2 9 3 7 0 0 5 4 10 7 1 4 1 0 6 1 10 7 7 4 3 2 7 0 6 0 9 6 3 2 9 0 0 1 3 10 8 3 10 2 9 7 2 8 6 3 5 5 8 3 3 1 6 9 3 2 2 4 1 4 7 5 10 9 6 8 6 6 3 3 3 3 3 2 7 8 8 6 4 2 4 9 4 2 2 4 4 5 3 1 7 4 9 9 7 2 7 8 9 4 6 9 4 7 5 10 9 0 9 0 7 10 5 5 2 6 9 10 6 9 8 6 5 8 2 2 6 6 0 6 6 4 1 8 0 4 2 10 2 3 1 7 6 2 1 0 0 5 4 10 10 9 6  [...]
+0 1 3 8 5 0 9 1 1 3 10 2 3 4 9 2 5 3 4 1 8 0 2 2 3 5 4 8 3 5 1 7 8 3 6 4 8 0 7 3 8 5 4 6 2 1 7 4 2 5 8 6 1 3 5 8 8 9 7 2 7 5 4 2 6 5 4 8 1 9 0 4 7 7 0 5 5 4 1 6 10 7 0 6 9 10 6 0 7 3 1 3 0 6 8 5 4 4 5 0 2 1 4 10 10 8 4 7 4 1 2 4 8 8 2 6 4 10 9 4 7 6 6 6 4 8 6 3 5 3 1 2 0 1 6 2 10 5 3 1 3 6 0 9 3 5 8 1 5 2 5 0 7 2 0 5 8 7 7 10 2 1 8 5 5 8 3 0 6 8 5 9 4 0 9 5 10 8 7 4 0 6 2 5 3 2 4 8 4 2 10 8 10 6 0 1 5 8 1 2 9 9 8 2 2 5 1 7 9 6 6 5 2 1 10 3 4 1 1 7 5 2 4 6 5 4 2 6 5 6 10 5 10 8 7 2 8 0 10 [...]
+7 10 4 9 9 3 9 1 5 3 2 3 2 9 6 7 9 10 2 3 10 2 4 7 2 7 9 10 10 5 9 1 8 10 7 4 2 9 4 3 3 4 2 4 7 9 2 6 3 1 10 8 9 1 8 6 8 2 2 4 1 5 9 1 2 1 0 9 7 2 9 6 8 6 10 10 0 3 1 7 1 0 1 4 6 8 4 2 2 4 5 0 5 7 5 6 2 3 8 3 4 1 1 7 3 4 7 7 9 0 9 5 3 1 9 10 9 8 8 6 5 4 2 2 6 1 2 1 7 6 5 1 10 0 7 7 0 3 9 6 5 6 5 6 2 2 4 5 0 3 9 9 9 0 1 0 4 8 10 3 10 0 7 3 4 3 1 2 10 6 10 9 8 0 1 6 7 1 3 1 10 7 2 2 2 8 0 3 6 8 7 10 10 4 10 9 0 8 5 8 6 6 8 0 7 9 6 10 9 5 4 3 0 10 4 4 6 1 6 10 8 8 4 6 3 3 5 8 9 1 6 3 7 9 0  [...]
+6 0 7 2 3 3 0 5 3 8 4 10 0 8 9 2 4 0 1 3 7 2 5 8 4 7 8 5 5 10 5 1 10 5 1 4 7 3 1 4 1 10 9 6 8 1 1 0 0 0 7 1 2 2 7 6 1 10 9 9 4 9 7 3 6 10 4 3 1 0 0 7 10 6 2 9 1 7 5 9 0 3 1 3 3 6 8 7 10 9 10 8 2 10 4 10 2 6 2 4 10 0 1 3 2 3 5 9 9 8 7 1 7 3 5 9 3 5 5 10 8 9 4 2 0 0 10 9 4 4 0 4 9 9 10 0 9 2 7 10 0 4 2 8 2 6 6 4 6 8 2 1 6 4 5 9 5 4 8 4 9 1 3 1 10 10 5 6 5 0 2 5 0 6 9 3 7 3 4 10 9 4 0 9 6 4 2 7 8 3 4 7 9 3 9 10 0 8 0 0 1 9 1 5 3 2 6 5 3 5 9 3 9 5 7 4 1 8 7 5 9 6 7 3 10 8 7 7 2 9 1 0 0 3 8 7 [...]
+1 6 6 10 5 1 8 2 9 1 7 10 10 3 2 1 3 8 5 1 7 4 2 10 7 10 9 2 10 4 2 4 10 9 6 2 2 3 2 10 9 6 8 7 4 4 2 4 8 9 1 1 9 2 10 2 10 2 5 1 9 8 5 9 1 3 6 0 10 7 3 2 4 7 2 9 4 10 9 3 10 7 10 9 5 6 4 0 3 9 9 0 9 5 1 7 9 6 2 2 2 4 3 7 4 2 3 0 8 0 8 7 8 1 2 8 10 1 8 5 5 9 3 5 7 9 6 2 6 1 10 4 7 0 6 10 3 6 7 8 9 4 3 2 9 7 1 8 3 4 3 6 10 6 9 1 10 0 10 9 7 1 7 1 6 10 2 2 7 7 1 8 6 5 9 10 1 3 10 3 7 6 7 2 4 9 3 10 8 2 8 6 5 7 8 6 0 7 6 0 0 4 2 2 1 6 10 10 8 8 0 10 5 10 7 2 0 0 6 3 9 2 2 4 8 3 7 5 9 5 7 5  [...]
+7 9 3 0 8 3 2 5 2 1 2 0 6 3 2 0 0 0 3 1 9 10 6 9 9 8 1 9 10 8 2 10 8 6 1 7 9 4 0 6 10 5 6 7 5 9 5 9 9 9 8 0 5 0 5 8 8 10 8 5 1 3 1 0 3 1 6 3 5 9 10 7 7 0 8 1 7 9 6 3 0 8 0 9 7 4 10 5 8 7 5 6 3 3 1 8 5 3 1 5 3 6 4 3 7 6 4 3 4 6 4 3 4 4 6 7 0 0 8 9 6 4 9 1 0 8 6 7 2 0 7 8 1 2 10 10 8 8 6 4 4 9 6 3 0 8 4 8 8 10 9 7 8 8 9 6 6 6 9 9 9 6 1 2 7 0 4 1 10 0 9 1 1 6 8 6 10 10 6 9 9 7 2 4 2 10 4 9 4 8 7 6 7 2 5 8 5 2 3 2 5 2 0 6 5 3 6 0 6 10 1 0 0 7 1 3 4 8 4 8 8 8 0 0 0 0 8 3 5 2 5 9 9 3 3 8 0 3 1 [...]
+5 1 1 6 0 10 10 8 7 10 7 6 2 10 7 4 0 3 4 2 9 7 5 4 0 6 8 7 4 6 3 7 1 8 10 2 7 1 2 4 10 5 1 1 6 8 7 5 3 6 0 4 4 10 9 9 8 5 6 9 9 0 0 8 8 4 10 7 1 4 4 6 1 8 2 1 10 1 8 6 10 8 7 4 5 10 5 2 8 4 4 8 3 8 1 10 3 5 8 1 10 7 0 5 6 10 6 4 10 9 2 6 2 6 10 1 6 1 4 9 8 2 6 3 9 8 1 3 0 3 4 1 5 4 1 1 9 4 10 4 1 4 6 9 1 8 0 6 7 0 9 5 10 10 5 2 2 3 1 6 4 9 0 4 7 7 7 10 6 6 7 6 6 10 10 7 6 7 8 10 4 4 5 0 1 7 10 8 0 3 4 6 7 1 7 9 9 10 7 8 10 6 3 10 0 4 9 1 5 6 8 10 6 4 10 9 8 3 0 10 1 7 3 8 8 2 5 10 5 8 1 [...]
+8 0 3 9 2 5 4 9 3 10 2 2 2 8 4 8 8 9 10 9 8 6 10 0 6 7 0 9 9 7 7 4 1 7 9 7 10 3 10 8 0 7 5 4 2 9 6 8 10 5 4 7 2 1 0 0 7 5 0 2 5 8 0 3 3 7 7 5 8 8 5 8 3 2 8 6 7 9 0 0 6 5 4 4 2 3 6 6 10 8 0 7 8 1 8 1 6 7 0 5 6 5 10 6 3 0 4 2 1 8 2 5 5 5 1 5 5 0 4 2 5 9 3 2 1 5 0 6 9 2 0 3 1 6 9 3 2 10 5 8 2 6 6 2 1 4 6 7 10 6 2 5 3 7 1 0 8 7 8 4 3 4 0 7 10 6 8 1 10 5 10 6 4 8 9 2 1 9 9 6 7 2 7 1 0 7 7 9 1 2 10 1 3 4 5 10 2 6 3 5 2 0 1 5 6 5 4 5 9 5 5 2 6 2 0 8 2 6 3 2 4 8 4 9 7 1 10 10 1 3 2 4 6 6 10 4 3  [...]
+0 5 9 9 8 3 5 3 6 7 7 3 4 9 3 8 6 1 10 8 10 8 7 6 9 8 7 1 9 10 0 5 10 10 4 6 6 4 2 5 0 3 3 6 9 3 9 3 6 0 3 10 1 8 4 5 5 10 0 5 3 0 0 1 8 3 7 1 3 5 4 7 6 2 10 2 8 8 8 6 7 6 9 10 6 5 1 5 1 9 3 2 2 5 2 9 5 2 9 10 10 4 10 7 2 1 4 6 2 10 1 9 7 6 6 9 10 2 2 7 9 2 0 9 2 4 2 4 2 2 9 6 2 4 0 3 8 0 6 5 5 0 5 0 6 5 0 9 0 8 3 8 4 4 7 7 1 7 4 2 2 4 3 5 8 10 2 2 8 1 4 7 2 7 4 3 1 0 8 2 1 3 8 0 6 7 4 10 6 8 0 6 2 6 10 6 10 9 7 10 7 9 8 6 4 7 10 0 0 9 6 10 10 4 4 8 2 8 7 0 5 2 4 8 4 2 5 6 9 0 1 7 10 10  [...]
+5 0 2 7 8 5 9 7 5 0 0 5 0 6 3 3 9 1 0 4 5 10 8 7 9 4 3 4 2 10 2 3 0 2 6 4 3 5 2 5 2 6 8 0 6 3 4 9 4 3 6 5 1 7 7 1 1 3 8 9 3 9 7 4 4 9 4 9 10 9 4 6 1 9 6 1 8 7 10 7 0 5 0 9 6 10 8 1 4 5 5 4 5 7 0 3 2 0 7 8 1 1 8 10 3 1 4 9 5 10 8 5 2 5 3 0 5 1 10 6 1 7 10 1 2 2 5 6 0 2 6 6 4 10 4 4 8 8 10 7 9 5 0 3 6 10 0 1 10 5 4 3 7 7 8 2 4 10 10 3 0 7 4 10 5 5 5 10 4 8 7 1 6 8 5 0 2 1 1 3 8 4 7 10 9 7 4 3 2 7 7 3 2 4 10 5 3 7 5 2 0 8 4 1 0 3 3 9 1 1 7 2 5 0 6 6 8 1 5 0 3 9 9 7 2 5 2 8 9 9 1 5 7 0 10 1  [...]
+4 7 0 1 6 5 9 7 3 10 10 0 8 2 5 7 10 8 8 8 2 5 1 1 8 4 3 9 3 3 0 4 6 4 4 4 3 5 6 8 0 1 9 0 6 0 9 9 4 5 4 1 3 3 3 1 2 8 3 2 3 9 5 2 5 6 2 1 2 3 9 6 4 6 3 5 7 10 10 6 10 8 0 6 4 6 7 5 4 2 10 5 1 0 1 10 6 0 8 6 2 0 4 4 9 2 3 0 10 8 1 1 9 6 1 5 2 9 9 0 6 8 5 7 6 0 7 0 0 3 6 4 7 7 1 5 7 7 6 7 2 4 3 8 2 3 10 10 7 9 3 7 0 3 10 0 2 0 4 3 4 2 3 5 2 1 1 7 2 2 0 0 1 5 0 10 5 2 9 3 0 9 2 3 6 1 8 5 6 6 5 4 5 3 9 10 2 7 3 7 6 5 2 3 10 7 8 9 10 1 9 8 7 0 4 2 4 8 9 6 6 1 8 2 1 9 0 1 5 6 1 5 8 6 7 6 5 9  [...]
+3 7 2 0 5 10 7 10 0 10 2 7 5 8 2 0 10 8 0 3 3 1 9 8 7 0 7 1 1 3 4 7 7 6 4 2 1 4 2 8 0 0 1 4 2 6 0 5 7 1 6 10 7 8 4 7 9 9 6 5 4 4 2 2 2 1 10 5 4 5 9 4 0 0 2 9 6 1 9 8 3 8 6 0 6 4 1 1 8 2 8 10 10 4 10 8 8 9 6 4 3 6 0 9 9 3 7 9 5 6 8 10 3 0 3 4 0 8 2 6 9 3 8 5 6 7 4 8 8 7 4 0 8 2 3 3 1 8 10 2 7 9 4 6 9 1 4 1 5 6 4 4 5 5 4 10 7 0 1 0 5 1 10 2 3 0 8 8 7 4 8 9 3 1 9 3 7 6 2 5 0 7 4 5 5 6 7 9 1 1 6 10 1 6 5 7 2 6 4 4 4 4 1 7 10 1 7 1 9 0 9 7 9 2 9 5 5 6 1 1 8 2 2 0 5 3 8 5 9 6 9 5 7 0 1 5 7 4 9 [...]
+3 2 0 8 10 10 4 8 0 5 8 0 8 1 0 3 5 9 0 0 0 3 1 6 2 6 7 7 7 4 8 2 1 7 2 2 0 9 0 3 10 1 6 3 9 3 10 8 3 4 5 9 3 1 6 5 9 7 1 1 7 2 1 10 7 10 7 3 1 6 7 5 4 10 2 2 4 6 7 2 4 0 3 1 3 8 3 6 0 9 7 9 4 7 3 10 4 3 9 7 0 1 1 9 8 6 2 0 6 6 6 3 1 0 4 6 6 8 8 10 0 9 10 9 5 7 9 2 0 4 0 4 5 8 0 6 6 6 3 0 6 2 1 4 3 1 9 2 0 7 9 8 8 0 5 10 0 1 8 8 4 1 0 3 3 3 3 7 8 6 5 4 6 6 1 10 7 2 9 1 6 4 1 8 5 2 8 8 0 1 1 8 10 10 0 0 0 0 9 5 9 2 10 6 7 8 10 6 5 8 1 4 5 2 2 4 7 0 7 0 9 10 1 6 6 9 9 8 6 2 0 9 10 9 1 4 2  [...]
+5 0 9 0 8 9 10 3 4 5 2 0 10 9 8 2 8 2 10 6 8 10 0 2 0 7 10 8 0 5 6 4 7 10 8 8 0 9 4 10 7 6 3 6 3 8 0 2 2 9 9 10 7 2 10 3 9 2 10 9 0 7 0 8 2 4 8 3 3 5 3 5 4 8 5 3 10 3 1 3 8 9 10 9 0 4 1 0 7 9 2 7 4 0 1 7 10 4 7 7 6 5 9 0 0 6 4 8 8 3 5 7 7 1 8 1 10 1 5 8 1 7 3 0 7 7 2 3 0 4 4 3 7 3 3 3 1 6 9 6 2 2 4 10 8 3 9 10 8 8 5 1 2 2 7 3 6 6 5 6 4 1 9 10 4 3 6 3 3 0 6 6 1 5 9 4 10 7 3 4 7 2 8 3 0 8 10 7 3 10 2 10 9 1 8 5 3 5 8 0 10 8 8 4 6 4 2 3 7 8 9 3 6 1 10 10 1 5 3 9 9 9 9 9 10 10 10 4 2 4 3 3 9 [...]
+1 8 6 1 10 10 3 4 3 5 5 6 2 3 9 4 4 3 6 9 10 8 3 3 4 3 6 8 5 5 10 1 1 6 5 8 4 5 9 1 7 6 9 5 6 4 3 7 5 4 4 10 2 5 4 9 2 2 5 1 1 6 3 6 3 4 10 6 3 9 6 9 9 8 10 2 1 7 1 3 9 10 8 4 5 4 9 5 2 4 5 10 8 6 2 1 9 3 10 6 1 3 7 0 10 2 6 4 7 7 0 4 8 2 2 7 9 7 8 3 9 7 1 0 6 3 2 7 7 8 3 5 9 7 1 5 9 6 5 5 5 0 9 9 3 5 6 8 8 5 6 5 8 9 5 1 10 1 3 8 3 5 5 1 8 10 7 1 10 3 7 0 7 0 1 1 3 1 3 7 4 6 10 5 4 6 10 0 10 4 10 10 7 10 8 8 7 5 6 6 1 2 5 8 9 6 1 7 8 1 8 0 3 0 2 4 7 9 7 6 6 3 3 6 9 2 10 10 4 3 7 0 9 4 10 [...]
+6 7 10 5 7 5 2 9 8 9 1 0 0 4 10 5 2 10 8 0 5 6 0 2 6 2 5 5 8 8 5 10 4 4 2 1 8 0 6 8 1 10 7 0 10 5 8 1 9 10 1 0 2 9 6 4 8 1 0 7 9 9 9 8 3 0 9 0 9 5 2 9 7 10 4 0 3 8 10 7 6 6 3 0 5 10 9 3 9 1 4 5 10 9 3 2 5 1 8 10 7 2 7 2 9 4 6 2 9 7 5 6 5 3 3 0 8 0 8 0 2 10 9 5 10 6 6 1 8 7 1 7 6 6 3 1 10 8 1 6 6 2 10 1 4 3 2 7 4 8 2 4 9 1 7 6 1 5 10 3 1 4 8 0 9 2 0 2 10 3 6 0 8 9 7 4 2 10 9 10 10 5 0 1 2 0 6 3 6 7 6 1 0 4 6 8 6 0 7 0 5 0 4 1 5 4 5 6 7 5 6 0 7 10 6 0 9 0 10 7 3 4 5 8 7 8 7 10 4 9 9 3 2 7  [...]
+9 0 7 8 4 0 6 10 9 6 9 2 6 7 7 9 1 4 9 9 0 2 0 7 10 1 3 0 1 2 8 7 2 4 4 8 0 7 4 8 4 5 5 9 4 3 2 8 2 10 3 8 1 3 10 7 9 4 2 8 10 5 0 2 3 6 10 10 2 3 8 10 7 10 9 1 3 10 9 9 0 8 1 5 0 6 4 9 0 4 0 3 5 2 6 2 1 4 8 6 0 8 6 4 1 3 0 7 7 1 10 0 10 9 7 5 3 7 0 2 5 10 5 7 3 7 1 4 9 2 10 2 8 1 1 3 4 7 7 7 10 3 10 0 3 5 9 4 8 1 3 4 8 6 1 10 2 7 1 1 5 0 6 6 5 9 4 1 7 7 6 9 3 3 7 1 10 0 4 3 10 4 8 7 3 7 2 8 6 0 6 5 3 1 5 6 8 7 1 8 9 10 3 1 0 6 0 5 7 10 1 0 8 7 6 1 3 7 8 2 9 3 2 10 5 9 7 5 8 9 5 1 5 4 8  [...]
+6 10 4 6 1 3 1 1 3 4 3 2 7 7 1 1 10 3 8 4 0 10 5 9 7 8 5 7 7 7 4 6 8 7 2 1 4 8 6 5 7 9 10 0 0 4 1 3 9 2 9 1 8 3 2 4 9 0 4 6 0 7 8 2 2 3 3 10 3 10 5 1 2 3 0 0 8 0 0 1 10 2 2 8 2 7 9 3 6 9 8 3 1 0 5 3 1 0 10 8 4 2 1 7 5 0 5 5 5 2 10 4 8 4 1 3 6 6 10 6 0 8 4 3 0 5 6 0 10 5 7 2 9 9 0 5 8 6 3 8 5 6 8 8 1 1 4 4 5 5 10 5 5 8 5 3 6 5 1 9 0 5 1 9 1 9 6 2 10 6 8 3 0 3 4 7 1 6 2 9 6 2 0 1 0 0 5 8 7 10 0 9 1 1 5 3 8 2 4 5 4 5 2 7 9 1 1 9 6 1 8 8 5 0 1 9 3 7 0 7 1 6 9 9 8 9 8 0 7 4 3 2 7 6 1 7 4 10 1 [...]
+10 1 7 6 3 0 2 6 5 7 1 4 10 5 10 7 3 7 1 10 2 9 7 2 1 2 9 1 4 3 3 2 9 7 6 4 3 3 2 10 1 10 7 4 10 4 7 3 4 1 0 8 8 10 5 10 2 5 9 6 5 2 2 3 6 1 10 9 9 0 1 3 4 10 10 0 2 0 2 10 2 0 2 8 0 10 10 10 9 9 8 6 8 0 6 2 6 2 6 4 5 1 3 5 3 9 10 1 9 9 6 3 3 9 5 0 6 6 1 7 2 1 10 2 3 8 1 6 4 9 10 9 5 8 7 6 8 10 3 5 6 0 6 6 6 10 8 2 2 9 7 0 6 8 3 7 10 3 2 10 1 7 10 10 0 0 7 2 8 8 8 9 5 3 1 10 9 8 9 3 2 4 0 7 4 7 6 4 8 6 4 10 1 9 1 8 9 6 0 2 4 9 3 4 6 0 1 2 8 4 8 2 4 9 2 4 2 6 3 10 1 3 2 4 4 1 9 5 7 3 2 2  [...]
+3 7 8 6 8 4 5 4 7 5 6 0 1 6 4 1 8 0 10 0 1 0 2 1 2 3 7 2 3 8 3 10 7 8 1 3 7 2 3 2 5 7 0 8 4 6 6 5 3 5 9 1 7 2 3 7 0 7 0 7 8 10 5 5 1 0 0 7 1 9 8 5 5 1 5 7 6 2 6 2 7 8 6 7 6 2 1 9 3 8 9 9 10 1 0 6 7 8 10 2 0 4 0 6 6 2 9 9 3 9 8 7 6 10 6 10 2 2 10 10 3 10 8 5 2 10 3 7 10 5 8 2 8 5 0 3 5 3 9 9 3 8 7 0 8 10 6 10 0 8 6 8 10 4 3 3 5 1 5 8 4 10 1 5 2 8 6 5 8 9 1 10 2 10 8 1 8 1 0 1 6 7 1 3 3 9 3 2 0 2 8 5 7 5 6 2 5 8 4 8 2 0 2 2 8 6 8 10 1 3 0 10 5 7 5 3 6 1 3 10 8 9 3 5 7 6 8 5 6 0 6 9 4 0 9 1 [...]
+2 0 2 9 4 0 4 1 0 3 7 5 5 4 6 7 9 9 7 3 6 7 10 8 6 10 6 10 6 1 8 10 8 3 8 1 5 1 8 7 0 3 4 9 6 7 6 8 3 7 3 1 9 5 9 6 9 6 2 7 3 2 8 5 8 8 6 5 9 6 6 7 9 6 10 10 10 10 10 10 4 1 5 8 3 5 4 6 4 8 2 7 7 0 4 2 6 3 2 6 5 9 5 10 1 6 3 9 2 7 9 0 2 7 8 0 4 8 3 7 2 7 2 2 1 7 6 1 4 9 8 1 9 1 10 0 8 1 2 9 10 9 7 10 9 3 8 5 3 1 9 4 3 9 6 7 0 10 1 4 5 4 2 7 5 5 10 5 0 3 10 4 1 8 7 6 5 8 9 4 2 2 10 6 1 6 2 8 9 5 3 6 9 6 9 2 2 3 4 8 6 1 2 8 6 1 4 7 8 2 0 5 1 5 4 6 7 1 8 8 2 6 6 0 4 8 4 6 6 2 5 1 2 4 7 7 0  [...]
+5 9 10 1 4 6 9 7 4 0 0 3 1 0 1 3 4 6 3 8 7 1 6 0 9 8 2 5 1 0 4 9 7 1 10 9 2 0 7 2 7 7 2 1 3 3 1 3 2 3 8 4 8 10 8 4 7 6 0 10 7 0 5 3 5 5 7 3 0 10 7 6 3 9 7 4 9 9 0 0 2 8 0 6 3 6 1 4 3 5 5 6 5 4 6 10 9 0 8 9 7 7 0 5 6 0 0 3 1 6 1 10 5 1 7 3 10 1 10 10 4 3 10 2 2 0 1 1 2 3 8 0 7 2 5 3 0 9 8 9 1 0 9 6 7 0 2 0 0 0 4 5 2 2 7 1 1 4 8 8 5 3 1 4 3 10 0 1 7 9 5 7 5 3 9 6 10 10 5 7 5 2 1 9 9 5 6 0 1 0 0 2 2 0 9 6 10 1 1 5 2 4 4 9 5 3 1 10 6 2 7 8 7 6 5 7 2 8 10 5 4 6 4 1 6 5 5 0 8 4 8 2 6 9 3 10 9  [...]
+10 10 10 7 6 8 5 6 4 1 9 9 7 9 4 3 4 0 2 6 10 4 5 5 1 5 9 5 6 4 3 0 3 3 2 6 2 2 0 4 9 1 3 10 9 4 8 9 9 5 8 0 10 7 6 4 0 4 10 6 3 5 1 8 1 6 7 10 4 10 4 8 4 6 4 9 8 1 1 9 5 9 7 5 1 9 1 6 5 3 3 3 7 3 0 10 4 5 6 4 10 10 5 4 0 2 6 10 3 10 2 3 6 2 9 4 3 6 1 0 6 3 3 0 6 4 0 1 5 8 4 1 8 6 6 7 10 9 6 8 9 5 9 6 9 4 4 8 2 1 8 2 9 9 8 8 1 6 6 9 8 1 2 8 6 1 0 3 3 4 7 5 10 8 2 4 8 9 1 0 5 9 2 9 9 9 1 9 9 2 3 9 10 7 2 1 1 7 9 0 4 5 7 3 2 0 8 6 0 1 10 0 1 1 10 9 0 7 4 9 0 8 0 9 9 8 3 4 8 1 3 3 6 5 0 9 0 [...]
+2 6 8 5 9 4 7 9 0 7 0 10 6 0 4 3 9 1 3 1 9 9 5 5 8 3 10 3 1 8 1 5 3 3 8 9 3 2 0 2 7 0 2 1 5 4 4 8 1 5 9 9 4 5 5 5 0 7 0 1 9 6 7 9 6 5 5 5 4 5 5 3 3 3 3 0 2 0 9 5 3 3 0 8 0 9 4 2 3 0 6 0 7 2 5 10 0 4 8 9 8 4 5 9 1 9 7 2 8 0 9 0 0 10 2 5 0 9 6 9 6 1 0 6 0 10 3 3 0 7 2 8 5 2 6 7 2 1 10 0 4 0 8 0 8 9 7 1 4 0 8 9 1 5 7 4 9 1 2 0 6 9 5 8 10 9 3 5 0 7 1 2 8 8 0 0 9 9 2 6 8 0 2 4 0 1 10 0 1 4 7 10 3 4 6 6 9 5 6 7 0 6 6 6 0 5 5 6 1 0 10 5 1 0 1 2 1 7 10 3 2 5 5 9 10 2 9 0 0 1 2 2 6 8 5 8 8 9 0 2  [...]
+7 5 9 3 7 5 4 0 9 9 4 2 5 6 9 6 6 3 4 2 7 2 10 6 6 0 6 4 4 1 2 7 10 8 1 7 8 7 6 8 5 9 2 9 4 0 8 6 2 2 2 3 2 1 2 7 2 4 5 9 7 8 5 2 5 2 10 3 6 7 9 9 4 0 4 10 3 6 9 2 6 6 1 10 4 9 10 6 4 7 10 2 4 4 8 2 3 4 7 9 7 7 3 1 9 4 7 3 9 10 0 0 3 2 5 9 2 4 3 10 0 1 7 0 7 2 10 10 5 3 0 6 5 3 6 7 9 3 5 4 1 10 10 5 6 8 0 10 1 7 2 0 7 10 8 4 0 8 3 1 5 9 10 2 3 5 10 3 8 9 5 3 9 9 6 7 7 1 2 3 10 1 8 3 5 6 2 1 8 3 1 6 0 4 10 4 8 6 7 7 2 10 0 10 3 5 10 3 9 9 9 1 6 5 4 0 3 5 6 8 0 10 7 8 4 5 2 4 9 4 2 6 5 7 1 [...]
+6 4 0 8 3 10 1 6 8 1 4 8 0 9 1 0 10 8 2 10 3 2 7 9 7 10 1 9 10 6 5 4 4 0 0 2 8 5 2 3 9 4 5 8 1 4 10 10 1 9 4 0 9 1 8 3 3 4 4 0 5 1 2 2 3 5 3 8 7 0 10 8 3 6 6 2 2 3 5 3 9 0 10 5 2 2 2 7 5 3 10 1 5 2 1 7 5 0 8 0 10 2 4 0 9 6 9 3 10 8 8 10 10 5 2 6 6 0 9 5 5 4 10 6 5 6 0 1 3 3 9 4 8 0 6 10 9 0 5 3 0 10 6 10 1 4 9 4 4 10 0 9 2 5 2 4 10 2 6 1 6 1 10 4 2 4 8 5 8 6 8 10 1 2 5 5 0 5 8 5 1 4 2 4 6 3 8 2 7 8 2 9 1 8 10 3 1 10 2 1 10 7 1 3 8 4 8 8 6 8 7 9 2 2 2 0 6 1 10 2 3 7 2 0 8 8 10 4 10 7 5 4  [...]
+6 0 7 2 7 5 1 5 1 9 4 5 3 6 0 9 5 8 5 5 10 8 4 0 8 10 2 0 8 0 5 0 0 8 6 7 9 1 0 5 9 6 1 6 4 6 4 8 1 2 10 9 4 1 9 7 4 0 6 4 9 1 9 0 8 7 4 4 2 7 2 9 5 3 10 3 8 0 1 5 3 5 9 1 1 8 8 2 0 4 5 9 10 9 10 4 4 1 2 10 9 5 0 8 1 7 5 8 5 1 10 9 8 2 4 7 6 2 9 1 5 5 1 6 0 4 2 7 5 6 1 1 6 7 9 2 4 7 0 5 0 6 0 3 9 6 6 0 10 8 5 8 0 9 4 3 3 0 9 2 5 5 0 6 3 5 3 0 3 4 9 7 2 7 1 1 6 1 3 6 5 9 1 3 7 8 7 4 9 0 6 1 8 5 3 8 0 1 6 1 5 10 4 10 1 10 6 10 2 1 6 0 1 10 4 3 4 6 3 2 7 0 0 0 0 3 5 10 10 4 1 0 8 1 2 0 0 6  [...]
+4 2 8 1 5 1 8 8 1 8 2 3 0 5 4 4 2 5 2 9 1 8 10 6 0 5 5 4 6 7 1 1 5 1 3 1 3 2 10 8 9 3 6 10 0 5 5 2 4 2 4 3 5 2 3 7 6 8 4 2 3 3 6 4 0 6 4 10 4 0 9 9 7 9 4 1 1 8 2 8 5 7 5 7 8 2 9 7 0 2 0 9 0 3 0 10 5 5 0 9 9 3 10 2 4 8 10 6 9 5 4 4 6 1 9 10 8 4 4 7 8 6 2 3 2 5 2 2 8 3 3 10 9 3 9 10 3 9 6 3 7 10 8 4 4 5 9 2 8 3 7 10 3 10 10 2 2 4 5 0 5 6 3 6 6 9 5 6 8 5 6 0 4 3 8 1 3 2 2 0 5 4 1 8 10 7 9 5 2 9 4 10 9 2 1 5 6 3 5 0 7 8 1 9 10 7 9 2 9 6 5 3 10 2 5 1 2 2 8 0 0 5 5 0 8 0 7 0 9 8 2 8 2 9 1 9 1  [...]
+0 0 1 2 9 7 10 8 1 9 8 10 10 8 5 3 1 5 1 9 8 8 0 5 3 0 2 10 4 7 8 8 8 7 4 1 8 5 1 10 3 5 7 9 6 2 3 1 3 2 8 7 2 4 3 0 7 8 1 4 5 1 4 0 6 10 2 5 7 10 5 3 1 3 0 8 3 3 10 6 10 1 3 8 7 2 10 7 4 10 10 6 6 8 9 1 5 6 1 9 9 10 2 0 8 3 6 1 9 4 5 6 9 5 3 2 8 1 8 8 4 3 4 2 7 9 3 1 10 6 5 0 7 2 0 6 2 6 0 0 7 2 10 6 7 0 10 0 2 9 8 8 3 0 10 9 7 0 10 6 4 1 4 4 2 6 0 7 2 2 5 10 2 7 9 6 0 9 5 1 5 3 1 3 3 2 8 10 10 4 7 10 5 5 6 3 0 1 6 2 3 6 10 3 0 4 7 9 9 4 1 10 2 9 0 9 7 6 10 10 9 4 0 9 3 5 0 8 1 7 9 10 6 [...]
+0 4 4 4 10 7 5 1 7 6 4 5 10 1 9 8 1 4 1 8 9 1 1 8 5 1 10 5 3 1 7 8 3 4 8 2 10 10 4 6 10 9 6 6 7 5 2 2 0 3 10 8 7 2 2 4 2 9 7 2 0 3 4 9 4 9 10 10 2 2 0 8 7 8 4 6 2 0 9 1 2 6 4 8 4 10 2 6 4 8 8 5 8 1 2 8 8 10 5 6 1 4 0 2 1 7 10 5 9 6 1 10 8 10 9 9 10 7 4 6 6 10 0 9 5 8 0 9 1 7 1 2 6 4 8 3 5 8 9 4 3 0 4 2 7 0 10 0 2 2 10 8 1 1 0 10 2 2 6 0 6 5 2 1 10 5 8 3 8 4 6 1 7 8 2 7 5 9 2 6 7 9 3 7 5 7 5 7 1 3 10 6 4 4 7 5 1 6 6 1 9 4 7 7 0 1 10 7 0 10 3 9 0 3 10 8 6 9 7 6 7 5 0 1 7 10 8 7 4 3 1 3 9 7 [...]
+0 10 8 1 2 8 8 1 7 9 9 8 5 9 1 0 3 7 6 6 6 9 2 8 5 8 3 1 2 1 7 1 7 0 1 2 7 3 5 9 8 3 7 9 4 3 0 6 4 7 7 3 1 0 8 0 7 10 10 8 0 4 10 1 10 3 10 7 9 6 2 4 9 10 9 8 2 2 8 5 4 10 4 7 4 10 3 6 6 7 1 3 8 1 10 9 4 4 1 3 1 5 4 2 5 3 2 2 7 3 4 5 5 1 8 4 10 3 10 3 5 6 5 1 10 10 8 2 5 3 4 4 0 9 9 7 3 8 6 5 9 0 4 6 9 1 5 2 5 6 7 4 10 3 10 3 5 9 3 7 10 0 8 6 3 4 4 10 8 1 8 10 8 5 3 2 10 9 3 2 8 7 6 7 1 9 9 10 4 4 1 8 9 0 5 4 2 5 7 10 8 10 9 6 3 1 5 9 0 9 10 9 0 7 10 3 3 1 0 4 9 0 1 4 6 7 10 8 0 9 3 6 5  [...]
+6 5 0 0 1 3 7 5 10 3 3 10 2 2 0 1 4 3 1 10 9 2 4 9 5 9 3 4 3 3 6 1 10 1 7 5 1 7 10 8 1 4 3 8 5 5 10 6 1 0 10 6 2 1 6 1 0 5 9 2 10 9 2 3 2 8 10 8 0 4 4 4 3 6 3 7 8 8 10 5 1 8 3 8 7 9 1 4 5 4 0 1 5 9 3 1 2 1 10 3 4 10 6 6 9 7 1 6 3 3 7 8 3 9 8 7 3 0 4 10 1 5 2 7 4 9 9 5 7 8 3 9 0 4 6 7 1 0 9 5 4 8 0 7 6 8 0 5 7 3 1 6 5 7 9 1 10 9 10 7 2 1 5 3 0 9 10 5 8 10 2 9 3 9 4 4 8 9 4 8 6 8 6 7 6 5 1 6 3 4 2 10 9 3 10 6 6 9 10 6 10 8 7 2 1 1 0 0 1 7 4 9 8 5 8 0 7 4 7 10 4 6 2 9 10 4 1 4 7 3 1 8 6 4 4 [...]
+4 1 6 7 9 0 4 7 8 7 6 1 2 2 0 3 5 6 1 2 6 9 5 8 5 8 1 5 9 7 0 9 1 4 0 4 2 1 1 1 5 0 7 2 2 7 6 10 6 5 10 2 6 9 7 8 2 1 9 4 1 4 1 4 3 7 3 1 9 9 10 5 0 10 8 3 4 2 4 8 4 7 6 9 7 10 0 4 5 4 3 7 5 8 1 1 5 5 0 2 4 0 3 3 1 9 1 7 4 8 5 5 5 8 6 2 4 3 2 6 3 4 4 6 4 7 1 8 1 4 5 1 5 5 9 6 1 2 8 5 3 3 0 0 1 0 6 3 7 7 6 2 7 3 10 4 1 3 2 2 0 6 0 3 0 7 1 1 9 2 4 2 10 6 9 5 2 0 10 8 6 7 7 9 3 8 1 10 3 7 6 10 10 10 3 10 8 10 2 7 10 10 10 2 9 0 9 2 3 3 4 3 10 10 3 3 1 7 4 10 2 8 10 4 0 4 9 1 7 7 0 9 4 4 9 4 [...]
+2 7 0 0 4 1 10 9 5 0 10 9 3 10 10 0 5 7 7 1 0 9 6 0 10 5 6 5 7 1 7 0 2 4 8 5 6 6 10 7 8 0 10 6 4 8 10 6 9 8 5 5 8 10 1 2 7 6 8 4 3 3 2 6 5 1 1 6 3 9 4 0 4 8 8 3 5 1 9 5 6 4 6 0 9 7 1 6 9 9 1 9 8 3 3 9 7 9 3 4 1 6 4 9 8 2 6 8 5 8 5 7 1 5 9 3 9 9 8 10 6 2 1 2 5 2 7 7 3 8 7 8 5 0 4 10 2 0 10 5 7 10 8 6 2 10 2 0 2 8 10 2 9 10 10 0 3 9 6 1 7 0 0 0 6 7 4 7 3 1 2 7 5 7 0 3 10 8 1 3 1 2 1 2 4 8 9 10 4 9 10 3 7 9 6 5 4 8 0 5 6 9 9 3 5 3 9 5 0 8 7 10 4 7 0 1 2 8 0 1 10 6 5 0 6 10 10 4 5 10 7 8 10  [...]
+8 9 5 1 4 9 8 10 8 5 0 9 9 0 3 5 4 7 4 1 2 0 5 0 5 7 0 7 3 6 10 4 1 0 7 8 4 6 9 5 9 8 9 8 3 2 1 3 5 2 10 8 2 2 10 8 1 9 4 8 2 6 6 6 4 3 9 9 4 7 1 2 2 2 9 0 6 0 3 8 10 10 0 2 2 6 9 6 6 10 5 7 2 9 4 8 4 10 3 8 7 6 6 4 5 8 6 1 2 1 0 0 5 5 10 1 6 4 6 10 6 7 8 2 2 0 3 4 1 9 0 0 6 1 1 4 0 7 8 10 3 0 10 7 7 0 4 8 7 10 6 5 7 9 9 2 2 6 2 5 7 3 9 1 4 4 2 4 1 3 2 7 5 4 5 1 6 0 9 9 5 10 6 4 8 10 6 1 3 10 8 9 8 2 3 5 9 3 0 10 10 5 5 10 7 3 5 1 7 5 1 8 9 7 8 6 5 1 4 9 0 7 6 7 6 3 5 1 1 3 9 5 10 5 0 3  [...]
+9 0 0 9 8 8 9 8 1 8 0 7 6 6 8 10 6 5 8 1 4 2 7 7 6 7 2 8 7 7 1 2 10 0 7 6 5 8 8 5 2 8 10 4 3 1 10 2 4 8 10 0 1 4 3 8 7 7 6 10 3 1 8 0 3 7 8 7 9 0 7 0 3 10 4 7 2 5 7 3 5 10 3 4 9 9 0 6 9 10 3 2 2 10 4 2 10 4 0 6 9 8 9 0 8 4 3 0 0 6 2 3 7 2 7 1 10 3 3 5 5 6 9 10 7 4 2 1 6 3 3 7 2 1 8 0 7 1 2 3 9 9 8 3 8 0 9 9 5 4 7 0 3 3 1 1 5 1 6 4 6 8 2 6 6 6 7 6 5 7 4 6 4 10 10 5 1 2 4 8 0 9 1 5 4 5 4 6 5 5 5 7 3 7 4 3 2 1 3 3 6 7 8 4 6 3 10 6 4 2 5 8 7 7 8 5 8 8 10 1 0 2 3 5 2 6 5 1 0 4 0 1 6 10 5 4 1  [...]
+3 6 5 7 5 9 3 9 6 7 9 4 9 9 7 10 5 3 8 5 8 5 6 0 1 9 4 9 7 0 1 10 9 4 9 4 5 8 5 7 2 7 10 3 4 9 0 6 5 7 6 10 3 9 1 4 9 4 9 3 2 10 9 8 4 0 5 3 0 7 8 7 4 7 6 4 1 5 3 0 5 2 10 10 1 6 7 3 10 1 4 4 5 3 5 2 6 9 10 9 7 10 6 0 10 3 4 1 2 6 7 0 4 3 5 3 10 3 2 6 10 9 2 8 6 1 4 6 2 7 9 0 0 9 10 7 4 4 1 4 5 3 10 4 9 10 5 6 7 6 6 1 7 6 5 1 7 7 2 5 8 2 9 2 0 3 8 7 6 3 1 8 5 6 3 10 3 3 3 10 2 7 1 7 0 10 2 6 5 7 8 7 7 7 0 0 7 0 6 7 8 9 8 0 9 2 4 2 1 10 3 3 8 6 9 3 5 3 7 10 2 2 8 7 0 1 5 0 3 6 5 4 5 1 6 1 [...]
+10 2 4 7 2 9 9 2 6 5 8 7 7 0 5 0 4 9 8 1 3 6 5 8 5 1 1 9 6 0 8 10 2 6 4 10 0 1 7 3 4 1 1 4 5 9 6 7 7 2 8 8 8 5 10 10 7 0 1 3 5 5 9 3 10 7 3 10 4 0 4 7 6 3 7 5 10 8 7 1 4 7 9 10 5 6 8 5 6 0 5 9 10 6 2 0 2 5 5 4 8 8 6 5 1 2 5 2 1 8 5 10 10 4 1 2 9 7 0 3 1 2 3 3 6 6 9 1 2 1 10 2 1 1 3 8 3 0 4 8 0 9 0 9 9 0 1 9 5 5 1 7 10 7 7 2 3 1 10 3 1 1 6 0 2 5 8 2 3 6 0 5 1 10 4 9 1 7 3 6 2 9 4 8 10 3 5 2 9 7 8 10 7 6 5 0 5 7 4 3 7 2 9 1 1 3 4 3 4 5 9 4 6 3 6 3 9 1 6 6 10 6 3 0 4 10 7 5 6 0 1 0 0 4 9 7  [...]
+8 9 3 0 1 3 7 9 9 4 7 8 1 7 10 1 0 3 6 7 5 6 9 6 1 7 5 9 5 7 8 5 1 8 9 6 3 7 0 1 2 1 2 5 0 6 8 7 9 6 8 8 6 1 10 5 0 5 10 8 8 9 8 3 3 0 0 10 3 1 6 8 5 5 0 1 0 4 3 10 1 7 2 8 3 1 5 2 5 2 4 6 3 2 2 3 3 10 5 9 0 3 3 2 1 2 0 1 4 1 4 3 1 7 9 3 9 7 7 3 7 9 6 4 2 10 8 6 3 1 8 9 4 4 7 3 5 3 3 4 5 9 8 10 5 9 10 0 8 4 7 1 7 7 0 4 4 7 5 0 3 3 0 5 9 8 0 5 3 3 1 3 9 3 3 4 9 0 4 9 0 5 8 3 2 5 0 2 0 2 7 2 4 8 3 7 8 3 6 9 6 2 10 2 7 0 1 8 0 9 3 4 2 2 8 6 8 5 5 3 5 1 1 6 6 10 4 0 1 5 7 10 1 2 4 9 2 4 9 2  [...]
+7 9 6 10 4 8 8 0 4 2 9 3 5 2 2 1 9 1 10 6 10 0 5 0 5 1 3 2 5 9 6 5 1 9 4 4 5 3 5 7 3 5 8 5 4 9 1 7 7 5 5 4 3 2 5 3 2 2 9 8 5 5 2 9 8 3 10 3 10 2 9 1 3 1 2 0 3 9 9 1 1 8 8 9 3 3 1 6 9 6 7 5 5 2 3 1 7 6 7 7 4 4 9 8 8 8 5 7 3 2 4 2 7 3 5 8 9 6 2 9 2 7 5 2 9 9 2 8 10 7 9 1 8 9 5 9 7 10 10 8 4 5 2 4 8 9 8 3 9 8 2 6 4 10 2 8 6 0 5 0 10 3 2 3 6 4 4 2 2 4 3 6 10 10 3 4 1 3 0 1 6 1 0 0 2 5 6 9 0 6 2 1 1 3 5 8 10 2 2 8 4 8 10 1 8 3 10 9 8 10 5 7 0 10 6 0 8 6 10 4 4 4 6 9 0 5 0 1 4 5 0 1 3 1 6 6 7  [...]
+7 7 5 7 6 9 3 2 8 3 6 10 9 0 3 10 4 0 7 10 8 7 1 1 2 5 2 5 9 1 10 4 7 2 0 4 1 7 1 4 7 10 2 4 3 8 4 8 8 0 4 6 8 4 8 8 6 3 6 9 0 1 0 10 4 3 6 3 7 8 1 6 0 7 6 6 1 10 5 3 2 4 5 3 2 10 5 10 1 2 7 3 4 7 4 9 1 6 10 7 10 2 8 7 2 6 6 4 4 9 8 7 3 9 6 6 2 7 6 6 5 4 3 7 5 5 9 5 9 1 5 2 5 9 1 8 2 2 1 9 9 5 8 6 10 9 0 8 9 5 2 8 2 2 9 4 10 0 0 0 3 1 9 10 2 2 3 4 2 3 7 8 2 6 9 8 2 2 10 7 1 6 7 10 4 8 9 0 0 5 4 1 10 6 9 4 2 9 6 6 5 3 8 8 5 7 1 4 4 3 6 7 7 4 3 2 9 1 9 3 6 2 7 2 4 5 7 4 8 7 9 6 3 3 6 2 4 8 [...]
+8 5 10 5 8 10 0 10 10 9 2 7 9 10 6 2 10 10 6 8 2 7 7 0 1 8 1 0 10 4 7 7 8 3 5 2 4 1 8 0 5 7 6 1 0 0 6 2 6 7 9 7 0 8 0 3 7 7 6 0 7 3 5 6 7 2 0 0 2 0 5 10 2 2 5 8 5 8 3 5 9 5 4 6 8 10 10 8 10 5 6 2 9 2 3 8 8 9 9 0 5 5 0 5 4 8 7 5 9 8 4 5 5 10 0 0 4 6 6 4 8 7 7 3 5 10 2 4 3 8 1 6 1 6 8 10 6 8 7 4 0 5 10 6 0 4 6 7 0 3 1 7 10 5 8 1 5 6 5 8 8 3 3 0 8 6 10 1 3 1 3 10 4 7 8 8 8 1 5 7 5 0 7 9 9 4 1 8 2 6 0 7 10 7 3 8 3 7 3 3 3 3 8 5 9 4 0 0 3 9 8 7 6 6 0 0 3 6 4 7 3 2 0 9 6 1 6 3 7 10 6 0 7 1 10  [...]
+5 1 5 6 3 3 0 8 4 9 9 3 5 7 7 9 8 10 3 10 8 3 5 5 8 0 4 8 6 6 8 8 5 6 6 0 2 10 2 3 7 2 4 6 10 0 6 2 1 10 2 8 1 6 5 8 1 1 7 0 10 7 6 10 9 1 1 4 8 2 1 2 4 9 4 0 8 7 1 9 0 3 10 3 1 7 5 4 6 5 0 0 8 7 5 7 0 2 5 7 8 7 6 4 1 3 2 3 9 2 8 3 7 5 9 9 6 9 9 8 6 1 7 9 1 3 4 5 8 0 0 10 6 4 8 1 9 10 0 0 7 3 6 1 8 0 8 10 3 4 1 8 4 0 3 7 4 2 1 6 6 1 6 3 9 7 0 1 0 6 9 10 9 5 9 1 9 7 4 5 1 2 7 6 9 8 5 2 4 5 4 4 4 10 5 10 0 0 10 10 9 2 0 4 4 5 3 2 5 9 6 3 3 6 9 10 10 1 3 4 4 7 10 10 7 4 3 3 0 5 0 1 7 7 8 7  [...]
+1 7 10 5 0 9 4 0 2 5 6 0 2 5 2 0 0 0 0 0 4 2 5 3 9 6 5 9 4 4 3 1 5 8 5 4 6 10 10 8 5 10 6 7 7 8 1 0 5 10 9 5 10 2 10 6 2 5 4 8 2 3 3 5 0 3 2 6 8 7 9 5 8 0 8 2 4 10 1 1 6 9 5 5 1 8 8 7 4 4 1 5 6 8 10 0 10 5 3 4 10 3 9 4 10 4 8 9 0 10 9 3 0 5 7 4 8 5 4 10 2 0 6 6 3 5 1 0 7 6 1 8 5 2 6 3 2 1 6 3 8 0 8 5 5 7 2 5 7 3 7 3 7 7 3 6 9 6 9 0 0 3 7 7 3 8 9 7 8 1 3 6 2 10 6 7 8 1 0 8 2 1 4 3 6 8 5 7 7 1 4 9 1 10 10 8 4 3 0 5 3 9 10 5 3 2 6 8 3 2 6 9 10 8 1 6 0 7 3 2 9 8 10 2 6 6 10 6 0 7 1 7 3 3 0 4 [...]
+2 1 5 6 9 10 1 0 3 9 7 0 8 2 3 1 7 6 1 3 6 8 4 5 5 0 3 8 6 2 9 2 4 10 3 8 0 4 5 10 4 1 7 9 0 6 10 0 1 4 6 8 10 1 9 2 8 5 5 3 7 1 10 7 10 10 5 5 3 1 4 8 1 10 9 10 2 8 7 4 5 8 0 9 7 7 10 3 1 6 9 2 3 6 10 1 1 0 5 8 0 8 4 4 9 0 8 1 5 10 9 10 1 7 3 6 3 4 0 1 4 10 2 3 7 4 2 10 6 7 5 4 10 4 1 5 3 10 10 2 4 7 6 5 7 0 0 7 4 10 2 0 9 2 3 0 2 0 5 9 0 3 10 0 1 6 0 9 6 8 4 7 10 9 5 10 5 5 3 0 6 6 0 2 4 8 1 0 0 3 7 7 6 7 1 3 10 5 2 0 2 5 6 10 9 1 4 9 8 9 7 1 7 2 1 0 0 2 2 1 6 1 5 5 4 7 6 6 8 4 10 5 6  [...]
+6 7 5 8 4 10 10 6 0 7 8 5 7 4 6 0 8 1 0 9 6 2 10 8 4 10 0 8 10 6 0 10 9 2 4 5 2 6 3 9 10 9 0 3 6 0 10 0 4 3 7 0 10 10 3 8 9 10 1 6 0 10 8 2 6 0 7 0 3 7 7 3 10 3 4 9 0 4 6 6 6 10 2 2 5 10 1 5 7 5 0 2 2 0 10 10 9 5 5 8 6 3 3 6 1 1 4 9 0 1 5 1 1 6 9 1 2 5 9 4 9 3 3 10 3 4 1 8 8 6 0 3 8 0 8 0 9 6 6 5 9 5 7 5 7 5 7 1 8 5 9 4 6 2 5 8 2 4 6 0 6 9 0 9 10 5 5 7 8 1 7 9 4 8 10 2 3 7 10 5 10 9 9 2 7 9 7 7 10 8 0 5 8 7 6 4 5 7 6 5 5 0 7 1 4 2 9 5 8 7 6 4 9 8 0 0 5 3 10 1 10 9 3 1 2 8 8 7 8 3 5 1 10  [...]
+8 10 10 8 5 8 2 10 2 5 3 0 0 0 4 2 9 5 9 2 8 6 8 8 3 0 6 0 3 3 4 0 1 10 5 3 8 10 5 10 4 1 3 3 0 7 2 8 6 5 10 1 0 4 8 4 3 9 10 4 8 10 0 1 4 4 6 9 1 4 2 5 5 4 0 4 7 10 4 0 3 9 9 8 10 7 4 8 2 8 0 7 3 6 8 8 10 3 8 4 4 6 7 0 5 2 2 8 7 9 0 4 6 0 9 5 6 0 10 0 7 4 1 9 9 8 0 3 8 7 1 2 4 2 0 6 4 3 2 4 3 0 9 3 10 5 10 5 9 6 5 9 9 8 4 6 5 3 2 1 3 4 9 4 3 6 3 3 3 1 10 4 6 3 3 2 7 1 9 9 9 10 1 3 3 4 8 6 3 6 2 5 7 2 5 10 5 3 9 9 1 9 10 0 4 8 5 9 0 2 1 2 3 6 8 9 10 8 9 10 3 3 4 6 3 7 4 1 8 8 3 10 9 7 2  [...]
+6 7 8 3 6 8 2 5 1 3 4 8 0 0 1 1 5 8 3 7 5 5 3 0 6 7 2 6 2 2 0 8 1 3 10 3 1 6 8 2 6 10 9 0 3 9 2 3 1 10 8 10 1 4 0 0 1 7 3 0 10 0 6 3 4 3 8 7 7 2 10 1 1 9 5 1 4 5 8 2 0 7 10 7 9 8 1 0 0 1 4 7 0 8 6 3 6 1 4 8 5 2 4 8 6 3 10 8 1 8 6 6 1 6 4 1 3 6 8 7 7 2 4 6 2 4 8 10 8 5 4 8 0 3 2 2 2 0 8 2 6 2 3 8 9 5 0 7 4 3 6 10 5 5 0 5 7 4 5 7 6 0 4 8 10 4 10 10 2 5 1 0 8 10 9 10 2 7 8 5 2 10 8 8 9 2 6 3 4 4 6 3 2 1 4 4 1 8 1 5 8 4 7 9 10 2 2 7 7 6 7 1 5 9 5 5 6 2 10 8 9 10 6 10 0 10 1 3 8 0 1 0 4 1 3 8 [...]
+0 8 10 10 2 7 6 9 5 0 3 6 4 2 0 10 4 7 0 2 3 10 1 8 5 7 2 4 0 7 5 7 6 0 9 4 9 9 1 3 3 6 9 4 8 4 1 9 4 5 1 4 8 0 1 9 4 2 1 2 6 2 0 8 1 4 4 9 3 2 4 9 7 2 6 7 9 5 7 8 10 7 0 5 1 3 7 4 4 6 0 7 9 8 7 5 1 5 1 9 10 3 10 3 7 4 0 2 9 7 8 0 8 7 7 4 8 1 1 4 5 7 5 8 9 6 5 5 4 3 9 6 5 6 1 4 8 4 6 9 6 10 8 5 3 0 7 6 3 5 9 9 5 1 7 10 8 4 10 5 0 0 4 10 10 1 0 1 10 3 7 4 2 10 6 5 1 10 6 7 10 2 0 4 0 3 2 2 1 1 7 6 5 8 4 2 1 3 8 7 10 3 9 8 8 6 1 0 0 5 4 8 1 3 10 0 10 10 4 8 10 4 6 2 5 5 0 7 6 2 7 5 4 10 6  [...]
+3 10 5 6 0 3 7 10 8 4 2 7 7 5 0 1 6 5 5 5 10 1 7 9 7 7 6 9 7 10 5 10 1 7 9 8 6 5 5 2 5 9 9 10 7 0 7 5 4 6 1 1 3 2 3 2 10 1 6 2 6 1 3 7 9 5 10 10 6 3 6 0 7 8 0 2 0 6 2 7 6 10 8 9 3 5 3 8 8 1 9 4 1 0 7 7 5 8 7 4 10 9 0 8 4 8 1 10 2 8 0 2 2 5 1 5 0 8 3 7 7 1 3 10 4 8 8 3 5 10 2 8 0 10 10 7 6 1 4 5 4 9 3 1 2 0 9 1 5 7 1 6 4 6 8 0 8 4 3 2 1 3 0 1 9 5 6 0 9 9 2 5 4 0 4 3 6 5 9 10 0 8 8 1 2 8 6 8 6 2 7 7 3 2 0 9 2 4 10 3 3 0 0 6 7 3 10 7 6 4 2 8 2 7 1 10 7 5 6 7 7 0 5 2 2 8 9 6 10 9 7 1 0 2 7 6 [...]
+0 6 6 6 5 6 6 10 5 2 6 9 6 3 0 2 8 10 0 4 3 0 4 2 5 1 4 7 2 2 1 6 9 10 6 5 8 6 9 10 6 2 8 6 3 5 10 7 7 3 8 10 1 4 9 0 10 4 2 2 1 5 9 1 10 3 7 0 7 6 9 10 7 4 0 8 6 8 10 5 0 3 3 3 0 7 10 8 8 3 10 8 4 10 10 5 9 3 1 4 6 5 4 7 3 4 7 4 0 7 4 6 3 6 6 7 9 7 3 1 2 8 8 7 3 4 3 0 9 10 1 4 4 2 0 4 8 4 8 9 9 0 10 2 2 0 3 5 6 10 10 7 2 1 1 7 5 1 2 0 3 7 5 2 10 9 4 9 1 2 1 0 3 2 0 2 2 0 8 2 9 0 9 3 10 0 2 5 3 7 6 9 8 1 10 3 8 2 10 9 1 0 8 3 3 4 6 0 10 0 8 3 0 0 1 0 9 2 8 0 9 5 5 2 5 7 9 5 2 1 3 9 9 8 8 [...]
+7 2 3 0 10 4 3 4 5 5 2 10 8 2 9 9 5 0 1 9 7 10 9 10 2 6 5 1 7 5 0 1 0 4 1 9 5 10 5 2 1 9 2 1 3 5 2 1 5 9 0 5 3 6 0 7 6 5 1 9 6 1 1 4 8 8 3 2 7 4 2 10 5 6 4 5 10 0 8 4 4 5 3 8 3 1 8 1 5 9 10 3 8 8 4 2 8 0 8 10 2 2 8 8 10 5 3 5 10 2 4 2 5 10 1 4 3 0 4 6 0 2 2 7 0 3 7 8 9 6 3 2 1 3 1 6 9 1 5 3 1 8 10 2 6 4 4 1 5 3 9 4 3 0 3 2 1 2 9 10 0 0 1 1 4 7 6 3 8 4 5 5 0 3 8 5 10 10 1 5 8 10 9 7 1 8 10 1 5 0 0 6 4 4 0 4 3 9 2 1 2 9 3 2 5 3 2 2 2 2 7 10 6 8 9 9 8 6 5 3 10 9 8 4 10 5 0 2 8 10 3 2 7 4 10 [...]
+1 3 2 2 8 6 9 0 3 3 9 6 10 2 2 4 5 10 4 1 1 10 4 0 1 9 7 2 6 1 2 0 3 6 0 7 7 4 9 9 3 5 6 0 10 8 0 5 7 7 7 0 3 1 1 7 5 9 5 7 0 1 10 2 10 0 9 5 4 10 6 3 4 8 7 1 0 0 2 2 6 0 10 5 1 4 3 9 2 7 6 8 5 4 8 0 9 1 9 3 1 9 6 0 6 10 9 10 7 4 10 5 5 6 6 9 3 8 0 9 4 4 6 4 6 2 7 0 1 9 3 3 9 5 8 9 4 7 2 6 0 10 6 7 3 2 9 4 10 8 2 3 0 8 8 0 4 8 10 0 10 0 7 3 8 3 9 0 1 10 10 1 8 9 2 8 0 8 3 9 5 7 5 8 8 6 5 7 7 10 7 1 4 4 10 4 8 1 0 0 6 7 7 8 7 8 8 4 7 4 2 3 9 9 7 5 3 10 0 3 8 9 9 9 2 9 1 0 2 7 3 3 9 0 8 4  [...]
+1 7 8 8 8 9 5 6 7 3 0 7 0 4 6 9 6 8 9 3 2 8 8 3 2 2 7 8 7 0 9 5 4 10 1 3 3 10 10 5 10 1 8 10 10 5 6 1 3 8 0 3 7 10 7 6 10 1 5 5 5 0 1 5 10 3 4 0 1 3 1 3 6 8 7 3 1 5 1 9 1 0 1 4 9 5 9 10 7 3 2 4 9 9 0 2 3 10 9 5 0 2 0 8 3 4 3 5 10 8 8 4 8 2 10 5 4 1 4 6 10 10 10 3 8 10 6 5 10 4 2 1 8 10 3 4 1 1 3 1 7 6 1 7 8 8 6 10 2 8 3 0 4 3 10 5 10 0 2 4 7 0 9 9 9 5 7 2 0 8 9 4 0 5 6 1 8 2 8 4 4 1 3 4 1 2 1 8 0 0 5 2 9 2 8 7 0 7 7 8 7 1 1 4 7 0 2 5 0 3 10 7 1 7 8 2 8 6 6 1 10 5 6 1 3 6 3 0 6 0 4 8 4 0  [...]
+3 4 1 10 4 1 5 1 4 6 9 9 6 0 4 2 10 8 10 2 9 7 0 3 5 0 2 0 8 2 6 4 3 10 9 9 1 6 3 9 1 9 0 1 4 2 10 4 6 10 6 7 2 10 4 4 5 7 8 4 8 4 4 9 2 8 0 10 8 1 1 5 9 5 3 6 3 2 9 7 8 8 1 7 1 5 6 1 7 0 6 10 2 9 7 9 10 10 9 1 6 7 5 4 10 2 9 3 6 6 4 9 3 0 4 5 7 7 4 5 5 9 9 8 8 9 4 7 1 4 0 0 6 2 3 6 7 9 7 9 9 6 2 8 9 7 8 5 2 5 7 4 7 9 10 4 3 5 7 9 4 5 8 4 0 2 7 1 1 10 5 3 4 1 9 8 6 9 4 5 2 7 0 5 9 10 6 6 2 6 1 8 3 5 7 1 9 4 4 3 1 5 9 7 2 3 0 7 10 6 4 7 2 9 4 7 1 1 0 7 9 6 2 9 0 3 9 1 0 5 2 0 4 3 8 7 7 7  [...]
+8 2 6 0 3 8 2 6 8 9 0 2 5 1 2 8 6 0 10 10 1 2 7 5 3 3 8 8 2 8 3 0 4 2 5 6 8 9 7 4 6 9 2 6 7 1 5 7 7 10 3 8 10 1 10 3 0 10 2 1 9 4 9 4 2 10 10 10 3 8 4 0 6 1 7 3 10 5 3 6 1 6 9 10 6 7 9 6 3 3 2 10 4 1 2 6 7 9 8 8 2 5 0 4 0 10 0 6 0 5 7 0 4 6 3 10 1 10 6 9 7 5 2 4 2 10 9 2 2 2 2 5 1 10 5 3 2 1 9 8 2 1 7 8 0 6 6 7 1 10 6 10 4 2 6 7 4 8 2 8 9 0 6 1 5 7 0 10 4 7 7 4 7 7 8 5 10 4 2 7 0 4 7 4 4 7 3 9 10 9 3 3 5 7 7 3 8 5 0 3 8 8 7 4 9 3 8 0 0 10 5 2 6 5 0 8 8 5 8 10 5 9 2 6 7 8 3 10 5 6 1 6 3 2 [...]
+10 3 8 2 4 0 0 8 1 8 3 0 0 2 4 8 0 10 1 10 10 3 3 10 1 6 9 3 7 7 8 7 1 2 9 2 5 2 3 3 6 2 2 2 10 2 2 3 2 2 9 6 9 5 4 4 3 7 3 6 7 4 1 9 8 6 6 10 1 7 5 1 4 7 1 10 9 5 7 1 7 7 10 10 8 9 6 1 8 4 10 2 0 2 1 7 3 6 10 10 0 10 10 8 1 0 4 3 6 9 4 2 0 5 5 1 0 3 2 4 5 5 7 8 5 3 4 0 10 2 1 1 5 2 5 8 9 2 3 4 7 7 7 0 0 7 6 7 2 6 9 4 10 10 7 1 7 9 5 7 3 5 0 9 1 5 3 5 4 9 5 8 4 1 6 1 0 7 10 2 4 7 5 6 1 3 8 8 6 7 8 5 2 2 10 2 5 0 0 8 5 0 8 10 1 9 8 2 4 5 2 3 9 3 2 1 3 1 4 8 1 5 1 5 6 6 7 6 1 7 1 10 0 5 8  [...]
+5 6 9 0 10 8 0 9 6 8 9 6 7 6 10 3 9 10 6 0 7 0 9 1 10 4 4 4 10 7 4 5 5 3 2 10 0 8 4 5 8 9 1 3 4 5 4 3 4 5 3 10 2 2 8 6 6 7 1 5 3 8 9 10 1 2 2 1 2 10 10 2 4 6 1 8 9 8 10 6 8 6 8 10 4 9 6 8 1 8 6 2 9 5 9 1 5 0 9 9 1 10 0 0 0 1 10 2 8 7 4 10 8 5 8 0 10 2 7 9 1 4 1 4 6 3 8 3 10 1 1 10 3 1 7 5 5 2 7 0 2 3 9 6 3 6 9 5 6 10 10 5 8 3 3 6 10 0 10 8 10 5 2 3 8 1 8 7 4 7 8 8 10 4 4 8 3 1 10 10 0 0 8 6 5 5 2 4 3 3 1 7 7 6 3 2 0 2 9 5 8 3 5 3 1 10 5 8 6 1 3 8 0 5 2 10 10 9 8 4 10 5 8 10 4 7 5 4 7 2 0 [...]
+2 9 5 10 0 0 8 0 6 8 7 4 8 5 9 4 6 6 10 5 8 4 2 1 0 0 8 10 4 1 8 3 3 1 1 9 3 4 10 10 10 8 0 1 0 7 9 7 7 7 0 10 0 6 0 1 5 1 8 10 3 6 10 3 6 4 4 7 6 2 10 3 6 2 7 7 4 0 4 8 1 6 9 0 10 3 2 5 3 10 3 9 8 6 6 5 3 7 9 2 1 8 9 5 8 3 5 7 2 10 6 4 2 2 4 9 4 5 0 3 5 9 10 2 10 6 1 8 0 2 5 9 7 0 7 6 3 10 2 7 6 1 10 7 0 4 9 5 9 0 6 10 4 0 10 10 8 0 3 4 5 3 7 0 6 10 5 0 2 7 7 9 5 1 5 10 6 3 10 4 9 2 0 0 8 10 5 3 0 3 3 3 1 6 1 5 3 6 5 2 8 4 8 5 6 9 6 6 5 8 4 0 6 5 9 10 3 7 6 8 7 10 2 8 1 0 1 9 3 10 4 9 1 [...]
+8 10 10 3 4 2 5 2 2 7 3 4 4 0 10 5 1 6 4 1 1 6 9 7 2 6 7 3 6 10 7 6 0 6 2 4 2 5 5 4 1 3 3 10 4 5 9 6 2 1 4 9 6 9 3 4 1 2 5 9 9 7 5 1 8 7 1 1 2 6 3 8 4 7 10 8 9 6 3 0 0 9 10 6 8 3 6 2 10 5 9 4 4 5 5 0 2 0 10 2 5 7 1 2 8 1 0 0 5 0 10 9 4 9 4 6 6 1 2 2 5 8 3 2 2 1 1 8 5 8 0 5 7 3 3 8 1 6 0 8 10 6 7 4 8 3 1 3 1 5 8 9 0 2 6 3 3 5 4 2 10 0 6 3 8 5 4 10 7 3 6 1 5 3 10 10 10 5 0 8 1 2 1 0 5 0 1 7 10 1 10 0 1 1 3 4 7 10 4 8 3 10 6 0 6 3 8 0 7 8 9 2 1 10 8 9 3 9 10 0 0 7 1 9 7 5 6 9 5 3 3 8 9 10 1 [...]
+1 3 10 10 10 1 3 0 10 8 9 10 2 3 1 8 10 9 1 0 8 10 6 1 1 5 7 0 6 7 3 0 0 5 9 10 1 5 1 8 4 2 3 8 10 0 3 10 9 0 8 6 1 7 6 3 0 6 2 6 5 1 2 7 9 1 5 6 1 1 9 4 3 5 4 7 3 7 9 8 8 9 8 4 9 4 7 4 3 1 5 5 6 2 4 6 4 10 2 0 9 8 6 10 8 3 2 3 5 2 6 5 8 1 3 3 7 0 7 2 8 4 9 5 1 2 5 2 1 9 7 9 4 0 6 8 0 1 10 9 0 4 4 6 9 4 1 8 10 7 8 1 2 4 3 10 1 6 9 2 6 0 5 1 7 9 4 4 8 10 3 2 3 5 10 5 8 5 0 8 7 8 6 10 2 2 5 3 3 1 9 2 0 4 10 1 5 1 10 7 1 9 2 0 4 7 0 8 9 5 6 7 5 1 0 9 4 1 9 4 8 4 6 7 5 6 2 5 9 1 9 7 4 9 10 1 [...]
+4 6 4 5 8 9 2 4 10 10 1 6 6 8 8 4 6 3 7 2 5 0 8 10 4 3 10 3 3 1 10 4 9 6 9 8 6 9 5 7 0 4 3 4 5 4 4 10 1 1 1 10 5 4 4 0 9 9 4 10 7 8 8 2 7 1 1 0 1 1 5 7 10 1 10 2 4 5 3 10 3 6 2 8 8 4 8 10 9 1 3 6 6 5 2 9 7 9 9 1 6 3 6 4 5 3 6 0 10 0 7 3 8 8 8 0 2 0 9 7 2 8 1 9 0 9 7 10 7 7 4 10 3 0 5 2 0 10 10 6 3 8 3 6 7 1 3 5 5 7 8 9 7 10 4 7 2 1 6 5 8 6 10 1 10 2 9 1 3 9 0 4 5 2 1 7 6 4 5 3 10 9 7 9 1 10 2 1 5 9 0 7 0 1 0 10 4 7 4 10 6 7 3 0 8 7 9 7 1 2 9 6 3 8 9 9 2 5 0 1 3 3 1 8 8 9 9 5 8 3 5 6 2 9  [...]
+9 3 3 1 3 2 10 0 9 6 0 10 3 1 7 2 10 10 1 7 5 2 7 4 3 8 7 10 1 8 9 4 4 6 8 4 6 7 5 4 5 10 2 2 9 8 5 6 7 9 5 6 6 8 2 2 0 4 0 0 8 2 7 7 10 4 1 6 4 7 3 1 9 2 6 9 6 9 7 2 2 9 5 3 0 5 1 3 8 9 10 9 0 9 6 2 5 5 1 8 8 5 4 1 0 7 3 7 6 9 8 3 7 10 10 2 3 9 9 4 9 8 2 9 7 4 8 8 8 5 10 1 10 4 10 10 8 10 9 2 7 7 0 0 0 9 5 5 4 2 10 10 8 8 8 2 1 3 5 9 4 6 4 3 2 6 1 0 5 8 7 4 2 3 6 4 7 0 8 3 2 6 4 5 3 8 8 10 0 6 9 9 5 9 0 2 3 4 8 3 1 0 3 6 0 4 9 3 0 1 8 0 5 2 8 10 5 3 5 6 8 2 9 10 3 1 1 8 8 3 4 7 1 5 2 10 [...]
+9 9 0 2 7 2 9 5 5 8 9 1 4 9 6 1 2 9 1 4 1 4 2 6 3 10 6 7 7 4 1 2 0 2 6 7 8 3 8 7 8 8 7 0 10 5 6 3 5 8 5 5 3 1 10 4 5 7 5 10 1 3 3 1 3 5 0 8 1 5 4 2 3 9 1 6 7 8 8 4 5 5 8 0 4 5 7 3 2 2 0 0 1 1 6 3 0 0 5 9 3 10 3 10 10 7 0 1 8 2 0 9 6 2 1 4 3 4 1 2 0 2 6 9 8 10 5 0 0 8 7 9 3 4 8 6 6 1 0 3 9 5 6 2 4 2 10 8 8 10 2 6 4 9 9 2 10 0 2 0 5 5 7 5 5 8 9 3 5 0 4 10 6 10 5 9 7 3 6 9 6 0 2 6 1 7 1 7 4 6 9 1 0 1 7 3 0 5 5 1 1 7 9 5 0 7 6 4 2 3 7 9 2 0 9 8 2 1 0 2 7 9 4 7 2 9 9 6 9 5 1 1 4 8 6 7 2 0 3 4 [...]
+3 7 2 8 3 10 5 3 7 3 10 6 2 6 8 9 7 5 1 10 1 0 0 7 3 2 10 0 4 8 2 8 4 0 6 8 6 4 10 2 0 8 2 5 1 7 2 3 7 0 3 7 4 0 5 6 10 9 10 0 3 2 1 8 8 5 5 5 6 4 10 7 1 6 2 1 5 9 4 2 8 6 0 5 8 10 6 0 1 3 3 10 4 8 3 2 10 8 3 3 8 0 0 8 6 0 9 0 4 4 9 8 10 10 8 0 1 0 4 1 9 9 6 4 6 0 6 3 1 5 2 8 5 10 0 10 9 4 2 6 1 9 2 5 8 4 10 7 3 2 1 5 4 3 9 5 3 7 5 0 6 0 9 8 9 10 3 0 7 7 4 3 8 6 2 0 1 6 10 5 1 4 6 0 3 9 0 10 0 9 6 9 4 1 0 10 7 10 9 4 7 9 5 10 6 10 4 9 2 0 9 8 5 0 1 1 10 9 8 0 6 4 6 10 0 1 9 7 8 3 3 8 8 1 [...]
+10 6 4 9 6 6 6 0 8 2 1 10 10 5 3 10 9 0 9 9 6 0 6 9 3 9 2 4 1 8 6 0 9 0 3 4 9 5 10 7 6 7 10 7 2 2 2 2 8 7 10 4 6 9 4 9 2 3 2 6 7 1 6 0 4 4 0 1 8 10 1 0 4 4 9 2 8 4 1 10 3 0 8 4 5 7 3 9 0 7 3 9 0 5 5 3 6 4 6 1 4 0 2 6 1 7 1 8 2 0 8 1 7 10 10 5 10 8 7 0 6 2 10 4 9 1 8 3 3 0 3 3 1 3 2 10 0 2 10 4 1 2 5 4 7 8 3 1 8 7 1 1 9 6 5 1 5 5 9 8 5 0 0 2 10 0 4 3 2 0 7 1 9 0 9 8 0 2 4 4 1 3 1 2 3 7 10 0 1 6 7 6 3 5 9 3 1 5 0 5 10 6 8 5 4 6 6 3 10 0 2 3 6 0 3 2 8 10 7 0 4 7 7 7 9 10 0 5 8 0 0 9 9 1 6 1 [...]
+6 2 0 9 8 9 1 6 0 7 3 5 4 7 8 6 3 9 9 6 6 8 6 1 7 3 8 8 1 1 10 9 6 10 7 4 2 2 9 8 4 2 10 7 5 0 3 10 7 10 0 7 0 7 4 9 10 1 6 2 4 8 6 0 7 1 10 10 1 8 10 5 3 2 10 7 5 0 7 9 9 5 1 2 10 7 3 2 0 9 10 6 8 0 3 3 10 9 8 1 8 5 4 9 9 9 6 1 0 7 6 1 10 3 10 3 6 6 8 9 8 0 6 0 0 5 2 6 9 3 2 9 2 10 4 5 2 4 9 3 4 7 8 8 0 9 4 7 9 0 2 8 10 7 4 7 4 8 7 3 8 2 9 5 0 5 8 9 8 9 9 6 5 0 6 6 8 7 1 1 8 6 10 1 0 5 3 9 5 8 8 10 7 5 6 5 8 8 4 9 1 5 0 9 4 0 3 2 10 6 2 3 0 7 8 6 1 9 3 4 0 1 1 9 1 2 1 1 8 1 10 4 2 3 9 8 [...]
+8 3 7 0 7 3 6 10 1 9 7 1 9 9 10 2 7 5 1 2 0 5 10 4 2 0 10 4 0 2 4 6 10 10 3 7 9 10 5 0 9 9 7 1 6 10 2 3 6 5 4 4 6 5 8 10 7 9 3 8 6 3 1 1 0 8 8 9 4 6 9 0 1 9 10 5 4 1 7 8 1 7 3 10 4 10 2 8 7 3 7 8 3 9 0 2 10 3 6 2 6 9 1 2 2 6 4 8 6 8 1 5 4 9 8 4 1 10 6 10 5 5 1 2 9 1 2 5 8 2 6 1 0 5 9 4 1 0 6 7 4 10 9 5 9 3 8 7 9 2 5 3 10 1 8 6 1 8 1 2 9 4 2 8 2 8 9 0 2 8 4 7 10 0 5 9 1 8 6 1 4 9 0 9 3 0 9 5 2 1 10 8 3 6 5 4 2 2 5 3 4 6 3 4 4 1 8 1 5 9 2 2 10 2 7 10 1 6 9 8 3 0 8 1 9 10 6 6 10 9 8 2 9 5 0 [...]
+7 4 4 10 8 3 2 6 6 1 9 3 1 3 0 8 7 9 1 9 5 6 7 5 1 1 7 10 6 3 0 10 10 4 10 5 3 7 5 2 0 4 6 10 1 6 2 4 1 1 4 8 9 0 6 0 1 10 6 6 3 3 5 7 7 3 5 9 2 6 8 5 7 0 7 1 7 5 3 2 5 2 2 6 6 3 1 0 0 9 4 2 2 6 4 0 7 4 9 2 4 6 10 8 10 9 7 5 5 2 5 7 2 8 10 10 1 4 10 0 9 2 1 2 4 0 4 2 0 7 7 10 1 1 5 2 9 6 6 9 8 7 9 0 8 4 9 10 0 7 1 5 8 8 10 6 5 1 9 7 6 10 8 4 3 1 6 0 9 5 7 0 9 4 5 5 8 8 9 7 9 9 4 7 10 0 1 9 4 0 8 5 9 9 6 2 0 2 8 6 8 5 0 1 9 0 5 0 4 5 7 2 5 10 5 1 8 6 5 4 5 5 3 5 2 5 10 8 4 5 2 0 2 10 9 4  [...]
+4 6 8 0 2 1 2 1 10 4 9 8 9 3 2 4 0 9 1 7 3 4 3 10 4 8 7 10 6 2 0 9 2 6 2 10 7 0 7 0 1 0 8 10 6 7 8 5 2 3 4 7 7 0 3 9 4 1 4 6 2 8 1 6 2 9 4 5 7 2 2 1 3 7 5 9 6 3 2 6 3 10 6 6 1 7 0 5 6 10 1 0 5 0 3 3 6 9 0 5 3 2 0 9 9 2 10 6 1 6 5 10 4 4 2 10 3 8 5 8 2 6 2 6 4 8 2 8 10 6 1 6 1 4 1 2 0 4 8 8 1 8 2 5 3 4 0 4 5 5 5 2 4 8 6 7 8 4 10 9 2 1 1 9 10 1 9 1 1 0 7 10 3 8 0 5 9 2 10 6 5 5 1 2 4 4 6 5 5 8 8 3 4 6 8 2 1 8 8 7 9 10 0 4 8 4 0 9 0 1 8 1 10 7 3 0 1 1 1 0 4 9 7 3 1 5 9 1 4 6 4 4 10 8 2 8 3  [...]
+2 6 6 4 10 8 2 3 4 6 4 4 3 2 1 5 2 9 10 0 4 3 10 7 8 1 8 6 3 4 10 7 5 6 0 7 7 7 10 5 6 1 1 2 1 1 7 8 5 1 10 6 3 2 9 10 2 2 4 5 10 7 0 9 5 6 5 0 1 1 6 2 2 9 5 5 2 7 4 9 3 9 5 8 1 7 2 2 1 6 9 2 5 8 2 9 6 1 4 3 7 4 8 8 5 6 6 9 8 4 9 2 3 2 0 8 9 5 1 3 2 1 6 6 2 8 10 4 0 1 4 6 10 5 5 0 7 10 7 0 7 7 3 5 6 6 2 2 6 4 3 3 3 9 4 0 2 5 8 8 1 4 3 0 9 2 10 6 1 8 9 7 0 10 10 9 8 5 5 1 10 0 6 5 0 4 4 4 2 2 7 1 9 0 3 10 0 3 8 3 4 0 4 0 1 7 0 9 1 4 9 7 4 3 8 5 5 1 3 1 9 4 0 6 8 6 8 4 6 3 4 8 8 10 1 9 10  [...]
+3 8 1 4 3 8 5 2 2 5 9 8 7 6 3 4 1 9 2 6 0 4 3 6 2 2 6 8 9 3 1 4 8 1 4 6 9 8 7 4 9 4 4 6 8 4 4 10 2 4 6 8 3 3 0 6 7 9 7 0 2 2 4 3 8 3 7 0 0 5 9 1 7 10 10 2 2 10 1 3 5 8 0 8 5 3 5 7 5 2 4 5 0 10 10 3 4 4 1 3 8 0 10 0 8 0 6 7 8 6 9 10 0 2 7 3 7 4 4 4 7 7 9 4 9 0 10 6 2 5 9 6 9 6 9 9 9 0 7 10 1 3 4 1 3 5 6 8 5 0 8 7 8 0 1 10 0 2 5 4 2 3 9 8 4 9 5 1 5 10 1 4 7 6 9 9 2 7 7 2 0 9 10 9 5 6 9 3 4 4 8 8 0 0 7 10 9 5 2 6 5 4 9 6 2 1 9 7 5 9 0 2 5 10 2 10 4 9 4 7 10 4 8 7 5 6 9 7 7 4 9 6 1 9 4 1 9 0 [...]
+0 8 3 0 8 1 5 7 2 7 8 4 7 0 10 5 10 7 5 7 0 5 10 8 6 5 7 4 8 9 4 3 8 10 4 0 9 3 1 5 2 8 9 3 0 6 4 0 7 10 6 7 6 4 4 1 6 0 5 9 9 6 6 2 4 5 5 7 6 2 4 6 6 0 9 2 0 3 0 9 0 9 4 1 1 7 1 8 5 9 2 6 5 5 9 4 2 10 7 1 8 3 6 10 0 5 10 4 10 1 5 2 8 3 4 10 5 1 1 3 9 6 3 7 4 9 9 4 7 4 0 0 4 3 9 10 8 7 4 6 0 3 5 5 6 8 4 10 6 3 1 8 2 3 2 10 0 0 10 2 5 3 8 4 4 6 9 5 2 4 10 7 5 4 0 6 0 9 10 4 7 5 1 2 2 1 3 1 10 1 4 7 3 10 0 4 9 8 8 9 2 0 1 1 0 3 5 9 9 6 5 6 5 7 3 2 10 8 2 5 10 5 7 4 7 9 6 9 6 10 6 3 3 4 0 0 [...]
+1 9 10 6 7 7 3 1 1 3 1 2 7 6 2 3 3 3 8 2 9 8 6 9 4 5 6 5 7 9 0 0 9 9 4 3 2 2 3 6 9 6 3 6 4 6 0 5 1 1 1 8 9 3 1 2 8 8 8 3 6 7 1 8 3 2 7 10 2 1 1 10 9 7 3 10 4 8 5 4 6 0 3 6 0 9 10 5 8 0 7 4 3 4 8 0 10 1 2 4 1 3 10 10 9 1 1 3 7 4 0 9 5 3 4 9 6 1 5 8 6 4 10 10 2 5 1 5 9 9 1 3 1 2 2 9 3 1 7 9 1 10 9 8 0 10 7 7 8 1 3 7 9 6 4 7 6 3 0 2 9 2 10 4 0 8 5 7 2 0 4 1 9 4 2 0 9 3 5 2 5 8 5 4 6 5 10 4 5 1 9 6 2 0 9 8 6 2 9 5 9 10 6 3 1 6 8 6 9 3 8 0 9 0 6 4 9 7 3 7 10 0 0 3 0 1 7 1 5 7 1 10 3 7 8 10 4  [...]
+1 4 3 0 3 9 3 10 1 10 8 5 1 3 1 0 9 6 5 9 4 3 2 5 7 3 6 0 10 6 2 10 1 7 5 3 9 0 5 10 9 8 7 1 4 6 2 9 5 9 3 4 0 1 7 8 10 6 0 4 1 1 10 0 10 7 8 10 10 3 3 7 6 5 0 4 5 7 9 7 0 7 0 7 4 1 0 0 0 4 3 1 2 8 7 5 7 8 5 5 5 8 4 9 4 0 3 7 0 10 10 5 2 1 8 3 7 2 8 1 5 6 7 1 6 2 9 7 9 9 3 4 10 3 8 6 1 3 3 1 1 3 1 9 1 7 2 9 2 6 0 0 1 8 9 3 8 4 10 2 0 4 7 1 7 5 5 10 7 10 10 8 1 3 8 0 0 5 1 0 1 9 7 1 2 1 5 8 0 3 5 7 3 3 4 7 1 0 0 6 7 10 5 6 8 3 6 4 5 6 3 5 10 9 2 0 7 3 7 6 8 9 10 8 2 4 3 8 1 7 5 9 1 7 10 6 [...]
+4 6 8 5 6 1 3 7 4 7 7 5 1 8 10 1 5 9 4 1 2 4 0 1 7 5 6 7 0 7 2 1 3 4 2 2 5 7 5 9 1 8 7 10 10 5 1 6 3 0 8 8 5 7 3 5 8 4 3 1 0 9 8 8 4 4 2 8 4 10 5 4 2 9 8 4 1 3 1 10 4 1 10 8 0 0 0 9 3 5 8 2 2 4 0 8 1 6 2 5 10 3 8 2 1 6 0 3 2 5 0 5 7 1 2 2 0 3 7 8 3 4 2 6 4 9 0 6 8 5 5 3 1 3 7 5 5 3 9 9 6 5 8 8 7 9 3 0 6 4 7 0 1 6 5 0 5 5 9 1 4 9 4 2 0 6 6 5 6 8 3 7 9 0 9 5 4 5 0 2 9 6 8 1 3 4 9 8 6 7 0 1 8 10 0 4 7 0 3 8 1 9 0 5 2 8 4 1 5 3 10 0 10 6 5 2 7 7 4 5 4 6 4 5 6 9 2 8 7 4 9 0 4 2 10 0 9 8 3 5 0 [...]
+2 7 2 6 2 5 9 8 1 10 8 6 2 3 6 10 6 0 2 9 8 4 2 10 1 4 3 8 7 1 8 2 0 6 5 9 2 10 0 6 8 10 8 0 3 1 5 3 2 6 8 3 1 7 6 7 4 7 9 4 10 9 3 10 10 6 6 4 5 10 1 7 6 3 6 8 2 10 1 7 4 0 7 9 4 7 0 4 1 1 4 1 10 10 9 0 6 9 6 2 7 7 9 1 3 1 6 0 4 1 7 1 4 0 7 0 3 5 4 5 1 5 4 1 1 10 9 0 5 0 4 0 1 3 3 3 7 2 10 6 5 1 4 5 0 1 7 6 10 1 10 6 8 1 0 4 1 9 4 7 0 2 0 6 2 1 1 4 5 8 4 3 8 1 0 8 9 9 1 9 5 4 9 0 4 4 2 8 7 2 7 3 3 7 9 1 5 5 3 3 7 2 3 10 6 0 1 5 3 1 3 9 9 2 2 10 9 1 6 4 9 1 4 0 8 0 9 9 10 8 9 2 0 3 5 2 5 [...]
+8 9 7 4 1 5 1 5 0 9 1 9 10 3 5 8 5 9 2 0 6 7 0 8 1 3 8 4 5 3 0 9 8 2 7 9 10 3 8 7 5 7 6 1 6 7 7 6 0 9 9 3 8 2 9 5 2 6 5 4 2 10 2 1 2 3 3 2 10 5 5 10 8 0 1 9 5 8 7 5 3 8 8 0 4 4 4 4 2 5 4 3 0 9 2 8 5 7 8 2 6 10 1 10 8 4 9 6 0 2 3 10 4 6 1 10 3 2 9 0 8 7 1 5 1 0 1 3 5 7 10 1 6 2 7 4 3 10 1 8 10 10 1 6 1 5 8 5 0 10 5 1 3 8 4 10 0 1 5 0 6 3 6 8 5 3 2 10 3 5 0 10 6 5 1 8 0 2 9 8 2 6 3 2 9 2 8 3 4 9 8 10 1 9 8 7 8 2 4 2 5 9 5 3 6 5 10 4 8 2 2 10 1 2 7 4 7 3 0 5 3 0 8 8 10 1 5 2 10 0 10 1 8 5 1 [...]
+7 0 4 1 2 1 4 9 6 8 10 2 10 4 5 5 10 2 4 4 8 3 0 0 2 7 4 3 10 4 7 3 2 0 0 9 10 1 0 5 8 1 1 10 8 6 0 4 7 9 3 8 4 10 3 7 6 10 2 6 6 1 8 4 6 8 7 3 9 7 6 6 9 6 5 2 10 0 2 4 4 8 2 6 1 0 8 8 7 2 2 1 6 8 7 0 6 6 9 6 9 0 2 1 8 4 1 9 7 5 4 1 2 0 4 0 1 10 2 4 6 6 7 2 5 5 4 0 6 8 1 0 6 0 10 4 10 4 10 1 10 3 10 10 5 2 9 6 0 4 8 3 2 1 6 8 8 7 0 1 6 1 8 6 7 5 7 8 0 2 1 2 10 9 8 4 8 0 1 6 9 3 5 10 1 5 10 9 1 1 5 2 1 0 2 0 2 0 5 5 6 6 2 0 2 9 8 0 9 10 5 8 8 6 4 4 0 1 2 2 0 9 2 3 6 6 8 10 9 3 5 10 4 6 6  [...]
+1 5 0 1 6 10 4 0 3 4 10 1 0 2 0 8 2 1 7 7 9 4 2 9 5 8 5 9 5 1 4 4 8 7 1 2 2 5 3 10 2 7 4 10 7 6 0 1 2 6 10 1 8 8 4 4 5 9 4 6 0 5 4 10 2 2 5 4 3 2 9 7 4 9 3 4 8 9 1 2 4 6 6 0 6 10 8 0 0 3 8 5 5 9 8 2 5 6 9 5 10 8 6 0 2 7 8 10 10 1 2 1 0 2 4 3 3 0 6 7 3 0 9 3 6 10 3 4 8 8 4 7 3 2 1 0 5 1 3 4 7 1 4 2 1 8 4 4 9 8 10 9 2 10 3 1 1 6 10 10 9 5 4 5 7 4 6 8 9 1 7 0 8 6 1 6 4 3 8 0 6 7 8 3 6 2 7 10 3 3 1 9 4 3 8 4 8 10 1 4 8 4 7 4 9 1 6 0 9 5 4 8 9 1 0 8 4 2 7 2 3 6 6 6 3 9 10 6 2 3 9 3 6 8 1 1 2  [...]
+6 6 9 5 6 0 5 6 1 0 6 4 10 3 8 1 5 5 6 7 4 10 0 4 0 7 8 9 3 9 2 1 1 10 10 9 5 10 10 5 2 4 9 4 5 6 5 9 2 2 8 10 9 7 7 1 0 2 9 0 9 1 8 5 6 10 1 4 6 9 4 0 8 8 8 1 6 5 3 5 3 8 0 8 6 3 10 9 10 1 3 2 1 7 0 5 8 0 2 5 7 2 8 3 3 9 5 3 10 5 1 3 1 3 1 9 9 1 5 0 1 3 2 3 0 3 2 9 6 7 6 2 3 6 1 7 8 9 9 10 8 0 6 0 1 2 5 4 0 9 6 10 10 1 1 6 10 5 5 3 0 7 0 3 8 4 8 3 6 6 4 9 2 0 10 1 0 4 2 0 5 2 0 9 0 5 9 5 7 8 8 7 1 7 9 6 6 10 3 5 0 4 8 1 9 8 8 2 4 1 8 1 6 4 8 6 1 1 1 8 6 0 8 7 6 9 2 3 6 7 0 4 1 1 7 2 4 6 [...]
+2 10 6 3 5 7 2 7 10 0 10 2 6 5 8 10 10 8 9 0 7 3 9 3 1 9 1 0 8 4 3 6 0 10 8 8 5 0 5 2 0 10 2 3 6 8 3 5 1 8 1 5 1 7 6 9 2 4 10 6 10 8 2 8 7 0 8 6 9 7 10 10 6 0 6 4 1 3 8 4 0 1 8 7 4 6 10 5 0 2 5 8 4 3 1 9 0 2 2 8 5 5 2 7 1 5 3 8 7 3 1 0 5 5 4 8 2 9 8 2 7 5 1 5 0 1 8 0 1 9 5 5 4 3 10 4 6 5 6 6 4 3 5 2 7 9 1 5 8 10 8 1 10 9 7 6 2 1 3 9 4 3 2 9 0 9 3 2 0 7 1 3 0 8 4 3 5 9 6 2 6 10 7 5 5 6 1 10 8 6 0 6 0 7 4 4 8 5 5 0 3 6 4 6 10 8 5 1 9 8 4 5 7 3 7 5 6 0 3 10 1 10 1 3 1 2 7 10 4 6 0 10 2 10 3 [...]
+10 5 7 4 10 10 1 1 2 9 1 1 1 7 5 10 5 10 2 3 10 7 9 2 3 5 1 3 0 2 8 9 4 4 6 8 6 10 10 5 7 4 10 9 4 2 0 7 6 6 8 0 3 10 0 1 8 10 10 10 0 4 8 2 6 4 6 2 6 5 7 5 4 10 4 2 0 9 7 1 8 2 7 4 10 0 0 4 8 3 6 1 2 9 3 7 3 9 6 9 8 8 5 8 2 8 1 10 0 10 3 3 8 6 10 8 6 7 7 10 1 0 8 6 3 2 6 10 10 4 3 2 7 1 7 9 6 1 6 1 5 10 1 2 4 6 1 1 1 9 7 0 6 7 4 7 4 0 0 7 10 4 0 9 4 9 9 7 9 10 2 9 0 3 5 4 5 7 7 10 9 1 8 1 1 9 7 4 2 1 7 0 2 5 10 0 3 4 6 5 5 6 6 8 10 8 7 5 1 2 2 1 6 5 8 1 5 2 4 5 2 6 8 5 0 9 10 7 2 7 0 7  [...]
+2 10 5 4 4 4 2 4 0 4 8 3 6 0 2 9 4 0 1 2 1 4 9 3 0 0 8 10 7 9 6 5 4 0 2 6 1 3 7 6 9 0 3 0 4 0 6 1 10 9 7 3 7 4 3 7 6 5 6 6 5 1 6 4 10 1 6 4 3 9 1 3 4 2 2 8 7 1 0 2 9 1 3 1 6 4 2 6 3 9 10 7 0 0 3 0 6 6 8 8 0 1 1 1 10 4 5 10 10 8 8 2 7 5 10 10 2 0 6 2 3 4 1 0 8 4 8 6 5 3 1 1 10 8 3 6 9 10 7 7 7 9 3 3 8 9 5 0 7 10 0 8 7 3 6 3 6 6 0 4 3 1 1 6 8 4 7 6 5 5 5 2 8 5 4 10 4 10 6 4 4 1 7 1 3 1 3 2 10 2 1 9 9 6 6 0 3 0 0 6 2 6 4 4 7 5 2 7 8 3 9 5 3 2 7 4 1 10 8 6 0 4 7 8 3 0 9 10 0 1 10 1 9 9 1 8 9 [...]
+7 10 3 4 4 8 3 8 9 7 3 5 8 1 9 7 5 3 9 2 6 7 5 0 1 5 3 0 3 5 7 1 1 5 10 8 8 6 5 5 6 8 6 3 9 8 7 0 5 5 5 3 7 10 0 1 7 5 7 0 7 2 5 3 1 9 1 0 1 7 9 10 8 6 6 9 8 1 6 10 8 1 7 7 9 4 2 2 0 4 1 6 4 4 1 0 10 8 9 9 6 4 9 2 0 3 7 7 3 10 3 7 10 1 1 5 7 6 2 0 5 0 4 8 7 10 5 3 4 9 3 1 3 0 1 6 4 6 1 7 3 5 6 1 3 8 0 1 10 4 5 9 1 4 2 5 3 9 5 1 1 6 5 2 0 9 8 10 2 3 3 2 10 2 9 5 2 10 0 9 9 5 0 5 8 8 0 9 4 9 10 0 1 4 1 3 3 6 3 10 5 0 1 6 8 7 10 1 7 1 3 5 5 7 6 0 3 2 2 10 3 6 8 5 8 6 9 6 3 8 1 6 6 0 7 2 10  [...]
+0 5 7 8 1 4 6 3 9 10 5 8 1 7 6 5 2 1 8 9 6 9 0 5 8 7 6 5 2 9 7 6 8 6 1 4 2 3 2 5 5 8 3 7 1 5 5 4 6 5 5 7 8 7 5 10 5 0 7 8 7 0 7 8 1 6 8 7 2 9 9 3 6 6 6 9 6 6 2 7 0 9 9 10 9 2 6 10 6 1 3 5 9 6 1 10 8 2 1 3 8 5 1 9 8 3 6 3 10 3 2 10 10 1 7 4 7 9 7 4 5 7 4 5 4 4 4 7 1 7 7 8 3 10 8 4 5 2 8 6 7 4 6 6 2 2 6 4 3 1 5 6 5 4 1 1 6 9 7 2 10 3 9 5 5 9 8 9 4 0 7 0 2 2 7 7 7 7 4 0 5 6 4 10 4 3 2 5 5 2 5 3 6 3 6 7 4 0 1 9 5 6 9 6 1 7 4 0 7 1 1 2 6 0 7 1 1 5 10 8 1 1 8 8 8 4 9 5 3 3 5 0 8 5 8 2 2 10 1 3 [...]
+8 4 10 1 5 4 8 0 5 2 10 1 9 2 6 10 0 4 6 5 5 6 0 10 8 8 0 1 5 3 2 5 4 4 8 10 1 4 3 5 7 8 4 8 4 7 1 0 10 9 5 3 7 7 1 10 0 2 1 1 7 7 0 1 9 1 4 4 0 10 3 4 8 5 8 9 10 5 4 10 8 0 2 8 3 3 3 2 0 6 10 5 5 5 9 3 8 8 4 10 5 9 0 10 9 7 1 6 1 1 5 9 1 0 1 7 7 8 6 8 4 0 1 8 4 0 7 5 3 10 9 7 5 4 7 4 6 9 8 10 10 2 10 8 9 1 10 5 6 6 0 3 10 3 1 10 10 6 9 6 10 3 6 5 8 4 0 0 5 3 4 4 8 1 5 4 5 2 8 0 7 4 1 3 7 0 1 0 0 5 9 8 7 5 1 5 0 10 9 10 5 2 5 10 10 2 3 3 4 4 0 3 2 9 4 3 1 8 9 8 8 0 2 7 4 7 6 1 9 7 6 6 3  [...]
+6 4 10 3 6 4 5 7 2 9 8 10 7 5 10 6 3 5 1 5 0 5 3 8 6 1 9 1 4 7 9 7 0 4 9 1 8 10 2 4 10 7 10 8 2 2 1 4 2 1 1 2 3 0 7 7 10 0 7 4 9 6 4 10 5 5 8 1 3 1 3 10 10 5 7 3 6 4 7 7 3 4 6 6 6 8 6 2 8 2 8 7 7 0 8 7 5 0 0 8 4 10 9 6 6 8 9 1 9 0 1 7 9 0 7 1 5 8 7 0 6 7 7 8 10 3 2 2 2 5 7 10 6 5 8 3 9 7 5 5 10 1 6 1 5 6 0 0 9 8 0 6 6 7 6 4 10 4 10 1 6 7 4 2 2 5 5 5 5 9 3 9 9 5 9 10 1 7 2 3 7 0 1 0 3 10 9 6 10 1 3 10 1 1 4 5 3 0 7 3 8 2 5 1 1 2 7 5 6 2 4 5 5 10 10 10 5 3 5 7 3 0 4 10 2 6 9 3 0 9 5 7 7 8  [...]
+3 7 4 10 10 10 4 3 7 5 6 4 2 6 9 6 8 3 0 7 9 4 7 7 1 10 2 1 4 9 1 7 8 9 5 4 0 3 6 9 4 7 9 6 6 6 0 2 2 9 4 3 5 4 5 2 2 6 8 8 9 0 6 8 8 9 2 9 10 8 8 2 1 0 5 0 7 1 1 5 1 10 3 8 10 4 7 8 9 1 6 0 0 4 7 2 3 9 8 7 0 4 5 10 2 4 9 8 8 1 9 7 0 8 10 10 5 1 10 0 1 8 7 9 1 5 10 7 10 2 7 10 3 8 4 9 1 7 3 9 8 0 0 8 5 7 4 8 8 7 7 4 6 5 8 5 4 10 4 4 0 0 1 2 1 6 4 1 9 0 1 2 7 0 3 9 8 10 8 9 6 4 4 8 1 7 5 2 3 5 7 1 2 6 2 3 10 6 7 8 0 5 5 8 5 2 5 2 4 9 7 10 7 0 9 7 10 7 9 6 6 2 7 0 10 5 8 4 0 5 6 8 4 8 0 3  [...]
+5 0 2 3 4 3 8 8 3 3 10 8 9 2 2 2 3 10 8 2 2 2 6 5 4 3 10 8 1 4 8 8 9 4 5 5 6 5 9 5 9 6 3 2 4 8 3 9 1 2 4 6 6 4 5 7 5 10 6 3 3 10 6 1 6 3 0 8 2 9 1 7 5 0 6 6 9 1 3 3 2 8 9 10 3 1 8 3 1 10 3 3 10 3 10 7 3 2 2 6 4 10 5 10 5 7 3 8 3 0 0 3 7 5 4 8 6 9 6 4 0 9 0 3 2 6 9 10 3 0 1 5 7 4 6 10 3 7 7 3 1 1 10 9 6 3 1 2 3 3 7 6 2 10 2 4 7 10 6 8 4 3 8 2 3 0 2 7 5 6 5 7 9 3 2 5 2 6 2 6 8 0 8 6 7 3 4 7 7 9 3 9 4 9 5 3 4 5 4 4 7 5 6 7 8 8 7 6 0 0 8 1 9 7 10 0 7 3 10 9 1 4 6 6 9 3 3 0 5 4 4 3 6 4 10 5 2 [...]
+9 6 10 10 0 6 0 2 10 10 3 4 9 1 1 8 7 2 6 1 5 8 5 5 2 7 5 1 9 7 3 1 5 8 7 6 7 9 2 10 3 0 6 8 8 7 0 9 10 9 8 7 4 9 4 1 1 5 2 3 2 8 1 10 3 6 3 8 6 3 0 9 6 0 9 4 1 6 9 4 3 9 6 9 5 10 9 10 5 2 5 2 10 9 7 0 2 4 2 1 3 3 9 2 10 3 9 8 8 0 6 6 1 5 9 8 3 8 4 8 7 5 4 1 1 4 6 6 10 1 8 6 2 3 8 10 7 1 3 1 8 1 8 3 0 2 4 2 1 1 7 4 0 3 8 2 9 0 0 7 9 9 6 1 3 1 10 7 0 8 9 6 10 5 5 6 4 1 9 8 4 3 6 8 1 7 5 2 10 4 7 2 4 9 2 6 4 1 5 6 0 5 1 9 4 2 10 1 8 1 1 8 6 3 6 6 1 10 7 5 8 7 10 4 3 4 4 3 4 4 6 7 6 3 9 8 5 [...]
+1 9 6 2 1 0 5 10 0 2 9 10 5 6 7 3 2 4 9 5 0 6 0 9 6 10 1 7 3 8 3 2 8 4 8 5 9 10 1 2 6 0 10 8 7 6 1 2 7 5 4 2 8 5 7 10 4 2 2 0 1 8 9 8 1 4 3 5 9 7 0 5 4 2 8 3 9 5 9 7 0 4 4 6 3 9 5 5 6 4 7 7 1 2 6 4 6 2 5 3 5 7 0 9 9 1 8 0 4 10 3 0 7 7 4 1 9 3 0 0 5 8 5 3 3 0 9 5 4 9 4 10 4 2 1 1 2 3 8 2 2 10 7 7 1 10 4 8 6 8 2 10 7 5 7 2 8 2 0 2 1 7 1 7 3 2 9 3 10 5 2 8 1 8 7 3 5 6 0 3 1 8 9 5 6 7 9 8 0 7 5 1 3 7 9 7 7 10 7 9 4 2 2 8 2 8 0 7 1 0 5 5 5 4 7 0 6 9 0 0 3 10 10 2 5 0 10 10 9 4 8 4 0 5 1 1 0 6 [...]
+6 8 6 9 7 10 9 10 10 1 6 10 7 3 3 6 6 6 3 0 1 6 7 0 8 7 9 3 5 1 7 2 9 2 8 7 10 2 4 8 6 0 8 4 0 9 0 4 4 9 5 4 1 5 1 5 7 10 5 1 6 6 7 3 7 0 5 4 2 7 1 8 7 6 3 7 10 0 4 4 10 4 1 6 1 10 8 10 0 9 6 7 10 10 0 2 1 5 2 1 9 8 8 3 6 6 0 9 5 6 10 7 6 5 2 10 4 10 2 9 10 6 8 1 10 6 1 1 1 5 7 6 9 5 9 10 10 9 0 1 6 6 3 5 3 9 1 1 2 6 8 7 5 4 10 7 4 6 9 0 8 9 8 10 4 2 5 10 0 3 6 5 6 7 2 0 8 1 0 5 6 0 3 0 3 9 1 4 4 5 6 0 2 5 0 2 4 4 8 6 3 1 7 2 10 4 4 8 6 5 8 9 6 1 10 4 3 0 6 1 6 0 4 5 4 5 0 6 2 4 3 3 5 1  [...]
+7 7 6 2 0 8 2 3 0 3 10 4 5 2 7 7 5 9 2 6 4 9 8 10 2 6 0 3 4 2 4 7 1 7 0 9 3 10 10 3 10 0 6 6 10 6 2 1 3 10 5 9 10 10 1 6 5 8 8 7 10 4 10 1 0 9 9 1 1 7 8 9 7 8 7 2 2 3 4 2 9 0 4 3 10 2 3 2 3 6 0 4 6 5 4 5 6 10 8 9 1 4 3 4 4 6 9 1 6 1 2 6 2 6 2 1 10 2 8 6 4 10 1 9 4 10 5 1 9 3 1 7 4 3 5 1 9 0 6 2 4 5 6 9 7 6 1 8 6 4 4 9 7 5 8 10 6 6 5 7 5 5 2 5 3 1 7 5 5 5 1 3 6 10 9 4 2 9 1 4 8 7 3 7 10 5 10 7 7 3 10 4 9 10 2 10 6 1 1 2 7 6 2 7 8 3 3 7 3 5 3 0 9 4 8 8 0 1 4 5 4 7 5 0 1 2 6 2 7 5 5 9 10 3  [...]
+10 10 0 3 4 6 0 3 7 1 3 0 1 1 10 10 8 2 0 9 9 2 6 2 10 1 4 7 8 2 4 7 5 3 10 7 3 3 5 8 4 2 1 2 7 4 2 9 8 3 6 8 10 2 1 6 10 8 6 3 0 0 8 10 2 2 1 8 0 8 5 7 0 7 3 10 8 7 1 10 1 10 0 10 0 6 8 4 10 7 9 10 4 0 2 5 8 4 5 10 10 5 6 7 3 10 8 9 10 7 9 7 1 6 9 2 7 5 0 4 8 2 10 2 6 7 7 8 0 7 2 10 6 9 6 1 2 8 6 2 3 0 6 7 7 10 10 4 7 5 1 6 2 7 3 0 6 8 4 5 3 8 4 3 0 8 6 7 2 6 7 9 4 2 10 9 7 0 6 8 3 2 4 3 3 1 10 9 10 4 0 9 4 5 3 5 4 3 9 5 8 8 1 3 10 4 2 9 3 3 8 9 5 9 1 2 6 2 1 5 7 5 7 3 0 1 9 2 3 4 9 5 8 [...]
+3 1 9 10 2 0 0 2 4 0 1 3 0 9 3 7 3 10 0 7 3 0 10 10 0 2 8 4 5 7 9 3 10 3 6 9 5 6 5 5 9 2 4 3 6 4 9 7 5 3 2 10 10 4 6 2 7 8 2 10 10 5 0 6 2 9 8 0 5 3 7 2 8 0 8 3 6 3 5 4 4 1 3 10 9 9 1 10 0 10 5 10 10 5 0 1 0 7 7 0 10 9 9 7 5 7 9 7 10 1 5 8 5 7 0 10 1 1 0 6 10 4 2 6 8 7 7 5 9 9 0 0 2 7 0 1 7 1 1 3 0 9 10 8 10 5 6 8 8 5 8 10 1 2 9 6 2 9 9 8 10 9 1 4 7 7 2 9 6 5 0 7 7 7 3 4 7 1 4 3 1 5 4 10 5 3 9 3 3 0 9 8 4 0 2 5 2 7 4 5 8 10 0 7 8 10 4 7 2 7 0 2 1 0 1 6 10 10 2 3 2 7 1 9 10 5 1 9 0 2 3 8  [...]
+1 8 9 9 5 4 9 2 7 3 9 8 4 1 4 6 8 1 3 4 10 10 6 6 10 4 7 1 9 8 10 5 3 1 1 7 10 3 3 10 10 4 7 1 9 8 3 8 8 5 7 8 2 3 1 6 10 8 3 4 1 2 5 8 3 1 5 1 4 2 3 2 8 7 4 6 4 7 7 10 10 8 4 2 2 3 2 1 7 10 0 5 9 6 6 10 3 0 8 7 7 10 3 2 9 10 5 3 9 7 1 10 10 8 8 0 6 1 6 4 1 8 1 6 4 1 8 5 8 1 3 0 9 7 3 0 5 6 8 4 10 5 6 8 8 7 10 8 3 9 10 1 5 9 3 3 4 7 10 6 8 1 4 10 3 0 3 6 0 7 8 1 6 3 0 0 2 4 1 5 1 4 8 9 6 2 4 5 0 7 7 0 4 0 10 3 10 5 4 9 6 5 3 4 9 0 9 2 6 4 5 5 6 3 6 0 1 7 8 10 9 1 10 6 10 5 3 8 8 3 6 6 8  [...]
+9 10 8 10 2 8 3 6 1 2 5 2 6 9 6 4 2 5 0 0 1 1 1 8 1 5 7 7 10 0 1 4 4 5 0 6 9 1 9 0 7 7 8 3 5 8 6 7 0 0 5 2 3 7 1 7 6 5 1 9 3 9 4 6 6 10 3 4 5 10 5 4 7 3 7 0 1 1 3 10 0 3 8 9 0 2 0 0 10 8 5 4 10 4 9 4 8 1 8 5 5 4 7 8 7 4 4 7 1 3 10 8 10 7 2 9 0 9 6 7 8 3 2 3 0 0 1 10 9 3 10 4 8 6 4 8 9 1 9 3 1 2 4 2 1 7 3 9 6 3 6 9 8 1 0 10 2 7 5 10 4 2 8 9 9 9 10 0 4 3 0 3 9 10 0 5 9 10 5 7 10 10 9 1 1 6 7 1 3 6 8 5 9 5 6 8 4 6 7 6 9 5 5 6 9 10 3 8 0 7 4 6 4 4 3 8 8 7 10 5 6 4 6 7 9 7 8 7 0 0 8 4 10 8 7  [...]
+2 6 5 3 10 7 10 4 10 7 10 2 3 3 0 2 7 8 0 8 10 6 4 4 4 8 7 5 0 2 0 5 10 7 1 3 8 0 4 0 10 1 7 2 3 5 10 1 0 4 0 5 9 10 5 4 5 8 7 7 6 6 6 0 9 10 0 7 5 9 5 5 2 9 7 7 4 7 9 7 5 2 3 8 4 0 2 4 1 5 3 5 3 6 5 9 4 9 5 7 3 1 8 3 6 3 6 6 5 4 1 0 0 6 1 7 1 8 8 10 4 6 5 7 7 7 7 3 8 9 2 8 0 10 8 5 0 4 3 0 6 4 6 2 6 0 2 8 4 8 8 10 3 3 4 6 5 5 10 10 6 10 1 0 5 10 9 8 9 8 5 2 10 0 9 2 6 7 5 5 0 4 0 4 9 4 7 9 1 7 9 6 10 8 6 0 5 4 5 3 8 6 7 5 7 9 1 3 10 8 9 10 7 9 3 3 8 10 5 8 1 3 4 2 3 10 1 3 1 1 8 5 1 5 8 [...]
+5 10 9 9 1 9 0 3 1 2 9 1 4 0 3 0 6 0 5 6 4 3 10 5 5 3 7 1 1 1 6 4 1 8 5 2 4 2 4 6 8 3 2 2 10 10 9 3 4 7 2 1 10 0 9 0 0 6 5 0 0 1 8 3 3 6 10 1 0 7 5 8 5 6 9 8 1 8 4 8 7 4 5 8 5 4 10 9 3 6 0 10 10 2 0 3 6 8 0 6 2 5 1 3 9 6 8 8 10 6 10 8 9 6 10 5 10 7 6 10 2 6 5 4 4 3 8 5 7 5 1 10 5 10 2 4 9 7 10 7 7 4 2 1 2 7 10 0 5 8 4 7 4 10 4 1 6 10 9 2 1 1 6 0 2 7 3 3 0 2 2 8 0 8 4 9 9 4 4 10 4 6 6 10 10 5 7 5 4 3 7 10 0 2 5 10 8 5 4 1 5 2 4 2 5 4 1 6 4 5 6 10 7 1 4 2 0 7 1 2 6 10 9 4 1 8 7 0 2 8 1 5 6 [...]
+3 9 10 8 8 5 9 4 8 8 3 4 3 1 5 9 7 2 8 1 9 4 9 0 2 7 7 10 9 6 3 2 5 4 4 5 5 0 0 2 8 5 2 2 2 6 6 6 6 2 8 8 6 4 8 4 2 0 8 2 9 8 5 5 7 10 6 3 0 7 4 1 10 10 6 6 4 6 8 10 8 5 6 7 10 5 8 4 4 7 4 2 8 9 0 5 9 9 5 9 7 10 9 6 5 2 0 10 3 1 10 1 0 1 8 4 0 6 7 4 7 8 0 3 9 2 5 0 7 5 2 6 3 8 10 7 9 0 4 0 2 3 8 1 0 1 7 0 2 10 5 9 2 0 5 7 1 2 1 7 3 8 1 3 3 1 8 7 6 7 3 7 3 9 5 9 8 4 4 0 4 2 10 1 2 10 7 1 6 9 9 0 8 10 8 0 5 5 7 2 10 5 0 2 7 5 2 3 2 9 1 1 10 4 9 7 7 1 1 6 5 2 7 8 2 6 1 7 8 10 4 6 0 0 0 2 0  [...]
+1 0 4 5 2 5 0 2 3 5 3 6 9 8 1 8 0 2 1 9 9 8 3 10 2 4 9 1 1 9 4 9 9 8 1 2 5 6 9 3 2 6 0 6 1 9 4 7 10 6 0 7 0 6 1 1 1 0 8 2 0 9 4 3 10 10 3 7 4 10 7 3 0 2 7 1 9 2 5 4 0 8 6 8 4 3 0 6 2 5 4 7 5 1 3 7 5 0 8 0 10 8 2 3 1 6 6 5 8 10 6 8 6 0 7 3 3 3 5 9 6 3 2 4 3 7 4 4 10 4 3 6 4 0 7 2 6 10 3 7 6 1 10 8 8 7 4 0 7 9 10 6 9 4 6 9 4 2 7 2 8 7 6 2 10 0 1 5 9 1 1 6 0 8 2 6 8 8 9 5 0 9 8 9 0 10 9 9 1 6 3 9 10 5 5 10 7 4 8 9 3 6 10 1 2 0 0 2 9 2 10 6 2 4 2 2 7 1 8 6 7 4 4 10 2 6 5 5 4 2 3 5 9 0 10 8 3 [...]
+7 6 7 1 10 5 1 10 8 8 0 0 8 3 7 4 4 0 8 1 3 9 9 3 5 10 6 0 10 7 6 1 8 6 1 4 2 3 10 4 0 3 8 2 10 9 7 10 7 5 4 9 2 10 2 1 0 8 7 10 10 1 3 7 8 5 10 10 10 2 7 8 8 4 7 4 3 10 7 1 7 6 1 0 5 8 10 0 2 3 1 6 8 3 9 2 9 3 6 5 7 5 8 9 1 1 2 2 6 2 3 7 1 1 7 9 5 7 3 6 0 9 7 8 1 7 10 1 8 8 10 7 1 5 5 2 8 4 0 0 6 8 6 10 9 7 3 3 4 4 2 8 5 7 5 4 6 3 4 0 6 7 9 8 9 2 10 6 5 10 4 2 8 9 3 2 6 4 9 7 7 10 5 9 3 5 6 9 7 1 5 1 9 6 6 4 6 7 7 0 1 0 10 9 5 5 4 10 6 6 6 5 10 6 10 8 3 9 9 3 8 9 2 2 5 7 6 3 7 10 0 4 4  [...]
+7 9 3 6 0 3 8 6 0 0 7 5 6 0 2 0 5 3 2 5 8 7 9 0 8 5 7 7 2 3 6 5 3 2 10 7 8 0 5 8 4 2 3 5 5 5 0 3 1 1 6 9 2 3 1 7 8 10 5 5 7 0 2 6 6 0 10 7 8 1 0 2 5 2 7 10 10 9 0 4 3 5 8 3 9 10 5 1 0 2 10 7 3 9 6 8 8 7 6 1 6 2 7 7 6 7 1 9 4 7 7 0 0 3 6 5 10 5 9 0 2 8 4 9 10 3 9 4 5 3 8 8 3 10 0 5 0 6 2 2 8 7 9 3 2 2 5 0 10 8 6 6 5 3 5 0 3 3 4 10 6 0 6 4 4 9 4 8 1 0 0 3 8 10 10 9 3 3 8 4 1 7 3 10 7 2 2 2 1 5 10 2 0 8 10 9 1 0 2 5 9 7 7 5 9 5 10 0 1 9 3 10 6 9 2 0 8 10 3 7 5 9 4 5 7 3 3 5 0 7 3 10 6 5 9 6 [...]
+10 1 3 10 1 5 6 9 7 10 3 7 8 2 3 1 7 6 6 8 4 2 9 1 3 7 7 1 6 4 6 8 2 3 9 7 8 7 8 9 5 1 3 9 7 8 3 1 5 10 4 8 4 2 3 8 7 2 3 9 6 3 10 10 6 8 2 8 6 10 9 3 8 2 3 9 3 5 0 9 0 9 0 5 10 1 6 5 10 7 6 10 0 4 3 3 10 8 6 1 5 7 0 9 8 7 1 9 3 4 7 5 6 2 0 10 7 8 2 8 3 2 9 3 3 9 2 4 4 10 10 2 7 7 7 3 5 3 6 10 6 4 5 10 1 3 2 0 1 2 0 8 4 10 7 4 0 10 1 10 5 8 2 4 7 0 9 3 5 0 4 10 8 2 2 4 1 1 10 8 5 5 7 8 3 8 8 1 5 6 1 2 4 6 3 7 3 7 0 2 4 8 9 8 4 2 10 6 7 2 7 9 5 0 3 6 3 2 7 2 2 9 6 6 1 6 0 3 2 2 5 3 8 1 3  [...]
+3 8 7 3 7 9 6 0 5 1 10 7 7 0 2 5 10 7 5 6 4 10 1 5 6 10 9 1 3 6 7 5 2 6 4 0 7 6 8 2 2 10 9 2 8 2 7 3 8 0 3 6 5 2 4 4 6 6 3 9 7 2 5 7 6 7 6 8 8 0 5 5 5 5 1 1 1 1 0 2 6 4 9 3 8 7 4 8 4 3 3 5 10 2 7 7 2 10 4 5 9 7 4 10 8 2 3 2 6 1 9 4 4 2 5 7 3 4 1 1 1 10 10 5 1 3 1 7 0 8 0 2 7 4 9 5 1 1 0 9 10 1 5 4 7 6 6 0 9 9 4 1 7 4 5 1 3 5 1 8 10 0 8 0 10 10 4 8 10 6 10 5 6 8 2 9 5 9 1 6 8 2 2 8 8 10 2 5 1 5 0 4 9 10 9 4 0 2 2 0 2 3 3 0 5 1 4 0 7 3 1 7 1 3 5 4 8 2 8 8 1 1 2 9 1 10 5 3 9 10 9 8 8 8 1 2  [...]
+8 1 0 8 3 3 5 4 8 6 7 10 3 7 9 0 6 9 0 8 2 6 4 6 0 1 1 8 9 5 7 4 6 0 4 8 7 10 10 5 1 5 5 2 8 6 7 4 2 2 8 7 4 9 3 0 5 3 3 6 9 2 5 0 2 1 1 10 4 8 6 4 10 4 6 1 2 5 10 6 9 4 3 3 9 4 4 8 4 3 2 1 10 5 8 6 2 4 6 4 0 9 7 4 7 2 1 0 3 2 1 7 7 10 8 9 4 7 3 10 3 5 7 2 4 6 1 0 3 3 4 5 9 7 2 7 9 8 0 7 5 3 8 1 0 10 10 9 5 7 10 8 2 8 0 2 8 5 8 5 2 5 10 9 9 0 0 3 2 0 7 0 8 3 9 6 9 3 6 4 4 1 6 8 1 4 3 1 2 7 10 7 5 8 4 7 4 7 0 10 7 4 0 8 7 6 8 3 2 8 3 4 3 8 7 1 0 8 10 3 8 3 7 3 1 1 7 8 3 4 6 9 3 10 5 3 7 6 [...]
+3 1 3 9 7 2 8 2 2 8 1 8 8 5 3 5 2 4 9 4 0 9 5 6 4 6 3 1 8 10 4 7 10 10 7 4 8 2 5 1 6 5 4 0 7 1 4 8 1 2 8 1 1 4 8 0 10 3 6 7 1 3 9 7 8 1 7 1 10 0 10 0 2 1 1 6 8 8 4 0 10 0 9 1 10 7 7 2 2 10 6 0 0 0 8 9 4 6 4 10 9 4 8 0 10 7 2 9 5 9 9 9 5 7 4 7 2 6 1 8 1 3 5 4 0 1 0 3 2 3 1 10 1 4 10 3 1 2 5 10 0 4 7 5 4 7 6 1 4 8 1 7 4 5 9 6 0 4 10 4 0 3 5 10 4 7 6 4 10 8 6 10 7 0 0 2 6 3 8 4 6 1 1 10 7 6 4 3 9 0 5 2 6 4 2 6 9 7 10 7 6 10 8 6 6 6 1 0 0 7 6 7 8 5 8 0 3 1 9 3 6 7 2 7 10 7 9 3 4 7 10 2 4 5 2 [...]
+1 1 9 2 2 3 5 2 10 3 2 3 2 2 4 0 2 0 10 1 2 1 8 5 5 8 8 2 10 3 5 3 1 8 0 5 6 2 2 7 7 2 9 7 0 5 10 6 7 9 10 8 1 10 3 8 1 4 6 3 2 6 9 4 10 10 5 1 10 2 4 4 4 6 7 4 6 3 3 7 8 2 9 9 7 7 3 6 6 10 10 1 8 2 2 8 10 5 1 9 0 6 8 8 1 5 5 9 0 4 2 7 3 2 6 4 10 2 9 5 9 9 2 3 0 4 3 1 7 8 9 0 7 10 0 6 6 9 7 4 7 10 1 7 10 5 7 3 3 8 4 7 5 8 0 7 0 4 5 10 5 4 10 9 2 5 9 2 10 6 9 6 2 8 1 8 3 10 8 1 3 0 8 4 1 0 10 7 0 10 7 3 8 8 2 1 6 6 3 2 10 5 6 3 6 10 4 1 10 9 1 1 10 3 8 9 5 4 1 9 0 7 0 9 6 10 0 1 8 7 8 2 8 [...]
+10 10 1 0 7 2 0 4 3 3 4 2 6 2 6 9 3 4 9 0 6 7 9 7 8 9 4 6 3 5 8 1 4 6 3 3 2 8 9 9 4 10 6 1 9 2 1 8 4 5 5 3 8 6 10 0 4 1 5 6 4 4 3 6 4 9 1 1 7 9 5 8 7 3 6 1 1 0 0 5 3 0 4 6 8 9 1 10 2 10 6 9 4 3 2 1 2 3 3 3 5 9 9 9 2 4 10 1 1 9 1 4 1 0 1 4 7 3 4 9 2 5 0 8 6 1 0 2 5 0 1 1 6 3 7 6 10 10 9 4 0 8 4 8 3 2 6 2 7 2 10 7 8 10 5 5 3 5 4 7 3 1 8 6 5 5 4 1 4 4 8 3 2 2 4 6 4 0 1 10 4 5 1 5 1 5 9 3 1 1 8 8 5 7 5 8 3 7 10 5 3 1 9 8 2 0 0 1 4 10 0 7 4 7 9 5 7 2 7 8 9 6 5 0 7 9 0 5 9 8 7 6 7 9 8 7 2 7 1  [...]
+2 4 0 5 10 9 3 5 5 7 8 7 2 6 1 3 6 5 6 7 5 0 10 5 8 6 7 3 10 0 3 0 10 7 6 10 2 9 7 2 8 6 3 7 7 1 9 8 2 10 0 6 5 3 3 0 0 9 2 1 0 9 6 1 3 8 3 9 8 7 3 7 6 9 1 0 10 5 1 1 2 1 0 0 10 2 1 8 9 10 4 9 4 6 2 3 5 2 7 3 1 2 0 7 10 9 4 1 8 7 8 5 9 8 9 3 1 1 2 1 10 7 6 6 6 9 8 10 8 10 5 8 7 8 1 6 1 2 6 3 9 7 8 3 7 6 7 9 2 5 5 0 0 6 6 7 6 8 8 6 10 10 3 2 0 0 10 6 8 10 8 8 1 1 3 7 5 1 5 0 7 10 2 5 0 7 9 5 4 8 1 8 7 8 7 2 2 5 6 6 4 10 3 7 5 1 3 8 7 2 4 5 10 3 3 0 6 7 7 9 8 0 3 9 5 5 1 2 6 5 5 5 6 8 10 6 [...]
+5 9 8 3 5 1 0 3 4 3 6 3 9 2 0 10 1 6 3 2 0 1 5 9 3 4 4 10 8 10 3 3 10 1 10 3 10 0 6 9 2 0 10 10 0 5 5 9 3 7 1 0 1 5 4 10 1 3 10 0 5 3 5 10 9 9 7 8 6 3 8 1 2 2 1 5 0 3 0 8 4 10 7 4 8 10 0 6 3 9 3 9 6 9 10 8 5 2 10 3 9 4 0 3 6 5 6 7 7 0 10 10 3 3 5 5 3 4 5 7 0 10 2 5 5 1 8 1 2 4 3 8 5 4 1 2 4 4 0 3 0 8 2 1 1 3 8 9 0 6 3 10 1 8 0 1 2 7 10 4 9 10 9 0 10 9 5 10 8 1 3 5 10 2 3 9 6 7 7 5 9 3 2 1 0 5 0 1 8 4 6 7 5 2 0 4 6 0 1 10 1 8 1 6 9 8 8 7 6 9 3 2 2 3 6 9 6 4 1 4 9 3 6 4 5 4 1 10 0 4 4 6 2  [...]
+8 4 8 2 6 6 0 9 9 8 0 0 3 0 10 5 7 9 2 9 0 10 1 2 6 8 0 2 9 5 1 7 9 2 0 1 5 3 3 0 10 9 3 9 0 9 10 1 0 0 6 4 9 7 10 0 10 0 9 4 4 5 0 3 5 0 6 4 7 9 6 0 10 3 6 5 9 9 2 10 5 7 3 8 8 0 0 0 4 5 6 4 8 1 1 2 0 1 2 3 10 0 0 7 9 3 9 2 8 5 10 7 8 10 3 1 2 4 0 10 10 7 4 5 4 8 4 9 8 8 3 4 3 8 2 8 5 0 7 0 5 2 7 9 0 4 0 7 1 3 0 10 2 9 8 1 10 5 7 8 2 1 4 1 1 2 10 3 1 7 1 2 6 4 7 7 1 6 9 7 2 1 6 6 8 6 6 9 10 3 0 2 5 0 2 4 2 1 5 1 8 8 0 4 8 2 10 7 7 8 6 1 7 3 1 7 4 2 9 1 5 0 0 6 2 5 7 6 7 9 7 10 1 5 0 4 0 [...]
+1 7 7 9 3 1 7 5 8 1 5 2 7 9 2 9 3 0 7 5 9 10 10 7 5 6 9 3 4 7 8 4 1 8 8 4 9 9 5 2 0 10 10 5 5 0 9 3 10 10 3 9 3 1 7 1 5 3 3 0 6 2 5 10 0 4 0 5 2 3 5 6 9 3 3 5 8 9 7 10 5 2 5 5 10 9 8 1 2 8 7 6 3 4 4 5 6 2 8 6 9 4 5 9 7 10 0 6 2 7 3 9 10 1 2 8 1 8 5 4 9 2 3 3 0 9 10 0 6 4 3 1 1 1 4 8 5 4 6 8 2 9 6 2 6 7 9 6 0 0 1 0 0 3 10 3 8 0 10 5 10 0 6 4 9 10 4 3 0 6 9 10 5 3 5 1 1 3 5 8 2 0 3 3 2 3 6 4 10 3 7 8 0 9 8 9 10 3 2 0 10 6 3 0 10 7 0 8 1 5 5 7 1 9 9 3 2 1 5 2 6 2 7 9 3 0 3 8 1 2 6 3 5 3 1 5 [...]
+2 4 9 5 1 1 5 5 1 4 4 7 3 1 8 4 10 7 5 7 3 10 9 10 1 1 3 7 4 3 10 7 7 4 3 10 10 7 3 5 5 10 10 6 0 2 10 6 3 2 6 7 1 2 3 3 10 5 9 5 5 6 7 2 4 9 6 7 10 6 2 4 8 2 10 4 7 1 5 8 2 2 8 2 10 7 9 8 0 2 4 10 2 6 1 10 6 1 7 2 6 10 0 5 4 3 1 0 8 7 6 2 4 2 10 2 7 5 9 10 0 10 10 1 5 1 10 9 7 6 9 10 8 5 3 0 6 7 7 7 6 4 5 5 4 8 0 6 9 3 2 8 9 7 4 10 10 5 3 6 0 6 9 9 10 5 1 4 0 9 3 10 3 1 10 2 5 6 1 3 10 3 4 10 7 1 9 1 7 5 5 5 0 3 6 7 7 1 7 6 9 4 1 6 1 2 9 10 6 3 4 8 7 1 4 0 10 0 5 5 8 5 4 1 3 4 0 0 8 0 1 [...]
+1 6 4 7 3 10 9 0 8 7 8 0 7 2 4 10 2 3 2 1 3 4 10 9 9 8 2 8 5 8 7 2 6 5 5 10 5 6 0 8 8 5 5 5 3 9 1 9 6 3 1 2 10 8 2 2 9 5 6 4 8 0 2 6 9 8 8 8 7 6 2 5 7 10 8 1 4 8 8 10 9 2 4 6 6 4 0 2 3 5 2 2 4 6 6 0 0 4 8 3 0 10 9 1 9 1 5 9 1 3 8 7 7 1 3 6 0 3 4 7 6 8 6 8 6 9 4 0 9 6 9 9 1 3 1 7 9 2 8 5 9 1 4 0 0 7 2 0 4 4 4 9 2 7 8 8 3 7 0 3 8 8 2 7 0 0 10 7 4 6 7 7 6 7 6 6 5 10 5 4 4 6 2 7 1 6 6 8 3 1 0 4 10 7 5 1 1 2 2 4 7 5 7 7 10 8 4 3 5 8 9 7 10 2 6 3 9 8 5 5 6 6 5 10 1 4 5 7 6 3 9 3 5 5 1 6 10 6 7 [...]
+4 1 8 1 7 3 7 5 8 1 4 8 8 5 7 1 2 3 2 10 7 10 6 9 2 5 9 0 10 9 0 8 3 7 1 0 4 7 0 6 7 8 1 9 4 8 6 3 4 5 5 6 3 6 3 8 6 10 6 1 4 6 10 8 2 8 4 0 2 1 4 10 2 1 9 4 9 0 9 9 2 2 10 10 0 2 10 3 2 5 4 5 8 2 3 9 6 2 0 0 8 3 7 5 10 4 9 10 1 9 5 7 5 10 7 4 3 9 8 7 1 1 5 8 2 7 4 7 7 9 2 9 10 7 6 5 3 6 3 0 5 8 5 4 3 7 4 6 1 3 8 2 5 4 0 4 1 6 9 4 5 6 2 5 10 6 6 7 0 5 8 10 6 4 2 5 6 8 2 1 8 10 7 4 1 5 2 7 4 8 2 4 2 10 7 0 2 1 7 2 1 2 7 7 9 1 7 4 3 8 2 6 4 6 6 1 8 1 8 9 2 9 6 4 7 10 6 4 4 10 9 10 0 2 5 7  [...]
+5 2 4 8 4 8 9 0 8 5 10 4 9 3 5 3 3 9 9 10 1 0 2 8 7 1 0 6 2 5 10 2 5 3 10 4 8 7 9 4 1 10 7 4 8 8 8 3 6 10 3 8 7 2 10 6 9 7 2 5 0 9 1 9 4 1 7 4 10 10 10 2 3 6 2 0 6 1 8 5 8 3 1 8 3 6 4 9 6 2 9 5 7 6 2 1 9 0 0 4 10 4 2 2 6 1 7 1 6 9 10 4 1 6 10 8 6 1 2 7 10 5 6 6 6 9 5 8 5 10 9 2 9 7 1 4 10 6 1 8 2 2 3 6 10 1 4 6 7 0 1 6 8 8 5 1 9 6 3 1 1 0 6 9 6 7 5 3 2 4 10 7 6 7 9 3 5 7 2 0 9 7 5 2 1 5 1 8 1 8 1 10 0 9 5 3 10 1 3 3 2 6 0 8 2 2 0 5 6 1 4 5 0 10 7 4 0 4 7 4 1 9 2 10 8 0 1 3 7 2 10 8 1 2 0 [...]
+9 0 10 7 1 10 0 4 10 5 10 10 7 8 6 0 4 1 2 2 3 9 2 2 2 3 1 2 0 8 2 1 3 8 9 9 1 2 7 1 10 0 2 4 9 4 5 8 4 5 8 4 10 8 6 3 7 4 10 10 5 1 9 8 8 8 4 0 8 3 0 5 2 3 2 1 8 0 9 5 6 4 6 0 2 0 7 0 3 3 0 0 1 6 1 10 4 4 7 7 0 1 10 10 9 9 9 8 6 0 10 3 0 4 0 7 6 0 0 1 4 3 0 3 7 5 1 1 6 6 0 3 1 3 2 10 2 0 2 1 8 0 8 0 7 3 7 3 5 8 6 9 0 7 7 10 2 5 4 10 5 0 4 3 9 1 3 5 1 7 8 8 8 1 7 3 3 10 0 10 6 0 7 9 10 0 9 8 6 4 4 10 1 3 2 10 5 3 2 5 2 9 7 4 7 2 2 5 3 3 2 0 5 0 5 9 8 6 1 2 3 1 6 0 3 2 10 2 0 3 1 10 6 8 6 [...]
+0 2 2 3 0 7 4 4 0 1 8 9 6 2 8 6 5 3 2 3 7 6 3 5 8 2 5 6 0 7 1 1 9 3 6 8 1 1 6 8 3 7 4 5 3 2 3 5 9 7 3 3 1 1 2 2 6 2 2 10 7 0 1 4 7 8 1 10 3 3 8 2 6 6 1 6 6 4 5 0 7 0 4 3 8 2 9 7 4 7 2 0 2 4 3 9 0 7 8 3 2 7 0 9 10 2 5 7 7 5 10 4 2 8 0 4 6 10 4 7 4 0 7 3 3 6 5 4 1 8 5 2 9 0 7 0 0 9 0 5 6 6 2 5 6 9 2 8 9 0 9 6 7 7 1 9 7 2 0 3 8 2 10 3 0 1 0 9 2 4 9 6 9 4 6 6 2 9 0 6 7 7 8 10 7 3 8 10 3 1 1 2 7 9 5 1 7 2 6 7 2 6 4 8 0 4 4 7 7 6 3 7 2 5 8 9 7 4 3 6 6 0 3 4 3 3 3 10 2 6 9 1 6 6 9 10 7 8 5 6 0  [...]
+3 1 8 5 10 6 2 7 7 5 9 7 2 10 4 7 6 7 0 9 2 6 3 7 0 9 5 6 8 8 5 9 3 4 6 7 7 10 1 5 5 2 9 6 6 1 9 9 8 7 7 6 7 1 0 9 6 2 9 6 10 1 7 6 2 8 8 6 5 10 10 9 1 6 10 6 1 4 8 1 0 3 5 10 10 7 2 0 7 4 7 3 4 6 2 8 2 5 8 2 2 7 3 7 9 3 4 7 9 4 5 8 5 4 0 9 3 5 9 8 5 1 2 6 6 5 5 0 7 4 2 3 4 5 3 8 0 10 0 2 5 0 7 3 0 6 9 9 7 6 4 9 7 9 9 6 4 7 4 4 10 3 1 9 8 1 6 0 0 6 1 3 2 8 8 3 1 7 1 6 8 9 7 8 7 2 6 2 6 5 5 8 7 1 5 0 5 4 8 2 0 6 4 1 1 9 7 0 4 9 4 10 2 0 10 10 6 1 10 10 6 5 7 5 7 3 6 3 4 3 6 10 7 3 1 5 5 4 [...]
+6 10 9 1 8 6 4 9 0 1 0 2 0 10 2 8 2 0 3 9 9 5 0 9 8 6 7 0 1 4 3 9 5 2 7 10 8 2 7 8 2 10 8 9 2 10 2 9 0 0 0 7 2 7 5 9 5 8 9 5 6 8 8 6 10 9 3 7 10 4 3 3 8 10 6 10 8 10 3 7 5 10 8 0 0 3 10 1 4 7 10 8 3 2 3 10 10 3 7 1 10 8 6 8 5 4 7 4 8 4 8 0 5 6 2 5 9 1 6 1 5 5 3 4 5 10 6 7 5 1 9 9 3 2 10 7 0 3 2 5 5 5 3 3 6 1 1 1 10 5 8 10 5 3 3 5 8 8 1 4 4 1 2 4 9 9 8 5 6 5 4 4 1 3 4 4 3 2 9 0 2 0 8 3 0 0 9 7 7 7 0 9 1 1 6 9 1 3 10 10 10 5 8 2 5 1 10 4 1 9 4 10 1 1 4 8 7 1 10 6 8 0 0 10 9 5 0 5 1 4 4 8 7 [...]
+0 3 0 7 8 6 9 4 7 3 1 3 3 6 3 3 4 10 9 5 8 9 8 5 6 9 4 10 1 1 8 3 10 9 2 3 7 6 6 8 8 10 0 7 2 5 6 4 1 3 1 3 5 5 7 0 4 4 6 2 5 1 0 0 9 7 2 7 10 4 6 0 10 6 6 10 7 4 5 9 7 0 0 6 7 10 8 7 9 1 3 3 7 9 5 5 8 9 10 2 7 7 7 0 10 6 6 2 9 1 0 3 3 0 9 4 0 8 5 9 5 2 3 5 0 5 3 7 1 6 9 3 1 2 10 9 0 4 6 6 1 6 1 8 8 10 2 10 5 4 4 3 9 2 0 3 0 10 9 0 6 4 2 9 1 3 9 1 5 2 0 5 1 1 9 0 6 1 10 9 2 5 0 7 10 6 3 2 3 0 8 8 0 4 4 3 4 3 10 7 4 8 1 1 10 1 8 5 3 5 0 7 10 3 10 6 2 10 10 3 4 3 3 2 5 0 9 2 10 7 9 5 4 7 1 [...]
+8 2 6 6 1 9 10 0 2 10 2 3 3 5 7 2 4 6 6 5 9 7 7 10 2 10 2 5 0 8 9 0 10 3 8 9 0 8 0 5 4 2 10 9 3 9 0 9 7 5 6 8 4 8 6 8 0 1 1 1 6 8 7 10 5 0 1 4 7 7 6 8 0 1 8 2 7 6 9 5 4 0 0 1 4 7 2 0 3 4 3 10 10 5 6 6 9 2 2 9 7 4 5 2 8 7 5 2 9 9 6 1 8 2 0 8 5 2 5 7 2 10 0 5 5 6 4 10 4 4 3 10 9 10 3 2 0 1 2 9 4 6 7 5 0 8 10 8 10 2 6 4 8 2 0 1 7 1 9 7 2 1 9 10 10 9 0 2 1 0 0 9 7 2 4 3 2 6 3 5 6 7 4 6 4 4 10 0 0 6 4 9 9 4 5 7 9 7 8 2 2 7 3 9 9 10 4 10 2 9 7 2 4 7 9 7 0 4 10 6 10 2 1 9 8 2 10 0 10 3 7 8 3 6  [...]
+5 7 2 1 7 1 7 1 0 5 10 6 7 4 3 9 5 8 1 3 7 0 5 1 7 9 7 6 10 8 9 7 6 7 0 0 10 7 0 7 0 1 1 0 0 3 9 3 6 5 6 7 2 1 3 5 6 7 8 1 10 10 8 8 10 10 0 8 3 1 5 10 9 10 4 2 9 3 3 4 7 4 6 2 4 10 5 2 4 3 4 0 7 1 9 10 7 2 2 7 5 4 9 3 1 8 5 2 7 6 1 0 5 1 0 9 5 1 2 5 4 1 5 6 5 4 5 3 0 9 8 2 5 2 2 9 1 8 0 9 3 1 1 5 2 9 2 9 0 6 4 9 5 9 7 10 10 9 3 3 10 0 9 7 8 1 8 10 3 5 5 3 10 3 10 6 1 9 9 3 0 6 10 4 10 6 3 9 7 9 4 0 3 10 6 7 0 9 7 0 1 0 9 10 4 2 6 5 7 6 10 10 7 3 2 8 8 0 4 9 2 5 3 3 8 7 1 7 0 8 4 6 2 7 7 [...]
+6 8 4 4 8 5 10 5 10 10 9 6 2 4 6 10 1 2 6 5 8 4 9 3 10 5 5 3 1 1 2 6 3 4 8 5 8 1 9 4 5 0 3 6 6 2 3 10 10 1 6 2 2 10 1 10 9 1 8 8 10 4 1 5 6 10 9 7 5 8 7 3 1 8 10 9 8 9 2 10 1 9 1 4 6 8 4 4 9 8 8 5 8 8 10 1 6 0 6 10 1 0 2 9 3 6 4 8 3 4 8 4 2 9 4 0 5 0 3 3 2 10 5 8 5 5 7 7 5 10 1 4 3 5 10 5 2 10 2 1 4 10 0 7 7 8 3 6 8 3 10 5 5 2 8 2 3 2 8 0 3 5 2 2 4 0 0 7 2 5 3 7 7 1 9 0 9 0 0 2 10 2 5 1 4 1 9 0 7 2 0 8 2 0 8 5 4 6 9 1 7 1 6 3 10 5 3 3 0 9 4 9 1 0 0 2 10 9 3 6 5 3 8 10 8 10 3 7 9 0 10 10  [...]
+5 7 9 2 1 10 6 3 8 9 4 4 7 9 1 10 1 3 10 7 10 8 6 2 5 4 10 0 2 5 8 0 4 10 5 10 9 3 9 6 8 2 8 8 8 7 0 8 10 9 7 10 2 10 7 9 1 3 4 0 3 10 6 3 1 4 10 2 4 5 7 4 6 3 3 10 7 4 2 4 5 9 4 3 5 4 2 4 0 4 1 5 1 7 0 10 2 0 4 0 9 0 4 6 1 5 7 4 5 0 5 1 0 6 6 3 10 8 7 6 1 2 1 1 6 4 9 3 0 8 5 6 0 7 2 0 10 4 7 7 4 2 6 6 9 7 7 10 2 3 3 4 4 9 9 8 7 2 4 1 2 10 2 6 7 2 4 7 2 8 2 4 4 0 4 2 7 6 2 10 10 3 0 3 8 1 2 10 9 10 3 3 3 4 2 6 8 2 0 5 6 7 4 8 10 8 0 3 1 5 9 7 9 9 4 7 7 0 2 3 4 6 2 0 9 6 10 5 4 9 7 6 9 9  [...]
+4 7 10 6 10 10 5 6 6 6 0 0 3 7 10 6 5 7 3 4 4 0 8 8 7 0 5 9 6 7 9 5 9 5 7 8 10 0 4 9 0 1 6 9 2 0 3 4 3 1 8 2 5 10 10 9 8 6 4 7 4 5 4 8 9 2 7 9 8 6 2 4 6 8 5 6 9 8 6 8 8 8 3 2 8 2 3 0 8 8 6 10 4 0 4 6 10 1 0 7 6 3 5 0 3 10 1 3 10 3 5 6 3 4 5 0 7 1 2 3 9 8 6 1 10 1 1 9 6 6 6 1 6 1 7 4 3 6 7 3 2 7 8 10 7 8 10 7 3 2 5 8 1 0 0 7 4 7 4 2 4 6 1 10 8 6 5 10 9 2 3 10 5 7 7 2 7 8 9 6 0 3 4 9 5 2 2 10 1 7 8 0 5 1 6 8 9 8 4 9 9 1 6 1 6 4 1 4 3 3 3 3 0 6 10 4 4 4 8 10 10 0 2 4 1 5 5 1 7 8 1 6 0 4 6 5 [...]
+4 1 8 7 9 8 7 7 0 0 4 0 8 9 0 0 2 6 1 0 10 9 2 0 3 2 2 3 7 8 7 2 2 4 10 0 8 3 7 10 6 6 5 0 7 0 5 7 10 6 10 4 6 1 6 3 4 9 1 4 1 3 0 9 2 7 6 7 0 10 8 10 5 10 1 7 3 6 1 8 9 6 7 4 9 2 6 0 3 5 6 6 0 2 9 3 1 10 10 1 1 8 6 5 10 10 0 1 5 10 6 7 6 2 4 5 4 9 5 2 10 10 1 2 0 3 4 2 7 8 2 1 10 1 5 6 3 10 9 5 10 8 1 4 1 1 0 5 7 1 3 0 6 4 2 5 0 5 8 0 9 10 9 5 2 5 8 1 7 4 7 9 3 10 2 6 1 7 5 5 3 6 4 3 8 2 9 5 1 6 8 2 7 10 3 4 8 4 8 2 8 4 6 5 9 9 4 4 1 6 2 6 1 6 8 7 10 0 9 0 1 9 7 5 1 2 2 8 4 4 7 2 0 4 0  [...]
+9 10 5 2 9 6 0 10 0 1 0 2 9 7 10 4 4 4 7 3 0 10 9 10 8 1 6 10 2 9 7 8 9 6 10 6 10 8 1 6 9 6 7 3 0 2 2 4 8 6 3 3 0 4 0 0 1 0 3 3 5 0 7 6 8 9 6 9 1 2 6 3 5 3 9 7 2 0 6 7 0 3 2 2 8 10 0 2 2 8 9 2 7 4 7 9 9 9 6 10 5 7 2 6 3 6 10 5 1 6 9 1 9 6 3 2 8 2 1 2 10 7 5 8 3 3 8 3 1 5 5 8 6 6 10 7 9 5 4 5 0 8 7 1 0 3 3 6 2 7 8 7 6 10 9 10 4 7 1 0 10 8 0 0 1 5 5 4 1 5 6 2 6 1 7 4 6 0 7 3 4 10 2 3 4 9 8 4 7 9 10 3 7 6 2 0 2 0 0 10 4 10 0 2 5 2 10 5 7 5 0 7 7 2 7 3 9 9 6 10 9 5 0 3 3 6 4 6 4 10 8 0 4 8 6 [...]
+3 2 2 8 0 10 5 3 5 2 10 6 4 10 9 6 10 8 2 6 1 5 1 3 3 2 4 8 5 5 6 6 0 10 7 2 4 9 1 10 4 2 3 6 3 0 1 3 9 9 0 0 4 9 7 1 0 0 7 3 6 8 4 2 10 0 5 10 8 9 10 2 2 5 2 9 10 5 10 4 1 10 8 1 4 10 5 3 9 7 9 0 8 3 8 6 5 4 9 7 10 7 5 6 0 0 2 3 4 3 5 9 1 4 2 3 8 8 3 1 9 2 6 5 6 8 10 2 8 2 3 5 2 4 10 8 9 1 6 9 1 7 1 0 9 2 8 0 1 2 7 6 8 1 6 9 0 6 9 3 2 0 4 0 6 9 6 7 7 9 9 1 6 7 8 0 3 1 3 3 1 1 0 8 1 3 8 9 0 5 5 4 2 4 8 2 5 2 1 8 3 1 0 10 3 8 6 7 8 8 6 10 1 0 8 1 10 6 3 1 3 3 6 0 3 0 8 7 2 2 3 10 9 4 6 1  [...]
+1 2 9 8 9 2 3 7 4 5 6 7 3 3 7 1 9 6 8 10 5 6 5 7 6 2 0 7 5 3 6 3 3 0 3 8 7 6 10 2 3 9 1 6 7 1 9 7 9 4 10 4 8 2 5 2 8 8 2 0 6 2 8 4 9 6 0 3 0 3 7 5 7 9 0 4 6 4 1 5 4 3 0 2 9 10 3 9 9 10 3 8 5 4 4 6 1 9 4 4 3 4 7 9 10 8 6 9 8 6 8 1 4 1 4 6 1 5 4 5 1 8 10 5 10 3 3 6 7 3 3 7 7 7 4 5 10 3 10 2 6 2 3 6 10 1 1 4 7 8 7 2 7 5 7 3 9 5 10 3 0 3 1 10 0 1 0 9 7 9 4 3 7 7 5 4 4 1 9 7 10 8 1 0 1 0 7 0 1 10 1 9 1 6 1 2 10 3 10 7 9 9 8 8 2 9 10 1 2 7 0 3 3 9 0 0 3 9 6 1 10 5 6 9 6 8 3 6 2 6 10 4 9 7 10 4 [...]
+6 7 7 6 6 2 9 4 10 0 6 7 9 10 2 8 3 5 4 3 10 3 6 9 8 8 8 2 8 8 8 5 10 7 2 7 0 6 2 3 9 7 2 1 2 5 8 5 3 8 9 0 7 4 0 0 2 3 6 6 6 9 0 10 8 2 2 9 0 8 6 10 8 0 2 6 9 4 8 0 8 10 3 0 4 3 7 4 1 3 7 3 0 1 6 2 8 5 10 1 9 0 3 10 10 5 2 3 3 8 8 0 1 2 3 1 5 6 9 3 3 5 0 10 6 10 4 6 1 6 7 0 4 0 3 0 10 2 9 5 2 3 2 9 9 7 4 5 6 4 2 0 4 0 9 6 5 7 4 6 1 2 8 2 5 9 5 6 5 1 5 1 0 10 6 6 4 9 1 7 5 1 4 4 2 7 7 1 8 0 2 10 7 2 2 3 7 10 9 10 5 1 3 5 7 1 3 10 8 4 3 1 6 7 9 0 2 9 1 2 4 9 2 6 10 1 6 4 1 6 10 6 3 4 10 7 [...]
+2 5 6 5 4 9 7 10 5 6 9 7 9 8 2 0 5 10 8 2 5 2 1 1 0 8 9 7 3 4 8 0 4 3 8 2 9 7 2 8 7 6 3 3 9 3 7 1 10 3 6 9 5 0 5 7 8 7 1 10 5 10 6 3 9 7 2 9 10 6 2 0 9 10 6 0 9 8 4 9 9 7 10 6 10 10 1 3 10 7 2 1 7 10 0 7 0 0 4 2 3 8 8 9 6 4 3 3 6 1 9 7 4 3 5 2 6 4 9 7 5 1 1 9 5 8 8 3 0 4 10 4 0 2 0 2 0 8 8 0 6 2 9 9 5 7 5 5 0 7 0 5 0 9 0 2 1 10 5 5 3 6 9 8 0 10 2 2 4 1 8 5 3 10 7 5 0 0 2 4 2 7 2 0 9 0 4 10 1 9 0 5 9 6 9 10 2 10 4 5 4 4 2 7 9 0 10 7 8 0 7 10 3 5 1 4 3 8 6 0 8 2 0 7 3 3 7 1 1 5 2 2 5 9 8 9 [...]
+5 4 2 10 5 9 8 3 2 7 10 9 6 0 2 8 6 7 5 2 7 9 0 1 8 10 8 7 5 7 6 10 4 4 2 3 3 3 7 3 2 5 3 5 7 3 4 0 2 10 5 4 8 3 5 7 4 8 2 5 1 9 9 5 10 9 9 2 10 2 1 6 6 4 6 4 2 4 3 4 0 8 4 5 0 1 4 7 5 7 0 3 8 8 5 6 5 9 6 1 4 7 5 9 6 7 6 1 3 7 9 0 7 2 2 2 8 8 7 9 3 8 5 8 5 5 4 10 1 3 8 6 4 7 5 6 1 1 3 8 10 8 4 3 3 2 0 3 10 8 5 6 9 3 9 7 5 6 6 8 10 4 0 10 1 9 3 6 9 10 8 4 9 5 0 5 2 7 1 2 9 5 9 9 8 1 7 5 4 9 8 3 10 9 3 6 5 6 5 1 2 8 0 5 1 8 1 0 0 3 8 5 3 7 3 1 9 8 0 8 7 4 1 10 0 0 4 7 0 2 10 7 0 9 7 5 9 8  [...]
+10 10 3 8 1 7 7 8 5 9 10 9 10 1 10 10 0 5 7 3 1 3 0 2 10 1 1 3 1 0 9 9 8 0 4 8 8 2 5 7 5 4 8 7 6 4 9 8 7 7 1 8 7 1 1 4 6 1 7 6 1 3 5 7 1 8 9 6 10 9 3 1 6 0 6 2 6 3 10 4 5 7 10 7 7 1 3 2 0 5 3 2 7 0 1 3 2 3 4 9 4 10 8 6 2 0 0 10 7 5 0 1 6 1 1 7 7 0 0 10 6 7 0 2 6 9 0 0 10 4 6 3 4 5 6 0 0 9 0 10 6 10 4 3 0 4 6 10 10 2 8 0 5 4 4 9 8 3 0 0 7 0 6 4 5 4 6 3 7 3 3 1 5 7 7 2 2 7 4 2 10 6 0 2 5 6 5 7 1 10 7 6 10 5 3 2 5 4 2 2 0 6 3 2 10 8 4 5 9 1 6 4 2 9 10 8 6 7 6 8 2 7 2 10 1 7 4 2 10 0 9 10 10 [...]
+0 3 10 8 2 3 3 0 4 3 2 10 3 8 10 6 4 9 5 5 1 9 1 10 1 1 4 3 9 10 10 1 2 9 2 4 7 7 4 0 4 6 9 3 7 7 7 3 6 9 9 3 7 9 5 4 4 0 0 9 6 3 2 8 6 1 0 3 8 3 9 9 5 10 2 3 5 0 4 8 4 0 9 9 2 9 5 2 7 6 2 10 1 7 0 0 3 8 6 6 4 8 3 6 1 3 10 5 8 6 8 6 9 4 1 9 3 1 9 5 8 5 3 1 0 4 3 5 9 0 10 0 6 5 7 2 10 8 4 4 6 5 2 0 5 8 1 2 7 4 4 2 1 6 0 2 2 1 9 10 1 7 3 4 2 9 5 10 7 9 1 1 6 7 3 2 3 7 8 6 4 3 1 9 4 9 7 4 10 9 6 9 10 6 4 10 10 0 8 9 0 4 9 3 3 8 9 3 8 9 3 2 2 6 1 5 3 6 3 6 1 6 9 8 0 10 5 6 7 7 2 0 7 4 7 6 9  [...]
+8 1 0 0 0 10 5 5 2 9 0 7 3 10 9 2 5 10 3 7 7 0 10 4 7 2 1 2 3 9 2 5 10 1 1 7 3 3 1 9 6 0 0 2 9 1 0 0 10 1 10 8 4 2 8 3 5 3 1 2 10 8 0 1 10 4 4 3 5 2 3 2 1 9 8 6 2 4 3 1 7 0 2 2 4 1 9 6 7 10 4 0 10 7 1 8 8 1 9 0 7 0 7 10 0 1 4 8 5 1 10 6 4 1 6 2 1 6 2 2 10 7 2 6 6 7 6 6 10 7 4 2 1 6 4 3 6 10 4 10 10 9 8 8 2 0 7 4 6 10 6 4 3 3 2 4 8 4 4 4 5 9 4 10 7 2 0 0 10 6 4 4 7 10 7 0 1 0 6 8 10 3 10 2 4 0 3 1 4 5 4 1 2 8 2 10 8 0 6 8 9 8 10 0 4 7 7 8 10 0 3 10 9 6 6 6 6 9 8 10 2 1 8 0 2 5 2 0 3 7 9 4 [...]
+7 5 4 5 6 7 0 8 4 7 2 6 4 6 7 6 4 2 6 0 2 7 4 9 1 7 9 3 5 1 10 3 1 0 5 6 10 9 2 4 9 0 2 1 4 1 3 4 1 0 6 2 0 8 7 2 10 1 7 3 4 10 9 5 5 6 5 7 6 8 10 2 1 3 3 4 2 3 5 3 7 5 2 8 10 6 8 4 5 7 9 9 9 9 3 7 10 10 5 9 5 10 4 7 6 6 4 1 8 2 6 3 5 2 4 4 1 7 9 5 5 7 0 1 7 0 5 3 7 5 9 3 0 1 6 0 4 2 6 7 0 9 3 3 3 10 3 7 0 7 4 4 0 4 7 8 4 6 10 3 6 1 3 10 9 8 6 9 6 9 0 7 6 0 6 10 10 6 8 9 0 3 0 2 8 7 3 6 7 4 0 0 5 2 8 9 4 5 2 10 2 6 0 8 8 7 7 6 4 1 10 3 1 9 8 6 3 2 7 10 4 7 9 6 7 8 7 4 9 2 8 5 10 10 4 10  [...]
+10 8 3 4 2 2 0 4 1 10 0 2 0 6 2 10 8 10 1 10 0 6 4 5 10 3 1 10 5 6 0 10 0 5 1 8 0 9 5 8 9 3 0 7 10 9 0 2 0 6 5 10 0 6 7 5 0 9 8 6 1 2 3 5 10 0 0 9 0 4 1 0 5 0 2 0 5 2 8 3 9 1 3 6 3 10 2 2 1 6 8 4 10 0 1 0 4 8 6 10 0 7 6 9 8 0 6 9 3 5 0 10 1 8 1 2 3 0 3 2 7 7 9 0 9 3 1 10 5 6 0 10 4 9 3 8 0 9 9 10 8 5 2 2 8 3 5 3 3 8 8 5 8 5 10 2 5 2 7 5 2 6 9 2 8 8 4 1 8 6 6 9 7 4 8 1 6 8 1 6 2 7 10 4 10 8 9 3 1 7 4 0 6 0 8 8 5 2 5 8 6 6 5 6 6 0 8 4 0 5 2 6 5 1 9 9 8 7 4 4 6 0 1 6 9 6 5 0 5 3 7 1 2 10 6  [...]
+3 4 1 2 5 8 2 7 4 7 2 7 0 1 4 10 8 10 4 9 1 2 2 3 6 2 8 1 0 6 8 2 9 6 0 10 10 7 6 8 2 1 10 2 7 2 5 1 1 1 7 0 6 8 5 0 2 10 7 0 5 5 2 10 8 1 3 7 5 4 6 2 5 9 4 0 2 5 0 3 6 4 7 3 5 7 4 8 8 9 1 7 8 8 3 7 8 5 0 8 2 5 4 4 5 9 4 1 10 10 3 8 4 0 6 7 7 1 10 8 7 7 3 8 3 9 9 4 4 2 5 5 7 7 7 0 0 5 6 0 10 9 0 6 8 6 7 6 1 1 8 8 5 2 8 9 8 3 7 7 3 3 9 7 2 5 2 2 10 3 7 0 1 10 6 6 3 3 2 10 10 1 4 7 9 7 8 7 0 8 0 0 5 4 0 1 10 1 2 9 5 0 1 5 9 1 1 3 1 2 10 2 5 6 5 0 4 2 0 9 10 3 10 1 10 7 7 1 7 5 8 8 1 4 1 0  [...]
+1 8 1 2 7 1 6 10 8 9 8 8 1 5 2 0 1 3 8 0 7 3 3 5 10 1 6 4 0 10 2 8 2 7 9 7 6 9 1 6 7 3 10 8 2 6 4 4 7 1 1 4 5 4 1 0 1 2 8 1 7 0 6 0 4 9 8 1 6 9 7 0 10 7 4 2 3 0 0 7 5 1 9 7 0 1 9 5 2 2 6 8 8 2 5 0 3 1 7 5 10 5 3 9 10 3 5 10 2 0 9 10 3 0 1 8 7 0 6 9 9 2 4 3 6 10 4 0 4 9 0 10 0 1 3 8 0 10 10 6 2 2 9 10 0 1 2 3 4 2 2 7 7 10 8 6 6 4 8 6 10 3 0 8 4 4 10 2 7 1 2 9 8 0 0 5 3 8 7 7 6 1 9 3 7 8 4 6 4 6 7 2 9 7 0 3 5 5 3 6 9 9 3 8 8 2 6 2 10 1 5 6 6 6 0 5 6 4 2 10 10 10 5 6 7 8 7 6 4 6 0 9 3 0 2 1 [...]
+9 2 8 3 10 0 4 3 4 4 1 10 4 0 4 6 5 4 1 1 4 4 3 8 8 9 8 7 1 10 4 5 4 8 0 0 2 1 2 10 10 2 4 5 8 1 1 2 3 6 10 5 0 2 1 9 5 9 1 7 7 8 6 5 8 6 1 10 0 5 7 9 4 8 9 8 3 0 5 0 0 9 9 3 4 10 3 3 8 7 8 2 10 9 9 6 7 6 7 6 2 4 9 7 0 5 9 5 3 1 1 5 9 8 1 3 5 6 1 2 10 7 9 4 6 8 7 6 0 9 4 5 2 9 9 3 5 5 1 9 1 6 1 7 3 5 9 7 6 8 6 1 3 0 6 10 5 10 5 1 0 0 3 6 1 6 10 8 3 7 3 5 4 1 9 8 0 1 9 4 2 5 8 6 2 6 6 2 2 10 7 3 6 7 3 1 10 4 4 4 2 3 4 7 1 10 6 10 7 5 4 3 4 9 7 5 6 8 2 0 6 10 10 6 10 6 6 0 4 7 1 1 8 10 9 5 [...]
+4 10 10 0 7 0 2 4 7 9 4 8 1 10 7 0 3 6 10 10 1 7 0 7 5 3 8 7 1 1 10 0 2 6 0 6 10 1 0 1 1 7 8 1 4 1 7 5 7 7 10 9 9 2 2 4 8 7 10 7 0 2 9 6 9 9 10 9 1 1 6 10 10 1 6 10 10 0 4 10 10 2 8 7 1 9 9 8 1 1 3 6 0 10 6 4 1 7 4 9 7 6 6 7 1 6 10 9 0 2 9 5 3 9 5 0 3 0 1 8 7 2 5 8 4 1 10 3 5 10 3 4 7 0 10 9 8 6 2 4 5 0 1 4 7 10 0 5 3 5 7 10 10 1 9 6 6 4 5 8 7 5 7 8 7 8 1 3 5 5 7 7 5 5 6 6 0 1 7 5 6 5 8 7 3 10 8 8 5 0 4 1 1 0 4 7 0 1 0 1 3 0 8 1 10 5 2 10 1 9 0 5 2 8 8 8 0 3 10 10 6 2 6 3 10 0 6 10 6 4 4 [...]
+8 5 1 6 9 7 9 8 7 6 4 6 5 9 4 8 2 6 8 4 5 10 0 3 3 7 9 9 9 5 2 6 3 3 3 1 0 5 6 6 8 6 6 6 8 3 0 10 7 4 4 2 4 1 0 2 10 7 8 10 1 7 3 7 4 1 7 2 7 4 4 2 8 0 9 0 1 0 1 5 3 3 6 6 8 8 6 0 1 7 4 10 10 9 4 1 7 7 4 1 8 4 1 3 9 4 4 1 0 7 6 9 2 5 10 1 8 6 3 10 9 7 8 4 8 8 0 3 7 5 1 6 1 9 9 7 6 1 7 6 9 5 4 10 4 7 1 10 8 6 5 5 1 4 7 5 1 5 9 4 7 8 3 3 7 6 5 4 8 6 7 5 9 2 9 8 4 10 7 2 8 9 3 0 8 9 7 9 6 4 8 6 9 3 6 7 0 4 6 3 8 8 8 5 1 7 8 7 10 10 10 6 0 8 7 10 0 0 0 1 7 5 4 6 8 7 2 0 10 10 0 0 6 2 5 7 3 6 [...]
+5 9 0 8 1 10 10 4 5 1 7 9 10 5 7 4 4 7 1 3 8 8 8 1 0 1 8 7 0 3 2 5 3 3 9 2 10 3 10 3 10 4 0 2 8 0 3 7 0 10 7 3 7 5 5 4 2 10 3 7 6 4 4 8 9 2 3 10 6 5 10 9 5 10 0 0 4 5 7 6 5 9 4 2 8 3 8 1 2 3 5 9 1 1 5 9 9 6 6 4 8 9 5 9 2 9 8 0 7 0 3 9 3 3 1 1 8 8 8 1 5 4 6 0 2 10 3 9 1 6 4 7 1 9 7 3 7 10 1 0 3 9 3 3 9 5 6 8 1 5 2 4 3 2 4 8 1 10 5 0 6 10 5 4 8 4 2 5 8 8 3 2 9 9 4 3 1 9 4 5 4 6 1 7 6 2 9 4 8 7 1 1 4 10 6 4 10 0 10 0 2 7 8 9 9 0 4 3 3 6 1 4 8 1 3 8 10 8 8 0 7 3 9 7 10 5 7 5 7 5 9 0 5 4 8 4  [...]
+4 2 3 0 9 4 8 4 4 7 2 10 7 0 4 4 9 1 8 0 4 3 5 0 2 8 6 10 3 4 9 4 8 7 0 8 3 6 5 1 7 0 10 9 9 3 7 6 5 0 8 1 4 6 5 7 1 4 0 3 9 0 2 4 10 4 9 5 6 6 4 2 5 5 2 7 7 8 0 2 5 1 7 10 2 6 8 3 6 0 10 7 3 4 5 10 1 9 4 10 7 10 9 1 9 8 5 4 1 9 10 7 4 2 9 5 8 8 10 10 5 5 5 10 1 3 10 9 10 5 3 2 5 6 3 1 10 4 1 4 9 8 7 1 2 1 3 8 7 6 7 4 2 5 3 1 4 8 5 1 5 9 3 2 4 3 4 0 0 1 5 7 5 4 7 7 4 3 5 4 7 8 4 4 3 9 10 7 2 7 7 1 1 3 0 2 1 10 0 0 1 1 0 0 9 7 5 8 8 3 7 5 8 7 10 3 6 1 3 9 4 8 6 4 9 0 3 1 6 3 2 2 10 10 4 9 [...]
+4 7 9 7 7 10 6 2 7 4 10 1 8 3 9 4 3 3 9 1 8 10 8 2 4 5 4 9 5 0 9 0 7 10 2 1 10 3 9 2 0 6 10 2 7 7 5 10 1 9 4 6 1 7 3 2 5 7 1 10 3 2 10 3 0 6 2 3 2 0 0 10 9 3 10 5 7 0 0 10 6 1 3 5 9 6 3 2 5 3 8 1 1 7 2 4 5 4 10 10 6 2 4 1 10 5 10 3 1 2 7 4 9 9 7 7 0 2 3 1 10 2 9 8 7 4 5 0 5 1 3 9 4 10 0 8 7 7 10 7 9 0 6 4 8 5 5 1 7 4 0 10 5 3 10 7 0 6 4 8 3 7 9 9 3 6 3 9 1 8 5 1 8 1 9 2 4 9 0 6 0 7 2 9 10 0 0 8 9 6 8 2 0 10 9 10 5 3 4 8 7 7 7 2 9 2 6 8 8 0 0 9 6 0 0 3 3 5 5 6 9 3 6 0 0 1 8 2 1 8 6 6 9 1  [...]
+7 7 9 9 4 10 1 5 8 10 1 3 9 6 0 7 4 4 10 6 9 2 2 4 6 10 0 2 0 8 4 0 1 10 1 5 0 4 0 2 7 10 6 10 10 8 10 6 10 4 4 10 3 4 6 4 9 7 8 8 4 2 2 8 0 9 3 3 9 5 0 9 9 10 10 2 1 10 7 1 8 3 1 1 3 10 1 0 2 8 8 9 4 2 1 0 4 8 5 7 5 5 8 0 5 2 7 2 3 3 4 3 8 2 1 9 10 3 8 10 0 9 4 9 5 3 5 8 6 3 6 9 1 5 7 5 6 6 5 5 8 7 0 3 6 0 0 0 4 8 10 3 2 6 7 4 8 2 4 7 10 1 5 1 0 10 2 9 4 6 9 10 6 7 4 2 10 8 6 6 7 2 10 2 3 6 1 1 8 5 4 5 6 1 3 6 4 4 3 2 7 3 6 6 2 2 5 1 1 1 8 0 7 7 0 3 4 2 0 5 4 7 10 7 2 8 4 5 0 9 4 4 10 3 [...]
+3 8 5 3 8 4 6 6 2 10 7 3 7 1 9 3 9 9 4 6 3 5 0 6 10 2 8 0 4 7 1 6 5 9 3 7 1 6 8 1 7 7 2 1 2 9 1 3 9 1 10 0 8 8 4 4 2 8 7 8 1 10 7 6 3 5 1 7 8 6 5 1 4 3 3 0 1 6 6 2 8 5 7 10 5 9 10 10 1 5 8 6 8 0 6 6 9 8 8 6 5 3 8 9 4 9 3 3 2 7 2 5 10 5 7 2 7 3 2 4 3 9 0 4 10 7 9 9 0 6 1 0 0 5 9 10 3 0 10 5 5 8 6 0 4 10 9 1 7 1 4 1 1 8 10 3 5 4 2 2 3 3 10 8 9 2 7 10 8 0 9 4 4 8 2 0 4 2 8 10 10 3 4 1 0 0 6 8 9 8 0 4 1 6 8 5 0 5 0 3 5 0 3 7 8 9 1 5 10 5 9 7 6 4 2 10 10 10 0 8 9 10 9 8 8 3 10 7 2 2 4 4 8 10  [...]
+7 8 9 7 10 1 3 3 9 0 2 2 1 5 8 10 10 10 5 0 4 5 9 8 1 7 0 0 3 7 6 7 4 0 0 7 1 4 7 9 3 8 5 9 4 9 10 10 0 0 3 2 8 8 10 3 7 3 3 2 7 10 3 10 8 2 9 7 0 2 9 1 3 10 5 8 8 9 9 2 1 10 9 9 2 10 0 6 6 1 7 8 10 1 5 8 6 7 8 1 8 0 1 8 8 2 6 5 9 2 6 1 0 10 7 1 2 10 8 1 8 1 4 3 4 1 6 1 5 9 7 2 6 8 0 8 5 3 4 4 5 8 3 8 9 3 10 5 2 10 7 3 10 3 3 0 5 7 1 6 1 2 3 5 10 1 10 9 4 1 7 9 0 8 0 7 9 2 2 7 9 4 0 0 7 9 8 10 1 8 9 8 4 3 6 2 3 8 1 10 10 4 9 0 6 5 10 4 8 10 9 7 2 6 6 8 0 4 5 7 3 8 4 7 10 7 1 6 5 3 10 2 1 [...]
+6 4 6 10 1 7 5 9 6 1 6 8 10 7 8 4 3 9 5 6 10 2 8 7 10 7 9 4 6 10 3 8 5 5 3 0 9 1 6 10 9 8 4 6 0 10 3 10 1 6 5 0 5 10 8 9 4 5 7 4 5 5 0 7 2 1 1 3 2 1 5 1 4 9 2 7 6 6 6 10 4 8 9 10 0 9 2 10 0 3 9 5 6 4 8 9 4 9 0 6 1 0 7 1 3 9 3 8 2 0 7 1 5 5 5 0 9 3 3 4 2 0 10 3 7 8 7 9 3 9 1 9 0 3 6 1 1 7 6 0 5 6 2 1 6 10 7 10 10 0 5 0 1 4 9 4 3 6 2 4 4 7 5 3 1 0 8 10 2 2 6 1 2 7 2 7 2 10 3 6 7 4 2 0 10 4 6 3 6 3 4 3 10 8 5 8 3 8 10 7 10 10 10 0 5 7 8 9 4 10 9 4 3 10 10 0 1 5 10 8 3 9 0 1 1 8 7 6 7 3 4 2  [...]
+6 9 9 10 6 7 8 7 0 5 9 10 10 4 5 3 1 8 8 8 9 6 0 1 8 9 2 0 7 10 2 7 3 0 9 8 8 10 0 6 9 5 1 0 1 10 10 9 0 8 1 10 9 4 3 2 0 10 9 10 5 3 4 8 4 8 6 7 9 8 4 1 9 6 10 1 2 5 9 4 2 9 1 6 5 9 2 6 4 6 4 4 6 7 6 1 5 10 5 3 8 5 2 7 8 8 8 7 0 5 3 3 10 4 9 3 7 6 0 6 1 10 10 10 2 9 7 1 6 4 6 7 6 1 2 5 3 8 10 1 7 4 2 10 7 1 0 6 7 4 2 5 0 7 3 10 7 2 6 2 10 4 5 4 7 0 0 10 9 6 6 8 1 3 8 2 9 6 0 7 9 0 4 6 0 1 6 1 1 7 4 6 3 2 8 9 0 0 1 4 10 6 2 8 10 4 7 1 6 9 3 8 5 6 1 0 6 8 4 6 1 2 6 0 1 5 7 5 10 8 6 6 7 1  [...]
+7 4 9 1 0 3 1 3 7 3 6 3 2 3 8 8 5 6 5 1 7 2 10 5 4 8 8 1 0 5 6 5 4 10 1 8 10 8 9 9 5 10 4 4 1 3 3 4 6 10 2 2 0 7 1 1 1 2 3 0 2 4 1 5 10 3 7 5 4 5 2 10 7 8 10 8 3 1 6 9 1 2 0 7 7 5 4 10 9 5 10 7 10 9 7 10 1 10 2 8 0 1 3 0 9 3 5 1 9 6 9 5 9 5 9 9 4 9 0 9 8 5 2 0 8 9 10 0 5 0 9 9 9 10 10 2 8 5 1 1 10 10 3 2 2 6 10 10 5 3 6 8 8 6 2 8 7 0 1 9 10 8 4 7 9 5 8 2 5 2 0 9 2 0 4 3 10 9 4 3 2 9 4 1 6 0 3 5 0 0 3 1 5 4 1 4 2 9 9 5 2 7 6 3 4 5 9 3 4 10 6 4 1 4 1 7 7 0 3 6 3 0 7 0 2 9 10 0 6 8 0 7 2 2  [...]
+7 5 2 6 4 9 2 10 10 5 5 0 1 7 7 4 9 5 1 8 10 2 3 5 9 0 10 8 4 7 6 2 10 8 6 6 3 9 1 6 8 7 8 4 5 9 7 10 8 8 0 10 9 0 1 5 2 10 3 7 9 4 2 0 4 5 7 6 6 9 8 4 8 4 10 5 8 6 3 6 9 7 10 2 2 7 6 3 4 9 4 1 7 4 6 4 1 10 1 2 6 0 7 4 0 10 9 8 0 8 4 7 6 6 10 6 5 10 5 7 3 9 7 2 2 3 10 5 1 0 9 3 1 1 0 10 1 3 1 0 7 0 7 0 9 0 9 2 10 6 9 3 1 2 9 4 4 7 2 6 7 9 8 7 0 1 2 5 0 9 6 6 1 1 8 2 9 0 5 6 2 10 5 9 4 8 9 3 5 1 0 4 1 8 0 5 0 3 5 6 5 5 10 7 3 6 7 2 2 0 4 4 6 2 5 4 0 2 4 5 2 5 1 9 0 4 5 9 3 10 0 10 3 7 1 8 [...]
+6 2 4 4 6 8 6 5 8 3 1 4 7 6 1 2 1 4 7 4 3 7 7 6 1 10 10 2 8 5 8 5 7 4 2 6 1 0 0 7 0 2 10 6 3 8 1 1 9 5 4 1 7 4 2 6 2 5 9 7 1 9 3 5 9 1 6 6 0 5 4 3 1 2 5 3 4 5 4 4 8 0 1 5 6 6 7 6 10 9 10 7 4 6 8 0 3 0 2 2 2 5 4 4 4 4 4 3 6 7 10 6 3 1 2 3 1 7 9 4 9 10 2 7 9 0 9 4 1 4 1 5 4 8 0 0 2 10 8 10 0 7 10 0 6 2 9 5 4 8 3 3 7 7 8 0 5 1 3 10 1 4 3 9 5 3 9 3 5 3 1 9 10 6 3 10 5 3 2 3 5 3 5 3 8 9 6 6 8 4 4 2 9 10 5 5 2 9 1 10 2 8 2 6 2 4 5 8 3 10 5 2 3 4 1 3 3 2 6 3 10 6 10 10 9 0 5 7 6 4 8 4 6 2 5 3 6 [...]
+6 8 7 7 10 10 0 10 1 5 8 1 9 10 2 0 5 5 3 3 1 0 8 3 2 8 0 6 6 10 7 7 5 7 4 8 1 7 1 4 1 7 2 1 5 1 10 9 5 6 7 1 9 4 8 2 5 3 7 2 6 2 8 2 6 8 4 7 6 0 2 3 7 9 2 9 4 9 0 5 7 9 9 9 7 10 4 7 10 4 4 2 1 6 8 5 0 3 5 10 3 7 7 5 9 3 0 9 5 8 10 5 4 6 5 0 10 9 8 0 6 10 7 10 3 8 7 6 4 6 10 6 5 8 8 1 10 8 4 2 0 1 10 6 2 9 9 9 5 8 7 7 2 2 0 8 1 7 2 6 5 8 9 2 9 7 9 6 8 6 1 7 8 9 2 5 0 2 7 4 0 4 9 6 10 8 4 2 8 2 9 5 6 10 5 6 2 9 9 10 4 8 3 4 1 8 3 0 3 4 9 2 8 4 7 10 6 1 5 7 3 3 0 0 0 3 10 7 7 2 3 3 8 10 9  [...]
+7 10 5 10 9 0 9 9 1 8 3 8 10 3 2 1 9 3 3 5 0 8 6 7 0 5 5 8 6 4 5 8 0 2 6 5 8 6 7 3 1 5 2 3 1 9 10 2 8 8 5 7 4 8 6 3 4 6 5 9 9 0 9 8 10 0 6 6 10 7 3 6 0 5 4 4 0 9 5 1 9 7 0 4 4 10 4 7 6 9 3 6 9 3 1 8 2 0 6 7 8 10 6 10 9 0 8 7 2 10 3 1 4 6 6 6 5 0 10 2 2 5 0 4 0 10 8 2 7 2 0 1 0 8 2 7 10 0 6 1 10 3 8 3 9 8 0 10 3 3 0 2 3 8 1 10 10 2 9 8 5 0 2 2 3 10 9 2 9 6 7 6 0 4 0 7 10 3 0 0 6 4 2 0 9 10 5 6 5 4 8 0 5 5 8 6 10 7 8 3 3 5 10 9 4 5 8 2 1 6 0 10 0 8 8 9 4 2 1 9 9 10 2 6 7 4 0 5 4 9 0 2 3 9  [...]
+0 10 4 10 2 5 4 9 1 5 0 6 10 4 4 8 7 8 8 2 9 8 10 1 9 5 6 10 1 6 7 2 2 2 8 6 7 2 6 9 4 8 9 5 7 5 4 3 8 6 7 5 9 3 6 8 10 1 3 1 10 1 9 0 5 7 2 2 4 0 2 8 1 10 0 0 3 5 5 9 10 5 0 3 1 8 5 9 2 8 7 3 10 10 8 5 6 2 8 6 7 7 2 7 3 8 7 2 6 10 9 6 8 9 5 1 7 1 9 9 7 5 4 4 4 10 9 4 5 7 9 4 3 7 4 4 5 8 0 0 4 2 3 3 10 2 10 5 6 10 1 4 9 6 2 2 2 0 8 9 4 1 10 7 8 6 5 7 10 3 6 0 1 2 1 9 1 8 5 5 5 6 4 8 0 1 7 9 2 2 0 0 8 6 9 8 8 7 9 8 9 10 6 9 8 9 0 3 3 6 6 10 2 8 6 3 5 5 3 10 2 1 7 6 9 2 9 6 5 5 4 3 0 1 6 2 [...]
+0 7 4 3 6 10 7 7 3 4 6 0 4 3 10 3 2 4 7 5 4 9 4 6 7 10 5 9 6 9 3 10 8 10 4 10 4 3 3 10 7 3 1 4 9 9 3 6 3 0 4 9 6 2 4 10 6 10 8 0 10 7 0 9 8 2 6 1 4 0 4 6 3 8 6 5 4 3 8 9 5 8 0 0 9 3 3 8 9 0 8 1 8 6 3 9 7 10 6 3 3 6 0 1 9 10 3 2 8 8 9 5 3 1 6 10 5 9 8 10 7 5 4 0 3 7 2 0 0 9 1 4 0 4 4 5 3 1 0 0 3 10 3 4 2 5 3 2 0 1 5 5 5 0 5 10 6 3 0 0 10 4 0 5 5 9 6 8 5 2 9 8 7 8 0 6 2 0 9 3 6 8 0 5 5 5 7 3 2 4 4 5 4 9 6 8 1 8 5 8 9 5 0 7 4 1 2 1 6 5 1 6 9 3 10 4 8 0 8 5 10 9 5 10 1 7 9 3 0 9 9 5 6 3 0 1  [...]
+1 8 6 7 5 5 4 5 10 7 0 3 5 9 7 9 4 9 3 5 8 5 2 0 7 5 5 7 1 10 7 6 1 7 0 2 8 5 6 7 6 8 7 2 7 8 1 1 9 8 10 8 4 10 1 8 7 0 1 1 7 4 10 8 10 9 3 2 10 3 0 1 6 6 9 10 0 8 5 1 3 8 4 3 9 9 2 1 3 0 9 4 3 5 9 9 6 6 1 0 4 9 1 2 4 3 8 3 7 7 8 6 5 5 5 9 0 2 2 10 8 6 1 4 3 7 2 4 6 8 8 7 4 8 9 8 0 10 7 3 1 10 2 8 8 8 2 6 4 3 7 0 4 9 2 5 8 8 0 1 9 10 10 1 10 1 2 10 6 4 5 10 9 3 1 5 2 9 5 7 6 0 2 7 9 3 0 3 7 3 5 3 6 7 9 8 7 0 3 6 1 4 2 0 0 2 4 4 0 5 10 0 6 9 5 3 2 1 10 6 8 2 9 8 0 5 3 4 3 3 7 7 10 0 4 1 0 [...]
+0 10 4 4 9 4 7 4 10 8 7 1 8 1 2 7 7 4 2 8 2 7 9 7 1 8 4 4 1 6 9 10 4 5 9 5 5 9 4 5 0 6 1 7 9 9 4 3 1 9 4 9 9 0 7 9 7 3 3 1 2 2 6 4 0 6 3 5 9 0 2 10 0 4 5 9 0 7 10 10 5 2 0 2 7 0 4 4 1 7 5 1 4 9 1 1 7 9 3 4 4 0 2 4 9 5 6 0 4 5 6 9 9 6 7 0 4 7 8 10 8 6 2 3 5 6 4 5 10 4 10 3 7 8 7 3 9 9 10 5 8 0 10 2 9 4 7 2 3 4 3 0 9 3 0 8 0 6 6 9 4 9 9 9 9 2 7 4 8 8 1 8 9 7 7 2 1 8 8 3 10 8 3 4 10 2 7 9 3 5 4 2 2 2 0 3 4 0 6 9 3 1 1 3 2 8 8 10 6 7 4 8 1 7 0 7 3 7 6 10 0 5 9 7 8 4 1 0 0 9 10 3 0 9 0 6 7 0  [...]
+4 3 8 5 1 7 10 4 7 10 8 9 1 5 0 7 9 3 10 7 10 3 0 3 1 1 10 0 1 1 10 8 1 0 9 2 1 2 10 1 2 1 9 7 5 9 6 4 5 5 5 1 3 6 4 10 2 10 3 0 5 4 9 4 1 7 5 10 3 4 5 1 10 5 3 10 0 8 2 5 10 5 3 1 6 10 7 9 3 9 3 0 7 7 9 1 1 10 10 8 10 1 10 0 9 4 7 8 1 6 7 3 7 2 8 2 5 6 3 2 3 1 3 0 3 5 6 1 4 6 10 0 7 9 4 8 4 3 2 3 7 3 6 5 8 1 7 1 1 5 5 10 10 10 6 10 3 7 9 1 5 4 10 5 6 1 5 3 5 4 0 6 7 3 8 2 9 6 9 7 6 0 5 8 7 2 0 1 0 1 0 3 10 2 4 3 8 4 6 0 6 3 7 8 4 5 3 3 3 2 5 4 6 9 6 2 9 1 0 0 8 6 8 3 9 2 6 6 9 6 7 1 3 4 [...]
+6 8 4 5 7 2 7 6 2 2 10 1 4 1 10 3 0 4 1 8 7 8 8 4 9 5 7 2 2 6 10 3 2 3 4 6 10 9 9 6 6 6 7 8 9 10 3 1 9 2 9 7 5 9 4 1 2 3 7 0 9 1 9 0 3 7 2 9 3 8 1 7 1 3 8 5 7 1 3 10 4 7 1 7 5 1 3 8 9 3 2 5 7 1 3 10 3 3 4 5 9 7 2 4 9 3 3 7 10 10 7 0 3 10 6 8 8 9 2 3 9 9 3 4 7 8 3 8 5 8 5 9 0 9 2 7 8 8 6 4 5 4 3 0 3 10 4 3 1 3 9 1 9 3 8 6 3 7 8 10 2 4 7 5 10 0 8 7 4 4 2 2 6 8 10 7 7 5 5 2 1 0 4 5 1 7 6 2 4 6 3 4 10 5 0 8 7 1 1 7 4 7 1 2 4 1 2 5 5 0 10 6 4 5 8 5 0 2 2 2 4 6 2 5 7 9 0 9 3 1 1 4 0 0 9 10 8 5 [...]
+1 2 5 2 5 5 7 5 8 3 10 6 0 6 10 6 1 0 9 10 8 4 8 8 10 9 6 8 2 6 1 2 4 9 4 1 4 7 3 8 7 10 3 3 10 9 7 3 5 8 5 2 3 0 1 0 0 9 8 8 0 2 5 1 2 1 4 4 7 2 5 6 1 4 4 8 9 4 9 6 3 5 0 4 2 3 1 0 5 3 10 7 3 1 6 1 9 4 9 9 9 3 0 5 9 5 6 7 3 6 10 5 0 8 9 1 10 7 0 6 1 5 5 6 5 6 3 2 9 8 9 8 1 7 9 8 6 0 5 1 6 7 1 5 2 5 2 6 7 6 7 3 8 6 2 1 2 5 1 9 0 4 6 10 10 9 9 3 1 0 9 9 5 8 10 5 1 1 9 10 2 0 0 9 6 5 7 5 1 9 1 0 4 6 5 7 1 8 7 3 0 9 10 7 2 8 0 2 1 4 1 10 8 3 0 7 7 0 2 10 3 5 4 6 2 6 5 0 3 3 4 2 10 3 0 8 8 2 [...]
+0 10 10 10 3 4 1 0 9 5 7 8 4 8 3 2 3 9 0 0 10 0 4 5 2 1 10 0 7 1 7 6 8 4 10 0 5 0 0 3 7 2 9 8 8 3 0 1 7 0 5 2 10 7 2 4 5 6 9 10 2 6 7 3 4 4 6 9 0 0 10 4 0 3 6 6 1 3 1 3 1 9 4 8 9 5 9 0 4 0 3 2 4 3 6 9 2 2 7 7 3 1 6 6 5 5 6 9 6 1 5 10 0 9 0 10 2 3 3 8 2 4 3 9 5 2 5 2 7 7 3 3 5 2 8 7 1 7 2 1 0 1 7 7 6 8 2 3 4 2 2 8 6 6 2 9 1 4 9 6 5 1 2 5 0 9 6 0 6 9 0 6 10 10 9 5 1 5 7 1 8 6 9 1 2 10 0 6 0 8 3 5 9 1 8 8 10 2 9 10 9 8 7 8 5 0 7 8 1 0 9 4 1 3 7 5 5 4 7 3 4 6 6 8 4 9 2 6 4 5 4 4 1 6 2 4 6 10 [...]
+1 8 3 4 3 9 2 6 8 7 4 2 6 0 4 9 4 7 7 10 9 3 9 0 7 9 9 8 6 1 5 3 4 1 0 4 0 0 9 9 0 0 1 8 2 10 8 6 5 7 9 5 9 2 6 5 3 1 1 8 6 7 2 4 2 7 1 5 1 10 8 10 3 9 9 10 0 8 5 3 3 10 6 5 0 1 8 4 5 10 0 1 3 8 8 1 2 2 8 7 3 5 5 2 7 7 8 0 5 9 5 7 5 1 4 2 8 9 10 0 2 7 5 3 7 3 5 9 7 4 1 1 10 8 6 4 6 0 5 9 9 5 9 1 8 10 4 4 3 2 1 0 9 10 5 6 3 5 3 7 1 5 9 7 8 8 9 0 8 4 9 5 1 7 5 6 6 7 5 10 9 7 6 7 7 5 8 10 5 3 6 9 10 3 1 6 6 2 6 7 2 5 7 0 5 0 2 5 2 1 9 8 9 2 8 5 5 3 10 2 3 7 9 10 8 1 3 0 1 10 8 3 1 5 3 8 8 3 [...]
+2 9 2 7 8 3 7 2 4 10 0 7 5 0 0 2 7 8 8 5 1 4 6 9 8 10 2 6 6 8 1 8 8 9 1 1 10 7 4 5 7 2 9 2 3 1 10 5 4 0 1 0 8 4 1 5 3 8 10 9 6 5 10 1 2 1 0 7 1 10 8 3 4 2 2 3 4 4 8 1 2 6 0 6 4 8 3 1 0 3 2 9 7 7 5 2 0 10 8 1 6 10 5 4 7 9 0 3 1 10 3 5 1 6 6 3 3 5 5 7 0 0 9 0 10 0 1 6 3 0 10 6 10 7 5 3 4 0 5 7 5 3 4 8 3 5 5 3 3 9 3 9 10 8 1 10 8 8 5 10 4 3 7 4 0 3 7 5 1 3 6 7 0 8 5 7 10 7 9 6 3 0 6 6 10 7 7 6 0 0 5 2 4 7 9 5 9 3 2 4 5 4 8 4 9 2 8 6 5 2 10 10 0 5 8 7 0 6 0 8 6 5 7 2 5 0 4 0 9 2 9 8 1 3 2 10 [...]
+8 6 7 2 2 8 3 2 3 1 8 4 6 4 4 10 2 5 0 7 3 2 7 10 5 7 9 4 2 1 0 8 5 3 1 8 3 1 4 3 7 6 10 2 1 0 4 9 0 0 3 6 8 0 7 8 1 4 8 6 6 2 5 10 7 7 0 1 2 6 5 7 5 8 0 0 4 5 9 10 9 0 8 4 1 0 9 0 5 10 4 1 3 2 9 6 0 4 4 4 5 3 5 4 1 1 1 4 5 1 4 10 1 5 0 0 2 1 5 8 2 7 2 3 1 6 8 10 9 9 9 6 8 6 8 4 6 9 6 9 10 9 10 7 2 4 10 4 10 3 6 3 6 7 5 6 4 10 2 6 3 6 3 2 8 3 10 10 0 1 8 4 6 10 0 5 3 10 2 6 9 1 10 3 10 3 10 9 0 8 6 4 0 5 0 5 10 10 1 4 1 3 9 5 9 2 6 10 6 10 10 4 2 0 6 1 10 7 2 6 1 1 9 3 1 6 7 6 8 8 5 1 5  [...]
+8 6 3 6 1 10 1 6 3 9 10 1 9 0 4 10 5 6 2 3 5 7 9 2 7 2 10 9 6 3 4 6 4 7 3 10 7 7 2 9 0 4 0 0 4 5 9 10 2 0 1 2 1 0 9 4 5 7 4 7 5 2 9 1 5 5 0 7 10 3 5 1 0 6 4 9 8 0 2 5 10 8 7 10 4 6 2 0 6 5 4 7 5 4 10 0 6 7 0 3 1 0 10 9 9 10 5 4 8 0 6 2 5 8 10 6 8 7 2 10 8 7 5 9 7 4 3 8 5 4 3 0 6 5 9 5 6 3 7 8 8 0 1 1 0 2 8 7 2 6 1 3 1 8 0 1 0 0 8 6 8 4 5 5 3 0 7 2 6 9 2 8 5 2 8 5 4 0 10 1 0 6 4 8 6 8 5 8 1 6 9 5 7 9 2 7 0 3 9 0 7 0 5 6 5 0 9 6 5 2 6 2 4 0 3 6 4 10 2 10 2 0 0 10 10 4 1 9 9 4 3 3 2 9 3 5 8 [...]
+3 0 5 7 3 4 1 2 5 9 2 9 6 9 7 9 7 7 7 5 4 2 8 2 10 10 3 0 8 7 5 7 5 3 6 7 2 9 0 8 9 0 1 4 0 10 4 10 8 2 2 9 1 9 4 5 0 4 5 3 7 9 8 5 3 6 1 3 10 8 0 9 6 8 6 2 0 10 2 6 6 0 1 1 6 1 1 7 1 0 10 2 1 10 10 10 0 3 1 1 4 5 4 0 9 4 2 9 6 2 2 4 10 0 8 4 3 6 0 0 0 8 4 6 10 4 8 9 10 10 3 2 0 4 2 8 9 8 8 8 1 7 8 1 0 1 8 3 6 7 3 6 8 9 9 8 6 8 0 9 7 2 8 0 8 9 1 9 4 10 3 6 3 2 7 0 10 6 4 9 1 3 2 7 1 5 10 2 4 10 1 6 0 8 1 1 6 9 3 4 4 8 6 10 2 9 3 2 8 0 1 4 4 4 0 8 7 5 4 3 4 0 9 10 1 0 8 7 2 1 0 7 7 6 10 3 [...]
+1 8 5 4 1 2 0 0 9 10 1 10 2 0 3 2 8 3 4 0 8 3 4 8 1 0 8 3 6 3 2 7 7 5 6 0 2 10 10 4 5 5 6 4 7 4 3 3 2 5 0 2 0 8 2 3 5 4 2 10 0 4 3 4 2 2 1 0 0 1 0 7 8 5 5 6 3 6 3 10 1 8 10 8 10 9 0 7 5 6 8 8 7 5 8 3 4 3 10 3 5 5 7 3 2 4 9 4 10 3 6 2 2 7 5 3 9 2 10 1 3 5 4 9 9 9 2 2 6 6 8 6 0 9 2 3 2 0 5 10 9 4 2 6 8 1 10 2 9 6 7 9 9 0 9 7 2 1 9 7 5 1 9 2 8 4 5 6 8 7 2 1 3 10 6 4 1 2 7 6 2 1 8 7 8 5 0 4 7 7 10 9 8 1 1 4 5 0 6 10 8 5 5 5 8 7 10 2 7 4 7 10 5 9 4 3 9 1 7 1 3 0 4 4 8 7 0 10 4 4 10 4 4 8 6 4  [...]
+3 10 0 9 9 8 2 4 2 2 0 7 5 2 5 9 3 0 0 10 1 5 10 3 10 8 6 1 6 7 7 5 10 6 1 4 8 3 2 4 0 8 3 8 8 0 8 4 2 3 4 8 8 3 4 2 3 2 5 3 10 2 2 2 4 2 7 10 10 7 10 5 10 4 7 1 6 10 8 2 1 8 1 4 6 8 9 9 2 0 4 0 3 10 2 0 5 7 1 4 5 1 1 9 0 8 5 10 7 2 7 4 4 8 0 9 3 2 7 10 0 4 6 4 8 6 5 7 3 3 2 6 3 2 5 5 6 3 3 4 9 1 4 3 1 0 8 4 8 5 10 10 4 10 0 3 10 3 8 9 9 10 9 5 2 2 8 1 6 6 9 8 5 2 5 9 3 2 1 8 0 1 7 2 3 7 4 1 5 9 10 6 4 5 6 3 4 8 4 3 2 4 4 4 1 3 9 3 2 9 6 7 4 9 10 4 2 4 1 2 9 8 7 6 5 2 7 9 10 4 4 4 1 1 7  [...]
+8 7 4 4 7 8 6 0 0 10 7 0 7 7 9 10 4 1 1 2 8 1 5 6 2 10 0 10 3 6 0 5 6 9 10 9 0 4 8 9 9 3 1 10 8 4 7 3 7 2 7 3 5 5 4 1 7 4 3 4 6 5 9 7 3 6 7 7 8 9 6 3 6 6 2 2 2 1 3 9 5 6 2 3 6 5 7 10 0 0 8 0 1 10 10 7 7 4 4 2 9 10 8 8 0 6 0 5 10 9 3 0 1 3 3 6 10 4 6 1 4 8 7 10 4 9 2 0 8 5 7 5 2 9 7 6 4 10 3 9 4 6 0 7 9 7 7 4 8 6 7 3 6 2 2 0 0 8 4 3 5 8 7 0 5 7 4 10 8 8 1 8 2 9 2 10 3 7 5 7 5 1 3 0 6 4 3 6 0 1 1 7 3 6 4 8 8 4 2 6 7 10 6 8 5 6 2 6 8 6 10 2 1 5 9 7 3 7 7 8 7 3 2 2 4 8 5 4 1 3 8 10 9 2 8 8 1 [...]
+6 5 4 0 2 1 7 2 0 5 10 5 8 5 3 10 4 6 8 1 1 1 4 7 3 3 3 0 10 8 5 2 3 8 0 10 1 0 7 4 7 0 2 5 0 4 5 5 1 1 7 4 10 3 2 1 2 8 4 1 10 2 7 2 4 0 6 6 3 5 3 8 8 2 5 0 5 1 7 4 10 1 3 2 6 10 1 8 10 9 4 10 6 5 1 9 7 5 2 8 5 3 6 9 8 4 8 2 7 10 5 3 2 5 10 8 1 10 9 0 3 10 2 7 7 3 7 9 7 3 5 9 3 7 9 4 1 0 5 4 3 3 2 7 3 6 2 1 10 8 7 8 6 6 2 6 10 6 4 10 9 10 3 5 7 7 6 4 3 6 3 3 1 6 2 9 3 1 3 5 1 4 5 2 4 1 5 1 7 0 1 10 6 1 0 10 9 3 2 2 2 7 9 1 9 3 3 5 9 3 1 7 1 8 8 3 7 0 6 1 6 0 10 0 9 8 0 2 9 9 10 1 6 6 9  [...]
+3 7 1 0 2 9 2 4 1 7 1 7 8 9 6 10 8 3 8 2 0 10 1 3 6 7 4 3 3 0 5 7 0 10 8 4 4 8 8 2 5 7 0 5 0 5 1 3 1 6 0 1 4 2 9 6 3 1 2 7 6 8 1 6 7 4 1 6 0 6 10 1 9 3 5 1 2 2 9 0 8 8 7 10 8 8 7 4 7 0 1 0 9 6 0 0 7 6 10 1 10 2 1 4 10 8 2 4 10 0 5 0 8 2 8 4 4 10 4 4 6 9 6 2 5 5 7 8 5 5 1 7 9 8 2 7 10 6 10 5 9 10 3 8 1 10 1 10 5 0 7 9 6 6 10 3 8 2 7 0 4 1 9 5 7 8 3 1 4 3 10 0 0 3 9 4 9 5 9 1 1 5 3 2 0 9 1 9 0 1 5 0 6 4 7 4 9 5 2 1 4 6 3 7 4 9 7 10 2 6 7 3 1 0 1 2 4 10 1 2 1 0 1 6 10 1 9 10 7 0 0 2 8 7 2 3 [...]
+3 8 8 5 9 5 3 0 6 0 10 3 0 9 6 1 6 2 0 8 3 2 3 9 5 4 6 7 10 6 8 10 6 1 3 3 0 10 5 7 6 8 4 8 8 7 9 3 9 7 3 9 7 7 5 8 2 4 9 0 0 3 10 3 7 1 10 7 10 7 10 10 9 7 6 3 5 4 6 8 6 9 3 5 10 3 5 5 2 3 1 5 0 2 9 0 10 3 4 7 3 1 10 3 9 6 2 4 6 10 3 3 9 9 8 2 3 6 3 6 3 4 9 10 7 7 5 10 8 9 8 3 6 6 10 9 3 3 9 6 6 8 5 3 6 3 8 9 5 8 2 9 8 7 4 10 4 5 9 4 1 5 4 7 2 6 1 4 7 6 3 10 9 4 4 7 2 4 0 0 0 9 8 4 9 10 3 3 0 1 0 7 1 5 3 0 7 2 8 6 0 1 1 8 5 4 6 5 3 5 9 8 6 7 6 8 0 5 7 1 1 6 8 7 2 10 2 1 7 0 1 0 2 9 1 9  [...]
+7 1 10 6 9 9 1 8 4 10 2 1 0 6 2 1 8 3 10 9 1 0 5 8 4 7 8 3 7 9 9 0 10 6 10 3 10 6 7 3 9 9 8 6 10 8 3 9 3 1 2 5 7 0 2 6 2 3 8 10 1 2 4 5 5 2 10 8 2 8 2 10 5 10 6 6 2 1 4 2 0 3 4 0 7 2 1 9 1 4 2 6 9 8 0 3 10 1 3 7 5 3 9 10 1 5 8 8 10 8 6 1 7 3 3 5 3 1 6 8 3 2 7 0 6 10 9 5 6 2 3 7 7 4 4 6 9 5 6 0 0 9 4 2 2 2 2 2 7 9 2 10 10 2 3 2 4 1 7 2 6 6 5 6 3 1 6 5 8 8 4 0 10 5 6 10 2 2 4 8 10 9 4 0 10 6 7 1 9 4 5 2 8 2 10 4 4 2 0 2 5 8 9 10 4 4 10 6 0 2 2 7 0 4 7 0 5 3 2 6 4 0 1 5 10 7 1 5 8 3 0 8 1 5 [...]
+10 10 7 3 6 1 2 8 7 6 5 8 0 7 8 4 3 9 6 4 5 5 3 3 6 9 6 5 9 10 3 0 9 6 4 3 9 10 0 2 8 8 10 4 3 1 0 3 5 6 0 8 2 4 6 7 8 5 9 1 8 5 5 4 0 7 10 9 3 2 3 5 3 8 4 4 9 9 9 8 6 3 2 6 7 0 9 1 4 9 3 1 2 5 6 6 9 10 9 5 7 3 7 6 9 3 1 6 3 3 6 8 4 9 2 0 9 5 4 7 6 10 1 3 1 7 9 0 1 7 7 6 2 2 2 5 10 6 4 8 4 4 2 5 0 0 1 2 8 10 9 9 0 10 5 2 9 5 7 8 1 0 3 5 5 6 2 4 2 9 5 4 1 4 0 8 0 9 0 6 9 8 8 10 6 7 8 7 9 6 2 10 8 10 4 3 7 1 0 7 7 10 8 8 2 1 4 9 3 9 1 5 4 4 1 7 0 8 9 8 2 9 2 3 7 5 3 0 2 7 8 1 5 2 5 1 9 9 5 [...]
+7 7 4 3 10 4 5 5 0 6 9 1 10 2 6 8 8 7 5 6 3 8 2 9 7 0 9 7 0 9 1 3 7 4 10 3 1 4 5 8 4 0 2 9 4 6 6 8 6 1 3 0 4 0 9 8 9 1 0 0 2 6 0 9 8 5 5 7 8 2 6 2 6 8 6 5 6 3 10 8 3 2 4 0 9 8 2 7 0 2 0 10 8 4 9 4 9 8 7 1 5 4 6 10 5 5 0 9 4 4 2 10 8 6 1 4 7 6 8 3 9 10 8 2 1 10 0 1 8 10 7 8 0 9 6 4 2 1 0 10 4 7 0 0 8 2 8 8 5 10 8 0 3 0 7 0 8 2 2 9 10 1 4 8 5 7 1 8 1 0 3 9 7 2 4 7 6 10 3 2 6 4 6 0 2 8 4 8 10 2 4 1 3 7 7 1 4 2 3 3 2 4 0 10 1 5 0 0 7 6 6 0 3 4 6 3 4 6 7 4 5 0 7 10 3 1 5 2 7 4 1 5 10 7 10 9 7 [...]
+9 1 9 1 9 8 3 1 4 6 1 6 5 6 7 4 3 0 5 6 8 4 4 7 4 2 5 0 5 4 6 4 4 6 0 6 5 6 9 2 7 2 5 5 6 7 1 6 4 10 0 1 10 4 8 1 7 9 5 6 6 2 5 6 1 6 9 9 1 8 0 10 3 5 6 0 3 3 0 8 0 5 4 1 1 0 3 7 0 0 7 0 2 7 0 2 9 1 4 9 2 1 9 4 6 4 5 0 6 2 4 9 2 5 8 10 2 0 7 9 7 6 0 2 7 9 0 8 1 9 5 10 3 1 6 3 4 2 4 10 1 1 3 2 7 9 8 5 5 3 7 10 6 1 3 8 9 1 10 3 5 2 3 9 9 10 8 6 4 6 4 7 7 10 10 5 6 2 7 5 10 6 0 3 6 6 0 5 7 1 6 0 7 2 9 10 4 2 6 1 4 8 0 5 3 3 10 5 9 6 0 3 8 5 0 3 7 0 6 7 1 1 8 9 7 4 3 1 5 1 10 5 8 6 6 2 8 0 2 [...]
+1 6 1 6 3 10 3 4 6 9 4 7 6 9 1 5 8 3 10 10 5 9 2 8 10 8 2 8 1 4 9 5 5 2 1 3 3 9 10 6 10 9 7 6 1 7 4 9 3 6 9 0 4 9 5 3 5 8 2 6 0 8 10 2 6 10 1 10 8 4 0 4 8 0 6 2 9 3 1 0 9 4 5 6 0 9 3 0 5 6 10 6 1 10 7 1 10 10 2 1 10 6 0 7 0 5 6 5 2 6 4 0 0 5 6 4 9 6 0 0 0 3 1 1 4 8 4 2 2 9 10 10 6 2 2 4 7 8 9 5 3 7 7 1 1 2 2 7 7 4 6 10 6 8 9 5 3 6 10 8 3 2 2 5 0 6 6 6 2 8 4 1 4 4 10 9 4 5 7 9 10 1 9 2 6 10 2 7 3 1 4 0 10 0 0 3 7 4 4 9 9 9 0 10 2 6 2 10 5 4 3 0 9 4 2 5 8 2 1 10 7 6 2 4 2 3 4 8 0 1 6 2 2 6 [...]
+10 7 5 4 9 4 4 2 8 5 2 6 3 10 7 2 6 8 1 4 0 2 6 8 6 10 9 6 3 2 2 1 7 0 1 10 1 7 1 2 5 6 7 6 0 5 6 8 6 0 8 1 3 2 2 7 4 8 3 10 4 6 9 0 8 6 4 9 10 5 0 4 10 8 2 3 1 1 5 9 9 0 8 5 10 9 2 4 5 1 6 6 7 6 3 7 2 10 2 6 8 8 9 8 4 4 3 7 8 4 5 4 7 1 8 10 1 3 1 8 2 8 5 10 2 3 8 4 0 6 3 2 8 4 0 6 2 10 3 5 0 3 7 10 10 8 4 10 0 9 4 5 6 5 6 2 1 5 8 7 10 1 4 4 5 1 10 9 7 9 10 9 4 2 0 5 6 2 7 9 2 6 0 5 10 0 1 9 4 10 6 5 7 7 6 9 4 9 10 6 1 0 5 5 9 9 2 10 8 10 5 10 10 4 1 7 9 4 1 0 0 1 4 0 8 4 1 0 2 0 5 0 4 0 [...]
+9 3 5 0 6 2 3 3 3 0 5 9 9 1 1 2 3 2 6 1 9 5 4 1 9 0 7 5 8 7 7 9 7 8 2 1 8 2 8 4 7 5 7 7 9 8 6 1 5 1 10 4 2 10 3 7 9 3 8 6 4 10 2 5 3 9 10 5 9 10 0 6 4 4 6 5 7 3 2 10 8 7 10 4 0 4 0 4 4 0 2 8 2 6 10 8 5 0 4 8 0 4 3 7 5 4 3 5 9 10 0 0 0 7 0 10 7 6 7 9 3 8 9 0 9 10 9 5 8 10 7 1 2 7 8 3 1 9 9 9 0 2 1 8 9 4 1 10 7 6 0 1 5 8 9 5 3 8 4 7 0 8 3 10 2 0 2 10 7 0 4 8 0 9 3 8 10 5 8 5 5 9 8 10 9 6 9 3 5 9 6 10 3 10 8 4 0 2 8 5 4 0 2 9 7 10 7 10 3 3 0 1 8 10 8 4 10 9 10 3 8 10 5 3 10 9 3 1 1 8 5 9 9  [...]
+3 7 3 5 7 9 1 6 3 9 6 9 10 0 0 8 6 4 3 8 0 0 3 1 7 8 3 10 3 7 1 1 2 3 9 9 4 5 5 4 9 0 9 7 6 5 8 1 0 1 10 5 5 7 3 8 0 1 2 3 10 0 5 10 5 1 1 8 1 1 3 8 10 5 1 10 9 6 7 4 0 10 2 2 1 8 7 3 6 2 10 8 8 8 0 2 3 10 8 3 0 2 3 0 0 9 2 9 7 7 10 1 7 7 7 0 0 3 10 8 4 0 0 7 2 8 0 1 1 0 2 2 0 2 1 3 3 2 4 8 9 2 6 6 4 9 7 10 5 6 9 3 3 3 7 10 7 0 9 3 8 5 7 9 0 0 1 8 2 9 10 3 10 3 5 9 8 3 6 9 10 3 6 3 9 10 7 2 4 0 9 6 4 7 4 7 7 1 7 0 3 10 7 5 8 6 4 5 7 10 6 7 7 8 9 3 7 4 4 5 10 7 6 4 0 4 3 10 6 4 10 5 7 1 9 [...]
+4 5 10 2 10 4 4 10 3 0 1 6 4 4 4 6 7 3 9 1 1 10 9 9 0 9 7 7 3 8 7 1 0 6 4 6 7 1 0 4 6 8 5 7 5 4 1 5 10 9 7 1 3 10 6 4 3 7 9 7 5 8 9 5 10 5 7 1 2 1 3 2 6 1 5 2 9 1 9 8 3 10 5 10 5 7 4 2 1 0 6 5 10 5 7 9 1 8 6 6 6 10 6 2 10 5 10 6 4 8 6 10 3 0 2 5 4 3 1 0 3 9 5 0 6 4 0 0 1 4 9 3 8 8 8 6 1 3 10 6 1 6 1 5 5 7 5 4 10 0 2 1 5 9 6 5 0 9 0 9 2 5 4 8 1 0 9 9 10 7 1 5 7 1 3 9 1 9 5 10 2 6 9 6 4 9 10 9 3 0 9 1 2 9 2 5 10 1 7 4 5 3 3 6 10 10 2 10 10 7 1 2 6 10 0 6 2 8 3 0 9 4 5 7 9 7 6 1 1 1 9 3 1 3 [...]
+0 2 9 3 0 9 7 0 0 3 3 2 0 7 3 6 6 1 9 4 7 4 6 8 3 3 2 9 4 6 8 10 1 0 10 8 5 2 1 5 8 5 4 3 8 6 9 5 4 8 1 3 7 10 6 6 6 1 7 4 3 8 3 8 7 5 1 1 2 1 10 5 6 3 6 4 3 5 3 4 4 2 10 10 0 6 9 1 8 1 10 1 0 5 4 6 8 3 6 9 8 7 2 9 4 0 5 2 4 0 3 0 1 4 10 7 1 2 7 2 5 7 4 9 8 3 2 5 2 6 4 6 7 3 10 6 3 3 6 9 7 7 1 6 5 3 6 5 2 7 7 7 2 0 9 3 6 8 7 3 0 7 2 8 5 4 2 2 0 2 7 10 1 7 9 6 9 2 4 0 10 0 9 9 4 5 8 4 3 10 8 0 6 0 6 6 0 4 8 5 9 1 4 9 7 5 3 0 8 7 4 9 2 10 7 7 1 10 10 3 9 8 1 5 3 8 6 1 6 5 10 4 8 1 8 3 2 8  [...]
+2 0 7 10 5 5 3 5 7 7 1 0 5 7 8 4 0 0 2 9 1 2 6 5 5 7 3 3 2 4 1 4 8 2 9 2 8 4 0 8 8 8 2 10 2 8 1 1 6 4 9 2 2 6 5 8 1 8 2 5 10 10 8 3 1 5 1 1 10 4 4 6 5 8 1 5 10 10 6 1 6 4 0 8 0 2 6 9 4 10 2 2 3 7 4 3 3 3 4 5 4 1 1 9 3 4 5 5 8 8 7 9 6 6 5 7 10 6 8 8 4 0 0 0 9 2 9 10 5 10 2 5 1 8 8 10 8 6 10 10 3 0 8 9 2 0 6 10 6 4 1 4 10 9 8 9 6 4 1 4 2 6 9 0 1 0 5 4 10 2 8 9 8 3 1 8 2 0 4 1 4 9 2 8 0 6 8 3 3 6 7 8 0 1 9 10 7 7 6 2 8 2 9 4 10 8 1 5 4 0 3 6 9 4 7 2 3 4 1 10 0 1 9 9 3 5 5 4 5 5 9 2 4 7 7 2  [...]
+5 8 6 2 1 3 1 0 3 7 1 7 5 5 10 6 10 8 7 4 2 7 0 10 10 3 4 5 0 0 9 7 5 1 4 3 7 7 1 0 3 9 4 0 9 4 3 3 3 0 1 5 9 10 8 3 2 6 9 3 1 9 0 5 9 0 2 6 4 7 5 5 6 2 10 10 2 2 7 8 8 8 3 4 0 3 6 3 1 8 10 9 4 8 8 0 10 7 3 1 5 10 9 1 7 2 9 7 10 9 0 10 1 1 5 2 1 9 10 4 4 10 6 6 4 0 3 8 0 3 2 8 3 8 6 5 4 1 4 2 1 9 1 1 9 5 10 3 4 6 9 9 10 5 10 3 6 5 8 9 8 1 3 4 6 5 0 3 1 4 0 0 9 1 8 8 6 8 2 10 9 2 5 6 3 6 0 6 4 9 5 1 5 7 8 8 2 3 5 5 10 6 6 10 10 6 6 0 3 7 1 10 0 2 6 2 9 10 0 6 4 8 4 5 7 0 3 5 6 5 4 7 10 4  [...]
+9 0 0 4 3 0 9 9 3 5 5 5 9 9 0 1 5 7 7 7 2 5 10 6 9 1 6 1 5 0 5 2 7 3 3 1 7 10 0 8 6 4 10 7 3 4 4 0 8 2 6 3 8 0 4 10 3 5 4 4 5 7 5 0 2 4 3 4 5 4 9 2 7 4 10 4 0 9 7 10 2 9 8 6 2 1 5 4 1 7 10 3 2 8 7 7 7 5 1 0 2 7 2 9 1 0 8 2 7 5 10 10 7 10 7 1 3 3 4 9 1 1 10 7 10 2 8 0 9 1 8 7 3 9 7 6 7 7 8 8 5 0 7 10 10 5 0 3 9 8 6 10 3 4 6 0 5 3 7 5 9 5 2 10 3 10 8 10 8 6 0 3 6 8 0 8 1 6 0 2 0 7 6 9 0 9 7 9 5 7 10 8 7 4 3 6 2 8 4 1 0 4 5 8 7 1 9 8 7 7 2 6 1 1 9 6 0 3 5 1 4 3 3 10 4 6 9 5 6 7 2 0 4 9 8 6  [...]
+9 7 10 7 9 4 4 5 1 3 0 6 10 0 1 6 2 0 0 2 7 1 10 10 3 9 8 4 6 9 9 3 8 9 4 4 5 4 9 7 0 6 3 6 3 6 9 1 7 5 10 2 1 6 4 4 0 5 5 6 9 9 0 2 5 5 5 7 1 10 4 5 4 1 4 1 8 5 5 0 3 2 8 0 1 1 0 7 4 9 4 4 3 8 2 3 2 3 10 9 5 3 7 9 4 4 2 9 9 7 0 10 4 1 3 7 6 5 7 8 9 0 0 5 7 6 0 0 8 6 10 8 8 10 9 5 1 2 1 9 8 1 4 9 6 5 4 10 3 8 1 2 3 7 6 0 3 6 9 3 3 5 6 4 2 9 1 2 8 5 10 7 10 0 3 3 8 8 2 3 8 10 9 5 1 10 3 0 4 4 1 1 8 4 7 9 10 0 5 4 10 8 3 2 10 4 9 8 1 1 1 1 4 10 3 9 0 3 6 2 7 4 5 1 5 7 6 6 7 5 1 7 9 5 8 7 9 [...]
+3 9 8 7 0 6 5 0 5 7 6 10 10 8 7 7 4 0 2 6 0 5 4 6 9 3 8 0 0 2 4 2 6 4 6 5 4 1 1 9 6 0 4 6 0 4 3 8 4 2 6 4 2 10 2 8 10 1 7 8 6 7 9 9 10 6 4 3 9 10 0 10 0 7 3 2 1 5 7 6 2 0 10 4 9 2 9 2 9 2 2 7 9 7 7 7 7 9 2 0 5 8 3 0 2 4 1 6 10 4 5 6 5 5 5 7 9 8 10 5 0 0 1 7 8 1 8 8 7 2 2 2 9 0 10 2 5 9 5 2 3 6 7 10 1 1 5 5 8 9 4 1 6 1 8 8 8 5 3 8 5 7 3 8 8 1 10 10 0 10 9 10 4 6 8 4 10 6 8 1 7 0 6 0 10 0 3 0 9 4 0 4 4 6 5 2 1 2 0 0 5 3 9 1 0 6 7 9 10 6 1 3 2 1 0 9 1 3 0 3 8 10 10 3 6 10 1 10 6 2 1 10 5 0  [...]
+4 8 1 8 8 7 2 2 0 0 0 6 4 10 6 2 8 9 0 5 5 9 5 8 4 0 4 2 0 4 6 4 1 8 4 5 3 9 1 7 7 1 8 7 7 0 0 5 1 0 9 4 2 3 8 0 3 7 7 3 7 8 10 4 6 4 8 2 0 9 2 3 5 2 6 9 5 2 6 5 9 4 4 3 6 7 8 7 1 10 7 7 1 0 7 8 9 0 8 1 5 7 7 6 10 6 0 0 8 4 10 3 2 6 4 2 4 10 0 1 6 9 10 4 5 7 7 1 6 10 7 2 3 7 4 3 0 6 2 0 7 6 0 7 5 3 7 1 3 4 6 8 6 10 5 8 8 9 3 7 1 4 1 4 10 7 7 1 1 10 3 5 2 8 6 8 0 10 10 5 10 2 6 6 8 0 10 5 5 5 4 10 6 6 2 9 4 2 7 3 1 2 1 6 10 4 7 10 2 9 7 4 10 2 0 0 5 5 4 2 1 2 5 2 0 7 8 1 2 7 0 5 2 5 6 3 1 [...]
+5 10 0 9 5 2 7 10 0 5 7 4 4 8 3 6 1 8 9 0 8 6 4 7 3 1 6 8 3 9 9 1 0 7 1 10 1 9 6 5 10 10 1 9 8 0 10 10 10 8 9 9 8 0 6 7 4 9 1 6 0 4 5 4 3 7 9 10 2 2 2 7 6 4 10 0 3 6 5 10 5 2 1 9 8 7 4 1 10 1 2 4 9 8 2 1 5 10 3 8 4 5 9 3 10 8 7 3 9 3 4 2 3 3 10 1 7 5 7 9 0 9 4 3 10 2 6 0 7 3 5 7 7 3 8 7 6 9 9 2 9 0 3 9 5 4 4 6 2 0 10 0 3 7 7 1 5 4 8 8 2 7 10 6 0 3 1 7 5 5 5 5 8 6 4 8 0 2 5 3 6 8 7 5 7 10 3 3 1 4 5 3 9 4 1 1 7 9 0 5 5 9 2 6 1 0 1 8 8 7 2 2 6 6 0 2 10 5 9 6 5 9 4 4 9 0 9 1 0 3 3 3 10 3 5 5 [...]
+0 6 6 0 0 3 6 7 7 10 1 10 9 3 2 6 6 2 2 1 5 2 9 7 4 2 10 8 8 1 5 8 1 9 5 9 9 0 9 9 1 10 4 0 0 3 9 8 3 2 1 2 6 4 3 4 3 2 1 7 0 3 5 7 8 8 9 4 2 9 10 5 0 7 3 5 0 5 3 0 2 7 5 10 0 5 3 0 5 9 6 1 5 0 8 4 6 5 8 8 10 0 5 9 1 4 5 6 4 2 6 9 3 9 8 5 3 1 2 7 2 2 5 9 9 7 9 10 8 6 6 1 7 0 3 4 5 4 7 1 8 2 6 0 1 3 9 7 2 4 3 10 10 10 3 6 2 10 0 0 4 7 2 6 3 6 2 2 4 10 4 6 0 3 2 9 6 4 1 1 4 6 7 9 5 4 10 5 5 7 2 1 10 1 1 3 6 3 3 2 8 10 6 2 6 4 2 3 1 1 1 2 4 5 5 3 2 6 7 9 10 10 0 2 6 0 3 5 8 9 10 6 9 7 6 3 0 [...]
+4 5 5 9 5 2 10 0 1 5 4 8 5 4 0 1 6 5 3 0 8 8 8 3 3 6 10 9 2 3 2 8 6 3 5 6 3 9 3 6 7 9 0 7 3 1 6 4 0 8 1 10 3 1 8 5 0 1 7 1 7 3 1 0 10 7 5 4 10 1 8 2 6 7 10 5 1 6 4 0 0 3 6 10 8 4 0 1 2 9 7 3 1 5 5 0 4 6 3 6 2 7 10 8 1 6 8 1 2 0 1 6 9 0 4 0 4 5 3 8 7 5 7 7 9 10 9 0 5 5 1 0 10 7 4 10 7 1 8 7 3 8 1 6 9 0 1 6 5 5 7 3 4 9 0 5 0 3 8 4 4 4 9 9 9 9 1 5 3 1 0 5 10 5 5 1 0 1 6 9 10 3 3 2 2 6 2 8 1 8 6 2 1 9 2 10 7 1 8 3 8 10 0 0 6 4 9 7 7 2 5 3 9 4 7 6 2 10 6 0 9 2 8 0 3 0 10 6 6 5 4 3 8 7 1 9 7 2 [...]
+3 8 4 8 0 5 2 1 8 9 5 10 6 7 6 6 9 1 5 10 7 10 9 6 4 3 8 0 1 3 2 0 8 2 8 1 4 8 7 0 9 0 1 5 3 4 7 7 0 7 2 10 2 2 9 2 8 9 8 7 6 0 8 10 5 0 10 9 6 8 2 4 1 9 7 5 3 2 1 3 1 1 10 0 3 2 0 3 10 0 0 10 5 1 6 2 3 8 0 10 7 10 1 6 8 6 5 9 9 8 10 3 2 1 10 4 8 9 9 2 10 4 1 8 4 8 6 1 4 6 9 1 0 7 2 2 7 9 5 9 8 7 1 3 7 9 2 1 6 0 9 0 3 8 3 1 5 2 7 5 8 8 2 8 10 6 7 6 8 1 3 6 9 2 6 2 8 2 5 7 4 3 5 6 5 10 10 0 10 5 2 4 10 7 9 3 3 0 3 6 1 2 10 5 4 7 8 8 6 1 0 1 0 3 8 3 5 8 10 4 2 9 7 8 10 5 10 9 4 10 4 8 4 1  [...]
+6 8 4 6 5 3 8 8 3 6 1 5 10 9 7 10 1 1 3 2 6 2 0 6 2 8 3 6 6 2 6 3 1 0 10 6 1 1 5 1 6 3 2 10 1 1 5 2 1 9 4 1 8 2 9 1 9 4 0 8 2 3 7 9 8 10 1 5 2 7 7 2 6 1 4 7 1 9 1 9 10 8 0 9 4 1 0 4 8 3 8 2 0 0 4 7 0 10 10 10 4 3 2 7 7 2 8 2 0 4 3 9 0 4 5 5 8 7 8 2 6 1 3 2 5 2 4 2 0 5 4 4 9 8 5 5 0 6 9 0 2 1 4 4 2 9 2 6 5 10 6 7 9 7 0 8 5 10 0 4 2 3 3 1 9 6 0 5 0 4 1 6 6 7 2 3 1 0 5 5 2 1 1 9 1 2 7 3 9 0 0 6 5 6 5 6 4 4 3 9 5 7 6 3 4 0 0 10 9 5 7 3 5 10 8 9 10 6 2 1 4 6 4 0 3 7 6 1 7 7 0 8 9 6 3 0 6 7 5  [...]
+6 5 4 3 6 8 0 5 9 3 3 4 3 5 3 6 1 0 9 1 4 3 4 9 1 10 10 9 6 3 10 9 10 3 1 4 0 0 1 8 10 5 7 4 1 1 8 4 8 0 0 4 3 9 10 4 10 4 4 4 5 10 0 3 0 2 6 5 9 3 10 7 8 2 0 9 3 7 1 6 3 8 0 5 4 3 4 5 1 6 4 3 0 1 10 8 10 5 0 5 3 6 3 2 1 8 4 2 4 4 9 4 8 5 3 4 10 5 8 5 8 10 9 10 9 9 2 1 8 5 0 4 7 2 9 6 10 7 7 2 10 0 7 3 3 10 0 0 8 8 7 2 2 10 10 1 1 10 6 6 6 9 5 5 10 7 7 4 1 7 3 7 0 1 10 1 0 3 10 0 6 10 9 8 3 1 10 0 1 3 9 9 7 5 4 9 9 7 1 10 1 10 7 8 6 8 10 7 8 10 6 0 10 9 1 8 1 4 2 6 1 7 8 7 5 10 10 3 5 5  [...]
+8 1 10 7 9 9 9 9 0 0 6 8 2 9 10 0 7 6 6 10 8 2 7 3 2 7 9 7 4 0 8 4 3 9 9 8 3 9 4 3 4 5 0 7 0 4 0 1 4 10 3 10 5 7 4 1 9 0 8 1 6 7 8 6 7 1 4 0 3 0 8 5 6 9 4 5 2 4 2 5 0 3 1 3 4 1 4 5 2 3 6 10 5 4 1 3 1 4 3 1 1 1 0 4 9 0 0 0 9 6 0 2 3 9 0 4 6 9 1 6 8 1 0 1 8 4 8 10 5 8 0 7 2 10 5 5 4 3 7 9 6 5 4 0 1 10 1 3 2 7 10 7 10 2 5 4 6 5 1 3 1 4 6 1 9 0 8 1 9 2 6 0 9 5 9 3 5 6 3 8 8 3 4 5 2 4 4 5 4 7 2 6 0 0 6 1 8 9 7 1 2 8 4 2 7 7 2 5 10 5 6 0 3 2 1 7 0 0 4 3 6 4 4 4 6 2 7 8 5 1 8 6 4 7 8 4 3 9 9 6  [...]
+4 6 9 9 3 6 8 8 0 4 4 7 10 10 0 9 5 2 3 2 0 8 4 10 6 1 0 9 7 10 7 2 3 0 4 5 5 2 9 8 0 8 5 10 0 6 3 10 6 5 4 4 4 9 10 0 5 5 0 2 4 7 8 0 6 0 1 5 9 4 1 5 1 1 0 2 1 0 0 9 4 0 1 4 3 7 7 10 6 7 4 8 3 5 7 7 7 10 2 2 2 7 3 0 1 0 4 6 8 4 8 9 1 7 3 2 1 3 5 2 9 1 5 8 4 1 3 2 2 4 6 7 8 1 0 8 1 9 3 8 2 6 10 2 3 9 9 10 9 6 6 9 5 1 1 7 7 0 7 7 3 10 6 1 3 8 9 3 10 4 1 8 8 4 5 8 10 6 2 5 4 1 7 10 6 1 2 10 6 5 8 1 2 6 0 2 4 6 3 8 1 6 5 7 5 2 10 4 2 3 2 6 8 4 10 4 4 7 0 10 5 8 7 0 3 4 2 10 7 3 1 8 4 6 7 5  [...]
+6 5 5 7 5 4 5 10 7 1 2 1 3 1 7 4 6 1 3 4 4 10 10 1 0 7 5 9 7 10 6 1 5 10 4 10 4 10 7 3 8 7 1 9 7 1 5 2 2 5 4 9 5 3 1 4 4 9 5 7 10 3 10 1 1 3 2 0 10 5 4 1 8 0 0 0 1 7 10 3 7 1 9 7 8 4 1 9 1 6 8 8 2 3 9 7 1 3 9 8 3 9 5 6 7 3 0 7 8 4 4 6 7 1 8 7 9 9 7 3 2 1 10 4 3 10 4 10 0 6 5 7 6 2 2 7 0 2 7 2 0 0 3 2 9 3 1 4 10 1 9 8 1 4 9 2 5 0 10 0 4 3 0 2 6 3 4 9 2 3 0 2 5 1 7 2 7 1 1 2 5 6 1 10 4 8 4 8 0 0 8 6 4 2 4 10 9 6 10 10 4 0 3 3 8 4 1 0 4 6 7 4 7 0 8 5 10 9 9 1 3 6 2 2 1 4 0 1 2 5 8 4 6 4 9 1 [...]
+6 4 7 1 6 6 6 9 10 10 3 0 9 6 7 5 2 6 9 2 0 2 6 10 8 0 7 6 1 2 8 4 5 2 4 9 9 4 8 0 8 9 5 9 1 4 2 1 0 1 6 7 10 1 9 2 8 4 0 8 9 7 5 8 4 3 0 4 6 5 1 8 3 8 1 9 1 8 3 7 4 9 0 1 8 10 5 9 4 4 5 1 8 3 0 4 6 1 3 2 8 2 5 10 9 8 7 7 8 10 10 1 3 5 10 10 5 3 10 9 10 7 6 4 9 7 2 5 2 4 8 6 8 1 6 0 0 2 8 10 5 9 10 10 4 9 5 1 7 2 4 2 3 6 4 7 10 10 8 2 0 1 5 2 9 1 1 3 3 9 9 2 6 4 4 3 0 6 5 0 7 4 8 7 0 3 1 10 9 4 8 8 7 8 4 0 5 7 9 10 4 5 8 5 8 9 0 2 3 4 5 4 1 9 0 5 7 8 8 4 7 1 6 9 9 3 8 3 0 8 8 6 1 0 8 7 7 [...]
+9 5 6 7 3 3 5 8 6 7 6 9 5 5 6 1 9 10 6 5 10 9 7 4 7 0 9 3 10 8 2 10 9 6 1 10 4 0 7 2 10 3 10 1 1 0 8 0 9 2 0 2 10 1 3 10 2 6 6 10 7 4 6 4 7 8 10 9 0 9 4 4 3 8 3 2 10 8 8 8 3 2 1 8 9 5 0 1 0 8 1 6 8 4 1 0 1 1 3 9 4 9 5 1 2 3 5 2 9 9 10 8 5 1 5 6 6 8 1 4 2 3 9 2 1 7 9 9 1 8 9 3 7 7 9 1 8 10 10 4 2 4 0 8 9 10 2 7 4 7 0 7 9 7 0 5 0 7 0 5 10 4 0 3 5 9 0 4 8 6 7 0 6 1 1 1 1 9 6 5 2 4 5 4 8 3 5 9 2 6 0 6 2 9 3 1 3 10 9 0 0 10 9 8 5 2 10 10 10 9 2 1 10 6 2 4 0 3 0 7 10 3 2 5 1 9 4 6 2 9 6 2 2 0  [...]
+10 3 7 8 10 3 10 4 6 0 3 7 4 0 8 5 9 1 10 10 8 2 0 5 3 10 8 5 5 0 6 0 5 9 9 6 3 8 5 7 4 7 1 8 9 7 10 4 7 8 10 6 3 4 6 3 0 1 3 4 7 9 6 8 7 4 5 7 0 1 0 3 3 5 5 7 0 8 4 3 10 8 10 0 4 5 5 9 7 3 0 10 8 0 4 4 2 8 2 5 9 4 2 4 1 2 2 9 3 4 5 2 6 9 3 9 4 10 5 2 4 10 3 5 10 10 1 9 7 8 1 5 5 4 3 0 8 10 3 6 9 6 4 2 7 3 5 2 2 5 5 2 6 6 10 9 5 6 4 1 1 10 2 4 3 10 10 0 10 4 1 6 8 0 1 1 10 3 3 8 0 0 2 10 5 6 10 1 0 4 2 0 5 5 7 9 0 0 6 5 2 5 0 5 3 10 10 8 1 1 5 6 8 9 6 6 1 6 8 5 0 1 4 5 10 3 0 5 7 0 4 5 0 [...]
+2 3 4 4 4 1 0 1 3 1 1 7 0 4 1 4 4 3 9 5 9 7 6 0 2 1 10 0 7 4 0 6 3 7 4 0 7 4 0 1 7 8 2 5 1 7 9 10 3 2 5 3 2 4 6 6 8 9 7 2 8 8 4 6 1 4 5 4 10 10 7 7 1 10 2 4 9 4 4 1 4 7 6 5 8 0 2 8 5 2 6 7 5 5 8 10 7 8 8 10 4 2 7 9 7 10 4 10 1 2 0 10 0 5 7 2 9 4 4 0 4 9 6 8 10 3 0 0 5 0 1 1 8 9 0 1 3 5 5 7 1 0 6 4 0 8 2 5 9 10 9 5 5 10 7 10 10 10 1 4 7 2 0 10 6 4 0 5 8 6 5 0 7 0 3 7 2 8 3 5 5 4 5 7 8 9 9 7 0 4 4 10 3 7 7 8 3 0 3 7 6 7 7 1 5 1 9 10 4 6 8 2 8 10 8 10 8 3 9 5 4 9 4 6 9 10 3 1 9 8 9 8 5 7 6  [...]
+6 4 4 3 3 7 8 9 2 1 1 6 8 2 2 3 4 7 6 4 7 9 0 1 6 0 1 8 0 4 2 3 0 4 0 2 9 3 3 9 8 10 0 5 4 5 9 1 5 8 10 1 1 3 10 6 1 8 10 3 7 10 9 8 9 8 7 0 6 6 0 0 2 1 6 1 0 0 7 5 2 7 0 10 4 8 8 8 5 3 8 7 6 3 4 0 8 8 7 9 6 7 3 6 6 6 3 2 8 1 10 7 2 10 1 4 7 3 9 9 2 4 6 10 6 1 0 4 3 8 3 1 3 1 0 1 4 0 0 5 8 3 4 10 6 8 4 6 9 10 2 6 0 2 3 2 8 2 8 3 1 8 7 6 2 6 2 8 5 9 1 0 8 10 1 4 2 2 5 5 7 2 3 4 9 5 7 7 0 4 10 2 4 5 5 9 7 4 8 1 7 6 8 3 2 1 4 7 3 10 6 2 7 9 5 6 2 10 2 8 0 4 9 0 2 4 10 3 8 4 9 5 0 9 0 1 4 0  [...]
+10 7 2 3 4 5 10 8 5 0 8 2 5 4 5 5 5 10 8 6 0 9 2 0 10 7 5 6 6 10 5 2 4 0 1 10 2 4 2 9 4 7 8 0 9 1 10 5 6 6 7 9 2 1 9 7 8 0 9 0 3 3 6 8 8 7 7 9 4 4 2 2 3 0 9 1 4 7 10 10 10 4 7 9 9 7 3 6 6 8 5 0 6 5 5 2 8 7 4 1 4 5 8 0 6 8 4 6 5 10 4 8 2 5 5 0 9 9 9 8 4 2 6 4 9 5 10 4 8 1 3 1 3 6 3 8 4 6 6 7 1 3 3 5 1 10 2 1 4 9 5 10 1 6 6 5 10 2 5 6 3 1 3 0 3 7 0 3 2 8 0 7 10 2 2 8 5 4 6 4 1 8 10 7 5 9 1 3 8 0 9 2 9 10 7 1 3 0 4 10 1 0 10 7 10 1 8 4 5 10 8 5 3 4 2 1 9 3 1 8 8 3 4 6 9 8 5 10 9 3 7 1 9 1 1 [...]
+4 3 7 1 6 10 9 7 1 9 1 0 2 0 2 4 10 6 2 3 5 1 6 3 6 7 8 3 6 10 1 7 4 9 1 9 4 2 7 3 4 2 1 4 4 1 0 6 9 6 0 4 2 3 9 10 5 7 6 7 1 9 2 2 1 2 6 1 7 2 8 7 7 1 7 3 7 7 7 4 3 1 9 6 6 10 5 10 9 2 3 8 5 3 0 5 6 2 3 1 7 9 5 9 7 3 4 1 9 4 8 1 2 8 6 5 9 0 7 7 2 8 10 8 4 1 5 9 2 10 0 3 5 3 8 10 9 10 3 2 9 0 2 6 5 7 0 4 0 4 6 10 1 8 8 8 2 8 7 10 6 7 10 10 5 4 5 6 1 9 3 8 1 0 5 1 9 6 0 3 8 0 3 8 10 5 9 9 0 7 3 10 2 5 10 10 3 2 5 0 8 2 6 1 6 10 0 4 10 10 9 8 6 9 4 2 7 10 10 9 0 3 6 0 3 6 5 6 1 8 7 4 2 6 6 [...]
+6 10 2 1 8 1 1 8 10 4 9 8 2 5 2 4 6 4 0 4 0 4 8 4 6 1 3 0 4 0 7 3 2 0 10 4 7 0 2 2 4 3 9 2 7 0 3 3 1 6 8 5 8 5 10 7 8 6 10 10 8 1 0 3 4 1 0 8 1 8 1 6 2 4 4 2 0 7 8 2 4 0 0 5 9 4 10 4 5 6 8 3 4 3 0 10 7 4 2 6 3 8 10 10 9 0 1 7 1 7 6 1 10 9 10 10 1 7 7 1 5 7 0 10 5 9 2 3 9 0 1 3 5 8 6 10 9 8 1 3 7 0 1 2 5 9 1 0 4 4 4 2 4 10 8 3 2 1 1 4 9 4 9 9 8 7 8 9 5 0 10 4 8 7 1 4 4 3 10 3 10 1 0 5 9 5 10 2 9 10 8 10 10 0 2 4 8 3 1 1 0 1 6 6 6 9 3 9 0 0 4 7 2 6 1 3 8 8 10 4 1 5 6 9 5 2 6 5 7 3 3 9 0 5  [...]
+2 0 7 8 10 4 10 6 1 9 4 5 3 3 9 9 3 2 8 7 9 0 10 3 0 10 5 8 5 8 1 10 7 8 10 10 10 0 4 4 9 5 0 1 5 2 2 8 5 5 5 10 10 6 10 10 4 0 3 4 0 8 9 7 5 6 2 1 7 2 2 8 8 7 6 4 5 1 2 9 1 2 2 0 1 3 2 2 8 10 8 0 8 8 8 7 2 10 1 3 5 1 6 8 6 0 9 10 4 0 0 5 7 3 0 2 10 8 2 6 1 5 3 3 8 2 3 4 1 0 0 1 9 3 6 0 7 10 10 8 6 0 1 8 9 5 7 6 3 5 0 3 9 6 8 10 10 7 5 4 3 4 10 7 2 5 10 7 8 6 0 1 9 9 6 1 4 1 2 2 1 7 3 1 5 2 1 0 8 8 2 7 0 8 6 1 1 1 10 2 2 0 7 6 2 1 5 7 1 10 1 5 10 5 6 4 6 6 0 10 3 1 8 3 4 8 6 10 1 6 4 5 6 [...]
+8 4 7 8 10 6 6 7 6 7 10 2 7 1 6 8 10 5 7 4 7 0 1 7 2 0 6 9 5 3 4 0 1 5 10 5 2 9 1 4 10 10 9 10 6 3 8 7 0 7 5 6 6 3 1 3 5 4 9 1 9 3 10 0 9 4 1 9 10 10 1 5 9 10 7 0 3 4 0 2 2 4 5 3 0 3 4 9 5 4 7 10 4 8 4 6 2 3 7 8 2 2 1 5 7 3 0 4 5 9 6 10 0 2 3 9 6 6 8 4 2 3 8 3 3 3 0 4 7 7 8 4 6 8 6 6 7 3 7 0 3 4 1 10 7 1 0 6 7 0 1 3 0 2 5 4 8 10 0 5 1 1 1 2 10 10 3 5 2 5 3 5 8 5 1 9 1 2 2 8 5 0 2 7 10 6 3 8 9 4 2 1 0 2 1 8 4 7 7 9 8 6 10 5 4 1 10 0 3 6 7 9 9 3 10 5 6 4 9 10 1 7 0 10 4 6 9 0 0 2 0 4 9 7 3 [...]
+4 10 3 7 4 7 6 5 3 1 0 0 7 6 9 2 4 7 10 3 0 7 6 1 4 1 2 3 4 9 3 2 9 1 1 0 7 9 9 8 8 4 0 7 6 1 3 9 7 4 7 1 1 7 7 6 5 9 4 2 10 9 2 3 5 7 8 10 4 9 8 7 4 9 6 6 3 3 3 5 1 5 2 1 10 9 5 5 9 1 4 5 9 10 5 9 6 2 2 10 0 2 7 6 0 10 6 0 8 5 6 5 4 7 9 8 3 1 3 2 8 1 4 4 8 5 8 3 4 5 6 1 4 8 3 5 1 7 5 2 6 7 5 8 8 4 2 9 2 3 9 3 8 0 8 1 0 0 9 7 4 8 9 5 6 5 2 10 8 3 0 0 10 8 8 4 3 4 1 10 10 8 7 6 1 2 9 7 1 1 8 3 7 0 2 9 9 7 5 2 10 0 2 7 2 0 9 3 3 8 6 9 6 3 9 7 3 4 0 5 3 2 2 10 7 6 0 10 10 9 9 10 5 9 1 6 5 8 [...]
+3 6 6 10 1 8 8 5 4 3 5 1 4 0 0 7 10 2 7 7 1 7 9 10 4 8 7 3 0 0 4 7 4 1 7 4 0 3 4 3 6 8 9 9 4 8 5 3 2 1 2 9 3 8 7 4 6 2 7 10 4 1 4 2 3 3 9 2 6 4 1 0 9 9 1 2 6 9 6 4 4 6 9 6 6 6 3 10 5 8 9 0 4 6 7 8 5 8 10 1 9 6 7 3 7 2 3 1 2 6 2 0 1 3 8 4 10 2 2 3 5 4 10 6 9 5 9 0 5 0 1 7 4 8 1 0 2 6 10 7 2 2 8 4 2 7 8 8 8 5 4 4 5 9 3 2 2 0 8 4 10 4 0 5 6 4 9 4 6 7 8 0 1 7 5 8 8 8 6 6 3 8 2 6 7 9 6 5 10 1 2 4 9 7 4 0 0 0 2 8 3 7 5 4 4 4 1 7 7 6 3 8 4 7 5 3 7 3 1 8 1 3 10 10 7 10 0 3 7 1 2 8 8 0 10 6 9 3 1 [...]
+10 1 4 4 1 2 0 3 0 2 6 1 9 7 6 6 10 2 1 9 5 6 4 1 9 4 3 5 10 1 2 1 8 4 10 3 10 7 0 10 4 5 6 3 7 8 7 8 9 0 9 10 3 3 0 6 5 10 4 8 3 5 10 4 8 4 8 7 4 9 2 10 6 5 9 6 9 7 10 6 1 2 6 5 9 4 4 0 9 9 5 4 3 8 4 10 10 9 6 9 4 9 5 5 3 10 0 10 0 2 5 3 4 3 2 10 1 2 3 2 4 3 7 9 6 4 1 0 0 5 3 1 10 7 2 0 9 3 3 8 4 3 7 6 5 4 4 2 5 0 1 0 10 5 7 3 1 8 8 8 9 5 7 10 7 4 4 4 2 6 2 0 1 4 10 1 10 0 6 4 2 8 7 2 8 5 6 1 6 2 0 0 1 9 1 4 8 10 6 2 1 0 4 5 9 5 1 10 8 9 7 0 5 4 8 8 7 2 10 6 3 6 4 9 10 8 9 3 4 2 1 1 1 0 [...]
+2 9 2 0 4 0 6 4 5 0 8 2 7 5 8 7 0 7 9 1 3 10 7 2 1 8 10 6 9 0 2 8 6 3 7 3 10 0 0 5 10 9 2 5 0 8 1 8 3 5 0 1 6 10 1 7 2 6 10 5 2 4 2 7 2 2 8 5 9 7 4 8 6 0 8 2 9 3 10 5 10 3 5 6 9 8 1 7 0 4 4 9 2 6 0 10 10 6 3 8 9 6 4 1 7 6 4 5 6 4 8 1 0 0 4 9 10 8 7 1 8 7 6 0 2 7 4 0 2 6 3 5 6 5 6 8 1 10 7 7 5 1 7 4 6 10 0 0 4 0 3 2 5 0 7 8 4 0 8 0 5 1 1 6 4 10 2 9 7 5 8 1 1 0 10 6 1 9 10 7 8 0 4 4 5 8 5 4 2 8 1 6 3 8 1 2 3 8 6 10 1 6 5 10 9 3 3 6 5 5 0 9 7 2 5 8 10 8 4 6 4 1 7 9 9 10 0 8 1 3 0 6 0 2 1 3  [...]
+0 6 0 8 10 0 7 3 5 9 0 10 0 0 8 10 1 2 7 5 1 10 6 5 9 2 6 9 7 7 1 6 0 10 6 5 0 9 1 7 6 10 10 1 1 10 0 1 7 8 4 5 0 5 3 4 0 3 5 0 7 2 3 10 4 0 4 9 7 8 9 5 10 0 3 10 0 2 2 0 5 6 9 10 7 2 3 6 1 3 3 8 2 10 1 5 7 6 3 2 10 6 0 2 8 3 4 10 1 4 4 3 3 2 7 5 10 2 2 3 1 1 7 8 5 4 6 6 10 10 6 9 2 1 8 3 10 8 0 3 9 4 1 7 2 1 4 10 10 5 9 10 5 5 5 3 10 1 2 2 0 9 6 3 5 10 6 6 2 5 3 10 8 5 9 7 3 8 7 9 3 5 3 7 2 2 7 10 4 1 10 7 9 10 8 8 8 2 9 4 4 2 8 10 5 4 3 8 9 0 4 0 8 10 4 10 5 5 0 1 4 4 4 10 9 8 3 3 0 10 [...]
+4 1 4 5 1 7 7 4 10 4 4 10 3 5 10 5 5 5 6 3 2 0 6 5 2 9 2 2 1 5 8 3 3 10 10 3 5 4 1 3 9 5 7 3 4 1 3 3 6 10 8 8 7 1 10 10 3 10 10 4 6 1 3 4 1 5 4 0 1 10 5 3 10 8 3 2 1 5 10 4 7 0 8 3 1 4 7 4 10 4 2 3 6 5 1 10 1 5 2 5 10 9 6 3 7 9 8 10 4 7 2 2 3 7 7 3 9 10 6 10 4 3 6 9 6 5 0 0 2 7 10 2 2 1 7 10 7 8 1 4 2 5 8 3 0 0 7 9 7 8 10 5 7 7 2 1 1 10 4 2 5 5 2 0 9 6 7 8 9 7 6 10 7 6 8 0 8 10 7 2 8 2 4 9 3 10 1 0 2 0 8 8 6 4 2 8 2 7 10 10 6 3 6 3 8 9 9 7 9 4 4 3 5 9 6 3 10 0 10 7 9 4 5 4 7 3 8 3 0 0 6  [...]
+5 8 1 1 2 4 1 3 3 2 6 7 2 2 8 0 5 7 4 10 0 3 0 8 8 1 6 5 4 10 5 0 7 7 4 7 10 7 5 7 2 8 8 1 4 10 4 10 2 3 7 5 0 2 0 9 2 7 3 3 1 5 10 0 8 3 0 6 4 0 9 1 10 4 1 6 0 0 10 4 10 2 8 9 0 2 7 6 3 8 9 6 4 7 5 10 6 3 6 2 8 5 3 1 4 1 0 0 5 4 4 3 7 8 3 1 5 5 2 9 4 1 9 2 7 9 7 2 3 0 8 4 7 9 10 8 7 8 3 3 9 8 3 7 1 0 8 1 5 6 1 6 1 8 10 3 5 2 3 4 2 2 2 7 2 0 3 10 5 7 8 4 7 5 0 8 10 6 1 7 9 10 3 9 9 10 2 7 8 9 9 8 10 3 7 2 6 9 5 5 5 2 5 7 5 0 0 10 8 9 8 8 4 1 5 2 2 8 3 0 4 2 10 4 2 5 3 6 6 3 6 7 7 7 10 3  [...]
+3 3 2 4 3 2 5 2 10 0 8 8 0 8 1 8 5 5 2 9 6 1 5 8 1 3 1 7 8 9 10 1 3 0 6 3 3 7 8 3 1 7 1 4 3 3 10 8 8 0 5 5 4 6 0 6 4 1 7 6 1 8 7 1 7 0 7 5 5 5 2 8 10 9 4 8 10 6 2 4 5 0 4 0 6 6 3 8 6 9 2 3 5 3 5 7 9 1 1 6 5 6 5 8 8 6 3 5 5 6 4 6 9 10 10 3 2 9 4 0 2 0 1 0 9 0 8 3 6 4 0 1 5 0 10 4 8 3 2 3 2 6 7 2 0 6 7 0 9 7 7 7 10 6 5 3 9 8 3 2 5 6 3 7 8 0 7 7 9 0 3 5 5 10 2 2 5 10 9 2 3 5 9 9 7 6 8 6 7 2 1 3 6 8 2 3 7 7 6 10 4 6 0 7 8 4 7 3 9 9 10 0 2 1 9 9 4 10 1 6 3 2 1 5 2 2 3 4 1 3 4 1 6 3 7 8 2 0 1  [...]
+5 0 1 10 2 10 2 7 5 6 5 1 4 3 5 9 8 6 0 7 5 10 8 0 1 4 5 9 6 4 2 1 0 2 3 9 1 6 4 8 5 4 1 0 4 0 8 3 1 3 1 10 5 5 5 1 9 4 2 4 7 3 7 8 10 4 5 0 4 2 10 5 6 7 3 4 5 6 3 3 2 6 1 2 5 1 9 4 1 2 8 5 10 4 0 8 4 8 9 1 5 1 9 7 10 0 5 10 0 3 1 2 1 4 7 3 8 10 2 0 1 2 8 10 8 6 0 2 4 6 6 8 6 3 2 7 3 10 5 1 6 5 5 0 6 1 5 6 7 5 4 6 2 7 4 4 10 1 5 6 0 3 0 0 7 9 2 3 8 7 7 8 6 9 6 8 8 3 4 0 8 9 2 10 10 6 8 0 5 6 2 2 5 8 9 0 2 10 1 3 5 0 7 1 1 3 10 5 7 1 4 6 8 3 5 2 1 0 1 2 5 6 5 4 0 2 6 9 7 7 0 0 8 9 1 6 6 1 [...]
+0 6 5 5 10 2 6 7 4 0 4 5 3 3 6 0 9 5 6 5 0 10 2 3 6 5 8 0 1 2 8 5 4 5 3 9 0 7 4 10 3 9 8 5 0 6 3 10 9 8 7 1 9 9 9 3 2 2 6 9 1 1 5 7 0 0 1 8 3 1 10 5 8 10 3 3 1 10 6 4 4 9 3 6 3 8 2 2 5 2 6 6 9 2 10 6 8 9 6 1 9 8 5 5 0 9 2 9 10 7 1 8 0 5 7 0 6 0 9 8 0 4 3 4 4 0 5 4 6 2 8 0 5 1 8 5 7 6 1 0 4 8 4 4 1 1 3 5 1 9 10 4 2 1 1 1 10 7 1 7 10 5 6 8 8 3 6 0 10 3 10 10 7 1 5 7 2 7 5 6 10 5 10 5 2 4 8 10 0 2 10 8 7 0 2 6 10 1 3 2 4 10 3 1 2 2 0 8 0 4 1 3 4 1 2 5 5 9 2 9 10 3 2 2 9 10 4 0 3 1 8 2 4 9 3 [...]
+4 2 6 5 1 3 5 4 2 5 5 4 9 4 3 10 1 9 8 5 2 0 6 3 2 1 1 1 2 10 5 7 1 6 7 6 6 7 3 7 3 8 9 6 8 9 5 9 2 10 8 4 9 0 8 5 9 7 5 0 2 2 1 3 3 7 2 7 1 9 1 6 7 1 5 2 0 5 3 8 8 7 2 9 9 8 0 2 10 0 0 3 0 5 8 2 5 5 2 2 8 10 8 1 0 8 10 9 0 2 2 4 2 7 2 10 1 3 2 1 8 6 9 0 5 5 5 1 10 1 9 2 3 0 7 1 10 5 0 10 8 4 2 10 5 2 1 4 9 10 10 2 3 1 9 7 1 10 1 9 8 2 0 2 1 1 8 3 9 0 5 4 1 10 0 1 3 4 3 4 9 6 5 3 2 9 0 10 2 1 0 1 9 0 7 8 10 3 5 1 10 9 2 5 6 6 7 1 5 4 9 8 3 3 8 7 10 5 7 7 2 7 1 3 10 1 1 4 5 9 2 6 7 5 10 0 [...]
+9 1 3 0 6 8 1 5 8 6 9 10 2 6 10 9 8 6 5 7 10 8 2 6 4 10 6 9 4 9 7 3 0 8 5 5 6 0 1 8 4 3 2 1 0 10 7 5 8 9 5 3 9 5 10 1 1 10 1 10 9 8 1 9 5 5 1 2 0 5 2 7 0 8 2 9 2 3 9 4 6 6 2 8 1 1 10 8 2 4 7 10 8 5 4 0 1 9 6 0 10 0 8 4 9 0 7 2 5 4 9 9 2 8 5 2 8 9 3 1 0 7 2 10 1 6 5 8 0 4 7 0 1 1 3 6 7 1 6 4 6 9 3 0 5 4 1 6 9 9 10 3 7 1 3 9 7 3 4 6 0 1 3 7 5 4 7 8 6 1 5 0 1 2 2 9 6 0 2 1 2 5 8 0 0 5 5 7 5 3 5 9 2 4 4 9 0 7 5 7 6 2 2 6 2 10 6 3 6 3 7 7 0 7 9 7 7 6 10 4 0 4 6 9 5 9 9 10 6 8 5 5 2 7 7 1 1 9  [...]
+3 5 10 1 3 7 6 3 1 9 9 10 10 4 4 9 0 2 3 5 6 9 8 4 5 7 5 2 8 6 0 5 5 9 7 3 1 8 10 3 1 10 4 9 7 10 8 2 9 4 6 9 6 6 2 10 8 4 8 6 2 7 2 9 5 5 7 2 6 9 1 9 6 3 9 1 7 10 9 3 8 6 6 6 10 4 0 7 4 0 1 3 1 7 0 3 0 5 10 0 9 7 5 4 7 9 8 9 4 8 6 7 2 8 1 2 2 4 8 1 8 4 5 0 4 8 6 5 9 9 9 1 2 2 8 3 3 3 9 9 0 10 5 2 9 5 9 2 3 6 8 3 7 7 1 0 1 6 4 0 4 0 10 8 5 7 8 5 4 3 6 4 1 9 10 6 9 2 1 6 9 4 2 5 6 10 0 7 5 0 3 6 0 4 1 2 5 9 3 1 1 10 5 9 1 6 7 2 8 2 5 0 7 7 4 4 8 4 3 3 8 8 1 4 10 0 0 1 3 8 4 2 5 1 8 2 7 0  [...]
+0 2 8 1 2 0 2 7 7 10 5 1 8 5 1 2 5 10 5 1 7 10 6 0 1 6 9 5 3 7 5 1 9 8 2 1 8 9 5 3 10 3 3 10 10 5 4 1 1 8 0 10 2 8 1 9 3 4 6 8 10 6 8 10 2 0 7 7 3 3 10 1 5 1 4 8 8 5 10 10 6 10 2 7 3 8 2 8 2 9 5 8 7 8 5 4 6 4 2 4 5 10 3 6 1 2 10 6 3 7 8 8 5 4 1 6 5 9 7 9 2 7 1 8 3 4 0 1 1 2 5 7 1 10 1 9 7 0 6 10 1 9 6 9 1 8 3 6 8 5 10 7 1 6 7 3 5 8 0 1 3 0 8 0 8 10 7 0 2 6 2 8 2 0 1 3 1 9 9 9 9 6 6 8 8 9 2 4 3 3 2 9 3 7 5 9 9 7 10 2 3 8 5 2 0 8 7 9 5 2 0 9 2 1 4 3 6 10 7 8 5 9 0 3 10 7 4 3 5 8 6 0 7 6 4  [...]
+7 9 3 9 2 5 9 8 9 2 0 7 0 5 0 8 8 8 5 7 10 4 6 1 5 2 0 2 3 5 5 5 5 8 1 9 7 6 0 3 0 6 9 3 10 0 7 2 7 9 4 10 1 6 8 0 10 5 0 1 7 5 4 10 4 7 10 5 6 10 0 6 0 0 3 1 3 7 5 10 10 4 1 1 7 9 4 8 1 2 1 9 0 7 4 10 3 3 6 0 4 10 3 10 10 9 8 2 7 5 10 9 8 5 9 5 4 8 9 1 3 5 2 10 5 5 0 3 9 0 1 2 4 2 0 8 9 3 1 9 7 1 10 6 0 4 5 1 4 2 7 2 4 10 10 3 7 9 8 8 2 1 1 3 5 6 6 5 6 7 1 8 0 2 9 5 4 2 10 3 4 0 1 10 1 10 5 6 10 5 9 7 7 3 9 8 2 10 4 5 10 10 5 0 7 6 5 2 7 7 9 4 8 9 5 10 2 2 7 7 10 8 1 1 6 6 9 4 3 4 2 6 3 [...]
+10 7 2 0 7 7 4 10 1 0 8 4 10 6 4 3 9 10 1 3 6 2 4 0 2 7 10 7 7 2 4 0 10 2 7 10 1 1 4 7 8 2 10 6 3 2 4 8 8 8 10 4 8 5 1 6 2 2 10 9 4 10 9 2 4 3 3 1 5 3 9 7 1 8 1 2 10 2 3 0 8 2 8 0 0 1 5 9 1 1 6 10 3 6 3 7 1 0 4 4 10 9 4 6 6 8 1 4 7 2 3 7 10 9 3 10 10 5 0 0 7 10 0 6 9 6 1 4 3 0 0 9 0 8 8 4 7 10 8 3 3 6 1 1 6 2 8 1 1 0 9 1 5 7 10 9 8 5 4 0 0 7 0 2 10 7 9 5 5 10 2 3 1 2 9 2 0 1 5 8 0 6 1 0 4 4 5 3 5 5 7 6 2 1 6 9 8 4 2 9 8 4 7 9 1 5 3 5 5 8 7 2 8 1 1 9 10 10 6 4 7 8 10 0 8 9 8 4 10 8 7 5 10 [...]
+1 5 10 5 1 7 6 8 7 7 4 0 4 9 10 10 8 7 9 2 10 7 10 6 4 5 10 5 3 8 0 3 2 8 0 8 4 6 3 4 6 9 7 2 0 8 10 8 3 6 9 7 5 1 3 7 4 5 4 6 6 2 2 9 5 3 9 7 7 7 6 10 10 4 1 8 8 2 6 1 8 1 7 5 9 10 3 7 9 6 7 5 6 5 7 2 8 10 0 4 9 10 0 2 0 6 8 3 8 4 5 3 4 8 9 3 4 4 8 5 0 1 5 7 2 7 4 1 4 1 3 10 6 4 7 7 4 4 9 0 4 4 3 3 2 8 1 3 6 8 0 7 6 0 3 6 5 2 2 2 1 10 2 5 9 3 6 6 3 4 4 5 9 0 10 9 8 7 6 0 8 10 8 1 10 9 6 5 4 7 8 9 10 10 5 5 8 8 9 6 7 2 5 7 3 9 7 7 8 2 8 2 5 5 4 8 3 3 3 2 7 3 9 2 1 9 5 3 10 8 3 0 0 8 3 4  [...]
+5 1 6 9 1 8 0 9 7 1 6 5 1 7 5 8 5 1 10 3 10 0 5 3 10 5 5 6 7 9 7 3 6 8 5 7 7 3 8 7 4 0 4 6 0 2 5 10 8 1 6 10 10 1 1 8 0 0 2 1 6 8 10 4 0 7 4 5 4 6 9 1 0 9 6 4 1 8 6 6 4 4 7 5 4 3 4 5 2 8 7 0 10 0 10 7 2 7 2 9 9 1 0 8 7 0 5 8 5 5 3 9 4 8 2 7 2 6 0 4 9 8 0 6 1 6 9 7 4 5 8 2 1 2 1 5 8 0 1 6 2 8 10 6 4 5 3 2 0 4 2 8 1 8 9 8 10 0 2 6 2 8 4 8 7 6 10 3 3 1 10 4 1 9 0 1 8 5 6 2 9 2 7 3 9 5 0 9 0 7 8 10 9 6 2 3 7 1 7 4 8 0 9 5 7 5 5 1 10 3 7 6 9 0 4 10 10 8 8 4 10 2 0 5 6 8 0 7 9 7 7 2 10 8 9 0 8 [...]
+2 6 7 0 4 3 7 7 3 4 1 2 9 8 4 9 1 2 4 10 1 5 1 5 7 1 2 3 10 10 2 2 9 3 8 6 1 9 4 8 6 1 6 5 8 0 7 9 9 7 2 7 7 5 1 8 6 5 8 5 7 9 1 4 0 8 2 1 7 10 2 1 2 9 2 3 4 0 2 2 1 6 7 0 0 8 1 1 1 6 9 7 3 6 4 1 8 6 3 4 10 7 6 4 8 6 0 5 3 4 3 9 9 1 0 10 1 8 6 4 10 7 2 1 5 4 0 4 2 6 10 1 10 9 5 1 6 9 7 2 10 4 1 3 7 1 9 2 3 4 7 10 8 1 1 4 10 6 8 7 7 6 3 1 5 8 0 6 2 8 8 10 3 6 1 10 3 9 10 9 5 7 8 4 7 8 7 5 6 8 4 8 2 10 7 3 6 6 4 5 2 5 6 4 10 4 7 5 1 8 6 1 9 6 8 5 2 3 10 0 1 10 1 7 0 6 5 8 4 1 3 3 8 9 8 9 5 [...]
+4 2 9 0 2 4 2 2 10 8 4 8 0 3 4 7 6 7 1 0 2 3 8 10 8 3 1 4 8 4 7 10 8 6 5 10 6 10 4 4 1 4 6 7 3 4 2 0 0 6 0 9 9 7 9 5 3 4 1 4 3 3 8 6 3 5 10 2 2 8 6 9 10 5 5 1 7 9 6 10 5 10 9 7 6 7 1 10 5 4 9 2 0 0 6 10 10 4 3 1 0 8 7 10 0 2 0 6 3 7 2 5 8 8 2 0 8 9 1 3 6 0 1 10 5 1 6 9 9 0 0 2 6 8 3 2 7 2 1 9 10 6 3 8 7 0 10 9 10 5 6 0 7 1 9 6 0 3 0 5 1 4 4 6 9 5 3 9 7 7 10 4 8 10 6 7 5 5 4 6 4 8 5 10 1 0 2 10 9 10 6 2 7 9 6 6 4 9 10 4 10 10 8 9 1 9 9 9 2 5 4 4 9 3 0 1 9 5 2 5 3 8 1 7 4 0 2 4 1 3 3 2 0 0 [...]
+8 10 5 1 10 8 6 7 6 2 2 6 7 7 1 9 6 3 1 4 0 9 7 4 1 1 0 2 3 4 8 7 5 10 3 5 6 1 9 0 1 10 3 3 10 7 5 2 1 3 0 6 3 5 10 0 10 10 6 6 2 4 1 9 1 5 8 3 10 8 5 3 10 7 6 2 10 5 8 3 1 10 3 10 3 0 3 0 8 7 2 5 5 0 7 0 10 2 8 8 6 3 10 8 3 9 4 5 9 4 1 2 7 6 5 1 1 10 9 1 9 0 1 0 8 0 5 7 6 3 6 9 10 10 1 10 6 1 8 5 7 6 3 6 7 6 7 2 8 5 1 10 9 3 8 6 6 1 4 8 0 4 5 6 9 10 3 4 3 1 0 6 2 6 4 3 0 8 0 5 4 2 8 4 6 8 7 3 6 8 6 9 3 6 2 9 6 8 3 3 8 10 2 4 7 0 4 3 1 7 10 9 4 2 2 10 7 6 5 2 0 8 3 0 1 4 1 10 0 4 2 2 7 1 [...]
+0 5 4 10 7 8 5 4 10 4 8 8 8 4 6 10 6 1 8 6 7 1 9 7 6 3 8 0 7 8 10 4 9 3 0 4 9 10 0 7 10 6 5 9 8 1 7 4 10 4 7 7 7 1 3 4 5 5 3 9 8 0 10 7 4 8 5 2 1 4 3 2 3 10 9 7 7 2 9 9 5 0 0 1 8 6 10 8 8 0 6 9 2 10 7 6 6 2 1 6 10 4 6 4 0 10 6 9 4 9 6 10 3 4 7 0 8 1 7 9 10 4 10 2 3 2 7 2 3 4 10 7 5 1 3 1 2 10 8 3 7 7 10 8 6 1 5 5 7 3 0 10 2 0 1 5 8 6 0 2 3 9 6 9 6 4 9 0 0 8 3 3 3 3 3 4 3 3 9 1 4 10 0 2 2 0 6 0 10 4 5 6 0 9 10 5 9 5 10 2 4 1 7 8 7 8 5 4 2 6 10 4 5 2 7 9 0 8 8 0 0 6 1 1 10 3 5 9 10 6 1 2 4 [...]
+2 0 0 7 3 7 3 6 10 9 8 10 3 2 9 10 3 8 4 4 0 2 5 8 7 6 3 9 4 6 8 1 8 10 10 8 10 9 2 4 5 1 4 0 3 8 3 9 10 2 8 0 2 4 2 3 2 0 1 7 6 4 0 6 2 0 2 5 3 10 4 10 1 10 8 3 1 2 0 10 3 3 9 6 3 9 8 10 10 6 1 7 10 6 2 4 0 0 3 10 5 8 0 8 8 9 8 7 9 2 10 9 3 10 4 4 1 0 8 7 5 6 4 7 10 1 6 7 4 8 6 5 8 2 6 3 7 9 2 3 5 10 4 9 6 7 3 4 7 2 2 9 0 10 8 3 9 1 10 8 4 5 3 2 5 3 1 9 5 3 10 4 4 2 0 9 4 10 9 0 4 10 8 9 6 2 1 4 9 3 3 1 10 2 1 10 3 10 9 1 7 7 2 8 6 2 6 4 2 2 1 0 4 8 0 9 7 0 6 10 4 0 2 6 7 7 6 9 9 5 0 8  [...]
+0 9 4 6 0 9 5 5 2 0 4 0 2 3 3 2 3 3 8 3 3 7 5 8 6 8 3 7 6 8 10 2 4 4 10 3 4 10 5 8 8 3 8 10 7 2 8 2 9 9 6 5 4 5 10 5 0 1 4 0 9 6 6 5 0 4 6 10 9 7 5 9 7 9 10 8 10 5 8 6 6 2 0 7 9 0 9 2 0 0 7 3 1 4 8 4 9 3 8 9 4 6 7 1 0 3 6 9 7 6 8 4 0 4 6 1 9 6 10 0 0 5 1 3 9 10 5 7 3 2 2 3 8 0 0 3 7 7 2 5 0 7 10 6 10 0 9 5 2 7 7 0 9 4 6 8 10 1 1 7 2 9 9 8 8 5 10 10 7 0 2 6 6 9 1 1 3 1 8 3 8 9 3 8 0 1 2 10 2 9 5 5 7 5 8 2 6 2 0 8 1 3 1 8 9 2 5 3 6 10 8 4 10 1 3 5 4 6 2 9 9 7 8 4 10 9 0 8 7 0 7 10 6 7 1 5  [...]
+5 0 3 4 8 6 7 2 2 7 6 9 1 4 6 2 7 10 5 3 7 4 3 0 7 10 1 2 9 10 6 6 1 6 10 9 3 10 1 9 1 4 3 8 5 0 0 10 6 0 5 9 3 7 5 3 6 2 3 10 0 3 3 0 2 7 0 6 9 10 5 7 10 1 0 8 10 2 5 4 7 6 10 6 6 7 9 9 10 6 10 1 3 7 4 8 2 2 0 7 10 4 0 3 1 3 2 0 10 8 7 5 4 0 1 3 4 6 3 10 3 8 5 10 8 3 2 3 4 3 9 3 0 5 6 5 3 1 1 0 5 6 10 1 0 6 2 5 1 3 9 1 4 0 8 4 4 0 8 4 8 7 6 7 1 5 5 9 0 2 1 0 9 1 3 2 8 5 3 3 3 3 5 3 4 8 7 2 4 0 4 2 10 4 5 7 10 0 4 7 4 9 4 5 7 8 1 4 7 4 10 5 5 1 3 5 8 0 3 9 0 10 0 3 10 2 3 4 0 7 2 3 8 6 3 [...]
+8 4 5 10 1 8 1 3 9 9 1 9 0 7 2 1 3 0 3 10 8 6 8 6 10 1 0 6 3 1 1 9 3 0 0 8 7 5 4 10 3 0 4 7 8 0 7 2 10 7 1 9 3 4 4 9 3 6 9 7 10 5 2 1 4 1 9 1 5 8 7 7 6 9 7 8 3 10 6 6 1 8 5 3 6 9 6 1 0 2 9 4 8 0 3 8 4 0 9 3 9 3 5 4 6 8 9 0 9 4 1 10 6 5 6 6 7 9 4 1 5 10 0 8 1 6 10 0 0 4 2 2 1 1 8 6 4 10 10 3 6 7 1 5 10 9 0 3 9 6 3 0 3 9 8 0 6 7 10 7 6 7 6 0 7 5 2 7 8 0 9 4 8 6 9 6 10 5 5 6 8 2 9 10 5 5 1 5 10 7 3 6 7 9 2 10 10 1 7 1 0 5 3 0 5 3 2 5 8 9 2 3 3 9 10 10 9 5 4 6 6 4 9 6 9 9 10 2 4 0 6 6 7 3 7  [...]
+1 7 2 4 9 2 0 10 0 7 10 10 0 9 4 9 7 0 2 2 9 9 3 0 4 4 2 4 9 4 0 6 9 6 9 7 4 7 3 5 3 0 8 0 2 0 4 9 4 7 8 2 6 7 6 4 5 7 6 2 0 3 7 6 1 10 2 7 6 6 3 5 6 10 0 0 0 1 3 3 0 7 7 3 7 1 4 9 2 10 9 9 7 2 8 10 8 2 4 9 6 10 8 2 0 2 10 1 0 0 2 2 1 9 1 10 3 7 9 0 9 1 1 10 9 4 6 10 4 5 3 10 10 2 6 6 7 4 0 5 5 7 6 2 9 7 7 7 5 3 0 1 10 0 9 0 2 8 0 1 2 5 4 0 1 6 3 4 1 4 4 4 6 10 1 2 2 10 0 1 3 7 7 4 2 5 1 2 0 9 4 4 10 1 5 6 5 6 5 5 10 8 6 8 10 1 1 8 3 5 8 2 5 8 3 6 10 3 3 1 10 3 1 10 4 2 5 7 3 9 9 7 1 8 8 [...]
+2 3 1 6 3 10 1 5 3 1 7 2 5 5 6 9 0 0 7 4 8 10 2 10 7 1 10 8 10 3 5 4 4 0 4 1 2 3 5 2 7 7 6 9 6 7 8 4 6 10 2 3 3 8 10 9 3 5 7 7 8 0 1 1 8 0 4 9 2 5 8 1 6 2 0 0 0 5 6 0 7 5 4 10 5 0 7 8 7 7 9 5 6 10 5 7 1 8 2 6 3 3 4 10 0 2 5 7 4 8 8 4 4 4 3 8 9 5 2 8 2 1 9 10 9 7 10 3 0 10 6 4 4 9 9 7 1 4 6 9 3 0 3 7 1 2 3 7 7 10 6 10 5 2 0 2 4 3 7 0 4 6 0 4 10 0 3 8 7 6 5 9 7 1 3 3 8 4 4 6 6 0 1 7 10 7 5 9 5 3 2 3 2 8 7 1 6 5 8 6 7 9 10 3 1 3 3 9 0 1 3 7 7 10 5 10 2 10 3 7 3 1 9 10 9 8 1 10 3 9 10 5 8 10 [...]
+10 4 1 5 9 4 1 10 9 10 0 0 3 7 7 9 0 10 5 7 8 4 0 2 3 7 2 7 3 1 1 6 9 10 9 9 5 10 0 3 0 8 0 5 5 4 9 1 6 0 4 2 8 10 7 8 2 8 0 8 3 2 5 8 6 4 0 8 2 10 4 1 4 8 3 0 5 2 2 0 1 8 3 1 1 7 0 1 10 7 5 7 8 3 4 2 3 1 4 4 2 0 1 5 5 5 0 8 3 10 10 9 9 10 4 3 2 9 7 2 2 2 9 0 5 5 6 9 2 10 9 5 3 9 7 0 9 6 6 8 2 9 0 6 5 0 4 10 7 8 4 2 7 10 2 1 3 2 10 9 8 5 9 8 5 5 2 9 2 0 2 3 7 7 5 6 5 2 4 2 5 3 5 1 8 7 6 8 5 3 10 0 1 9 10 5 2 8 10 5 10 4 6 0 9 1 3 0 4 6 6 2 7 1 6 0 1 5 7 3 8 6 7 8 3 7 8 1 9 0 5 5 6 8 3 3  [...]
+4 4 5 10 0 2 8 6 9 10 5 9 1 0 8 8 0 0 2 10 7 0 3 2 7 5 8 7 0 2 3 9 10 7 8 10 9 0 10 0 3 0 1 1 0 1 3 6 2 0 5 8 4 8 8 7 3 6 9 10 8 9 7 0 4 10 7 9 2 0 10 4 2 3 4 1 10 0 4 4 2 2 7 10 0 5 3 4 1 10 3 7 5 7 5 4 9 6 6 10 9 6 9 3 9 3 8 0 9 10 2 0 9 1 2 7 2 1 0 10 4 0 2 9 6 4 7 1 4 9 9 4 2 3 0 10 5 4 1 5 0 0 3 8 4 2 10 5 10 2 2 1 1 9 0 10 5 5 2 6 9 8 6 5 7 4 1 1 2 9 1 10 3 7 5 6 5 6 2 1 6 2 7 1 6 9 1 6 1 9 7 7 2 2 7 7 9 10 6 6 1 6 5 0 8 10 0 6 7 1 4 6 3 1 8 4 2 10 1 6 3 0 10 10 7 6 1 0 6 0 3 9 8 6 [...]
+10 9 5 9 4 2 4 3 5 9 7 7 2 8 0 4 2 6 2 2 9 6 5 3 8 1 4 8 3 5 9 8 4 3 8 4 4 8 6 8 3 4 7 3 3 7 4 5 9 4 5 2 3 8 7 8 2 2 2 3 4 0 6 10 6 0 5 3 1 2 8 8 3 6 5 2 8 0 2 0 4 5 2 9 0 8 2 1 7 0 7 3 9 5 10 6 0 8 6 4 6 7 9 5 2 9 0 8 5 1 3 10 7 4 1 6 1 0 9 5 2 8 5 9 0 4 7 5 4 9 5 4 0 2 7 9 9 6 0 7 8 8 4 10 0 0 8 4 6 5 7 2 5 5 4 9 9 8 2 9 0 2 6 8 2 9 1 7 1 7 0 6 0 6 0 3 8 1 5 2 6 1 8 5 3 7 9 7 7 5 10 7 10 8 10 6 5 10 3 2 6 4 1 9 6 4 8 10 4 10 10 4 8 9 4 1 1 2 8 7 10 7 6 3 3 6 8 2 5 5 7 6 10 3 7 6 4 3 2  [...]
+1 10 10 5 3 6 3 5 2 0 2 1 5 6 5 5 10 5 1 10 10 8 2 1 10 7 7 1 9 8 6 9 2 2 5 5 0 9 4 6 10 3 6 7 1 7 7 6 0 3 0 5 10 4 9 4 0 8 3 8 7 10 2 1 1 9 3 6 1 4 4 5 8 7 9 3 2 8 6 4 4 5 6 4 4 8 4 1 7 8 8 3 6 6 3 7 9 8 1 3 0 10 0 7 2 2 7 9 0 10 0 8 10 1 5 4 0 0 1 2 5 7 6 5 9 6 0 3 5 3 0 6 9 6 4 9 3 7 10 2 9 9 3 7 8 5 6 3 4 3 7 5 0 10 1 2 1 3 0 9 4 4 5 8 1 2 10 10 6 4 3 3 3 7 2 1 9 5 3 9 4 3 10 9 3 6 4 2 9 0 2 1 2 9 6 4 8 0 4 2 10 2 5 8 2 1 4 4 0 7 1 6 7 6 10 6 4 9 4 7 4 8 0 6 0 6 6 9 10 8 0 4 4 5 10 9 [...]
+8 10 1 2 8 5 8 9 6 6 0 1 2 7 4 1 0 1 6 5 3 2 10 9 7 6 7 0 4 7 4 4 6 2 1 2 3 9 1 9 3 3 9 4 3 10 5 9 2 5 4 10 2 1 7 3 5 3 8 10 6 5 0 2 10 0 1 3 2 3 0 6 4 10 6 9 2 3 5 5 4 8 7 0 4 8 10 8 10 7 7 0 2 0 5 1 0 6 2 10 7 0 10 2 5 8 8 7 8 10 2 2 9 3 0 2 6 2 9 6 8 0 7 5 0 5 10 3 10 2 5 5 7 1 1 6 2 7 4 10 10 10 4 8 9 3 7 8 3 1 9 2 5 0 10 0 8 0 8 8 2 3 3 7 2 8 10 1 3 1 5 2 7 0 8 8 9 10 5 9 2 7 8 7 4 8 8 8 1 5 3 2 10 2 0 6 6 9 8 1 0 10 6 9 1 4 6 0 5 0 2 1 1 1 2 5 6 8 3 10 6 10 1 3 2 6 1 9 0 1 9 4 0 7  [...]
+7 7 8 2 10 0 0 5 3 9 1 10 5 9 2 1 2 2 7 4 2 0 3 6 3 3 5 6 0 7 6 5 6 0 5 9 8 2 5 8 10 10 10 1 8 4 9 6 5 6 3 0 1 3 10 7 10 2 6 5 5 10 5 2 1 1 7 2 4 4 2 7 4 7 4 1 2 6 6 0 1 5 4 4 7 1 4 3 6 4 1 4 4 5 0 1 8 7 2 6 10 1 2 4 4 1 7 3 7 6 6 7 3 8 4 2 10 8 2 5 7 6 3 9 2 2 6 0 2 8 0 2 9 5 6 8 7 4 8 0 2 2 6 2 6 9 0 8 5 10 3 5 8 0 4 5 10 0 1 6 3 2 10 10 6 10 7 0 6 2 5 0 5 10 8 3 6 1 4 4 8 4 2 3 2 10 8 2 10 1 1 3 8 4 7 7 5 7 7 9 1 2 10 2 4 8 0 0 2 5 4 5 4 7 4 1 10 3 7 7 3 9 6 7 10 5 5 2 9 5 6 2 9 3 0 1 [...]
+10 1 0 6 2 10 1 5 7 2 0 1 2 1 8 8 8 1 2 8 0 6 7 3 8 0 10 10 10 4 1 3 6 0 4 9 9 2 10 0 1 7 10 9 7 3 7 7 5 6 9 8 2 4 8 1 1 2 0 7 7 9 2 6 0 8 5 9 2 7 1 9 5 5 8 7 10 7 5 3 1 7 3 3 4 7 4 8 2 8 5 8 5 7 2 8 8 7 1 4 8 8 3 8 1 8 8 8 7 1 3 1 6 5 3 1 8 2 2 8 2 2 8 6 7 2 5 10 0 10 0 1 7 9 7 1 5 7 4 8 10 8 2 8 7 0 1 10 10 4 9 4 2 5 0 4 8 5 0 6 8 8 1 0 0 6 3 3 9 2 0 3 2 10 3 3 6 9 1 6 6 1 1 10 7 4 7 2 9 8 6 2 2 8 3 7 8 7 0 0 4 0 7 5 9 2 9 1 9 4 1 10 7 2 2 2 3 8 3 4 1 10 0 8 2 6 9 8 7 6 6 9 4 6 10 8 10 [...]
+7 3 6 1 9 0 6 2 0 5 4 0 2 3 2 5 2 5 1 9 1 9 10 7 3 4 9 3 10 10 7 1 6 8 2 9 10 4 10 1 1 5 3 5 1 5 9 10 4 8 4 9 4 6 8 3 9 5 5 6 2 0 3 5 1 3 2 9 1 8 2 7 2 1 9 1 1 9 7 8 6 10 1 1 6 8 7 9 6 7 3 4 7 2 8 3 5 1 7 1 2 4 8 0 8 10 9 9 9 1 2 4 10 0 4 8 0 6 2 7 1 2 9 2 3 1 2 8 5 8 6 2 2 8 5 0 2 8 7 7 3 9 5 1 5 1 6 9 1 1 8 6 2 4 8 3 9 0 9 1 4 4 1 10 5 3 5 4 1 10 5 6 7 1 10 0 9 1 8 0 9 2 10 5 1 3 6 2 10 2 3 5 5 1 4 7 8 4 9 4 8 7 4 0 4 5 6 5 5 4 7 7 2 1 5 5 2 6 6 1 0 7 10 9 6 0 3 3 1 2 10 5 1 2 3 10 3 8 [...]
+5 3 9 8 10 5 10 0 6 3 0 2 0 8 5 2 5 6 4 10 4 5 10 4 7 10 0 4 1 9 7 2 10 8 8 2 1 8 10 5 9 0 4 6 6 7 0 10 7 10 6 3 9 7 10 4 2 7 6 4 2 0 5 6 2 5 6 1 10 5 1 0 3 5 0 9 5 6 4 7 8 6 3 10 4 2 2 4 2 5 4 7 5 7 3 6 4 0 9 5 0 0 2 3 5 9 5 3 0 0 3 1 3 10 6 3 5 2 1 9 4 4 6 6 9 6 2 0 8 3 8 1 7 4 10 3 6 4 1 8 4 7 1 2 9 4 6 8 0 2 9 2 3 6 3 3 6 3 0 1 3 7 10 6 3 7 10 5 4 3 10 7 1 3 6 2 0 1 4 8 5 4 3 8 0 7 8 4 10 10 10 4 3 4 0 0 8 6 0 1 6 3 7 3 6 6 8 4 10 1 1 8 7 5 6 9 7 9 4 5 0 3 2 9 9 2 6 0 7 6 1 3 6 1 9 5 [...]
+5 5 7 9 3 3 4 6 8 2 4 7 10 0 8 10 5 7 1 10 7 4 2 2 4 6 5 8 9 7 2 1 9 8 8 7 5 7 8 6 9 9 0 5 2 4 3 5 0 2 9 5 0 7 1 1 2 8 7 0 2 9 0 8 6 7 6 1 9 0 2 6 9 8 1 0 7 0 6 6 0 7 4 6 8 4 1 10 1 2 5 0 4 2 7 5 4 0 4 0 1 3 9 9 9 0 10 4 1 10 8 10 10 4 6 8 2 5 2 9 3 2 1 1 5 4 4 9 10 4 9 0 7 6 5 4 6 3 10 9 5 5 4 6 0 4 10 5 9 9 6 7 7 2 7 4 9 0 1 3 6 8 0 7 0 2 7 10 0 5 9 5 3 2 3 2 5 4 6 10 4 1 9 0 3 8 7 4 4 4 9 4 10 7 2 0 9 10 10 10 1 1 7 2 0 4 8 8 6 3 0 10 0 5 8 5 9 9 1 3 9 0 2 8 3 9 7 1 6 1 3 4 4 5 3 2 1  [...]
+8 1 9 8 4 7 10 1 8 5 5 3 10 10 3 10 6 4 3 2 9 9 2 7 1 5 7 4 10 10 3 3 0 10 7 8 10 10 6 10 8 2 4 1 8 0 1 2 8 10 10 3 8 0 10 3 1 2 4 0 5 8 3 5 3 4 0 5 1 3 5 8 6 5 9 3 0 4 5 6 3 1 0 4 6 8 4 10 1 8 6 8 3 9 5 7 2 4 6 3 5 6 1 1 10 8 4 8 8 0 5 3 7 5 9 6 10 5 9 2 5 4 5 6 1 3 3 3 3 7 6 7 4 4 2 1 7 8 7 2 10 2 4 7 10 9 5 9 9 5 0 10 0 1 9 3 7 4 9 3 1 0 1 2 1 3 3 5 6 4 6 4 5 3 7 0 6 8 6 4 5 0 2 6 4 1 3 4 5 5 1 6 5 7 6 2 2 8 8 7 10 5 3 8 9 5 5 2 5 8 5 10 10 5 1 4 7 3 3 0 2 4 10 0 4 9 2 2 7 5 2 8 4 10  [...]
+10 6 1 8 0 10 9 2 10 3 9 2 5 5 0 8 7 5 6 4 1 3 2 6 1 1 8 8 5 4 1 1 3 9 1 4 10 5 1 0 1 5 10 7 2 0 9 0 9 1 0 3 2 0 3 6 6 7 5 10 4 8 0 2 9 2 2 7 5 0 8 3 8 9 3 1 9 8 2 5 1 4 1 7 10 4 1 7 10 9 8 7 3 4 1 4 6 1 2 5 2 5 1 9 8 0 9 5 0 1 7 8 7 1 8 0 5 5 9 0 5 1 10 10 0 10 5 10 0 5 10 4 7 2 2 2 3 4 0 1 8 1 10 6 0 5 5 8 2 6 4 8 7 7 5 10 2 9 10 0 9 2 6 8 5 4 6 2 3 5 3 0 5 1 8 10 5 9 8 7 7 2 1 6 6 6 10 1 10 4 7 9 8 0 2 1 5 2 2 2 3 8 2 0 9 3 8 2 0 8 10 5 4 9 0 4 8 6 10 9 0 1 0 10 0 6 4 4 8 3 7 3 9 2 9  [...]
+0 10 4 8 7 1 2 8 1 9 7 6 6 4 3 5 4 3 2 2 7 5 0 4 8 7 4 5 2 7 10 1 8 10 7 1 2 6 7 2 6 7 3 1 0 10 0 1 9 0 7 6 5 0 3 1 7 9 9 9 3 8 3 10 8 7 8 0 5 5 1 3 6 3 0 3 9 10 3 1 4 3 1 5 7 1 4 7 8 9 3 1 1 3 8 6 8 1 2 2 3 1 10 8 5 5 0 5 3 0 7 4 9 3 0 6 3 4 3 10 10 3 5 8 9 3 1 7 5 4 3 5 1 7 7 7 2 1 1 10 9 0 9 5 0 9 2 1 7 1 10 8 0 1 2 7 6 7 1 0 0 5 8 10 9 0 1 3 8 10 8 5 9 6 2 4 3 10 9 1 4 4 0 5 1 1 0 5 10 3 2 7 10 10 3 2 8 4 4 7 1 3 8 8 8 7 8 5 5 4 2 8 1 0 7 9 1 7 8 6 9 1 6 1 0 6 4 2 8 0 8 7 4 3 9 7 8 3 [...]
+6 0 6 8 1 3 4 7 10 1 4 8 1 1 2 4 7 7 9 1 1 1 7 1 0 4 8 4 2 4 7 3 1 1 10 9 8 2 10 1 2 8 4 10 5 7 1 9 6 3 6 6 9 4 0 7 0 1 9 7 9 4 7 10 7 0 6 6 9 1 5 5 0 9 3 9 1 0 10 7 1 6 0 0 10 5 6 8 7 3 0 5 5 1 4 10 1 0 4 3 1 9 7 10 7 3 6 9 0 3 2 9 2 2 10 7 6 4 9 0 10 0 2 4 1 8 0 4 0 8 1 1 3 3 8 5 6 4 0 3 5 7 10 10 2 2 9 10 6 6 4 10 9 10 7 2 6 9 10 9 8 10 7 10 0 4 1 6 10 1 0 1 8 1 7 3 4 1 1 1 8 1 5 4 10 3 0 9 7 9 3 6 8 1 2 9 9 6 5 6 6 2 8 3 4 9 9 1 8 8 8 5 2 0 7 7 5 8 6 6 5 9 10 6 6 2 0 0 8 0 0 0 6 8 7  [...]
+4 8 5 2 6 3 10 6 0 10 4 0 8 5 6 6 2 2 4 0 3 0 1 7 7 0 4 5 1 5 6 7 8 6 4 2 1 8 6 7 6 10 2 6 3 4 6 0 5 0 7 3 6 0 0 1 8 7 9 7 0 2 2 8 7 2 9 5 4 3 3 9 3 7 1 1 8 5 7 5 4 10 10 0 7 0 6 1 7 6 5 7 5 6 4 8 2 0 10 7 2 1 9 4 6 10 10 8 4 6 10 6 10 6 4 4 6 0 1 6 5 3 1 8 5 6 8 5 10 10 6 5 4 3 5 8 2 0 4 9 2 1 5 7 4 1 6 3 2 1 9 5 6 1 5 1 7 9 7 7 0 2 10 9 8 9 4 5 3 1 10 6 1 10 7 4 9 8 1 7 4 10 7 7 6 4 5 4 3 6 3 1 1 5 5 1 7 2 2 1 6 1 5 5 2 2 4 3 3 1 0 6 2 3 0 5 2 5 0 10 9 1 5 2 2 9 10 3 8 4 3 6 2 2 10 10  [...]
+3 9 7 7 9 9 4 7 7 0 6 0 8 1 2 8 1 0 9 7 1 0 8 6 5 0 10 3 5 5 3 2 2 1 10 3 10 6 0 3 2 3 9 9 2 4 2 2 4 5 9 10 4 6 2 7 8 4 3 10 9 10 7 9 5 7 7 2 1 10 2 9 2 7 8 5 10 9 3 8 8 7 3 2 5 5 2 7 6 5 6 1 10 7 10 1 1 8 9 10 5 6 1 8 4 2 10 9 8 4 8 3 9 8 4 6 5 4 4 5 9 9 10 1 1 8 6 5 5 3 6 7 3 3 6 4 8 9 1 5 5 7 10 9 2 7 8 6 4 7 4 7 8 7 10 2 5 0 0 2 4 7 7 6 3 4 1 10 8 10 4 10 4 2 7 3 1 4 7 6 7 4 10 5 1 8 4 10 7 8 10 3 6 0 2 5 8 8 7 2 3 4 4 1 8 9 4 0 9 7 5 3 4 8 3 9 4 4 10 6 7 4 10 7 4 0 6 6 6 3 6 4 4 5 9 [...]
+9 9 8 7 10 8 5 6 9 8 8 3 1 3 5 0 6 3 9 5 1 4 10 3 9 0 10 10 5 8 5 9 8 1 10 3 10 5 9 4 10 6 2 0 5 7 8 10 10 8 2 2 9 7 0 4 2 10 3 6 6 1 5 10 7 8 8 4 7 1 7 3 9 7 2 2 4 3 4 5 5 1 5 0 4 2 10 3 7 10 4 2 8 3 6 6 2 9 2 3 8 7 8 5 0 2 9 0 7 8 5 1 3 2 3 10 6 1 0 1 7 3 3 7 0 7 7 7 0 1 7 7 9 7 8 1 5 10 8 6 8 4 9 4 2 5 4 8 4 0 0 9 10 1 4 6 3 4 8 4 6 6 6 6 5 0 3 9 2 0 4 10 4 6 3 1 3 7 9 9 0 3 8 3 3 4 4 1 6 7 3 3 8 1 4 1 8 9 9 4 2 6 4 2 6 4 6 1 3 0 1 5 3 3 8 9 0 3 5 4 7 5 6 5 4 1 10 10 6 3 3 10 2 8 4 9  [...]
+2 10 5 2 3 5 6 4 3 6 4 9 4 6 10 2 1 5 5 9 10 9 9 7 0 5 0 7 10 10 8 6 8 7 2 3 9 10 4 6 7 8 8 2 5 7 3 6 4 3 3 6 5 10 10 3 5 7 6 5 8 6 2 7 7 0 2 10 6 5 4 3 1 7 1 5 5 7 0 5 8 2 9 2 7 5 0 0 0 3 3 0 0 5 4 4 1 7 4 7 10 9 6 6 8 6 7 3 1 9 7 8 9 3 7 8 3 10 9 5 8 4 1 5 0 0 3 10 10 10 8 0 8 9 3 8 4 10 3 2 6 7 0 1 6 10 8 1 2 8 6 4 10 10 10 8 2 1 0 10 2 0 0 0 10 10 4 2 4 8 1 6 1 6 4 7 2 1 1 5 3 6 0 7 1 8 9 1 7 6 10 9 4 4 1 6 7 3 0 0 5 10 3 0 6 2 2 6 10 2 10 4 2 7 2 9 8 10 4 4 3 4 1 8 2 9 4 6 6 5 6 0 8 [...]
+4 7 8 7 7 8 8 5 2 2 6 9 2 4 9 7 4 2 5 8 10 7 8 10 1 4 5 4 2 3 3 0 1 3 9 9 1 6 9 7 10 10 7 2 2 6 1 7 7 6 5 5 6 8 5 4 10 8 10 1 8 7 3 7 8 8 7 8 8 1 10 5 3 1 1 7 0 8 9 7 5 3 10 7 8 4 1 1 2 9 8 5 9 0 8 6 4 9 8 8 8 8 0 4 1 10 6 3 9 10 6 0 4 10 8 8 7 4 4 8 7 2 6 0 6 3 9 10 2 9 7 5 10 9 7 10 3 6 9 10 1 6 2 5 6 8 2 0 5 1 6 1 3 2 8 0 6 2 2 7 8 4 8 9 1 1 2 4 0 6 10 7 9 0 9 4 10 9 2 2 9 5 8 8 6 3 7 6 5 1 0 7 5 3 7 2 2 9 2 9 0 7 2 0 4 10 0 4 8 3 4 8 7 3 1 6 7 0 5 9 10 8 5 5 10 5 6 9 7 6 10 6 1 6 6 6 [...]
+7 2 2 9 9 4 6 10 1 6 10 6 1 9 7 10 1 0 2 9 2 7 8 8 9 5 6 9 1 8 3 3 2 1 1 8 10 10 9 2 1 2 7 10 1 3 9 9 2 0 1 7 7 6 10 2 5 4 5 6 6 5 1 1 5 0 1 6 2 4 3 7 0 1 8 8 8 4 2 5 3 8 5 1 9 2 8 2 2 9 8 1 2 2 9 1 3 7 0 3 7 5 6 7 2 1 4 5 6 7 1 1 6 7 7 4 9 10 2 3 8 0 0 9 5 10 8 0 7 0 6 0 10 6 3 0 3 0 6 9 10 5 1 10 6 7 8 1 8 9 4 3 7 5 5 2 2 4 5 3 2 6 6 8 10 8 8 8 4 1 9 9 2 5 9 3 9 10 2 0 6 5 7 10 4 9 2 7 5 6 6 7 5 2 3 4 2 9 2 0 3 8 1 10 9 4 9 10 2 8 10 8 0 10 10 7 0 6 10 5 1 7 8 3 2 5 0 0 2 0 10 10 10 4  [...]
+10 7 10 7 7 0 8 2 1 4 7 9 4 5 3 10 6 0 1 1 8 6 0 3 9 1 0 2 2 0 1 2 2 1 10 1 9 10 8 2 9 2 6 1 4 10 8 9 1 10 0 1 8 10 0 10 2 2 9 10 2 2 1 3 10 10 10 2 7 10 3 9 1 2 6 1 6 0 1 2 6 9 9 4 3 10 1 6 6 8 4 5 2 2 6 2 5 7 8 1 7 0 5 3 0 5 0 8 0 8 6 0 3 2 2 5 5 5 5 6 9 0 1 0 6 8 1 9 9 7 6 1 8 10 0 7 0 9 0 10 6 6 4 5 5 9 8 1 8 5 5 6 7 6 5 8 2 5 9 10 1 3 1 7 3 4 0 2 3 0 10 5 4 5 2 4 3 5 0 7 7 0 4 1 5 7 8 10 4 8 7 4 6 3 0 0 6 8 2 3 9 0 4 4 10 0 7 6 5 8 2 9 1 3 4 2 6 4 9 1 5 3 10 6 1 6 1 1 5 9 2 1 1 0 3  [...]
+4 0 1 9 0 10 2 3 4 9 0 10 9 8 5 7 8 7 1 6 4 4 1 8 9 6 1 7 4 2 2 6 4 7 2 4 5 9 3 3 8 6 3 8 6 0 3 5 6 3 9 2 5 8 5 7 10 4 10 10 8 5 4 4 8 5 0 8 0 3 3 8 8 2 1 4 6 3 0 1 2 4 6 3 5 4 3 1 4 10 10 7 6 5 3 0 3 8 7 10 4 6 5 8 5 3 8 9 0 7 8 2 6 6 5 8 2 0 7 8 3 9 9 5 6 10 5 9 4 5 1 2 1 6 8 5 4 10 0 2 0 6 2 2 7 10 7 2 2 3 2 3 2 4 10 2 8 9 4 1 5 0 5 9 9 3 5 10 1 0 2 2 2 3 3 7 7 7 1 1 6 0 5 1 3 1 10 4 4 6 3 8 10 4 7 3 1 7 0 6 1 2 6 4 4 9 5 1 4 9 2 2 2 8 4 5 4 2 3 9 8 10 3 8 2 3 7 8 10 9 3 4 0 10 0 10 3 [...]
+1 3 0 0 3 1 4 2 8 2 2 9 4 9 1 0 3 8 1 3 1 10 5 8 2 7 10 8 10 2 2 8 2 0 7 10 2 10 7 7 1 8 9 10 9 5 0 9 8 6 9 3 7 7 7 7 5 8 9 5 7 5 8 8 8 7 4 3 7 9 1 9 9 3 4 9 6 1 8 4 8 9 4 7 5 4 1 10 2 3 6 10 9 8 5 1 4 0 0 4 10 10 10 3 8 6 1 8 1 10 0 1 10 10 2 1 0 4 0 4 7 5 2 7 7 4 9 5 6 3 1 6 9 8 3 7 7 10 5 0 0 1 1 1 3 2 7 10 10 4 10 4 7 3 9 10 4 8 8 6 7 2 8 0 1 4 5 10 2 7 8 10 4 3 1 7 10 0 5 8 10 1 8 7 5 9 10 0 10 8 9 1 3 9 4 2 5 6 8 3 4 3 3 10 3 0 3 10 1 8 8 10 1 6 1 1 2 10 6 8 7 3 4 9 6 0 9 9 8 10 10 [...]
+0 6 0 1 1 7 3 5 9 6 7 1 8 1 6 5 1 4 3 9 8 6 3 0 9 5 10 3 5 6 4 3 7 9 10 4 5 5 6 0 9 10 7 2 4 6 2 4 6 7 6 5 6 2 10 3 2 8 8 7 8 7 1 0 5 9 9 0 7 1 0 2 1 0 2 5 1 2 6 3 7 6 6 10 10 1 6 8 2 1 10 10 5 8 4 5 7 7 10 8 4 2 4 4 9 3 3 6 1 2 5 9 3 3 1 8 8 0 0 4 3 8 7 10 6 6 4 5 5 2 3 10 6 5 1 6 5 7 0 4 10 1 2 9 0 3 2 2 8 10 3 6 4 2 7 2 5 1 9 3 0 3 6 3 9 2 8 6 5 7 7 2 7 5 2 0 6 10 3 5 2 8 5 6 2 1 3 1 7 1 9 3 7 2 1 7 10 5 9 3 2 6 10 10 10 6 9 1 3 3 5 10 5 5 6 3 2 1 5 1 8 4 6 8 6 8 9 1 10 8 5 2 4 5 4 9  [...]
+7 5 3 0 4 7 0 5 6 9 7 10 10 9 4 3 5 1 6 0 4 4 1 6 2 8 7 7 3 2 10 4 7 2 1 5 3 2 3 10 4 1 5 9 1 4 0 5 6 1 0 8 3 6 1 9 4 3 2 7 7 7 9 0 3 0 5 4 0 7 8 8 1 2 5 6 9 2 1 9 5 9 10 9 5 3 9 4 8 8 5 10 1 3 7 7 0 4 2 9 4 9 9 4 5 0 4 5 0 5 10 8 8 7 10 3 7 1 5 3 1 1 9 6 7 0 5 0 6 6 1 2 3 9 9 9 6 1 8 7 2 2 8 10 2 5 3 4 3 1 4 7 7 3 3 8 2 10 9 10 9 4 1 6 10 2 7 9 5 0 0 5 8 4 2 9 5 9 7 4 4 6 5 6 2 4 6 2 1 5 10 4 10 5 0 10 7 6 3 6 10 9 1 2 7 6 4 5 10 5 10 1 1 3 0 8 9 0 6 4 4 4 3 0 6 2 1 6 0 7 5 8 5 9 0 6 6  [...]
+8 6 0 5 5 0 3 1 9 5 1 7 1 10 2 5 6 0 3 4 9 10 1 3 0 6 0 3 0 1 7 4 9 4 10 7 0 0 10 7 9 2 5 3 3 3 4 0 2 9 3 7 3 5 0 5 6 5 6 6 0 9 3 3 2 8 4 4 2 2 7 8 8 7 5 5 6 3 9 4 1 2 8 2 8 10 10 3 1 3 4 2 8 8 1 10 10 5 5 2 7 3 10 10 8 4 4 4 0 6 2 4 7 5 7 1 4 10 6 6 8 7 8 9 8 9 2 0 5 2 3 0 0 2 1 8 6 8 6 8 8 1 2 10 8 7 8 6 2 5 9 8 5 10 3 10 2 9 3 0 2 0 6 1 10 10 10 3 10 9 1 5 4 0 1 4 0 6 10 0 8 6 7 9 1 6 8 9 2 7 0 3 5 7 3 7 4 7 10 1 5 9 4 9 3 6 4 0 1 4 9 9 10 2 4 8 3 3 5 3 8 3 8 10 9 6 9 4 8 6 7 4 7 8 3  [...]
+1 5 3 9 2 6 0 5 0 6 2 6 0 10 3 8 10 10 1 3 9 3 0 7 0 8 9 6 10 5 9 8 6 0 2 6 1 9 4 4 6 3 6 3 7 4 7 10 7 7 1 1 0 4 6 5 5 10 0 5 7 2 3 2 2 10 3 8 2 8 10 7 6 2 1 1 1 7 5 8 10 6 8 8 6 0 9 5 6 2 8 0 4 9 5 3 0 7 5 4 4 4 3 8 4 4 8 3 6 0 9 8 3 7 8 5 7 1 9 7 7 0 8 6 9 2 6 3 2 3 3 9 5 6 10 8 9 6 7 6 4 7 2 4 7 10 9 7 9 5 4 9 3 10 10 2 2 2 3 3 5 3 10 3 0 0 2 4 1 10 5 6 6 2 4 10 0 7 8 1 8 6 1 1 0 1 1 1 10 9 2 2 9 8 0 1 0 10 4 4 1 9 5 4 7 7 10 1 1 6 4 8 2 8 10 6 2 4 5 9 6 3 7 0 2 8 1 5 0 3 7 6 9 10 4 9 [...]
+6 4 6 7 2 5 6 9 2 10 4 7 8 4 6 7 10 7 4 3 2 9 10 10 8 5 7 7 0 1 0 4 0 4 0 0 3 0 1 3 4 0 4 2 4 5 1 0 6 5 7 0 4 1 6 3 0 4 8 2 8 9 8 6 4 9 0 2 10 10 5 7 3 8 1 7 5 5 7 9 7 8 10 8 0 9 8 10 0 0 1 0 1 4 8 3 2 0 10 1 5 4 4 6 1 8 5 6 8 1 0 1 6 10 4 2 10 3 2 5 8 9 0 6 2 9 6 5 9 0 0 5 9 7 6 1 5 2 2 4 6 7 7 8 7 9 7 6 6 7 3 2 1 10 6 5 3 4 3 8 8 8 8 4 10 9 9 9 5 8 7 7 8 10 7 5 4 3 5 6 3 7 9 2 1 0 3 2 4 8 3 1 0 2 2 9 4 10 4 10 8 0 8 10 2 2 5 6 1 6 3 3 4 4 7 9 10 5 3 9 1 2 0 3 3 1 5 2 3 6 1 9 1 8 4 4 8  [...]
+6 6 4 9 1 3 3 7 5 7 0 9 2 6 7 5 5 8 9 7 3 1 5 0 9 10 8 3 9 7 7 4 4 9 1 0 4 1 1 3 10 9 10 7 0 6 7 2 10 1 1 1 10 2 2 6 9 5 7 4 1 3 3 2 2 0 3 3 7 2 2 2 4 5 4 3 8 4 1 1 10 3 9 1 2 8 9 0 7 3 2 3 3 6 2 2 9 2 7 7 6 5 0 10 4 10 2 9 4 3 1 9 6 9 2 6 10 6 2 10 7 1 1 3 0 6 8 9 9 4 0 7 6 7 1 0 4 5 3 3 6 0 8 6 8 7 9 6 7 6 6 4 4 10 9 0 1 3 3 8 8 3 2 6 3 1 0 0 9 1 4 3 1 3 10 5 5 4 0 9 0 10 7 10 3 0 8 2 4 6 2 4 6 0 3 1 0 3 9 0 4 0 9 3 2 7 7 10 3 7 8 7 4 10 4 9 1 3 8 4 4 3 10 2 9 8 1 8 4 1 1 0 7 5 10 0 3  [...]
+6 3 2 6 1 0 2 5 2 4 8 4 4 5 3 0 0 6 4 4 0 5 7 9 7 2 4 0 5 8 6 7 5 10 3 9 4 1 6 4 7 9 2 1 1 7 4 0 2 3 10 4 3 7 4 6 7 10 5 8 5 1 2 7 0 0 5 3 9 3 6 6 10 5 6 10 4 6 9 3 10 6 7 4 6 8 0 7 9 8 0 2 2 3 10 4 10 8 3 9 5 1 4 7 0 3 6 3 5 10 9 7 3 5 2 1 4 5 3 1 6 8 5 3 6 9 2 8 1 9 1 2 5 10 2 5 9 2 6 4 6 7 6 4 7 8 6 10 3 2 1 6 3 9 10 6 6 4 10 10 6 10 5 6 2 0 2 3 1 3 6 0 2 4 2 1 0 10 1 3 3 8 4 10 10 10 6 10 3 4 0 9 1 4 9 3 2 5 0 5 10 2 8 1 3 6 5 0 7 1 3 2 3 3 1 1 1 0 2 7 7 6 4 2 2 5 10 10 8 3 8 7 0 3 4 [...]
+10 6 1 10 9 10 9 9 2 1 0 0 3 2 6 4 10 7 0 8 8 8 0 4 10 6 0 9 2 8 0 7 9 1 0 6 3 8 7 2 7 7 7 9 2 9 3 7 3 8 9 7 4 2 4 10 5 1 5 10 1 3 4 6 8 10 6 7 6 1 1 5 1 7 7 10 6 8 4 8 4 2 3 9 8 3 4 7 4 9 8 2 0 9 4 6 5 9 0 7 10 9 0 4 0 3 8 9 0 9 6 5 4 4 7 7 7 8 7 6 2 4 0 6 3 3 3 8 5 7 7 6 1 7 0 3 4 0 6 10 9 6 3 2 10 3 7 3 0 3 10 9 3 9 3 3 3 7 0 5 2 3 10 4 9 8 8 0 3 10 5 2 1 3 6 1 0 7 7 3 8 8 8 8 2 5 7 10 4 7 5 5 0 7 7 0 10 6 6 7 5 5 9 4 8 2 8 6 3 2 6 0 9 5 6 6 9 3 9 6 5 3 8 7 10 10 7 7 7 3 2 8 10 0 2 8  [...]
+0 3 10 0 0 1 4 1 10 9 9 2 7 1 8 5 0 0 5 2 6 10 6 7 9 2 10 9 3 3 6 0 3 0 7 0 3 9 0 6 0 1 5 3 10 2 10 9 3 4 10 7 3 6 8 5 3 0 0 5 7 2 4 6 10 6 5 2 6 7 10 4 1 3 2 0 5 5 4 3 4 4 0 0 0 7 8 7 0 0 4 1 3 1 0 6 6 4 3 9 4 6 1 2 2 6 5 4 6 9 4 7 1 10 10 7 8 9 9 10 8 4 0 10 6 6 6 9 1 0 5 6 9 2 10 4 2 2 0 3 2 4 6 4 5 1 8 3 1 6 5 2 4 0 2 7 10 4 1 0 3 10 6 10 5 8 6 9 7 9 4 6 4 10 2 2 8 7 8 4 2 8 3 7 1 2 8 10 1 7 1 8 5 10 6 9 8 9 8 4 10 2 3 2 2 9 2 6 3 6 1 3 3 3 5 5 1 0 10 2 3 7 2 6 5 5 1 3 4 7 9 10 9 6 2 [...]
+2 7 4 9 8 6 9 4 3 5 7 4 4 9 7 5 7 10 5 3 1 4 4 2 9 1 10 8 0 3 1 1 1 9 3 3 0 2 10 6 6 8 9 8 10 7 3 6 5 5 7 6 2 10 5 10 9 3 4 4 7 3 1 6 9 9 7 0 5 7 7 7 6 9 1 9 2 1 8 2 5 0 9 6 0 5 10 7 6 8 6 0 5 8 6 7 4 7 3 7 10 4 2 8 3 4 0 10 2 5 8 7 2 3 1 7 8 0 3 0 9 2 7 3 4 9 0 5 10 2 2 1 7 2 6 7 1 2 8 7 2 0 5 1 4 4 9 5 1 7 0 6 2 7 2 6 4 5 0 5 8 7 3 0 3 4 2 1 3 10 10 10 8 9 4 1 10 8 5 10 0 2 6 4 10 6 8 2 2 4 10 6 0 5 9 9 7 8 2 10 4 0 8 1 4 6 10 4 2 8 10 3 10 9 6 2 4 3 4 1 1 2 0 1 1 9 3 0 9 10 9 5 6 2 6  [...]
+0 10 3 3 1 8 0 8 6 8 9 8 0 5 6 1 3 10 4 0 2 7 7 2 4 8 6 3 7 7 4 8 2 5 5 3 10 0 10 7 10 10 3 10 8 5 0 8 2 9 4 4 5 8 2 2 1 4 4 2 1 2 5 7 2 2 2 10 5 7 8 2 2 5 6 3 8 2 2 0 8 7 7 7 6 9 9 7 10 3 8 1 5 9 3 6 4 9 5 2 3 1 4 9 5 2 9 10 10 4 6 6 4 9 3 0 3 3 3 6 1 5 9 1 5 2 1 8 7 0 0 1 0 6 0 9 0 0 10 1 4 4 2 1 10 0 8 4 2 1 3 6 1 9 8 8 1 3 6 10 6 6 3 10 6 2 2 1 3 1 8 9 6 0 5 7 2 2 0 10 6 6 5 3 10 1 9 7 10 1 10 7 7 5 6 2 7 1 0 9 3 6 7 5 10 4 1 4 7 8 10 3 6 2 6 2 9 1 7 7 7 8 3 4 3 9 10 1 8 5 0 7 8 8 6  [...]
+2 0 10 1 5 4 6 7 0 4 7 1 0 1 6 6 10 10 2 1 3 5 8 1 2 1 7 1 5 3 6 0 8 0 8 3 0 7 2 0 2 5 2 5 3 4 2 8 10 2 4 1 9 9 10 9 8 4 5 7 8 8 7 7 1 9 4 6 8 9 9 7 10 3 10 10 8 1 4 0 4 4 6 8 8 2 7 10 5 10 7 1 6 8 2 10 1 1 8 8 2 2 7 0 1 5 4 8 9 3 3 10 2 10 1 3 8 2 6 7 0 3 8 0 3 3 0 0 9 10 7 1 1 7 7 7 8 4 5 2 10 0 2 1 8 2 9 4 8 0 0 2 0 10 4 3 6 9 4 4 0 2 4 5 4 4 1 3 0 10 2 10 4 6 0 1 9 2 4 9 1 3 8 3 0 10 4 10 4 5 9 6 1 9 6 3 1 6 8 8 2 3 9 1 1 4 9 8 3 0 8 0 2 1 1 5 5 2 6 3 6 4 9 1 4 8 5 4 3 0 7 6 6 10 2 1 [...]
+4 6 6 9 2 2 2 2 4 4 9 3 3 7 6 6 4 9 0 6 7 7 1 2 9 10 0 0 10 9 10 3 1 5 10 0 3 10 2 0 4 9 3 5 3 4 3 10 4 10 10 7 10 10 0 3 5 2 10 8 10 0 2 8 3 3 2 0 9 0 0 4 3 10 3 4 10 5 5 9 3 8 1 4 4 1 6 8 5 2 4 7 9 7 5 1 4 0 10 6 0 6 10 4 8 9 8 5 0 0 7 10 8 9 7 7 0 0 6 4 3 4 3 10 5 2 3 1 8 9 9 7 0 2 10 10 7 2 10 4 3 7 2 4 10 3 10 2 4 2 4 5 7 9 2 1 7 9 1 6 6 5 10 10 10 2 2 5 6 3 0 8 10 4 1 6 7 6 2 0 9 1 1 6 7 2 2 5 8 8 0 4 1 6 0 9 4 7 7 6 9 2 10 1 0 7 2 3 1 6 2 2 5 4 9 0 9 10 7 10 10 7 2 3 6 4 1 1 5 3 8 [...]
+3 2 4 5 8 8 2 8 8 1 7 7 5 8 3 4 0 3 5 10 4 3 4 2 7 9 10 10 9 3 8 1 6 0 8 7 4 3 5 6 4 8 0 4 3 8 7 1 0 2 3 6 3 2 6 5 6 9 3 0 9 4 7 7 8 1 4 6 4 4 0 9 5 4 8 9 10 8 2 9 1 3 5 1 6 0 1 2 8 7 0 5 0 2 7 0 4 9 7 3 4 8 7 3 5 2 10 0 0 0 7 7 4 9 5 2 4 3 5 9 1 9 0 4 9 6 7 10 5 4 1 9 0 5 3 7 0 6 5 0 8 5 3 8 10 9 2 5 4 9 5 5 5 2 10 5 5 5 3 6 2 2 6 0 0 8 8 6 5 0 9 3 3 3 6 6 7 9 1 3 8 3 7 8 9 7 5 4 6 9 2 10 9 10 3 1 0 8 7 8 7 0 9 7 2 5 1 5 1 2 9 3 1 3 8 0 9 1 9 4 8 8 8 9 4 2 7 1 2 5 6 1 8 1 9 9 6 5 10 2 7 [...]
+4 4 3 0 3 9 3 9 0 7 7 4 6 0 10 7 9 10 0 2 0 8 7 6 10 6 4 10 8 2 4 7 10 4 10 1 8 5 7 2 1 9 5 1 8 10 8 4 8 0 10 7 2 0 10 5 2 5 10 7 9 10 5 0 6 2 5 7 1 9 0 2 5 3 7 7 1 9 10 3 9 7 6 4 8 6 3 0 2 1 3 3 6 0 1 4 5 1 10 6 7 6 4 4 10 9 8 5 0 5 2 6 2 0 7 5 0 0 8 8 1 5 10 6 0 8 6 0 7 6 8 3 4 10 4 4 4 6 0 2 0 0 8 10 1 9 0 0 6 0 6 9 7 1 7 6 8 5 0 3 3 4 5 5 0 5 4 7 10 3 4 8 9 9 5 1 10 8 4 9 10 9 2 3 3 2 9 10 3 4 5 8 9 4 2 6 1 0 7 5 0 8 7 0 4 4 2 4 3 6 8 4 5 5 7 2 8 2 7 9 7 5 3 2 2 3 6 3 9 1 7 2 1 3 1 1 [...]
+2 3 1 8 0 0 5 6 9 7 10 9 5 2 2 5 5 1 7 10 9 3 7 0 6 4 9 8 5 6 1 9 5 4 7 5 9 10 2 1 6 5 3 7 1 1 4 6 3 9 10 2 9 5 2 3 1 4 2 0 7 5 2 8 9 10 9 2 5 2 4 5 3 1 3 3 5 7 8 3 9 8 4 8 1 7 1 1 8 5 1 8 0 4 8 0 10 5 8 1 0 1 7 5 5 3 2 4 8 8 5 0 0 10 5 2 8 10 3 5 7 9 2 9 3 7 5 4 4 2 0 7 9 10 7 1 9 10 6 10 4 8 1 8 9 6 0 6 5 9 10 4 10 7 4 1 0 2 6 8 8 0 8 9 9 3 0 4 8 3 8 10 0 0 6 1 4 4 3 10 2 5 0 3 4 7 6 8 7 2 10 9 9 3 4 7 3 2 9 0 6 1 3 6 8 3 8 8 1 8 10 6 4 0 1 8 9 6 4 0 2 9 6 6 4 2 9 1 4 2 2 7 6 1 2 6 7 5 [...]
+8 0 10 1 9 3 1 4 8 1 6 1 1 7 1 1 2 2 1 3 4 9 8 4 6 3 7 3 3 7 3 3 7 1 2 2 2 2 2 4 4 3 10 9 0 4 3 5 1 5 10 5 1 9 3 8 8 0 9 0 7 10 5 6 1 3 7 7 1 0 6 10 1 6 7 9 6 10 4 3 6 9 5 5 1 4 10 6 2 1 10 0 8 3 2 10 0 10 10 0 6 0 1 7 10 2 3 7 1 6 6 5 10 3 4 1 5 4 1 6 2 0 10 6 10 1 10 6 9 5 0 10 6 10 9 9 9 3 6 5 9 10 1 0 7 3 2 1 7 3 4 8 4 9 10 9 6 0 6 6 10 1 6 1 9 0 5 0 2 7 8 9 9 1 4 4 3 3 9 7 5 5 6 8 0 8 9 3 4 0 10 5 0 1 4 10 7 9 9 10 2 1 0 10 0 0 1 1 1 10 8 10 7 6 8 2 9 2 5 6 0 1 5 3 0 8 3 7 3 4 0 9 2 [...]
+1 10 5 1 0 10 7 0 9 7 4 4 4 9 8 2 5 1 1 9 3 10 6 4 2 0 5 9 10 8 2 4 2 1 7 7 9 5 2 3 7 1 1 10 0 3 1 7 1 2 8 6 2 0 5 7 8 4 8 10 3 2 3 4 8 2 1 7 2 6 8 6 9 7 10 8 1 3 3 6 5 4 5 9 2 5 3 10 5 10 1 10 3 0 6 0 5 1 3 8 7 1 4 0 4 2 2 7 0 2 8 3 1 7 7 6 9 1 3 9 0 5 4 8 5 6 8 7 4 10 3 2 1 3 3 1 8 9 6 0 0 10 9 1 8 9 3 10 10 7 3 0 4 8 8 6 1 7 7 3 1 10 6 9 3 2 0 5 3 2 9 4 10 8 2 6 0 4 3 4 10 8 5 3 3 10 3 5 5 6 5 8 6 9 3 2 0 7 8 0 6 9 4 0 3 4 5 0 9 5 6 8 3 7 1 8 9 9 5 2 7 10 1 6 5 7 6 0 7 5 3 6 4 8 6 1 4 [...]
+3 1 10 9 4 8 10 2 0 5 4 5 7 10 6 2 3 1 5 0 5 3 7 2 1 5 8 0 2 10 0 5 3 10 3 9 8 5 0 1 3 9 2 3 8 0 4 5 4 4 0 0 8 2 2 4 3 1 1 9 9 8 10 0 4 9 7 2 5 7 1 9 8 9 2 3 0 2 8 4 8 5 1 6 9 6 3 5 2 4 10 0 1 7 9 7 5 2 10 3 9 7 3 8 8 10 8 8 6 0 6 7 6 0 0 9 8 2 7 5 5 9 1 4 10 1 10 2 1 9 2 3 7 4 1 4 10 7 7 8 4 8 5 6 5 0 7 2 6 10 3 5 7 1 3 5 8 10 8 7 10 3 5 4 5 4 0 0 0 7 0 2 4 0 0 8 4 1 3 5 4 8 9 1 5 3 9 3 6 1 10 8 3 2 0 4 0 1 7 8 1 7 3 6 5 7 9 0 10 9 1 6 1 0 10 6 6 6 6 6 1 3 8 1 9 7 1 1 8 8 8 6 8 9 4 1 7  [...]
+6 7 6 0 1 7 6 5 5 10 6 2 3 10 9 4 1 2 6 10 4 4 7 2 4 6 9 7 6 0 1 1 5 2 5 6 8 2 7 8 9 4 7 4 9 7 3 10 6 9 10 7 6 3 4 8 5 7 10 3 9 4 0 1 5 1 4 8 0 3 7 9 7 5 7 7 4 10 7 0 10 2 1 1 3 2 6 8 8 7 2 10 8 5 4 6 2 3 3 6 6 0 9 7 0 9 5 1 2 9 6 1 9 8 4 0 6 9 2 5 0 4 4 8 1 1 5 0 8 6 3 7 6 1 10 4 0 7 6 0 3 7 6 5 7 10 7 0 0 1 0 0 6 5 5 10 10 3 9 1 7 8 2 7 10 8 3 0 7 10 0 4 1 10 8 9 6 4 5 6 8 1 2 3 9 9 7 7 1 8 10 5 8 2 1 10 0 8 1 8 7 7 0 10 9 3 0 2 5 9 0 8 7 5 2 0 5 10 10 3 10 4 3 2 3 9 4 6 6 2 7 4 1 4 1  [...]
+5 5 9 4 0 4 1 2 6 5 4 9 8 2 2 2 2 7 8 7 3 2 1 10 5 8 5 1 1 2 0 2 2 0 4 5 10 0 7 4 1 7 10 6 0 9 7 0 3 1 3 6 3 6 9 8 2 0 5 9 9 7 6 6 3 4 2 2 2 3 4 10 5 9 0 10 5 9 4 7 5 3 3 3 6 5 0 1 7 2 2 1 6 9 6 1 8 3 8 3 4 10 4 8 10 8 6 7 5 10 6 1 7 10 0 1 0 7 9 8 5 0 4 10 5 5 2 5 2 10 0 5 6 3 6 5 7 10 0 6 4 10 5 7 2 8 1 7 0 4 9 5 6 0 0 1 6 6 0 3 6 5 3 6 3 3 7 1 0 3 2 2 6 3 2 10 8 9 0 4 8 9 6 9 9 7 10 4 9 0 5 10 6 7 5 4 1 2 10 9 4 9 5 1 5 3 4 5 7 2 5 2 2 8 2 6 7 10 3 5 6 2 9 0 8 5 0 8 3 1 1 10 3 0 0 8 1 [...]
+6 8 6 5 8 1 7 1 6 7 3 7 6 0 7 10 6 1 10 5 2 5 8 8 2 7 1 9 2 1 0 3 1 8 7 3 10 7 2 2 0 9 0 6 1 6 6 1 9 0 7 10 7 3 9 6 6 8 6 10 5 3 6 6 2 3 6 0 2 3 6 6 7 4 7 1 1 9 3 3 4 6 10 3 8 2 8 0 2 7 7 1 0 1 6 7 7 0 9 9 0 8 5 7 8 10 9 6 9 2 1 2 2 6 3 7 9 7 3 8 4 0 9 2 1 10 5 6 6 1 1 5 0 6 8 2 1 4 7 0 7 9 8 3 0 5 7 4 5 7 3 9 8 6 6 2 5 8 8 10 7 9 6 8 0 7 10 3 1 5 5 0 0 2 9 7 3 1 9 6 10 8 9 5 6 7 4 3 5 9 7 2 8 4 10 9 5 6 5 5 8 7 8 5 1 3 1 6 6 10 6 0 7 5 1 5 4 8 10 5 9 2 1 4 10 10 6 9 4 5 8 2 10 1 0 5 7 5 [...]
+6 5 9 0 3 4 3 1 10 2 4 6 1 5 8 2 1 7 5 7 7 10 9 4 6 7 7 3 2 3 4 0 4 9 5 9 0 3 10 9 2 2 0 4 7 8 6 1 4 3 7 1 7 6 1 1 6 9 0 3 0 3 5 4 8 3 0 8 2 8 0 0 9 2 5 9 5 10 4 8 7 9 0 7 6 7 2 4 10 1 0 4 3 10 8 1 10 2 4 0 6 8 3 5 3 5 5 10 7 0 10 1 6 8 9 7 4 9 1 10 4 6 0 4 7 7 3 7 8 9 4 3 2 0 2 0 4 2 0 3 2 5 10 9 5 2 2 9 6 4 6 8 1 3 7 5 4 1 2 0 2 5 4 2 6 2 8 5 7 10 3 10 7 0 6 1 10 6 4 8 1 10 5 7 9 10 3 1 3 3 3 4 8 0 3 5 2 4 0 10 1 6 8 2 2 4 8 9 3 2 2 10 4 9 0 3 6 5 5 4 10 4 0 3 8 3 3 4 3 3 3 3 0 3 1 10  [...]
+10 2 9 6 6 5 3 2 9 5 3 10 7 5 3 4 5 7 9 4 5 7 3 7 8 3 1 5 0 9 7 0 5 6 4 1 7 4 0 5 5 9 6 8 5 2 3 9 8 4 2 8 1 2 9 4 1 7 6 5 7 8 6 1 0 4 7 7 2 7 1 2 5 2 6 2 5 8 0 10 0 9 7 2 7 6 4 2 4 2 5 10 9 5 10 10 6 0 4 10 6 10 2 2 2 5 9 2 3 9 0 1 6 7 10 6 4 0 2 4 9 9 4 2 6 0 3 10 9 9 2 10 7 3 4 8 0 7 8 7 5 10 10 6 10 3 0 3 2 0 8 7 3 4 3 6 10 2 9 2 4 0 2 6 9 10 6 1 6 7 8 10 2 1 5 2 6 3 6 2 5 8 2 9 1 3 2 10 3 10 7 6 8 6 2 7 3 3 3 8 3 5 10 9 1 1 9 0 9 10 10 6 0 5 5 0 6 5 5 5 5 7 4 3 6 6 1 3 3 3 0 6 1 10 5 [...]
+8 4 1 8 8 7 7 9 1 2 1 0 3 8 2 10 3 7 5 6 2 4 0 4 4 1 1 2 8 8 4 2 3 10 8 8 2 3 7 8 6 5 8 4 2 6 7 5 1 6 5 6 2 2 7 2 9 8 8 3 3 1 1 2 3 10 8 7 5 1 2 9 9 7 3 10 4 9 1 5 2 8 1 7 5 9 2 4 8 1 7 7 7 2 2 6 2 1 4 9 1 1 0 2 5 3 3 9 7 5 0 6 0 9 4 10 3 5 10 10 9 8 2 3 3 9 4 5 6 3 10 4 8 4 9 3 4 10 0 7 0 3 6 3 7 3 1 10 0 4 0 5 6 10 8 2 8 4 1 3 6 8 2 8 6 1 10 4 0 9 9 3 3 4 5 8 6 0 3 6 2 7 8 5 3 8 7 4 7 9 2 1 2 2 0 2 8 5 2 8 3 7 9 4 1 6 4 9 10 8 5 4 5 10 3 1 6 3 6 9 9 1 9 7 3 9 0 9 10 9 9 4 2 2 2 3 4 3 1 [...]
+6 7 3 8 2 6 2 1 5 3 4 8 5 1 0 9 9 6 2 1 5 2 4 8 1 4 8 9 6 8 7 3 5 4 6 8 7 0 3 8 8 9 0 6 5 3 7 3 9 2 7 1 9 1 10 10 7 5 7 3 4 1 2 1 0 1 6 5 6 1 7 2 10 5 1 3 3 5 8 7 1 2 7 8 9 9 4 9 3 3 6 1 2 4 5 7 1 1 5 4 8 10 10 10 0 5 4 4 9 5 9 0 9 6 7 2 9 5 8 9 0 8 10 4 6 7 5 8 6 4 5 2 9 2 10 1 6 3 10 4 4 2 7 0 4 4 2 6 7 8 8 8 10 5 5 4 4 4 1 6 1 4 3 5 2 2 6 8 9 3 10 8 3 8 1 6 8 8 4 3 1 0 3 6 7 4 7 9 8 6 1 8 8 7 2 6 3 10 8 5 6 2 7 9 6 1 6 5 8 1 7 0 8 6 4 2 1 1 7 10 9 3 9 1 3 10 7 9 0 5 6 1 5 8 5 1 1 1 2  [...]
+3 9 2 8 1 1 9 4 6 5 6 0 9 10 2 2 10 6 8 8 9 8 10 5 6 1 9 7 0 9 7 9 2 7 1 9 9 9 4 10 7 1 0 5 0 2 2 5 8 9 2 2 4 1 7 3 5 4 5 8 9 9 6 4 10 1 4 10 4 4 3 7 9 0 6 7 5 7 9 1 10 3 10 10 1 7 0 5 3 6 9 6 5 7 8 0 1 6 0 9 0 3 0 3 6 8 1 4 0 4 2 5 10 10 9 2 1 2 8 8 0 6 6 2 1 2 10 10 3 8 10 10 1 2 4 3 2 6 3 9 1 7 2 1 1 6 6 0 3 3 5 5 6 6 10 9 10 7 8 4 8 1 7 1 8 10 9 0 1 5 4 6 1 4 9 6 1 6 9 10 10 4 1 2 2 8 8 2 10 4 2 5 0 10 1 8 1 0 8 3 7 1 0 1 10 8 5 1 7 4 7 0 8 7 3 10 6 6 0 2 10 10 5 2 1 4 0 8 5 9 4 9 1  [...]
+7 10 8 6 8 9 8 6 9 5 1 8 8 1 6 8 4 7 1 5 1 5 6 8 4 9 8 1 8 2 0 1 3 8 10 4 6 7 5 6 8 7 4 4 1 0 9 0 6 2 4 8 2 3 9 5 4 3 8 9 8 10 10 2 0 10 7 2 8 4 8 5 9 4 9 0 10 8 0 5 10 9 10 7 5 3 5 0 7 2 10 4 6 2 2 10 10 8 6 3 3 8 5 8 7 1 9 6 3 10 7 5 8 7 7 1 1 2 0 9 5 8 8 7 5 5 1 5 8 7 6 9 5 4 10 2 8 3 7 2 4 5 2 7 1 1 1 1 4 9 4 9 8 8 9 2 5 1 3 3 0 10 1 2 1 6 1 7 6 6 5 6 5 7 6 10 7 8 8 4 0 5 9 1 5 4 9 6 4 7 8 1 7 7 5 4 4 5 8 4 0 2 4 1 9 3 2 5 9 8 0 9 6 2 2 5 0 5 10 9 6 7 3 3 4 5 0 7 2 3 4 1 7 2 9 9 4 7  [...]
+9 1 2 3 1 2 4 6 6 7 9 10 8 0 4 2 2 2 1 3 2 6 8 0 10 10 10 9 9 3 4 8 10 3 5 10 7 7 10 9 7 10 4 2 10 10 5 4 6 1 1 8 8 8 1 5 10 2 7 9 10 2 8 10 2 2 10 5 8 0 6 4 2 4 3 2 9 7 1 7 10 1 3 9 10 10 8 9 4 10 4 8 0 6 3 8 4 4 0 8 9 9 1 8 3 2 3 4 10 7 8 7 10 5 10 2 8 4 6 9 10 4 4 8 5 1 7 4 3 3 3 10 10 1 5 2 6 7 2 0 4 0 10 5 7 10 2 2 10 7 10 8 8 6 10 4 8 4 2 4 5 3 10 10 10 9 5 0 7 9 2 4 2 9 6 9 10 5 2 2 7 6 8 0 6 10 9 4 8 2 7 4 10 10 9 0 10 2 7 1 2 8 6 8 10 1 5 5 1 9 3 7 4 9 9 0 3 2 1 1 0 2 5 4 1 6 7  [...]
+3 6 9 3 3 4 6 2 8 3 2 0 9 6 8 10 10 3 10 8 5 5 10 3 10 2 7 5 2 5 7 9 10 8 6 2 5 8 2 6 4 1 10 7 7 6 4 6 6 8 2 9 6 2 10 6 7 7 9 1 1 0 8 6 0 9 7 10 10 9 3 1 0 1 9 1 2 0 0 4 0 9 9 5 9 6 10 7 1 9 1 9 4 8 1 7 10 4 7 4 2 7 1 4 3 5 9 9 3 3 10 6 0 9 2 8 8 2 0 5 1 7 10 4 8 9 5 7 2 5 0 9 1 7 2 4 0 7 3 9 4 2 2 6 6 7 4 5 2 3 0 10 6 7 1 1 7 8 6 0 6 8 2 2 9 7 5 4 7 8 9 7 0 9 5 3 7 10 7 0 3 5 10 5 0 8 5 8 5 4 0 2 4 4 5 8 8 2 4 8 2 5 10 1 2 1 9 2 4 1 10 8 9 8 1 5 2 4 1 5 1 3 0 6 0 5 3 9 9 7 10 4 8 4 7 5  [...]
+7 3 10 3 6 5 2 8 10 3 10 5 8 7 2 4 1 5 0 10 3 6 4 8 10 5 0 5 7 1 0 7 10 8 10 1 6 9 6 8 0 7 3 4 10 8 9 2 4 1 9 0 9 7 2 1 1 4 5 2 0 3 4 2 3 2 9 7 0 9 10 5 2 10 7 7 4 8 2 4 6 5 1 10 1 5 3 7 5 10 2 4 0 10 2 4 0 5 5 1 9 0 10 6 8 0 0 4 0 9 10 3 0 7 7 2 4 10 9 10 0 1 8 10 9 2 2 6 5 3 5 5 6 5 5 9 6 6 8 0 5 9 3 2 5 0 8 5 10 2 0 3 8 1 9 10 9 7 3 9 6 3 6 3 1 6 1 4 1 5 6 10 9 1 2 8 0 3 7 7 0 1 3 0 0 3 4 4 2 4 7 9 3 3 1 9 8 7 3 10 10 10 5 8 7 2 2 5 9 0 2 7 8 3 1 0 4 9 3 9 6 1 0 3 3 7 6 9 3 7 7 0 2 10 [...]
+9 10 8 4 6 4 1 1 0 10 9 7 1 3 3 2 6 10 5 6 4 4 3 2 6 4 3 1 9 5 9 6 7 6 4 10 2 1 1 0 8 3 2 4 1 2 2 8 7 7 10 5 2 9 1 1 4 4 0 0 9 7 1 1 0 7 1 4 3 6 6 0 5 2 4 9 10 4 0 3 1 2 8 8 2 10 6 8 8 4 1 3 0 10 1 9 6 3 0 2 0 0 8 9 7 6 8 3 3 5 9 6 9 8 1 2 2 7 5 10 0 4 8 9 0 1 1 7 4 3 5 3 0 1 4 1 8 8 10 7 0 2 4 6 4 0 9 8 7 5 3 0 5 6 2 6 1 5 8 3 6 3 10 0 5 1 10 10 0 0 2 0 2 0 2 5 5 8 2 6 10 7 8 1 10 8 0 9 0 9 2 2 3 1 1 6 9 9 3 4 1 5 7 0 1 9 1 10 2 1 3 8 1 7 2 4 2 10 9 4 10 0 7 2 8 6 0 10 4 4 6 5 0 2 6 7 9 [...]
+10 7 3 9 10 7 4 4 0 5 7 0 5 3 5 3 1 1 9 2 2 6 1 4 5 10 1 1 3 4 2 4 4 7 10 10 5 7 2 4 2 9 10 9 5 1 7 5 1 7 8 5 4 8 10 8 2 7 9 3 0 5 9 7 10 3 1 7 6 5 5 5 4 3 6 9 1 3 2 5 2 5 2 2 1 3 6 9 8 7 6 6 0 7 4 6 6 3 8 6 2 6 10 7 0 10 7 2 8 9 3 2 6 5 1 8 8 1 5 3 8 1 9 0 4 3 3 1 1 3 1 5 5 8 6 9 1 0 6 5 4 7 1 1 7 10 9 10 4 9 4 7 5 3 1 8 4 1 4 4 7 7 8 7 2 10 5 8 5 7 2 4 8 1 8 9 2 7 9 6 9 1 0 10 7 3 10 6 6 5 4 8 5 4 10 1 8 7 10 6 1 10 0 10 9 2 6 10 6 7 5 6 8 0 6 8 10 7 10 6 6 4 2 4 3 3 8 8 5 6 0 3 1 6 6  [...]
+8 10 6 10 8 5 3 3 8 1 2 5 7 2 9 8 7 6 6 3 5 8 6 3 7 4 8 9 0 1 4 8 2 1 1 6 9 9 5 2 2 8 6 6 3 6 5 0 5 4 1 2 1 4 8 1 3 3 0 1 6 1 5 10 1 3 8 1 0 7 1 9 3 4 8 5 3 4 5 5 2 6 5 10 2 5 3 2 5 10 3 7 7 0 5 8 2 10 5 9 10 8 1 0 7 8 6 0 3 8 6 9 6 2 8 4 9 8 2 8 7 0 2 10 9 4 8 3 7 5 7 1 3 6 8 7 10 8 8 3 7 1 9 1 4 8 5 0 8 10 3 1 0 5 9 3 5 1 6 6 10 1 10 2 1 6 6 8 10 8 9 4 3 3 1 8 8 10 6 5 2 2 6 4 10 3 7 3 5 1 10 6 0 4 0 4 5 1 8 1 9 5 1 5 7 8 8 4 5 8 6 8 5 10 2 0 9 6 9 1 8 9 7 9 4 9 4 2 7 9 10 4 3 0 10 7 8 [...]
+1 5 0 10 4 8 2 1 9 2 3 5 0 3 5 5 9 7 0 3 10 4 3 5 7 9 0 8 9 7 7 10 8 2 4 8 6 9 1 6 1 5 3 0 2 3 5 9 0 8 9 5 4 5 8 6 8 7 3 0 1 7 2 6 10 5 7 0 4 0 1 10 2 3 6 0 3 0 1 10 0 7 10 9 7 5 6 7 6 8 4 3 9 1 3 5 0 5 10 8 1 3 5 2 8 1 4 2 10 8 8 6 0 1 10 4 4 7 3 4 7 9 3 4 1 4 10 8 0 3 10 2 8 0 0 8 2 8 10 9 9 3 9 4 7 8 5 2 9 2 1 3 1 7 10 10 3 9 1 6 4 4 5 8 6 5 1 2 5 2 9 8 1 5 7 0 3 7 4 4 4 2 6 10 10 0 8 6 9 3 5 8 1 1 2 4 3 3 4 1 10 2 8 3 3 6 2 5 2 7 8 0 0 7 0 2 4 5 1 10 1 1 9 7 3 0 5 0 6 5 4 8 5 4 8 10  [...]
+8 6 7 4 1 6 2 8 6 0 2 5 1 8 8 6 9 6 8 5 0 0 1 3 7 8 1 9 5 9 10 1 1 5 8 10 6 3 5 10 9 10 8 10 10 10 10 0 7 5 2 3 3 1 2 0 4 6 10 5 7 8 1 5 1 0 2 10 1 7 4 9 5 0 6 4 10 0 1 5 5 5 8 2 2 0 1 9 6 10 4 4 3 3 5 2 5 3 3 6 4 6 7 9 7 0 4 0 6 6 7 9 2 5 10 0 4 9 4 9 10 7 1 1 4 10 1 6 2 3 9 5 0 9 5 2 5 6 8 4 3 7 5 9 8 1 6 4 1 0 10 10 9 9 2 8 3 6 8 5 9 1 6 3 9 2 1 5 8 4 7 2 9 4 0 2 8 4 6 8 5 7 6 9 10 3 5 0 4 3 0 7 2 6 5 8 5 1 0 6 6 1 1 1 8 6 7 8 9 8 7 3 10 2 5 5 10 0 4 5 6 8 2 5 3 2 4 9 7 6 6 2 10 4 0 1 [...]
+7 9 0 8 6 6 1 0 6 3 5 9 9 2 8 1 7 10 5 7 6 8 8 6 8 6 0 3 3 2 9 10 1 6 5 7 2 2 7 1 5 4 8 0 8 6 7 4 8 0 1 5 2 0 4 7 2 4 3 2 3 0 6 5 9 2 4 9 3 5 4 8 1 0 10 10 1 0 5 9 9 3 2 10 0 7 10 4 9 6 9 4 1 4 9 4 0 8 4 5 1 10 7 1 7 3 2 4 6 1 0 5 7 8 0 4 5 2 8 3 7 3 4 4 10 8 7 5 5 0 0 1 10 10 1 4 10 1 4 9 1 9 7 5 1 1 4 9 5 0 2 0 9 1 6 0 7 3 9 7 7 3 5 7 8 0 7 10 6 8 3 5 5 10 7 8 7 5 8 8 7 8 7 6 6 4 6 0 4 6 2 3 9 7 6 4 3 8 2 0 5 5 4 9 10 1 8 9 10 9 7 1 7 5 10 6 6 1 4 3 10 2 6 6 3 8 0 7 1 1 2 8 1 5 10 7 8  [...]
+10 0 0 7 10 3 8 2 6 6 6 0 3 5 1 1 8 4 0 6 1 6 9 5 10 0 1 10 9 7 2 1 3 10 6 8 8 5 7 7 6 2 3 9 5 10 8 1 4 2 0 5 6 10 9 8 5 6 10 6 2 4 6 9 5 2 1 2 1 0 7 5 2 9 3 4 3 0 9 9 1 4 10 2 4 3 10 1 1 0 4 4 8 0 7 5 5 4 1 9 2 0 7 2 0 3 3 7 9 7 4 5 7 5 6 0 1 2 7 8 1 3 1 8 6 8 0 3 1 5 3 1 10 6 0 7 1 8 4 8 9 0 2 4 4 2 0 2 8 4 3 0 2 8 4 2 2 0 3 3 6 10 2 1 2 2 0 0 1 1 2 4 10 2 9 0 7 7 0 3 2 3 1 10 7 3 1 1 3 5 6 5 10 6 0 4 4 10 9 4 1 2 10 0 9 3 4 10 0 6 0 7 1 10 10 6 8 6 7 9 6 1 2 3 9 4 8 5 2 1 0 7 4 3 1 3  [...]
+2 4 7 5 8 7 9 8 7 4 0 6 1 4 4 3 10 8 6 1 1 7 9 10 8 10 6 6 0 10 4 3 0 5 3 10 0 7 0 6 1 9 7 1 6 0 5 0 8 0 10 5 8 5 5 4 3 1 9 1 7 2 1 10 8 2 10 3 8 2 1 1 6 0 9 7 7 5 7 9 5 3 1 8 0 1 7 4 5 4 7 5 1 4 6 6 7 9 0 9 7 10 2 2 1 4 1 5 2 0 10 5 2 1 8 4 1 4 4 7 8 6 7 8 3 5 2 3 4 8 3 5 8 8 7 10 2 2 0 7 2 1 6 3 8 2 3 10 6 6 8 5 0 7 9 4 0 7 10 1 1 10 1 5 9 1 10 2 5 2 3 6 5 2 7 9 9 8 2 8 9 8 5 1 5 2 1 7 4 1 4 6 7 7 9 6 9 1 4 10 0 9 8 1 6 0 0 4 9 10 9 0 3 2 5 8 2 1 9 6 3 0 5 10 2 4 7 5 6 0 1 9 7 6 1 3 2  [...]
+0 6 0 7 8 2 0 8 7 8 0 6 4 10 5 1 2 1 3 2 1 5 1 0 6 1 7 5 10 3 10 7 9 3 9 7 1 0 8 4 5 7 7 4 4 4 8 1 10 5 10 5 0 3 9 10 10 6 6 0 8 5 6 6 1 5 1 6 10 6 0 1 0 1 9 6 0 10 8 8 8 3 2 0 2 7 7 8 6 4 10 1 7 0 4 4 1 1 4 4 7 7 5 7 7 2 1 8 7 8 1 9 6 5 0 7 6 8 6 9 1 5 3 8 10 9 9 0 9 8 5 1 0 0 4 6 2 2 2 2 4 3 9 9 2 10 9 1 2 5 4 1 4 3 10 10 3 3 10 7 9 7 3 5 6 7 5 4 7 1 5 8 7 7 9 0 0 2 4 5 3 9 5 0 4 1 1 8 0 9 7 5 6 5 9 4 1 5 6 1 6 7 6 2 2 10 2 0 8 10 2 0 1 9 3 6 6 4 1 0 5 3 6 9 6 3 3 6 8 4 5 2 7 7 7 3 0 1 [...]
+7 1 8 7 0 10 8 8 6 7 9 4 1 8 3 2 3 1 3 0 2 2 3 7 4 1 9 10 3 0 9 0 3 3 1 0 9 4 7 10 2 1 9 8 7 4 5 7 0 5 5 0 9 6 2 2 3 5 3 5 2 1 9 6 4 2 8 8 2 6 2 9 3 3 8 1 8 3 9 9 4 8 8 9 5 2 6 4 7 10 10 8 5 1 0 10 9 1 8 0 10 7 9 0 5 2 0 7 10 4 1 4 3 10 7 4 2 10 9 5 0 7 5 7 9 0 5 0 0 2 2 9 0 10 6 0 8 5 2 3 2 6 2 1 5 10 9 1 3 6 7 5 9 10 10 2 8 0 9 3 8 6 8 7 9 8 1 7 7 0 10 4 8 8 6 4 0 3 6 3 2 1 2 5 8 6 10 7 8 7 9 9 1 0 5 7 2 5 9 7 0 8 6 3 3 5 3 5 1 8 8 9 9 10 3 0 0 10 0 5 8 3 8 5 8 5 0 0 5 2 0 8 10 9 1 5 1 [...]
+0 2 5 3 6 4 4 6 8 0 6 3 6 5 4 1 2 2 5 6 10 6 9 3 10 10 1 10 6 8 5 7 10 9 8 9 5 3 1 4 8 4 0 2 4 6 10 1 8 6 6 10 4 1 6 1 4 0 10 7 8 5 4 0 3 8 8 1 4 1 8 7 8 7 1 0 5 7 3 7 7 10 10 8 7 8 3 10 10 0 2 10 0 9 9 1 5 0 2 10 6 9 8 1 1 0 5 10 4 7 1 4 8 4 5 5 8 2 5 7 3 10 2 10 6 10 1 7 9 8 6 7 3 3 5 8 8 8 6 0 7 6 4 1 1 1 1 1 5 3 0 0 10 10 8 4 4 8 5 6 2 7 0 5 8 0 6 9 7 6 3 1 4 8 10 10 3 2 8 4 2 1 3 3 7 6 9 3 7 9 2 2 8 1 5 8 1 9 0 9 7 2 4 9 8 10 0 4 0 9 2 10 8 8 4 6 0 0 3 10 7 2 8 2 4 4 2 3 8 6 2 2 8 1 [...]
+3 4 6 9 4 9 10 2 6 4 4 9 8 2 7 2 1 4 1 7 3 8 4 6 2 5 5 7 7 3 6 1 1 8 7 5 6 0 5 10 9 4 8 7 8 3 6 0 0 5 0 3 7 5 5 4 6 2 10 9 8 4 10 10 7 8 7 5 5 1 8 10 2 0 5 6 1 2 3 10 8 5 3 9 6 7 5 9 5 9 9 6 8 4 2 2 3 3 8 3 5 3 9 5 8 2 2 5 4 2 6 4 8 3 4 8 10 5 3 2 3 4 4 2 1 8 2 0 6 9 10 10 0 1 8 3 2 8 9 4 10 2 3 9 6 2 6 7 4 0 2 3 6 5 3 6 3 1 3 7 9 9 2 5 6 0 10 6 4 5 0 1 3 9 3 4 5 10 6 10 3 4 6 3 10 2 4 8 9 10 6 7 7 0 8 9 5 9 10 6 6 3 5 2 1 10 7 0 1 5 5 4 0 0 10 4 6 7 1 3 2 9 6 0 2 7 9 6 7 4 7 1 9 2 8 1 8 [...]
+8 9 10 1 8 2 10 10 2 9 1 6 3 4 0 0 7 2 2 3 2 8 10 5 6 8 8 9 10 3 8 8 4 8 10 5 0 10 1 10 5 2 5 4 0 4 6 7 1 8 6 4 7 10 3 9 9 3 7 9 2 3 5 5 7 10 5 3 2 2 2 5 7 1 0 9 4 1 10 0 2 3 5 1 2 0 2 5 6 0 8 9 7 7 0 2 3 10 2 1 0 5 4 3 0 6 2 0 9 9 7 4 1 3 0 2 8 2 5 10 4 9 0 3 2 2 4 0 7 2 2 5 5 8 4 7 1 7 1 3 8 5 0 6 5 3 6 3 1 2 3 7 6 4 3 10 8 9 5 10 8 3 5 0 4 4 6 9 5 6 7 3 5 2 4 6 8 10 8 0 3 4 8 7 1 9 9 10 5 10 0 3 3 7 6 8 8 1 9 6 5 9 7 9 9 2 1 6 5 0 8 9 5 5 4 2 0 9 8 9 6 5 8 3 6 4 9 10 6 2 5 2 4 0 10 5  [...]
+1 4 10 3 0 2 2 4 1 0 3 9 5 2 6 8 0 2 3 10 4 4 5 8 0 9 0 8 3 7 3 2 8 4 4 8 3 8 10 3 5 9 5 2 1 5 7 8 6 1 4 6 2 8 7 10 7 0 2 4 3 9 9 8 0 0 3 1 0 2 3 0 3 5 8 1 3 0 5 3 6 1 1 6 1 1 0 4 5 4 7 2 7 3 8 3 5 10 6 7 9 2 9 8 10 8 9 1 6 2 3 2 3 7 0 3 0 1 0 6 10 1 5 4 0 9 6 8 8 8 0 1 6 8 10 1 5 8 7 2 3 10 8 10 9 7 2 9 10 1 9 10 5 7 4 10 4 10 9 1 8 1 10 4 8 0 4 4 1 2 4 2 8 1 5 1 4 5 3 5 1 0 3 9 6 7 1 7 3 4 7 4 0 3 6 6 8 0 10 7 7 10 0 7 8 7 1 9 0 4 3 10 0 10 6 0 4 2 2 7 6 6 10 4 4 2 5 1 4 9 5 10 8 8 6 8 [...]
+7 3 5 4 9 9 9 8 0 7 8 1 8 8 9 0 0 8 6 2 5 2 7 7 1 9 6 9 1 5 7 2 10 0 4 10 10 8 1 8 0 2 10 2 4 6 5 7 10 9 2 6 5 5 0 5 1 6 7 1 10 5 8 7 0 8 9 2 2 8 0 9 4 3 9 9 0 10 9 2 9 8 9 9 9 0 6 3 2 3 7 8 2 1 3 1 7 1 1 9 2 1 9 10 3 2 9 5 6 1 10 5 4 6 3 9 8 4 9 4 2 1 2 3 4 8 1 0 3 10 3 6 8 0 2 2 10 1 10 8 7 2 8 2 7 7 0 3 5 3 1 4 10 3 10 0 2 5 6 1 1 4 0 6 9 1 7 2 0 10 10 5 3 9 6 1 1 0 0 3 2 7 2 0 0 0 8 9 1 9 8 3 10 3 6 10 9 8 1 10 0 8 10 5 7 9 5 10 0 3 5 10 7 9 5 3 7 7 9 10 7 3 2 5 10 3 3 8 4 5 0 4 2 6  [...]
+10 0 10 10 1 8 10 3 1 2 10 1 9 6 1 5 8 0 9 6 7 2 9 6 8 9 4 1 10 10 10 0 8 4 2 3 0 2 8 4 7 1 9 4 2 8 2 8 3 5 8 5 5 7 10 7 3 0 7 4 1 3 5 3 7 0 5 7 9 8 3 6 0 8 3 2 0 3 6 9 7 4 8 8 9 1 9 1 0 9 8 3 2 10 4 3 7 6 10 9 7 0 9 9 1 9 5 1 9 1 4 1 6 4 9 6 3 6 8 0 3 4 4 4 8 8 9 7 6 5 4 7 6 3 9 4 6 4 4 2 1 5 7 3 8 4 2 4 5 8 5 8 2 5 1 5 9 8 4 1 8 3 7 8 4 9 5 1 5 7 7 6 7 3 5 3 8 6 8 7 7 8 1 2 2 5 7 5 5 1 4 6 6 5 0 7 8 9 8 7 3 2 0 7 4 2 9 1 6 6 3 7 9 0 5 9 1 8 3 1 9 3 4 10 10 1 0 7 4 3 3 9 4 7 8 8 6 8 4 9 [...]
+6 3 0 4 5 8 8 6 6 6 0 6 4 0 9 4 3 9 4 4 10 2 9 7 7 2 1 7 4 9 1 5 8 5 1 1 3 8 7 5 10 2 8 0 6 5 5 4 7 9 7 9 0 1 0 5 2 3 8 0 9 7 1 4 9 0 8 1 3 9 2 0 2 5 2 7 8 2 10 8 4 1 2 8 2 8 2 8 4 4 0 0 6 5 8 8 0 9 10 10 7 8 2 9 1 7 4 8 7 2 2 5 2 6 6 9 10 4 1 10 8 8 8 7 6 6 6 1 2 6 3 10 8 8 7 9 6 6 6 5 0 4 10 5 10 9 1 0 9 2 6 1 2 9 2 4 0 10 0 5 7 4 0 4 3 1 3 0 9 10 1 8 0 1 5 6 9 3 8 8 1 0 2 6 8 2 0 3 5 10 4 10 0 4 5 2 3 0 1 8 9 5 2 6 2 7 6 4 6 10 2 6 0 4 4 10 4 10 3 7 6 5 4 4 3 1 3 9 4 4 3 2 6 3 1 5 2 7 [...]
+3 0 8 6 7 10 4 6 4 8 2 2 8 9 3 7 6 4 0 6 10 10 1 10 0 9 5 9 6 4 0 4 8 3 10 3 3 7 8 4 2 9 5 5 6 4 3 1 5 0 2 4 10 8 5 5 9 4 7 3 2 0 2 10 3 10 6 0 4 2 3 2 0 1 5 0 0 1 3 7 8 1 6 0 5 0 10 2 8 1 6 3 1 6 10 1 2 4 0 0 9 0 10 4 10 10 10 9 2 10 2 0 2 0 4 3 1 1 2 4 8 4 3 3 6 7 0 5 9 10 6 0 6 8 1 2 3 10 3 2 3 1 8 5 10 0 5 1 9 5 0 3 2 5 4 1 9 5 7 0 1 9 9 7 10 8 9 2 2 1 4 10 1 10 0 6 10 5 3 3 5 9 2 9 6 1 10 0 3 6 2 4 9 5 5 8 7 6 6 4 4 9 5 4 2 1 10 6 9 10 4 5 1 5 0 5 6 9 9 9 10 10 6 8 3 0 10 5 0 0 5 7  [...]
+0 5 10 1 4 10 0 5 4 9 5 0 7 8 9 0 8 4 3 10 0 7 5 10 6 10 5 8 9 2 4 8 5 0 1 7 8 5 1 8 6 8 4 10 2 6 0 9 4 4 10 9 7 8 10 8 8 6 6 1 1 5 5 5 2 0 2 0 2 9 8 9 2 5 5 4 4 1 2 10 5 5 1 0 1 0 6 7 4 7 2 1 3 3 10 7 0 9 1 5 10 4 5 3 10 4 5 8 0 4 5 9 5 6 10 6 9 10 5 3 5 7 3 9 2 1 2 4 9 9 5 2 9 9 6 8 0 2 6 1 0 9 10 10 5 8 3 7 10 4 6 0 0 4 4 9 5 0 1 5 8 6 3 10 1 10 4 9 10 1 5 6 10 4 8 4 10 3 3 9 2 0 3 10 10 6 2 8 4 6 7 2 2 4 8 10 4 2 3 10 6 10 10 1 8 3 9 10 2 0 2 3 9 4 10 7 6 7 5 9 7 5 10 8 8 4 2 7 7 2 9 [...]
+3 9 7 0 8 1 2 10 9 8 3 10 4 10 1 4 8 2 2 7 6 3 8 0 7 6 4 2 4 6 6 10 2 6 6 1 8 0 8 0 8 6 0 3 1 5 0 1 5 5 8 9 5 4 2 2 0 8 10 8 10 8 1 7 7 8 1 8 9 4 7 4 1 3 6 0 0 8 1 0 7 3 5 3 1 5 6 0 5 7 2 9 4 9 9 0 10 9 5 6 1 0 0 1 2 3 0 4 0 10 10 3 7 4 4 10 0 2 5 3 4 5 0 5 2 5 5 10 7 1 1 1 1 8 8 3 3 0 10 0 8 5 0 7 1 9 2 9 6 7 3 3 6 5 8 3 8 10 0 4 1 7 6 7 5 6 4 4 5 4 9 7 0 4 8 6 8 9 0 8 0 5 6 4 0 8 9 8 4 9 8 10 9 1 4 3 5 1 9 1 7 3 3 3 8 8 7 7 9 7 1 4 10 4 6 4 1 3 5 0 4 6 7 6 0 9 5 3 5 8 8 9 9 1 8 9 8 8 6 [...]
+5 6 6 8 8 2 8 4 7 0 3 2 4 9 4 0 8 3 6 0 9 1 0 6 3 1 6 2 2 10 0 8 2 3 6 6 7 9 2 8 0 2 9 6 8 9 4 6 6 5 4 9 7 7 1 7 7 1 5 2 5 0 2 10 8 6 2 6 4 8 7 3 1 0 2 5 3 3 10 4 5 4 9 7 6 2 9 8 8 4 3 10 3 0 3 9 3 3 8 5 9 8 7 8 5 3 9 3 6 1 9 4 1 4 5 4 3 7 5 10 2 6 4 6 10 3 9 10 2 5 0 10 3 6 10 3 10 1 7 9 1 8 3 6 9 8 2 8 6 4 1 8 6 10 4 1 2 9 5 3 2 1 7 10 9 6 4 8 7 0 8 10 7 0 2 1 8 5 8 7 2 8 4 2 10 4 7 10 10 1 10 9 10 4 6 2 8 10 3 5 2 3 2 5 7 4 2 4 4 9 10 6 8 0 10 6 0 10 7 2 4 10 9 7 9 1 7 5 0 9 7 7 7 9 9 [...]
+8 0 5 7 3 5 1 2 7 0 7 3 10 0 0 7 8 10 3 2 9 9 5 0 2 0 1 2 3 10 8 9 0 1 7 7 2 3 9 4 9 5 1 6 9 7 2 5 6 7 2 8 4 9 2 3 4 10 7 3 10 3 10 9 9 1 7 4 8 2 5 9 5 2 6 1 8 8 5 6 3 7 6 6 5 9 10 7 9 7 7 0 9 6 9 1 1 0 4 4 9 1 4 10 0 7 5 3 3 0 3 8 4 10 3 10 10 6 10 9 2 2 8 1 2 10 7 2 2 6 3 8 9 4 4 5 1 4 8 8 8 1 7 1 0 0 6 1 0 4 5 7 4 10 3 2 10 7 3 6 1 6 7 3 3 3 8 5 7 7 0 0 1 5 2 4 10 2 10 5 4 8 1 6 5 4 8 0 8 4 2 6 8 8 4 0 8 6 10 0 8 5 9 8 1 2 7 0 10 4 10 1 4 1 5 8 0 4 7 9 9 5 6 6 3 5 1 9 4 5 2 0 5 3 6 0  [...]
+4 0 5 3 9 8 0 2 6 0 3 1 0 5 1 9 9 4 6 10 9 5 9 1 1 7 10 3 10 9 0 6 10 7 1 1 6 8 5 9 9 7 3 3 7 10 0 0 5 2 3 5 6 7 7 7 9 3 5 7 9 1 3 1 2 2 1 10 9 9 7 9 3 2 8 1 3 8 6 10 8 9 2 2 0 1 2 1 8 1 6 10 3 2 2 6 7 10 1 10 5 3 5 2 0 10 7 0 6 4 6 7 8 0 4 10 8 8 10 3 5 5 5 5 5 10 5 0 3 2 10 10 4 4 8 5 4 3 1 6 0 2 5 9 6 8 6 7 9 3 4 3 7 8 8 5 7 7 3 6 8 7 9 6 10 4 2 7 0 1 10 6 8 1 6 3 1 7 3 6 1 1 5 4 7 3 6 3 10 10 6 0 5 10 0 3 3 5 10 8 2 7 8 8 10 8 7 10 10 7 1 10 9 8 7 7 1 10 9 4 3 1 4 9 4 8 1 0 10 6 7 1  [...]
+4 10 10 8 4 9 5 2 10 6 3 8 4 2 7 2 2 0 1 10 4 0 3 6 10 8 9 10 6 0 7 1 10 9 0 3 4 2 8 4 8 2 4 3 6 7 4 2 4 3 10 8 3 1 4 6 1 2 3 5 3 1 8 5 9 9 10 6 9 4 5 10 8 8 8 3 10 8 7 9 2 3 7 0 3 6 9 1 4 5 8 8 8 10 1 0 0 2 9 4 10 8 6 0 7 0 1 9 4 2 8 0 5 2 7 10 2 8 1 6 8 5 4 6 6 4 6 8 10 7 1 5 1 7 10 7 2 7 4 9 5 3 8 2 1 2 5 3 1 4 6 4 3 0 2 3 6 2 10 3 10 5 7 4 4 10 6 2 6 4 8 7 10 1 6 5 5 5 9 3 8 7 6 3 0 0 10 8 9 4 6 1 8 5 7 7 8 3 4 5 4 2 5 0 1 4 7 7 9 5 4 0 3 0 0 0 5 6 10 0 1 4 3 6 10 6 9 1 7 5 6 2 10 9  [...]
+3 3 3 5 0 2 1 1 3 2 6 0 0 7 7 1 1 10 6 6 7 8 5 9 3 6 0 9 5 9 8 5 6 6 7 1 10 3 6 0 9 7 5 10 8 0 5 5 7 9 5 1 2 7 6 8 10 10 0 4 9 4 8 7 6 1 7 9 8 7 1 0 10 8 1 4 7 7 9 8 6 0 8 7 8 0 5 9 2 7 6 7 4 1 4 9 0 1 8 0 9 10 9 7 10 8 9 0 9 7 4 5 1 8 1 8 0 4 3 5 9 6 7 5 7 0 7 8 4 4 7 3 10 6 8 7 5 7 9 2 1 10 5 3 2 10 8 7 0 2 2 6 4 7 2 2 9 10 8 10 3 3 5 9 8 3 4 9 10 2 5 4 5 3 8 6 5 7 5 2 9 2 1 9 2 3 5 1 1 1 0 10 8 4 8 3 6 9 10 8 7 4 1 10 7 7 0 3 0 3 8 1 1 2 5 1 6 2 1 6 10 10 6 6 0 0 3 1 5 7 0 5 4 2 7 9 0 [...]
+9 10 6 0 3 9 3 3 0 3 10 7 3 0 6 1 2 10 3 4 0 4 3 5 0 7 0 4 6 6 2 4 4 10 8 4 4 4 5 2 9 2 10 9 9 0 8 7 3 10 3 5 3 10 10 6 1 0 0 10 5 10 3 10 0 6 1 9 3 3 10 2 9 6 6 7 8 2 8 4 9 7 7 6 0 1 8 8 5 7 0 9 0 2 4 6 0 2 4 4 8 8 0 0 9 2 4 3 5 3 6 0 8 4 2 7 6 2 3 3 4 8 4 5 1 3 0 6 7 9 0 7 4 0 3 0 5 3 6 0 4 1 2 7 7 7 4 4 4 6 1 5 5 2 0 9 0 5 1 0 9 4 1 9 7 9 8 2 0 1 6 6 5 1 1 7 2 4 8 10 9 4 7 4 7 5 8 1 2 7 10 1 6 9 3 0 8 0 4 5 3 10 2 3 6 10 10 2 1 8 6 7 9 5 1 4 2 4 6 7 0 6 1 4 10 6 0 8 9 8 0 6 9 8 6 2 6  [...]
+10 9 4 10 4 8 6 5 6 9 2 10 1 7 7 2 7 1 7 10 9 3 9 0 3 0 0 9 8 3 3 8 3 8 3 8 4 0 5 1 1 0 8 1 4 3 9 1 7 7 6 7 8 0 4 6 5 10 6 9 4 1 10 9 2 4 5 3 10 3 6 6 1 1 3 3 8 5 0 6 5 1 6 3 1 5 4 5 5 7 10 5 2 3 10 5 10 0 3 9 9 2 10 9 9 4 7 6 1 3 0 8 6 4 4 0 4 10 0 8 4 4 0 0 9 3 8 7 8 6 1 4 5 3 10 1 5 0 9 2 10 4 10 8 8 1 6 8 10 3 1 2 4 4 7 9 1 0 8 5 5 9 2 3 9 5 9 7 10 10 5 4 2 3 7 10 10 9 4 3 9 3 2 6 0 10 1 4 9 3 0 2 4 3 4 8 6 8 9 10 10 7 7 7 1 7 2 4 3 3 4 9 4 6 6 6 7 6 4 9 4 1 0 0 7 3 8 2 3 2 6 10 3 2  [...]
+8 2 8 8 3 3 1 5 9 1 5 2 5 0 5 10 8 5 1 8 5 0 9 4 0 0 6 2 1 6 8 2 10 10 2 8 0 10 0 3 3 10 5 7 3 4 6 7 5 5 10 3 5 1 5 8 3 0 7 1 1 5 8 3 2 2 8 3 4 7 3 0 7 3 3 10 8 4 5 2 7 6 8 9 3 7 0 7 8 9 0 1 4 8 2 7 3 6 3 6 8 3 6 10 0 4 3 6 5 3 6 1 2 10 5 6 6 9 8 0 3 5 8 3 8 3 6 5 7 10 9 4 9 2 3 5 6 10 2 5 0 0 0 6 7 1 6 8 4 4 10 4 8 7 4 9 0 7 6 2 4 8 10 9 4 10 4 7 1 8 2 4 9 0 8 3 5 2 5 8 0 2 9 4 6 9 5 2 8 4 3 5 1 3 6 3 5 4 3 7 7 8 6 8 10 2 9 4 4 9 9 9 8 8 8 1 6 10 2 4 0 10 3 8 2 6 1 8 1 6 4 10 5 10 7 6 3 [...]
+10 6 8 4 4 5 3 2 5 5 3 2 0 5 7 6 9 3 10 4 5 5 1 1 9 9 10 3 0 9 8 3 2 1 8 10 6 3 10 10 6 7 9 3 3 5 7 6 8 2 8 3 1 8 1 4 0 9 6 3 6 2 0 4 8 1 10 10 3 3 5 10 5 9 5 3 1 9 8 2 4 3 6 3 7 4 9 5 8 6 5 8 0 1 0 3 7 1 7 5 8 10 5 1 7 4 8 3 7 7 4 0 1 8 3 9 4 0 9 2 1 6 9 2 3 5 4 2 5 10 5 2 6 2 0 9 1 0 6 5 8 8 5 6 4 4 6 8 10 8 5 8 4 4 5 3 10 1 4 0 5 6 6 6 4 2 7 9 3 1 1 6 2 6 7 5 5 6 0 10 8 3 3 2 1 3 5 2 4 10 5 2 1 10 3 0 8 4 4 0 3 8 5 9 5 3 9 1 0 5 8 7 6 7 2 9 7 5 4 5 0 2 2 2 7 10 7 10 5 0 1 7 3 0 0 6 0  [...]
+2 7 8 10 3 8 4 7 4 0 0 1 7 0 5 1 3 2 1 2 0 0 5 4 0 8 7 7 3 3 3 3 2 9 0 6 1 1 8 4 2 7 6 10 1 8 0 4 9 2 4 8 3 0 9 5 10 1 0 10 5 10 5 10 5 2 0 3 2 0 1 4 3 5 0 3 7 1 6 4 7 4 7 7 3 8 1 4 3 5 6 9 3 1 6 8 6 2 6 0 3 0 7 4 4 2 8 4 2 2 5 7 8 6 8 2 10 6 10 1 0 6 6 6 0 1 1 1 10 10 0 5 1 10 10 8 5 3 5 0 3 3 4 8 5 5 4 2 1 6 0 5 1 0 8 4 3 0 2 3 1 8 6 1 9 0 8 1 5 6 6 2 5 0 6 2 0 5 5 8 4 7 7 1 5 8 6 0 10 10 9 4 6 3 2 6 5 4 1 5 9 5 7 4 9 5 3 2 4 6 6 9 4 8 6 5 8 10 10 3 2 1 1 0 4 10 9 1 8 10 2 0 1 9 3 5 6  [...]
+7 5 5 5 5 5 10 1 3 9 4 8 5 3 8 1 2 2 5 6 3 10 4 7 4 7 8 10 2 3 1 0 2 6 4 1 1 3 0 2 7 2 1 2 9 2 10 10 7 10 7 10 2 7 0 10 1 7 2 4 4 6 7 0 3 3 1 0 3 0 2 4 4 10 9 4 6 7 3 3 0 6 10 7 8 7 6 9 0 8 10 9 5 10 4 1 10 4 10 6 1 1 2 6 8 4 9 1 0 8 6 4 10 7 4 3 5 9 7 0 5 9 3 5 9 10 9 2 0 4 2 10 7 4 1 10 10 6 1 0 0 3 2 1 3 0 4 4 3 2 5 5 8 0 9 6 0 8 8 1 7 6 5 5 2 6 4 4 7 2 7 5 10 7 10 5 4 0 0 2 1 0 6 10 2 3 9 5 7 8 0 9 7 5 4 1 9 5 5 5 1 6 3 7 2 9 9 1 9 9 7 1 2 1 5 8 7 4 0 10 3 6 1 9 5 8 10 3 5 8 1 3 8 3  [...]
+9 10 9 5 7 1 7 2 9 3 4 6 9 10 9 2 5 5 8 7 6 8 2 8 9 6 8 1 0 10 5 8 10 7 4 1 4 8 4 6 2 9 4 8 2 3 5 6 8 8 1 5 7 10 9 4 6 8 6 6 4 8 1 2 5 7 1 5 7 9 5 8 9 3 2 0 3 10 7 7 6 8 10 6 7 9 1 1 1 1 2 4 9 10 3 3 0 0 4 3 7 1 8 10 9 2 0 8 10 1 8 7 10 4 1 7 0 5 1 10 6 2 8 4 8 6 6 10 2 3 4 6 5 1 5 9 6 7 10 8 7 6 6 8 2 4 4 2 7 10 6 6 8 4 7 0 3 0 8 1 4 7 0 4 4 9 8 2 4 5 5 7 8 10 6 8 10 8 3 10 2 10 0 7 7 8 0 4 6 8 2 8 5 0 6 9 6 6 4 9 5 10 2 0 9 1 3 10 0 4 7 1 3 6 3 4 10 2 10 3 2 0 0 9 1 0 4 0 5 3 3 6 2 10  [...]
+8 5 2 3 7 2 1 7 7 6 7 9 4 0 1 5 10 1 2 0 1 3 9 5 4 6 2 4 4 2 3 7 9 9 5 6 4 2 10 3 4 4 10 8 5 2 8 7 6 1 0 3 9 3 10 1 1 6 8 1 10 0 9 6 3 10 1 8 2 3 6 3 7 9 3 2 1 0 4 10 8 3 2 4 8 8 6 6 1 9 9 1 7 7 10 2 8 1 5 3 8 5 10 7 2 5 9 4 4 4 4 7 3 5 7 3 0 8 0 10 7 7 4 2 5 7 9 1 9 9 10 8 10 8 8 10 0 10 10 0 8 6 6 3 5 1 0 9 1 5 4 6 6 0 1 9 8 1 6 2 3 9 1 6 10 10 5 2 10 3 9 6 4 3 0 4 6 3 3 8 7 9 7 2 9 10 4 1 3 5 1 3 1 8 9 2 9 5 6 3 3 6 2 5 3 0 1 7 8 3 2 0 7 9 4 9 7 8 5 3 4 9 5 6 9 6 10 10 8 6 9 4 9 8 5 7 [...]
+2 9 4 5 5 4 7 0 1 1 2 2 10 2 5 0 1 10 3 6 5 0 2 5 0 10 5 3 5 8 5 4 4 8 0 2 3 10 3 4 9 1 4 7 1 8 3 10 8 4 1 3 5 7 3 0 10 8 9 5 5 4 9 0 0 8 6 1 8 2 3 8 8 0 3 1 8 8 3 10 10 2 6 3 1 4 2 3 8 3 6 10 6 5 7 1 7 7 6 1 5 8 9 5 8 8 0 8 8 9 3 5 7 3 9 3 6 4 1 10 8 5 3 8 2 6 7 0 9 8 8 7 2 4 3 0 7 7 7 0 6 3 4 7 8 9 8 8 0 3 7 5 3 8 2 2 9 1 8 0 0 7 2 4 0 4 5 3 9 9 1 6 4 4 6 10 5 3 0 3 9 6 3 3 9 2 5 8 6 8 6 7 10 9 3 9 3 5 4 7 8 7 3 9 6 0 10 10 0 3 5 5 0 6 6 4 0 0 6 6 10 2 8 4 7 8 3 10 1 8 7 5 1 2 1 5 3 8  [...]
+7 0 1 5 4 9 4 3 1 7 7 10 10 4 3 3 9 4 8 2 0 6 9 6 1 6 2 9 0 3 3 2 6 1 6 8 7 4 9 1 9 2 8 5 4 8 1 2 3 5 1 9 4 1 5 4 4 5 7 5 5 9 3 2 9 4 9 9 3 7 5 8 2 10 1 4 4 6 1 7 1 4 3 7 6 1 0 1 9 8 2 5 3 3 9 5 7 2 1 6 3 3 8 0 8 5 2 0 8 10 5 3 7 4 9 0 10 6 6 7 1 4 10 7 5 3 10 2 2 1 9 8 8 7 3 8 6 0 3 0 8 10 8 5 4 8 6 0 7 10 7 1 10 0 9 1 8 4 9 7 7 3 10 8 4 8 5 8 7 10 2 5 4 7 4 3 5 5 1 1 0 5 9 4 5 3 9 0 2 6 9 3 9 9 10 2 9 1 4 6 4 1 8 5 3 3 2 0 0 1 6 8 2 6 2 8 6 2 10 6 3 2 7 2 3 0 5 3 3 5 2 3 4 7 4 2 1 10 7 [...]
+1 7 10 8 9 6 10 2 3 7 4 7 7 10 5 4 5 1 0 2 0 3 10 5 5 0 1 3 2 7 0 0 6 6 4 10 3 3 5 2 5 6 7 8 10 6 6 6 0 5 1 4 6 4 10 9 10 0 10 9 1 7 10 5 9 9 6 9 2 2 8 7 1 1 9 10 1 1 0 8 6 10 7 1 7 7 3 5 4 1 1 9 2 9 0 6 4 7 5 5 8 7 8 9 10 3 1 3 10 8 2 5 9 2 4 2 0 2 0 8 0 4 6 0 5 7 0 5 1 10 4 2 1 9 0 2 5 10 0 5 8 8 7 8 10 7 2 9 8 5 3 10 9 4 4 6 1 8 5 2 0 10 10 2 9 0 2 5 3 6 7 3 9 6 2 4 0 10 9 9 8 9 2 9 8 7 7 8 6 1 4 8 5 6 1 10 1 6 3 0 7 2 10 1 3 9 0 1 3 9 9 8 0 0 2 2 8 1 5 5 7 6 1 5 1 6 4 9 1 8 5 9 3 3 8 [...]
+7 7 7 1 1 4 4 7 6 8 7 10 2 10 5 2 10 1 8 6 0 9 2 4 2 1 4 7 3 0 3 4 6 1 3 5 10 9 0 7 5 0 2 2 8 4 9 2 4 7 4 9 6 8 3 2 1 2 1 6 7 6 3 9 7 2 8 7 0 10 1 2 10 8 7 6 1 2 7 4 3 7 0 0 8 2 7 8 10 9 5 9 8 0 6 10 2 5 0 1 2 2 4 0 9 6 5 10 10 10 9 7 2 5 4 0 4 9 5 10 4 1 8 2 10 0 5 7 9 3 9 3 3 2 10 2 7 7 9 4 0 7 6 7 2 8 1 5 9 9 5 3 5 0 0 6 9 0 2 1 4 2 6 0 5 10 0 5 5 0 4 2 5 0 3 9 4 6 8 6 3 1 10 4 3 8 7 7 3 0 3 4 9 9 5 6 10 8 0 3 1 3 0 2 9 0 1 5 7 6 9 7 1 6 5 9 9 7 7 10 2 9 10 4 4 8 1 1 8 9 2 3 1 6 3 9 1 [...]
+7 2 4 10 10 2 2 7 6 3 10 1 7 0 9 10 8 7 8 8 7 10 0 4 0 9 0 4 5 9 1 2 8 5 3 5 9 3 6 5 8 5 2 0 5 3 9 7 2 0 3 9 10 10 8 5 0 5 3 0 6 7 2 9 7 10 9 7 2 3 5 0 2 7 4 3 5 4 8 10 10 8 6 5 7 8 5 5 1 4 0 1 9 1 7 7 9 8 4 3 3 10 9 0 6 3 9 10 4 2 10 0 0 8 4 5 7 0 0 10 1 7 0 7 8 1 7 0 4 2 3 1 4 10 9 9 4 4 1 9 7 4 7 4 5 2 7 4 6 4 10 6 10 4 9 2 10 7 6 0 8 2 10 4 0 10 4 10 4 5 4 5 0 8 0 5 10 7 3 4 2 1 2 1 9 10 9 5 1 2 4 1 0 9 8 6 4 1 8 9 8 1 8 3 1 2 7 8 0 8 1 7 4 2 4 1 0 5 1 1 2 3 9 2 7 8 0 3 3 0 9 1 10 8  [...]
+0 1 1 7 6 8 9 4 4 0 3 6 9 8 0 8 5 5 0 6 8 7 6 1 2 5 9 4 2 6 10 8 3 7 9 2 4 8 10 7 1 2 8 1 0 4 6 10 1 1 7 7 2 9 1 10 7 10 5 2 7 9 10 9 9 1 9 0 6 7 7 4 6 9 9 10 10 4 10 5 8 9 6 1 2 6 2 6 2 7 4 8 5 7 4 3 2 5 6 7 5 10 10 1 7 1 6 6 4 8 9 4 1 3 0 0 10 3 8 0 6 9 3 6 9 7 8 10 1 8 10 0 2 9 9 7 8 7 6 8 2 4 6 2 2 8 6 9 8 5 1 4 0 3 8 5 5 3 1 7 0 10 8 9 5 1 5 2 1 7 3 4 4 3 0 10 1 7 0 2 1 10 3 6 4 2 6 3 6 3 4 6 7 4 3 10 2 10 10 1 10 5 9 7 3 8 1 0 9 3 4 1 3 3 2 9 1 2 4 9 5 7 7 6 10 0 4 3 8 2 3 0 0 0 0  [...]
+3 5 8 8 2 7 1 6 0 3 9 3 9 9 3 6 8 6 0 0 3 1 4 10 3 3 7 2 7 9 5 9 2 9 0 10 6 6 3 4 0 6 4 8 4 4 7 9 0 5 3 6 9 7 3 1 10 3 2 4 0 3 10 8 4 3 8 2 9 10 7 5 0 9 0 4 10 9 2 9 5 9 7 8 0 9 0 6 3 9 5 1 2 7 6 0 5 6 7 10 5 10 9 3 1 0 0 8 5 5 9 6 2 5 2 9 7 3 2 0 8 10 7 5 8 5 4 10 2 7 10 8 9 1 5 5 7 2 4 0 4 1 0 8 3 0 8 10 8 7 10 5 8 8 8 8 4 3 0 8 10 10 5 10 10 2 0 4 5 1 5 1 9 6 9 0 3 4 8 3 5 5 3 9 2 9 3 9 7 3 7 6 4 2 8 8 1 0 10 7 4 0 1 9 2 2 2 8 8 5 7 9 1 2 7 8 1 6 0 10 3 3 7 7 1 10 6 1 1 3 6 3 10 5 1 5 [...]
+3 10 6 5 6 10 9 8 9 3 2 10 5 10 7 8 6 3 8 7 2 2 10 8 9 0 9 3 2 4 0 3 2 2 10 6 9 1 3 8 2 6 2 4 2 1 9 5 4 10 6 5 7 8 6 4 5 10 6 6 6 5 3 10 9 10 5 4 8 7 10 7 4 2 0 9 8 7 0 4 1 5 3 5 5 4 0 6 5 2 10 5 6 6 6 10 4 0 3 10 8 7 6 3 8 0 6 2 5 0 4 7 4 7 3 3 5 1 10 10 8 7 1 2 4 5 10 5 10 1 1 10 9 1 4 7 1 0 9 10 2 10 9 10 5 1 0 3 1 2 0 6 6 8 2 1 6 4 2 9 3 8 9 5 0 10 4 8 1 0 1 10 8 5 9 7 3 8 0 6 10 1 1 7 6 7 0 3 1 4 5 2 1 3 10 9 8 5 0 5 1 10 4 1 10 1 4 0 4 10 1 4 4 1 4 3 8 4 5 8 3 0 7 1 5 0 8 4 2 1 10  [...]
+9 4 8 1 5 6 1 1 4 10 1 10 0 7 10 10 4 9 8 5 8 3 10 0 0 9 4 1 9 10 5 0 3 10 0 6 1 3 10 0 5 7 1 4 7 9 0 6 7 4 2 0 9 3 7 3 1 6 4 1 2 0 4 8 3 6 9 3 1 2 3 8 1 2 6 2 10 7 0 9 1 7 6 4 8 7 3 7 2 8 10 5 4 4 10 7 1 6 10 1 4 1 2 4 0 0 4 4 9 8 5 3 0 3 4 5 4 7 5 10 1 1 10 3 1 8 5 0 8 3 3 10 2 3 3 8 2 2 10 8 8 2 4 8 8 0 2 0 8 5 6 6 6 10 9 6 7 10 4 9 3 4 8 7 6 8 8 8 7 3 3 3 4 7 6 2 2 1 10 0 7 10 6 2 8 6 6 0 1 4 9 5 0 0 7 9 8 4 7 5 0 3 5 3 6 2 1 10 8 7 9 1 3 9 9 1 4 2 0 7 10 9 1 8 9 5 8 1 0 3 6 2 9 6 9  [...]
+4 5 10 8 9 8 9 2 1 3 8 6 6 5 9 4 7 1 9 2 10 4 3 6 3 10 5 9 9 8 6 7 2 6 9 3 4 2 2 5 5 2 8 3 8 2 9 3 3 5 2 1 10 2 7 4 2 10 5 9 9 3 9 6 3 0 5 9 9 0 5 10 9 8 5 3 8 9 2 8 7 5 4 3 1 7 10 0 8 0 3 3 3 10 9 4 9 5 7 5 9 6 5 2 2 0 1 2 5 5 3 0 10 1 7 6 8 6 10 2 6 0 7 2 3 3 10 7 5 4 0 0 9 5 0 5 2 0 7 2 10 8 0 9 0 4 0 1 5 0 2 2 7 10 2 4 1 7 8 3 8 2 10 3 4 4 5 1 6 10 1 6 10 2 6 10 9 0 4 8 5 3 9 1 3 7 1 3 1 9 8 8 3 0 5 0 6 3 1 1 4 6 2 8 8 5 1 8 0 10 7 1 2 7 4 7 9 10 0 1 4 0 10 0 0 2 9 1 7 6 5 4 2 1 10 8 [...]
+3 5 7 10 7 8 3 3 7 2 8 0 9 7 8 5 3 8 7 8 0 3 10 8 4 8 5 1 6 1 0 8 0 5 6 4 10 5 1 8 9 6 2 3 10 9 7 5 2 9 9 6 8 4 3 10 1 4 3 7 10 1 2 8 1 9 5 3 5 3 4 6 10 6 6 3 9 6 8 4 7 0 5 6 8 5 8 7 4 0 6 6 5 5 3 6 10 8 1 4 0 6 4 2 0 3 1 0 9 4 2 2 6 7 7 2 7 4 9 7 2 9 6 8 7 4 1 2 4 4 2 9 1 10 1 5 3 0 3 4 6 8 9 2 5 9 3 3 10 3 5 2 5 8 1 1 3 1 4 6 1 9 3 1 2 5 6 1 9 0 3 8 0 2 7 6 6 10 3 10 5 1 6 3 0 2 7 5 10 8 0 1 6 4 3 0 10 10 10 0 7 3 10 4 3 10 7 6 5 10 7 2 8 4 10 2 7 7 3 8 6 2 6 0 1 5 6 4 6 5 10 8 8 6 4 7 [...]
+3 10 9 3 7 3 10 0 4 2 8 4 10 1 9 7 2 5 0 7 0 7 2 2 2 2 2 0 4 1 10 7 0 3 10 5 5 5 5 7 3 6 4 8 4 7 3 7 1 1 0 4 0 8 1 2 6 5 2 0 5 0 4 9 6 0 4 2 2 8 2 8 1 10 1 2 5 3 10 0 1 6 5 1 9 2 7 5 0 4 9 2 9 0 1 7 3 0 5 10 10 0 1 10 10 0 6 10 3 1 0 2 3 8 5 3 7 6 10 0 1 3 5 1 10 10 10 3 0 4 3 10 6 7 5 4 0 7 4 7 7 4 2 2 3 4 7 7 6 0 4 2 0 0 7 6 10 5 0 1 3 9 9 8 4 0 1 7 9 10 2 0 3 2 0 8 9 1 3 4 4 10 4 10 5 7 10 0 8 4 8 2 6 8 0 10 3 6 2 7 3 2 2 2 9 9 8 5 6 6 7 7 6 2 0 1 7 3 10 3 5 9 4 6 10 5 4 9 0 10 6 10 1 [...]
+5 4 4 8 3 3 6 8 10 2 9 10 0 9 7 7 2 3 6 6 3 0 3 8 1 5 0 5 9 0 1 7 2 10 2 2 6 7 10 7 5 8 7 1 7 6 4 5 7 2 5 5 8 7 0 10 9 8 9 10 8 8 3 8 2 6 8 10 8 1 6 2 9 7 10 0 10 8 7 2 0 3 8 7 2 1 3 3 8 7 2 6 0 7 6 2 5 3 8 6 4 1 6 8 3 7 6 7 5 4 9 8 6 0 5 7 4 7 0 1 10 4 3 4 4 3 4 10 6 6 6 8 6 7 10 10 0 6 4 6 1 10 6 6 6 9 6 6 8 6 7 9 7 3 2 10 7 7 8 3 6 4 5 7 1 1 5 5 8 3 1 8 10 0 8 3 7 5 9 3 8 5 7 6 5 5 3 1 2 3 3 9 3 7 4 1 5 1 9 6 10 4 0 1 2 10 8 8 6 2 10 8 3 0 2 2 2 0 5 0 3 1 0 1 1 0 3 4 4 7 6 6 2 3 5 3 0 [...]
+2 5 1 10 0 0 9 3 10 7 1 8 6 0 1 1 1 9 6 10 4 6 9 3 4 10 4 3 2 10 3 5 6 8 3 9 7 6 7 7 5 4 5 5 5 8 3 5 6 5 9 4 3 5 8 0 4 5 8 4 2 9 6 9 6 9 1 0 6 6 5 10 7 5 8 3 5 0 5 9 4 4 9 5 10 8 5 8 8 9 4 4 8 6 6 10 1 9 8 6 9 0 0 4 8 10 2 4 6 4 1 3 7 9 10 0 10 5 9 5 9 7 10 1 8 1 8 8 1 3 2 2 8 7 1 5 1 8 0 7 4 10 1 4 2 8 8 9 2 4 3 2 6 0 0 7 4 0 9 5 1 5 7 10 2 3 4 5 5 5 1 7 5 6 4 3 10 5 9 10 6 1 2 3 1 1 10 2 0 0 4 1 0 9 1 5 1 0 1 6 4 2 4 7 2 2 0 0 4 6 7 6 6 0 8 8 0 7 8 5 2 6 3 9 4 0 5 2 10 6 2 1 9 7 2 3 0  [...]
+1 5 1 0 3 4 2 1 2 6 1 5 0 4 0 1 9 9 0 3 8 4 1 1 5 0 0 6 6 1 4 7 3 8 1 5 5 6 9 3 1 9 9 1 3 9 8 1 3 9 2 5 4 6 5 3 0 7 9 8 2 0 2 5 1 4 0 5 3 10 10 6 2 10 6 2 2 1 2 4 5 5 5 4 7 6 9 8 9 2 3 7 9 0 3 1 8 4 1 7 2 10 5 3 10 2 5 7 7 5 2 0 10 1 1 1 5 3 7 3 10 5 7 2 6 7 3 0 3 5 3 5 5 10 5 6 4 7 0 4 5 8 3 2 8 3 4 10 4 6 7 7 2 4 9 9 5 5 6 7 1 5 9 1 7 8 3 5 5 5 10 0 7 10 9 0 5 9 8 0 6 3 7 10 1 5 9 2 0 9 6 2 7 10 0 2 6 7 7 0 9 4 5 6 0 9 4 1 2 3 7 6 6 10 6 5 6 1 4 10 10 4 3 8 0 8 9 4 0 5 2 2 3 6 4 7 10 3 [...]
+5 4 0 3 1 8 0 6 2 5 5 2 8 1 6 9 2 8 4 5 1 2 10 9 7 1 6 5 7 7 10 0 9 0 0 6 0 1 9 10 4 10 0 1 5 3 6 0 3 10 1 9 2 10 0 3 2 9 8 9 3 9 8 2 8 5 8 7 9 0 0 8 0 4 7 8 7 8 9 10 5 1 6 4 1 1 3 6 1 4 10 3 1 8 1 9 3 5 9 5 5 8 4 9 3 6 6 3 10 8 9 2 8 5 7 3 1 9 3 7 7 1 10 3 8 10 2 0 9 5 0 2 7 6 4 0 2 1 2 10 4 5 2 7 1 0 5 6 10 2 8 6 6 3 7 5 2 10 4 3 10 2 10 9 5 0 10 10 0 4 3 0 1 8 2 4 3 10 1 7 6 4 2 10 0 1 7 5 7 4 4 6 6 1 5 0 5 8 9 1 10 3 3 7 7 9 4 0 8 1 10 3 8 5 8 7 6 9 7 3 10 3 3 4 0 5 5 10 1 2 7 6 5 3  [...]
+3 2 6 5 8 5 0 1 4 9 7 1 7 6 10 8 7 7 8 7 8 10 2 3 0 2 8 6 10 8 9 0 4 1 7 3 2 5 5 1 7 5 5 6 2 6 3 2 8 2 0 7 0 3 1 5 2 3 9 4 0 6 10 9 2 8 0 0 7 9 8 4 9 5 1 3 6 9 0 0 3 5 1 2 8 6 8 3 4 6 5 7 3 4 9 9 9 9 4 4 1 7 3 0 2 1 4 8 4 10 5 0 1 4 10 10 10 10 0 3 8 8 6 7 2 0 2 6 5 7 9 9 9 4 8 10 3 9 10 10 4 2 9 9 8 7 1 0 10 5 6 4 1 5 2 8 6 8 0 8 10 4 9 3 9 10 0 2 9 8 8 6 3 4 9 2 6 2 5 5 7 2 4 4 8 0 2 3 9 0 10 9 1 8 2 9 9 9 9 0 7 4 5 9 1 4 1 0 10 2 2 0 10 6 2 1 7 4 1 4 7 9 1 0 2 4 6 3 8 9 3 4 8 9 9 0 0  [...]
+7 6 10 6 10 6 10 4 2 10 8 9 8 0 8 1 10 5 6 8 3 8 9 7 7 4 10 8 9 6 0 7 2 10 9 9 10 7 1 10 10 6 9 2 6 4 8 8 9 5 4 8 1 1 6 7 5 10 7 9 0 5 3 9 7 3 10 10 8 10 10 5 3 4 7 3 3 8 4 6 5 1 7 10 3 1 1 8 0 10 7 7 0 0 5 7 3 1 3 8 6 4 0 10 9 9 7 10 2 1 8 10 7 7 10 5 8 3 0 7 1 9 9 0 4 1 8 6 2 10 5 9 4 8 10 9 0 9 5 5 4 3 8 8 6 10 5 9 7 0 5 4 8 10 6 2 6 10 5 1 2 1 8 0 10 4 3 5 4 2 0 9 4 9 7 9 0 9 6 7 10 4 4 8 7 4 1 8 5 1 10 8 1 7 8 1 2 10 8 4 9 4 3 0 10 1 8 2 9 3 7 3 6 5 4 5 0 0 10 7 2 3 0 5 2 8 3 2 10 3 [...]
+9 9 5 8 10 5 8 4 9 10 6 4 3 10 2 9 1 0 2 3 1 6 3 0 6 1 9 1 8 2 8 2 2 2 4 10 7 9 0 1 6 2 6 10 3 6 4 4 9 8 5 4 3 7 1 0 6 8 5 10 3 1 10 9 3 10 7 4 6 9 6 9 5 5 4 2 0 5 9 8 5 8 0 2 6 8 2 10 3 5 10 1 5 4 4 5 9 7 2 2 4 8 9 8 2 9 2 1 10 5 3 4 10 4 7 8 9 5 4 2 5 7 10 2 8 3 1 3 7 0 8 0 4 9 5 0 1 7 4 2 6 9 0 3 2 1 6 8 5 6 2 6 5 4 7 3 6 0 10 1 0 10 4 6 6 2 1 7 1 5 4 1 9 8 4 0 3 2 7 5 8 0 7 5 2 3 5 8 0 0 1 8 2 2 0 7 6 5 0 2 0 1 8 4 7 3 0 8 8 8 1 5 8 6 2 7 8 2 7 7 10 8 0 9 3 4 6 10 5 1 8 1 2 3 8 1 4 9 [...]
+3 10 8 0 2 3 7 4 4 7 7 3 8 6 6 10 5 1 10 9 4 0 9 1 3 0 3 0 0 8 0 3 1 8 1 3 5 0 9 9 1 3 6 5 2 6 2 1 2 10 2 6 8 8 8 5 6 2 6 1 3 7 5 4 6 1 4 2 1 4 6 0 9 4 6 2 2 6 2 7 7 1 4 3 8 5 10 3 3 6 0 8 3 3 2 3 8 8 10 1 7 10 1 6 2 1 0 10 4 4 5 8 2 1 1 4 9 1 5 6 6 6 8 0 1 3 2 0 5 2 7 8 2 3 9 4 7 6 5 10 3 10 5 10 2 5 1 1 7 4 8 6 1 3 1 2 7 7 4 4 9 3 0 5 4 0 9 6 3 0 9 8 4 1 1 6 5 7 4 5 6 2 4 5 1 1 6 4 8 9 4 4 7 6 5 5 2 3 10 5 3 3 0 5 6 8 4 3 4 3 2 2 2 0 3 7 4 4 6 6 0 2 9 1 2 6 8 7 5 5 10 0 5 10 6 6 8 4 0  [...]
+10 0 8 1 10 4 7 6 4 1 9 9 4 6 0 9 4 1 3 0 7 0 6 6 1 4 1 5 10 2 3 3 7 2 5 8 1 4 8 5 2 4 8 3 1 4 10 5 9 1 0 10 5 9 7 0 5 9 1 8 10 1 6 5 2 4 0 1 10 1 7 1 1 7 6 4 0 8 6 3 0 8 7 0 1 8 6 8 8 10 3 6 0 0 9 8 2 7 10 10 8 7 8 0 4 1 0 6 3 9 0 4 1 9 9 9 8 4 4 3 0 6 9 2 0 1 6 8 4 5 2 10 2 5 2 10 10 2 8 10 9 10 8 8 4 3 7 1 7 10 5 9 3 10 1 5 7 10 5 9 4 8 6 9 2 2 7 6 1 6 6 2 8 6 4 8 10 4 9 10 5 2 4 2 9 5 5 8 8 2 0 5 2 7 6 1 10 4 2 7 10 6 0 5 6 3 2 5 8 4 3 8 6 8 1 9 1 6 0 9 6 7 6 10 5 2 0 5 6 1 2 9 8 9 4 [...]
+4 8 0 7 10 7 0 5 1 4 7 0 9 3 8 2 2 9 4 9 1 6 2 9 4 6 9 2 2 6 6 3 1 2 6 4 10 8 0 1 2 0 7 7 2 9 6 1 8 0 1 3 0 1 2 3 6 6 10 5 3 0 3 4 8 7 1 3 7 4 4 7 4 10 6 3 7 1 3 10 6 4 5 6 4 10 1 2 1 0 1 7 8 10 1 6 5 4 2 9 7 1 5 10 3 6 3 0 7 10 0 5 4 6 7 0 3 2 6 2 7 4 1 7 6 8 8 0 2 8 1 3 7 0 10 6 9 5 8 3 6 6 7 3 2 9 0 0 7 5 4 3 0 0 4 7 0 4 6 7 5 0 4 8 9 6 2 8 8 7 6 2 7 2 5 0 4 0 5 1 5 7 2 3 6 8 8 9 4 10 10 3 3 10 6 2 2 5 2 4 0 5 3 4 6 5 7 0 7 6 6 7 6 6 4 9 0 10 3 0 2 4 5 8 1 1 0 0 3 9 1 10 1 10 2 5 0 4  [...]
+10 7 5 9 10 3 8 9 5 2 3 0 5 3 6 1 5 2 1 6 6 6 0 9 9 6 6 3 10 0 10 0 3 1 6 6 3 6 7 1 7 6 2 5 9 0 6 5 0 4 1 10 6 2 2 5 8 1 1 0 1 0 2 10 2 6 10 10 9 3 2 7 5 2 0 8 8 9 5 5 3 4 8 9 2 8 8 5 2 6 8 0 0 9 9 7 7 9 6 8 10 6 2 6 10 8 10 10 3 9 8 0 9 2 7 7 6 7 1 8 5 8 8 4 6 8 7 9 3 9 6 10 7 5 2 1 9 3 7 7 6 5 6 5 5 4 9 6 4 2 3 3 3 2 6 6 0 3 3 5 3 1 7 0 1 3 7 7 8 0 3 9 8 6 5 9 3 6 7 5 1 2 5 2 9 9 7 9 9 5 4 3 3 2 9 10 0 4 9 3 9 3 6 0 6 4 2 3 5 7 0 1 8 9 3 3 1 1 1 5 5 1 6 5 5 7 10 8 9 2 4 1 3 8 7 8 10 2  [...]
+6 9 2 1 3 2 10 6 9 8 7 4 10 8 0 1 10 8 10 2 1 8 7 7 1 5 4 0 0 9 3 8 1 9 9 4 8 9 0 6 8 5 7 9 1 2 0 7 6 2 7 10 9 4 10 1 6 6 2 4 5 2 4 10 10 6 5 0 2 5 7 7 2 0 6 7 7 6 4 1 1 9 1 1 6 4 9 5 10 0 5 0 6 4 10 8 8 6 1 2 1 0 0 3 0 10 4 6 1 6 2 10 0 4 10 8 2 8 5 10 1 0 0 0 2 5 7 6 4 4 5 2 6 8 2 3 4 7 9 7 2 8 3 5 0 2 7 2 1 4 0 3 6 8 7 3 0 10 0 6 5 0 7 5 3 5 1 8 10 8 9 0 9 3 4 5 2 9 1 9 5 4 4 0 10 8 9 10 10 1 7 2 5 6 2 9 4 1 7 6 6 4 2 2 7 8 4 6 10 6 3 3 7 9 5 10 5 1 3 10 1 7 2 10 0 1 3 1 2 4 5 5 10 0  [...]
+2 3 1 2 9 0 6 1 4 4 6 7 0 2 10 8 3 5 3 3 8 9 6 7 5 3 10 3 6 7 3 10 2 0 3 9 0 0 9 3 4 8 6 9 10 10 1 3 4 3 1 2 5 0 2 5 8 1 4 10 6 6 2 2 10 10 9 9 6 9 9 3 0 4 5 0 8 4 5 1 4 0 5 0 7 2 0 2 9 10 4 7 4 9 10 10 1 6 8 1 10 7 5 0 0 9 3 8 2 0 6 6 5 10 7 2 9 5 1 2 3 0 7 4 9 7 2 4 7 7 8 2 5 1 3 2 2 5 10 7 4 6 5 9 2 6 0 1 10 7 9 2 7 10 4 4 9 7 4 8 8 0 7 5 7 1 0 7 5 6 2 4 0 10 3 8 5 6 9 7 1 1 1 1 5 2 4 3 1 4 4 8 5 5 2 6 8 4 3 8 6 7 1 9 5 4 2 4 3 1 3 9 10 8 8 3 1 6 9 6 9 2 6 9 9 1 5 8 0 9 9 1 5 7 4 3 2  [...]
+3 6 5 1 9 6 5 0 9 10 8 1 1 10 6 6 3 10 3 9 5 8 1 8 7 8 2 1 3 8 3 5 2 5 7 4 1 8 2 9 6 6 5 3 1 9 0 3 6 6 1 7 0 7 2 0 8 6 0 4 1 3 3 9 1 7 1 9 4 3 7 9 0 9 9 7 0 6 10 0 2 8 8 8 3 7 7 7 4 0 10 2 10 10 6 0 4 1 10 2 1 5 0 1 0 10 4 7 3 6 4 10 5 4 7 5 3 6 9 4 2 7 1 4 0 3 4 4 5 9 9 10 0 8 0 10 1 2 10 6 4 1 7 7 5 6 3 6 1 2 5 9 3 5 4 3 2 2 9 7 8 4 2 10 2 3 4 9 0 10 6 4 1 7 10 5 2 1 5 9 5 7 6 8 1 0 0 4 1 8 6 7 9 0 5 7 6 2 1 7 8 0 6 3 9 6 0 7 5 0 0 0 8 1 5 9 5 9 5 9 1 4 8 4 8 1 8 8 0 0 2 0 2 6 9 8 10 1 [...]
+10 9 1 6 8 9 10 10 9 5 5 4 2 6 1 10 8 10 2 2 10 2 8 2 2 9 3 0 5 9 1 6 2 0 5 0 2 8 4 10 9 7 7 10 4 4 1 3 1 8 6 4 7 3 9 4 3 2 1 6 7 4 4 2 10 3 1 7 4 7 0 6 3 0 2 10 8 3 5 8 10 0 2 6 9 1 3 3 6 7 3 5 3 10 6 10 4 8 4 10 6 5 2 10 5 5 4 10 10 5 5 10 6 0 7 6 3 4 4 0 2 4 7 1 2 6 7 8 0 7 0 4 3 5 0 1 2 7 1 3 0 4 6 3 9 3 8 8 5 9 4 2 8 10 1 6 0 0 2 9 2 7 10 0 9 3 6 8 6 2 10 2 5 8 4 8 8 7 7 6 7 10 2 1 6 4 0 1 1 8 9 4 8 3 5 3 7 1 5 4 8 6 3 7 9 0 8 5 7 2 4 8 3 5 9 9 4 3 8 5 2 8 9 3 8 4 8 4 6 8 10 2 2 8 4 [...]
+10 9 10 3 9 10 5 5 0 8 7 8 4 4 6 10 7 1 7 1 5 3 10 5 1 2 9 2 9 5 8 10 4 5 6 2 1 3 4 5 7 7 7 5 8 2 7 10 2 0 8 9 0 4 9 1 7 3 6 7 1 6 0 5 7 9 1 7 0 4 6 8 1 2 6 10 6 0 0 3 3 8 3 2 7 7 6 3 1 4 5 10 6 0 4 5 2 9 5 6 7 9 10 0 6 3 2 7 10 8 7 9 6 0 4 2 5 3 9 10 1 5 1 2 6 8 5 4 7 8 2 6 4 8 10 4 0 10 7 7 6 6 9 6 9 1 4 6 10 8 7 9 3 6 3 7 8 3 5 10 9 10 7 6 4 4 0 2 9 2 10 3 9 9 6 4 5 7 8 7 8 4 3 10 1 6 1 10 1 3 9 9 5 4 8 0 0 5 2 1 9 8 7 10 7 1 6 3 5 7 4 8 3 2 6 6 1 3 1 3 7 4 6 9 0 3 1 10 3 3 0 2 9 5 5  [...]
+3 1 7 3 10 8 1 8 1 1 10 9 7 1 9 6 10 8 1 4 4 7 4 3 2 0 2 8 10 7 2 4 9 0 7 4 2 8 9 0 0 3 3 7 6 6 6 0 10 2 3 0 5 3 6 7 7 8 8 6 7 9 8 6 8 0 0 10 4 10 0 8 7 2 6 2 7 8 1 7 9 10 10 0 4 10 6 9 3 0 2 2 8 5 4 2 2 2 2 6 8 1 10 3 2 7 8 10 2 2 3 4 10 4 7 8 4 4 9 10 0 7 3 3 0 9 7 0 0 10 1 9 10 2 1 3 0 5 2 9 9 3 3 5 3 1 5 4 6 2 0 9 3 2 9 6 7 9 6 5 4 9 5 10 9 0 9 4 5 10 10 1 5 9 10 7 9 3 2 10 10 3 8 8 6 7 10 3 9 10 1 2 7 6 0 6 9 2 0 8 1 1 3 5 8 4 10 8 3 9 7 0 2 7 3 9 5 3 3 6 1 1 0 5 6 2 6 8 0 9 7 8 5 8 [...]
+6 2 6 2 9 9 10 1 8 8 9 8 1 9 0 4 9 8 2 6 10 7 10 9 2 7 9 9 1 4 6 10 7 5 10 1 9 5 9 0 2 6 9 4 9 3 10 6 3 3 4 3 1 2 9 4 10 6 2 1 8 6 5 8 1 4 5 6 6 6 4 0 0 1 8 6 7 0 1 1 6 3 10 6 9 9 2 1 4 3 2 3 1 0 5 3 7 0 6 3 1 9 1 1 10 4 8 5 10 5 3 10 5 5 4 8 1 7 4 6 0 3 8 0 4 7 2 7 9 10 7 7 10 7 10 1 9 8 3 5 3 10 6 3 2 1 0 3 7 8 8 10 5 0 1 0 4 6 4 1 2 2 5 5 2 10 0 0 3 10 0 6 2 7 6 10 2 1 3 3 5 5 1 4 0 9 1 10 2 3 9 2 2 10 3 5 10 2 3 10 8 0 8 2 2 5 10 5 9 9 7 6 7 9 8 6 3 3 7 1 6 2 1 5 2 5 8 7 5 2 1 9 3 3  [...]
+0 4 0 2 6 9 5 0 1 6 1 9 3 3 7 4 8 3 8 6 2 10 4 3 5 3 9 4 7 4 4 10 7 1 9 8 1 7 0 3 10 9 10 0 6 1 10 10 8 5 0 1 3 10 3 1 9 4 0 1 6 10 2 4 10 6 3 2 10 1 10 2 3 10 4 4 10 4 5 2 10 2 5 2 8 6 1 2 9 5 4 8 0 3 2 0 4 3 7 1 8 5 10 1 5 6 6 2 0 10 4 0 7 0 0 1 10 10 6 0 6 8 7 6 9 7 3 4 5 8 5 4 1 8 6 6 0 1 8 3 10 4 6 3 9 10 6 0 3 6 6 7 5 10 7 10 2 1 5 10 3 5 3 8 2 6 6 0 6 2 8 10 8 7 3 7 10 5 4 6 10 9 10 7 10 6 5 8 10 6 0 4 5 1 10 6 10 3 3 10 9 7 7 4 10 5 1 6 9 2 8 8 6 2 2 6 8 2 7 2 7 10 7 5 9 3 2 10 8 [...]
+10 9 3 6 9 9 2 9 6 3 6 0 8 7 10 1 10 0 10 8 1 7 8 5 6 2 3 3 9 3 9 4 4 4 4 1 4 4 7 7 10 2 7 7 8 5 8 9 1 8 5 6 6 3 4 9 1 0 4 1 10 3 1 10 4 4 6 9 10 9 3 5 9 7 4 3 8 5 0 2 7 6 8 4 7 1 1 8 3 3 2 7 6 3 7 7 0 10 10 9 5 9 10 7 9 6 0 7 1 9 2 5 2 3 6 10 10 0 7 0 10 3 7 8 4 10 9 3 9 1 1 1 8 0 0 0 5 3 10 3 0 4 5 3 7 0 0 9 0 1 8 1 9 7 3 10 5 3 2 2 4 2 10 6 10 8 1 10 5 5 10 7 5 6 6 10 7 7 0 7 10 1 0 4 10 10 0 6 9 3 6 8 10 10 9 0 1 2 8 2 10 6 9 2 0 6 3 7 0 2 4 7 4 0 4 4 10 1 6 10 0 4 0 4 3 6 10 2 2 0 7 [...]
+4 5 9 4 2 3 2 3 1 2 4 2 0 8 9 5 3 3 1 3 0 2 3 5 10 7 5 6 4 2 10 5 4 3 6 7 0 8 1 9 9 5 8 9 2 0 5 2 3 7 2 0 3 7 4 8 2 3 3 2 9 9 5 9 1 10 10 10 3 6 10 8 2 3 0 2 0 7 6 4 9 7 8 3 7 2 3 1 3 1 4 4 0 0 1 5 6 3 2 9 1 9 10 1 3 7 1 1 0 8 7 4 10 9 3 1 1 1 0 0 6 7 2 7 4 5 0 1 8 0 3 2 10 8 4 8 4 7 5 6 2 4 7 1 7 9 1 5 1 8 4 9 1 9 6 4 1 9 7 0 1 5 5 8 0 7 4 0 4 1 0 10 7 6 2 10 8 2 7 0 7 2 8 10 9 6 9 10 1 3 2 9 8 10 0 8 9 8 7 5 9 6 6 3 2 2 4 6 7 8 4 6 5 2 2 7 10 2 2 0 0 3 10 9 8 4 5 2 6 6 2 9 10 6 6 4 5 9 [...]
+9 7 5 3 6 2 6 2 7 8 8 5 0 4 4 8 7 5 8 7 7 6 10 4 7 2 9 9 9 9 3 8 1 0 3 10 5 10 6 1 5 6 3 2 4 1 7 9 7 0 8 7 9 6 4 3 4 8 4 3 10 5 7 7 2 2 1 2 3 8 4 3 7 3 5 9 8 4 0 0 5 1 7 4 5 0 0 0 9 1 1 1 8 0 2 3 4 7 0 4 1 0 3 2 3 5 9 2 2 10 4 9 10 3 5 2 1 2 10 6 1 0 5 8 1 9 0 9 6 3 8 3 9 4 5 4 7 3 4 9 6 4 3 6 7 10 4 9 3 2 9 5 7 8 7 3 8 1 8 2 7 4 4 1 7 5 4 7 0 0 8 9 5 6 10 3 6 1 8 6 7 5 8 6 8 2 1 6 4 9 7 4 4 10 8 0 5 4 0 9 6 2 4 1 1 4 1 2 0 1 4 3 4 0 0 6 1 7 2 4 9 9 8 7 9 10 5 0 1 7 0 4 10 2 5 5 10 3 8 2 [...]
+10 10 9 0 3 9 0 3 6 1 9 0 2 6 8 4 2 4 2 6 9 3 2 5 4 9 6 1 8 8 7 3 1 0 7 4 5 3 10 10 7 7 4 3 5 0 4 5 0 2 0 1 0 9 5 5 2 4 3 0 7 6 6 2 7 7 0 2 2 5 7 4 9 4 9 0 3 0 1 0 9 5 7 1 4 3 7 0 10 1 10 2 6 7 9 0 7 8 8 2 0 1 1 10 9 6 0 6 3 5 4 7 10 7 9 0 4 2 3 10 8 9 10 3 1 4 7 6 10 6 0 0 5 7 1 2 9 3 10 5 1 0 4 3 5 8 7 5 10 0 6 3 6 10 5 0 8 0 10 9 1 7 4 6 7 1 4 9 2 10 0 6 3 0 2 5 2 5 9 10 10 0 7 1 0 6 9 2 8 4 2 5 7 9 0 5 7 9 6 0 0 1 2 9 1 7 10 3 7 2 4 4 10 3 8 3 10 6 10 8 8 3 7 5 4 5 9 1 1 2 0 6 4 1 2  [...]
+3 3 7 10 8 7 10 2 3 2 9 5 6 2 4 5 6 6 0 1 3 1 7 7 0 3 1 9 0 6 5 7 6 10 6 3 8 5 10 6 0 9 7 1 2 5 4 10 6 3 6 10 7 1 8 1 7 8 4 5 0 8 4 6 8 6 5 5 6 0 7 9 6 0 10 1 10 3 5 8 0 2 5 2 0 1 4 10 3 0 9 0 1 7 9 8 6 9 10 2 9 2 8 0 5 4 1 7 5 3 0 4 1 7 4 2 5 7 7 4 0 5 8 4 1 10 8 2 7 7 0 0 6 8 0 0 1 5 3 2 9 2 10 0 5 8 5 8 9 6 3 10 5 8 2 2 8 8 1 3 5 6 6 3 3 3 2 8 0 4 0 5 9 8 4 5 3 0 6 2 9 3 9 6 8 9 0 9 6 0 4 8 7 8 10 1 5 9 10 9 7 5 6 1 1 8 2 4 2 10 0 5 5 9 10 7 0 1 9 5 5 2 7 1 6 6 9 9 1 5 4 1 10 9 3 7 9  [...]
+9 1 6 3 4 9 7 9 3 3 5 2 6 1 2 10 3 5 3 0 4 8 4 9 6 9 2 4 9 2 5 1 2 0 6 4 8 3 2 10 2 8 0 5 1 0 7 7 8 4 0 2 8 2 0 2 9 1 4 2 3 8 10 9 7 9 2 1 7 5 4 3 0 5 7 7 1 1 5 6 6 6 3 9 8 1 6 5 3 0 3 2 7 9 6 10 5 1 5 8 4 1 9 10 7 2 8 2 1 3 1 9 1 5 2 3 10 7 6 4 10 8 1 10 5 3 9 0 8 10 7 8 9 8 8 2 6 0 6 2 1 10 10 0 2 0 7 6 2 3 8 5 5 9 8 4 5 9 3 4 1 8 1 9 0 7 7 3 3 3 6 9 4 9 2 6 6 8 2 3 2 5 9 0 7 0 10 3 0 3 4 0 5 2 7 8 3 6 4 9 4 10 3 1 7 8 1 9 7 5 1 8 7 9 6 8 2 9 10 7 3 10 0 2 10 4 10 1 1 6 8 3 8 3 9 0 1 6 [...]
+8 7 0 6 5 4 8 7 0 7 0 7 5 10 7 5 8 7 5 3 10 5 10 4 5 4 7 7 3 1 3 6 10 3 7 2 0 4 3 0 2 4 8 9 6 1 7 8 3 6 9 1 3 2 9 8 3 3 10 1 3 7 1 7 5 5 2 5 2 7 7 1 3 8 2 6 2 4 1 9 2 9 0 4 4 6 3 7 2 8 2 2 5 0 6 3 4 9 2 6 9 7 2 5 6 5 5 5 6 3 7 4 2 8 10 3 2 6 10 6 9 10 5 4 3 2 10 6 0 7 8 1 8 2 10 4 1 0 2 0 8 0 9 4 4 6 4 3 2 8 6 8 8 1 8 6 8 6 10 2 8 0 4 0 1 1 1 0 6 3 4 7 0 7 7 4 3 3 2 7 3 0 7 5 9 3 2 5 8 3 2 5 0 3 9 8 0 9 3 3 9 6 2 4 6 4 7 4 9 10 5 4 8 6 5 6 9 1 3 1 10 10 2 3 7 4 1 0 2 3 4 0 1 1 9 4 10 6 6 [...]
+2 4 5 4 7 10 8 4 3 9 3 10 4 6 0 4 9 5 5 6 9 2 9 9 9 3 8 5 0 6 6 5 0 5 6 7 4 8 8 3 7 0 9 7 6 2 9 6 1 8 6 8 4 10 0 8 10 6 0 8 1 10 10 10 1 10 8 3 2 9 6 7 5 9 8 0 6 8 0 2 10 2 5 7 1 8 5 9 6 2 7 6 8 7 8 3 0 3 4 2 9 4 10 7 2 0 0 5 8 9 9 4 3 2 3 10 3 10 3 0 8 0 9 7 9 10 9 0 10 5 6 10 2 8 8 4 1 6 10 4 0 8 8 10 5 9 1 2 6 1 1 2 1 4 1 5 9 7 3 3 3 10 6 7 6 7 0 9 8 3 0 4 8 0 4 5 6 5 5 2 10 9 10 8 2 4 7 7 1 6 4 2 6 8 3 5 3 9 0 10 0 7 3 3 7 8 5 3 8 7 1 7 8 7 10 8 7 1 8 6 6 4 9 8 5 10 5 1 7 3 8 9 1 3 5 [...]
+2 10 8 4 4 5 9 5 5 6 10 6 7 10 6 10 5 8 5 6 0 3 2 9 4 8 2 8 9 8 8 5 8 0 6 8 1 7 6 3 3 3 2 10 5 7 7 8 5 9 2 6 5 7 1 8 8 6 5 2 9 5 7 3 4 7 9 5 10 9 8 6 9 0 3 7 2 1 4 4 0 3 8 6 2 9 7 1 0 9 10 9 0 10 9 8 1 4 8 9 0 0 9 1 2 2 4 6 2 0 5 0 9 8 8 4 2 4 4 6 10 7 2 10 6 8 6 10 1 1 2 7 4 2 5 4 6 8 5 4 2 9 2 2 10 0 2 5 9 5 7 6 2 8 3 9 5 1 7 10 0 9 10 10 5 10 3 9 0 6 4 2 9 2 9 10 4 7 2 5 1 3 1 8 2 10 3 6 4 4 4 10 3 3 1 4 9 5 0 7 0 10 10 5 6 9 7 10 0 2 7 10 5 3 4 10 6 7 3 4 2 4 2 5 3 3 7 3 7 9 7 1 10 3 [...]
+9 0 1 10 4 4 5 5 8 10 9 10 1 3 8 2 2 1 7 5 10 0 9 4 7 0 5 0 6 4 6 6 2 4 2 5 7 1 6 4 6 10 4 0 9 0 3 4 4 3 1 8 4 1 9 7 2 1 1 1 0 10 5 10 2 2 2 6 10 1 4 10 4 7 4 2 10 7 3 8 5 6 7 0 2 0 5 7 8 8 2 6 6 4 4 8 5 6 1 6 10 4 6 8 1 8 10 0 9 5 8 5 10 8 3 3 4 0 2 8 3 3 0 0 9 2 1 1 3 0 5 9 7 6 6 3 8 9 6 0 7 8 9 1 3 8 10 7 10 8 7 7 10 9 4 3 3 9 6 4 9 1 6 9 7 6 3 3 0 1 1 7 1 8 6 10 4 6 9 1 9 5 0 7 9 9 3 7 0 3 9 10 7 1 2 5 5 9 0 5 9 8 2 8 7 0 10 4 6 0 8 10 1 2 7 1 2 4 10 5 1 1 10 5 3 8 6 5 10 4 3 8 1 2 8 [...]
+1 8 8 4 8 6 10 9 5 4 1 9 5 4 7 6 7 2 9 10 5 6 3 10 5 3 4 7 3 7 2 1 0 3 9 7 4 4 3 2 8 7 8 9 6 5 8 9 2 10 0 4 2 6 3 3 5 8 4 2 5 8 2 1 0 9 3 3 6 9 1 9 5 4 2 7 1 9 0 4 7 0 4 6 5 7 2 2 3 1 8 2 0 4 6 1 10 6 5 2 3 7 0 5 8 0 10 7 1 1 0 4 0 0 2 1 5 5 2 2 2 2 8 8 5 0 1 5 1 5 2 7 8 7 9 0 5 7 0 4 5 9 7 3 8 2 9 10 9 6 3 4 3 3 1 10 6 10 4 8 3 2 2 3 6 9 6 5 10 1 8 7 6 3 9 4 3 9 7 10 10 1 9 8 8 4 5 3 7 8 8 1 1 3 4 10 0 5 5 0 8 2 7 10 10 10 6 5 9 4 2 8 2 9 10 8 10 6 1 5 0 2 0 10 1 0 0 0 0 8 9 8 7 4 2 5 2 [...]
+3 6 1 10 10 8 10 9 9 8 6 4 5 9 5 7 0 10 2 3 5 8 7 0 7 2 4 2 1 8 2 10 2 4 7 10 4 6 0 4 8 6 2 2 4 1 2 9 4 7 2 4 6 2 6 10 0 9 7 7 2 10 5 0 3 1 9 0 1 1 7 9 1 4 8 7 2 7 10 7 0 2 0 5 3 5 7 6 8 0 10 5 2 4 1 5 9 7 10 6 9 3 1 10 4 1 3 4 5 6 6 4 5 2 7 2 2 2 7 2 7 9 6 2 10 3 5 10 2 7 5 5 3 7 2 9 7 8 9 3 4 6 6 6 8 9 0 3 10 6 7 4 0 6 1 4 2 5 0 5 10 9 1 9 4 7 1 8 5 9 10 2 8 10 3 4 2 1 4 6 7 6 3 7 0 1 9 4 7 2 1 0 6 5 10 6 0 10 4 9 7 9 7 9 3 10 1 7 1 0 3 8 9 10 10 0 8 7 0 10 3 8 6 3 4 8 3 2 3 1 10 4 5 7 [...]
+5 9 0 3 8 4 9 10 6 3 8 10 4 3 9 3 6 4 3 1 1 2 3 1 5 7 9 10 9 9 6 4 8 9 0 10 6 4 3 1 8 3 0 6 8 2 4 5 4 1 8 7 9 5 0 4 8 2 2 4 0 7 4 6 7 7 2 8 8 2 7 9 5 2 7 9 0 3 10 5 5 9 9 6 3 7 7 7 1 5 4 3 8 0 7 3 5 7 8 3 6 10 4 6 5 2 1 3 6 9 0 6 8 4 3 4 2 0 1 1 5 7 8 9 8 10 9 6 1 0 2 7 7 3 8 8 6 9 8 7 1 4 3 0 7 2 2 0 9 5 1 9 9 9 0 4 6 2 10 10 0 10 7 1 0 0 5 5 5 1 5 7 5 6 7 4 1 3 1 3 7 4 1 2 7 6 0 6 2 6 1 3 10 4 6 10 1 6 0 5 1 8 6 5 9 2 3 10 6 0 10 0 7 3 7 0 3 1 1 2 6 2 9 7 3 0 10 8 9 8 5 3 3 3 4 6 0 0 1 [...]
+2 8 1 10 7 7 8 10 0 5 6 2 5 7 6 8 6 6 7 7 9 5 6 8 6 6 9 9 3 8 7 0 8 6 2 6 2 3 6 9 4 2 4 1 8 0 2 9 8 0 4 8 0 5 1 7 1 3 0 10 5 0 2 3 6 5 3 3 4 9 1 6 8 6 5 3 2 5 2 6 0 7 0 8 4 1 7 6 5 1 5 5 3 5 10 6 1 0 8 9 5 2 7 2 6 8 3 4 1 1 4 9 1 7 4 10 7 2 2 9 2 8 3 7 5 10 9 4 0 4 4 10 0 3 1 2 7 9 4 2 6 2 7 5 0 7 1 0 2 6 2 4 9 3 2 4 3 8 6 2 2 5 1 6 8 3 1 8 10 6 10 2 10 10 4 10 8 3 10 9 4 10 0 7 1 6 3 9 6 2 4 0 7 7 4 1 3 5 8 2 7 6 9 9 7 1 1 9 0 1 5 10 2 6 3 10 2 0 4 1 9 2 0 4 5 9 4 5 10 2 6 7 10 0 5 10 4 [...]
+7 6 10 7 8 7 1 0 5 1 7 1 9 6 6 3 1 0 6 1 9 10 10 5 2 2 4 3 9 2 4 6 8 6 8 3 9 3 8 3 10 10 1 8 2 10 6 4 10 9 3 1 0 7 4 10 7 2 5 1 6 10 3 6 7 0 7 6 9 9 2 2 6 2 6 5 5 2 2 5 1 3 9 8 6 3 8 10 6 5 8 4 2 6 1 4 2 1 4 4 7 6 10 6 6 10 9 8 4 8 10 7 0 1 3 4 2 9 10 4 8 4 7 7 1 1 5 0 5 2 8 8 7 9 6 3 2 10 1 6 6 8 0 5 3 3 3 1 10 4 6 4 1 2 3 0 1 6 8 3 8 6 10 7 7 2 5 6 9 5 1 5 7 10 0 8 8 5 4 0 0 7 6 6 6 5 10 3 5 1 8 8 3 7 0 6 2 2 7 10 10 0 5 6 1 8 0 8 8 8 4 5 6 10 7 5 4 9 0 7 2 1 2 4 9 2 0 5 4 2 8 1 5 6 9  [...]
+2 0 3 10 9 9 6 9 0 4 8 8 3 10 10 3 9 10 1 4 5 2 9 1 4 0 10 1 1 2 5 3 7 0 2 6 4 1 3 7 10 2 2 9 10 7 3 6 3 2 6 6 5 10 9 6 3 7 10 3 6 7 5 8 2 7 5 3 8 1 6 10 1 10 3 10 4 3 3 8 1 3 6 6 3 10 9 10 3 6 10 2 1 4 10 1 2 2 7 3 1 8 3 8 4 3 1 2 3 6 1 0 10 8 8 2 8 6 10 6 3 4 2 7 3 0 5 4 9 9 5 7 1 0 9 9 2 7 8 5 7 0 0 6 5 0 3 4 4 1 0 10 3 2 7 1 8 2 7 2 0 6 4 3 1 8 10 5 5 2 9 4 4 4 10 9 8 9 0 2 6 6 7 0 8 6 4 3 8 7 8 4 7 9 1 5 1 10 9 6 9 9 9 2 8 2 2 10 1 0 8 3 9 2 5 1 3 6 5 1 8 9 2 1 1 4 0 0 10 7 7 0 9 9  [...]
+3 10 9 10 4 8 6 0 2 0 5 7 2 3 8 2 0 4 5 9 6 10 2 0 3 8 2 7 7 3 6 0 5 9 2 0 5 4 10 7 5 6 1 2 2 7 4 6 8 8 3 8 9 2 9 7 6 7 10 9 6 10 10 5 8 4 5 7 9 1 2 2 7 4 8 10 8 9 0 4 9 6 0 1 3 2 6 0 8 6 8 4 10 1 7 4 6 6 5 0 10 0 4 2 2 7 7 7 7 5 8 9 5 0 4 0 10 7 2 8 4 7 1 3 7 7 6 1 3 5 3 9 10 2 7 0 9 9 2 4 0 0 9 10 5 5 5 4 1 5 2 1 5 3 3 9 9 8 5 2 2 9 1 1 4 3 2 6 6 8 10 5 6 4 5 4 1 10 6 1 8 2 10 5 3 4 6 7 7 1 0 10 2 7 3 1 1 2 10 1 2 10 1 7 3 10 8 2 6 0 0 4 8 8 2 0 4 7 8 7 1 9 1 2 2 7 1 4 9 7 0 5 1 4 6 2  [...]
+3 4 2 0 3 4 9 2 0 7 10 5 3 0 0 9 10 4 10 7 3 8 9 4 8 2 5 9 8 2 8 7 3 0 2 5 8 1 7 1 3 10 6 0 6 3 4 0 3 6 5 3 3 0 6 7 5 7 6 5 7 3 10 7 0 9 7 1 6 8 2 5 5 10 4 10 8 5 7 7 5 2 4 10 6 2 5 8 9 8 3 0 1 0 0 3 4 7 2 9 8 8 2 8 0 2 4 9 7 5 6 6 2 3 1 9 9 8 7 4 9 6 6 10 8 5 9 0 6 10 0 5 2 8 0 1 4 10 5 10 4 7 1 2 1 4 2 1 6 2 8 3 7 2 3 3 2 6 2 5 9 5 3 8 3 5 10 6 3 10 2 2 10 6 10 8 2 3 2 7 0 8 7 6 5 5 1 2 10 1 2 7 10 1 1 4 3 3 4 8 0 7 2 5 7 3 4 5 3 1 3 1 1 1 5 1 3 4 3 1 0 6 3 1 0 3 0 9 5 3 1 3 4 2 4 9 6  [...]
+5 8 10 10 4 4 0 7 6 7 5 5 2 6 9 6 8 1 0 4 7 10 1 4 2 1 10 4 4 3 9 1 6 5 5 0 1 4 5 3 9 7 6 3 5 2 0 7 6 1 6 0 8 2 9 0 7 6 0 9 8 1 5 4 2 1 5 2 2 1 2 2 5 8 10 6 1 3 1 3 2 5 4 7 1 7 7 10 4 10 2 3 0 1 6 4 4 6 0 9 9 6 1 8 8 2 2 8 7 7 8 0 5 1 3 5 6 2 2 3 1 2 8 8 8 1 10 10 3 3 7 6 7 3 4 6 2 8 10 7 5 4 8 3 8 2 7 2 4 1 9 8 9 1 2 10 4 9 10 9 6 2 4 3 0 8 6 3 6 9 10 6 6 6 6 10 8 10 6 1 3 3 2 3 8 1 7 8 5 0 2 5 2 4 8 2 9 7 10 7 10 0 0 10 6 7 5 8 6 0 6 5 2 8 10 7 0 8 5 0 3 0 7 2 8 2 2 2 8 6 10 10 7 10 9  [...]
+7 0 10 2 8 4 1 0 5 0 1 1 5 8 8 1 10 7 3 1 2 2 6 3 6 6 1 0 10 4 2 7 8 6 9 8 10 3 1 0 6 5 3 0 1 7 7 1 5 8 5 7 7 10 9 10 8 10 6 8 5 1 6 0 9 6 5 7 4 10 2 2 5 10 9 2 1 4 5 4 10 2 8 4 3 6 5 10 3 7 7 8 6 4 8 9 4 7 0 6 1 3 1 10 8 0 8 7 1 7 2 4 3 1 1 6 6 7 6 3 0 2 10 4 8 6 9 8 3 8 0 10 3 1 10 10 1 0 4 10 6 6 8 3 8 2 2 10 3 4 5 3 3 6 4 1 5 5 8 4 1 9 9 3 6 2 8 7 8 9 3 10 3 4 8 7 7 4 9 7 6 3 10 10 2 3 9 9 4 3 2 9 6 9 4 4 8 3 1 2 8 3 9 9 2 8 10 2 5 6 3 8 0 0 7 5 2 4 2 9 7 7 0 10 4 8 0 2 10 10 0 0 2 4 [...]
+0 8 7 10 7 1 5 6 9 8 6 6 1 2 6 10 6 4 1 6 10 10 9 9 8 0 8 1 1 9 8 1 9 2 10 10 5 8 1 7 6 0 2 4 10 4 9 9 3 6 1 4 7 10 0 4 10 7 2 10 2 7 2 10 8 8 3 7 10 9 6 7 0 8 3 0 7 7 10 3 4 2 5 10 1 1 6 0 7 9 9 9 5 7 7 9 8 4 9 3 0 10 0 3 7 8 10 10 1 1 10 3 6 5 2 2 3 8 3 3 8 4 2 4 4 1 2 9 2 5 2 7 3 5 3 10 9 5 1 3 9 5 0 4 10 7 6 6 3 7 5 4 9 0 0 8 10 5 5 4 10 9 3 2 7 9 8 6 2 5 2 10 5 6 3 7 0 1 1 7 6 7 8 1 7 1 0 4 9 2 8 9 0 5 4 8 10 5 0 9 8 2 9 0 4 3 3 6 1 4 2 1 8 5 8 10 3 8 3 10 10 8 4 7 5 6 1 8 7 2 9 7 8 [...]
+
+4596.6697 188.07273 186.77477 186.10054 184.22754 183.00956 181.82393 180.13044 179.25687 178.53225 177.75578 177.40033 176.50395 175.4307 175.13754 173.99115 173.15698 172.65109 171.96894 170.62714 169.91359 169.40989 168.84679 168.28278 167.83407 167.4522 166.86696 165.85375 165.49635 164.48983 164.18113 163.29027 162.61389 162.58701 161.75094 161.28736 160.73968 159.55174 159.09983 158.07767 157.78547 157.34607 156.7618 156.56961 156.22424 155.42141 154.14762 153.93593 153.60525 152.7 [...]
\ No newline at end of file
diff --git a/examples/testdata/svd/pysvd.example b/examples/testdata/svd/pysvd.example
new file mode 100644
index 0000000..bb49fe9
--- /dev/null
+++ b/examples/testdata/svd/pysvd.example
@@ -0,0 +1,11 @@
+8 5
+22 10 2 3 7
+14 7 10 0 8
+-1 13 -1 -11 3
+-3 -2 13 -2 4
+9 8 1 -2 4
+9 1 -7 5 -1
+2 -6 6 5 1 
+4 5 0 -2 2
+
+35.327044 20 19.5959178 0 0
diff --git a/examples/testdata/svd/qr.example b/examples/testdata/svd/qr.example
new file mode 100644
index 0000000..9567639
--- /dev/null
+++ b/examples/testdata/svd/qr.example
@@ -0,0 +1,6 @@
+3 3
+1 2 3
+-1 0 -3
+0 -2 3
+
+5.326834  2.911210 0.386908
diff --git a/examples/testdata/svd/random.example b/examples/testdata/svd/random.example
new file mode 100644
index 0000000..f3f354b
--- /dev/null
+++ b/examples/testdata/svd/random.example
@@ -0,0 +1,1003 @@
+1000 512
+0.08834 0.10614 0.61831 0.77807 0.12377 0.54896 0.68364 0.00781 0.60657 0.33968 0.86093 0.08402 0.01421 0.73769 0.69238 0.59821 0.22596 0.1263 0.02751 0.30073 0.76626 0.31306 0.53773 0.03314 0.00757 0.34715 0.05356 0.60592 0.6178 0.73482 0.54152 0.00058 0.62707 0.72706 0.91957 0.01432 0.21002 0.38132 0.48756 0.70514 0.92263 0.15634 0.08337 0.47822 0.42724 0.56289 0.53421 0.83481 0.44116 0.93946 0.64668 0.14445 0.56688 0.00813 0.38443 0.71754 0.01271 0.27184 0.47817 0.90726 0.29564 0.6186 [...]
+0.59444 0.98509 0.40811 0.33891 0.70735 0.12417 0.01476 0.09688 0.71122 0.86902 0.19858 0.16084 0.52489 0.62274 0.37364 0.69772 0.4872 0.81885 0.1069 0.6047 0.49578 0.74282 0.30682 0.73411 0.41722 0.52232 0.6199 0.5296 0.82447 0.30476 0.31099 0.44712 0.75459 0.81231 0.09384 0.10602 0.15572 0.89294 0.09294 0.82793 0.8914 0.67494 0.90767 0.08095 0.81092 0.12156 0.85556 0.50246 0.7977 0.6743 0.96074 0.62448 0.17374 0.44606 0.45137 0.02711 0.19512 0.37857 0.81903 0.34123 0.1506 0.34481 0.106 [...]
+0.65656 0.18708 0.14023 0.09244 0.07199 0.17737 0.75973 0.19448 0.02498 0.60818 0.32549 0.63648 0.95966 0.89266 0.54439 0.42763 0.53214 0.14405 0.43364 0.38416 0.57504 0.35203 0.98103 0.70966 0.83584 0.48638 0.44376 0.4335 0.90803 0.39961 0.78503 0.21394 0.17962 0.86526 0.34542 0.86536 0.92991 0.27821 0.79447 0.89732 0.68718 0.96949 0.91423 0.7045 0.09057 0.10436 0.34705 0.67405 0.28307 0.85872 0.4269 0.38403 0.73677 0.77404 0.96655 0.33727 0.65171 0.75579 0.97246 0.42304 0.546 0.18739 0 [...]
+0.34038 0.05737 0.82705 0.85986 0.06035 0.76271 0.89119 0.28764 0.93202 0.40069 0.48894 0.92015 0.56527 0.9559 0.25317 0.52983 0.98532 0.02192 0.75539 0.75015 0.89805 0.75284 0.22971 0.47088 0.12581 0.67884 0.757 0.95205 0.66215 0.44392 0.25259 0.75571 0.60202 0.56401 0.41344 0.47547 0.85942 0.5213 0.17844 0.41787 0.74015 0.46098 0.43597 0.78365 0.03958 0.80015 0.03347 0.61974 0.97335 0.36278 0.40022 0.78981 0.38338 0.67938 0.34913 0.57441 0.93223 0.40442 0.81047 0.73484 0.39618 0.81799  [...]
+0.56445 0.45432 0.64172 0.05938 0.25406 0.97975 0.03033 0.53295 0.62063 0.22541 0.90518 0.50492 0.20701 0.9395 0.25531 0.96665 0.10155 0.04242 0.64657 0.43506 0.94084 0.02733 0.70176 0.79412 0.29643 0.42116 0.8642 0.0906 0.9846 0.86343 0.07333 0.16084 0.9882 0.22708 0.00772 0.73922 0.75119 0.25127 0.37541 0.0887 0.71477 0.84679 0.75141 0.3247 0.00692 0.79957 0.04755 0.10854 0.05951 0.638 0.79012 0.75041 0.62038 0.66483 0.48643 0.92673 0.26259 0.93794 0.71054 0.06409 0.69718 0.66821 0.674 [...]
+0.71605 0.72887 0.84277 0.10719 0.5054 0.27849 0.19826 0.57909 0.69346 0.34299 0.5248 0.25746 0.30149 0.93932 0.76605 0.96155 0.87258 0.26932 0.55413 0.89864 0.01407 0.32767 0.32369 0.07494 0.30276 0.28167 0.58365 0.6475 0.09315 0.62675 0.96405 0.92382 0.56046 0.70746 0.53896 0.21774 0.46558 0.92947 0.48858 0.22231 0.53678 0.43348 0.83782 0.13713 0.08145 0.4771 0.67467 0.51827 0.16074 0.63355 0.58372 0.40008 0.49152 0.77322 0.00787 0.37705 0.35941 0.82349 0.91765 0.48355 0.86708 0.01206  [...]
+0.32445 0.1096 0.507 0.37148 0.07704 0.24056 0.41251 0.78594 0.35476 0.55962 0.18794 0.87086 0.6777 0.1274 0.22993 0.32839 0.8591 0.7207 0.06052 0.34952 0.57396 0.13143 0.20242 0.29741 0.16365 0.99734 0.26838 0.50705 0.35248 0.7358 0.48124 0.08514 0.62877 0.7426 0.47383 0.91423 0.22848 0.16402 0.37287 0.53924 0.66971 0.17476 0.9402 0.22475 0.58896 0.65166 0.73606 0.94217 0.52863 0.12501 0.43564 0.93153 0.21584 0.96679 0.30565 0.78679 0.64787 0.63957 0.1314 0.15669 0.04092 0.39454 0.62906 [...]
+0.02706 0.2333 0.52638 0.7769 0.70284 0.99844 0.88644 0.21117 0.85635 0.32725 0.52812 0.24851 0.13785 0.17486 0.14956 0.20753 0.47808 0.07349 0.6555 0.9755 0.55942 0.88711 0.54159 0.31058 0.20254 0.67118 0.78457 0.19894 0.50847 0.08033 0.18498 0.6347 0.44422 0.85976 0.85819 0.90084 0.35035 0.21823 0.98071 0.38606 0.03016 0.37264 0.80222 0.97718 0.71417 0.86842 0.55052 0.53948 0.14983 0.81298 0.42196 0.82609 0.16924 0.82811 0.70042 0.53672 0.31464 0.1224 0.74159 0.30366 0.3104 0.9108 0.70 [...]
+0.97716 0.78933 0.24968 0.19466 0.82551 0.89153 0.741 0.8871 0.08002 0.0161 0.37676 0.24274 0.65743 0.62472 0.44896 0.06215 0.61174 0.78371 0.36663 0.33846 0.97952 0.52909 0.86774 0.93319 0.72945 0.61406 0.73436 0.04612 0.09186 0.48818 0.82718 0.60851 0.55265 0.93066 0.67344 0.20826 0.24281 0.20937 0.06974 0.27072 0.37351 0.68091 0.78839 0.14913 0.08296 0.99162 0.71629 0.29597 0.49648 0.80141 0.85127 0.02248 0.94424 0.53956 0.76184 0.7972 0.24773 0.24947 0.10855 0.30339 0.08112 0.79211 0 [...]
+0.13623 0.4955 0.54181 0.98389 0.04986 0.77059 0.4262 0.27158 0.89759 0.30428 0.37215 0.44622 0.39835 0.71011 0.09994 0.08499 0.5823 0.3842 0.45055 0.55907 0.17073 0.51807 0.62536 0.75476 0.17111 0.73744 0.6817 0.82012 0.68703 0.97017 0.05408 0.9998 0.72875 0.18963 0.12221 0.71073 0.63531 0.83382 0.11393 0.3833 0.07851 0.66595 0.19733 0.43314 0.25248 0.33242 0.06592 0.65044 0.8756 0.12722 0.64693 0.89443 0.05451 0.47787 0.94134 0.45992 0.10218 0.48002 0.96134 0.58231 0.19476 0.97112 0.73 [...]
+0.11792 0.80721 0.92316 0.53679 0.23616 0.97098 0.63893 0.55086 0.3699 0.94154 0.55557 0.78736 0.31999 0.27507 0.76036 0.68804 0.56439 0.66896 0.84101 0.31927 0.39788 0.16775 0.96181 0.61806 0.2793 0.86946 0.15454 0.30206 0.03446 0.85634 0.29669 0.27423 0.91696 0.74211 0.26118 0.17843 0.25494 0.22845 0.42877 0.93122 0.71335 0.05171 0.75833 0.73565 0.54513 0.55604 0.70559 0.72534 0.53657 0.96711 0.72386 0.74385 0.52003 0.96742 0.88021 0.12793 0.24811 0.1881 0.58611 0.84415 0.8177 0.99076  [...]
+0.37809 0.98976 0.87247 0.45064 0.10198 0.99761 0.35457 0.386 0.68361 0.95699 0.95594 0.71518 0.22786 0.22382 0.81861 0.48021 0.79644 0.21393 0.73784 0.12397 0.28331 0.95646 0.08775 0.4241 0.10772 0.37459 0.6298 0.07249 0.58277 0.41352 0.06174 0.05515 0.26995 0.45165 0.28433 0.6588 0.83139 0.70978 0.70999 0.10671 0.58166 0.15648 0.99742 0.12364 0.75041 0.69596 0.05987 0.69588 0.96159 0.20579 0.80548 0.81899 0.92542 0.42154 0.50488 0.38069 0.03697 0.38917 0.08076 0.96147 0.56804 0.83156 0 [...]
+0.82928 0.56896 0.14673 0.02247 0.47971 0.81048 0.87054 0.93187 0.616 0.30208 0.26454 0.60315 0.27123 0.13384 0.43543 0.0849 0.5976 0.14023 0.40125 0.61436 0.43282 0.31974 0.79223 0.45714 0.26485 0.27532 0.9074 0.80512 0.24168 0.26033 0.36036 0.99231 0.1671 0.3303 0.89798 0.13432 0.00702 0.72932 0.52802 0.86759 0.62221 0.23806 0.92299 0.30395 0.29811 0.33967 0.46498 0.7769 0.22095 0.98771 0.75201 0.97871 0.77277 0.41984 0.68565 0.10997 0.21191 0.30839 0.00738 0.6981 0.24927 0.44355 0.542 [...]
+0.45485 0.68941 0.27512 0.05985 0.18388 0.96981 0.64422 0.74822 0.10129 0.43719 0.80817 0.37246 0.56097 0.4766 0.54166 0.36025 0.41656 0.49971 0.69304 0.11044 0.38928 0.82382 0.75465 0.19696 0.11687 0.82956 0.38804 0.08668 0.15141 0.0546 0.61216 0.7661 0.66323 0.30031 0.55192 0.46304 0.80396 0.39318 0.11823 0.07287 0.34919 0.30014 0.74061 0.37687 0.41975 0.41563 0.49189 0.73946 0.54525 0.87657 0.48934 0.33021 0.33853 0.50847 0.52528 0.46088 0.02388 0.17403 0.49847 0.78441 0.77487 0.31511 [...]
+0.03674 0.73692 0.67776 0.2417 0.23917 0.51964 0.6152 0.36694 0.95652 0.72459 0.44064 0.66671 0.39153 0.22441 0.87169 0.79628 0.80884 0.91949 0.6793 0.78554 0.8848 0.18441 0.77974 0.78656 0.72471 0.94415 0.48011 0.38994 0.24794 0.06442 0.86756 0.6751 0.88206 0.54884 0.40223 0.58705 0.99062 0.52348 0.81884 0.47803 0.06954 0.57147 0.36587 0.04378 0.96418 0.37944 0.82896 0.62115 0.71988 0.4394 0.24545 0.99186 0.46824 0.72211 0.97048 0.12139 0.56549 0.40972 0.98319 0.20327 0.16488 0.75204 0. [...]
+0.29876 0.34609 0.78773 0.63078 0.02624 0.62078 0.31761 0.46897 0.43036 0.14867 0.85639 0.53674 0.06591 0.37297 0.25682 0.03011 0.85362 0.16303 0.23772 0.08666 0.36215 0.46182 0.22298 0.53321 0.08329 0.50482 0.1812 0.20695 0.15799 0.6034 0.71784 0.00263 0.17562 0.01128 0.43362 0.18108 0.4473 0.23532 0.20956 0.89461 0.5983 0.71487 0.69545 0.5943 0.12987 0.5916 0.43076 0.42011 0.80078 0.13017 0.06343 0.42788 0.84224 0.84277 0.82674 0.80005 0.94673 0.44441 0.96121 0.07137 0.19015 0.43941 0. [...]
+0.72464 0.74292 0.23217 0.35454 0.85888 0.16672 0.90924 0.84852 0.98356 0.19394 0.2356 0.00628 0.43585 0.37938 0.63814 0.68023 0.4674 0.6305 0.45161 0.11592 0.497 0.59406 0.93133 0.50294 0.01481 0.73621 0.03918 0.91757 0.61679 0.34665 0.00674 0.91596 0.21811 0.60781 0.22961 0.77049 0.30265 0.0149 0.30067 0.16998 0.18193 0.76068 0.19837 0.39016 0.98311 0.77949 0.96058 0.58991 0.87455 0.93682 0.65857 0.15542 0.20459 0.81261 0.38315 0.19821 0.39351 0.60239 0.60879 0.40454 0.03394 0.98922 0. [...]
+0.87649 0.08524 0.73018 0.99313 0.85939 0.91634 0.32345 0.74553 0.40422 0.3461 0.99257 0.85855 0.65354 0.39585 0.79333 0.35289 0.68962 0.49867 0.52953 0.43183 0.85931 0.04367 0.11199 0.81886 0.28285 0.37988 0.02237 0.74101 0.94032 0.74237 0.77779 0.85299 0.08794 0.60559 0.93219 0.80875 0.0438 0.27209 0.66705 0.71865 0.03327 0.16459 0.04772 0.36005 0.56446 0.30787 0.95133 0.4562 0.14307 0.5826 0.49618 0.49729 0.43815 0.99489 0.55553 0.74745 0.63772 0.29936 0.1793 0.64097 0.93286 0.26892 0 [...]
+0.82068 0.54105 0.34674 0.48892 0.96045 0.83587 0.82562 0.08231 0.93099 0.17682 0.25178 0.96337 0.15718 0.97718 0.96137 0.60224 0.44646 0.54418 0.98679 0.03023 0.32465 0.71673 0.36506 0.44407 0.48378 0.04828 0.45545 0.52866 0.47397 0.96622 0.36796 0.37782 0.34457 0.8691 0.96671 0.90776 0.95569 0.43426 0.43003 0.7664 0.56975 0.30025 0.65964 0.85698 0.07405 0.98074 0.51876 0.12833 0.76633 0.69194 0.33896 0.34991 0.20956 0.7936 0.8531 0.70024 0.6391 0.12649 0.6829 0.62022 0.73073 0.69955 0. [...]
+0.11812 0.29897 0.99533 0.58992 0.88232 0.08798 0.96109 0.65395 0.37017 0.18173 0.82877 0.66906 0.87449 0.26705 0.69048 0.20557 0.34067 0.81371 0.72212 0.3084 0.14702 0.92177 0.88018 0.62691 0.61985 0.35761 0.90117 0.65457 0.02057 0.19883 0.98846 0.91923 0.28579 0.34794 0.97883 0.10816 0.86502 0.57731 0.67079 0.27354 0.66558 0.77298 0.09032 0.67524 0.38159 0.45031 0.62017 0.4178 0.63326 0.77159 0.35954 0.46089 0.80509 0.3748 0.59856 0.38707 0.43004 0.15325 0.97148 0.47759 0.24668 0.21391 [...]
+0.72907 0.25974 0.65357 0.03042 0.32214 0.97402 0.43182 0.87933 0.2307 0.70816 0.08182 0.5855 0.4397 0.22204 0.2335 0.74936 0.63922 0.86389 0.73446 0.90599 0.94798 0.46082 0.31127 0.90292 0.44576 0.31671 0.82575 0.47202 0.38386 0.04541 0.42904 0.47132 0.36338 0.38275 0.16795 0.11423 0.99585 0.73572 0.19264 0.56685 0.01427 0.1334 0.83394 0.46346 0.54635 0.33135 0.04909 0.5043 0.97154 0.93067 0.62497 0.34812 0.24553 0.28197 0.55447 0.51892 0.91208 0.98557 0.66381 0.72535 0.49792 0.54908 0. [...]
+0.4326 0.57543 0.6008 0.17124 0.19228 0.06838 0.28397 0.26968 0.31125 0.44467 0.61869 0.0583 0.60647 0.39657 0.23915 0.24805 0.26176 0.00484 0.20262 0.96822 0.87553 0.32042 0.91513 0.78306 0.50705 0.2945 0.61694 0.40586 0.24251 0.89203 0.72508 0.60804 0.2019 0.96173 0.74287 0.77797 0.44405 0.07698 0.36248 0.88055 0.81725 0.76209 0.55637 0.90823 0.10933 0.87253 0.70353 0.49826 0.2401 0.30351 0.65453 0.94888 0.37589 0.64373 0.43401 0.63233 0.82215 0.18812 0.71475 0.40138 0.13627 0.45936 0. [...]
+0.42711 0.22274 0.44814 0.1167 0.37558 0.77626 0.68819 0.53382 0.84309 0.40037 0.21946 0.49319 0.2521 0.87394 0.52568 0.84853 0.40437 0.08993 0.64514 0.08211 0.47852 0.21165 0.70185 0.50445 0.35345 0.61108 0.52747 0.09401 0.49355 0.15735 0.35705 0.2215 0.00825 0.06764 0.58831 0.06607 0.46831 0.77273 0.80445 0.22044 0.16295 0.51783 0.9785 0.67082 0.62392 0.94403 0.42129 0.33135 0.92551 0.34356 0.98906 0.06639 0.00724 0.74262 0.34264 0.50137 0.26226 0.11899 0.09797 0.05836 0.65173 0.23355  [...]
+0.39668 0.14166 0.26849 0.1617 0.82876 0.31083 0.18215 0.83779 0.36605 0.40057 0.88849 0.67252 0.867 0.527 0.50567 0.05822 0.82719 0.83745 0.763 0.18834 0.75641 0.34385 0.38989 0.30285 0.48746 0.27259 0.74153 0.67488 0.63986 0.84579 0.66645 0.24524 0.34017 0.85543 0.66637 0.18747 0.62071 0.62303 0.21254 0.28994 0.81324 0.93191 0.4128 0.85539 0.85866 0.12143 0.89104 0.25803 0.7914 0.81517 0.13935 0.32385 0.41925 0.71472 0.41314 0.72628 0.75339 0.27601 0.2047 0.75204 0.40188 0.00354 0.0514 [...]
+0.16415 0.5384 0.12627 0.87167 0.71246 0.57368 0.12891 0.3866 0.99264 0.25105 0.44306 0.47645 0.6724 0.78649 0.57122 0.67724 0.28021 0.8873 0.8502 0.35058 0.3201 0.22729 0.28486 0.7774 0.08072 0.57095 0.63 0.12264 0.68402 0.99551 0.80773 0.83382 0.67417 0.28015 0.75972 0.83945 0.63561 0.93532 0.74438 0.81402 0.45116 0.4079 0.6849 0.36569 0.35523 0.67879 0.77482 0.97385 0.79993 0.68237 0.79685 0.52622 0.17163 0.10205 0.53824 0.44088 0.92201 0.09453 0.47514 0.21433 0.00479 0.47781 0.42302  [...]
+0.73527 0.63497 0.77699 0.27699 0.5179 0.82249 0.43125 0.14498 0.87395 0.89559 0.00712 0.43805 0.47213 0.21807 0.54678 0.38301 0.56697 0.82081 0.32826 0.66033 0.8118 0.06091 0.99864 0.78106 0.71854 0.75377 0.59964 0.16143 0.48738 0.20288 0.29293 0.63673 0.75353 0.37596 0.31128 0.80012 0.38072 0.17897 0.281 0.44503 0.11076 0.64841 0.70304 0.29854 0.01369 0.57158 0.46855 0.81182 0.58182 0.26271 0.23388 0.72632 0.58025 0.41297 0.53914 0.28353 0.62417 0.13134 0.06653 0.21369 0.58113 0.909 0. [...]
+0.0689 0.34506 0.87997 0.69175 0.58427 0.0477 0.52037 0.49633 0.52755 0.89598 0.33252 0.99077 0.24957 0.94205 0.06743 0.57964 0.19508 0.852 0.69007 0.64111 0.26739 0.22773 0.19137 0.40997 0.3868 0.51493 0.37235 0.69181 0.3144 0.46205 0.73891 0.43708 0.7165 0.59497 0.89795 0.24379 0.70928 0.08233 0.35221 0.38123 0.38413 0.63597 0.53885 0.66293 0.06976 0.16465 0.45786 0.99851 0.02198 0.93111 0.45786 0.13129 0.52614 0.56553 0.62892 0.57049 0.11333 0.48763 0.00896 0.18823 0.86843 0.32003 0.5 [...]
+0.15525 0.77827 0.06788 0.44785 0.79109 0.83635 0.73861 0.02512 0.86151 0.43528 0.01214 0.36882 0.86187 0.05814 0.25751 0.82198 0.10668 0.81615 0.34126 0.88145 0.91732 0.95358 0.45227 0.56004 0.78948 0.64871 0.97248 0.17031 0.58669 0.81263 0.29739 0.60297 0.73893 0.22226 0.46229 0.7818 0.99055 0.01586 0.75458 0.75313 0.68123 0.49108 0.74062 0.62076 0.91571 0.64692 0.84266 0.73954 0.55508 0.87912 0.55172 0.57298 0.74185 0.12836 0.09454 0.64704 0.92774 0.87962 0.34192 0.13865 0.87096 0.359 [...]
+0.00245 0.27455 0.5899 0.91756 0.44039 0.04533 0.89588 0.66964 0.66333 0.32396 0.28574 0.84401 0.48269 0.02992 0.12762 0.70445 0.65897 0.05551 0.39393 0.9353 0.80401 0.01779 0.19264 0.21224 0.04497 0.54618 0.57247 0.82346 0.78705 0.564 0.09316 0.58118 0.50319 0.54116 0.55477 0.59968 0.28124 0.97668 0.50059 0.60478 0.30565 0.0188 0.67662 0.90943 0.28871 0.00343 0.67849 0.41454 0.50024 0.6533 0.15209 0.52599 0.95617 0.59944 0.12176 0.57554 0.88617 0.37508 0.33181 0.08254 0.3816 0.88022 0.8 [...]
+0.13025 0.30546 0.63121 0.20617 0.98236 0.50893 0.90446 0.06142 0.25179 0.10347 0.24108 0.27917 0.93754 0.50487 0.44158 0.05758 0.83708 0.39084 0.34279 0.21883 0.41964 0.36432 0.77486 0.76086 0.39063 0.6297 0.95513 0.02329 0.52803 0.06723 0.40191 0.5125 0.02494 0.59361 0.93819 0.38136 0.03194 0.42895 0.3347 0.62607 0.11084 0.78341 0.4246 0.89956 0.98583 0.35168 0.87138 0.25106 0.67302 0.24091 0.10812 0.28359 0.93647 0.82496 0.02548 0.85376 0.07703 0.85823 0.03298 0.46123 0.02498 0.84893  [...]
+0.34426 0.80844 0.95967 0.47863 0.34063 0.46188 0.38264 0.16666 0.33693 0.26365 0.8546 0.98594 0.67322 0.2916 0.66172 0.91676 0.19554 0.31272 0.74539 0.03717 0.3396 0.40572 0.99614 0.23479 0.91638 0.4291 0.43504 0.64931 0.44743 0.84521 0.14374 0.24144 0.71182 0.10058 0.96573 0.19796 0.7983 0.43774 0.10254 0.14696 0.26124 0.22429 0.38578 0.91151 0.28316 0.51538 0.43782 0.52607 0.72362 0.26104 0.5323 0.30381 0.97843 0.91474 0.69875 0.7042 0.8736 0.43546 0.7008 0.30791 0.16299 0.60533 0.075 [...]
+0.6718 0.98101 0.88655 0.01637 0.44403 0.3404 0.31215 0.32547 0.91252 0.97341 0.66905 0.94632 0.43393 0.63597 0.89708 0.80991 0.06703 0.54416 0.69801 0.2098 0.17026 0.4958 0.74867 0.1545 0.77921 0.40165 0.1872 0.9539 0.32834 0.88156 0.23058 0.7347 0.16861 0.14816 0.50366 0.68991 0.33867 0.11143 0.24368 0.79205 0.87771 0.7774 0.85841 0.74613 0.47625 0.48319 0.87191 0.26015 0.60632 0.8324 0.76999 0.73023 0.01384 0.71364 0.91116 0.75897 0.56872 0.35529 0.62122 0.62343 0.28852 0.32864 0.7814 [...]
+0.67467 0.14143 0.02069 0.00776 0.26458 0.5375 0.98425 0.2766 0.09475 0.83918 0.52489 0.39249 0.16932 0.70323 0.36297 0.86411 0.1859 0.65221 0.58057 0.42059 0.29244 0.62074 0.49878 0.06926 0.4704 0.53173 0.51323 0.48704 0.31854 0.7091 0.27951 0.2398 0.05619 0.87609 0.29499 0.59345 0.67957 0.49126 0.58284 0.52615 0.17512 0.79833 0.18692 0.63756 0.64062 0.3804 0.4516 0.0816 0.15488 0.43993 0.43807 0.04703 0.83093 0.25521 0.3985 0.43219 0.2999 0.4713 0.65687 0.48545 0.30152 0.13148 0.84541  [...]
+0.70567 0.39891 0.10618 0.05968 0.54802 0.28817 0.48357 0.92206 0.4536 0.94605 0.88561 0.52702 0.166 0.60994 0.48264 0.96111 0.33203 0.38577 0.33935 0.46775 0.82408 0.17224 0.19804 0.6071 0.43498 0.42442 0.5493 0.06721 0.25143 0.37939 0.7941 0.36979 0.38966 0.52897 0.04022 0.59888 0.02427 0.24468 0.83244 0.64716 0.38965 0.13506 0.0577 0.75386 0.07808 0.86154 0.72406 0.8655 0.2467 0.77136 0.68038 0.40638 0.46811 0.49559 0.79535 0.62996 0.97813 0.13214 0.51474 0.4272 0.92267 0.08763 0.4263 [...]
+0.01166 0.64169 0.42273 0.61937 0.37633 0.82418 0.90359 0.02636 0.86247 0.57744 0.43157 0.23426 0.15933 0.32765 0.81904 0.05892 0.75924 0.54963 0.27042 0.72784 0.29511 0.43594 0.18873 0.22515 0.82814 0.36783 0.53922 0.81908 0.83748 0.29889 0.50604 0.97092 0.62684 0.45481 0.78647 0.59909 0.59031 0.08703 0.8467 0.1845 0.85227 0.65144 0.2635 0.03184 0.60084 0.89795 0.01946 0.90761 0.62445 0.2222 0.26274 0.69169 0.61051 0.21257 0.97165 0.60724 0.25902 0.97577 0.22815 0.52619 0.39881 0.23846  [...]
+0.77228 0.95103 0.67354 0.26938 0.17013 0.18475 0.5164 0.39514 0.92235 0.46254 0.80172 0.58789 0.42845 0.30859 0.11225 0.63343 0.73338 0.86944 0.84568 0.46433 0.79911 0.92648 0.38886 0.23637 0.91917 0.78603 0.41467 0.39225 0.47256 0.86771 0.55998 0.9908 0.45316 0.93425 0.66006 0.93699 0.44548 0.99546 0.39008 0.6012 0.26542 0.44569 0.58382 0.01566 0.1694 0.09539 0.93938 0.69106 0.85031 0.40907 0.89258 0.07674 0.81349 0.17017 0.37816 0.91916 0.15167 0.45185 0.38688 0.30095 0.22928 0.18104  [...]
+0.48779 0.97543 0.93965 0.68668 0.98311 0.28172 0.96361 0.38196 0.00567 0.90764 0.16354 0.47067 0.86864 0.85231 0.23134 0.65454 0.94273 0.79908 0.36251 0.53365 0.41768 0.21218 0.62931 0.75598 0.94482 0.4806 0.66628 0.04897 0.52361 0.92707 0.3908 0.0751 0.05106 0.44299 0.24434 0.47072 0.19469 0.05058 0.18642 0.29139 0.8169 0.40553 0.71294 0.26433 0.24574 0.99175 0.1855 0.4719 0.56716 0.96323 0.29021 0.39489 0.6399 0.58301 0.13663 0.83798 0.38709 0.60455 0.02828 0.87623 0.07452 0.62062 0.0 [...]
+0.28823 0.53673 0.41956 0.1775 0.80035 0.47469 0.08057 0.50039 0.60375 0.18104 0.97494 0.34652 0.49972 0.79158 0.01199 0.33584 0.12901 0.3832 0.56955 0.96042 0.8406 0.69961 0.87332 0.92402 0.02959 0.93049 0.11286 0.24941 0.348 0.33704 0.83244 0.25583 0.15135 0.82662 0.79681 0.18531 0.68905 0.84591 0.98605 0.97555 0.77465 0.28553 0.18488 0.52137 0.55502 0.07885 0.61175 0.41198 0.34608 0.84426 0.71864 0.51009 0.35841 0.77072 0.4744 0.31589 0.2156 0.63958 0.38404 0.27633 0.43853 0.04342 0.9 [...]
+0.52082 0.82237 0.20147 0.72158 0.67554 0.00443 0.22031 0.94974 0.52415 0.44304 0.36732 0.99198 0.91817 0.29496 0.93474 0.55766 0.98194 0.05842 0.74161 0.29683 0.97186 0.82188 0.60216 0.51766 0.69774 0.86769 0.94226 0.89807 0.70854 0.73174 0.76841 0.76234 0.51315 0.42442 0.18765 0.79418 0.34869 0.1419 0.08891 0.71857 0.27765 0.80405 0.79577 0.95059 0.16328 0.24629 0.99988 0.35644 0.07636 0.5545 0.10924 0.98242 0.04644 0.20788 0.55937 0.75076 0.68171 0.14028 0.84279 0.63614 0.2477 0.96083 [...]
+0.09532 0.19431 0.90189 0.25006 0.15987 0.92235 0.1879 0.61039 0.95032 0.53882 0.08037 0.63308 0.15168 0.02985 0.68305 0.89642 0.73038 0.80177 0.66089 0.62534 0.33078 0.01079 0.05174 0.81439 0.19465 0.69546 0.84168 0.89371 0.5382 0.19127 0.82865 0.29047 0.64764 0.79206 0.75088 0.31991 0.40911 0.07076 0.81033 0.41741 0.43501 0.39609 0.06958 0.4874 0.47137 0.92377 0.71484 0.39279 0.25595 0.91072 0.8383 0.70218 0.92178 0.15788 0.85417 0.14899 0.80771 0.59911 0.35147 0.27225 0.68857 0.1184 0 [...]
+0.04751 0.37504 0.96717 0.21257 0.55038 0.19549 0.99506 0.97288 0.29932 0.69232 0.85978 0.4753 0.73475 0.00805 0.59643 0.72618 0.50514 0.34423 0.7674 0.98798 0.73681 0.91723 0.74142 0.6471 0.12511 0.73677 0.72034 0.9184 0.78704 0.52586 0.3534 0.38916 0.68493 0.99723 0.35302 0.8541 0.89402 0.50264 0.70011 0.92086 0.3981 0.29299 0.68584 0.06143 0.46337 0.26396 0.21787 0.01241 0.60781 0.77015 0.87086 0.61483 0.86281 0.89416 0.99436 0.5946 0.48774 0.30834 0.97871 0.17926 0.83437 0.48951 0.55 [...]
+0.79026 0.33528 0.20253 0.15707 0.33875 0.59593 0.6163 0.60491 0.47057 0.98371 0.28145 0.22777 0.20898 0.68798 0.65826 0.24512 0.03673 0.09819 0.21109 0.47945 0.2393 0.58562 0.68144 0.4265 0.2612 0.04879 0.90178 0.95565 0.51708 0.56008 0.26779 0.16969 0.14039 0.77233 0.67761 0.32608 0.41998 0.29848 0.80641 0.35109 0.87557 0.91112 0.24305 0.70854 0.19067 0.76308 0.132 0.41793 0.51431 0.56227 0.26833 0.85403 0.27911 0.57989 0.95231 0.45941 0.77715 0.75765 0.53013 0.55838 0.75916 0.9622 0.6 [...]
+0.18786 0.75996 0.2164 0.40395 0.11124 0.17662 0.82717 0.48191 0.22465 0.5145 0.35182 0.79678 0.69787 0.37662 0.81728 0.59173 0.77663 0.52538 0.36286 0.94389 0.11606 0.77955 0.70458 0.13017 0.97803 0.4534 0.43318 0.0423 0.70112 0.00797 0.62873 0.25722 0.71966 0.08238 0.58905 0.11504 0.62536 0.51346 0.23831 0.00622 0.20114 0.01532 0.77383 0.11741 0.58372 0.49008 0.83584 0.54852 0.1943 0.23776 0.16768 0.21855 0.43907 0.12099 0.75683 0.95193 0.19185 0.98462 0.05298 0.96108 0.6342 0.68389 0. [...]
+0.42403 0.8823 0.81861 0.47815 0.06663 0.57186 0.1247 0.69549 0.44423 0.33743 0.37479 0.8201 0.31703 0.08297 0.25088 0.50731 0.13311 0.80878 0.59898 0.45457 0.73479 0.0551 0.89986 0.32562 0.76894 0.2985 0.7497 0.03583 0.09826 0.91804 0.42779 0.59606 0.27517 0.22906 0.74852 0.11307 0.21697 0.50877 0.14935 0.46604 0.23813 0.66094 0.98055 0.32783 0.14058 0.39877 0.37552 0.18659 0.13564 0.13053 0.64728 0.3134 0.33355 0.56943 0.24672 0.21079 0.89105 0.80912 0.8638 0.48332 0.90763 0.63097 0.67 [...]
+0.3614 0.07583 0.32258 0.30659 0.70733 0.28888 0.58944 0.67077 0.94005 0.65465 0.18103 0.15449 0.71824 0.59648 0.32211 0.05838 0.26788 0.49714 0.95437 0.9076 0.18438 0.24711 0.25436 0.03675 0.4078 0.79544 0.94592 0.96611 0.83284 0.87396 0.05138 0.4759 0.01228 0.79321 0.93992 0.50781 0.7202 0.24374 0.22105 0.30538 0.42808 0.79195 0.21961 0.53877 0.5737 0.52204 0.78994 0.12153 0.77697 0.86491 0.42515 0.81265 0.82834 0.19458 0.92408 0.05621 0.69761 0.2222 0.53325 0.5531 0.01834 0.28972 0.13 [...]
+0.79531 0.62197 0.3345 0.07753 0.60373 0.46609 0.53797 0.3177 0.65617 0.95653 0.54338 0.82487 0.14161 0.34133 0.31164 0.75398 0.92775 0.84723 0.10625 0.89726 0.37804 0.42709 0.38053 0.19444 0.74721 0.31315 0.29829 0.54502 0.16413 0.83167 0.63775 0.35329 0.26462 0.26077 0.60409 0.11366 0.5942 0.05097 0.04856 0.74037 0.49852 0.70292 0.8294 0.38217 0.2767 0.40792 0.73723 0.69419 0.49786 0.20829 0.85636 0.60034 0.71564 0.03031 0.39701 0.38168 0.40957 0.90471 0.00557 0.81067 0.95144 0.78181 0 [...]
+0.59777 0.52888 0.29252 0.62181 0.72013 0.07325 0.36281 0.73197 0.4155 0.56077 0.58553 0.82304 0.4195 0.76527 0.01818 0.71272 0.09186 0.9047 0.09367 0.74198 0.74958 0.71832 0.94785 0.82191 0.94867 0.1493 0.88622 0.65902 0.73033 0.80258 0.60345 0.74831 0.74252 0.989 0.77234 0.98629 0.75642 0.77884 0.92692 0.58884 0.91794 0.22108 0.93299 0.63512 0.10445 0.28468 0.23539 0.55978 0.54403 0.43666 0.21683 0.3182 0.31175 0.16089 0.53569 0.28578 0.15448 0.74472 0.93857 0.22961 0.90404 0.91655 0.3 [...]
+0.66676 0.51344 0.84187 0.28341 0.16802 0.71251 0.48204 0.62246 0.10646 0.0164 0.40849 0.8946 0.8994 0.21852 0.49523 0.94128 0.77214 0.23319 0.97275 0.66259 0.68435 0.73674 0.78026 0.96988 0.95331 0.93437 0.13034 0.29902 0.54319 0.33338 0.27278 0.21246 0.34524 0.81096 0.40828 0.42004 0.72109 0.2176 0.35804 0.90453 0.65178 0.79004 0.65602 0.44199 0.38634 0.94805 0.92911 0.46978 0.0262 0.06967 0.29138 0.61975 0.09184 0.82447 0.98758 0.73621 0.71701 0.34878 0.22401 0.23628 0.9594 0.12679 0. [...]
+0.30913 0.49832 0.42183 0.07734 0.5798 0.54667 0.78964 0.98596 0.54985 0.25393 0.99739 0.32551 0.6023 0.98499 0.80001 0.9148 0.99242 0.51557 0.45314 0.95526 0.55567 0.25449 0.49114 0.6322 0.164 0.41212 0.61763 0.43482 0.9647 0.74768 0.97787 0.64894 0.36334 0.96519 0.96371 0.03269 0.56273 0.21993 0.73814 0.10786 0.93369 0.27272 0.7956 0.80596 0.40927 0.60313 0.09422 0.80984 0.25327 0.60421 0.44871 0.22002 0.25754 0.74406 0.61513 0.63723 0.4906 0.38025 0.74079 0.14078 0.34327 0.12831 0.056 [...]
+0.55569 0.8052 0.06482 0.82383 0.45264 0.52633 0.10581 0.58971 0.17931 0.09877 0.8034 0.48595 0.25188 0.91971 0.77985 0.22996 0.56944 0.29399 0.25616 0.18966 0.60276 0.08247 0.20948 0.89792 0.72696 0.37993 0.51067 0.9037 0.08024 0.80572 0.52296 0.95252 0.17104 0.97131 0.10695 0.26735 0.07935 0.5728 0.16788 0.09119 0.55706 0.86305 0.01614 0.41842 0.53248 0.07582 0.79304 0.91674 0.06976 0.93138 0.2041 0.46087 0.8695 0.41721 0.94828 0.74498 0.15652 0.26093 0.99103 0.46782 0.01038 0.97275 0. [...]
+0.47039 0.67107 0.11397 0.1826 0.90329 0.67679 0.02486 0.62072 0.0219 0.21084 0.91752 0.17045 0.18357 0.68092 0.19527 0.32091 0.33017 0.74333 0.95968 0.65793 0.76117 0.37527 0.19798 0.35479 0.27331 0.59991 0.2883 0.44769 0.51613 0.39856 0.62616 0.47669 0.54109 0.62287 0.11927 0.91797 0.61004 0.54904 0.57752 0.9333 0.41235 0.34359 0.18588 0.25141 0.01065 0.34515 0.76913 0.07306 0.93649 0.88834 0.81728 0.00723 0.54048 0.36635 0.82139 0.35818 0.93052 0.82484 0.54862 0.62352 0.46777 0.47045  [...]
+0.38357 0.27453 0.7527 0.83751 0.12898 0.2342 0.11507 0.96052 0.68064 0.59694 0.01472 0.96418 0.3948 0.9208 0.62656 0.16814 0.8303 0.84321 0.02683 0.61683 0.87236 0.44144 0.26637 0.51611 0.6384 0.84397 0.65884 0.16848 0.61582 0.67274 0.08233 0.87363 0.50366 0.74546 0.657 0.59816 0.87306 0.94448 0.98991 0.46494 0.30387 0.92447 0.47152 0.3983 0.26481 0.7876 0.71769 0.67595 0.9138 0.78419 0.40602 0.31001 0.66588 0.48078 0.05563 0.56197 0.99161 0.89386 0.83668 0.13221 0.76968 0.59789 0.58395 [...]
+0.26021 0.92466 0.08955 0.53253 0.21938 0.67229 0.26386 0.20276 0.53016 0.51055 0.55164 0.70885 0.46711 0.49775 0.09174 0.9885 0.73087 0.35279 0.76602 0.96396 0.92028 0.01802 0.04441 0.55508 0.50847 0.37216 0.42386 0.06726 0.65811 0.13781 0.68522 0.86992 0.572 0.73594 0.10417 0.3848 0.52116 0.08434 0.6938 0.27459 0.06358 0.31594 0.35838 0.73035 0.27983 0.53415 0.96565 0.28571 0.27713 0.96349 0.09811 0.01458 0.04969 0.71197 0.55328 0.41473 0.06576 0.62596 0.5702 0.56356 0.81835 0.34132 0. [...]
+0.53192 0.71461 0.03292 0.32394 0.50252 0.92631 0.24082 0.04873 0.2017 0.40145 0.32688 0.58824 0.18981 0.64958 0.87749 0.84722 0.50039 0.29919 0.2438 0.923 0.07765 0.42136 0.71087 0.32777 0.52736 0.26506 0.64723 0.37938 0.40933 0.1572 0.58295 0.0231 0.91796 0.54097 0.33087 0.86407 0.98511 0.94325 0.24179 0.87638 0.34851 0.32739 0.5803 0.04571 0.969 0.08252 0.28099 0.51273 0.18535 0.01475 0.47243 0.12411 0.65466 0.70904 0.57626 0.86764 0.60422 0.32449 0.87757 0.35714 0.67986 0.9599 0.5490 [...]
+0.72552 0.69788 0.82709 0.92481 0.78541 0.66202 0.15403 0.25652 0.51618 0.31473 0.09946 0.06723 0.95366 0.13126 0.13725 0.58842 0.0055 0.63713 0.2512 0.65581 0.15244 0.33041 0.90443 0.00093 0.97137 0.82094 0.56775 0.4994 0.39463 0.57262 0.02394 0.71004 0.45414 0.24737 0.85078 0.11059 0.23709 0.13218 0.1021 0.14806 0.05946 0.88716 0.74232 0.1172 0.12526 0.01594 0.54268 0.6229 0.04123 0.29557 0.75741 0.0278 0.64165 0.11721 0.43322 0.18746 0.82299 0.53343 0.95424 0.33903 0.47885 0.40215 0.6 [...]
+0.30014 0.96582 0.4736 0.5032 0.11368 0.8456 0.0039 0.15929 0.17091 0.94873 0.68611 0.73073 0.02694 0.28957 0.48006 0.50931 0.13153 0.50242 0.13182 0.87983 0.69688 0.92225 0.06514 0.10665 0.63888 0.15243 0.26515 0.50339 0.66193 0.47055 0.77178 0.73149 0.03073 0.3014 0.36259 0.43433 0.64506 0.40208 0.25744 0.0808 0.78956 0.28042 0.25325 0.51833 0.26741 0.49984 0.81102 0.22083 0.75238 0.85954 0.42624 0.39432 0.90413 0.99131 0.57331 0.42714 0.41623 0.15655 0.86123 0.47459 0.03445 0.87284 0. [...]
+0.23899 0.17396 0.46673 0.02262 0.53339 0.50506 0.75924 0.27217 0.42701 0.56921 0.76712 0.41615 0.477 0.85015 0.49401 0.94802 0.00268 0.83326 0.13356 0.11432 0.56992 0.68979 0.27239 0.9866 0.35056 0.22155 0.97869 0.19971 0.78904 0.94592 0.8596 0.61649 0.64157 0.91319 0.26107 0.90608 0.58058 0.56785 0.05058 0.35855 0.84302 0.56754 0.17261 0.14481 0.71858 0.16568 0.34114 0.67961 0.63084 0.35808 0.1386 0.10911 0.10784 0.38035 0.03742 0.6207 0.48048 0.59565 0.75877 0.06117 0.99828 0.32996 0. [...]
+0.03303 0.5662 0.70254 0.80767 0.14566 0.46298 0.4387 0.69669 0.36711 0.27676 0.67443 0.40301 0.31722 0.81337 0.88907 0.70406 0.29825 0.80857 0.3687 0.48156 0.4404 0.76775 0.67302 0.74313 0.8 0.07183 0.1906 0.8029 0.80008 0.78411 0.90669 0.01831 0.55627 0.87483 0.27068 0.69896 0.12408 0.0888 0.10529 0.61438 0.69981 0.1363 0.97449 0.84233 0.72502 0.81756 0.32953 0.7251 0.54757 0.37913 0.82086 0.43887 0.47962 0.23316 0.80142 0.5433 0.35858 0.43415 0.49274 0.79544 0.48762 0.42844 0.50334 0. [...]
+0.76962 0.43516 0.36562 0.32734 0.58813 0.43441 0.81924 0.66177 0.01208 0.68036 0.0164 0.66233 0.47279 0.67209 0.00693 0.91559 0.88239 0.21214 0.75869 0.21816 0.71119 0.21885 0.56872 0.30523 0.30629 0.78261 0.69853 0.87502 0.58713 0.64359 0.5324 0.00874 0.29023 0.3016 0.74717 0.09239 0.84329 0.68983 0.22937 0.578 0.06038 0.27568 0.99807 0.31531 0.0456 0.37307 0.93493 0.96396 0.42549 0.33214 0.10453 0.53298 0.87335 0.54806 0.40207 0.51433 0.63637 0.79096 0.10584 0.04398 0.43542 0.15707 0. [...]
+0.38447 0.04461 0.57563 0.6815 0.07154 0.46068 0.28534 0.35893 0.87873 0.00895 0.28898 0.85525 0.01003 0.11461 0.41554 0.13646 0.15746 0.55751 0.22198 0.13765 0.97001 0.11745 0.04851 0.8213 0.70642 0.56496 0.73133 0.89329 0.53161 0.29619 0.47453 0.70976 0.5687 0.68123 0.78335 0.84593 0.47878 0.42958 0.10051 0.79239 0.81881 0.11619 0.53635 0.36553 0.06366 0.99922 0.49184 0.70711 0.97914 0.28605 0.76982 0.61555 0.58344 0.33096 0.12641 0.03729 0.03494 0.10471 0.92893 0.63449 0.44767 0.70859 [...]
+0.7769 0.65428 0.87001 0.5285 0.4725 0.94033 0.09348 0.44974 0.4062 0.9204 0.65513 0.41426 0.47622 0.7337 0.58257 0.42448 0.10931 0.36971 0.2189 0.44146 0.14732 0.87167 0.94647 0.22149 0.34249 0.11677 0.324 0.65967 0.27368 0.59844 0.74454 0.12528 0.45052 0.71966 0.66126 0.3564 0.45245 0.84798 0.52407 0.18723 0.91062 0.62051 0.81892 0.86302 0.66525 0.33496 0.47129 0.53025 0.65318 0.28482 0.37278 0.79344 0.0826 0.65415 0.59604 0.75822 0.55818 0.3772 0.41775 0.72829 0.47302 0.56896 0.69502  [...]
+0.67633 0.60695 0.758 0.252 0.2324 0.20386 0.48789 0.58089 0.19324 0.46374 0.34164 0.56719 0.72588 0.42812 0.47386 0.26299 0.58536 0.85629 0.68327 0.74496 0.57714 0.32273 0.88433 0.24659 0.84884 0.08991 0.88625 0.97017 0.76228 0.0248 0.60639 0.50795 0.77639 0.12267 0.1679 0.57443 0.18936 0.37654 0.57344 0.34827 0.03592 0.332 0.80312 0.27103 0.6544 0.30661 0.38344 0.20928 0.58158 0.4674 0.44473 0.24221 0.38554 0.56973 0.16819 0.20726 0.52844 0.71871 0.26393 0.71623 0.23292 0.85654 0.12822 [...]
+0.54782 0.37863 0.96947 0.41099 0.27126 0.19988 0.00313 0.47982 0.45189 0.17153 0.40591 0.98306 0.92113 0.90991 0.64587 0.86375 0.29243 0.47242 0.86315 0.51214 0.39579 0.44649 0.02843 0.89609 0.3482 0.88638 0.13042 0.7158 0.50537 0.74382 0.75384 0.89531 0.15365 0.02739 0.05558 0.52413 0.81527 0.13424 0.00618 0.12686 0.79049 0.60673 0.52149 0.72034 0.74513 0.71746 0.90127 0.6771 0.39769 0.96442 0.76989 0.76505 0.70052 0.00977 0.40546 0.59442 0.8943 0.31007 0.16091 0.32993 0.99057 0.01424  [...]
+0.95551 0.14463 0.08389 0.48353 0.25105 0.18271 0.06292 0.17857 0.72246 0.28347 0.36953 0.98158 0.5662 0.74293 0.30199 0.62582 0.51437 0.69552 0.39799 0.90978 0.8442 0.02011 0.50174 0.80953 0.63418 0.95774 0.40123 0.39765 0.96799 0.89723 0.44 0.7572 0.23479 0.95143 0.95708 0.94018 0.09771 0.32854 0.41623 0.52927 0.17145 0.74527 0.19697 0.39067 0.41049 0.41071 0.64678 0.20431 0.11484 0.16762 0.16397 0.08727 0.79806 0.87532 0.96086 0.7409 0.65172 0.85606 0.7353 0.19784 0.72059 0.96942 0.39 [...]
+0.11205 0.69704 0.3939 0.09866 0.62603 0.34208 0.20776 0.50517 0.3348 0.48486 0.01282 0.86504 0.66353 0.55824 0.07594 0.83328 0.94527 0.40308 0.22186 0.72372 0.79418 0.29755 0.26286 0.23591 0.08097 0.34881 0.58813 0.09361 0.50966 0.60199 0.35086 0.40737 0.48223 0.73699 0.5215 0.04162 0.20316 0.03769 0.20808 0.8737 0.13896 0.70458 0.85516 0.49679 0.49631 0.67337 0.39054 0.76402 0.74971 0.15374 0.3628 0.31814 0.3997 0.10127 0.56063 0.56112 0.74393 0.77241 0.87057 0.54752 0.6992 0.8272 0.29 [...]
+0.72408 0.00842 0.58754 0.99098 0.63913 0.04404 0.43802 0.79566 0.85773 0.27095 0.66901 0.29517 0.68452 0.42357 0.97209 0.73418 0.34962 0.39577 0.44717 0.17123 0.49972 0.07075 0.86476 0.61891 0.68526 0.37102 0.45016 0.88005 0.59428 0.078 0.64002 0.69086 0.42722 0.00162 0.05164 0.5372 0.78879 0.37652 0.97154 0.27427 0.85031 0.42071 0.06875 0.03509 0.4432 0.69107 0.0449 0.07853 0.0846 0.02866 0.84763 0.08178 0.54183 0.03608 0.03739 0.46588 0.96305 0.50562 0.7537 0.85853 0.51844 0.12767 0.8 [...]
+0.47561 0.81475 0.59939 0.40976 0.53367 0.45829 0.00187 0.59376 0.03631 0.51995 0.6931 0.61427 0.92909 0.10766 0.67401 0.53026 0.9331 0.36185 0.50309 0.86555 0.71893 0.39948 0.22609 0.1819 0.59616 0.75203 0.28739 0.95498 0.86491 0.31969 0.63538 0.22629 0.33399 0.66527 0.96613 0.93326 0.9854 0.2383 0.72472 0.97928 0.27319 0.15071 0.29448 0.64999 0.27462 0.18758 0.98386 0.03337 0.31127 0.38113 0.45947 0.58634 0.60898 0.86027 0.83228 0.87195 0.779 0.99818 0.75743 0.24788 0.74329 0.06485 0.5 [...]
+0.84531 0.14813 0.72402 0.9851 0.91172 0.89936 0.57017 0.91661 0.15494 0.01252 0.32382 0.08721 0.52836 0.30263 0.95749 0.08181 0.5816 0.1226 0.99525 0.80226 0.74407 0.757 0.82118 0.31074 0.98151 0.84951 0.52303 0.54393 0.04618 0.66839 0.89479 0.24521 0.29073 0.05786 0.97225 0.37241 0.81005 0.57851 0.21029 0.63407 0.55837 0.15573 0.56172 0.05207 0.40628 0.4819 0.45199 0.18233 0.25567 0.47437 0.70149 0.61886 0.6715 0.64021 0.97332 0.13391 0.90734 0.26076 0.29623 0.06851 0.61676 0.78467 0.1 [...]
+0.70594 0.98748 0.7014 0.353 0.2828 0.37364 0.66784 0.56159 0.11233 0.74964 0.65522 0.58935 0.8909 0.08496 0.57413 0.53226 0.79513 0.92944 0.48579 0.74265 0.68181 0.57873 0.04799 0.73799 0.44616 0.41766 0.68348 0.92536 0.80729 0.05888 0.47003 0.73993 0.27099 0.27357 0.65628 0.77811 0.14023 0.78318 0.10903 0.85557 0.52581 0.08852 0.26556 0.97862 0.77958 0.13815 0.19232 0.36409 0.71046 0.42392 0.41562 0.48323 0.14519 0.05485 0.24981 0.41047 0.43569 0.97047 0.65171 0.57975 0.96806 0.41504 0 [...]
+0.13109 0.24171 0.8418 0.71063 0.85233 0.32628 0.0447 0.72982 0.03148 0.93169 0.01098 0.70945 0.17337 0.45125 0.05383 0.71916 0.8149 0.56807 0.61206 0.46278 0.2955 0.85657 0.1501 0.57525 0.508 0.05399 0.97283 0.85797 0.45806 0.44855 0.62089 0.93148 0.62971 0.72201 0.23674 0.52188 0.54909 0.75159 0.39236 0.58207 0.13462 0.18891 0.12825 0.99478 0.01794 0.44419 0.9507 0.07425 0.87869 0.28782 0.49886 0.84903 0.36462 0.8941 0.44722 0.60596 0.40191 0.7246 0.73626 0.22886 0.81976 0.15369 0.4614 [...]
+0.90408 0.94735 0.25979 0.23017 0.72598 0.15034 0.34267 0.19001 0.7663 0.84054 0.06381 0.15162 0.31419 0.59154 0.32421 0.22946 0.99454 0.5624 0.07869 0.37017 0.05592 0.39271 0.40781 0.76107 0.19851 0.18643 0.23133 0.48875 0.24768 0.11895 0.96045 0.91132 0.37685 0.57218 0.1989 0.71068 0.35204 0.96608 0.6496 0.80892 0.66834 0.23162 0.41734 0.42014 0.71859 0.87276 0.95983 0.39535 0.07541 0.65469 0.40123 0.43989 0.89011 0.32363 0.94951 0.58517 0.53668 0.80636 0.93949 0.00186 0.75483 0.37205  [...]
+0.19343 0.11507 0.75236 0.54146 0.26175 0.60552 0.94853 0.59274 0.64811 0.33848 0.28404 0.06372 0.749 0.06577 0.38456 0.67218 0.4117 0.01952 0.4924 0.94493 0.83522 0.54916 0.20986 0.22161 0.15733 0.26846 0.56195 0.81546 0.75174 0.81219 0.56275 0.61478 0.72518 0.55344 0.28011 0.13188 0.71156 0.62742 0.71334 0.79977 0.78949 0.36666 0.99614 0.88414 0.35267 0.51755 0.56209 0.97438 0.72492 0.44753 0.70913 0.02223 0.13156 0.22607 0.94555 0.62872 0.67155 0.99617 0.51636 0.21143 0.05454 0.19019  [...]
+0.28923 0.13417 0.94705 0.49322 0.80525 0.55201 0.98379 0.29262 0.9209 0.55757 0.35743 0.84978 0.2413 0.88038 0.26479 0.07958 0.7913 0.36276 0.21404 0.75109 0.79178 0.61843 0.74431 0.06409 0.53631 0.80035 0.26534 0.62427 0.87565 0.19916 0.81623 0.58029 0.44654 0.38601 0.21194 0.03431 0.41925 0.39867 0.23294 0.80409 0.68803 0.77151 0.90273 0.06359 0.51655 0.3922 0.13264 0.1248 0.5038 0.84227 0.82467 0.6993 0.58003 0.38971 0.23052 0.10422 0.81615 0.99816 0.57674 0.19883 0.45022 0.92797 0.6 [...]
+0.50237 0.97057 0.83622 0.72816 0.21461 0.46889 0.86907 0.18997 0.24717 0.16435 0.98402 0.73229 0.20598 0.13019 0.55766 0.99052 0.6212 0.15066 0.67411 0.78391 0.03525 0.59868 0.70517 0.8407 0.51866 0.15773 0.81375 0.47764 0.96408 0.43914 0.77847 0.12765 0.17063 0.77496 0.35346 0.86559 0.31004 0.00193 0.43825 0.19229 0.90903 0.31265 0.69981 0.32282 0.82694 0.62205 0.87238 0.55319 0.7733 0.49794 0.81566 0.87341 0.32658 0.87867 0.58077 0.20415 0.6622 0.46583 0.8854 0.74915 0.60116 0.97535 0 [...]
+0.13221 0.47869 0.04649 0.46593 0.83655 0.97689 0.23972 0.78204 0.84544 0.52046 0.0244 0.09508 0.66584 0.00415 0.3952 0.3172 0.39059 0.10292 0.34128 0.85601 0.82904 0.67228 0.19301 0.48457 0.82825 0.69712 0.21563 0.93064 0.77792 0.95627 0.38238 0.9533 0.89433 0.77003 0.50692 0.39276 0.49151 0.46053 0.06655 0.01135 0.37756 0.94418 0.59299 0.81152 0.0004 0.43449 0.97989 0.72971 0.65806 0.90323 0.72154 0.84646 0.27579 0.41837 0.43766 0.46843 0.85888 0.25479 0.89214 0.07587 0.34055 0.80735 0 [...]
+0.44921 0.41299 0.68096 0.9312 0.66025 0.59589 0.35063 0.17353 0.164 0.41184 0.13826 0.08405 0.56205 0.51066 0.01097 0.44405 0.82655 0.70464 0.17581 0.5032 0.88417 0.73979 0.90377 0.3958 0.70197 0.66794 0.97167 0.42553 0.54871 0.49774 0.65872 0.59415 0.76623 0.78201 0.79854 0.32544 0.18016 0.29209 0.06306 0.69393 0.65596 0.24304 0.04499 0.97113 0.97963 0.53427 0.08542 0.2349 0.10337 0.08685 0.34998 0.53712 0.88572 0.7408 0.82487 0.82249 0.12116 0.08547 0.93146 0.55487 0.05124 0.95298 0.0 [...]
+0.85818 0.91355 0.24486 0.00697 0.71191 0.24042 0.65888 0.92564 0.18337 0.05979 0.48307 0.30672 0.81027 0.58347 0.44786 0.4994 0.90754 0.04621 0.17685 0.02013 0.93041 0.45755 0.195 0.43178 0.09851 0.11651 0.11975 0.30465 0.14602 0.39692 0.40986 0.13334 0.65007 0.86955 0.72701 0.747 0.87715 0.54954 0.42867 0.27314 0.68343 0.79282 0.57064 0.62029 0.45521 0.4982 0.01995 0.2812 0.47267 0.65782 0.45159 0.57965 0.12457 0.09234 0.12605 0.56727 0.10804 0.48418 0.0299 0.30106 0.24151 0.89371 0.06 [...]
+0.13898 0.4762 0.70329 0.27539 0.17783 0.30413 0.76849 0.46146 0.81819 0.3781 0.69124 0.92762 0.88611 0.27294 0.0417 0.60638 0.37064 0.85897 0.1642 0.23796 0.53043 0.49607 0.26651 0.01634 0.58107 0.99837 0.11705 0.61052 0.28992 0.91892 0.5217 0.47479 0.51464 0.58324 0.17758 0.84891 0.01813 0.39428 0.4971 0.14764 0.25866 0.5307 0.93161 0.57235 0.51238 0.83614 0.95896 0.59304 0.23011 0.32484 0.532 0.4729 0.6827 0.99426 0.25083 0.11762 0.58361 0.56586 0.74965 0.03122 0.73307 0.89673 0.58629 [...]
+0.09938 0.04331 0.13671 0.08283 0.94838 0.56728 0.93656 0.456 0.62076 0.28119 0.3521 0.61063 0.39887 0.22669 0.63306 0.25003 0.47148 0.42412 0.46083 0.93008 0.43639 0.71353 0.99942 0.04045 0.44526 0.34808 0.7876 0.66706 0.29019 0.73846 0.10738 0.14469 0.27653 0.23719 0.08538 0.08865 0.47967 0.17135 0.70265 0.92293 0.8926 0.84072 0.54123 0.2734 0.49293 0.60298 0.03718 0.93433 0.88384 0.04339 0.04606 0.73827 0.19325 0.49889 0.27758 0.92404 0.66354 0.18765 0.1458 0.88064 0.89323 0.94436 0.1 [...]
+0.78953 0.96878 0.10054 0.76836 0.65568 0.34923 0.07833 0.28653 0.27442 0.72255 0.03263 0.16363 0.88547 0.866 0.29338 0.30965 0.92613 0.38707 0.9432 0.62755 0.35496 0.90667 0.52989 0.9972 0.06545 0.49742 0.4597 0.41834 0.28641 0.94629 0.39852 0.82302 0.92914 0.59679 0.52545 0.86097 0.90757 0.26508 0.56022 0.52621 0.10097 0.78596 0.5615 0.22565 0.50117 0.87361 0.07157 0.39318 0.78888 0.25898 0.42639 0.7719 0.58448 0.80962 0.46499 0.30329 0.04523 0.83557 0.44453 0.55969 0.07628 0.09586 0.3 [...]
+0.9802 0.75907 0.60934 0.12148 0.87097 0.25465 0.97092 0.15712 0.87644 0.33483 0.69757 0.53331 0.89834 0.47069 0.59993 0.92107 0.03457 0.60553 0.63315 0.99302 0.30377 0.06045 0.0099 0.64759 0.86694 0.6283 0.32454 0.44653 0.48998 0.93303 0.80653 0.1365 0.96171 0.93429 0.63111 0.2434 0.32081 0.45149 0.84704 0.83339 0.72103 0.79528 0.59509 0.97112 0.89304 0.46394 0.21173 0.8323 0.57357 0.59513 0.7434 0.69322 0.45923 0.12104 0.44619 0.08899 0.53836 0.79296 0.17332 0.20027 0.63449 0.58821 0.3 [...]
+0.35937 0.82095 0.23867 0.2155 0.48287 0.67526 0.94433 0.39311 0.86731 0.98254 0.928 0.38043 0.18434 0.48995 0.90736 0.20772 0.26584 0.07463 0.33102 0.52035 0.91578 0.05703 0.59957 0.8145 0.61723 0.23555 0.92024 0.8551 0.28363 0.08343 0.82773 0.47084 0.01758 0.26303 0.13927 0.94396 0.54331 0.16036 0.29828 0.43268 0.09251 0.24379 0.27388 0.08118 0.80103 0.13884 0.40182 0.05126 0.69603 0.5464 0.69913 0.27084 0.97968 0.5942 0.03629 0.70151 0.61578 0.52615 0.48828 0.85458 0.02108 0.72312 0.6 [...]
+0.45107 0.37422 0.85162 0.94603 0.01423 0.0486 0.82239 0.23567 0.67649 0.43077 0.14135 0.2727 0.32831 0.82542 0.45067 0.03875 0.13793 0.45849 0.83592 0.2747 0.96388 0.30399 0.16858 0.61034 0.98939 0.4641 0.66152 0.37128 0.69972 0.6578 0.48225 0.22387 0.25349 0.64401 0.09233 0.21544 0.58192 0.78752 0.58504 0.33765 0.78262 0.75454 0.72312 0.42522 0.37889 0.2833 0.49511 0.05311 0.38596 0.10623 0.16043 0.13632 0.46729 0.3938 0.37196 0.96238 0.50205 0.10946 0.66887 0.85723 0.34811 0.74782 0.2 [...]
+0.22753 0.30927 0.72002 0.18111 0.25893 0.06681 0.6057 0.97735 0.56989 0.64938 0.52961 0.00158 0.88747 0.40662 0.74737 0.78671 0.75472 0.52698 0.1349 0.2961 0.5656 0.23536 0.7811 0.84184 0.45331 0.79942 0.027 0.4551 0.83896 0.01716 0.27088 0.38466 0.34407 0.9848 0.28269 0.3593 0.82811 0.8684 0.47308 0.56901 0.26716 0.64114 0.55078 0.28636 0.06233 0.5227 0.68341 0.63069 0.59591 0.35195 0.90501 0.60157 0.43902 0.88023 0.25196 0.11661 0.89307 0.84179 0.03459 0.4553 0.51955 0.9862 0.70593 0. [...]
+0.16455 0.13914 0.93726 0.61501 0.78457 0.37862 0.80209 0.36826 0.76944 0.17216 0.39836 0.69276 0.03727 0.95002 0.00817 0.154 0.93773 0.36711 0.66356 0.79411 0.00669 0.99637 0.3326 0.95483 0.22471 0.20905 0.06835 0.69468 0.57167 0.01389 0.80743 0.02761 0.46306 0.01007 0.08576 0.55575 0.22724 0.38535 0.5056 0.63544 0.40324 0.15704 0.64573 0.1762 0.43247 0.64723 0.90157 0.63937 0.01399 0.34953 0.63822 0.29528 0.95697 0.4463 0.7985 0.02696 0.89819 0.03214 0.98505 0.17841 0.44117 0.74274 0.0 [...]
+0.80231 0.33266 0.42352 0.1993 0.39041 0.61117 0.04663 0.0034 0.19189 0.79147 0.5849 0.12208 0.65038 0.22086 0.9445 0.88426 0.19022 0.01934 0.98354 0.37355 0.23317 0.93024 0.21423 0.68365 0.03314 0.81762 0.393 0.13396 0.79785 0.56047 0.93486 0.8421 0.98656 0.607 0.82159 0.31194 0.79248 0.70794 0.4387 0.80079 0.56209 0.51291 0.73973 0.26454 0.49175 0.67505 0.63748 0.91263 0.44192 0.48837 0.02379 0.98907 0.57537 0.67824 0.90072 0.72308 0.60297 0.13359 0.28075 0.15884 0.79895 0.4582 0.76863 [...]
+0.57222 0.64985 0.5553 0.07621 0.86931 0.40565 0.53912 0.47431 0.69981 0.25851 0.66356 0.10022 0.63437 0.78448 0.28171 0.30128 0.3276 0.09874 0.49529 0.30594 0.35298 0.72413 0.69803 0.69392 0.22787 0.68322 0.53438 0.79882 0.7798 0.89622 0.21871 0.38356 0.16681 0.16301 0.73535 0.07873 0.00637 0.74807 0.6195 0.60849 0.8467 0.35756 0.52218 0.99964 0.05582 0.70399 0.3854 0.14025 0.54215 0.86432 0.69706 0.07928 0.82414 0.4201 0.13157 0.31515 0.57036 0.71641 0.12247 0.47005 0.88267 0.06646 0.9 [...]
+0.05909 0.92298 0.99644 0.47019 0.02277 0.77271 0.96742 0.19952 0.96873 0.23675 0.79276 0.63555 0.32784 0.52299 0.26147 0.01067 0.95361 0.19933 0.21388 0.21178 0.46453 0.32513 0.40001 0.90128 0.70941 0.69263 0.25816 0.78597 0.84632 0.73335 0.68701 0.0706 0.02497 0.82233 0.84007 0.51902 0.58119 0.06129 0.12866 0.79901 0.87025 0.87059 0.68361 0.65766 0.3344 0.5519 0.40646 0.56813 0.08478 0.7932 0.20472 0.84807 0.81121 0.8325 0.56915 0.30891 0.94129 0.09828 0.49074 0.23303 0.41344 0.88713 0 [...]
+0.29882 0.28082 0.2515 0.50191 0.75215 0.81256 0.18684 0.80581 0.60954 0.63011 0.08144 0.07852 0.91552 0.34086 0.28621 0.79645 0.18316 0.96463 0.88368 0.22831 0.30739 0.10257 0.51085 0.5516 0.30489 0.75654 0.23608 0.88917 0.44606 0.61819 0.12041 0.61848 0.78699 0.61511 0.23286 0.30405 0.21131 0.98045 0.29096 0.85547 0.29621 0.95599 0.28021 0.88505 0.94761 0.438 0.9718 0.88933 0.39188 0.56322 0.95327 0.36599 0.09277 0.8267 0.14 0.67293 0.36188 0.23141 0.5347 0.89709 0.24648 0.66866 0.2802 [...]
+0.36743 0.23491 0.27588 0.06522 0.31659 0.52873 0.89679 0.47649 0.70833 0.69996 0.34768 0.89286 0.6168 0.89885 0.5435 0.83187 0.75947 0.8842 0.53971 0.37273 0.9052 0.12038 0.57537 0.31373 0.30078 0.37884 0.88403 0.32539 0.53842 0.54682 0.72263 0.31273 0.62383 0.91819 0.94111 0.80881 0.75753 0.29669 0.03746 0.59367 0.63124 0.71378 0.07345 0.86808 0.92203 0.42327 0.00415 0.41999 0.24673 0.69435 0.98909 0.83066 0.52265 0.24936 0.08477 0.7296 0.77364 0.5908 0.09435 0.87163 0.99774 0.56736 0. [...]
+0.77065 0.29215 0.51682 0.77697 0.23852 0.71182 0.95313 0.24625 0.13788 0.47004 0.79361 0.34154 0.19323 0.02209 0.07453 0.02878 0.18993 0.40102 0.17799 0.98026 0.22572 0.88171 0.12129 0.23981 0.22303 0.83454 0.26236 0.80166 0.17038 0.12834 0.88731 0.96285 0.03504 0.04858 0.15659 0.1443 0.55575 0.54915 0.09299 0.22967 0.41969 0.36173 0.88838 0.0243 0.05208 0.98375 0.92031 0.2359 0.81716 0.43328 0.22043 0.93177 0.56608 0.91396 0.83306 0.34746 0.73472 0.19903 0.09893 0.09761 0.67644 0.03823 [...]
+0.85247 0.39993 0.08587 0.11011 0.78957 0.17764 0.47381 0.08547 0.39137 0.06225 0.02614 0.66562 0.1821 0.59075 0.20346 0.67429 0.17503 0.8012 0.9575 0.64164 0.86347 0.14704 0.83599 0.597 0.47412 0.68779 0.82326 0.89791 0.25177 0.0893 0.73183 0.06231 0.17206 0.81839 0.35116 0.38939 0.98799 0.16071 0.43223 0.16864 0.5018 0.69839 0.35006 0.96467 0.05839 0.7795 0.64709 0.71384 0.80545 0.98828 0.39932 0.80045 0.87286 0.19005 0.60902 0.51379 0.07336 0.95982 0.14725 0.01177 0.14051 0.81036 0.07 [...]
+0.29447 0.10687 0.8033 0.7139 0.55802 0.81074 0.53638 0.25479 0.28319 0.00109 0.17482 0.53883 0.01894 0.57015 0.49256 0.12193 0.65095 0.4321 0.86648 0.63682 0.66089 0.25474 0.68093 0.01678 0.06439 0.367 0.93526 0.59259 0.52156 0.4441 0.7547 0.06522 0.26495 0.00193 0.65207 0.48414 0.70681 0.43012 0.97886 0.66079 0.59677 0.52633 0.57509 0.51549 0.81153 0.08326 0.68546 0.91235 0.60499 0.72308 0.02113 0.53336 0.83846 0.3575 0.15367 0.14375 0.03354 0.51545 0.0172 0.21927 0.95198 0.37653 0.953 [...]
+0.85527 0.68363 0.83622 0.45834 0.52267 0.32842 0.83482 0.89959 0.35058 0.77836 0.26559 0.78942 0.61545 0.50552 0.97555 0.55957 0.82991 0.3737 0.45114 0.39702 0.69269 0.9167 0.04166 0.32619 0.5951 0.55887 0.81613 0.60231 0.48961 0.64973 0.50533 0.63911 0.35681 0.06314 0.79935 0.49817 0.95286 0.73396 0.40731 0.06511 0.10454 0.22129 0.52502 0.85954 0.72819 0.91589 0.6036 0.48526 0.65877 0.64827 0.89765 0.07198 0.73215 0.82426 0.64851 0.19597 0.01554 0.97173 0.86831 0.2909 0.56864 0.27274 0 [...]
+0.88341 0.97691 0.53798 0.74819 0.92734 0.2334 0.82137 0.12353 0.49707 0.38899 0.37165 0.98397 0.24659 0.2698 0.61934 0.02081 0.90319 0.5206 0.31213 0.51584 0.76026 0.60626 0.52574 0.07518 0.22341 0.09962 0.73384 0.31397 0.09269 0.09795 0.14387 0.57664 0.14358 0.33054 0.27876 0.19459 0.55293 0.2669 0.47175 0.50307 0.14546 0.61399 0.82325 0.9161 0.81847 0.92341 0.73436 0.69539 0.47954 0.46638 0.93324 0.67514 0.27346 0.54512 0.87675 0.65993 0.206 0.98989 0.93032 0.07331 0.91123 0.28115 0.5 [...]
+0.39189 0.79423 0.53633 0.55122 0.74232 1.0 0.39086 0.19721 0.54476 0.11389 0.395 0.20543 0.38171 0.32012 0.13556 0.52898 0.7675 0.99009 0.4458 0.43713 0.89322 0.01661 0.17659 0.96209 0.7404 0.41854 0.44832 0.2995 0.45638 0.37304 0.76277 0.70924 0.79967 0.41662 0.85042 0.15297 0.54416 0.10845 0.84259 0.17049 0.44119 0.06526 0.65741 0.36993 0.36095 0.3544 0.8381 0.72767 0.22883 0.324 0.78817 0.38681 0.23149 0.66738 0.65202 0.96415 0.06345 0.78378 0.24928 0.37638 0.07133 0.99641 0.1144 0.0 [...]
+0.18077 0.67618 0.25048 0.11962 0.40212 0.18574 0.42858 0.67848 0.80247 0.23468 0.81453 0.48484 0.20506 0.22921 0.83692 0.18433 0.26287 0.07825 0.49149 0.8332 0.74171 0.48376 0.95023 0.49047 0.1002 0.62827 0.70605 0.90054 0.0063 0.068 0.84663 0.99203 0.32313 0.02649 0.09294 0.33036 0.06563 0.93011 0.84224 0.4791 0.19324 0.25013 0.57382 0.99102 0.33834 0.33218 0.24391 0.932 0.33888 0.80916 0.51955 0.08585 0.64584 0.65178 0.75954 0.31977 0.20901 0.70532 0.14549 0.45388 0.94849 0.35398 0.15 [...]
+0.06046 0.83565 0.5784 0.57496 0.77613 0.38931 0.38392 0.69073 0.24852 0.50116 0.13402 0.71796 0.26365 0.60988 0.57199 0.25326 0.46289 0.48754 0.20922 0.80156 0.54812 0.07135 0.53679 0.81102 0.87548 0.61526 0.01493 0.7745 0.49634 0.74169 0.69359 0.39562 0.49031 0.5692 0.6476 0.12218 0.75096 0.24409 0.6611 0.78232 0.9262 0.58725 0.50809 0.95557 0.3707 0.84218 0.28875 0.33479 0.4047 0.96066 0.31657 0.00352 0.58357 0.15016 0.34001 0.27801 0.31251 0.14677 0.11203 0.65322 0.68298 0.45168 0.56 [...]
+0.33019 0.96793 0.39561 0.85826 0.87971 0.58088 0.07296 0.51077 0.11333 0.59576 0.69647 0.72008 0.41352 0.09995 0.64638 0.78148 0.07425 0.42138 0.00389 0.0051 0.3216 0.48421 0.84373 0.53215 0.61646 0.79012 0.32915 0.7892 0.57801 0.11124 0.47702 0.16716 0.3749 0.4281 0.92794 0.4317 0.91214 0.42737 0.01662 0.93732 0.69544 0.42806 0.28368 0.43102 0.32727 0.55014 0.15735 0.74421 0.86042 0.28636 0.27513 0.27002 0.67808 0.88563 0.83912 0.19395 0.65898 0.66766 0.86303 0.31497 0.54311 0.20823 0. [...]
+0.47685 0.53651 0.77995 0.73161 0.48465 0.09072 0.01028 0.17993 0.83488 0.54397 0.60325 0.27957 0.65372 0.42757 0.29832 0.35397 0.82205 0.4004 0.4006 0.67963 0.08632 0.21269 0.67726 0.27965 0.94572 0.33838 0.02877 0.528 0.99251 0.99605 0.97844 0.2016 0.88789 0.29438 0.78058 0.5795 0.96185 0.71553 0.4585 0.88239 0.32858 0.53824 0.98827 0.02489 0.32266 0.45022 0.00141 0.10778 0.18086 0.91547 0.93402 0.84653 0.75146 0.46855 0.57801 0.56421 0.14486 0.44135 0.8344 0.08341 0.67494 0.52915 0.29 [...]
+0.78982 0.77592 0.22288 0.37591 0.91693 0.93497 0.78745 0.9191 0.14522 0.30735 0.0716 0.34606 0.16174 0.38277 0.38385 0.30988 0.23873 0.5734 0.26563 0.52309 0.57171 0.52037 0.65011 0.72744 0.29525 0.90302 0.85498 0.60532 0.18804 0.09967 0.39626 0.67719 0.90981 0.47711 0.00442 0.67895 0.59505 0.09161 0.08312 0.97242 0.98305 0.16989 0.29842 0.34641 0.17327 0.44138 0.43888 0.54924 0.73175 0.0164 0.96359 0.18561 0.09042 0.97025 0.781 0.54114 0.55083 0.54103 0.64508 0.32438 0.94599 0.60314 0. [...]
+0.50267 0.11516 0.63462 0.26274 0.28335 0.64738 0.3458 0.46472 0.25867 0.65281 0.2356 0.86393 0.09356 0.01885 0.59327 0.78183 0.53291 0.21987 0.01918 0.0591 0.31711 0.76284 0.94433 0.44887 0.57929 0.06095 0.72127 0.67256 0.21982 0.27948 0.99107 0.78603 0.97996 0.02958 0.02219 0.61504 0.48472 0.35684 0.04692 0.50726 0.6413 0.98291 0.24109 0.73012 0.54716 0.22941 0.11828 0.04121 0.06891 0.97678 0.7142 0.36992 0.55682 0.12713 0.217 0.22542 0.84806 0.79825 0.55978 0.16036 0.21836 0.59585 0.9 [...]
+0.93234 0.56046 0.29827 0.07629 0.82111 0.41397 0.30507 0.74596 0.56815 0.61923 0.50208 0.67632 0.41951 0.63185 0.11165 0.40827 0.95703 0.72991 0.11666 0.84782 0.61052 0.21532 0.5777 0.92193 0.48088 0.12613 0.0032 0.91237 0.36299 0.29116 0.43587 0.00525 0.68324 0.0824 0.60204 0.53218 0.75352 0.04259 0.04757 0.55899 0.93395 0.93975 0.59406 0.77507 0.74289 0.42613 0.22643 0.90463 0.98263 0.76674 0.49098 0.57944 0.97678 0.2142 0.82604 0.96047 0.45766 0.41249 0.79251 0.814 0.13869 0.22931 0. [...]
+0.71045 0.74224 0.11964 0.33282 0.81444 0.15433 0.72834 0.57724 0.42442 0.86504 0.48232 0.96943 0.49778 0.2535 0.0525 0.73234 0.29939 0.87076 0.89337 0.18806 0.07751 0.09859 0.98127 0.88978 0.25033 0.33853 0.67403 0.36229 0.21646 0.49796 0.12125 0.31255 0.4184 0.62271 0.26406 0.58613 0.75744 0.75722 0.59468 0.69256 0.96122 0.33698 0.17746 0.7448 0.09748 0.68567 0.72728 0.98809 0.77559 0.09929 0.65307 0.55962 0.62248 0.19073 0.90027 0.9929 0.80577 0.22987 0.83828 0.70243 0.2797 0.00725 0. [...]
+0.77204 0.11219 0.08496 0.30403 0.35222 0.93337 0.14226 0.91464 0.76833 0.0963 0.4306 0.86996 0.20496 0.45283 0.10931 0.83058 0.81692 0.34062 0.87617 0.21507 0.08896 0.94545 0.81821 0.06153 0.25787 0.97192 0.91253 0.46595 0.95396 0.43223 0.21877 0.0591 0.98641 0.0094 0.44854 0.58337 0.53095 0.51119 0.25457 0.52406 0.13937 0.56967 0.02851 0.01063 0.6878 0.51171 0.71871 0.70549 0.84779 0.06117 0.58585 0.97293 0.41024 0.02654 0.63403 0.28541 0.38264 0.41271 0.55031 0.71487 0.62891 0.34715 0 [...]
+0.48953 0.42494 0.37551 0.48954 0.97816 0.17278 0.03799 0.75578 0.02207 0.55877 0.9901 0.07241 0.30112 0.5194 0.73712 0.72518 0.56031 0.68288 0.4429 0.47466 0.42692 0.90692 0.74568 0.85991 0.24744 0.10761 0.71859 0.73198 0.6265 0.37422 0.84714 0.40784 0.65818 0.60676 0.11663 0.61912 0.3875 0.1675 0.84524 0.72826 0.94145 0.42317 0.56742 0.24492 0.17233 0.04788 0.61999 0.55185 0.17494 0.75649 0.02627 0.45929 0.8025 0.61647 0.28799 0.10681 0.76999 0.65889 0.25374 0.69015 0.47392 0.97975 0.2 [...]
+0.49428 0.61183 0.8361 0.74039 0.37757 0.18979 0.87031 0.81234 0.83395 0.68638 0.22839 0.48297 0.65313 0.7282 0.43743 0.01166 0.24023 0.84411 0.36398 0.6737 0.25049 0.08565 0.73233 0.74329 0.54892 0.6408 0.19309 0.82271 0.71353 0.77737 0.63841 0.60841 0.16017 0.40112 0.56534 0.75722 0.61817 0.57342 0.49263 0.74841 0.51557 0.19601 0.51419 0.5678 0.32422 0.23382 0.66984 0.58367 0.5827 0.11236 0.62761 0.21811 0.10348 0.7996 0.43036 0.33456 0.85555 0.52346 0.67271 0.39639 0.33925 0.458 0.789 [...]
+0.71405 0.58063 0.53468 0.6698 0.37157 0.05767 0.34282 0.09191 0.10978 0.05493 0.85651 0.72996 0.84414 0.04039 0.60757 0.47516 0.92416 0.32836 0.96117 0.78342 0.94656 0.3789 0.3743 0.03799 0.56045 0.05066 0.211 0.27812 0.82813 0.19491 0.30676 0.8248 0.21522 0.53167 0.89055 0.16359 0.22356 0.50237 0.95198 0.10231 0.09165 0.70751 0.628 0.101 0.21279 0.12187 0.92376 0.25404 0.425 0.36033 0.45926 0.6062 0.5045 0.23436 0.39256 0.07602 0.51731 0.4979 0.89709 0.75523 0.58506 0.25428 0.89437 0.4 [...]
+0.78201 0.44744 0.40203 0.85008 0.32297 0.11163 0.29279 0.45887 0.05418 0.00365 0.79349 0.99574 0.02558 0.12856 0.63661 0.06671 0.53839 0.50255 0.21317 0.80613 0.79523 0.04163 0.99633 0.47368 0.00434 0.01416 0.28799 0.45682 0.26971 0.54444 0.73748 0.83248 0.52164 0.4494 0.23494 0.30053 0.67069 0.61373 0.87912 0.95573 0.67018 0.50921 0.08128 0.18712 0.82198 0.89859 0.27778 0.37045 0.87548 0.39028 0.52246 0.46179 0.90408 0.37475 0.7951 0.6663 0.31422 0.97106 0.10525 0.82043 0.05247 0.97713 [...]
+0.613 0.06146 0.66437 0.69547 0.60562 0.78187 0.73791 0.07242 0.1456 0.27727 0.09203 0.44284 0.92403 0.35169 0.87222 0.33417 0.04927 0.14538 0.14324 0.23867 0.11645 0.76802 0.91682 0.86659 0.13836 0.12898 0.02081 0.33328 0.61346 0.41595 0.66927 0.04022 0.48171 0.63214 0.6489 0.48471 0.96303 0.82681 0.64386 0.65643 0.38656 0.08971 0.77911 0.35978 0.87889 0.12781 0.07241 0.76969 0.8287 0.94879 0.58789 0.30692 0.03518 0.66458 0.58867 0.82873 0.88521 0.73068 0.40852 0.47284 0.52316 0.65789 0 [...]
+0.66087 0.12827 0.57804 0.55496 0.74865 0.80235 0.67447 0.51949 0.97775 0.20331 0.13835 0.25287 0.27726 0.92811 0.68877 0.38193 0.27261 0.88571 0.03521 0.40895 0.96389 0.1848 0.8985 0.04692 0.39131 0.07748 0.04606 0.99466 0.99133 0.75226 0.53573 0.84732 0.07743 0.63747 0.53472 0.5076 0.21651 0.27044 0.68126 0.66898 0.96857 0.08715 0.17051 0.33039 0.92785 0.923 0.81016 0.53466 0.62635 0.1214 0.0921 0.49949 0.25556 0.59645 0.97835 0.42978 0.85577 0.96531 0.17375 0.14438 0.09297 0.92892 0.4 [...]
+0.47399 0.89933 0.06354 0.16034 0.01921 0.88388 0.32374 0.1372 0.34409 0.40204 0.82051 0.00929 0.55009 0.91981 0.24041 0.69998 0.99363 0.59693 0.54011 0.39482 0.54153 0.28561 0.37192 0.492 0.79495 0.73312 0.09196 0.96524 0.67189 0.44809 0.49616 0.74258 0.32375 0.36508 0.82778 0.69487 0.95468 0.06783 0.32048 0.0461 0.83403 0.853 0.70149 0.17697 0.38476 0.03437 0.41345 0.89399 0.89861 0.40601 0.7989 0.22819 0.08255 0.01511 0.36875 0.78859 0.92531 0.35609 0.27299 0.34815 0.25029 0.23791 0.0 [...]
+0.28861 0.56237 0.49159 0.47151 0.27371 0.06394 0.41349 0.78686 0.08204 0.78941 0.27762 0.04001 0.78183 0.37936 0.1384 0.5263 0.54145 0.1713 0.65913 0.04613 0.93998 0.89785 0.97282 0.2063 0.14149 0.87683 0.83953 0.1401 0.3554 0.58951 0.9257 0.30224 0.51299 0.83817 0.67068 0.22882 0.18449 0.42046 0.86947 0.5721 0.09921 0.33128 0.18554 0.66307 0.38952 0.51239 0.72817 0.25473 0.25296 0.85459 0.816 0.20491 0.95956 0.21254 0.51566 0.14679 0.37928 0.3124 0.12922 0.9369 0.06654 0.69428 0.48134  [...]
+0.00291 0.28813 0.29664 0.78596 0.35243 0.00801 0.93253 0.2203 0.82418 0.13884 0.45381 0.18486 0.71456 0.4165 0.00617 0.38746 0.80519 0.83524 0.83624 0.67335 0.05113 0.7733 0.59778 0.46213 0.56977 0.04797 0.82671 0.5196 0.97286 0.10689 0.32893 0.4107 0.34652 0.44723 0.23516 0.54247 0.52296 0.48312 0.93027 0.22868 0.60948 0.36569 0.26492 0.68251 0.19757 0.54958 0.12941 0.00192 0.65553 0.56039 0.30675 0.11184 0.23277 0.17123 0.83359 0.52367 0.41772 0.46343 0.04825 0.2528 0.97867 0.91089 0. [...]
+0.58413 0.72078 0.07736 0.93448 0.93094 0.11469 0.50638 0.50876 0.2383 0.37442 0.67264 0.64228 0.3714 0.09365 0.25751 0.73164 0.18923 0.42523 0.47384 0.02239 0.81619 0.7223 0.92819 0.73612 0.1991 0.65026 0.06413 0.99027 0.44313 0.68738 0.63402 0.19636 0.9312 0.67711 0.95546 0.60843 0.5495 0.11121 0.2873 0.54433 0.24737 0.83462 0.26455 0.61007 0.38896 0.01274 0.00257 0.02393 0.81999 0.0154 0.22201 0.36539 0.24789 0.43179 0.66859 0.81254 0.39089 0.72184 0.71794 0.49856 0.57078 0.35263 0.39 [...]
+0.09893 0.46155 0.48183 0.8161 0.88769 0.89659 0.14721 0.12322 0.80057 0.49208 0.66704 0.86156 0.91592 0.11256 0.19956 0.01936 0.70593 0.95814 0.7941 0.39013 0.35324 0.0785 0.61746 0.50852 0.23144 0.1628 0.32041 0.14911 0.58218 0.77837 0.65749 0.46959 0.07689 0.31864 0.14069 0.73647 0.83682 0.03206 0.57976 0.96964 0.59407 0.99737 0.79158 0.14273 0.8725 0.97926 0.70122 0.71359 0.30386 0.11989 0.30254 0.68604 0.18393 0.96 0.22694 0.34941 0.24051 0.19002 0.33644 0.65987 0.00872 0.65127 0.22 [...]
+0.69385 0.88642 0.50689 0.55518 0.83594 0.82275 0.42468 0.07599 0.7463 0.57779 0.20635 0.70753 0.43628 0.89661 0.16573 0.22952 0.55778 0.07394 0.0989 0.08005 0.01325 0.17594 0.17569 0.91236 0.24117 0.34875 0.58755 0.83908 0.6199 0.8861 0.43996 0.92386 0.53644 0.13129 0.18904 0.11164 0.67211 0.02147 0.83966 0.59585 0.70461 0.97602 0.64156 0.14546 0.1994 0.0479 0.57554 0.09574 0.91235 0.92463 0.24007 0.94969 0.59818 0.94502 0.4551 0.20717 0.50662 0.72378 0.07067 0.19066 0.07949 0.35069 0.2 [...]
+0.87105 0.29385 0.08846 0.25463 0.11244 0.94616 0.78903 0.09169 0.8851 0.19119 0.13426 0.65376 0.17002 0.88695 0.0266 0.54941 0.12499 0.25301 0.54503 0.24936 0.69296 0.63116 0.20661 0.31351 0.3119 0.47571 0.9221 0.60333 0.8689 0.74496 0.26255 0.84394 0.14555 0.50273 0.89527 0.89373 0.85451 0.48875 0.65893 0.74201 0.18139 0.76359 0.58749 0.34276 0.74202 0.18784 0.41893 0.88978 0.30578 0.98527 0.83119 0.03434 0.49866 0.41221 0.03489 0.00504 0.75201 0.21533 0.23614 0.01621 0.05743 0.42421 0 [...]
+0.50437 0.51118 0.67011 0.865 0.59425 0.31595 0.94152 0.35195 0.72465 0.64132 0.45804 0.74074 0.38596 0.61666 0.18589 0.24595 0.82767 0.78076 0.70544 0.99895 0.3913 0.04717 0.48308 0.867 0.20049 0.72514 0.03628 0.81078 0.94181 0.47223 0.0624 0.10349 0.5346 0.61567 0.63001 0.95832 0.32127 0.04743 0.85213 0.82931 0.4698 0.95511 0.54715 0.07891 0.25009 0.84497 0.13047 0.17525 0.18988 0.57299 0.88663 0.80945 0.79641 0.43726 0.36142 0.17825 0.72977 0.00748 0.44619 0.01312 0.63923 0.88664 0.61 [...]
+0.03755 0.93683 0.4749 0.09025 0.92345 0.80976 0.4485 0.622 0.90372 0.27107 0.74773 0.84554 0.0053 0.40309 0.16514 0.13149 0.9509 0.50857 0.45046 0.51707 0.82638 0.14983 0.41445 0.72429 0.64645 0.14941 0.42407 0.86507 0.75111 0.06525 0.44308 0.46905 0.26671 0.57287 0.92025 0.59835 0.98636 0.13556 0.51829 0.21369 0.2716 0.84193 0.09401 0.97718 0.61477 0.44598 0.24897 0.65188 0.42782 0.30608 0.77202 0.59858 0.67387 0.21129 0.28759 0.05504 0.98051 0.08608 0.35779 0.79413 0.43584 0.56277 0.8 [...]
+0.17912 0.40789 0.19649 0.72009 0.54016 0.51393 0.43525 0.73497 0.71343 0.13395 0.70265 0.8429 0.24501 0.3975 0.95394 0.69446 0.7174 0.88121 0.93622 0.61859 0.93776 0.71552 0.19228 0.8862 0.08765 0.44805 0.21484 0.7453 0.47849 0.1136 0.05129 0.79421 0.29923 0.04172 0.93656 0.58788 0.59084 0.36577 0.25752 0.89036 0.1678 0.54316 0.3582 0.74003 0.00724 0.85319 0.98289 0.35468 0.08935 0.86269 0.62805 0.83678 0.33225 0.39667 0.0843 0.76232 0.20134 0.05494 0.72402 0.11151 0.80007 0.58088 0.517 [...]
+0.67225 0.72913 0.74607 0.24086 0.42296 0.03379 0.59761 0.85822 0.51522 0.96354 0.95252 0.75566 0.86127 0.1833 0.93017 0.4226 0.04641 0.69871 0.20078 0.45587 0.65398 0.45704 0.90183 0.60728 0.39569 0.22768 0.26339 0.05794 0.18849 0.48023 0.57638 0.62642 0.03551 0.91652 0.07502 0.18523 0.79593 0.15214 0.04683 0.42952 0.95628 0.69321 0.77429 0.44162 0.91137 0.45619 0.37411 0.06121 0.41715 0.08747 0.55705 0.30845 0.45311 0.67156 0.3794 0.14906 0.95471 0.19331 0.10817 0.16579 0.20299 0.56549 [...]
+0.88044 0.52159 0.20327 0.46402 0.51311 0.42269 0.57763 0.07256 0.42117 0.67896 0.83544 0.87895 0.75302 0.20967 0.56414 0.56114 0.18429 0.93926 0.58115 0.54024 0.75579 0.81986 0.88597 0.32305 0.66212 0.79967 0.23726 0.69962 0.05839 0.72653 0.94737 0.82189 0.04013 0.02034 0.27328 0.4041 0.24346 0.63838 0.08509 0.19724 0.11548 0.03673 0.33187 0.64271 0.67198 0.74265 0.5777 0.50183 0.05261 0.46653 0.06399 0.54622 0.10849 0.91497 0.54715 0.6998 0.6475 0.9804 0.95592 0.02555 0.76021 0.99218 0 [...]
+0.5497 0.54761 0.69582 0.4531 0.54655 0.50166 0.86462 0.43082 0.9572 0.77783 0.43535 0.92872 0.71091 0.24374 0.49344 0.01903 0.61284 0.98991 0.04613 0.53904 0.18023 0.04783 0.71619 0.70373 0.86908 0.2391 0.69436 0.45402 0.37573 0.77505 0.68759 0.91986 0.5263 0.31954 0.64333 0.5514 0.94418 0.80163 0.234 0.63424 0.76217 0.38852 0.91635 0.73069 0.03386 0.61712 0.95183 0.30605 0.38257 0.02117 0.41619 0.91835 0.82894 0.90408 0.71701 0.64328 0.53063 0.45988 0.34448 0.89935 0.35106 0.23116 0.92 [...]
+0.97756 0.9744 0.11118 0.05232 0.02478 0.75379 0.71007 0.20978 0.77765 0.30345 0.35502 0.30816 0.19549 0.81706 0.57448 0.43008 0.25562 0.72944 0.61596 0.11583 0.8191 0.34391 0.04098 0.33031 0.52185 0.31978 0.73382 0.15975 0.22951 0.92203 0.94307 0.64201 0.80657 0.89317 0.28766 0.62228 0.59561 0.1441 0.38184 0.46944 0.42315 0.03918 0.64814 0.14732 0.38849 0.57739 0.6057 0.20066 0.45344 0.46292 0.44305 0.04223 0.67442 0.5244 0.85388 0.27564 0.48251 0.73456 0.05603 0.41843 0.19922 0.04061 0 [...]
+0.15775 0.61185 0.79586 0.95926 0.47739 0.52067 0.30792 0.26438 0.98581 0.79857 0.13041 0.98684 0.47287 0.12668 0.04544 0.55249 0.97107 0.31891 0.79731 0.54438 0.2049 0.70922 0.05801 0.49832 0.33389 0.2275 0.00571 0.15235 0.34862 0.07843 0.0132 0.53587 0.56031 0.2751 0.03247 0.74493 0.15468 0.21876 0.01397 0.73742 0.17475 0.12466 0.25323 0.07082 0.09177 0.35281 0.7739 0.58097 0.47191 0.35659 0.13315 0.80379 0.60063 0.25922 0.47122 0.32509 0.74068 0.58539 0.63725 0.45166 0.0892 0.05665 0. [...]
+0.63825 0.90794 0.8521 0.78671 0.68814 0.64295 0.02284 0.72331 0.34446 0.75562 0.06971 0.31208 0.53681 0.75023 0.18093 0.67355 0.91567 0.32255 0.69612 0.42208 0.98881 0.02332 0.54771 0.86441 0.56329 0.92125 0.55484 0.27481 0.08069 0.88855 0.40708 0.29896 0.9082 0.63012 0.07454 0.21949 0.34675 0.94769 0.42125 0.59271 0.52758 0.46914 0.65663 0.04771 0.17158 0.64339 0.53899 0.54837 0.13852 0.00253 0.57781 0.80291 0.46806 0.77123 0.63543 0.68873 0.65903 0.78886 0.73435 0.94232 0.62885 0.9156 [...]
+0.84398 0.63149 0.34384 0.7822 0.9865 0.14168 0.51385 0.40427 0.66814 0.29672 0.07997 0.76132 0.35261 0.22326 0.62488 0.51591 0.48552 0.06088 0.70447 0.55167 0.03038 0.09164 0.97343 0.63297 0.49716 0.11219 0.25116 0.60595 0.96977 0.94207 0.11105 0.05752 0.54345 0.87301 0.84875 0.54983 0.41297 0.08058 0.22626 0.80049 0.54757 0.5092 0.53071 0.20657 0.63886 0.86973 0.55801 0.89484 0.30048 0.51161 0.69737 0.70293 0.15113 0.34967 0.34344 0.55482 0.64358 0.28836 0.7385 0.57189 0.41362 0.87212  [...]
+0.11871 0.36463 0.22469 0.05768 0.19911 0.7978 0.01959 0.33731 0.14191 0.51201 0.3046 0.65785 0.98782 0.44697 0.28907 0.48246 0.99868 0.47785 0.7897 0.21535 0.01807 0.2128 0.74076 0.1089 0.27229 0.93338 0.04669 0.73017 0.69126 0.64994 0.14686 0.62493 0.28542 0.88111 0.5468 0.46826 0.17402 0.61567 0.51834 0.00362 0.00764 0.55297 0.16776 0.58926 0.64171 0.30707 0.58848 0.02775 0.43096 0.46695 0.77562 0.06049 0.19838 0.9646 0.76825 0.73397 0.28499 0.93501 0.1776 0.03157 0.81758 0.16434 0.68 [...]
+0.98875 0.74051 0.26707 0.24297 0.10269 0.58914 0.37987 0.22515 0.53824 0.75252 0.12317 0.5887 0.19914 0.53845 0.55616 0.52386 0.63327 0.9975 0.89691 0.80968 0.40603 0.76964 0.72735 0.34126 0.03931 0.3914 0.92597 0.6406 0.37833 0.05846 0.88331 0.13747 0.34783 0.93192 0.08493 0.78341 0.2974 0.7793 0.34768 0.40636 0.19253 0.62506 0.10694 0.99526 0.93955 0.7427 0.99315 0.85263 0.4174 0.53378 0.29789 0.03189 0.08566 0.1078 0.34475 0.27203 0.39314 0.79928 0.50615 0.83624 0.32723 0.64624 0.438 [...]
+0.4414 0.21797 0.76265 0.26456 0.23539 0.93495 0.03542 0.73436 0.11621 0.0045 0.1426 0.67818 0.67057 0.93059 0.22117 0.91734 0.73667 0.50463 0.32308 0.7448 0.55063 0.6133 0.71696 0.28537 0.90882 0.15277 0.62207 0.73725 0.24812 0.83245 0.52245 0.14878 0.2573 0.51697 0.83594 0.74493 0.90458 0.7707 0.77824 0.70058 0.42985 0.33501 0.92533 0.82341 0.54627 0.78109 0.85929 0.23449 0.69999 0.67346 0.66583 0.77755 0.05094 0.86906 0.18795 0.53526 0.84979 0.83256 0.38164 0.03133 0.96096 0.72308 0.3 [...]
+0.15976 0.2258 0.53073 0.38782 0.87732 0.76205 0.33944 0.75851 0.97014 0.44282 0.67091 0.6584 0.4321 0.12625 0.32213 0.21388 0.81806 0.49221 0.60781 0.32139 0.94201 0.02202 0.36951 0.63635 0.46196 0.46579 0.77038 0.91921 0.95432 0.62672 0.53164 0.67293 0.14412 0.83837 0.90263 0.87013 0.94665 0.63014 0.03346 0.44362 0.38482 0.63874 0.66599 0.44281 0.65208 0.05162 0.73134 0.81008 0.80383 0.4798 0.7993 0.0653 0.02684 0.63376 0.42818 0.9908 0.09749 0.77356 0.50931 0.67076 0.62691 0.31798 0.7 [...]
+0.69725 0.684 0.17307 0.32627 0.33395 0.06901 0.81631 0.17575 0.74626 0.40595 0.01839 0.24907 0.96218 0.71152 0.30006 0.65874 0.14782 0.26842 0.19754 0.3285 0.01696 0.96746 0.52439 0.97399 0.99175 0.67595 0.93486 0.80299 0.80277 0.09212 0.55637 0.60502 0.37619 0.56298 0.59669 0.17451 0.7772 0.80621 0.77264 0.84293 0.39393 0.07492 0.89665 0.58738 0.68872 0.55894 0.70396 0.57823 0.47912 0.15674 0.68807 0.74028 0.15885 0.53939 0.68519 0.92027 0.14571 0.2849 0.84812 0.00876 0.66861 0.1487 0. [...]
+0.71398 0.90373 0.53986 0.10745 0.48476 0.81346 0.28728 0.59127 0.59313 0.15479 0.1624 0.79513 0.45024 0.99233 0.07282 0.2488 0.03222 0.62644 0.64115 0.42606 0.49568 0.24952 0.08991 0.39411 0.60471 0.38044 0.09372 0.89423 0.358 0.54185 0.71051 0.16202 0.68935 0.43166 0.09157 0.40163 0.76462 0.74394 0.43876 0.47629 0.23375 0.62875 0.18587 0.04186 0.61713 0.75655 0.71123 0.16411 0.35612 0.97534 0.27747 0.06281 0.65941 0.98732 0.06469 0.38075 0.20308 0.34693 0.98582 0.66787 0.28837 0.24005  [...]
+0.87481 0.53521 0.02991 0.87687 0.12045 0.37742 0.93195 0.22186 0.80736 0.63734 0.75599 0.5875 0.07183 0.7381 0.88023 0.98785 0.02946 0.98747 0.40933 0.64361 0.27401 0.42353 0.35977 0.21124 0.39371 0.4415 0.27624 0.64985 0.48174 0.38989 0.98516 0.28192 0.57771 0.53546 0.99155 0.53136 0.14878 0.33415 0.46981 0.51417 0.18833 0.28531 0.44616 0.51073 0.58626 0.46699 0.49613 0.76673 0.18199 0.16231 0.52755 0.76746 0.57374 0.50551 0.84552 0.86611 0.291 0.07177 0.24177 0.35416 0.26284 0.22275 0 [...]
+0.8858 0.6858 0.52761 0.20343 0.16933 0.0892 0.22902 0.20482 0.59507 0.42861 0.53409 0.19247 0.62867 0.09142 0.54151 0.16365 0.20119 0.21459 0.52769 0.83984 0.48345 0.85235 0.28155 0.08037 0.94356 0.11608 0.08464 0.01698 0.95242 0.255 0.14541 0.38981 0.50325 0.31271 0.76286 0.18321 0.00297 0.56922 0.30531 0.86399 0.63479 0.02671 0.20425 0.85655 0.53 0.97229 0.13585 0.78533 0.75609 0.06167 0.79927 0.22328 0.186 0.77877 0.36541 0.39359 0.39576 0.08195 0.34609 0.55017 0.46838 0.03501 0.0665 [...]
+0.36473 0.05586 0.06518 0.84071 0.56033 0.39891 0.73998 0.17093 0.99157 0.4546 0.11843 0.40622 0.82869 0.22909 0.35295 0.2792 0.62603 0.92621 0.44407 0.33864 0.48246 0.41191 0.64776 0.01564 0.67607 0.85856 0.85189 0.32028 0.84545 0.46597 0.79763 0.93822 0.93782 0.57832 0.21466 0.8162 0.9764 0.94705 0.91059 0.02753 0.97913 0.11168 0.25586 0.28691 0.92894 0.87144 0.88792 0.49279 0.51271 0.91746 0.99025 0.77934 0.86919 0.31627 0.08715 0.47759 0.41964 0.20223 0.99428 0.59547 0.44509 0.44083  [...]
+0.70834 0.65587 0.06419 0.53747 0.73274 0.7539 0.28683 0.94135 0.77355 0.72351 0.94035 0.42173 0.84262 0.70449 0.00586 0.12076 0.35546 0.51758 0.3367 0.13015 0.74883 0.98788 0.58446 0.90934 0.87109 0.34005 0.23227 0.54057 0.10726 0.73186 0.15844 0.28544 0.06635 0.60115 0.48034 0.25252 0.31968 0.46825 0.54428 0.90994 0.62518 0.29446 0.61141 0.7256 0.02075 0.33668 0.48582 0.08612 0.22698 0.51469 0.03627 0.74651 0.45652 0.03135 0.18585 0.70562 0.03564 0.90167 0.16145 0.75795 0.8795 0.36738  [...]
+0.29483 0.87595 0.87709 0.26033 0.53106 0.88677 0.16988 0.90281 0.86754 0.52269 0.63997 0.19787 0.02967 0.20982 0.90319 0.38881 0.33685 0.08948 0.81646 0.29824 0.8873 0.67942 0.96271 0.56094 0.93762 0.23561 0.16903 0.54859 0.44133 0.23432 0.15364 0.86876 0.67262 0.58077 0.17223 0.43335 0.93156 0.45733 0.54181 0.80171 0.51655 0.97446 0.17115 0.97188 0.35576 0.52779 0.28399 0.77059 0.03762 0.99309 0.0691 0.07445 0.87165 0.91539 0.47342 0.03773 0.40361 0.13427 0.18855 0.8329 0.58667 0.0812  [...]
+0.02712 0.05611 0.54662 0.73942 0.31566 0.97235 0.49527 0.21456 0.1059 0.08964 0.74653 0.21596 0.79289 0.67806 0.41411 0.60485 0.28566 0.02309 0.70328 0.09011 0.70052 0.78082 0.63741 0.51369 0.18912 0.7683 0.63869 0.61349 0.70394 0.75866 0.89286 0.97435 0.84173 0.96603 0.82022 0.70099 0.71331 0.3366 0.98293 0.76985 0.09239 0.21165 0.57422 0.30406 0.9493 0.50102 0.18294 0.12005 0.46455 0.94039 0.2252 0.8257 0.98098 0.90385 0.81812 0.84573 0.72439 0.06084 0.72862 0.20134 0.04042 0.0944 0.6 [...]
+0.10597 0.27669 0.67391 0.54758 0.90672 0.75176 0.64831 0.97755 0.53863 0.33895 0.10571 0.99371 0.04961 0.31543 0.73864 0.36549 0.97641 0.51819 0.90993 0.32375 0.89428 0.58464 0.65352 0.47068 0.09884 0.13959 0.31835 0.15643 0.7799 0.0847 0.93397 0.89506 0.24177 0.32927 0.8828 0.24166 0.81742 0.57217 0.88484 0.31372 0.03448 0.2722 0.32897 0.11543 0.33528 0.01102 0.19861 0.87783 0.23541 0.21651 0.13291 0.74807 0.60614 0.51446 0.19574 0.27943 0.36457 0.15006 0.40461 0.72159 0.38997 0.68858  [...]
+0.97481 0.34036 0.6759 0.68413 0.85896 0.36275 0.47681 0.20699 0.51491 0.53457 0.49153 0.1174 0.15251 0.23896 0.07732 0.82679 0.21041 0.53463 0.89707 0.43065 0.28513 0.0972 0.87189 0.71035 0.74344 0.21726 0.21871 0.94371 0.81022 0.39327 0.76554 0.00996 0.9853 0.81951 0.84951 0.70656 0.3376 0.27642 0.52805 0.87393 0.06674 0.09299 0.49305 0.68478 0.28142 0.15622 0.14355 0.43228 0.81816 0.54324 0.73368 0.23426 0.04424 0.29242 0.81083 0.76476 0.11832 0.73614 0.70427 0.13735 0.36086 0.58468 0 [...]
+0.66982 0.44541 0.86072 0.5616 0.66155 0.70037 0.18207 0.29394 0.24771 0.06614 0.23616 0.50306 0.51288 0.09656 0.61944 0.12896 0.44584 0.3671 0.76318 0.9766 0.49581 0.33779 0.2134 0.74013 0.87174 0.81418 0.62293 0.78769 0.433 0.43153 0.86399 0.6366 0.40239 0.01627 0.11642 0.92868 0.2688 0.69897 0.54933 0.78906 0.9965 0.41075 0.82367 0.15396 0.81746 0.72916 0.529 0.79924 0.71595 0.23072 0.29998 0.19429 0.34556 0.13819 0.65133 0.76392 0.81812 0.30438 0.40386 0.11049 0.60886 0.97606 0.40203 [...]
+0.91346 0.94389 0.59742 0.01523 0.46899 0.48992 0.33153 0.30831 0.49712 0.61749 0.13233 0.30222 0.85125 0.00233 0.55913 0.73868 0.01959 0.26354 0.80402 0.20898 0.2316 0.79134 0.07077 0.242 0.6158 0.53064 0.88321 0.00423 0.19019 0.11691 0.87822 0.67664 0.85282 0.43245 0.74994 0.98985 0.39851 0.77643 0.95982 0.8933 0.96976 0.27694 0.86857 0.76055 0.99794 0.73155 0.55362 0.6186 0.10611 0.35305 0.89393 0.98527 0.26625 0.35733 0.01782 0.56316 0.29378 0.38454 0.35878 0.58077 0.66705 0.83244 0. [...]
+0.94232 0.9938 0.80632 0.05416 0.61496 0.82752 0.02941 0.80246 0.87433 0.89654 0.78959 0.67403 0.58867 0.98015 0.68355 0.36377 0.24527 0.23825 0.84311 0.38403 0.04445 0.34964 0.29749 0.24899 0.6023 0.29283 0.2908 0.58031 0.3336 0.22569 0.14706 0.38866 0.28719 0.48505 0.88929 0.32985 0.79106 0.9277 0.52662 0.27958 0.47398 0.95921 0.1759 0.24148 0.6704 0.76722 0.31049 0.49269 0.34066 0.82611 0.60051 0.20241 0.16407 0.40369 0.46178 0.88566 0.80869 0.43402 0.15084 0.94512 0.74268 0.21233 0.0 [...]
+0.8677 0.54513 0.78543 0.61356 0.9955 0.75657 0.03468 0.98507 0.95994 0.85459 0.08848 0.75627 0.14044 0.98276 0.71011 0.57747 0.5779 0.84047 0.42906 0.74795 0.50157 0.302 0.58883 0.52839 0.78372 0.72 0.84895 0.86885 0.81242 0.92197 0.27315 0.58204 0.35113 0.95567 0.82967 0.33323 0.19724 0.67109 0.70505 0.9299 0.39706 0.7906 0.09113 0.41192 0.73318 0.80296 0.36891 0.98266 0.06721 0.25104 0.56111 0.75433 0.85314 0.08793 0.90949 0.42799 0.19719 0.42414 0.66548 0.92363 0.53911 0.94538 0.0573 [...]
+0.59009 0.21053 0.66929 0.97003 0.35695 0.06559 0.29072 0.97047 0.72357 0.86598 0.93896 0.43379 0.23168 0.8948 0.4699 0.93653 0.51651 0.33386 0.30564 0.37852 0.65651 0.50352 0.24679 0.96526 0.84029 0.82835 0.07951 0.33213 0.37918 0.47565 0.22525 0.88077 0.82293 0.5766 0.5448 0.12982 0.76856 0.698 0.62427 0.11752 0.33901 0.20666 0.91433 0.71247 0.45748 0.98026 0.4497 0.84058 0.18144 0.16128 0.72532 0.5497 0.00196 0.1684 0.38092 0.16556 0.5441 0.09674 0.893 0.27592 0.74395 0.06491 0.63567  [...]
+0.06313 0.43703 0.60275 0.19893 0.26517 0.62016 0.25902 0.05523 0.39134 0.41448 0.07209 0.60694 0.14726 0.27712 0.36073 0.80667 0.56584 0.60224 0.65828 0.0412 0.82776 0.19571 0.76763 0.63823 0.82073 0.77785 0.87721 0.2444 0.5786 0.01727 0.45618 0.7277 0.40189 0.94914 0.93652 0.21621 0.82688 0.04231 0.56253 0.89366 0.76267 0.30868 0.61096 0.70204 0.21986 0.69234 0.66414 0.19933 0.4408 0.18549 0.74714 0.37463 0.44115 0.66772 0.67048 0.70247 0.10583 0.11928 0.62689 0.55671 0.30685 0.72358 0 [...]
+0.86334 0.09426 0.92334 0.4469 0.69281 0.46092 0.04385 0.03886 0.81032 0.01836 0.0629 0.78458 0.8237 0.55319 0.42887 0.30596 0.23184 0.95862 0.25599 0.408 0.74464 0.08538 0.11034 0.8681 0.09949 0.57653 0.92704 0.76012 0.71487 0.36031 0.9059 0.56861 0.65635 0.25839 0.15796 0.21949 0.14384 0.92486 0.48236 0.41919 0.15579 0.67572 0.93385 0.60368 0.55758 0.68357 0.94641 0.94657 0.28851 0.3271 0.32764 0.15924 0.18206 0.63487 0.78397 0.52531 0.28172 0.63351 0.46333 0.66281 0.0772 0.18066 0.626 [...]
+0.35509 0.72442 0.08026 0.08436 0.65663 0.41241 0.5549 0.87404 0.72289 0.49124 0.07677 0.42224 0.69832 0.30082 0.76595 0.99462 0.18184 0.91239 0.85277 0.63397 0.80071 0.78023 0.23508 0.54572 0.09656 0.96607 0.24004 0.14438 0.6437 0.83589 0.36605 0.22527 0.98507 0.11796 0.54163 0.54387 0.33106 0.58124 0.92592 0.88796 0.93788 0.7181 0.04645 0.40161 0.20459 0.64229 0.05136 0.36788 0.49926 0.89204 0.95745 0.53529 0.47707 0.05468 0.44055 0.2569 0.87082 0.66724 0.32538 0.27364 0.46979 0.58375  [...]
+0.30664 0.49496 0.86385 0.15924 0.32017 0.24291 0.67221 0.32395 0.91826 0.48055 0.87946 0.38771 0.73593 0.39869 0.43786 0.14512 0.97469 0.8111 0.97016 0.5347 0.25175 0.53705 0.88446 0.16934 0.95847 0.20482 0.15012 0.09643 0.3126 0.22737 0.21016 0.07239 0.70513 0.40609 0.12438 0.9713 0.77599 0.13015 0.85806 0.98984 0.92394 0.82241 0.95696 0.34532 0.57244 0.51245 0.27293 0.14462 0.58797 0.36088 0.21465 0.87029 0.63052 0.9937 0.38693 0.87283 0.43989 0.78149 0.45718 0.99025 0.10695 0.57059 0 [...]
+0.64848 0.20653 0.13169 0.05687 0.22505 0.08696 0.79564 0.85067 0.00086 0.27036 0.91947 0.22227 0.62065 0.309 0.94855 0.18938 0.53869 0.91513 0.16197 0.49698 0.52321 0.40652 0.91074 0.4298 0.64198 0.84389 0.9096 0.21863 0.87757 0.80915 0.59071 0.13465 0.25265 0.5134 0.03893 0.68168 0.12129 0.38296 0.75545 0.2825 0.44857 0.25711 0.99161 0.84401 0.27209 0.30065 0.9181 0.80517 0.61318 0.48318 0.74252 0.19604 0.87938 0.93382 0.59567 0.96049 0.74 0.80011 0.42848 0.2334 0.87046 0.68595 0.28007 [...]
+0.29175 0.40672 0.0544 0.17739 0.73396 0.74843 0.49055 0.99617 0.76597 0.45391 0.18841 0.64519 0.98856 0.44481 0.57761 0.04769 0.27818 0.47495 0.44919 0.52902 0.43309 0.21057 0.96067 0.48599 0.31229 0.98458 0.75153 0.10158 0.65604 0.09528 0.96207 0.40164 0.02451 0.43883 0.92315 0.66061 0.6659 0.65719 0.75918 0.70678 0.73541 0.43023 0.12581 0.38619 0.57194 0.64079 0.07308 0.51649 0.00343 0.67823 0.3613 0.29246 0.29901 0.17634 0.89021 0.24508 0.55759 0.40721 0.83531 0.62524 0.25025 0.24395 [...]
+0.90756 0.35144 0.81043 0.89688 0.53725 0.40386 0.07345 0.9909 0.5676 0.47177 0.77424 0.97097 0.19442 0.50032 0.21505 0.55714 0.16494 0.65802 0.03931 0.55509 0.91412 0.71948 0.31788 0.30375 0.88009 0.88726 0.80322 0.68863 0.63031 0.52117 0.56 0.96955 0.53609 0.3956 0.78808 0.92053 0.2472 0.075 0.21798 0.91004 0.1184 0.95422 0.97605 0.09629 0.72634 0.13177 0.1799 0.76591 0.10979 0.89394 0.75381 0.34499 0.37104 0.33368 0.30637 0.9306 0.39702 0.47286 0.00241 0.47631 0.53224 0.7678 0.0235 0. [...]
+0.0519 0.01667 0.73786 0.21102 0.56121 0.96093 0.22588 0.99546 0.49001 0.46009 0.4517 0.105 0.67033 0.64294 0.59047 0.00565 0.42149 0.96303 0.05693 0.35642 0.88018 0.50268 0.92029 0.30454 0.94303 0.08196 0.59719 0.24877 0.24752 0.6087 0.57635 0.24043 0.86338 0.31279 0.5841 0.30465 0.50914 0.54915 0.69608 0.84826 0.06308 0.13988 0.82077 0.13902 0.10412 0.46476 0.43887 0.15142 0.61222 0.02895 0.48138 0.64975 0.19732 0.04631 0.4206 0.1835 0.97131 0.08836 0.24971 0.94575 0.1688 0.28689 0.646 [...]
+0.54312 0.54216 0.24905 0.47317 0.15849 0.08947 0.33681 0.60729 0.45605 0.34792 0.38326 0.17236 0.4203 0.74811 0.483 0.36965 0.72754 0.08309 0.03444 0.22837 0.34068 0.26889 0.13065 0.03321 0.83509 0.92852 0.04277 0.13988 0.31038 0.48143 0.03659 0.52273 0.96302 0.31854 0.90606 0.814 0.92683 0.42802 0.4975 0.51177 0.07115 0.81641 0.83252 0.20305 0.01709 0.90695 0.6384 0.54252 0.26761 0.2632 0.47229 0.49161 0.88318 0.72392 0.77534 0.25304 0.46144 0.56314 0.92127 0.10677 0.9657 0.61158 0.860 [...]
+0.19977 0.44435 0.40278 0.86239 0.04402 0.0605 0.24512 0.84229 0.43061 0.47285 0.88361 0.35261 0.08704 0.47389 0.21631 0.50325 0.56718 0.83681 0.80454 0.47059 0.94492 0.25392 0.64755 0.38533 0.00673 0.15366 0.16257 0.24998 0.12457 0.17412 0.56458 0.66659 0.26599 0.66019 0.49243 0.5713 0.97804 0.50248 0.72456 0.91768 0.09555 0.44397 0.3755 0.66424 0.68955 0.24884 0.41296 0.59128 0.70183 0.80832 0.41761 0.34281 0.52402 0.98216 0.2913 0.97338 0.39799 0.36302 0.76261 0.70122 0.11906 0.84385  [...]
+0.1948 0.25567 0.67588 0.33379 0.88038 0.85851 0.98117 0.10252 0.70745 0.11312 0.04314 0.06779 0.71684 0.24559 0.6238 0.43656 0.80686 0.55587 0.73902 0.40858 0.51026 0.64459 0.10825 0.9345 0.92151 0.13706 0.43777 0.15585 0.3839 0.4897 0.18112 0.14077 0.6257 0.43382 0.43867 0.95259 0.98161 0.28772 0.65853 0.40652 0.44592 0.35663 0.45052 0.05291 0.92459 0.85553 0.78248 0.82866 0.69143 0.06509 0.94427 0.02292 0.22324 0.70086 0.38225 0.49271 0.34197 0.60376 0.14015 0.67415 0.03837 0.61951 0. [...]
+0.09722 0.61216 0.43751 0.23057 0.35327 0.43726 0.13436 0.89162 0.86406 0.08569 0.03673 0.17012 0.47007 0.00997 0.87646 0.27407 0.62584 0.61233 0.79479 0.11279 0.86615 0.47382 0.46028 0.95654 0.87298 0.37949 0.71723 0.23233 0.76008 0.4445 0.99426 0.21954 0.6499 0.58232 0.94175 0.95763 0.38854 0.94023 0.06038 0.08202 0.92703 0.95487 0.90396 0.82011 0.36419 0.69691 0.56334 0.70188 0.96336 0.57352 0.72325 0.54021 0.00811 0.64708 0.74309 0.52872 0.93957 0.46914 0.21568 0.6753 0.82553 0.65552 [...]
+0.35033 0.43207 0.62528 0.10411 0.46228 0.90513 0.59787 0.71337 0.29776 0.23299 0.03785 0.68017 0.40638 0.51363 0.8316 0.92413 0.84424 0.68133 0.42512 0.12915 0.19234 0.92188 0.84522 0.0343 0.05963 0.49294 0.86621 0.39706 0.51923 0.96177 0.28051 0.76982 0.78489 0.37825 0.29255 0.1109 0.52864 0.47618 0.26107 0.59281 0.8109 0.02333 0.49433 0.94622 0.767 0.69689 0.10647 0.79206 0.57742 0.71472 0.76652 0.14404 0.97365 0.58945 0.64183 0.67773 0.72372 0.40751 0.43088 0.88941 0.72985 0.79912 0. [...]
+0.04085 0.39123 0.43643 0.60422 0.10367 0.7261 0.80323 0.76164 0.88693 0.17555 0.6628 0.93665 0.63144 0.51774 0.43673 0.03703 0.97359 0.71378 0.94695 0.76003 0.58956 0.07967 0.77181 0.78317 0.98878 0.68948 0.68557 0.35525 0.27626 0.62697 0.15741 0.3148 0.46975 0.29403 0.02339 0.83131 0.91088 0.32076 0.5357 0.02283 0.59005 0.05091 0.45041 0.13439 0.65655 0.71852 0.08499 0.19594 0.40288 0.175 0.05055 0.94075 0.32286 0.78453 0.77921 0.69037 0.18155 0.08684 0.23194 0.09904 0.4334 0.93717 0.0 [...]
+0.9535 0.0426 0.59749 0.05658 0.87996 0.79706 0.90144 0.57936 0.59097 0.24087 0.08918 0.35401 0.28313 0.36263 0.4096 0.03182 0.53561 0.3782 0.54486 0.82961 0.78464 0.79381 0.48278 0.97927 0.16056 0.70554 0.68374 0.11188 0.37368 0.10374 0.08414 0.57073 0.3673 0.51577 0.31219 0.08934 0.56424 0.3549 0.7177 0.89503 0.83315 0.37209 0.52576 0.02941 0.70777 0.96163 0.45948 0.85758 0.61597 0.71515 0.70904 0.31598 0.19307 0.27174 0.8327 0.77692 0.22711 0.1874 0.61625 0.11865 0.977 0.80725 0.49721 [...]
+0.5104 0.25463 0.33009 0.39835 0.29034 0.69394 0.43543 0.04401 0.47342 0.02527 0.98873 0.28776 0.87682 0.91171 0.95495 0.92531 0.8994 0.28741 0.43788 0.42708 0.84342 0.71996 0.53283 0.92254 0.67072 0.73787 0.91972 0.93231 0.43887 0.71998 0.3084 0.58733 0.73791 0.57786 0.77276 0.18302 0.98801 0.39665 0.32448 0.40363 0.18857 0.66239 0.05436 0.13291 0.26894 0.05511 0.42822 0.13864 0.84175 0.89432 0.21709 0.41507 0.07785 0.5459 0.13002 0.59739 0.08152 0.47907 0.27127 0.79103 0.30902 0.6874 0 [...]
+0.26528 0.02731 0.38617 0.62212 0.76132 0.12612 0.78745 0.73046 0.47205 0.18639 0.20726 0.70996 0.54403 0.53962 0.41249 0.14544 0.87162 0.2254 0.65217 0.1416 0.18041 0.41622 0.71126 0.16886 0.16808 0.8062 0.88814 0.23467 0.68143 0.08408 0.64883 0.35971 0.77017 0.39713 0.88296 0.70064 0.70538 0.62793 0.35126 0.9878 0.58448 0.93748 0.68343 0.75652 0.31608 0.20014 0.57672 0.37603 0.95683 0.95904 0.50149 0.41375 0.83006 0.98588 0.66915 0.09947 0.21842 0.92573 0.70715 0.55711 0.69586 0.0868 0 [...]
+0.02255 0.95718 0.07042 0.14185 0.46908 0.56921 0.47285 0.01119 0.77629 0.78834 0.95119 0.98209 0.62156 0.71087 0.21579 0.3451 0.80595 0.02885 0.94944 0.47489 0.1925 0.5813 0.23576 0.79927 0.46965 0.84485 0.33785 0.70115 0.63846 0.50596 0.22824 0.61775 0.26513 0.71602 0.9891 0.41065 0.08895 0.78496 0.66163 0.05669 0.32289 0.94254 0.58931 0.95785 0.55693 0.20047 0.32244 0.73212 0.63348 0.55777 0.35072 0.26366 0.05222 0.51476 0.02716 0.22679 0.46725 0.14181 0.63223 0.78521 0.68494 0.33387  [...]
+0.89438 0.37575 0.10665 0.9207 0.61846 0.66714 0.58796 0.94943 0.82942 0.60979 0.79245 0.27018 0.82328 0.30498 0.56867 0.20816 0.03982 0.2132 0.11147 0.11881 0.68671 0.45449 0.53372 0.12615 0.61865 0.79081 0.41779 0.11674 0.50124 0.5938 0.05417 0.22284 0.85134 0.24846 0.9004 0.70605 0.51332 0.36053 0.28956 0.69149 0.66792 0.44251 0.15333 0.79286 0.58901 0.44978 0.26961 0.42042 0.34326 0.72327 0.09131 0.42747 0.5017 0.26378 0.77149 0.41125 0.48306 0.0177 0.92596 0.05623 0.40336 0.45254 0. [...]
+0.28956 0.88252 0.43848 0.46437 0.29395 0.86436 0.65862 0.69411 0.6797 0.02382 0.70178 0.02567 0.40158 0.66135 0.73193 0.20124 0.74409 0.88935 0.37586 0.74628 0.51495 0.2425 0.62153 0.6617 0.40117 0.75596 0.26154 0.61942 0.56312 0.27638 0.99807 0.44074 0.77405 0.56643 0.71994 0.41491 0.91629 0.52708 0.55441 0.82159 0.64619 0.4388 0.57676 0.91547 0.71879 0.54324 0.90253 0.25228 0.43993 0.48669 0.97455 0.91264 0.67443 0.37669 0.45751 0.4038 0.45439 0.86271 0.48767 0.53931 0.91868 0.29104 0 [...]
+0.27258 0.92318 0.01136 0.29712 0.98631 0.66839 0.78527 0.87049 0.0249 0.33618 0.20948 0.96251 0.35033 0.56784 0.55099 0.96941 0.34645 0.90102 0.01244 0.75449 0.99457 0.93915 0.22251 0.48671 0.19694 0.19998 0.05477 0.70839 0.38646 0.98184 0.81037 0.56312 0.01925 0.18714 0.79271 0.43231 0.37422 0.91477 0.8913 0.00223 0.37072 0.60324 0.8859 0.79745 0.07361 0.56841 0.19353 0.58773 0.27355 0.16622 0.1828 0.29926 0.2625 0.56395 0.3917 0.04808 0.73465 0.31501 0.62036 0.66261 0.00881 0.65762 0. [...]
+0.96509 0.45193 0.25718 0.32451 0.93768 0.57722 0.44705 0.12627 0.145 0.0045 0.0691 0.59526 0.05315 0.65163 0.70277 0.06075 0.03695 0.44447 0.50509 0.21513 0.95954 0.60924 0.51396 0.3351 0.85532 0.4381 0.53268 0.15574 0.12357 0.10989 0.65534 0.76429 0.1881 0.53391 0.49851 0.53389 0.57178 0.31167 0.95542 0.08982 0.98941 0.12321 0.93491 0.41815 0.49951 0.30473 0.2914 0.24563 0.54875 0.4201 0.46516 0.51768 0.29789 0.51076 0.99001 0.37756 0.57226 0.16633 0.64044 0.31194 0.28478 0.39272 0.708 [...]
+0.4166 0.74868 0.78947 0.83096 0.91409 0.82245 0.18107 0.57718 0.68599 0.00083 0.04 0.60193 0.51398 0.33986 0.12398 0.36946 0.09838 0.16771 0.3096 0.75662 0.92628 0.2569 0.79177 0.60799 0.6184 0.63432 0.71914 0.59958 0.72919 0.78886 0.74405 0.12871 0.71396 0.47183 0.23846 0.66931 0.88244 0.76685 0.63791 0.55913 0.2507 0.23924 0.06604 0.80933 0.34353 0.51462 0.64847 0.83609 0.49462 0.86861 0.56259 0.69008 0.41445 0.40529 0.77663 0.14355 0.88058 0.03551 0.02824 0.2909 0.6127 0.48353 0.5225 [...]
+0.73643 0.07556 0.39917 0.76488 0.03867 0.43197 0.81586 0.56383 0.67816 0.86658 0.73792 0.60127 0.46549 0.13754 0.66691 0.45302 0.29807 0.1929 0.52848 0.37918 0.38824 0.93448 0.17829 0.60084 0.87687 0.77941 0.00802 0.0772 0.43336 0.86061 0.14573 0.49847 0.86989 0.06809 0.33187 0.66835 0.85556 0.88254 0.88346 0.83182 0.94088 0.51484 0.20572 0.27497 0.81439 0.67454 0.35927 0.24408 0.0239 0.07985 0.97728 0.1204 0.78573 0.98602 0.06391 0.53994 0.03577 0.12004 0.80348 0.52391 0.13076 0.35043  [...]
+0.25919 0.83521 0.40306 0.0783 0.99331 0.13937 0.25855 0.51415 0.77223 0.95323 0.70128 0.66814 0.75204 0.15908 0.28502 0.8924 0.8496 0.6966 0.19065 0.17138 0.81276 0.86326 0.22167 0.05987 0.75964 0.80556 0.9511 0.30832 0.25985 0.357 0.34404 0.93036 0.93943 0.15704 0.89445 0.86718 0.97966 0.64836 0.68921 0.52007 0.88134 0.11103 0.8886 0.38634 0.91873 0.21935 0.72684 0.82734 0.51584 0.39913 0.68506 0.32999 0.26483 0.2077 0.26112 0.9143 0.65941 0.15848 0.07578 0.20598 0.94577 0.73594 0.9449 [...]
+0.96144 0.08751 0.36206 0.641 0.30875 0.33354 0.2206 0.67677 0.31881 0.90474 0.46464 0.35003 0.12482 0.11919 0.35027 0.86592 0.71192 0.93332 0.49354 0.44667 0.69163 0.35253 0.42448 0.25294 0.31269 0.89946 0.90712 0.6031 0.31413 0.17069 0.10685 0.52999 0.21121 0.34926 0.61609 0.69263 0.44257 0.36401 0.54178 0.87346 0.74303 0.45113 0.89016 0.97782 0.50797 0.66051 0.6646 0.05542 0.61817 0.28124 0.59997 0.36616 0.73202 0.05471 0.51884 0.50504 0.33241 0.4706 0.44356 0.25181 0.96578 0.16838 0. [...]
+0.43924 0.07353 0.40022 0.43456 0.76533 0.27804 0.16001 0.93761 0.12886 0.98673 0.34466 0.29202 0.74414 0.92769 0.68871 0.5891 0.65951 0.28239 0.27917 0.09176 0.06592 0.05262 0.42864 0.5951 0.53266 0.3757 0.56283 0.45421 0.41785 0.44813 0.30706 0.77267 0.37817 0.20527 0.325 0.55878 0.41508 0.02993 0.31985 0.56799 0.83157 0.71979 0.79622 0.9064 0.24572 0.2364 0.01939 0.62069 0.6569 0.36083 0.60081 0.81696 0.76798 0.30982 0.91204 0.19681 0.74693 0.17875 0.15042 0.0643 0.99961 0.82671 0.979 [...]
+0.4727 0.05974 0.64714 0.54325 0.41114 0.43375 0.262 0.18916 0.20285 0.97503 0.44232 0.94915 0.93813 0.92091 0.93473 0.83663 0.573 0.29391 0.6494 0.18892 0.60387 0.17698 0.35021 0.84775 0.20699 0.03489 0.44658 0.77327 0.28129 0.90723 0.72167 0.06536 0.85576 0.4017 0.05232 0.92609 0.35451 0.40608 0.15558 0.06718 0.15441 0.39088 0.45864 0.44346 0.01386 0.42211 0.18775 0.91356 0.56026 0.68276 0.13822 0.13526 0.52165 0.90372 0.59143 0.22962 0.91502 0.0558 0.56347 0.86001 0.19571 0.72752 0.04 [...]
+0.13056 0.02728 0.80743 0.70263 0.74737 0.65636 0.49777 0.82774 0.62483 0.78753 0.7404 0.72578 0.58066 0.13425 0.7541 0.38741 0.9694 0.18563 0.36043 0.22506 0.75653 0.32356 0.25136 0.5798 0.14767 0.78809 0.66408 0.26798 0.35302 0.75845 0.40837 0.21884 0.26715 0.25827 0.74987 0.33973 0.31682 0.39312 0.81363 0.26645 0.6085 0.10981 0.37696 0.01001 0.30739 0.31992 0.05739 0.80487 0.67258 0.00402 0.02184 0.13934 0.19475 0.69202 0.64631 0.45329 0.26102 0.34598 0.73523 0.00932 0.8169 0.64605 0. [...]
+0.50502 0.81799 0.32903 0.93541 0.80613 0.19177 0.54411 0.58126 0.79064 0.9898 0.62992 0.41504 0.17009 0.5114 0.15275 0.83009 0.68512 0.42684 0.50037 0.38285 0.88127 0.99086 0.33252 0.85431 0.85143 0.19638 0.34144 0.04633 0.85876 0.04272 0.59677 0.43743 0.10946 0.33256 0.48659 0.11472 0.74857 0.7401 0.65473 0.58817 0.2506 0.18811 0.34288 0.34905 0.30287 0.51026 0.86031 0.99875 0.00014 0.42742 0.73199 0.13864 0.45303 0.97691 0.03478 0.5177 0.15227 0.72097 0.89038 0.64515 0.78167 0.94723 0 [...]
+0.60975 0.70064 0.99068 0.40565 0.99226 0.28435 0.21618 0.92482 0.15291 0.74841 0.93684 0.93338 0.60608 0.77343 0.45715 0.63573 0.59011 0.7017 0.00943 0.54646 0.41363 0.88319 0.10246 0.16395 0.41663 0.44202 0.76072 0.62538 0.39292 0.47253 0.95137 0.86769 0.76102 0.37104 0.57931 0.19049 0.86539 0.63508 0.33047 0.59284 0.54692 0.95978 0.47198 0.83079 0.11073 0.38195 0.86857 0.66045 0.33509 0.66083 0.46378 0.25871 0.80126 0.83918 0.42841 0.43454 0.99224 0.11829 0.49123 0.17929 0.74405 0.966 [...]
+0.83659 0.17865 0.66801 0.34673 0.2255 0.08225 0.79967 0.1715 0.62907 0.18754 0.46055 0.58152 0.99732 0.9609 0.49281 0.47191 0.12449 0.77069 0.79023 0.32306 0.13598 0.18411 0.6648 0.68928 0.08983 0.51896 0.32227 0.67031 0.1122 0.48391 0.16255 0.07336 0.74319 0.35952 0.16356 0.79584 0.91765 0.10338 0.30182 0.32288 0.53178 0.34529 0.43622 0.36898 0.19236 0.99703 0.43698 0.49699 0.03851 0.34783 0.44648 0.07329 0.58565 0.625 0.82938 0.3374 0.69381 0.39891 0.63002 0.20451 0.43781 0.51644 0.20 [...]
+0.51775 0.59282 0.73969 0.33456 0.65935 0.43411 0.29318 0.05172 0.37675 0.31659 0.94962 0.98595 0.59151 0.49923 0.77251 0.5679 0.69047 0.77638 0.4278 0.57024 0.8918 0.69935 0.79081 0.19857 0.15803 0.65302 0.82043 0.18711 0.49352 0.963 0.88976 0.99945 0.51137 0.0345 0.98613 0.50678 0.70696 0.12792 0.65741 0.00617 0.07578 0.0575 0.01962 0.28378 0.96947 0.33779 0.81913 0.79317 0.23989 0.8485 0.56983 0.23903 0.49614 0.34112 0.11614 0.25042 0.49577 0.27558 0.41382 0.75342 0.44406 0.49414 0.43 [...]
+0.67835 0.69716 0.28944 0.89308 0.8332 0.86632 0.29346 0.13235 0.74142 0.85368 0.34178 0.65519 0.6885 0.50878 0.71682 0.63029 0.44551 0.64808 0.23576 0.36626 0.83205 0.97503 0.19888 0.89667 0.67462 0.903 0.50263 0.83653 0.98875 0.17201 0.65453 0.99591 0.11904 0.41726 0.337 0.61674 0.0472 0.32432 0.56353 0.6139 0.66924 0.53711 0.11027 0.23881 0.30616 0.62821 0.96897 0.96189 0.58301 0.65891 0.15112 0.18702 0.72183 0.16809 0.2752 0.18106 0.63552 0.09467 0.88469 0.46977 0.63061 0.90201 0.584 [...]
+0.94718 0.55475 0.25371 0.16278 0.23952 0.29886 0.13009 0.74018 0.93176 0.03081 0.62848 0.8085 0.16671 0.02662 0.86959 0.04169 0.58991 0.85812 0.75185 0.28086 0.83915 0.93504 0.08318 0.41122 0.36716 0.9927 0.07199 0.36845 0.92016 0.46781 0.85701 0.88158 0.31123 0.24129 0.72486 0.18611 0.7585 0.99592 0.03792 0.91722 0.82704 0.13351 0.9977 0.78088 0.0079 0.47001 0.14323 0.10952 0.53795 0.3981 0.04758 0.38316 0.49733 0.53631 0.92251 0.28725 0.17044 0.99533 0.65442 0.03803 0.11063 0.17 0.182 [...]
+0.06709 0.62246 0.55603 0.56389 0.04031 0.92644 0.13224 0.49025 0.50837 0.86209 0.20622 0.38603 0.34849 0.56666 0.94582 0.3906 0.69924 0.3948 0.65288 0.2941 0.65552 0.15787 0.32911 0.10816 0.94812 0.80906 0.45647 0.37387 0.89981 0.75104 0.31871 0.38769 0.48923 0.73672 0.33247 0.26444 0.21519 0.98137 0.02186 0.82564 0.12555 0.04638 0.338 0.63455 0.2329 0.07569 0.43012 0.14113 0.92982 0.43334 0.4057 0.47655 0.29794 0.26894 0.56359 0.32618 0.47637 0.85883 0.85897 0.07957 0.25998 0.70495 0.2 [...]
+0.9177 0.11179 0.71374 0.89074 0.12627 0.33464 0.08303 0.49015 0.11074 0.47539 0.93685 0.11372 0.31617 0.84321 0.28606 0.65368 0.81244 0.96383 0.74133 0.04927 0.51122 0.54963 0.97268 0.06486 0.09568 0.51704 0.24516 0.96191 0.15305 0.37977 0.22831 0.48042 0.33108 0.75584 0.65928 0.04213 0.50339 0.9794 0.45715 0.24335 0.11951 0.51165 0.56911 0.19177 0.73117 0.60976 0.18425 0.31476 0.91073 0.28272 0.52919 0.2265 0.56025 0.75783 0.15014 0.10588 0.0793 0.12778 0.67745 0.60917 0.92749 0.86258  [...]
+0.53602 0.90513 0.6164 0.12755 0.14559 0.86935 0.947 0.8364 0.31082 0.40435 0.41876 0.61056 0.24729 0.30074 0.96257 0.27384 0.66824 0.4753 0.1342 0.83827 0.62679 0.05576 0.21748 0.19967 0.33317 0.44792 0.58281 0.77611 0.33129 0.64463 0.32078 0.30326 0.58132 0.7725 0.72668 0.57681 0.49345 0.54798 0.00268 0.28604 0.58026 0.06605 0.62304 0.64898 0.26528 0.3841 0.86753 0.76278 0.89073 0.31229 0.92807 0.54522 0.20274 0.78284 0.13174 0.26903 0.35073 0.31269 0.85419 0.81447 0.8531 0.21557 0.647 [...]
+0.39273 0.33257 0.54243 0.98492 0.73898 0.01865 0.81293 0.37094 0.86645 0.51023 0.76523 0.93214 0.367 0.8552 0.01096 0.19424 0.34183 0.02251 0.70416 0.20689 0.21679 0.25341 0.96829 0.32192 0.17584 0.54207 0.94306 0.26191 0.46088 0.68519 0.25907 0.18593 0.02685 0.23592 0.98799 0.71493 0.92345 0.24145 0.44124 0.62092 0.70431 0.11274 0.27026 0.08774 0.34731 0.47622 0.76851 0.26195 0.20934 0.13093 0.01542 0.75859 0.54982 0.1375 0.87037 0.51617 0.71938 0.92868 0.40517 0.47126 0.62881 0.48598  [...]
+0.22 0.16818 0.91385 0.84138 0.54077 0.53251 0.79165 0.58527 0.7625 0.18849 0.07459 0.38138 0.29538 0.56371 0.97395 0.27089 0.76562 0.63207 0.31975 0.73289 0.88283 0.45247 0.15006 0.26717 0.38305 0.93432 0.10437 0.62389 0.5818 0.01012 0.70147 0.59198 0.68337 0.30334 0.4171 0.66649 0.58149 0.26592 0.9254 0.22059 0.16501 0.72307 0.34182 0.86481 0.19989 0.24159 0.3455 0.30025 0.48166 0.67626 0.4334 0.00953 0.9463 0.4399 0.83107 0.97324 0.91877 0.52465 0.32113 0.72978 0.22253 0.37141 0.26725 [...]
+0.81077 0.15555 0.56569 0.08385 0.28488 0.24463 0.06376 0.77755 0.61649 0.89754 0.04454 0.41492 0.53154 0.59421 0.51763 0.88435 0.64241 0.22179 0.9745 0.33666 0.46045 0.83466 0.18122 0.33075 0.20931 0.14788 0.45921 0.11168 0.78136 0.31551 0.99165 0.84856 0.64329 0.59056 0.63707 0.59588 0.40403 0.22988 0.8345 0.5833 0.90245 0.94477 0.05984 0.87687 0.4124 0.64101 0.93949 0.36904 0.49254 0.70228 0.29098 0.31058 0.44966 0.86656 0.30541 0.08517 0.43793 0.44063 0.43046 0.68801 0.71495 0.89985  [...]
+0.89899 0.1358 0.09572 0.88396 0.60247 0.22741 0.53313 0.71065 0.73384 0.76073 0.60402 0.19754 0.11344 0.5953 0.14092 0.03716 0.03564 0.68403 0.34929 0.41951 0.5573 0.45837 0.44867 0.02331 0.96264 0.10731 0.50571 0.61065 0.84911 0.18611 0.81211 0.93686 0.01613 0.58751 0.51494 0.95335 0.86228 0.01423 0.47764 0.25252 0.23048 0.15348 0.88471 0.71162 0.91562 0.8482 0.37454 0.97925 0.7087 0.46444 0.11875 0.4486 0.87239 0.81592 0.94398 0.60646 0.48185 0.92291 0.60585 0.2706 0.06812 0.55275 0.1 [...]
+0.02865 0.08407 0.34166 0.98852 0.10412 0.01198 0.3712 0.61284 0.5426 0.71724 0.37633 0.89709 0.46408 0.36721 0.93268 0.15725 0.56417 0.4917 0.02692 0.411 0.06952 0.47972 0.8722 0.70624 0.04123 0.34338 0.685 0.42338 0.4852 0.00074 0.15599 0.35806 0.83178 0.72888 0.76241 0.8549 0.68006 0.77443 0.37461 0.83638 0.92709 0.29437 0.55139 0.82123 0.6314 0.02569 0.01485 0.88703 0.88995 0.15346 0.94911 0.52996 0.56342 0.80291 0.71156 0.31556 0.69241 0.98081 0.48048 0.56605 0.33307 0.85262 0.13383 [...]
+0.16977 0.60472 0.9453 0.8591 0.46027 0.25594 0.91544 0.07103 0.96316 0.19494 0.07273 0.16608 0.34292 0.0531 0.06989 0.4216 0.17487 0.16497 0.28624 0.09907 0.22648 0.10467 0.65005 0.97675 0.46074 0.08264 0.18864 0.84645 0.34329 0.47873 0.28961 0.94909 0.87928 0.18626 0.73169 0.7091 0.13261 0.86929 0.52784 0.52391 0.14315 0.8426 0.96623 0.88794 0.52543 0.62018 0.65708 0.2431 0.03941 0.72246 0.84949 0.87472 0.9177 0.83392 0.66594 0.11265 0.07793 0.29515 0.04883 0.43997 0.93156 0.37583 0.62 [...]
+0.87883 0.74131 0.43673 0.83038 0.9446 0.17457 0.63333 0.91226 0.90808 0.39739 0.23533 0.59818 0.6924 0.81414 0.88459 0.28646 0.70798 0.43177 0.53856 0.37621 0.56533 0.4823 0.34196 0.67226 0.53794 0.92478 0.70542 0.41511 0.90026 0.56573 0.19203 0.20756 0.52698 0.13109 0.96726 0.60871 0.15283 0.13765 0.89379 0.97822 0.16209 0.563 0.81151 0.57112 0.31457 0.03354 0.78756 0.79391 0.74895 0.38001 0.55694 0.51523 0.92358 0.70907 0.45004 0.82057 0.80115 0.83881 0.54182 0.48922 0.06031 0.69218 0 [...]
+0.99629 0.79392 0.59014 0.83725 0.25631 0.7304 0.38746 0.4459 0.41985 0.17408 0.39413 0.71947 0.40778 0.60031 0.50847 0.82912 0.49742 0.24888 0.02624 0.17188 0.12949 0.52553 0.77678 0.18059 0.8522 0.40048 0.53043 0.47275 0.14205 0.95675 0.01635 0.76912 0.60783 0.05609 0.60908 0.09589 0.86396 0.94805 0.22267 0.33306 0.948 0.9102 0.41027 0.85992 0.41094 0.92221 0.99584 0.05178 0.33588 0.93268 0.25631 0.44707 0.91113 0.10467 0.90028 0.38906 0.82153 0.98077 0.84731 0.81313 0.65 0.15985 0.821 [...]
+0.29716 0.30376 0.71129 0.12518 0.55618 0.78599 0.90172 0.21014 0.02414 0.31226 0.8172 0.90569 0.75121 0.17659 0.89567 0.77469 0.59169 0.93644 0.35945 0.86277 0.14528 0.87519 0.71104 0.52733 0.76107 0.84606 0.99934 0.2405 0.61406 0.57125 0.05603 0.0633 0.92773 0.99395 0.23461 0.67086 0.46116 0.15586 0.94325 0.45002 0.85772 0.47883 0.59209 0.73549 0.60923 0.70884 0.73293 0.60206 0.0541 0.24774 0.5848 0.10127 0.05498 0.30782 0.75246 0.15382 0.68243 0.64494 0.95699 0.48698 0.77359 0.04343 0 [...]
+0.22358 0.23977 0.11376 0.10295 0.25304 0.66881 0.8693 0.08228 0.9838 0.28529 0.89631 0.8991 0.2453 0.77612 0.85008 0.63149 0.72906 0.16428 0.95143 0.03121 0.72193 0.64073 0.04137 0.33021 0.5147 0.16419 0.03769 0.47648 0.56188 0.24542 0.00936 0.65855 0.93738 0.62424 0.57617 0.44962 0.52938 0.52183 0.73546 0.78047 0.83116 0.83516 0.68145 0.52479 0.24688 0.67861 0.89058 0.71885 0.8191 0.65299 0.17679 0.09638 0.91131 0.81664 0.30561 0.71677 0.01659 0.86326 0.46358 0.61134 0.81145 0.69309 0. [...]
+0.1052 0.60092 0.54465 0.43572 0.74639 0.28196 0.39644 0.23854 0.25198 0.81069 0.50386 0.16947 0.74016 0.59849 0.11362 0.93131 0.06905 0.30073 0.28924 0.98408 0.74555 0.71321 0.5004 0.92594 0.82584 0.93816 0.22721 0.89966 0.71882 0.44659 0.9099 0.59627 0.62636 0.2994 0.61338 0.59035 0.93658 0.25718 0.3206 0.79296 0.30667 0.84619 0.86596 0.86389 0.74352 0.06973 0.69264 0.05781 0.45152 0.17524 0.46274 0.51867 0.20559 0.83652 0.88552 0.87682 0.05015 0.28086 0.8704 0.17354 0.02183 0.98432 0. [...]
+0.16553 0.19546 0.73039 0.54599 0.56802 0.5229 0.59087 0.46415 0.07695 0.01437 0.76897 0.12085 0.38777 0.57199 0.10037 0.09822 0.52648 0.04719 0.39326 0.63756 0.50182 0.25078 0.49742 0.6977 0.03053 0.23202 0.87492 0.67385 0.16829 0.16858 0.40163 0.24261 0.50089 0.53358 0.77113 0.01276 0.51056 0.39691 0.32618 0.31806 0.10106 0.97524 0.12381 0.34975 0.20884 0.84144 0.07988 0.38945 0.15421 0.14938 0.18817 0.91158 0.65489 0.87384 0.45434 0.99011 0.11697 0.41131 0.6426 0.65123 0.49459 0.87973 [...]
+0.0045 0.51306 0.67306 0.35759 0.76132 0.39129 0.16146 0.13042 0.20707 0.07138 0.56344 0.52668 0.65632 0.63694 0.60525 0.53394 0.82606 0.92165 0.7897 0.30315 0.68589 0.72467 0.97833 0.47375 0.90688 0.54148 0.25134 0.36346 0.33818 0.94493 0.32708 0.95233 0.25859 0.99977 0.80434 0.04064 0.08321 0.17712 0.2899 0.90824 0.74947 0.90492 0.40343 0.27135 0.70962 0.70046 0.84264 0.23603 0.15529 0.28087 0.90862 0.19276 0.03541 0.58923 0.31297 0.35864 0.33493 0.29859 0.5067 0.32499 0.79247 0.05126  [...]
+0.22768 0.07024 0.7267 0.83723 0.14531 0.8457 0.38751 0.29486 0.33869 0.64233 0.08225 0.04749 0.24315 0.51131 0.49309 0.51592 0.61447 0.37116 0.48606 0.75537 0.94808 0.9037 0.16115 0.78246 0.76282 0.82717 0.10362 0.92695 0.77369 0.62683 0.1802 0.57578 0.84562 0.49542 0.12036 0.30265 0.0261 0.42024 0.55365 0.43022 0.60232 0.92329 0.81826 0.96437 0.36374 0.23975 0.70824 0.92194 0.50787 0.16347 0.99498 0.6825 0.6776 0.06924 0.04098 0.03806 0.19034 0.01447 0.74497 0.44962 0.8798 0.5617 0.292 [...]
+0.88249 0.7778 0.75778 0.85714 0.80583 0.79577 0.12491 0.29246 0.86364 0.90242 0.46676 0.74591 0.04024 0.94442 0.29509 0.67836 0.06008 0.17677 0.89347 0.15665 0.69002 0.68946 0.73748 0.45391 0.4768 0.24533 0.11229 0.0692 0.8655 0.72902 0.30837 0.36086 0.42153 0.18529 0.79755 0.12711 0.82331 0.35927 0.67721 0.35771 0.78924 0.30713 0.22027 0.53531 0.61044 0.6524 0.8921 0.19547 0.80633 0.72772 0.83865 0.56991 0.17225 0.29016 0.35463 0.70911 0.04097 0.99305 0.70392 0.44617 0.49704 0.63092 0. [...]
+0.29674 0.71601 0.49947 0.74166 0.39792 0.26396 0.67945 0.17335 0.12449 0.38886 0.68147 0.60687 0.63589 0.68215 0.35981 0.60384 0.29459 0.36482 0.24216 0.07765 0.9686 0.04066 0.17082 0.66233 0.05109 0.34764 0.42103 0.88326 0.97885 0.18603 0.61727 0.84457 0.58404 0.87484 0.61448 0.81812 0.38567 0.39592 0.04009 0.17208 0.14317 0.98316 0.65844 0.28211 0.42273 0.4058 0.16827 0.14417 0.76128 0.6969 0.30561 0.45728 0.49562 0.42843 0.14522 0.72648 0.89522 0.55115 0.60758 0.14579 0.91814 0.72525 [...]
+0.51915 0.16689 0.36141 0.37373 0.45717 0.80949 0.04399 0.2437 0.54526 0.89125 0.97886 0.53001 0.52938 0.99851 0.86461 0.65674 0.48049 0.98066 0.86792 0.55189 0.28758 0.09272 0.58169 0.46691 0.29765 0.73891 0.44288 0.80198 0.35152 0.78 0.28046 0.43644 0.20428 0.84553 0.48622 0.43702 0.70428 0.0176 0.03164 0.37868 0.07593 0.3604 0.18215 0.30726 0.8611 0.95863 0.5887 0.90253 0.06786 0.92097 0.81504 0.43613 0.52712 0.18842 0.43964 0.31311 0.91876 0.33131 0.20368 0.83731 0.61633 0.43852 0.26 [...]
+0.46517 0.97023 0.42637 0.74573 0.68798 0.64234 0.2286 0.23226 0.15183 0.47505 0.82754 0.87865 0.79085 0.80721 0.47456 0.72992 0.15373 0.6568 0.97112 0.89722 0.4584 0.61581 0.64584 0.78551 0.85033 0.26024 0.28976 0.53767 0.73557 0.18602 0.16109 0.57354 0.32639 0.41542 0.88796 0.48987 0.92002 0.55582 0.57162 0.20491 0.63204 0.70797 0.1234 0.15767 0.80676 0.82525 0.91089 0.33151 0.56587 0.93836 0.69336 0.51561 0.78336 0.50198 0.45098 0.13527 0.19458 0.69766 0.13137 0.27702 0.0286 0.90973 0 [...]
+0.4507 0.68583 0.58439 0.38476 0.58881 0.68581 0.91541 0.61412 0.35142 0.64889 0.28032 0.88585 0.65134 0.88256 0.16041 0.96343 0.29075 0.54717 0.00248 0.4105 0.55843 0.97272 0.14762 0.79173 0.65419 0.2371 0.72832 0.59428 0.40454 0.4235 0.98546 0.21413 0.72702 0.10285 0.12103 0.40626 0.51809 0.72325 0.60758 0.66086 0.26785 0.7476 0.33401 0.96628 0.4718 0.61324 0.51082 0.75013 0.81983 0.49014 0.68496 0.35936 0.36109 0.40208 0.37558 0.95773 0.87943 0.36625 0.33366 0.88611 0.81748 0.41953 0. [...]
+0.45892 0.59052 0.29049 0.98159 0.59569 0.27689 0.11494 0.10002 0.98849 0.0705 0.59172 0.80246 0.4956 0.12736 0.53221 0.38175 0.82918 0.04535 0.92191 0.65566 0.12435 0.36776 0.5221 0.37121 0.62186 0.15039 0.01005 0.93279 0.3166 0.58561 0.07638 0.64719 0.79838 0.03462 0.32989 0.34355 0.08285 0.24382 0.20275 0.97065 0.37123 0.7674 0.37978 0.84747 0.82365 0.69083 0.0824 0.31869 0.56855 0.99707 0.85894 0.18516 0.14457 0.02934 0.81402 0.18336 0.52563 0.05039 0.77904 0.69241 0.34112 0.55534 0. [...]
+0.94706 0.03496 0.51775 0.56066 0.15923 0.19642 0.89934 0.27335 0.9253 0.81424 0.4721 0.48164 0.91259 0.3376 0.66361 0.02778 0.1551 0.7841 0.73818 0.8208 0.80914 0.86765 0.54861 0.99793 0.3153 0.58829 0.89618 0.86483 0.84133 0.05744 0.35391 0.20535 0.60369 0.07227 0.63401 0.06039 0.85899 0.89055 0.32869 0.18562 0.27817 0.95085 0.17938 0.51768 0.51028 0.80741 0.89381 0.40203 0.903 0.21391 0.76297 0.22783 0.2788 0.81334 0.4584 0.01643 0.4043 0.23555 0.98296 0.62438 0.30359 0.28112 0.94438  [...]
+0.09203 0.47453 0.13032 0.57781 0.78569 0.00152 0.38692 0.07543 0.71459 0.19359 0.81702 0.8406 0.60138 0.26845 0.39635 0.90904 0.47094 0.48072 0.39299 0.03672 0.51342 0.14595 0.76646 0.39688 0.91546 0.61871 0.14381 0.85579 0.25451 0.72239 0.40209 0.52432 0.6086 0.33866 0.3027 0.47854 0.19619 0.61754 0.05694 0.33685 0.16367 0.75127 0.25779 0.51749 0.36315 0.1469 0.22445 0.50231 0.54966 0.67169 0.85381 0.59011 0.30579 0.10425 0.94583 0.11537 0.18751 0.87266 0.98575 0.72556 0.87449 0.0767 0 [...]
+0.09468 0.65861 0.90098 0.83184 0.90029 0.96745 0.84127 0.32357 0.47082 0.88387 0.50992 0.69803 0.7164 0.83448 0.46183 0.41631 0.81605 0.79701 0.28607 0.20145 0.30709 0.82213 0.96091 0.66321 0.67343 0.53956 0.86697 0.32686 0.04963 0.99579 0.93199 0.43125 0.88527 0.62104 0.92296 0.83484 0.43221 0.38611 0.31444 0.61699 0.38627 0.99544 0.56275 0.2823 0.06267 0.33547 0.91502 0.41099 0.32111 0.61925 0.42463 0.21511 0.55355 0.01908 0.54952 0.26156 0.64201 0.73448 0.90036 0.09536 0.43532 0.5173 [...]
+0.16345 0.40989 0.06644 0.81767 0.46849 0.01679 0.00987 0.07168 0.85225 0.86097 0.5586 0.23196 0.42653 0.77377 0.24202 0.07503 0.34262 0.40414 0.55051 0.36538 0.67969 0.71006 0.90895 0.05037 0.20878 0.22198 0.63521 0.40652 0.47711 0.98161 0.7644 0.52593 0.84945 0.38675 0.99914 0.3967 0.98092 0.92827 0.91013 0.34693 0.16511 0.19695 0.24233 0.96158 0.98003 0.81415 0.34009 0.42106 0.51922 0.42308 0.17103 0.01687 0.20789 0.73178 0.9492 0.37224 0.5397 0.71988 0.33523 0.64518 0.80428 0.59277 0 [...]
+0.54084 0.58575 0.10048 0.58301 0.53363 0.57956 0.19871 0.67913 0.69159 0.52088 0.89889 0.64956 0.77884 0.52771 0.00321 0.98464 0.51275 0.75363 0.18226 0.91322 0.50977 0.39838 0.18398 0.225 0.53032 0.7231 0.28985 0.4354 0.38814 0.12115 0.87373 0.29088 0.6036 0.18833 0.41236 0.31817 0.54863 0.69023 0.972 0.41569 0.14855 0.35063 0.19074 0.13155 0.2764 0.02319 0.89542 0.48319 0.94067 0.14914 0.66517 0.99932 0.61648 0.25026 0.19064 0.40997 0.14426 0.4784 0.04165 0.5976 0.17247 0.12801 0.8199 [...]
+0.15864 0.59954 0.37076 0.31216 0.57011 0.97165 0.83504 0.75719 0.85063 0.67292 0.04755 0.22269 0.93916 0.89588 0.58693 0.27723 0.75909 0.03176 0.69679 0.42427 0.89389 0.35714 0.25807 0.65994 0.30268 0.45932 0.65448 0.64843 0.16593 0.3283 0.96437 0.49296 0.65671 0.17698 0.56378 0.91645 0.11877 0.66674 0.28735 0.3605 0.71594 0.42114 0.14017 0.96057 0.25659 0.9549 0.89421 0.08638 0.97572 0.20693 0.38747 0.25622 0.86534 0.33376 0.4693 0.08226 0.69651 0.64603 0.79949 0.62847 0.87517 0.75859  [...]
+0.47674 0.21753 0.32174 0.34212 0.87891 0.85931 0.89534 0.90318 0.69276 0.63333 0.64606 0.30864 0.04556 0.01155 0.8348 0.43146 0.15608 0.73962 0.77417 0.6761 0.65375 0.51892 0.38613 0.68961 0.81148 0.62452 0.47804 0.49038 0.05049 0.48721 0.75606 0.64976 0.64663 0.19119 0.16878 0.42572 0.22954 0.38744 0.99274 0.94521 0.02169 0.9013 0.45869 0.34801 0.73331 0.9931 0.9731 0.15799 0.68781 0.70269 0.22551 0.39387 0.86971 0.9382 0.01006 0.84558 0.12407 0.08231 0.2149 0.15405 0.22635 0.82736 0.1 [...]
+0.27997 0.05453 0.91022 0.35544 0.85866 0.30488 0.48073 0.46637 0.68551 0.63868 0.33254 0.81242 0.08752 0.52722 0.11641 0.93272 0.49331 0.88714 0.57879 0.4033 0.57823 0.56791 0.54816 0.26683 0.37051 0.42688 0.48888 0.73934 0.78045 0.88574 0.05827 0.71706 0.39179 0.59936 0.07592 0.6751 0.54691 0.94445 0.54285 0.19672 0.83301 0.42661 0.54733 0.54718 0.80156 0.8118 0.80663 0.34853 0.55321 0.77716 0.18313 0.52412 0.18875 0.12904 0.01105 0.02436 0.86946 0.24773 0.8709 0.20366 0.92127 0.43154  [...]
+0.18359 0.50268 0.02658 0.5453 0.34878 0.75528 0.97659 0.15808 0.87947 0.0867 0.57023 0.51274 0.63424 0.90554 0.15239 0.96309 0.04303 0.8577 0.52919 0.61058 0.50484 0.0466 0.91085 0.47605 0.57234 0.49952 0.69081 0.34814 0.77741 0.58267 0.86724 0.44763 0.71463 0.18248 0.60917 0.63505 0.44268 0.25775 0.60971 0.10681 0.72091 0.76836 0.77749 0.93584 0.35663 0.15927 0.06781 0.67319 0.26085 0.28685 0.88213 0.1354 0.63138 0.3369 0.66994 0.4599 0.14272 0.64809 0.41811 0.70818 0.53736 0.51344 0.8 [...]
+0.45079 0.5905 0.52161 0.35497 0.82722 0.30005 0.53987 0.09612 0.73464 0.09281 0.27714 0.5036 0.27906 0.34926 0.56056 0.09546 0.21733 0.36337 0.30298 0.43604 0.55079 0.16038 0.2885 0.10668 0.52143 0.15066 0.06214 0.03883 0.51254 0.45188 0.3821 0.94998 0.47619 0.15117 0.91445 0.42172 0.42596 0.68172 0.98636 0.07647 0.61824 0.59337 0.86212 0.85714 0.96707 0.8802 0.16921 0.04408 0.4546 0.4539 0.45915 0.67599 0.17406 0.32869 0.83536 0.82217 0.1142 0.54593 0.15196 0.02572 0.64378 0.575 0.7041 [...]
+0.0976 0.52195 0.44738 0.75091 0.42458 0.2117 0.97651 0.15514 0.20159 0.7621 0.18667 0.33504 0.32129 0.23493 0.56058 0.29041 0.59722 0.92 0.97529 0.19896 0.15631 0.99179 0.30674 0.07016 0.90468 0.7385 0.05871 0.13307 0.99838 0.98843 0.58551 0.85377 0.72089 0.48876 0.93705 0.02232 0.95226 0.41456 0.62756 0.39018 0.00313 0.72235 0.23054 0.03071 0.79922 0.1705 0.20802 0.95125 0.65831 0.96481 0.90472 0.70077 0.31118 0.61763 0.07663 0.13596 0.20667 0.276 0.41845 0.41791 0.53411 0.96932 0.7084 [...]
+0.82943 0.61654 0.00929 0.03172 0.81662 0.91774 0.25195 0.35201 0.65299 0.01397 0.38651 0.01815 0.81502 0.78304 0.08082 0.27835 0.43576 0.98674 0.47834 0.48319 0.91557 0.55492 0.52815 0.51017 0.35332 0.5998 0.30593 0.69511 0.44814 0.84075 0.91743 0.88463 0.23864 0.43378 0.22096 0.87402 0.80373 0.41613 0.85973 0.65515 0.26394 0.52025 0.52028 0.80793 0.04508 0.76403 0.53466 0.78007 0.57649 0.33702 0.85332 0.95736 0.3604 0.88664 0.56518 0.19459 0.43416 0.46392 0.17644 0.02407 0.31059 0.7677 [...]
+0.74734 0.24948 0.50543 0.2712 0.87022 0.48307 0.84829 0.3316 0.22218 0.94824 0.59906 0.10648 0.25351 0.11866 0.96303 0.34852 0.88355 0.32681 0.92119 0.42395 0.30968 0.296 0.16688 0.02492 0.10379 0.43226 0.14834 0.09714 0.68211 0.22239 0.29315 0.6604 0.86321 0.66445 0.02983 0.11012 0.85801 0.59041 0.22218 0.83135 0.18659 0.48047 0.79801 0.94926 0.42216 0.52899 0.48836 0.64374 0.22563 0.23274 0.3489 0.86945 0.50758 0.31815 0.24989 0.60716 0.04021 0.97934 0.6784 0.0509 0.36786 0.32071 0.40 [...]
+0.48886 0.34494 0.01897 0.87607 0.90488 0.51566 0.69468 0.87456 0.82481 0.53663 0.23167 0.74206 0.16481 0.30062 0.42418 0.35362 0.5933 0.30676 0.06712 0.74885 0.62784 0.7642 0.01492 0.84586 0.51242 0.1072 0.55707 0.32331 0.24094 0.15043 0.1957 0.12277 0.26442 0.83163 0.04522 0.4405 0.13211 0.12655 0.85392 0.47179 0.54676 0.39555 0.78111 0.21369 0.05353 0.19213 0.93501 0.82597 0.75824 0.04151 0.17463 0.24543 0.09403 0.46147 0.38492 0.5779 0.51106 0.21316 0.78788 0.67083 0.90306 0.64664 0. [...]
+0.03693 0.45503 0.65762 0.58194 0.17735 0.47098 0.90118 0.75277 0.82102 0.8888 0.49088 0.16719 0.53815 0.572 0.587 0.17464 0.9837 0.74065 0.37985 0.53405 0.4065 0.84726 0.91077 0.54811 0.0356 0.92279 0.67084 0.86123 0.72878 0.13634 0.47331 0.69397 0.91815 0.54166 0.54864 0.66657 0.73131 0.34195 0.44267 0.03278 0.5619 0.65826 0.69818 0.44793 0.4229 0.09529 0.05767 0.96534 0.42245 0.03562 0.08692 0.72217 0.62439 0.58274 0.95215 0.28546 0.40918 0.66523 0.88887 0.10252 0.34302 0.11514 0.7929 [...]
+0.23714 0.93306 0.6036 0.80685 0.9852 0.9263 0.35271 0.23167 0.80613 0.56445 0.73903 0.45434 0.3601 0.60132 0.50854 0.37573 0.64613 0.10101 0.16028 0.62901 0.54598 0.16895 0.06234 0.62157 0.80653 0.69366 0.75081 0.23642 0.73809 0.66499 0.01819 0.57532 0.0121 0.49914 0.66182 0.59254 0.9639 0.91086 0.81506 0.75464 0.5206 0.13526 0.03163 0.30355 0.42839 0.77484 0.89113 0.36205 0.66787 0.88706 0.46656 0.82556 0.92129 0.15246 0.7568 0.98546 0.83336 0.29903 0.96717 0.50504 0.31932 0.15932 0.00 [...]
+0.95601 0.88907 0.4202 0.78599 0.45854 0.79095 0.87796 0.45865 0.18186 0.33251 0.69669 0.53363 0.8825 0.17155 0.57354 0.45641 0.14028 0.69495 0.78896 0.83086 0.63068 0.30828 0.76447 0.32171 0.83754 0.66404 0.49057 0.97646 0.74694 0.23959 0.32126 0.41464 0.64438 0.73822 0.17012 0.42289 0.16206 0.59616 0.79214 0.78947 0.66969 0.64663 0.33535 0.01087 0.26584 0.15561 0.78192 0.09887 0.26433 0.90106 0.93128 0.79942 0.1359 0.53771 0.63609 0.93774 0.94328 0.31761 0.29786 0.01531 0.30776 0.09602 [...]
+0.5996 0.95549 0.11235 0.33453 0.46369 0.18302 0.93369 0.93913 0.19758 0.18939 0.50118 0.19202 0.50751 0.15239 0.63165 0.8042 0.95241 0.19076 0.00897 0.31302 0.66721 0.14108 0.78361 0.87773 0.92874 0.12578 0.049 0.001 0.83571 0.22942 0.20357 0.26702 0.61692 0.09408 0.72442 0.65874 0.04609 0.06373 0.74216 0.85334 0.84351 0.4077 0.88789 0.11243 0.18663 0.9914 0.06755 0.24785 0.34897 0.32014 0.77413 0.29861 0.61626 0.35861 0.5885 0.9643 0.50174 0.13563 0.99059 0.25307 0.70806 0.9186 0.50175 [...]
+0.08041 0.88758 0.99427 0.72891 0.86435 0.69533 0.05553 0.1693 0.94285 0.38651 0.10985 0.37552 0.24916 0.26748 0.83584 0.80061 0.80059 0.57263 0.69187 0.66994 0.1274 0.84901 0.1989 0.92745 0.64987 0.73884 0.74984 0.80286 0.45842 0.41248 0.43447 0.80926 0.37664 0.13328 0.62507 0.59132 0.24727 0.3497 0.75395 0.59173 0.225 0.55108 0.14271 0.09526 0.34445 0.85412 0.17956 0.68483 0.02991 0.95564 0.85012 0.19529 0.39799 0.21596 0.51565 0.21307 0.66145 0.45336 0.82435 0.61774 0.44406 0.35349 0. [...]
+0.25386 0.3482 0.87586 0.09698 0.06069 0.22579 0.26932 0.28157 0.88229 0.87272 0.80644 0.22958 0.13755 0.77852 0.11759 0.78316 0.88826 0.10319 0.97583 0.07021 0.35279 0.22 0.06355 0.72732 0.65066 0.7233 0.37794 0.13351 0.15765 0.1513 0.31476 0.81323 0.92836 0.34449 0.6383 0.41544 0.30776 0.61216 0.30976 0.81952 0.34191 0.74748 0.63511 0.24787 0.70975 0.68827 0.20053 0.78835 0.22559 0.0999 0.53436 0.92684 0.13846 0.97751 0.41788 0.76823 0.10259 0.81824 0.78626 0.56688 0.361 0.66537 0.0333 [...]
+0.10423 0.26 0.66432 0.26582 0.63586 0.709 0.72656 0.58712 0.97229 0.71299 0.88862 0.09131 0.5421 0.62205 0.71282 0.92936 0.96687 0.41646 0.90513 0.33716 0.63373 0.55154 0.19679 0.49151 0.15147 0.53415 0.411 0.93097 0.47687 0.09271 0.04823 0.56594 0.98016 0.43511 0.25733 0.15598 0.17106 0.92075 0.86023 0.19118 0.52944 0.29804 0.13245 0.05216 0.76435 0.72057 0.70339 0.31001 0.76543 0.75627 0.90198 0.30093 0.08361 0.91536 0.14459 0.45386 0.07324 0.40578 0.7386 0.06761 0.17075 0.9555 0.6586 [...]
+0.46378 0.0181 0.15181 0.53913 0.43454 0.18054 0.20838 0.91252 0.29344 0.62864 0.07775 0.08922 0.95407 0.40249 0.80217 0.48551 0.02089 0.15682 0.14957 0.52848 0.1289 0.65229 0.2056 0.66399 0.95914 0.19234 0.47336 0.71077 0.63165 0.92247 0.38334 0.57328 0.50742 0.17287 0.73746 0.46111 0.24491 0.88285 0.83433 0.7319 0.10431 0.90546 0.28022 0.15855 0.29092 0.32036 0.04972 0.25502 0.42783 0.85292 0.17553 0.1239 0.61231 0.79987 0.03707 0.97397 0.63681 0.4123 0.22645 0.99277 0.67269 0.74801 0. [...]
+0.18698 0.91639 0.33454 0.63751 0.93655 0.9149 0.68759 0.06706 0.09776 0.10634 0.7897 0.9024 0.62333 0.9824 0.24225 0.62921 0.38518 0.74201 0.0842 0.41615 0.12134 0.6086 0.46272 0.0779 0.11369 0.3713 0.08643 0.92269 0.57572 0.8495 0.97175 0.04008 0.81597 0.39469 0.84003 0.88578 0.3629 0.93181 0.63878 0.33726 0.70411 0.56116 0.14545 0.55985 0.29514 0.33021 0.22191 0.90704 0.128 0.09805 0.61184 0.18309 0.26464 0.11487 0.07893 0.30679 0.43559 0.43254 0.96921 0.7944 0.02352 0.34277 0.79486 0 [...]
+0.57375 0.99162 0.91163 0.97403 0.1789 0.43153 0.82212 0.36594 0.0934 0.6565 0.01159 0.74817 0.99021 0.71059 0.93953 0.16051 0.33555 0.68073 0.69389 0.29471 0.76311 0.55084 0.75538 0.39995 0.39662 0.19788 0.3224 0.33298 0.70774 0.26756 0.73169 0.70102 0.3645 0.36476 0.9276 0.83225 0.28851 0.31373 0.42721 0.24561 0.14726 0.43567 0.85109 0.80831 0.13313 0.42078 0.75127 0.53413 0.64812 0.01388 0.95649 0.70715 0.77098 0.15605 0.93556 0.78041 0.20073 0.98812 0.39441 0.99784 0.6245 0.161 0.339 [...]
+0.1162 0.30716 0.41845 0.79633 0.07978 0.04999 0.24827 0.81061 0.43042 0.84798 0.49307 0.47949 0.39211 0.48878 0.35154 0.27406 0.40352 0.41581 0.79031 0.62049 0.49917 0.28056 0.85151 0.94364 0.83245 0.354 0.0769 0.14248 0.65061 0.55295 0.81811 0.74574 0.52692 0.1617 0.62831 0.98632 0.96048 0.17829 0.86985 0.16505 0.66244 0.78782 0.2641 0.33334 0.59973 0.25684 0.85295 0.98573 0.49971 0.52804 0.05068 0.54844 0.67926 0.22235 0.9989 0.45462 0.59212 0.99468 0.51973 0.95506 0.6498 0.97097 0.74 [...]
+0.98354 0.32057 0.7736 0.66767 0.67101 0.18692 0.7614 0.20345 0.44527 0.67924 0.61683 0.74745 0.7799 0.57276 0.68916 0.37305 0.31579 0.58139 0.42697 0.58418 0.28146 0.85074 0.18709 0.16971 0.31462 0.49438 0.50589 0.96649 0.71867 0.02349 0.37925 0.20303 0.53231 0.52872 0.11476 0.54924 0.39746 0.69031 0.07112 0.46831 0.71924 0.82632 0.11435 0.95083 0.91501 0.97814 0.43073 0.13847 0.24151 0.22002 0.3128 0.76541 0.67823 0.1735 0.53682 0.40683 0.56786 0.01856 0.94995 0.55855 0.29135 0.75662 0 [...]
+0.50082 0.62987 0.84801 0.57781 0.71312 0.35044 0.56692 0.68885 0.75107 0.80655 0.05291 0.93855 0.05515 0.39965 0.44746 0.93024 0.1013 0.34249 0.03845 0.11437 0.99266 0.61639 0.11187 0.19415 0.3777 0.13787 0.37875 0.64852 0.82441 0.71438 0.7687 0.47697 0.64561 0.14054 0.83869 0.94586 0.85995 0.40459 0.95884 0.82419 0.41483 0.9333 0.9183 0.94702 0.83421 0.3927 0.573 0.56926 0.52516 0.50491 0.43412 0.30484 0.228 0.65949 0.03172 0.45929 0.57759 0.84624 0.50314 0.91494 0.46399 0.04669 0.4950 [...]
+0.1599 0.87599 0.67166 0.16745 0.05562 0.57047 0.48053 0.47665 0.09907 0.59457 0.8394 0.05821 0.49663 0.85425 0.64489 0.24347 0.78297 0.04542 0.84714 0.93095 0.7024 0.6619 0.61477 0.13819 0.51687 0.37364 0.18459 0.50083 0.52977 0.11056 0.09494 0.12674 0.3037 0.62416 0.79548 0.23748 0.68563 0.04698 0.14886 0.42715 0.58626 0.19659 0.35156 0.61098 0.11448 0.20502 0.43292 0.14353 0.10451 0.94117 0.60684 0.55499 0.37173 0.29152 0.21806 0.13001 0.50258 0.11568 0.2092 0.92666 0.83808 0.14532 0. [...]
+0.0279 0.01488 0.77917 0.60113 0.59213 0.77784 0.30971 0.00539 0.16538 0.35743 0.15447 0.97948 0.79454 0.07042 0.41932 0.1567 0.77596 0.93044 0.12155 0.19943 0.1933 0.08683 0.18306 0.3824 0.35629 0.01385 0.04499 0.39503 0.36724 0.44091 0.02415 0.77353 0.43611 0.78629 0.40148 0.64145 0.65468 0.05425 0.44929 0.99696 0.18206 0.11565 0.00848 0.96094 0.17676 0.50361 0.98321 0.55717 0.59449 0.8181 0.27346 0.94899 0.04021 0.54036 0.56027 0.15387 0.71857 0.65264 0.74197 0.86321 0.07522 0.16404 0 [...]
+0.62733 0.5036 0.50703 0.20329 0.72474 0.68831 0.53168 0.77273 0.95393 0.02281 0.9787 0.63772 4e-05 0.25443 0.82422 0.3392 0.05459 0.4134 0.84928 0.00403 0.57816 0.67182 0.1169 0.90157 0.31729 0.99285 0.13355 0.81683 0.37517 0.73248 0.93211 0.0136 0.25552 0.52155 0.57685 0.13161 0.81577 0.95304 0.14516 0.41205 0.89756 0.91361 0.31462 0.24994 0.9049 0.0998 0.93757 0.43884 0.80411 0.76623 0.2573 0.9316 0.37924 0.81428 0.34586 0.18598 0.82412 0.10425 0.93298 0.26273 0.15976 0.76488 0.16509  [...]
+0.84353 0.80229 0.54576 0.43613 0.57452 0.63158 0.11789 0.54356 0.0829 0.73303 0.8421 0.56823 0.6862 0.30959 0.45821 0.70941 0.77268 0.49894 0.58116 0.31246 0.06925 0.33747 0.41168 0.08904 0.476 0.9878 0.14338 0.45678 0.85556 0.04083 0.34883 0.81125 0.48215 0.93973 0.3858 0.49254 0.83227 0.25805 0.36328 0.24748 0.94645 0.63563 0.11863 0.85744 0.44047 0.26158 0.17616 0.41367 0.71171 0.63649 0.12937 0.86303 0.61017 0.22047 0.39442 0.66586 0.97826 0.43764 0.38319 0.61495 0.0639 0.39758 0.64 [...]
+0.26777 0.97896 0.29383 0.64532 0.36991 0.35914 0.51393 0.47527 0.01343 0.29 0.45095 0.68027 0.90725 0.21488 0.54355 0.71004 0.8991 0.55338 0.46425 0.85487 0.0182 0.6322 0.28768 0.3703 0.09515 0.81863 0.1871 0.60364 0.06697 0.83867 0.92548 0.34206 0.50556 0.94573 0.132 0.67291 0.31641 0.84083 0.92172 0.67016 0.94803 0.05175 0.76357 0.54314 0.85168 0.02309 0.53274 0.05094 0.34833 0.9453 0.67775 0.39865 0.98726 0.65917 0.01359 0.79271 0.54775 0.84723 0.31247 0.08231 0.19021 0.20853 0.20206 [...]
+0.19356 0.54406 0.72494 0.82245 0.66526 0.55025 0.22174 0.40893 0.35324 0.7766 0.27341 0.74475 0.66896 0.37819 0.54616 0.67283 0.22471 0.20248 0.4365 0.79098 0.56145 0.85371 0.39803 0.56962 0.60638 0.74321 0.7793 0.05224 0.29478 0.85696 0.75348 0.79322 0.10476 0.12997 0.31689 0.83569 0.89068 0.19761 0.47391 0.19789 0.24329 0.90273 0.54793 0.90017 0.57359 0.32237 0.16162 0.99201 0.50778 0.69077 0.13749 0.55541 0.68508 0.17845 0.94052 0.96995 0.85518 0.27326 0.71514 0.69143 0.75583 0.03041 [...]
+0.3422 0.92069 0.65093 0.34999 0.55094 0.73309 0.33544 0.97722 0.4496 0.08623 0.56583 0.64073 0.17775 0.17561 0.38228 0.3728 0.12349 0.85742 0.27039 0.258 0.02506 0.59064 0.6991 0.11891 0.80746 0.59033 0.81495 0.51592 0.63953 0.94888 0.62308 0.83263 0.53261 0.49911 0.06169 0.15381 0.71637 0.81207 0.91289 0.80021 0.28903 0.14617 0.60131 0.94461 0.2431 0.56721 0.77868 0.32163 0.09048 0.23679 0.05568 0.37523 0.42103 0.81708 0.23285 0.35255 0.98614 0.7844 0.90932 0.80765 0.66802 0.62163 0.92 [...]
+0.40369 0.29816 0.3274 0.48031 0.24022 0.11062 0.66112 0.21554 0.78788 0.11329 0.63978 0.0639 0.05294 0.74973 0.5135 0.58549 0.81144 0.11793 0.23997 0.58805 0.47066 0.24221 0.13578 0.05635 0.93313 0.66725 0.92561 0.85738 0.95736 0.94867 0.95959 0.54057 0.6066 0.38412 0.21024 0.3482 0.28658 0.12899 0.16466 0.10999 0.47571 0.39138 0.94841 0.95124 0.05374 0.55842 0.65192 0.47753 0.58905 0.46335 0.65385 0.89935 0.63216 0.00897 0.46638 0.8419 0.52873 0.15343 0.16945 0.67751 0.57571 0.00786 0. [...]
+0.43393 0.97323 0.06584 0.9639 0.56367 0.29825 0.17146 0.7068 0.80424 0.91012 0.27901 0.50235 0.84774 0.56914 0.58066 0.39995 0.48228 0.52695 0.19607 0.50819 0.0594 0.67328 0.08294 0.84618 0.77619 0.05475 0.0604 0.19748 0.65297 0.83906 0.26795 0.04335 0.07136 0.40231 0.48997 0.96451 0.94933 0.79684 0.81835 0.00671 0.57176 0.91797 0.70796 0.50492 0.34012 0.29255 0.13479 0.45126 0.16967 0.11184 0.33807 0.53666 0.76344 0.60259 0.56736 0.11592 0.05329 0.21371 0.7936 0.37547 0.38548 0.56256 0 [...]
+0.42455 0.32487 0.17629 0.06627 0.73018 0.45682 0.61016 0.40229 0.77961 0.36739 0.60084 0.11517 0.33359 0.26916 0.02447 0.03003 0.30706 0.7179 0.13741 0.94548 0.53282 0.75512 0.3109 0.07008 0.51685 0.7749 0.54838 0.95751 0.33207 0.55644 0.54313 0.73975 0.6809 0.18149 0.14584 0.60102 0.72544 0.1342 0.45713 0.96136 0.43102 0.87062 0.01471 0.30165 0.04936 0.78839 0.12945 0.72726 0.0403 0.17369 0.66681 0.48002 0.79725 0.17515 0.36655 0.59762 0.12095 0.89126 0.67533 0.98026 0.16819 0.37407 0. [...]
+0.8306 0.53205 0.46877 0.61123 0.83034 0.15748 0.45356 0.16914 0.29302 0.15322 0.67078 0.92351 0.25167 0.23079 0.0437 0.44736 0.31994 0.24479 0.2591 0.15045 0.80613 0.20594 0.3103 0.38514 0.95191 0.4223 0.10213 0.73437 0.28803 0.87669 0.26995 0.78904 0.29532 0.92261 0.65189 0.02085 0.97302 0.20582 0.71531 0.88015 0.4718 0.38899 0.94366 0.78267 0.87684 0.94948 0.1621 0.33575 0.47227 0.90955 0.00437 0.36921 0.56653 0.94468 0.46184 0.0718 0.83222 0.99976 0.43445 0.35521 0.50805 0.64185 0.08 [...]
+0.94102 0.78645 0.47633 0.14165 0.14774 0.46229 0.72138 0.34803 0.68205 0.96031 0.95776 0.52581 0.34754 0.77682 0.00677 0.90649 0.08154 0.0061 0.25533 0.63311 0.24773 0.63313 0.68137 0.72668 0.47281 0.12466 0.46624 0.59038 0.61274 0.71996 0.60221 0.49729 0.79299 0.84559 0.74594 0.8762 0.63823 0.31945 0.51236 0.42764 0.39065 0.74118 0.68986 0.55032 0.73887 0.36266 0.16685 0.93991 0.07105 0.69978 0.32529 0.77541 0.82714 0.22656 0.85832 0.80988 0.02922 0.37995 0.76523 0.06834 0.72229 0.2043 [...]
+0.20634 0.36128 0.19802 0.92012 0.68112 0.89453 0.14445 0.37699 0.1332 0.0916 0.29607 0.17722 0.43081 0.05076 0.60435 0.32843 0.99384 0.78457 0.76932 0.58128 0.44075 0.31156 0.49977 0.55968 0.16866 0.0269 0.74428 0.18126 0.5622 0.79777 0.8563 0.20083 0.01191 0.40749 0.11229 0.4576 0.82516 0.13468 0.47664 0.29535 0.05431 0.12057 0.30019 0.4488 0.28844 0.27234 0.80333 0.13115 0.65694 0.96419 0.62439 0.10412 0.24348 0.98711 0.95935 0.61104 0.64366 0.54638 0.1339 0.189 0.73131 0.47757 0.5349 [...]
+0.25115 0.91421 0.56525 0.20241 0.5019 0.00743 0.39533 0.64333 0.59365 0.81756 0.5573 0.0066 0.53947 0.32996 0.51226 0.86397 0.30389 0.29995 0.42876 0.11369 0.46768 0.77714 0.74936 0.46871 0.50299 0.12334 0.87721 0.94769 0.79186 0.89732 0.48055 0.55103 0.61074 0.59854 0.84378 0.53025 0.61101 0.62321 0.78076 0.32394 0.43525 0.31967 0.46268 0.67688 0.73901 0.38868 0.40812 0.11341 0.17214 0.17619 0.79408 0.14498 0.07517 0.80056 0.20564 0.79621 0.51888 0.63954 0.81391 0.9698 0.0804 0.98347 0 [...]
+0.01937 0.56968 0.66725 0.61301 0.36014 0.69489 0.26009 0.62243 0.48398 0.23108 0.73422 0.13261 0.93826 0.99468 0.53506 0.41142 0.62265 0.00741 0.58344 0.41678 0.96456 0.70491 0.69378 0.2774 0.33647 0.45915 0.9613 0.51202 0.07013 0.68234 0.93312 0.2783 0.83365 0.13502 0.16979 0.0866 0.35905 0.11282 0.95827 0.84335 0.47337 0.09972 0.65805 0.59078 0.97987 0.20704 0.99853 0.76232 0.93945 0.50584 0.00118 0.33668 0.60444 0.33463 0.25018 0.05733 0.56097 0.35397 0.12523 0.01721 0.69775 0.77492  [...]
+0.19049 0.40346 0.91931 0.73401 0.17068 0.94918 0.78014 0.39304 0.06131 0.89237 0.1419 0.93213 0.64685 0.59057 0.83081 0.91166 0.22372 0.74258 0.9867 0.13419 0.89592 0.25141 0.16954 0.71825 0.96267 0.27436 0.23524 0.03589 0.9745 0.73588 0.26544 0.79826 0.2264 0.71968 0.07026 0.04683 0.25766 0.51638 0.74103 0.85925 0.93987 0.41758 0.11919 0.494 0.72336 0.29384 0.21089 0.68763 0.4651 0.81151 0.69665 0.25112 0.38399 0.00864 0.68128 0.65514 0.90463 0.86023 0.12145 0.35909 0.69699 0.45141 0.9 [...]
+0.25448 0.97895 0.63806 0.09011 0.14263 0.23404 0.83499 0.99801 0.12361 0.68377 0.0162 0.21908 0.63232 0.02744 0.00712 0.05476 0.17632 0.67864 0.25902 0.01024 0.23747 0.0987 0.16496 0.33094 0.16116 0.53036 0.49899 0.77727 0.96545 0.28482 0.84474 0.98527 0.33735 0.08165 0.38704 0.87318 0.64576 0.19683 0.56803 0.65278 0.32803 0.51531 0.50792 0.90966 0.82846 0.54969 0.77079 0.38445 0.33657 0.52244 0.00031 0.61479 0.50953 0.52995 0.36253 0.16305 0.03217 0.69109 0.34731 0.33035 0.63675 0.5533 [...]
+0.68914 0.0978 0.05349 0.16943 0.43228 0.01313 0.0802 0.44769 0.76804 0.32451 0.25544 0.55992 0.91984 0.84498 0.62681 0.86583 0.66821 0.64923 0.19396 0.49851 0.44066 0.48376 0.17719 0.66758 0.54888 0.949 0.93883 0.71442 0.63411 0.52781 0.09446 0.02949 0.17058 0.67625 0.53158 0.59522 0.97303 0.50408 0.46568 0.79075 0.4058 0.20139 0.22066 0.97746 0.65594 0.21945 0.01793 0.75538 0.63466 0.52073 0.55747 0.43975 0.75511 0.33543 0.73814 0.50309 0.47524 0.36876 0.30356 0.56552 0.33046 0.60319 0 [...]
+0.94145 0.16089 0.18347 0.03984 0.90649 0.94892 0.74181 0.01259 0.94394 0.14908 0.44629 0.40354 0.39968 0.94962 0.03891 0.27022 0.39774 0.55465 0.19506 0.44666 0.48041 0.6541 0.3541 0.98568 0.37527 0.06432 0.1889 0.30668 0.56813 0.97263 0.79641 0.13502 0.34213 0.21208 0.5496 0.68316 0.49566 0.88178 0.40909 0.54034 0.97797 0.3633 0.2883 0.87673 0.34699 0.0302 0.3333 0.12276 0.58871 0.88686 0.88664 0.35564 0.51006 0.35195 0.83294 0.50161 0.61486 0.63722 0.47488 0.83935 0.75357 0.60633 0.20 [...]
+0.26967 0.44287 0.72891 0.50785 0.17111 0.35266 0.19418 0.45603 0.47788 0.6251 0.61705 0.03184 0.18653 0.90489 0.53382 0.8396 0.3864 0.18219 0.23084 0.67586 0.27367 0.74585 0.35231 0.64235 0.61289 0.50348 0.16822 0.62171 0.05654 0.42766 0.41542 0.33565 0.91675 0.12837 0.40781 0.26264 0.73252 0.32574 0.92019 0.70366 0.44787 0.30305 0.69748 0.61749 0.00778 0.65773 0.21405 0.40337 0.33314 0.86165 0.89409 0.53623 0.28795 0.35919 0.52208 0.20353 0.45704 0.74887 0.39347 0.47272 0.67002 0.18535 [...]
+0.76071 0.46108 0.26578 0.73396 0.09388 0.07461 0.57196 0.41606 0.3802 0.88402 0.00178 0.11233 0.33367 0.48985 0.67889 0.46063 0.65437 0.75459 0.73976 0.50221 0.6757 0.11348 0.52507 0.64435 0.70742 0.76311 0.10911 0.68777 0.07587 0.79189 0.97672 0.62161 0.65005 0.86186 0.70969 0.33955 0.93881 0.86711 0.25443 0.21406 0.70687 0.97825 0.40799 0.4094 0.0628 0.55375 0.11896 0.2324 0.38193 0.88928 0.71445 0.90148 0.10055 0.28874 0.09912 0.90172 0.96049 0.6277 0.32856 0.3463 0.72803 0.41651 0.1 [...]
+0.72028 0.84336 0.73937 0.49645 0.43692 0.43341 0.34585 0.69974 0.98186 0.74814 0.60226 0.17187 0.16468 0.10104 0.44617 0.56127 0.68899 0.9254 0.04098 0.13085 0.06334 0.99809 0.93024 0.69588 0.79592 0.49515 0.53778 0.18552 0.02607 0.11137 0.21937 0.4425 0.75536 0.35089 0.68755 0.33561 0.42347 0.7718 0.8759 0.46056 0.75232 0.13808 0.19426 0.54515 0.00378 0.85935 0.33814 0.34404 0.11028 0.19191 0.15782 0.52496 0.11928 0.81256 0.00218 0.29492 0.83638 0.43451 0.62178 0.69549 0.1589 0.36567 0 [...]
+0.38996 0.77056 0.51045 0.27851 0.66578 0.80038 0.32658 0.10179 0.34859 0.90395 0.29353 0.66185 0.49291 0.63872 0.1349 0.58349 0.25039 0.35759 0.15046 0.42124 0.12063 0.21669 0.41499 0.18988 0.69987 0.71228 0.4119 0.05225 0.09678 0.62589 0.89164 0.63411 0.46616 0.17228 0.25663 0.28957 0.85754 0.11516 0.48829 0.76716 0.14981 0.55449 0.18569 0.47912 0.33426 0.14829 0.00945 0.4248 0.3956 0.59829 0.31632 0.76806 0.36083 0.00422 0.14759 0.4869 0.18583 0.40662 0.01926 0.85314 0.20459 0.53022 0 [...]
+0.92374 0.13038 0.54986 0.83224 0.34093 0.16621 0.34569 0.28512 0.70068 0.47112 0.86348 0.44602 0.2877 0.26581 0.6703 0.9695 0.40476 0.47814 0.94206 0.50146 0.5955 0.05634 0.88016 0.17624 0.46921 0.96921 0.21315 0.67652 0.91442 0.62382 0.32505 0.85414 0.49568 0.62577 0.69821 0.97193 0.40208 0.86055 0.38338 0.60042 0.50163 0.86107 0.7915 0.68923 0.62779 0.28318 0.99663 0.73514 0.91008 0.7965 0.25341 0.99756 0.6457 0.63425 0.93337 0.20019 0.35541 0.22593 0.6703 0.56606 0.34897 0.65869 0.68 [...]
+0.2079 0.03198 0.78808 0.22363 0.63446 0.17212 0.14358 0.52658 0.75724 0.11874 0.76294 0.58275 0.12191 0.80385 0.48676 0.08431 0.93654 0.58629 0.97699 0.24014 0.7824 0.59721 0.92638 0.72004 0.07474 0.53019 0.38822 0.58138 0.2822 0.08431 0.6425 0.90247 0.99229 0.35835 0.90715 0.12498 0.31539 0.30146 0.58483 0.88042 0.72557 0.71774 0.98676 0.45808 0.89738 0.19993 0.41692 0.79474 0.72387 0.20953 0.21513 0.00791 0.50704 0.93335 0.17878 0.14903 0.73103 0.69428 0.31856 0.38634 0.56807 0.17105  [...]
+0.84316 0.65421 0.28228 0.03402 0.2561 0.08683 0.32218 0.86468 0.98195 0.56187 0.45151 0.50359 0.09007 0.02247 0.62783 0.62972 0.01318 0.66113 0.41894 0.67421 0.1077 0.47963 0.83707 0.47319 0.31819 0.30847 0.28944 0.6454 0.56511 0.47948 0.77516 0.52963 0.69139 0.56409 0.7993 0.1752 0.34382 0.9772 0.33381 0.90452 0.10916 0.56624 0.19813 0.08478 0.50265 0.67185 0.43537 0.9113 0.97794 0.21195 0.84599 0.23256 0.86815 0.20843 0.6918 0.91135 0.69847 0.83591 0.25591 0.8628 0.90128 0.46253 0.847 [...]
+0.18602 0.17343 0.88253 0.983 0.8737 0.07686 0.40135 0.82413 0.69078 0.36534 0.39853 0.20032 0.63013 0.013 0.24466 0.11236 0.08119 0.50455 0.93225 0.00539 0.02572 0.08382 0.18208 0.86567 0.38914 0.18892 0.91424 0.01201 0.8275 0.49431 0.98015 0.05415 0.29122 0.96128 0.27095 0.92516 0.18653 0.96345 0.73301 0.8475 0.62362 0.85067 0.00168 0.29659 0.9277 0.98732 0.53142 0.06917 0.07099 0.70832 0.43252 0.0131 0.68306 0.32462 0.4647 0.15555 0.96712 0.21654 0.74254 0.35656 0.31241 0.21306 0.2822 [...]
+0.59039 0.09007 0.81193 0.83398 0.68147 0.646 0.51 0.24014 0.12706 0.0052 0.80751 0.03706 0.60717 0.36681 0.30517 0.98879 0.54789 0.37786 0.61304 0.92782 0.92582 0.78479 0.6125 0.19548 0.91361 0.98499 0.2487 0.70485 0.65245 0.41789 0.25916 0.59692 0.74834 0.95187 0.45957 0.07642 0.34942 0.53783 0.6556 0.29867 0.42258 0.27806 0.84384 0.3947 0.11192 0.71749 0.06862 0.44622 0.11366 0.8539 0.42417 0.77911 0.65086 0.36384 0.82559 0.35081 0.25161 0.31923 0.60736 0.93754 0.31636 0.6899 0.13873  [...]
+0.5001 0.04978 0.81857 0.85119 0.91906 0.22013 0.30682 0.41592 0.97675 0.38325 0.28087 0.18297 0.1247 0.3306 0.86047 0.41484 0.75387 0.25275 0.08414 0.19327 0.05633 0.2765 0.76649 0.9807 0.24224 0.72412 0.06225 0.4963 0.29162 0.06826 0.49111 0.1342 0.59887 0.37925 0.5989 0.5493 0.82179 0.72256 0.47562 0.03513 0.42125 0.75061 0.85009 0.902 0.77285 0.85359 0.24107 0.95082 0.72066 0.0507 0.56746 0.46244 0.6646 0.90621 0.58965 0.31274 0.58811 0.78197 0.97989 0.47018 0.07639 0.36305 0.31979 0 [...]
+0.90205 0.15246 0.3841 0.44915 0.01727 0.63711 0.51489 0.66022 0.00267 0.17103 0.09592 0.62949 0.33689 0.55134 0.10554 0.97742 0.79524 0.43547 0.90701 0.19352 0.5293 0.04319 0.83054 0.75022 0.08042 0.21222 0.45833 0.2574 0.18501 0.03742 0.24389 0.61538 0.13621 0.54351 0.88604 0.22383 0.57946 0.50337 0.7943 0.01821 0.02038 0.31469 0.27205 0.63234 0.47165 0.66129 0.81632 0.49918 0.83531 0.19351 0.63887 0.66964 0.10322 0.37373 0.85301 0.37753 0.5101 0.64516 0.33757 0.09971 0.08504 0.25738 0 [...]
+0.46147 0.90019 0.89829 0.21301 0.66433 0.69308 0.29506 0.90463 0.32327 0.12404 0.7493 0.08697 0.93536 0.64976 0.52642 0.75512 0.38295 0.8059 0.36249 0.17594 0.76235 0.65925 0.88328 0.26087 0.4056 0.1807 0.38331 0.22099 0.01406 0.09144 0.15951 0.19962 0.00971 0.81071 0.70786 0.85814 0.61631 0.89726 0.70277 0.72766 0.89644 0.96938 0.69072 0.76913 0.3495 0.47024 0.38034 0.48462 0.27235 0.87905 0.48958 0.26658 0.01334 0.07992 0.89755 0.0443 0.51743 0.65882 0.52736 0.45666 0.73908 0.64344 0. [...]
+0.28665 0.4055 0.65671 0.24335 0.50992 0.39055 0.6938 0.77311 0.42017 0.12871 0.3552 0.21456 0.55244 0.77082 0.22306 0.2198 0.11179 0.96955 0.21846 0.54505 0.6478 0.37927 0.35559 0.3482 0.74309 0.51072 0.05652 0.98491 0.15685 0.91398 0.99838 0.30116 0.53212 0.48575 0.8365 0.49048 0.46624 0.86683 0.90346 0.21193 0.75232 0.5022 0.44996 0.40622 0.9742 0.27817 0.00564 0.22482 0.51072 0.43358 0.28988 0.9628 0.82633 0.8921 0.39953 0.14574 0.59406 0.64874 0.62809 0.7169 0.45604 0.57331 0.65591  [...]
+0.35066 0.7949 0.03589 0.71245 0.23868 0.34894 0.45996 0.67614 0.69622 0.54169 0.07146 0.02569 0.49741 0.8002 0.5746 0.71279 0.01227 0.96734 0.96251 0.33551 0.81918 0.4242 0.51707 0.24623 0.64022 0.454 0.41147 0.42366 0.98432 0.02474 0.02545 0.71204 0.98771 0.88234 0.13252 0.76088 0.48344 0.63959 0.36712 0.03237 0.49466 0.30905 0.80991 0.10092 0.1204 0.0166 0.4966 0.36213 0.38536 0.38477 0.20201 0.59602 0.57254 0.76782 0.25904 0.99011 0.37034 0.85107 0.07174 0.81216 0.50474 0.19072 0.393 [...]
+0.72584 0.69966 0.3242 0.01801 0.4261 0.54745 0.39442 0.35588 0.33891 0.60247 0.69457 0.54297 0.51754 0.5801 0.99805 0.31567 0.34316 0.30175 0.81568 0.67151 0.82221 0.74923 0.33665 0.73942 0.95035 0.84349 0.93137 0.29653 0.57389 0.02951 0.91103 0.38071 0.68881 0.94603 0.05307 0.46377 0.71414 0.12531 0.54232 0.30823 0.64813 0.56313 0.72348 0.29156 0.10583 0.25643 0.2963 0.19161 0.12097 0.32023 0.35775 0.20945 0.5087 0.69828 0.27869 0.54041 0.47093 0.96368 0.74833 0.61911 0.78788 0.5277 0. [...]
+0.06107 0.16803 0.24727 0.2276 0.48553 0.82856 0.00383 0.45091 0.65014 0.39534 0.56595 0.85599 0.69949 0.44001 0.17442 0.85125 0.11093 0.44985 0.22026 0.26576 0.67127 0.38803 0.52611 0.0627 0.05538 0.90445 0.07814 0.94207 0.80334 0.39538 0.24417 0.15306 0.81563 0.01941 0.6318 0.06585 0.93153 0.60096 0.30195 0.70149 0.97453 0.47397 0.14091 0.21398 0.29041 0.44837 0.49558 0.2013 0.60824 0.37771 0.05819 0.19385 0.60793 0.05408 0.21656 0.8297 0.68616 0.26041 0.5793 0.9037 0.27865 0.24367 0.8 [...]
+0.03312 0.0264 0.83226 0.49948 0.01998 0.96784 0.05417 0.70468 0.63904 0.11463 0.73924 0.31224 0.57213 0.71848 0.82291 0.42329 0.53566 0.54593 0.67239 0.88161 0.28054 0.88083 0.29965 0.50195 0.07484 0.32556 0.07376 0.99274 0.52245 0.25613 0.13224 0.90973 0.63337 0.16084 0.78561 0.54719 0.56509 0.45209 0.70657 0.38407 0.82359 0.32974 0.32571 0.90984 0.12026 0.18204 0.68038 0.433 0.3661 0.85473 0.08012 0.97818 0.71125 0.94146 0.28309 0.97188 0.57706 0.17068 0.69216 0.87927 0.28089 0.33521  [...]
+0.73417 0.39826 0.03175 0.09349 0.11873 0.30401 0.78277 0.15778 0.12552 0.12214 0.97792 0.60094 0.98365 0.82463 0.82361 0.58757 0.14739 0.08593 0.69388 0.20119 0.06918 0.08506 0.69272 0.53107 0.35577 0.5842 0.09479 0.44518 0.31959 0.98081 0.50231 0.85462 0.89896 0.95393 0.97176 0.20524 0.07651 0.97368 0.3143 0.53958 0.3437 0.19807 0.78952 0.49444 0.4347 0.79682 0.50745 0.35406 0.88874 0.88935 0.08835 0.26975 0.12806 0.70931 0.08323 0.89063 0.79201 0.51179 0.99587 0.97699 0.02153 0.07567  [...]
+0.90487 0.59082 0.03111 0.87591 0.56277 0.37313 0.33139 0.04888 0.66404 0.29725 0.05832 0.8829 0.83442 0.44629 0.70387 0.39999 0.33562 0.64968 0.5162 0.93345 0.30279 0.51045 0.1239 0.17768 0.06482 0.07835 0.74527 0.40566 0.08614 0.67583 0.6193 0.97813 0.50016 0.19045 0.70736 0.79765 0.49326 0.86033 0.16488 0.7192 0.85155 0.91487 0.71946 0.89215 0.59459 0.18536 0.17489 0.7216 0.35094 0.94233 0.72099 0.72197 0.02512 0.3702 0.73637 0.52999 0.25526 0.89804 0.89736 0.01579 0.24019 0.39539 0.9 [...]
+0.12902 0.43728 0.19581 0.64022 0.7377 0.69354 0.06351 0.64342 0.4427 0.03534 0.72646 0.24903 0.71837 0.68745 0.2299 0.07448 0.46993 0.86281 0.32398 0.37243 0.95031 0.00808 0.16982 0.5469 0.9922 0.06578 0.65136 0.632 0.15171 0.57693 0.96266 0.24596 0.16788 0.79911 0.37441 0.12984 0.35965 0.57829 0.82206 0.3367 0.92164 0.41466 0.97047 0.15843 0.04048 0.55297 0.02851 0.78155 0.682 0.45938 0.37384 0.33118 0.3038 0.01086 0.06406 0.9277 0.84958 0.70599 0.06356 0.87131 0.44903 0.14298 0.14342  [...]
+0.58632 0.06926 0.41777 0.79482 0.49701 0.83501 0.42458 0.44858 0.50084 0.77895 0.13607 0.67522 0.15364 0.11476 0.44097 0.5125 0.3063 0.68632 0.44242 0.83094 0.6662 0.68616 0.05119 0.93137 0.4216 0.53676 0.20143 0.75385 0.51404 0.43194 0.24719 0.75906 0.42575 0.36804 0.46876 0.57093 0.47489 0.71347 0.43344 0.43424 0.11202 0.03558 0.05985 0.75105 0.32001 0.53936 0.81986 0.78184 0.09168 0.05597 0.25794 0.47038 0.86622 0.11165 0.02326 0.51359 0.61191 0.17732 0.28613 0.60422 0.8967 0.90141 0 [...]
+0.94057 0.53228 0.42033 0.62314 0.36191 0.80724 0.68778 0.04763 0.21956 0.62111 0.43953 0.37087 0.72223 0.76457 0.47752 0.57814 0.51324 0.55723 0.98376 0.90927 0.36124 0.94671 0.97059 0.68683 0.73264 0.56246 0.16552 0.46526 0.13821 0.29123 0.22154 0.47169 0.73659 0.15198 0.51433 0.1834 0.78059 0.70447 0.05651 0.50604 0.35053 0.77066 0.69758 0.34688 0.21408 0.02671 0.12898 0.63519 0.07709 0.17208 0.16294 0.53946 0.34875 0.36652 0.58673 0.00302 0.20815 0.90267 0.48906 0.37003 0.43675 0.640 [...]
+0.36843 0.12127 0.17699 0.40684 0.78487 0.70031 0.20519 0.45566 0.16106 0.42047 0.06506 0.0955 0.86368 0.59971 0.63019 0.31493 0.60959 0.47504 0.56605 0.24721 0.15522 0.26104 0.68146 0.3978 0.75513 0.20865 0.71325 0.767 0.1495 0.70497 0.89142 0.62073 0.52187 0.8254 0.25801 0.01822 0.18009 0.48784 0.1006 0.56236 0.09068 0.14777 0.91587 0.83031 0.58921 0.93863 0.08233 0.83851 0.60373 0.00998 0.36605 0.70422 0.1813 0.54116 0.52394 0.83551 0.7058 0.62736 0.21874 0.39373 0.56458 0.59472 0.297 [...]
+0.2843 0.40837 0.76486 0.08329 0.74066 0.4959 0.1212 0.69529 0.28273 0.70262 0.20195 0.24773 0.246 0.49033 0.6198 0.04634 0.6669 0.34345 0.49769 0.73563 0.82835 0.10564 0.21134 0.71575 0.41633 0.28577 0.36158 0.62839 0.89679 0.64299 0.03998 0.78805 0.60391 0.04157 0.56066 0.06236 0.42955 0.21311 0.41903 0.25534 0.21392 0.8911 0.85574 0.17846 0.37335 0.2611 0.78696 0.62317 0.14818 0.62335 0.55211 0.04545 0.0493 0.53727 0.80682 0.49339 0.6828 0.39253 0.84341 0.32964 0.69602 0.09359 0.60011 [...]
+0.45397 0.49076 0.0999 0.88386 0.61241 0.79626 0.63275 0.67193 0.96149 0.14686 0.25249 0.10062 0.41022 0.56736 0.98213 0.5179 0.50509 0.78962 0.23491 0.01213 0.68499 0.20168 0.33771 0.07161 0.40337 0.85314 0.2491 0.23611 0.54148 0.57219 0.68627 0.98595 0.76227 0.09792 0.44216 0.70922 0.81021 0.07142 0.1018 0.89424 0.37653 0.04652 0.54518 0.48108 0.50994 0.05533 0.51699 0.05808 0.53459 0.74408 0.78684 0.20523 0.93421 0.88013 0.05591 0.41112 0.03059 0.44079 0.95323 0.82122 0.19694 0.52912  [...]
+0.17904 0.36087 0.71314 0.24693 0.80411 0.88045 0.05462 0.66412 0.90162 0.29787 0.28844 0.94303 0.51516 0.69839 0.69153 0.25652 0.81667 0.58501 0.17996 0.96864 0.75711 0.13421 0.44315 0.91124 0.33304 0.65066 0.09148 0.54885 0.55371 0.7975 0.479 0.15847 0.22011 0.91142 0.51074 0.76211 0.17584 0.09081 0.83569 0.68421 0.21792 0.79432 0.0734 0.8087 0.4394 0.12463 0.82851 0.67567 0.34046 0.37845 0.13088 0.26287 0.51054 0.08674 0.28925 0.19359 0.77291 0.84143 0.77598 0.71081 0.91019 0.60994 0. [...]
+0.95719 0.358 0.98418 0.77258 0.20518 0.95933 0.89383 0.02582 0.18402 0.56859 0.54845 0.3638 0.47206 0.28365 0.6425 0.72455 0.18152 0.08861 0.62433 0.22857 0.49684 0.58157 0.09102 0.43288 0.53925 0.15144 0.36212 0.04464 0.40217 0.55696 0.71806 0.08519 0.51127 0.43129 0.83336 0.62101 0.76113 0.16022 0.63449 0.46421 0.08039 0.12071 0.26121 0.89451 0.90284 0.62088 0.15546 0.23783 0.23027 0.62417 0.93906 0.04694 0.62572 0.61234 0.31166 0.9768 0.95616 0.23238 0.45601 0.26629 0.0636 0.03909 0. [...]
+0.15585 0.63058 0.2437 0.95028 0.24507 0.194 0.2098 0.08014 0.68995 0.66128 0.43716 0.06664 0.28441 0.98689 0.475 0.66772 0.53965 0.25388 0.37293 0.12487 0.70924 0.60374 0.49805 0.61229 0.59352 0.7687 0.03844 0.3599 0.59854 0.53975 0.04958 0.24275 0.6879 0.75235 0.99078 0.72003 0.77304 0.82354 0.71703 0.65609 0.28272 0.07607 0.9454 0.15622 0.9736 0.14583 0.62888 0.29879 0.98313 0.95285 0.58594 0.77351 0.38621 0.57908 0.4451 0.05916 0.23937 0.2844 0.69241 0.40203 0.82657 0.55541 0.9734 0. [...]
+0.38315 0.67913 0.53914 0.75014 0.82357 0.57374 0.25815 0.01871 0.83953 0.74125 0.40142 0.089 0.62238 0.07511 0.61044 0.7089 0.48091 0.47965 0.93978 0.1267 0.86303 0.55437 0.06892 0.45537 0.4987 0.72896 0.16522 0.81743 0.16503 0.51176 0.59277 0.25869 0.20874 0.44015 0.60098 0.79446 0.7481 0.07299 0.61049 0.32151 0.14958 0.49138 0.91952 0.8375 0.29223 0.85348 0.14712 0.19264 0.40671 0.24496 0.01532 0.06842 0.7036 0.05059 0.553 0.31215 0.31948 0.97836 0.66111 0.3072 0.11056 0.26904 0.11772 [...]
+0.10453 0.16936 0.6936 0.68113 0.64791 0.71902 0.26869 0.80673 0.65385 0.59678 0.2568 0.89671 0.17586 0.5505 0.23408 0.24502 0.74827 0.94029 0.26458 0.81144 0.611 0.01644 0.0578 0.54279 0.90027 0.82892 0.38684 0.39992 0.70661 0.3138 0.69001 0.93719 0.79924 0.11908 0.05121 0.55262 0.14629 0.95452 0.27415 0.72931 0.18899 0.85679 0.3229 0.97563 0.79056 0.35894 0.27452 0.70007 0.38242 0.18441 0.29596 0.15777 0.88187 0.00625 0.18828 0.89871 0.46763 0.41631 0.89147 0.5804 0.31452 0.03103 0.135 [...]
+0.61175 0.86501 0.48598 0.42838 0.14474 0.4736 0.99892 0.73413 0.88832 0.78581 0.25146 0.33807 0.75172 0.6175 0.80462 0.38918 0.50536 0.90954 0.66902 0.47212 0.14368 0.47331 0.93175 0.64728 0.34137 0.61458 0.19138 0.85963 0.69361 0.72994 0.67521 0.71763 0.79434 0.42475 0.0602 0.33499 0.42264 0.87368 0.56283 0.13427 0.87558 0.51577 0.41746 0.7041 0.63164 0.79025 0.17847 0.10828 0.17394 0.12158 0.37824 0.9107 0.7692 0.86929 0.71366 0.96037 0.87202 0.91023 0.47279 0.60473 0.24547 0.84401 0. [...]
+0.93592 0.15858 0.26534 0.41442 0.54971 0.00119 0.89159 0.03904 0.24515 0.51606 0.16071 0.31425 0.95 0.94439 0.60578 0.76031 0.03283 0.79178 0.92057 0.96238 0.60587 0.34799 0.36558 0.73451 0.81203 0.31783 0.77624 0.56894 0.35684 0.79109 0.45955 0.19489 0.55047 0.77078 0.53057 0.39242 0.44105 0.3571 0.31239 0.66592 0.50078 0.74206 0.50455 0.97683 0.50073 0.1888 0.74692 0.57589 0.96205 0.68228 0.79659 0.66138 0.44367 0.32806 0.05746 0.96099 0.72444 0.11558 0.52784 0.93979 0.13339 0.71192 0 [...]
+0.99132 0.28857 0.50883 0.95708 0.79805 0.93288 0.64921 0.40808 0.51897 0.62361 0.80749 0.27658 0.3591 0.19251 0.23239 0.50761 0.84202 0.07044 0.57867 0.68063 0.91578 0.0254 0.49573 0.52086 0.953 0.72797 0.2987 0.95299 0.82184 0.99706 0.07547 0.12797 0.7786 0.89121 0.43095 0.06345 0.06217 0.87521 0.76788 0.95841 0.97182 0.95332 0.06978 0.00112 0.69496 0.94845 0.1164 7e-05 0.84103 0.16827 0.46233 0.35664 0.60019 0.43399 0.29321 0.2948 0.27007 0.78337 0.81959 0.26593 0.47664 0.974 0.49077  [...]
+0.19892 0.47128 0.41074 0.27699 0.0279 0.15146 0.43881 0.04942 0.28329 0.1243 0.91952 0.82364 0.23051 0.25667 0.17457 0.1549 0.3573 0.07762 0.57986 0.63342 0.51958 0.25878 0.82654 0.63573 0.49961 0.82332 0.15628 0.05973 0.79881 0.26268 0.71298 0.60541 0.90935 0.07039 0.33651 0.38115 0.04057 0.42666 0.98465 0.8641 0.78686 0.67239 0.82394 0.41038 0.70719 0.25158 0.95808 0.29328 0.69602 0.38789 0.69122 0.40063 0.77094 0.00647 0.11202 0.52971 0.42806 0.91299 0.18471 0.27972 0.61049 0.88302 0 [...]
+0.96365 0.59295 0.13785 0.92679 0.52776 0.60405 0.55583 0.48455 0.24616 0.98968 0.32548 0.97839 0.22549 0.93765 0.64652 0.31591 0.05018 0.68218 0.16978 0.84535 0.07908 0.63992 0.06376 0.87061 0.1491 0.7192 0.51673 0.78009 0.88532 0.10434 0.02906 0.33317 0.68344 0.81593 0.16333 0.20591 0.39078 0.49848 0.45524 0.88229 0.88933 0.44179 0.45202 0.64615 0.172 0.57434 0.20559 0.59339 0.41074 0.34598 0.77177 0.28927 0.34401 0.04546 0.22302 0.77151 0.62865 0.53688 0.29669 0.54351 0.19328 0.30103  [...]
+0.15605 0.33059 0.2952 0.03131 0.90631 0.59375 0.45584 0.89373 0.22331 0.8013 0.16015 0.83232 0.19668 0.28329 0.21615 0.05359 0.70551 0.914 0.73311 0.21486 0.55847 0.12799 0.76541 0.19301 0.79709 0.75244 0.91785 0.38888 0.69272 0.97702 0.01406 0.64845 0.37435 0.4476 0.64346 0.8653 0.86816 0.27887 0.45812 0.87069 0.44377 0.26965 0.37538 0.23436 0.5833 0.9476 0.95674 0.38868 0.74619 0.98471 0.41664 0.13309 0.34911 0.59714 0.99993 0.89194 0.82337 0.05813 0.07259 0.89985 0.00904 0.05837 0.57 [...]
+0.78618 0.05922 0.60327 0.07238 0.18271 0.22377 0.0302 0.97764 0.54542 0.0567 0.00423 0.99248 0.69253 0.43803 0.16287 0.34804 0.43257 0.41735 0.48005 0.93662 0.95464 0.69554 0.95352 0.77395 0.27579 0.13738 0.01602 0.29549 0.18538 0.49143 0.31615 0.80482 0.02447 0.70765 0.96001 0.43905 0.1184 0.65174 0.00411 0.01865 0.69525 0.74072 0.28242 0.81421 0.04014 0.60323 0.25705 0.44904 0.71151 0.03979 0.34695 0.52093 0.78062 0.49118 0.97225 0.69703 0.07519 0.48998 0.45027 0.85571 0.81341 0.80536 [...]
+0.21492 0.77816 0.31662 0.7001 0.38564 0.83381 0.78936 0.56127 0.81604 0.29683 0.08515 0.11832 0.19043 0.15425 0.93959 0.37166 0.58309 0.86891 0.39123 0.88063 0.78626 0.05761 0.13686 0.2255 0.99064 0.78343 0.89754 0.32553 0.62729 0.87923 0.08743 0.10556 0.39603 0.62981 0.35919 0.41423 0.33384 0.50715 0.54929 0.32509 0.53858 0.06034 0.23392 0.60603 0.8825 0.0727 0.83052 0.23362 0.69215 0.02767 0.14476 0.90165 0.2006 0.84156 0.94822 0.32762 0.87561 0.20232 0.6527 0.42429 0.54448 0.87181 0. [...]
+0.05755 0.05158 0.28781 0.97247 0.94529 0.58644 0.96638 0.82759 0.79562 0.83983 0.16188 0.19035 0.29341 0.00623 0.76541 0.97542 0.09204 0.78836 0.72424 0.03187 0.69285 0.39226 0.85785 0.34545 0.66212 0.53376 0.2633 0.4872 0.59549 0.32605 0.66526 0.66947 0.31338 0.62742 0.07125 0.76915 0.44561 0.45058 0.67245 0.02445 0.29582 0.08944 0.38649 0.55034 0.43986 0.63072 0.93365 0.88883 0.73791 0.69888 0.592 0.49023 0.00567 0.88334 0.81308 0.46122 0.22801 0.22518 0.56083 0.06479 0.90177 0.62421  [...]
+0.56908 0.13792 0.05579 0.88723 0.94869 0.82631 0.87119 0.22519 0.76889 0.17568 0.14499 0.97993 0.57217 0.5057 0.28254 0.24224 0.77219 0.14779 0.15071 0.03082 0.50164 0.60325 0.3598 0.12924 0.41958 0.09291 0.28629 0.20283 0.3323 0.69639 0.89766 0.70177 0.66495 0.31904 0.85183 0.81031 0.31864 0.44825 0.43552 0.06692 0.55976 0.98951 0.69284 0.31277 0.13067 0.85144 0.28057 0.76112 0.77577 0.29891 0.94194 0.88341 0.33107 0.7809 0.79025 0.40195 0.20889 0.21713 0.46287 0.39376 0.76591 0.56971  [...]
+0.35514 0.15789 0.4624 0.99973 0.90114 0.96622 0.7656 0.16459 0.65864 0.64983 0.22364 0.25776 0.50121 0.84149 0.0946 0.55418 0.67452 0.79835 0.49699 0.07141 0.02011 0.72414 0.90325 0.27929 0.51128 0.92162 0.77314 0.07605 0.4844 0.53747 0.93762 0.36318 0.39919 0.53343 0.05983 0.80506 0.78927 0.05394 0.18971 0.43046 0.95602 0.41747 0.35156 0.47518 0.32918 0.35754 0.45449 0.32014 0.46043 0.38419 0.26509 0.37923 0.18465 0.62154 0.49043 0.15329 0.78034 0.84993 0.97643 0.95436 0.52456 0.2065 0 [...]
+0.76785 0.84438 0.46644 0.80128 0.97945 0.18883 0.41038 0.74641 0.49415 0.42648 0.8914 0.43959 0.06252 0.4821 0.0842 0.54645 0.61349 0.30458 0.68263 0.3788 0.9361 0.36738 0.24657 0.22946 0.7954 0.78447 0.0307 0.78993 0.62407 0.6476 0.33325 0.33661 0.67516 0.39269 0.29604 0.5797 0.54576 0.32693 0.72204 0.27855 0.30633 0.53883 0.63219 0.22929 0.35457 0.75232 0.55461 0.04305 0.48572 0.13507 0.62072 0.60862 0.02666 0.3886 0.79629 0.80754 0.64919 0.70888 0.16142 0.55797 0.11068 0.03712 0.1328 [...]
+0.7931 0.0113 0.28867 0.68624 0.34093 0.47239 0.79517 0.13766 0.42535 0.44237 0.90171 0.16581 0.79127 0.07121 0.19719 0.65421 0.91696 0.78142 0.22472 0.69252 0.61824 0.49725 0.91467 0.65863 0.86494 0.81193 0.87214 0.06405 0.28852 0.37564 0.71978 0.92338 0.05347 0.83296 0.64814 0.38662 0.47835 0.88664 0.45867 0.97592 0.71065 0.67422 0.70063 0.73584 0.10487 0.16823 0.01178 0.38864 0.04968 0.49722 0.56308 0.38931 0.50567 0.38097 0.49767 0.1066 0.4873 0.79383 0.60319 0.79837 0.7475 0.54044 0 [...]
+0.09268 0.31919 0.42928 0.44219 0.73255 0.55284 0.1062 0.93415 0.36004 0.24475 0.95664 0.42866 0.69947 0.45321 0.11199 0.78433 0.84959 0.61889 0.49592 0.59737 0.91133 0.77208 0.40227 0.79819 0.38937 0.4082 0.00474 0.08929 0.93322 0.96709 0.87714 0.5977 0.69181 0.76255 0.34534 0.30405 0.41497 0.49497 0.43606 0.01772 0.5083 0.04737 0.5794 0.31008 0.57052 0.15619 0.4859 0.92596 0.89637 0.11328 0.19944 0.88982 0.41366 0.02019 0.08474 0.16215 0.45951 0.47637 0.73622 0.60622 0.08711 0.45768 0. [...]
+0.09323 0.29735 0.85722 0.45118 0.03066 0.86565 0.38515 0.61231 0.17492 0.55409 0.42249 0.03379 0.56863 0.67669 0.30028 0.00305 0.37606 0.51346 0.77799 0.80431 0.85295 0.02088 0.19342 0.43113 0.29747 0.46652 0.58839 0.29192 0.43744 0.06641 0.68295 0.86953 0.87252 0.82653 0.94176 0.07751 0.81063 0.05684 0.0984 0.94711 0.72861 0.52029 0.09387 0.18282 0.31223 0.41297 0.73243 0.65147 0.11757 0.75468 0.01789 0.02148 0.10842 0.65987 0.58405 0.89476 0.72615 0.84119 0.43928 0.66996 0.98531 0.877 [...]
+0.13577 0.82877 0.66884 0.41529 0.70022 0.14213 0.936 0.55704 0.62384 0.04993 0.73984 0.76383 0.14446 0.58342 0.12371 0.20185 0.25518 0.2977 0.62856 0.17836 0.67172 0.80279 0.09996 0.96336 0.19837 0.79116 0.88477 0.56964 0.20576 0.87224 0.95455 0.08671 0.64934 0.53059 0.93969 0.75721 0.37714 0.1237 0.51351 0.5965 0.17869 0.20574 0.66184 0.7024 0.26248 0.71139 0.6391 0.70074 0.34518 0.18284 0.86267 0.82032 0.49309 0.00329 0.54701 0.06492 0.27987 0.1908 0.7356 0.75697 0.07072 0.391 0.6117  [...]
+0.06328 0.17907 0.92421 0.77022 0.97831 0.4978 0.56476 0.81411 0.50846 0.79491 0.11549 0.16393 0.38609 0.30385 0.60633 0.80037 0.06512 0.32739 0.37055 0.38943 0.81228 0.93456 0.05665 0.47058 0.08854 0.59696 0.46842 0.57536 0.06697 0.99202 0.80502 0.60832 0.74569 0.60027 0.53237 0.54675 0.28416 0.26296 0.42385 0.54832 0.11639 0.13345 0.88778 0.73595 0.90957 0.87497 0.45221 0.02443 0.7077 0.26379 0.36899 0.08662 0.15444 0.05168 0.63076 0.48713 0.91817 0.08901 0.48826 0.64289 0.16164 0.0115 [...]
+0.54667 0.70862 0.73112 0.03192 0.74035 0.35288 0.8606 0.60302 0.4827 0.62773 0.12536 0.69409 0.45735 0.68896 0.33408 0.32933 0.79059 0.32048 0.90634 0.22993 0.86686 0.56194 0.9888 0.83078 0.78344 0.33838 0.86793 0.01241 0.652 0.54363 0.18059 0.78637 0.92978 0.10433 0.1006 0.32531 0.53249 0.08109 0.32918 0.91317 0.0083 0.31608 0.86624 0.46222 0.1172 0.56254 0.01381 0.6455 0.03996 0.50357 0.50193 0.83416 0.30642 0.50455 0.89991 0.24135 0.77284 0.02939 0.45126 0.64096 0.96915 0.78752 0.905 [...]
+0.56624 0.88296 0.65427 0.06573 0.02221 0.61188 0.47416 0.06248 0.41678 0.67898 0.10891 0.19598 0.59889 0.78995 0.31314 0.2486 0.67789 0.41775 0.87693 0.8733 0.17123 0.63409 0.50497 0.38472 0.70433 0.36002 0.7475 0.39538 0.21416 0.35272 0.00615 0.15448 0.02873 0.17071 0.8409 0.42687 0.79142 0.60589 0.59399 0.95049 0.47277 0.82972 0.43206 0.73933 0.08315 0.14422 0.75294 0.8304 0.4411 0.68064 0.30764 0.03117 0.94497 0.39675 0.54991 0.80556 0.15789 0.90695 0.62284 0.74362 0.02095 0.67264 0. [...]
+0.87591 0.10221 0.87641 0.15417 0.95623 0.13047 0.32699 0.38325 0.9162 0.57591 0.26667 0.40631 0.83021 0.00705 0.14534 0.56261 0.54102 0.62229 0.73093 0.76145 0.05255 0.77405 0.69569 0.57561 0.26951 0.37624 0.09329 0.18745 0.82443 0.64994 0.81437 0.82362 0.7342 0.59995 0.51524 0.7377 0.74152 0.00836 0.79669 0.95908 0.00064 0.38828 0.74577 0.90941 0.90628 0.95555 0.75778 0.32253 0.10069 0.69929 0.37675 0.75691 0.76128 0.84967 0.40001 0.56214 0.44347 0.54918 0.33956 0.15816 0.22199 0.91919 [...]
+0.96464 0.72912 0.35928 0.69708 0.66209 0.92396 0.43057 0.19178 0.73714 0.91516 0.38882 0.46957 0.7817 0.26207 0.47419 0.35742 0.02144 0.75817 0.30793 0.93728 0.34778 0.41326 0.40349 0.33462 0.00302 0.34221 0.86616 0.64906 0.31307 0.12805 0.1749 0.35508 0.11878 0.47882 0.49061 0.69218 0.45301 0.72238 0.49343 0.43766 0.17753 0.79688 0.72585 0.99735 0.52797 0.53469 0.77621 0.55427 0.70715 0.4072 0.27128 0.27988 0.78116 0.89211 0.38241 0.53168 0.35124 0.2441 0.13216 0.78034 0.42202 0.33817  [...]
+0.41139 0.4785 0.65786 0.95512 0.5993 0.58179 0.30772 0.51159 0.35304 0.36763 0.71723 0.9908 0.13203 0.20583 0.84909 0.32813 0.62726 0.86047 0.92392 0.95985 0.55294 0.39991 0.20526 0.26683 0.16362 0.01025 0.38161 0.51963 0.27556 0.35003 0.46998 0.91252 0.57273 0.69327 0.93602 0.08529 0.68853 0.02519 0.55565 0.3609 0.53105 0.65669 0.11591 0.40094 0.92638 0.18619 0.32836 0.31506 0.40087 0.88048 0.64461 0.66603 0.69697 0.47578 0.15967 0.22914 0.68985 0.76864 0.94703 0.78698 0.12148 0.05546  [...]
+0.29193 0.60689 0.90145 0.68031 0.43876 0.99202 0.96691 0.1833 0.68196 0.78531 0.50369 0.17676 0.39333 0.61685 0.56494 0.23622 0.18308 0.07451 0.91955 0.86429 0.68847 0.54264 0.01146 0.96522 0.42729 0.1489 0.00546 0.99994 0.05988 0.16411 0.30669 0.76936 0.82527 0.02264 0.14161 0.68591 0.052 0.91534 0.61795 0.83218 0.14649 0.00765 0.10517 0.61409 0.01127 0.41555 0.61415 0.08033 0.28693 0.82666 0.88788 0.87011 0.33727 0.96277 0.60739 0.66007 0.13274 0.83486 0.53986 0.84066 0.36539 0.14561  [...]
+0.68603 0.81436 0.90528 0.43055 0.11038 0.84393 0.31372 0.8562 0.96427 0.69094 0.77111 0.55872 0.87116 0.51066 0.21579 0.76656 0.9586 0.27963 0.07612 0.66348 0.82824 0.37681 0.31077 0.32685 0.15623 0.39749 0.35648 0.12988 0.11257 0.01888 0.1972 0.28811 0.95801 0.37532 0.21696 0.82924 0.37942 0.13994 0.69752 0.62855 0.56332 0.79731 0.76579 0.10661 0.22517 0.35167 0.42852 0.71684 0.168 0.46537 0.90948 0.37383 0.23533 0.72089 0.73886 0.09658 0.44054 0.19211 0.93647 0.50967 0.26757 0.11863 0 [...]
+0.89481 0.59708 0.53973 0.58128 0.81239 0.4971 0.35872 0.4916 0.63958 0.98453 0.49199 0.97221 0.99315 0.70113 0.11789 0.43067 0.61636 0.96101 0.02664 0.42807 0.45345 0.61804 0.53158 0.78652 0.853 0.74844 0.38966 0.64957 0.89281 0.15111 0.40694 0.68779 0.09929 0.14731 0.19333 0.59411 0.24623 0.6393 0.716 0.25118 0.74867 0.58521 0.2904 0.28735 0.88926 0.32206 0.30146 0.25402 0.37116 0.4348 0.13928 0.94811 0.93135 0.18296 0.77947 0.53323 0.27344 0.25644 0.18829 0.66733 0.0158 0.36395 0.4845 [...]
+0.8308 0.4277 0.68869 0.59172 0.81373 0.19625 0.42915 0.71397 0.61694 0.0154 0.18321 0.66984 0.91611 0.52253 0.29941 0.13489 0.42063 0.22128 0.11478 0.48648 0.70335 0.36484 0.05926 0.26563 0.80533 0.24431 0.79167 0.25978 0.12021 0.79224 0.91467 0.26139 0.497 0.41634 0.47905 0.25649 0.77457 0.2104 0.48206 0.61831 0.69753 0.04774 0.75271 0.61345 0.63677 0.86223 0.50687 0.25533 0.20883 0.84699 0.40031 0.85117 0.00919 0.98858 0.00371 0.05344 0.81966 0.9072 0.08939 0.42128 0.80769 0.64566 0.7 [...]
+0.80259 0.69261 0.2108 0.90149 0.59249 0.9511 0.0372 0.16 0.94137 0.98006 0.38599 0.43318 0.59307 0.96182 0.62225 0.14551 0.76636 0.5736 0.17423 0.1422 0.34613 0.56949 0.80615 0.44378 0.08712 0.83316 0.99415 0.34673 0.44566 0.08634 0.28552 0.01888 0.96323 0.95586 0.68286 0.81413 0.7992 0.82376 0.46221 0.28424 0.38572 0.62305 0.64885 0.80408 0.13135 0.89479 0.72806 0.30385 0.65317 0.75912 0.56233 0.32255 0.51467 0.6914 0.35184 0.03237 0.65719 0.74188 0.83229 0.5835 0.961 0.16824 0.58981 0 [...]
+0.14152 0.33625 0.72862 0.66658 0.38809 0.57978 0.52665 0.53384 0.1815 0.27611 0.00549 0.32571 0.11163 0.89573 0.99541 0.97796 0.41827 0.17911 0.43996 0.43123 0.75479 0.295 0.63024 0.69462 0.32192 0.59824 0.97582 0.48912 0.32801 0.81579 0.06655 0.06173 0.10221 0.99406 0.70768 0.62306 0.60658 0.3253 0.46971 0.7475 0.8917 0.69694 0.32362 0.25131 0.39719 0.46772 0.5818 0.49716 0.0175 0.91671 0.9118 0.6648 0.68378 0.2867 0.64149 0.79568 0.77473 0.76795 0.75279 0.60251 0.39448 0.96171 0.1956  [...]
+0.73211 0.37922 0.73888 0.97232 0.139 0.6631 0.76503 0.73351 0.75403 0.50024 0.8181 0.00603 0.63368 0.73422 0.32586 0.14564 0.38939 0.57433 0.36891 0.02342 0.6137 0.9225 0.99428 0.97035 0.66234 0.66331 0.34517 0.19718 0.18999 0.14172 0.65994 0.03048 0.67729 0.75567 0.58805 0.99572 0.83374 0.9452 0.58791 0.30942 0.29687 0.58032 0.07742 0.64923 0.42905 0.76801 0.20718 0.57111 0.5945 0.13241 0.95648 0.76633 0.46425 0.96104 0.5076 0.60181 0.39862 0.46969 0.10087 0.26666 0.74164 0.87617 0.416 [...]
+0.15422 0.22438 0.12747 0.83998 0.60976 0.13615 0.59092 0.82987 0.39385 0.39189 0.31504 0.55358 0.19899 0.70102 0.3387 0.28918 0.29851 0.09553 0.38696 0.61644 0.20393 0.45117 0.06263 0.54548 0.16887 0.18469 0.35356 0.56949 0.24037 0.97575 0.89925 0.65209 0.03841 0.05539 0.03506 0.49864 0.63893 0.99355 0.86391 0.64499 0.43614 0.4767 0.06209 0.28431 0.94258 0.82987 0.46072 0.28521 0.70166 0.09723 0.34921 0.38349 0.79033 0.93951 0.55251 0.96518 0.77156 0.72361 0.02834 0.14629 0.26358 0.8329 [...]
+0.27594 0.90433 0.93058 0.44926 0.8474 0.46314 0.30505 0.69426 0.63921 0.31863 0.39134 0.66737 0.95782 0.05345 0.831 0.19928 0.48685 0.69696 0.20486 0.46944 0.21626 0.22308 0.47265 0.74867 0.93101 0.47088 0.04119 0.98284 0.96621 0.84067 0.46378 0.76613 0.67634 0.60671 0.65183 0.03555 0.77351 0.31455 0.65576 0.31379 0.88968 0.37036 0.46036 0.69482 0.87503 0.00135 0.07909 0.97177 0.51855 0.11471 0.7929 0.15488 0.87844 0.02896 0.3315 0.97772 0.93915 0.01771 0.63352 0.63148 0.93652 0.88188 0 [...]
+0.72844 0.84714 0.00068 0.93268 0.87353 0.99405 0.66165 0.29269 0.8633 0.93739 0.54159 0.16798 0.33042 0.18654 0.78365 0.62368 0.39656 0.03394 0.39395 0.82878 0.62537 0.00261 0.10973 0.96959 0.51629 0.78774 0.01667 0.52462 0.74995 0.16938 0.83189 0.64061 0.82486 0.38443 0.66311 0.88215 0.2904 0.62272 0.08067 0.0432 0.115 0.11732 0.20726 0.89176 0.47256 0.58752 0.49357 0.72121 0.06542 0.40713 0.73626 0.14756 0.84294 0.53005 0.00717 0.23758 0.99095 0.96519 0.1988 0.75549 0.84471 0.04159 0. [...]
+0.90913 0.07878 0.56665 0.93722 0.29194 0.79613 0.89706 0.14067 0.80772 0.23094 0.51249 0.63137 0.79002 0.13432 0.99702 0.37451 0.94501 0.8441 0.31269 0.45462 0.18764 0.47191 0.53833 0.00419 0.53469 0.49867 0.02882 0.00015 0.91339 0.54178 0.56575 0.87353 0.31901 0.70931 0.28689 0.00162 0.44929 0.17494 0.6166 0.11251 0.72509 0.9 0.63166 0.83264 0.30548 0.04531 0.10351 0.81445 0.24085 0.45182 0.46536 0.63988 0.32747 0.90666 0.88771 0.74977 0.01464 0.14397 0.96323 0.14272 0.13296 0.38428 0. [...]
+0.47948 0.41253 0.25798 0.6388 0.20198 0.42632 0.75312 0.39402 0.71076 0.53637 0.07901 0.69029 0.92772 0.13729 0.11273 0.96504 0.25257 0.55829 0.80831 0.05257 0.36397 0.43263 0.93371 0.79688 0.88089 0.51971 0.72186 0.54625 0.87212 0.64172 0.69162 0.96697 0.31234 0.34596 0.78935 0.99406 0.01528 0.02837 0.24768 0.12176 0.58345 0.99059 0.24007 0.95739 0.36031 0.40197 0.23556 0.12649 0.1768 0.16095 0.59245 0.64224 0.3422 0.36558 0.48905 0.8305 0.22315 0.86256 0.21663 0.47818 0.25904 0.64372  [...]
+0.07072 0.36827 0.07995 0.03804 0.43818 0.09659 0.9362 0.75552 0.30715 0.3357 0.19679 0.02742 0.33012 0.80732 0.92663 0.80593 0.7577 0.32513 0.17126 0.45542 0.88517 0.35366 0.14254 0.76201 0.94849 0.86092 0.92317 0.09703 0.44563 0.31799 0.16499 0.00171 0.55981 0.29057 0.72337 0.002 0.05652 0.36101 0.50106 0.60989 0.05481 0.77473 0.22206 0.938 0.23204 0.14719 0.39084 0.81303 0.08618 0.51651 0.99731 0.50291 0.91477 0.43303 0.15312 0.30098 0.04837 0.14132 0.16697 0.29144 0.64224 0.58034 0.5 [...]
+0.68078 0.55487 0.45891 0.91489 0.00805 0.08811 0.89154 0.17961 0.11021 0.23802 0.03037 0.49474 0.29871 0.88914 0.35063 0.72379 0.27883 0.09623 0.0346 0.76928 0.42665 0.77529 0.1797 0.30285 0.29073 0.08654 0.77825 0.04978 0.61934 0.7081 0.84293 0.53594 0.44398 0.16695 0.47368 0.84209 0.15055 0.40431 0.48201 0.34975 0.09569 0.60663 0.10172 0.46911 0.07178 0.60641 0.55854 0.36827 0.61822 0.36319 0.05366 0.72582 0.33307 0.59152 0.14888 0.55509 0.86409 0.64679 0.23785 0.33545 0.37842 0.81495 [...]
+0.75742 0.99054 0.4525 0.20666 0.05376 0.42737 0.08153 0.14715 0.77659 0.0567 0.43214 0.15898 0.65538 0.44607 0.56895 0.26874 0.85208 0.99998 0.39764 0.8342 0.20695 0.295 0.16647 0.44358 0.13436 0.93122 0.6293 0.86076 0.52915 0.28697 0.42199 0.25631 0.71891 0.7275 0.63984 0.59433 0.41867 0.18876 0.70792 0.25719 0.90711 0.02887 0.95535 0.35757 0.07151 0.49338 0.32564 0.16541 0.646 0.89064 0.83865 0.82346 0.47159 0.95436 0.37389 0.99118 0.04987 0.2628 0.85765 0.20122 0.5808 0.11825 0.4272  [...]
+0.64861 0.10927 0.94905 0.1762 0.16538 0.45866 0.55284 0.2197 0.66674 0.02945 0.82863 0.64566 0.38682 0.87511 0.33065 0.77178 0.23944 0.63839 0.35205 0.65283 0.54128 0.88795 0.37663 0.58649 0.20355 0.58292 0.59487 0.34904 0.09395 0.77749 0.60126 0.98982 0.43291 0.83791 0.40996 0.76101 0.22742 0.08918 0.63873 0.45324 0.48748 0.36079 0.03286 0.42149 0.47867 0.52661 0.50928 0.28962 0.29789 0.62475 0.23318 0.32004 0.74903 0.51779 0.74541 0.25043 0.79404 0.41876 0.28871 0.31021 0.54366 0.4369 [...]
+0.83203 0.05555 0.98936 0.05049 0.81238 0.68037 0.81641 0.24861 0.32371 0.94562 0.29901 0.08776 0.41081 0.14328 0.60499 0.2811 0.71875 0.17674 0.76582 0.56387 0.64055 0.74491 0.83138 0.62068 0.58558 0.25509 0.65316 0.47383 0.41151 0.9409 0.24899 0.79091 0.98372 0.6739 0.23376 0.62666 0.1171 0.31022 0.14627 0.79398 0.24986 0.75208 0.40311 0.8285 0.34662 0.32705 0.15148 0.231 0.81171 0.67271 0.63932 0.39525 0.919 0.38946 0.97096 0.9471 0.26598 0.71349 0.30312 0.88445 0.74971 0.72688 0.2898 [...]
+0.36384 0.06636 0.8196 0.1434 0.13447 0.82501 0.24466 0.43728 0.74026 0.5486 0.26033 0.79098 0.88819 0.43489 0.88365 0.84496 0.0669 0.94564 0.73844 0.67861 0.04811 0.49463 0.9366 0.54503 0.30587 0.29437 0.95718 0.18137 0.352 0.75432 0.8909 0.64345 0.14665 0.5865 0.44218 0.48043 0.15408 0.1772 0.16076 0.31941 0.17197 0.2093 0.02006 0.34699 0.03921 0.96201 0.36196 0.44096 0.70734 0.81949 0.07713 0.33421 0.41971 0.12951 0.60781 0.04813 0.83381 0.7735 0.43402 0.21311 0.22883 0.19419 0.51847  [...]
+0.34254 0.45199 0.15272 0.69713 0.32373 0.79306 0.47196 0.55918 0.90713 0.67317 0.67708 0.52173 0.90894 0.81486 0.6652 0.68582 0.06129 0.73269 0.07991 0.29426 0.65859 0.02202 0.7365 0.76331 0.8795 0.76574 0.63775 0.43078 0.86656 0.48341 0.9255 0.19757 0.77835 0.30686 0.71519 0.22669 0.07766 0.51599 0.87804 0.31726 0.19521 0.84527 0.75007 0.51394 0.78821 0.19347 0.89708 0.79284 0.79432 0.50634 0.77691 0.05281 0.77768 0.79 0.69666 0.52275 0.24018 0.06956 0.56993 0.58156 0.52131 0.17222 0.5 [...]
+0.86466 0.53275 0.61746 0.78418 0.71688 0.72377 0.47263 0.4378 0.51612 0.60643 0.70139 0.61671 0.8946 0.53392 0.45121 0.31383 0.81469 0.52115 0.19869 0.53258 0.70293 0.14301 0.92668 0.22117 0.14176 0.08723 0.38651 0.88774 0.59593 0.83906 0.56835 0.66225 0.02774 0.69079 0.72335 0.88974 0.74024 0.2975 0.26472 0.51868 0.22814 0.84064 0.19492 0.66132 0.92861 0.94209 0.81414 0.93366 0.29687 0.07099 0.66295 0.39769 0.05886 0.74988 0.09904 0.22552 0.68895 0.5718 0.50615 0.34081 0.38231 0.77769  [...]
+0.60912 0.11829 0.21982 0.27534 0.1538 0.27303 0.02551 0.68308 0.10071 0.50003 0.34293 0.48401 0.81937 0.31154 0.69509 0.02398 0.59882 0.08399 0.25699 0.76444 0.89379 0.03176 0.16046 0.9367 0.47818 0.21877 0.24502 0.28024 0.50368 0.13732 0.03726 0.9763 0.9979 0.34507 0.33575 0.54048 0.13994 0.23788 0.20133 0.27117 0.84363 0.5109 0.39079 0.22676 0.51188 0.37044 0.75743 0.10973 0.52591 0.49191 0.92294 0.00031 0.11437 0.66816 0.4876 0.19274 0.31223 0.43609 0.08267 0.77903 0.79095 0.31361 0. [...]
+0.75653 0.22369 0.22157 0.84706 0.42664 0.62869 0.93469 0.95008 0.05651 0.71893 0.651 0.48159 0.56521 0.10902 0.28705 0.32243 0.61241 0.54172 0.30588 0.56751 0.16655 0.5951 0.75882 0.31301 0.4703 0.14637 0.31959 0.50752 0.73653 0.16373 0.75533 0.92425 0.59228 0.13115 0.13931 0.22811 0.34442 0.08771 0.95692 0.34314 0.7201 0.68208 0.83597 0.02069 0.31212 0.2089 0.38949 0.36946 0.22684 0.69568 0.10761 0.96104 0.55152 0.62012 0.87273 0.59111 0.06125 0.60812 0.73753 0.86649 0.19757 0.16508 0. [...]
+0.54369 0.78385 0.42452 0.14872 0.81319 0.78739 0.54732 0.79814 0.60411 0.64764 0.42416 0.76167 0.39087 0.79329 0.89196 0.1976 0.57721 0.83474 0.05856 0.07349 0.03088 0.81418 0.72155 0.40463 0.30991 0.48176 0.05666 0.64421 0.43766 0.72635 0.00527 0.82 0.93797 0.91274 0.49539 0.14821 0.44335 0.75793 0.59703 0.93438 0.07296 0.39032 0.77726 0.29066 0.24916 0.93898 0.86555 0.05194 0.54418 0.38447 0.23026 0.1512 0.8229 0.63135 0.64429 0.74206 0.92635 0.91494 0.42727 0.83883 0.23371 0.14627 0. [...]
+0.20301 0.79 0.01979 0.24845 0.31199 0.27103 0.0208 0.74035 0.4254 0.05839 0.44706 0.04333 0.08751 0.59134 0.14105 0.14337 0.2039 0.52211 0.56081 0.74671 0.74718 0.16355 0.97269 0.14311 0.82501 0.44494 0.87211 0.49651 0.88608 0.28465 0.17959 0.96107 0.11531 0.14554 0.23187 0.56282 0.6883 0.38909 0.9747 0.07949 0.52868 0.64406 0.43743 0.84346 0.66979 0.11432 0.99266 0.34758 0.56425 0.67507 0.62759 0.09984 0.24786 0.43717 0.85523 0.48778 0.26022 0.77732 0.71087 0.34273 0.45709 0.83845 0.02 [...]
+0.47643 0.82706 0.31149 0.119 0.57534 0.40055 0.18924 0.63388 0.7608 0.04598 0.09317 0.8625 0.10125 0.33502 0.9039 0.41431 0.76081 0.48995 0.54534 0.82993 0.0409 0.397 0.4885 0.65629 0.39126 0.99046 0.35529 0.19392 0.28239 0.51246 0.68545 0.81284 0.16138 0.57328 0.97914 0.04546 0.67784 0.52284 0.21535 0.17402 0.63945 0.60227 0.70785 0.87414 0.27866 0.67959 0.09116 0.47032 0.43411 0.12047 0.19978 0.80011 0.4566 0.07231 0.18208 0.52112 0.26562 0.05807 0.12871 0.55857 0.83451 0.89574 0.4578 [...]
+0.45418 0.84087 0.18736 0.40496 0.89177 0.02948 0.30645 0.22821 0.63552 0.82457 0.91334 0.5676 0.50811 0.06775 0.90567 0.90151 0.26234 0.56878 0.51144 0.16255 0.29458 0.31454 0.64428 0.18264 0.8686 0.16151 0.74097 0.13968 0.25409 0.50637 0.70812 0.37468 0.24886 0.30884 0.3837 0.02283 0.02759 0.4826 0.64053 0.70434 0.70688 0.88171 0.94942 0.24087 0.67463 0.85892 0.01635 0.59378 0.95184 0.57505 0.08733 0.27522 0.99832 0.89158 0.8435 0.34469 0.61516 0.28038 0.31008 0.95196 0.095 0.50187 0.0 [...]
+0.3253 0.9971 0.90603 0.08194 0.9034 0.19253 0.12203 0.631 0.35779 0.41393 0.10165 0.95688 0.34609 0.47764 0.30858 0.38446 0.1311 0.63519 0.89342 0.46518 0.32445 0.6603 0.2828 0.84518 0.43445 0.14983 0.54391 0.59579 0.94378 0.36916 0.81647 0.90495 0.49165 0.53344 0.23364 0.35953 0.63732 0.88323 0.82333 0.03405 0.0814 0.0659 0.78331 0.36712 0.14324 0.64946 0.35446 0.88547 0.47491 0.40672 0.65863 0.2079 0.975 0.00056 0.13597 0.79861 0.06433 0.70703 0.33306 0.22802 0.3412 0.31072 0.96523 0. [...]
+0.73626 0.16735 0.03966 0.17419 0.07014 0.08549 0.73693 0.29996 0.93426 0.68545 0.83364 0.96612 0.93625 0.99026 0.98981 0.47312 0.42163 0.00254 0.00715 0.64876 0.77106 0.93224 0.28021 0.79508 0.37829 0.52253 0.71867 0.13182 0.95422 0.03414 0.00137 0.12207 0.26509 0.31625 0.24996 0.11291 0.27516 0.43432 0.77404 0.5103 0.7761 0.93863 0.3373 0.93865 0.36623 0.45222 0.66013 0.26791 0.48858 0.34928 0.72532 0.64794 0.26709 0.58664 0.51939 0.97505 0.24595 0.77463 0.10431 0.85834 0.15583 0.3945  [...]
+0.26721 0.8647 0.53379 0.7045 0.40296 0.91794 0.48002 0.39524 0.3179 0.67663 0.14362 0.89107 0.99824 0.58114 0.87923 0.84992 0.01626 0.97329 0.70477 0.33866 0.25356 0.24647 0.35423 0.83577 0.48257 0.96047 0.83638 0.03201 0.00939 0.78808 0.84506 0.56438 0.25508 0.67653 0.1925 0.97252 0.55327 0.75535 0.19669 0.95339 0.57659 0.13667 0.72266 0.8887 0.01399 0.18948 0.72966 0.71937 0.60686 0.23526 0.74902 0.04238 0.41859 0.08544 0.4337 0.74719 0.55982 0.72441 0.86277 0.44877 0.96774 0.31694 0. [...]
+0.00205 0.91578 0.91266 0.72687 0.07475 0.86415 0.69266 0.50011 0.62892 0.45729 0.01466 0.28341 0.85267 0.86204 0.06428 0.36296 0.51598 0.51398 0.77751 0.44094 0.09458 0.84511 0.86707 0.48414 0.18175 0.14375 0.69316 0.31039 0.97611 0.75625 0.62206 0.5967 0.01511 0.66022 0.58162 0.97895 0.3226 0.46491 0.60695 0.40035 0.7345 0.56277 0.75976 0.30574 0.91125 0.61635 0.26778 0.77547 0.6153 0.99214 0.96793 0.28738 0.12389 0.98036 0.5863 0.72527 0.87268 0.816 0.35748 0.35993 0.41015 0.1646 0.86 [...]
+0.77511 0.79609 0.00184 0.01523 0.59735 0.61846 0.46027 0.7749 0.45238 0.24535 0.80289 0.66024 0.67348 0.77923 0.24775 0.13207 0.57733 0.58984 0.43173 0.26698 0.62854 0.99392 0.40056 0.77336 0.29266 0.60039 0.45949 0.84353 0.35107 0.0216 0.2778 0.92412 0.97861 0.94667 0.50316 0.42608 0.6785 0.92341 0.78575 0.44874 0.49973 0.9903 0.48411 0.88111 0.35808 0.17496 0.16878 0.05886 0.95992 0.85762 0.09872 0.82816 0.56765 0.88528 0.9676 0.05699 0.71817 0.413 0.59412 0.31632 0.40992 0.90323 0.14 [...]
+0.4411 0.04955 0.56909 0.18825 0.4213 0.8089 0.91609 0.87166 0.27303 0.74769 0.22055 0.24361 0.42464 0.83756 0.4704 0.14708 0.63302 0.79618 0.86499 0.42505 0.92573 0.70715 0.33452 0.39031 0.24925 0.64281 0.61464 0.28658 0.04488 0.73775 0.36421 0.44175 0.73377 0.1284 0.71909 0.6158 0.56655 0.58514 0.07345 0.57861 0.40895 0.2149 0.08082 0.43941 0.19408 0.10321 0.89393 0.82513 0.62212 0.56934 0.12884 0.22055 0.46301 0.68191 0.7036 0.50079 0.52226 0.94874 0.64096 0.44578 0.51766 0.51837 0.43 [...]
+0.91253 0.46278 0.98622 0.18773 0.50448 0.67477 0.37394 0.8673 0.0511 0.89376 0.87439 0.61469 0.75135 0.48069 0.11967 0.70624 0.42948 0.6663 0.70146 0.32939 0.54421 0.03312 0.63852 0.74042 0.09208 0.28284 0.29978 0.75151 0.10443 0.67339 0.5555 0.05358 0.55329 0.78579 0.56258 0.16498 0.30974 0.50955 0.79185 0.6997 0.48252 0.0816 0.1244 0.34508 0.96017 0.72922 0.1416 0.34515 0.58438 0.20784 0.94772 0.86971 0.45187 0.94118 0.87051 0.2702 0.36163 0.83032 0.25301 0.47727 0.36633 0.33045 0.572 [...]
+0.09072 0.38503 0.51129 0.79303 0.63649 0.09004 0.38513 0.52151 0.72172 0.1049 0.19257 0.07751 0.38245 0.15409 0.30901 0.28357 0.71065 0.05155 0.78296 0.26688 0.73422 0.99818 0.76095 0.14819 0.73069 0.03457 0.17313 0.17845 0.37995 0.25562 0.42031 0.56146 0.58365 0.11253 0.1598 0.47267 0.66627 0.37991 0.75004 0.5029 0.17688 0.53506 0.54997 0.99908 0.08913 0.56123 0.2841 0.57626 0.52495 0.74937 0.37321 0.60006 0.8585 0.04286 0.98148 0.15513 0.72079 0.1567 0.43332 0.83724 0.892 0.33017 0.41 [...]
+0.76155 0.16123 0.21778 0.88464 0.41223 0.09616 0.88604 0.11311 0.09525 0.94277 0.2269 0.21719 0.95192 0.83389 0.56937 0.13809 0.57919 0.54585 0.53766 0.86952 0.7299 0.26786 0.48476 0.05047 0.93466 0.29938 0.20049 0.08753 0.8744 0.33102 0.9949 0.26983 0.31614 0.40922 0.63067 0.45675 0.23693 0.21907 0.18458 0.17154 0.49096 0.63203 0.64472 0.67682 0.63215 0.5595 0.84677 0.00744 0.61066 0.044 0.76045 0.43713 0.73738 0.75848 0.70783 0.75531 0.96306 0.30025 0.86784 0.35727 0.90824 0.42765 0.3 [...]
+0.44757 0.84581 0.58413 0.11267 0.1815 0.79393 0.80861 0.09823 0.35197 0.0794 0.69739 0.07654 0.14165 0.65641 0.66814 0.68148 0.13841 0.4335 0.60954 0.85718 0.53616 0.35 0.96037 0.6879 0.74657 0.75923 0.45143 0.95349 0.34274 0.67476 0.30626 0.40904 0.25473 0.20343 0.62542 0.26249 0.61106 0.00346 0.00432 0.06663 0.74263 0.60554 0.20235 0.68579 0.46313 0.39656 0.96455 0.29526 0.39354 0.79584 0.06332 0.2847 0.72721 0.53259 0.98988 0.41079 0.50485 0.81389 0.84301 0.29612 0.72928 0.47326 0.25 [...]
+0.93133 0.75329 0.23217 0.74721 0.94385 0.81717 0.34729 0.84809 0.94646 0.03015 0.11056 0.31876 0.35053 0.57729 0.40599 0.18564 0.26811 0.42304 0.01755 0.19323 0.57706 0.89915 0.6981 0.43302 0.81807 0.49409 0.22359 0.93136 0.5087 0.80097 0.44421 0.52311 0.81711 0.54208 0.41234 0.43209 0.11741 0.02805 0.56972 0.87284 0.18108 0.75513 0.44108 0.30947 0.02492 0.87006 0.95638 0.91419 0.23336 0.39477 0.93834 0.53903 0.43731 0.21877 0.47107 0.62433 0.98221 0.01394 0.4213 0.09037 0.9356 0.16945  [...]
+0.42463 0.98236 0.84898 0.77463 0.54981 0.03819 0.33914 0.63044 0.08231 0.0544 0.58067 0.71222 0.11411 0.22851 0.11368 0.97724 0.22358 0.83452 0.62759 0.76431 0.35095 0.64467 0.48212 0.28184 0.63645 0.15061 0.48964 0.17198 0.77573 0.08202 0.4049 0.91932 0.84359 0.319 0.82452 0.10781 0.76386 0.27844 0.49829 0.37863 0.6738 0.78927 0.99184 0.83843 0.14719 0.43569 0.06888 0.13016 0.10873 0.31348 0.77301 0.66974 0.17727 0.6655 0.51701 0.89282 0.77474 0.28988 0.06788 0.97181 0.0397 0.20253 0.7 [...]
+0.31242 0.0901 0.11451 0.76399 0.50204 0.08558 0.48023 0.18012 0.82589 0.26826 0.82253 0.52792 0.60175 0.57809 0.86634 0.01816 0.62626 0.17989 0.53329 0.01962 0.68384 0.94609 0.96094 0.71714 0.04221 0.56754 0.71296 0.80251 0.47612 0.13819 0.07856 0.96991 0.21746 0.66373 0.54292 0.70498 0.55115 0.37879 0.48337 0.45108 0.14382 0.62963 0.70782 0.99755 0.15898 0.83339 0.57565 0.95235 0.21538 0.3434 0.62432 0.77623 0.52572 0.56863 0.4549 0.68245 0.66319 0.04553 0.20982 0.65902 0.76042 0.41992 [...]
+0.78683 0.85538 0.33476 0.41245 0.17817 0.76401 0.53136 0.64832 0.49662 0.15136 0.23668 0.55816 0.11711 0.17711 0.68323 0.73233 0.08693 0.99428 0.48837 0.61326 0.00555 0.12517 0.03304 0.90687 0.72993 0.76104 0.22768 0.86942 0.31742 0.5183 0.6382 0.72986 0.80844 0.90052 0.86485 0.21657 0.8307 0.58108 0.90747 0.47029 0.18051 0.13496 0.23714 0.35139 0.31715 0.1502 0.46128 0.66049 0.94189 0.21978 0.77217 0.25141 0.2301 0.78234 0.19517 0.89378 0.28965 0.02697 0.97917 0.62387 0.37459 0.78703 0 [...]
+0.86364 0.56338 0.83615 0.17155 0.51926 0.54365 0.88794 0.98344 0.15491 0.91412 0.76306 0.54259 0.45321 0.64486 0.60298 0.25322 0.32806 0.41232 0.88313 0.16721 0.09504 0.58913 0.30674 0.04475 0.93227 0.07006 0.84276 0.38405 0.99271 0.70833 0.45504 0.43675 0.18277 0.32758 0.76935 0.09557 0.57731 0.15267 0.24796 0.33997 0.91226 0.35175 0.6422 0.29712 0.6693 0.44959 0.32529 0.17122 0.10634 0.16967 0.29581 0.81318 0.88309 0.0294 0.60292 0.92921 0.62971 0.19236 0.83611 0.76376 0.25419 0.95579 [...]
+0.76361 0.40999 0.3594 0.54955 0.35825 0.25928 0.62519 0.26441 0.01847 0.64616 0.34469 0.48454 0.7496 0.77073 0.21183 0.97624 0.15919 0.54082 0.38206 0.09766 0.88645 0.51228 0.19055 0.65769 0.89586 0.36559 0.93888 0.83008 0.83072 0.90852 0.95226 0.84951 0.64683 0.94907 0.3099 0.26395 0.90062 0.4166 0.58333 0.29732 0.57359 0.2159 0.75411 0.17258 0.41108 0.183 0.87429 0.01991 0.85641 0.0801 0.79639 0.70397 0.10234 0.1893 0.97225 0.61996 0.61141 0.76174 0.42519 0.75498 0.99447 0.12556 0.730 [...]
+0.78348 0.38471 0.73472 0.9402 0.50051 0.94157 0.22723 0.52197 0.45122 0.85384 0.68984 0.98687 0.46183 0.58517 0.23814 0.11842 0.87539 0.09745 0.28077 0.08004 0.3644 0.78692 0.39023 0.64362 0.20916 0.37561 0.42763 0.85841 0.09986 0.07877 0.60744 0.08703 0.79467 0.31959 0.97954 0.25216 0.13122 0.37479 0.45537 0.72642 0.31484 0.98352 0.63104 0.00763 0.93972 0.49495 0.16414 0.67056 0.26744 0.51912 0.48644 0.72407 0.88348 0.79872 0.14615 0.66483 0.41798 0.26219 0.15659 0.59953 0.73262 0.3391 [...]
+0.30049 0.5282 0.82265 0.3105 0.95988 0.6364 0.29227 0.8361 0.8604 0.25106 0.47261 0.0666 0.92255 0.80725 0.51685 0.64205 0.59345 0.10815 0.53921 0.8007 0.45797 0.00995 0.70455 0.75825 0.15764 0.72032 0.72254 0.24251 0.15516 0.93708 0.97798 0.8634 0.81055 0.93762 0.29194 0.73772 0.2126 0.30229 0.45911 0.56028 0.62713 0.43614 0.36588 0.35465 0.70369 0.94391 0.59597 0.65907 0.48223 0.23716 0.11965 0.91353 0.39362 0.45824 0.77646 0.93173 0.67938 0.31035 0.25262 0.85159 0.17315 0.71573 0.469 [...]
+0.24395 0.75812 0.52152 0.40861 0.34889 0.74915 0.74017 0.52914 0.47027 0.19505 0.96076 0.09506 0.66901 0.31036 0.33839 0.59361 0.97885 0.97048 0.83791 0.20532 0.4185 0.11066 0.87293 0.68749 0.64634 0.37859 0.51593 0.88743 0.97436 0.38166 0.67826 0.62952 0.32504 0.70212 0.21461 0.72267 0.68206 0.73507 0.57807 0.70052 0.7817 0.3731 0.74969 0.64956 0.63309 0.60395 0.78665 0.27378 0.88399 0.03853 0.33098 0.62759 0.45146 0.83631 0.01443 0.03946 0.96796 0.3181 0.74722 0.914 0.84173 0.15643 0. [...]
+0.76718 0.98043 0.92049 0.28094 0.2084 0.35261 0.03114 0.04419 0.35342 0.88553 0.76708 0.37608 0.37114 0.44488 0.01366 0.27192 0.99411 0.66723 0.68973 0.89785 0.47327 0.05094 0.50189 0.50853 0.39298 0.2444 0.8461 0.46054 0.86006 0.88781 0.07524 0.44654 0.05818 0.39183 0.42167 0.02211 0.98295 0.674 0.4853 0.59054 0.09715 0.55004 0.39065 0.82907 0.22204 0.81164 0.6265 0.22069 0.26933 0.63343 0.93631 0.41338 0.24857 0.58932 0.49908 0.16371 0.94193 0.28495 0.5377 0.8994 0.73897 0.83138 0.796 [...]
+0.33911 0.06728 0.61091 0.92972 0.81291 0.9937 0.2326 0.50916 0.12148 0.32854 0.51793 0.48946 0.88614 0.73959 0.00522 0.89089 0.68241 0.25511 0.0411 0.98295 0.23696 0.92371 0.78727 0.03138 0.04127 0.20541 0.45805 0.27905 0.02674 0.93763 0.34841 0.26877 0.70208 0.30323 0.72894 0.38242 0.28056 0.56675 0.72508 0.18612 0.92006 0.00202 0.6361 0.56309 0.73598 0.10458 0.25348 0.41719 0.14369 0.49334 0.66763 0.19183 0.09189 0.44907 0.80942 0.569 0.10481 0.98918 0.74966 0.35064 0.08647 0.49195 0. [...]
+0.12842 0.78044 0.74677 0.89306 0.30376 0.00528 0.43071 0.10853 0.10525 0.07258 0.68257 0.46077 0.13852 0.46215 0.27265 0.98617 0.7765 0.2889 0.01312 0.27689 0.00961 0.14243 0.10303 0.69823 0.66853 0.76213 0.06519 0.49848 0.71738 0.56212 0.9535 0.961 0.40366 0.62072 0.49339 0.23319 0.81358 0.13632 0.82518 0.96769 0.21976 0.21067 0.41737 0.7595 0.96559 0.19475 0.39797 0.16234 0.93246 0.75785 0.69402 0.66414 0.58451 0.00293 0.55659 0.34548 0.47714 0.12236 0.934 0.28259 0.39812 0.5405 0.573 [...]
+0.12504 0.68066 0.08153 0.32898 0.61436 0.53892 0.67028 0.07849 0.88899 0.33778 0.81535 0.50767 0.62688 0.85263 0.48785 0.64773 0.6948 0.83895 0.16379 0.28742 0.60702 0.41012 0.17044 0.28222 0.28239 0.84015 0.57148 0.59459 0.03472 0.12493 0.28777 0.61542 0.72334 0.68042 0.65819 0.06655 0.34712 0.12499 0.79758 0.49221 0.97714 0.98166 0.96686 0.50416 0.08778 0.64415 0.64434 0.45992 0.29631 0.65484 0.42032 0.26898 0.03008 0.51717 0.16416 0.94599 0.11493 0.29812 0.06537 0.28996 0.85232 0.657 [...]
+0.53309 0.68635 0.44734 0.02588 0.63266 0.94545 0.65689 0.39403 0.72672 0.45038 0.47357 0.06695 0.27203 0.83032 0.76522 0.61754 0.53437 0.08998 0.45517 0.43623 0.59763 0.66474 0.22027 0.33629 0.0911 0.70023 0.7359 0.05985 0.07348 0.94255 0.78815 0.5905 0.00815 0.18227 0.95351 0.79595 0.61847 0.12426 0.15234 0.65994 0.35762 0.43956 0.80902 0.27251 0.60145 0.94284 0.01396 0.41932 0.36827 0.61578 0.44821 0.87582 0.32621 0.69596 0.01633 0.37648 0.09718 0.28766 0.13126 0.38118 0.63168 0.71862 [...]
+0.87999 0.51412 0.47279 0.17224 0.56657 0.18118 0.2337 0.99249 0.72428 0.73458 0.91917 0.09363 0.79088 0.86502 0.62428 0.52643 0.2734 0.37598 0.40183 0.16845 0.37568 0.41561 0.36495 0.11684 0.48886 0.39808 0.15027 0.11463 0.19693 0.81092 0.50393 0.76252 0.30121 0.08222 0.4173 0.69597 0.98063 0.63863 0.35507 0.80252 0.37019 0.64074 0.08573 0.52394 0.42527 0.80232 0.67157 0.07394 0.45653 0.54523 0.34754 0.85343 0.05523 0.93396 0.00676 0.78872 0.10921 0.45885 0.17929 0.00767 0.51798 0.55385 [...]
+0.82537 0.86886 0.26753 0.08578 0.16594 0.58511 0.73874 0.81757 0.87419 0.15901 0.62435 0.09954 0.2169 0.72861 0.36874 0.37192 0.31147 0.72093 0.61696 0.46319 0.20596 0.95814 0.60433 0.82928 0.59411 0.80719 0.27901 0.6018 0.13983 0.51842 0.059 0.04979 0.29201 0.8275 0.3111 0.70974 0.25056 0.74796 0.1767 0.77469 0.27667 0.10306 0.66432 0.92156 0.6111 0.37437 0.81618 0.20038 0.33346 0.53859 0.11264 0.62169 0.15225 0.41519 0.40439 0.24785 0.10545 0.20172 0.35813 0.50159 0.0248 0.94446 0.615 [...]
+0.59587 0.48344 0.50997 0.48534 0.37632 0.70668 0.76678 0.59351 0.5234 0.6395 0.03041 0.99486 0.5518 0.55379 0.57792 0.15925 0.15915 0.42905 0.45805 0.4344 0.31695 0.57122 0.8254 0.86268 0.64609 0.92687 0.62817 0.61163 0.33169 0.18688 0.46494 0.32491 0.72657 0.56774 0.23958 0.43153 0.10699 0.96943 0.24599 0.59333 0.35732 0.61035 0.67169 0.13053 0.8606 0.85318 0.713 0.1661 0.58286 0.04137 0.65797 0.62852 0.59371 0.55651 0.34129 0.83203 0.09432 0.21394 0.93746 0.81362 0.74576 0.25623 0.392 [...]
+0.54592 0.86515 0.50935 0.39388 0.94799 0.42655 0.89526 0.83028 0.70878 0.21824 0.31618 0.86152 0.11315 0.92293 0.20415 0.42298 0.92686 0.99722 0.93857 0.32478 0.37597 0.95123 0.06955 0.74192 0.16349 0.93092 0.30355 0.96965 0.15316 0.0412 0.04753 0.19249 0.98373 0.85771 0.714 0.51423 0.0432 0.82607 0.60782 0.31376 0.27544 0.91758 0.08625 0.0822 0.69884 0.80053 0.25542 0.24027 0.54202 0.66361 0.39291 0.72264 0.27021 0.86004 0.28073 0.5771 0.60937 0.88269 0.15661 0.16961 0.28203 0.2827 0.6 [...]
+0.52843 0.02977 0.75658 0.54365 0.35135 0.13415 0.72232 0.13351 0.53873 0.58926 0.50031 0.61498 0.5612 0.23378 0.16009 0.61862 0.66582 0.78412 0.6556 0.92212 0.61094 0.93087 0.40212 0.88043 0.27219 0.43347 0.30967 0.137 0.58074 0.68938 0.87845 0.55904 0.66882 0.57006 0.45719 0.64405 0.9538 0.60024 0.03576 0.06336 0.68819 0.07458 0.34433 0.1563 0.15422 0.60646 0.10591 0.8022 0.98059 0.18347 0.7153 0.62901 0.67829 0.51564 0.83474 0.6675 0.61936 0.73009 0.93022 0.06785 0.32405 0.44627 0.807 [...]
+0.89955 0.15702 0.9433 0.10162 0.94468 0.02725 0.59511 0.40175 0.19972 0.83386 0.3721 0.51688 0.61169 0.65087 0.42801 0.12442 0.91998 0.63318 0.08346 0.10779 0.09463 0.53417 0.19891 0.89532 0.15081 0.5201 0.43091 0.06134 0.65386 0.63489 0.95091 0.36148 0.01313 0.55817 0.34594 0.87743 0.31033 0.56459 0.16881 0.65375 0.60245 0.74574 0.63072 0.40806 0.36298 0.1678 0.62949 0.0289 0.14166 0.78356 0.38172 0.39896 0.85351 0.36632 0.65958 0.65732 0.44355 0.19926 0.96435 0.30866 0.08937 0.82371 0 [...]
+0.98925 0.76091 0.83615 0.83801 0.30504 0.87356 0.04912 0.05052 0.19223 0.39223 0.93441 0.03525 0.93819 0.23864 0.79174 0.87508 0.1245 0.8748 0.22281 0.24576 0.23235 0.79291 0.16998 0.01262 0.55901 0.93051 0.19444 0.26361 0.33027 0.43701 0.92317 0.5605 0.18675 0.01904 0.98594 0.30729 0.92326 0.62227 0.24523 0.49263 0.7298 0.47453 0.02838 0.34536 0.33704 0.96584 0.55628 0.51986 0.77866 0.55107 0.41369 0.19462 0.86649 0.12993 0.46264 0.17468 0.02966 0.79375 0.53022 0.52315 0.79976 0.85045  [...]
+0.11435 0.86734 0.91209 0.09632 0.07521 0.7037 0.81178 0.51451 0.8493 0.26614 0.08278 0.41726 0.18137 0.80861 0.71322 0.59617 0.15897 0.01933 0.16777 0.56784 0.11052 0.67951 0.42663 0.61846 0.23643 0.90378 0.04036 0.33943 0.08089 0.24992 0.86231 0.76635 0.87245 0.81859 0.62585 0.06416 0.01541 0.94988 0.54238 0.84141 0.61416 0.28818 0.00101 0.13855 0.68649 0.75482 0.56216 0.09949 0.31173 0.88369 0.28212 0.87318 0.60159 0.43 0.53231 0.76075 0.35774 0.61575 0.6978 0.40759 0.57061 0.75746 0. [...]
+0.75724 0.94341 0.15986 0.37325 0.00682 0.91474 0.9335 0.03912 0.19695 0.2901 0.36995 0.59869 0.96977 0.04916 0.99639 0.09382 0.51991 0.68874 0.81502 0.53982 0.82173 0.33009 0.14214 0.81757 0.05128 0.30365 0.34812 0.26037 0.70052 0.18506 0.61526 0.82645 0.81331 0.02146 0.82427 0.13491 0.84147 0.87812 0.83651 0.83768 0.96912 0.02551 0.52231 0.90781 0.849 0.97445 0.64454 0.63559 0.52437 0.00725 0.14565 0.05211 0.33049 0.97832 0.12485 0.60367 0.54021 0.09167 0.11345 0.40157 0.09775 0.61424  [...]
+0.97888 0.42821 0.86801 0.5048 0.69468 0.43213 0.42985 0.72414 0.90616 0.71232 0.5545 0.77833 0.06987 0.12155 0.64921 0.40961 0.58296 0.26674 0.51671 0.9149 0.44512 0.83202 0.10618 0.89207 0.18862 0.16461 0.43103 0.88298 0.80368 0.42327 0.06398 0.158 0.50281 0.59266 0.42147 0.80565 0.45728 0.58529 0.74431 0.5268 0.24202 0.12633 0.73295 0.92088 0.7245 0.93704 0.65723 0.32563 0.19636 0.54041 0.65957 0.19205 0.12133 0.78427 0.93234 0.91608 0.23613 0.30955 0.12654 0.62528 0.40721 0.09776 0.9 [...]
+0.6603 0.21813 0.20881 0.64292 0.71931 0.69083 0.33213 0.92687 0.35378 0.55589 0.17544 0.83498 0.83528 0.47567 0.98638 0.7216 0.07532 0.73283 0.26087 0.31511 0.3552 0.66459 0.37501 0.86971 0.40662 0.83829 0.91162 0.64592 0.06746 0.10362 0.52499 0.73756 0.03161 0.82503 0.75942 0.77402 0.38882 0.61016 0.33582 0.57898 0.47858 0.20805 0.58503 0.94529 0.16462 0.66166 0.58544 0.64896 0.23004 0.23165 0.49075 0.27784 0.17987 0.352 0.90448 0.72248 0.66841 0.13307 0.92574 0.11256 0.35258 0.72081 0 [...]
+0.19858 0.21566 0.95502 0.09455 0.98857 0.68868 0.31214 0.45141 0.45044 0.44734 0.48039 0.17628 0.63259 0.77358 0.8804 0.13318 0.14266 0.63195 0.92312 0.3407 0.43007 0.37187 0.98692 0.70417 0.27164 0.51602 0.95815 0.30334 0.04788 0.61022 0.23942 0.60992 0.42254 0.91223 0.78233 0.99322 0.18563 0.17855 0.86114 0.44917 0.11087 0.40677 0.02617 0.94465 0.34364 0.23317 0.59377 0.79954 0.44185 0.78947 0.64055 0.92371 0.21774 0.88814 0.86159 0.97506 0.69862 0.04676 0.22118 0.62196 0.79068 0.2098 [...]
+0.1446 0.0477 0.07193 0.41767 0.29557 0.20909 0.6232 0.04378 0.06482 0.04107 0.65263 0.17658 0.0841 0.07896 0.46045 0.3057 0.80057 0.7969 0.94043 0.10993 0.63713 0.98192 0.46622 0.30524 0.60054 0.60772 0.04629 0.59313 0.68512 0.24722 0.99019 0.58022 0.40892 0.49466 0.95504 0.75525 0.5476 0.20135 0.7027 0.33373 0.99342 0.48302 0.27301 0.30132 0.67134 0.21486 0.51306 0.26392 0.6012 0.85202 0.65507 0.50642 0.91739 0.9973 0.91215 0.88083 0.1295 0.8212 0.74911 0.83615 0.53978 0.12963 0.35174  [...]
+0.93348 0.75732 0.82235 0.76358 0.79362 0.04947 0.55536 0.20261 0.06037 0.8913 0.58718 0.9533 0.02349 0.14502 0.38173 0.56738 0.59909 0.74157 0.9464 0.36325 0.26869 0.08738 0.33845 0.50097 0.86005 0.44247 0.90459 0.47672 0.12808 0.63316 0.02752 0.46886 0.28925 0.30084 0.1836 0.87396 0.06051 0.6733 0.27153 0.19819 0.50064 0.71707 0.28868 0.74178 0.26888 0.00207 0.52801 0.94475 0.02936 0.64916 0.27157 0.03113 0.35485 0.40953 0.16608 0.55511 0.22404 0.75717 0.79204 0.85954 0.14665 0.40243 0 [...]
+0.88441 0.91362 0.19341 0.83454 0.9388 0.5867 0.80004 0.39162 0.92506 0.17756 0.7833 0.42385 0.99217 0.00704 0.81262 0.92055 0.44568 0.28907 0.4476 0.79278 0.56999 0.25265 0.42159 0.01622 0.58504 0.22834 0.97266 0.93821 0.17175 0.05008 0.67041 0.63267 0.17675 0.67577 0.85015 0.59352 0.16715 0.60191 0.45926 0.66548 0.2544 0.32291 0.37181 0.04819 0.61666 0.79807 0.60216 0.45448 0.82477 0.52426 0.86475 0.32477 0.97941 0.25872 0.21953 0.50402 0.63543 0.07404 0.71784 0.43784 0.12084 0.1657 0. [...]
+0.98377 0.26381 0.32402 0.51045 0.20302 0.05553 0.5031 0.61019 0.46108 0.49062 0.20274 0.60388 0.58511 0.72821 0.83464 0.93149 0.95736 0.43734 0.75508 0.10486 0.64855 0.55244 0.47088 0.00337 0.72695 0.52051 0.34352 0.0342 0.94513 0.58804 0.70572 0.70464 0.33333 0.76866 0.35346 0.2671 0.14623 0.56159 0.98092 0.04724 0.35721 0.95696 0.56754 0.39801 0.06616 0.69148 0.31295 0.11783 0.27995 0.32005 0.23343 0.00681 0.86519 0.08355 0.55687 0.7425 0.04848 0.21377 0.64272 0.96568 0.74456 0.15527  [...]
+0.19926 0.85726 0.20126 0.94399 0.45803 0.31323 0.83113 0.63526 0.04183 0.71368 0.39399 0.34955 0.49878 0.19535 0.78969 0.98738 0.73485 0.81895 0.48978 0.36873 0.20937 0.97768 0.90942 0.01843 0.38482 0.476 0.09079 0.18623 0.81343 0.21162 0.73189 0.62994 0.24328 0.1547 0.23985 0.65791 0.37602 0.85113 0.37882 0.10584 0.46918 0.24155 0.28358 0.51806 0.33015 0.60235 0.11316 0.32632 0.03 0.0816 0.7409 0.97271 0.27991 0.23186 0.55467 0.32437 0.09183 0.45972 0.89153 0.90038 0.20619 0.48296 0.50 [...]
+0.70846 0.8181 0.85392 0.42422 0.22738 0.07936 0.63072 0.4382 0.65532 0.37409 0.00142 0.50665 0.59006 0.50589 0.97417 0.82307 0.42163 0.432 0.09277 0.24848 0.12881 0.54062 0.1605 0.33043 0.20537 0.29081 0.74968 0.92322 0.79759 0.98848 0.3198 0.71652 0.33761 0.84424 0.83315 0.52993 0.18539 0.57878 0.0145 0.64764 0.90289 0.87055 0.15674 0.85321 0.37269 0.77342 0.17514 0.11006 0.82183 0.04768 0.85723 0.55555 0.95876 0.82687 0.38007 0.60192 0.56654 0.17581 0.2356 0.94068 0.02685 0.92573 0.54 [...]
+0.54064 0.27056 0.60781 0.00805 0.34139 0.70619 0.53392 0.67616 0.48359 0.24898 0.2478 0.99743 0.56366 0.91002 0.03357 0.45785 0.82347 0.13561 0.43001 0.51665 0.66714 0.12112 0.12497 0.44108 0.44315 0.34188 0.32525 0.6315 0.8076 0.83031 0.73646 0.13444 0.59581 0.19535 0.10845 0.61204 0.70941 0.35326 0.9837 0.66949 0.78919 0.47924 0.49399 0.9461 0.91535 0.25507 0.03143 0.21229 0.97451 0.05509 0.39911 0.91323 0.33255 0.91963 0.58945 0.70189 0.07503 0.66972 0.93151 0.46002 0.57753 0.65401 0 [...]
+0.09053 0.88604 0.07621 0.0052 0.49633 0.22923 0.90261 0.43384 0.34162 0.10719 0.95741 0.23995 0.95901 0.80512 0.06781 0.11415 0.86711 0.1729 0.70654 0.91122 0.62355 0.48996 0.17327 0.86027 0.54905 0.22219 0.23677 0.67067 0.0819 0.31757 0.58317 0.70523 0.63751 0.28994 0.47175 0.96934 0.56162 0.2056 0.30137 0.62605 0.08358 0.5513 0.75882 0.76175 0.88042 0.00534 0.64376 0.42015 0.58828 0.99651 0.29491 0.07054 0.92697 0.25102 0.61224 0.46716 0.81395 0.60606 0.30079 0.62841 0.40701 0.56005 0 [...]
+0.8439 0.50312 0.95379 0.379 0.1822 0.0425 0.04219 0.25662 0.4388 0.51293 0.82163 0.73726 0.65262 0.18215 0.6401 0.32269 0.45018 0.07552 0.56963 0.57546 0.4961 0.65196 0.64614 0.52981 0.28934 0.2662 0.18125 0.71707 0.97439 0.58383 0.17142 0.75308 0.91474 0.87137 0.68462 0.8131 0.51475 0.11349 0.52972 0.41699 0.98431 0.41617 0.88229 0.91552 0.748 0.28387 0.72594 0.34324 0.52332 0.71644 0.75842 0.20121 0.65603 0.53035 0.54764 0.21457 0.08101 0.38655 0.09668 0.95356 0.80552 0.55009 0.82182  [...]
+0.31651 0.46153 0.01709 0.59257 0.4138 0.25156 0.24506 0.70736 0.1505 0.06429 0.56274 0.21469 0.94906 0.50581 0.07146 0.20109 0.57375 0.81049 0.59435 0.75812 0.34151 0.6041 0.53436 0.37583 0.77127 0.95605 0.22353 0.66369 0.01371 0.30125 0.03596 0.91568 0.32062 0.22541 0.78937 0.10099 0.21219 0.75494 0.8292 0.51291 0.95585 0.77207 0.43077 0.64297 0.46346 0.27448 0.90915 0.65812 0.10685 0.32809 0.04955 0.9448 0.75873 0.50257 0.19716 0.38178 0.30067 0.81079 0.75295 0.78649 0.04964 0.4063 0. [...]
+0.49053 0.93086 0.25096 0.36764 0.24272 0.46577 0.96965 0.03667 0.15768 0.81868 0.0777 0.25347 0.94206 0.5643 0.74825 0.0105 0.11773 0.75751 0.67953 0.19256 0.03557 0.79155 0.44299 0.4705 0.7659 0.08042 0.98919 0.17658 0.47054 0.9724 0.74093 0.29414 0.52611 0.0224 0.38065 0.80141 0.87001 0.63735 0.8661 0.33898 0.62029 0.84441 0.58873 0.42164 0.62581 0.7423 0.82743 0.27258 0.85876 0.77727 0.64552 0.04443 0.25856 0.14535 0.62684 0.68406 0.75588 0.20035 0.31661 0.22312 0.22697 0.7988 0.0711 [...]
+0.9518 0.65375 0.87577 0.74819 0.90053 0.70488 0.09632 0.45292 0.47861 0.2882 0.09299 0.9926 0.3555 0.56649 0.89923 0.10964 0.40052 0.8586 0.30543 0.4481 0.81965 0.9049 0.28906 0.3873 0.19676 0.0665 0.54193 0.23636 0.29702 0.0597 0.56756 0.81269 0.96842 0.30815 0.3002 0.87208 0.18931 0.70523 0.79165 0.87248 0.46792 0.37919 0.85295 0.50914 0.88938 0.16174 0.3592 0.12812 0.64334 0.85501 0.92409 0.26468 0.70441 0.50759 0.52845 0.27186 0.0276 0.29913 0.49266 0.42109 0.56852 0.79252 0.91756 0 [...]
+0.626 0.33278 0.56068 0.02858 0.77745 0.33955 0.7124 0.07341 0.33518 0.44754 0.68904 0.44507 0.30347 0.81964 0.40104 0.50465 0.90962 0.5765 0.7983 0.20284 0.56485 0.5311 0.4317 0.73541 0.51455 0.66556 0.35686 0.64134 0.6508 0.1866 0.72329 0.54288 0.65467 0.2318 0.0567 0.94026 0.07991 0.55556 0.56529 0.17033 0.95721 0.64853 0.10538 0.7234 0.77706 0.27455 0.09376 0.59793 0.83003 0.11731 0.99393 0.93398 0.53691 0.41518 0.79311 0.73948 0.67783 0.87028 0.22115 0.49999 0.46834 0.14281 0.81654  [...]
+0.00822 0.30949 0.20235 0.95579 0.50576 0.31207 0.70153 0.52411 0.05524 0.75932 0.95164 0.44959 0.88881 0.25283 0.2325 0.8672 0.95905 0.42271 0.07429 0.08336 0.42689 0.53999 0.44827 0.34722 0.67092 0.37754 0.23371 0.84183 0.73096 0.36503 0.87945 0.35361 0.74245 0.28807 0.63715 0.41508 0.48395 0.4972 0.08887 0.86071 0.52742 0.27595 0.06252 0.12677 0.03803 0.45024 0.87983 0.93735 0.67414 0.88355 0.18029 0.66921 0.85155 0.55568 0.20801 0.17523 0.91199 0.8996 0.98439 0.49802 0.34187 0.71493  [...]
+0.24277 0.80183 0.30739 0.69431 0.94459 0.45887 0.91825 0.36935 0.5606 0.74166 0.88847 0.49235 0.24014 0.97205 0.10543 0.48866 0.51939 0.50612 0.03249 0.88407 0.78386 0.63759 0.32195 0.19954 0.74892 0.26862 0.50639 0.31357 0.35957 0.69947 0.40131 0.80749 0.17167 0.4118 0.28479 0.46955 0.0244 0.92485 0.15714 0.96536 0.50954 0.30795 0.25419 0.92355 0.75546 0.85728 0.84658 0.9377 0.72809 0.36244 0.84648 0.66895 0.14014 0.68341 0.87489 0.00667 0.26899 0.0121 0.87956 0.23853 0.5472 0.19233 0. [...]
+0.90599 0.82317 0.37534 0.16302 0.21763 0.83822 0.80501 0.58379 0.18232 0.7566 0.85387 0.67807 0.6118 0.81822 0.46481 0.65597 0.54369 0.84075 0.15438 0.26938 0.98287 0.96176 0.79441 0.33238 0.79222 0.34854 0.83029 0.98095 0.98418 0.33377 0.20319 0.14588 0.87422 0.22622 0.31682 0.37656 0.02637 0.09927 0.2521 0.73185 0.13682 0.6065 0.45658 0.95486 0.01978 0.41942 0.55154 0.32978 0.88381 0.90847 0.06352 0.53429 0.01218 0.53633 0.9452 0.47357 0.70111 0.95704 0.77084 0.41394 0.88594 0.20104 0 [...]
+0.32833 0.5974 0.15496 0.96903 0.04415 0.4379 0.21615 0.20773 0.74547 0.58677 0.88799 0.1289 0.50355 0.51368 0.44774 0.4458 0.18168 0.62302 0.73965 0.36498 0.57753 0.15709 0.57187 0.56095 0.34144 0.72898 0.3257 0.07847 0.51296 0.36192 0.42149 0.78251 0.29729 0.30566 0.71208 0.74686 0.4552 0.47121 0.9617 0.3471 0.91742 0.60243 0.85589 0.18523 0.02167 0.22535 0.35224 0.26593 0.54055 0.98309 0.9987 0.76617 0.7446 0.89148 0.65802 0.23281 0.11781 0.15963 0.95983 0.47482 0.55353 0.38006 0.6598 [...]
+0.82489 0.47003 0.8285 0.19121 0.44764 0.40199 0.27178 0.73326 0.09293 0.69015 0.76228 0.06185 0.1851 0.28339 0.02511 0.12479 0.78647 0.01294 0.38501 0.60001 0.58988 0.02982 0.52861 0.6932 0.56919 0.486 0.99835 0.43836 0.90069 0.37515 0.66014 0.52515 0.12355 0.89528 0.42788 0.91768 0.35664 0.76937 0.18973 0.06796 0.60015 0.40841 0.53892 0.31349 0.91341 0.8848 0.37654 0.76322 0.54763 0.51462 0.5846 0.09177 0.1827 0.24801 0.69221 0.01336 0.12056 0.67614 0.65029 0.80649 0.76966 0.85265 0.78 [...]
+0.81108 0.04529 0.35891 0.60367 0.44084 0.72118 0.07143 0.89339 0.80729 0.19588 0.26297 0.07632 0.7306 0.22466 0.09705 0.91654 0.42055 0.01126 0.22262 0.74565 0.38121 0.49594 0.11453 0.37749 0.17108 0.78855 0.10032 0.8318 0.59257 0.1592 0.02477 0.71189 0.68323 0.67948 0.41244 0.4459 0.7098 0.9679 0.08155 0.28873 0.89939 0.1677 0.31116 0.47008 0.33911 0.45694 0.04005 0.08538 0.24104 0.65665 0.40253 0.57691 0.94211 0.84062 0.82292 0.88286 0.25611 0.41518 0.13574 0.5089 0.42666 0.92651 0.01 [...]
+0.76927 0.35001 0.6174 0.66072 0.74037 0.73615 0.00381 0.87245 0.73037 0.91006 0.00899 0.40445 0.94141 0.27054 0.90796 0.47587 0.17704 0.48833 0.24222 0.17956 0.69713 0.90712 0.0415 0.01113 0.69338 0.97614 0.65008 0.66928 0.42099 0.60617 0.08311 0.96421 0.15636 0.75559 0.30077 0.47982 0.67608 0.35085 0.79481 0.08832 0.8614 0.68481 0.18772 0.79853 0.982 0.4311 0.8636 0.8451 0.33553 0.78873 0.29781 0.2195 0.16196 0.34831 0.59168 0.35108 0.9462 0.322 0.4864 0.02933 0.39019 0.05041 0.09188 0 [...]
+0.79117 0.32024 0.4533 0.45458 0.74345 0.02872 0.45241 0.08087 0.13706 0.74878 0.51686 0.46288 0.06243 0.53201 0.5628 0.8065 0.00584 0.37467 0.81889 0.8439 0.83295 0.20114 0.92754 0.05707 0.0473 0.0712 0.16133 0.12863 0.65745 0.12804 0.19956 0.35512 0.48226 0.09442 0.13979 0.14721 0.40935 0.94825 0.28044 0.72898 0.70783 0.95296 0.75123 0.60662 0.1129 0.56778 0.9059 0.81472 0.30813 0.69531 0.86244 0.84938 0.89448 0.21006 0.23963 0.28064 0.44664 0.79014 0.5537 0.13904 0.79831 0.08557 0.471 [...]
+0.74191 0.19326 0.01725 0.16238 0.65177 0.9641 0.27347 0.81456 0.04972 0.8329 0.5963 0.31707 0.88221 0.92322 0.30711 0.96638 0.11034 0.16777 0.08285 0.70184 0.88094 0.35102 0.4212 0.70909 0.50658 0.2333 0.50496 0.30437 0.2483 0.07215 0.14717 0.36948 0.4611 0.02004 0.46521 0.39442 0.67921 0.82602 0.49625 0.58644 0.41027 0.05778 0.24292 0.63576 0.09681 0.44709 0.29164 0.3857 0.45459 0.80068 0.78586 0.246 0.08182 0.81546 0.56824 0.41774 0.92389 0.64503 0.60781 0.0962 0.2139 0.47778 0.95803  [...]
+0.22841 0.2799 0.89012 0.87 0.78327 0.01693 0.89263 0.82108 0.07681 0.25731 0.02241 0.41197 0.78901 0.04575 0.78969 0.25984 0.49568 0.0583 0.27578 0.45822 0.44942 0.30436 0.40341 0.09291 0.92003 0.75913 0.71615 0.99125 0.2601 0.40314 0.48066 0.67734 0.52497 0.60541 0.17969 0.47478 0.52881 0.05181 0.70784 0.23555 0.27775 0.2638 0.72726 0.00018 0.48166 0.23358 0.51325 0.49357 0.35443 0.93006 0.58672 0.68778 0.73233 0.97728 0.94884 0.39374 0.19637 0.8188 0.04379 0.6338 0.77585 0.8058 0.5141 [...]
+0.48529 0.84929 0.92258 0.89448 0.15681 0.7349 0.44694 0.86661 0.42408 0.45881 0.49726 0.01951 0.07568 0.11295 0.08694 0.02931 0.36917 0.03541 0.25592 0.44605 0.49266 0.27796 0.16462 0.61039 0.66715 0.61748 0.36752 0.07075 0.5249 0.91628 0.08503 0.39681 0.29424 0.71195 0.43301 0.91604 0.99738 0.2797 0.55182 0.01166 0.47293 0.82201 0.57721 0.92219 0.70592 0.90756 0.33375 0.09387 0.26935 0.28017 0.91396 0.8606 0.45441 0.67513 0.9635 0.69302 0.76451 0.17899 0.62316 0.44925 0.63154 0.26021 0 [...]
+0.34943 0.93277 0.08821 0.98467 0.44573 0.10059 0.65251 0.95184 0.79167 0.34156 0.92869 0.64889 0.28533 0.69107 0.67542 0.70328 0.33049 0.09958 0.10404 0.76159 0.66142 0.00015 0.75539 0.60836 0.23376 0.01596 0.1051 0.37846 0.38808 0.55566 0.76241 0.12614 0.26806 0.07375 0.09891 0.64043 0.30446 0.10102 0.16093 0.25111 0.90467 0.84903 0.46111 0.49484 0.18457 0.76562 0.12045 0.34458 0.50786 0.05238 0.96539 0.04675 0.61095 0.55572 0.53139 0.96927 0.56409 0.44429 0.88744 0.42248 0.76793 0.786 [...]
+0.99346 0.49047 0.34369 0.87644 0.70258 0.84389 0.89749 0.87099 0.53953 0.11491 0.28862 0.91307 0.07524 0.41187 0.98215 0.94934 0.7637 0.74702 0.92505 0.32939 0.49533 0.63948 0.42425 0.44018 0.93046 0.11314 0.45491 0.31873 0.28118 0.12795 0.93387 0.18481 0.11834 0.98531 0.01371 0.96943 0.37136 0.75643 0.15092 0.96234 0.17948 0.49953 0.33335 0.20649 0.56524 0.7123 0.06377 0.30159 0.95905 0.86112 0.10113 0.40284 0.14861 0.42603 0.86883 0.53598 0.10907 0.55928 0.53454 0.3465 0.04704 0.77025 [...]
+0.23116 0.57757 0.76647 0.97824 0.57685 0.29361 0.31178 0.20663 0.09271 0.12102 0.06368 0.15851 0.25808 0.17954 0.0698 0.04577 0.24568 0.93848 0.91292 0.70219 0.6991 0.77293 0.06093 0.12166 0.37477 0.24816 0.11798 0.33549 0.69153 0.4537 0.90589 0.95119 0.7058 0.87751 0.43341 0.13497 0.50336 0.63535 0.93016 0.29991 0.91842 0.14054 0.36451 0.94112 0.39271 0.20128 0.29622 0.51588 0.61246 0.05218 0.24755 0.47114 0.19766 0.38271 0.62116 0.16289 0.37825 0.7454 0.97756 0.34556 0.18395 0.23992 0 [...]
+0.25643 0.97252 0.72571 0.47701 0.41236 0.72441 0.0832 0.41199 0.32612 0.36339 0.01535 0.82908 0.45779 0.99103 0.91364 0.07118 0.04168 0.03024 0.12144 0.74486 0.46764 0.70534 0.47897 0.08438 0.22415 0.47866 0.16661 0.19136 0.67487 0.95073 0.02617 0.10588 0.78153 0.61585 0.46794 0.04085 0.83313 0.06787 0.60068 0.18373 0.88729 0.21494 0.46613 0.57045 0.55035 0.98642 0.77722 0.4099 0.44728 0.74959 0.51168 0.28654 0.09992 0.47738 0.21177 0.39465 0.85667 0.14546 0.59833 0.24537 0.48402 0.4556 [...]
+0.43965 0.98698 0.34801 0.19708 0.95252 0.79463 0.39339 0.57737 0.43702 0.39985 0.06043 0.55493 0.74891 0.99126 0.27334 0.20268 0.28739 0.1565 0.18942 0.26668 0.11688 0.69063 0.41966 0.70247 0.18001 0.40366 0.30819 0.87738 0.09155 0.55308 0.72162 0.39524 0.58253 0.17425 0.06998 0.1192 0.99511 0.19689 0.78728 0.05539 0.76962 0.63841 0.98106 0.0445 0.75993 0.39263 0.07919 0.57133 0.71392 0.09566 0.99606 0.32172 0.72609 0.85125 0.32679 0.74568 0.31503 0.28565 0.77984 0.69859 0.30031 0.45569 [...]
+0.37527 0.56781 0.41595 0.97143 0.25099 0.35308 0.85236 0.83171 0.86086 0.33373 0.31547 0.83642 0.5239 0.41758 0.94752 0.20783 0.91339 0.9966 0.5432 0.83451 0.67712 0.63511 0.67727 0.34049 0.34293 0.886 0.25458 0.9261 0.65073 0.97542 0.62942 0.01661 0.02511 0.10666 0.08112 0.2321 0.35702 0.6828 0.19676 0.74798 0.9626 0.63687 0.68059 0.05578 0.74934 0.75207 0.42911 0.24766 0.56576 0.41836 0.20151 0.18585 0.87917 0.54729 0.59303 0.79422 0.58561 0.65829 0.98278 0.37224 0.92804 0.88381 0.610 [...]
+0.54697 0.15741 0.82484 0.28071 0.55241 0.17001 0.42561 0.69829 0.18801 0.00059 0.32459 0.45984 0.70206 0.05653 0.87187 0.32349 0.6831 0.65068 0.01237 0.46137 0.42738 0.08426 0.16959 0.35983 0.32133 0.7538 0.11388 0.43431 0.41542 0.2729 0.73491 0.30702 0.1435 0.83984 0.04313 0.56234 0.0123 0.11459 0.50946 0.38691 0.07028 0.74588 0.31587 0.81317 0.30104 0.77791 0.46266 0.35221 0.78929 0.75274 0.09484 0.59901 0.40754 0.80924 0.05587 0.13877 0.6591 0.97378 0.22924 0.68009 0.9615 0.59152 0.3 [...]
+0.96779 0.95442 0.35478 0.14625 0.85385 0.00415 0.80486 0.08285 0.34888 0.02421 0.27699 0.12747 0.34899 0.87568 0.60669 0.32577 0.49195 0.74104 0.38007 0.12982 0.68496 0.28613 0.39842 0.4137 0.04724 0.30857 0.52015 0.30039 0.58098 0.81036 0.48911 0.61061 0.06725 0.33986 0.82404 0.4693 0.58278 0.84759 0.01673 0.77312 0.22169 0.39387 0.34064 0.96113 0.99985 0.71217 0.11152 0.98748 0.00278 0.16261 0.07302 0.18844 0.33741 0.04309 0.08511 0.4411 0.84447 0.75285 0.94069 0.60289 0.61282 0.36178 [...]
+0.40693 0.57229 0.38433 0.51843 0.53005 0.70689 0.5926 0.24395 0.81266 0.95174 0.20058 0.41267 0.43713 0.51818 0.43257 0.29866 0.28601 0.73709 0.36667 0.946 0.05163 0.37978 0.42288 0.91504 0.5934 0.33234 0.36739 0.71256 0.22492 0.05688 0.44952 0.90685 0.94901 0.33368 0.65263 0.83075 0.89328 0.02458 0.98831 0.3204 0.25996 0.19263 0.25123 0.32232 0.51933 0.72593 0.37169 0.49426 0.3505 0.15483 0.50956 0.78468 0.94483 0.22307 0.18102 0.40648 0.53132 0.30094 0.73529 0.39001 0.25563 0.60845 0. [...]
+0.60562 0.12801 0.2509 0.16565 0.59364 0.00483 0.2557 0.03989 0.41373 0.58893 0.6419 0.56782 0.81615 0.92546 0.89484 0.71716 0.50263 0.58727 0.41319 0.72539 0.14751 0.17253 0.15075 0.02715 0.16386 0.36411 0.70116 0.15072 0.65627 0.94218 0.60613 0.95106 0.06391 0.40442 0.40234 0.82094 0.35482 0.67047 0.21059 0.98234 0.68659 0.91579 0.28796 0.0294 0.20464 0.58007 0.73426 0.04404 0.7118 0.17746 0.86145 0.59669 0.35148 0.22761 0.26187 0.84411 0.14455 0.59653 0.06949 0.89702 0.77433 0.48703 0 [...]
+0.65635 0.28324 0.89592 0.36602 0.16961 0.39716 0.82711 0.46155 0.5086 0.94552 0.05217 0.70322 0.18729 0.59512 0.51541 0.89596 0.23094 0.0585 0.96279 0.92624 0.13979 0.47513 0.9573 0.32306 0.75171 0.17691 0.50443 0.85804 0.35749 0.18905 0.53971 0.22831 0.26387 0.86236 0.02817 0.37839 0.64917 0.29595 0.72716 0.85952 0.54587 0.59952 0.29973 0.0068 0.88295 0.04314 0.96588 0.07414 0.28854 0.00814 0.58058 0.03359 0.98109 0.41433 0.16727 0.74491 0.15788 0.06946 0.08126 0.05614 0.51468 0.89031  [...]
+0.06796 0.59404 0.27849 0.31113 0.59884 0.61308 0.43627 0.48015 0.11734 0.1705 0.10105 0.91457 0.17997 0.47677 0.08817 0.64346 0.90018 0.10494 0.44653 0.48538 0.50224 0.75386 0.64706 0.18951 0.85108 0.04853 0.01573 0.80959 0.03888 0.61436 0.97479 0.71257 0.35264 0.5161 0.42689 0.64559 0.509 0.79451 0.96081 0.45388 0.31032 0.28939 0.56999 0.07067 0.76001 0.33752 0.54893 0.74333 0.68143 0.11895 0.39351 0.62169 0.70981 0.13557 0.01636 0.81299 0.2728 0.04204 0.54222 0.94621 0.63479 0.2917 0. [...]
+0.30614 0.68691 0.21436 0.91269 0.17191 0.84709 0.20658 0.32464 0.55198 0.38507 0.71143 0.86773 0.59968 0.73895 0.48734 0.22837 0.78865 0.27022 0.13336 0.28797 0.21743 0.32377 0.16004 0.93912 0.86887 0.40906 0.58623 0.88646 0.7203 0.86255 0.22039 0.42135 0.99478 0.21912 0.17564 0.8882 0.79 0.49774 0.72852 0.35078 0.61831 0.28215 0.94927 0.58846 0.75511 0.3217 0.01158 0.11805 0.36602 0.09155 0.08489 0.59351 0.66523 0.12941 0.93795 0.25298 0.54643 0.93176 0.28222 0.18693 0.90078 0.63676 0. [...]
+0.15619 0.21381 0.56353 0.14168 0.46018 0.90743 0.28314 0.72914 0.80839 0.0198 0.87124 0.11137 0.40423 0.97458 0.61522 0.07587 0.98397 0.63992 0.81586 0.90426 0.78334 0.68033 0.26307 0.80987 0.97855 0.76809 0.9591 0.80973 0.92006 0.79566 0.112 0.88639 0.94487 0.12663 0.25291 0.86549 0.15307 0.99316 0.70247 0.1752 0.4325 0.96211 0.42839 0.47015 0.26528 0.93026 0.42716 0.11399 0.12735 0.37802 0.15982 0.43014 0.96517 0.77512 0.09955 0.84606 0.80472 0.22653 0.02652 0.07323 0.62441 0.87931 0. [...]
+0.77802 0.03064 0.93986 0.63537 0.39967 0.12958 0.53411 0.44398 0.74461 0.92712 0.34555 0.2572 0.5872 0.92652 0.78458 0.38558 0.16413 0.8071 0.31298 0.67687 0.44213 0.48779 0.20077 0.54493 0.13127 0.87845 0.12049 0.88431 0.99521 0.7707 0.41443 0.4787 0.68741 0.74482 0.12612 0.33387 0.16601 0.17752 0.84194 0.69402 0.65149 0.97972 0.31966 0.38418 0.00728 0.4318 0.46114 0.1654 0.0037 0.97709 0.71267 0.65772 0.64899 0.36799 0.40741 0.61781 0.86344 0.3606 0.40562 0.92281 0.42992 0.92874 0.241 [...]
+0.24169 0.25664 0.41398 0.8599 0.80397 0.61757 0.62522 0.56268 0.09156 0.12509 0.41364 0.89785 0.28523 0.50079 0.6266 0.93311 0.19362 0.76317 0.40482 0.20116 0.50447 0.50628 0.73138 0.24139 0.25323 0.61631 0.45792 0.21793 0.08422 0.17487 0.46029 0.95476 0.59153 0.87361 0.72775 0.28773 0.29893 0.87407 0.34509 0.98288 0.45445 0.51438 0.30339 0.01447 0.288 0.7278 0.51521 0.98752 0.41295 0.01998 0.55829 0.5101 0.02133 0.53412 0.57664 0.46722 0.91765 0.46839 0.25717 0.3123 0.04946 0.2683 0.10 [...]
+0.55121 0.33939 0.04764 0.44536 0.08769 0.5406 0.24982 0.57342 0.21808 0.50858 0.87087 0.93568 0.84497 0.6949 0.75998 0.69362 0.09587 0.64674 0.08539 0.80729 0.42558 0.25682 0.25653 0.57754 0.01381 0.49 0.64246 0.28211 0.23886 0.95368 0.19402 0.04963 0.66792 0.30173 0.2745 0.06489 0.20112 0.19438 0.0942 0.23166 0.51732 0.58226 0.08887 0.8074 0.42975 0.3217 0.43411 0.29048 0.02058 0.45205 0.838 0.96758 0.82501 0.9717 0.92396 0.64648 0.73524 0.57569 0.24078 0.19075 0.46352 0.74237 0.52274  [...]
+0.54624 0.00514 0.27921 0.50948 0.39874 0.7816 0.83867 0.37464 0.60522 0.94567 0.19774 0.45783 0.76358 0.78752 0.57866 0.49265 0.03747 0.61149 0.38309 0.60952 0.42637 0.35458 0.40356 0.48449 0.01119 0.9728 0.49483 0.62354 0.50356 0.92399 0.79509 0.69089 0.48254 0.92931 0.84328 0.5025 0.40752 0.24076 0.08166 0.62883 0.7256 0.92052 0.9694 0.70016 0.75754 0.5946 0.96002 0.97671 0.13668 0.6632 0.14407 0.37963 0.81233 0.75678 0.64034 0.90774 0.88045 0.96242 0.57133 0.8985 0.40563 0.17911 0.36 [...]
+0.25066 0.81298 0.66348 0.09294 0.20278 0.42309 0.72714 0.1899 0.56189 0.36144 0.29791 0.79351 0.37968 0.13556 0.10844 0.09697 0.86969 0.98872 0.06128 0.08623 0.30724 0.0436 0.0205 0.19856 0.19396 0.24391 0.91993 0.12384 0.13872 0.73215 0.43787 0.50044 0.90873 0.30229 0.12357 0.37175 0.23279 0.39057 0.32642 0.75357 0.27353 0.75534 0.24136 0.78981 0.02274 0.53498 0.5309 0.99556 0.26874 0.58249 0.57349 0.22883 0.68401 0.11383 0.13099 0.38257 0.34803 0.47813 0.84877 0.0289 0.52088 0.12291 0 [...]
+0.97899 0.88192 0.23645 0.02793 0.62166 0.83885 0.52295 0.99896 0.93229 0.78083 0.00899 0.469 0.58773 0.93812 0.3596 0.22514 0.34253 0.04403 0.51394 0.15792 0.77847 0.89603 0.16314 0.67179 0.81685 0.44436 0.09807 0.69288 0.44621 0.2373 0.39111 0.03191 0.83949 0.17011 0.49286 0.43755 0.067 0.72839 0.80027 0.65586 0.78934 0.14108 0.13583 0.38864 0.51545 0.23434 0.84716 0.31445 0.06989 0.71305 0.12277 0.00578 0.36616 0.06694 0.79528 0.561 0.84228 0.5142 0.39501 0.29193 0.21174 0.31188 0.011 [...]
+0.03918 0.64886 0.67888 0.79368 0.07241 0.90724 0.98241 0.62165 0.30711 0.27769 0.39542 0.09351 0.41296 0.93838 0.70538 0.1559 0.47184 0.81898 0.08132 0.32866 0.85291 0.01479 0.00923 0.88807 0.12508 0.92163 0.06293 0.84329 0.93544 0.54665 0.92355 0.63153 0.21864 0.20318 0.76405 0.24547 0.7407 0.17348 0.70591 0.876 0.50336 0.52842 0.94765 0.95581 0.77723 0.05714 0.58996 0.5347 0.51207 0.67434 0.02196 0.44728 0.49952 0.49923 0.56158 0.14551 0.26481 0.54615 0.77386 0.2847 0.61675 0.43552 0. [...]
+0.34791 0.92786 0.26543 0.13685 0.16956 0.20221 0.12901 0.12731 0.48715 0.4595 0.95371 0.42194 0.80085 0.16166 0.62875 0.30454 0.03609 0.95596 0.86685 0.71398 0.26458 0.3595 0.06804 0.48872 0.73297 0.58364 0.09272 0.07241 0.4798 0.17629 0.04643 0.50263 0.76231 0.25936 0.01494 0.01404 0.30998 0.83897 0.89635 0.73924 0.66486 0.77973 0.8598 0.47968 0.28311 0.60784 0.191 0.9373 0.24952 0.36268 0.82833 0.92689 0.72347 0.32886 0.52005 0.71284 0.08271 0.3123 0.34605 0.033 0.56266 0.73198 0.8065 [...]
+0.69546 0.74992 0.66571 0.86486 0.26536 0.92493 0.87418 0.32427 0.37192 0.43229 0.29985 0.64108 0.59678 0.33711 0.6956 0.06893 0.82942 0.81505 0.25172 0.0261 0.60698 0.22763 0.70515 0.54428 0.1128 0.21664 0.88849 0.50661 0.49916 0.60799 0.1128 0.38067 0.42492 0.23447 0.37837 0.83687 0.86951 0.7253 0.01441 0.95586 0.39257 0.55496 0.91196 0.8895 0.69684 0.06913 0.79672 0.28707 0.3597 0.93571 0.68904 0.74515 0.01393 0.26565 0.54717 0.88236 0.98976 0.32279 0.80589 0.24546 0.33974 0.37588 0.0 [...]
+0.26245 0.72191 0.25392 0.66999 0.23388 0.36128 0.49308 0.47966 0.04985 0.56543 0.81706 0.15992 0.91013 0.79571 0.65331 0.06352 0.02419 0.391 0.38764 0.98985 0.22687 0.75242 0.08492 0.20571 0.95252 0.3349 0.16332 0.68221 0.6641 0.68148 0.42345 0.30225 0.74174 0.02621 0.1438 0.01859 0.25419 0.63815 0.41711 0.03091 0.16549 0.52608 0.2216 0.08975 0.8413 0.61611 0.43933 0.1872 0.60505 0.23856 0.37755 0.63274 0.03352 0.22089 0.6653 0.80451 0.56877 0.61439 0.1285 0.39095 0.59304 0.46455 0.6889 [...]
+0.05431 0.80363 0.18484 0.34203 0.60302 0.06496 0.97279 0.11165 0.61382 0.02696 0.8679 0.78482 0.66758 0.62051 0.79588 0.40839 0.74673 0.89071 0.18583 0.29272 0.85397 0.58241 0.67783 0.90111 0.62576 0.91685 0.26782 0.26547 0.25678 0.32685 0.2077 0.89335 0.68025 0.25662 0.25632 0.35091 0.70423 0.14314 0.37554 0.14845 0.19475 0.80711 0.5432 0.82901 0.08807 0.78759 0.44754 0.5635 0.2202 0.4333 0.70052 0.02951 0.89537 0.39822 0.61671 0.69547 0.43013 0.53274 0.65564 0.46666 0.30128 0.08141 0. [...]
+0.11417 0.52875 0.50623 0.03113 0.35935 0.11615 0.84973 0.90368 0.20862 0.02587 0.02351 0.93859 0.72234 0.16276 0.10468 0.21321 0.27275 0.34636 0.02032 0.99118 0.71548 0.33174 0.64334 0.43232 0.72851 0.99035 0.97733 0.4602 0.99763 0.71019 0.75057 0.07255 0.63472 0.63155 0.38606 0.2071 0.06437 0.68298 0.44734 0.36191 0.21138 0.94647 0.96583 0.25999 0.99997 0.20325 0.44279 0.34761 0.49533 0.78633 0.93673 0.98923 0.12125 0.11838 0.968 0.88017 0.86616 0.34195 0.36199 0.61933 0.63566 0.14202  [...]
+0.53658 0.52298 0.1797 0.3176 0.93853 0.54556 0.15372 0.78115 0.10634 0.08359 0.02173 0.63536 0.98172 0.27719 0.27909 0.78726 0.81801 0.86987 0.34616 0.91389 0.86018 0.33411 0.97515 0.84663 0.11671 0.3637 0.53691 0.91977 0.93129 0.34416 0.27367 0.61442 0.50644 0.94389 0.65764 0.84752 0.39586 0.68011 0.90975 0.09159 0.93238 0.46782 0.60246 0.13577 0.24442 0.48433 0.89135 0.41302 0.61171 0.435 0.56481 0.57765 0.0116 0.11949 0.53017 0.69441 0.18768 0.27142 0.82791 0.11499 0.34865 0.98464 0. [...]
+0.34064 0.65173 0.13175 0.87304 0.15357 0.89004 0.86678 0.83576 0.98946 0.48907 0.21302 0.71112 0.98955 0.88577 0.44301 0.2214 0.692 0.65935 0.98342 0.69929 0.07311 0.09692 0.99302 0.72016 0.01137 0.45331 0.72897 0.7728 0.35849 0.47942 0.11955 0.29413 0.73845 0.16023 0.87905 0.23822 0.70711 0.40077 0.68959 0.87931 0.93196 0.73217 0.48912 0.94399 0.08962 0.94994 0.61894 0.83665 0.99882 0.94025 0.27014 0.32764 0.18459 0.72517 0.71597 0.10617 0.27806 0.79369 0.58352 0.81725 0.87567 0.20674  [...]
+0.43697 0.96571 0.60191 0.28078 0.66413 0.9652 0.85352 0.58019 0.66373 0.95369 0.31814 0.3712 0.14133 0.42671 0.80174 0.42527 0.12792 0.91974 0.25883 0.77695 0.65031 0.37002 0.9615 0.22191 0.73142 0.73582 0.22262 0.9225 0.15556 0.57197 0.82653 0.35212 0.73897 0.26585 0.87186 0.06314 0.005 0.15823 0.27985 0.02102 0.03437 0.44838 0.74346 0.98102 0.76471 0.9895 0.65777 0.91378 0.46839 0.49211 0.91504 0.39893 0.50757 0.25604 0.52781 0.31239 0.27976 0.14438 0.24498 0.06336 0.67134 0.16039 0.0 [...]
+0.84504 0.05142 0.20999 0.7735 0.70359 0.23197 0.80834 0.30984 0.29833 0.51186 0.84398 0.90281 0.50353 0.42355 0.07157 0.90729 0.37779 0.43626 0.68354 0.5797 0.53025 0.9914 0.16519 0.05617 0.1191 0.38853 0.60638 0.889 0.77793 0.69804 0.41233 0.88066 0.43584 0.26131 0.84348 0.81953 0.43973 0.35406 0.1338 0.61147 0.64725 0.97914 0.54355 0.17451 0.90836 0.26091 0.15391 0.26819 0.74993 0.08111 0.51674 0.28679 0.83732 0.65976 0.28274 0.68724 0.37147 0.68236 0.70576 0.12439 0.70646 0.31727 0.5 [...]
+0.66293 0.99751 0.25944 0.50585 0.77811 0.67021 0.39613 0.4816 0.93688 0.90171 0.80937 0.19782 0.62105 0.4691 0.28382 0.48539 0.33018 0.5482 0.68589 0.4225 0.43435 0.36209 0.1554 0.09922 0.35519 0.09613 0.08244 0.38107 0.33587 0.21262 0.46178 0.87406 0.98618 0.20941 0.19025 0.37409 0.42443 0.51231 0.93091 0.96776 0.98363 0.53037 0.19476 0.7603 0.92941 0.14798 0.34873 0.31048 0.93962 0.3949 0.85791 0.47544 0.40215 0.93957 0.88692 0.73891 0.14944 0.48454 0.79728 0.72175 0.48014 0.9013 0.37 [...]
+0.8963 0.07986 0.31924 0.97259 0.02951 0.87198 0.85284 0.57276 0.99507 0.77705 0.71443 0.73275 0.03534 0.37276 0.34677 0.19246 0.24569 0.69945 0.74235 0.24334 0.40083 0.9963 0.93497 0.27672 0.71566 0.16919 0.10592 0.69129 0.25204 0.2649 0.39043 0.02493 0.4312 0.23436 0.00018 0.04548 0.4884 0.804 0.82065 0.37265 0.63863 0.84198 0.83192 0.76159 0.30121 0.88565 0.8561 0.68773 0.74384 0.42323 0.29775 0.30239 0.66195 0.9837 0.06729 0.03127 0.45374 0.37447 0.12474 0.93022 0.08834 0.78799 0.776 [...]
+0.32115 0.75533 0.02339 0.52942 0.66952 0.23601 0.56419 0.14569 0.99548 0.03958 0.53386 0.2959 0.55902 0.69433 0.83459 0.741 0.89038 0.89874 0.764 0.6305 0.83998 0.23719 0.26271 0.22219 0.49692 0.37234 0.42785 0.84125 0.59573 0.52704 0.59902 0.45295 0.6399 0.01974 0.9445 0.42323 0.33578 0.07718 0.98972 0.63594 0.5689 0.63964 0.96293 0.22077 0.71077 0.14392 0.65449 0.57011 0.05616 0.18467 0.57008 0.5329 0.6039 0.42739 0.6918 0.87074 0.89687 0.04449 0.67302 0.86024 0.69949 0.95096 0.06611  [...]
+0.45278 0.03457 0.79791 0.57192 0.08899 0.21792 0.50806 0.14049 0.98259 0.87447 0.87852 0.68184 0.82418 0.68367 0.34917 0.24305 0.07303 0.9008 0.33516 0.74098 0.4791 0.05212 0.96944 0.70759 0.88419 0.55341 0.33025 0.44341 0.07124 0.49775 0.24009 0.00265 0.81071 0.20929 0.16695 0.80007 0.03777 0.89426 0.44633 0.32644 0.2274 0.42684 0.2383 0.66535 0.4914 0.51661 0.95871 0.6595 0.00475 0.88713 0.60001 0.72971 0.82871 0.31428 0.19743 0.96566 0.05236 0.26209 0.79429 0.37215 0.13311 0.42013 0. [...]
+0.86113 0.51224 0.33711 0.30095 0.22347 0.609 0.63217 0.61877 0.75253 0.13518 0.06493 0.78235 0.07337 0.3311 0.02033 0.29898 0.51256 0.56486 0.3588 0.80878 0.94343 0.08083 0.63342 0.25584 0.42247 0.95024 0.67895 0.75988 0.98871 0.90593 0.03478 0.49118 0.92228 0.25236 0.72221 0.21483 0.39634 0.36818 0.55103 0.1221 0.28527 0.81754 0.91635 0.09726 0.82184 0.66086 0.66467 0.23134 0.44913 0.63371 0.00524 0.95027 0.18563 0.52939 0.94571 0.15043 0.24534 0.4199 0.72576 0.07518 0.98216 0.94788 0. [...]
+0.90745 0.23023 0.86174 0.77398 0.42907 0.10918 0.13247 0.81198 0.88831 0.04105 0.24561 0.54398 0.43576 0.01462 0.6626 0.0279 0.26443 0.83459 0.32139 0.41696 0.54482 0.62696 0.41297 0.42159 0.23995 0.61043 0.98131 0.3363 0.88708 0.21884 0.90875 0.85939 0.45792 0.53126 0.69975 0.73707 0.61151 0.76415 0.41125 0.32582 0.55871 0.97384 0.7986 0.37021 0.80659 0.7194 0.84215 0.25418 0.12355 0.99018 0.60014 0.22453 0.97507 0.51677 0.80848 0.30162 0.57562 0.82485 0.97096 0.92899 0.3633 0.92709 0. [...]
+0.33495 0.80904 0.10695 0.53382 0.06068 0.04881 0.06635 0.47648 0.25672 0.56834 0.8434 0.30915 0.25197 0.82102 0.06114 0.17892 0.82917 0.04848 0.78093 0.77647 0.88165 0.56629 0.13866 0.71699 0.48543 0.6024 0.67194 0.58116 0.6996 0.32964 0.45495 0.34576 0.1144 0.36246 0.34974 0.01804 0.34091 0.24931 0.7526 0.98401 0.55098 0.17448 0.9583 0.82957 0.43868 0.96741 0.37388 0.88042 0.02053 0.20549 0.00919 0.14864 0.66391 0.51546 0.49751 0.14912 0.72602 0.13843 0.53614 0.50004 0.37073 0.50563 0. [...]
+0.71178 0.36233 0.28833 0.57217 0.15084 0.36164 0.661 0.63824 0.39985 0.76918 0.80449 0.30215 0.89042 0.01926 0.38514 0.67793 0.15082 0.65119 0.67406 0.89928 0.10059 0.85944 0.13899 0.65302 0.70447 0.52315 0.31279 0.06635 0.15772 0.16662 0.40572 0.77771 0.00305 0.0733 0.58973 0.60666 0.94892 0.48862 0.53683 0.38119 0.1441 0.46372 0.38504 0.6941 0.86186 0.4957 0.1969 0.9068 0.78956 0.67538 0.46656 0.63023 0.36789 0.1368 0.7625 0.60177 0.62589 0.95038 0.31565 0.68188 0.11353 0.1554 0.16118 [...]
+0.52987 0.35782 0.3832 0.64405 0.53508 0.10507 0.19429 0.7542 0.21991 0.44519 0.36458 0.42607 0.21187 0.30439 0.4811 0.87628 0.25981 0.42119 0.4274 0.25938 0.3519 0.63292 0.33035 0.80521 0.47727 0.21717 0.31344 0.6798 0.44454 0.33484 0.96434 0.83192 0.63597 0.12553 0.27922 0.77633 0.81176 0.94244 0.86217 0.61816 0.79288 0.45192 0.04035 0.74513 0.84192 0.59592 0.44114 0.8922 0.28342 0.89338 0.34841 0.26332 0.98039 0.73734 0.99623 0.03489 0.59709 0.23412 0.28839 0.0448 0.16202 0.37241 0.49 [...]
+0.45245 0.69794 0.42902 0.02481 0.03004 0.2083 0.26292 0.79823 0.68415 0.39708 0.9942 0.42655 0.94003 0.48203 0.09474 0.84481 0.27093 0.22762 0.16829 0.55526 0.14107 0.1487 0.10985 0.77677 0.50238 0.10422 0.18455 0.81777 0.13821 0.85823 0.46258 0.29697 0.29179 0.82454 0.36094 0.38567 0.25339 0.02822 0.43184 0.88252 0.09671 0.91462 0.08274 0.70623 0.71088 0.99936 0.12531 0.65163 0.87204 0.67259 0.3971 0.26666 0.12352 0.24225 0.72305 0.77951 0.90357 0.39224 0.86004 0.00571 0.26792 0.24526  [...]
+0.55547 0.31079 0.77372 0.56416 9e-05 0.23538 0.04077 0.30273 0.99704 0.90912 0.18383 0.14243 0.91918 0.51763 0.23084 0.4091 0.07929 0.73935 0.77122 0.21322 0.16776 0.42395 0.94614 0.91111 0.26761 0.0106 0.26827 0.95639 0.98075 0.07983 0.87967 0.37587 0.02016 0.63193 0.60434 0.48056 0.01406 0.72863 0.55333 0.91995 0.02041 0.44876 0.30175 0.47942 0.89432 0.03069 0.31698 0.87247 0.41828 0.05675 0.44217 0.17009 0.4147 0.18529 0.50752 0.81436 0.39207 0.57712 0.91774 0.74932 0.80485 0.48382 0 [...]
+0.1162 0.81767 0.75194 0.07701 0.4494 0.50755 0.81073 0.32211 0.45053 0.69911 0.04296 0.89771 0.72747 0.99657 0.3448 0.16435 0.5871 0.56837 0.13103 0.09877 0.14272 0.50068 0.44965 0.89752 0.34558 0.21797 0.63676 0.22207 0.0099 0.35806 0.26098 0.99255 0.24398 0.5732 0.82294 0.5142 0.81077 0.39523 0.12338 0.24852 0.36727 0.70031 0.70737 0.60928 0.91359 0.96938 0.57239 0.02472 0.18734 0.616 0.2601 0.78648 0.30817 0.05822 0.89873 0.90043 0.69311 0.9659 0.01333 0.71848 0.46619 0.37457 0.88541 [...]
+0.35581 0.33298 0.52108 0.73828 0.91388 0.03427 0.02215 0.37338 0.82774 0.70601 0.0187 0.32504 0.48717 0.12967 0.42312 0.77729 0.47688 0.61524 0.03426 0.41546 0.49859 0.89923 0.85007 0.5691 0.36996 0.77645 0.51584 0.47658 0.73167 0.16662 0.5375 0.50993 0.02048 0.09411 0.21426 0.47087 0.48129 0.44847 0.10755 0.43589 0.68073 0.62041 0.83537 0.32373 0.26528 0.39972 0.6151 0.49487 0.69762 0.00546 0.447 0.8462 0.99305 0.45673 0.14087 0.81564 0.5173 0.28031 0.55265 0.98654 0.81867 0.83708 0.75 [...]
+0.52719 0.65518 0.70567 0.10486 0.96984 0.67322 0.35269 0.96215 0.6732 0.73665 0.43895 0.27805 0.30739 0.69906 0.90983 0.0703 0.77975 0.5743 0.92179 0.95369 0.68059 0.51882 0.56277 0.31796 0.61836 0.12077 0.47832 0.24929 0.59882 0.11728 0.94031 0.61221 0.29256 0.60701 0.29444 0.83685 0.42916 0.1554 0.39557 0.9827 0.37733 0.02532 0.3052 0.47644 0.95706 0.74521 0.07123 0.82657 0.92144 0.38307 0.50182 0.56962 0.49438 0.91642 0.70964 0.71715 0.02601 0.71709 0.26649 0.86611 0.0547 0.89622 0.1 [...]
+0.3928 0.48818 0.97522 0.74042 0.129 0.40888 0.65966 0.1458 0.90731 0.10531 0.05934 0.8698 0.13879 0.64793 0.47742 0.31716 0.47669 0.73499 0.46236 0.30382 0.39197 0.09026 0.53663 0.59753 0.78113 0.55454 0.78311 0.74071 0.86149 0.24636 0.87204 0.03008 0.30453 0.14312 0.74426 0.34015 0.72154 0.25892 0.43687 0.71401 0.08226 0.10258 0.2429 0.54192 0.31569 0.21069 0.21736 0.25272 0.67376 0.21561 0.40179 0.49704 0.14715 0.35212 0.96463 0.09287 0.28074 0.31949 0.00615 0.04954 0.40346 0.66764 0. [...]
+0.61736 0.3688 0.84929 0.95399 0.81333 0.57253 0.74042 0.49144 0.24801 0.74048 0.4721 0.57174 0.47098 0.44994 0.03136 0.4259 0.73234 0.06459 0.97348 0.37467 0.80151 0.84767 0.84468 0.48329 0.51363 0.11819 0.84749 0.01781 0.09539 0.86712 0.75071 0.65386 0.22696 0.09279 0.12927 0.57872 0.06818 0.50617 0.45102 0.36084 0.49315 0.23573 0.69299 0.24589 0.51587 0.22073 0.27122 0.90767 0.16218 0.51738 0.80904 0.0515 0.61729 0.4862 0.61001 0.35441 0.86906 0.55281 0.92306 0.11949 0.24201 0.40185 0 [...]
+0.17987 0.00223 0.13801 0.61897 0.79032 0.11773 0.80563 0.94131 0.94682 0.48698 0.8659 0.93505 0.82874 0.51367 0.17221 0.81549 0.97004 0.27226 0.25002 0.33902 0.8964 0.78007 0.19392 0.82379 0.16513 0.50108 0.14869 0.16395 0.78967 0.04954 0.45294 0.01474 0.32429 0.13391 0.26062 0.47404 0.3893 0.14173 0.87168 0.39598 0.50103 0.9876 0.00268 0.51991 0.53534 0.89871 0.80221 0.58952 0.53762 0.11511 0.62098 0.55308 0.24395 0.74445 0.88315 0.78101 0.82799 0.21141 0.23578 0.43267 0.65636 0.17283  [...]
+0.681 0.78829 0.45125 0.87786 0.10136 0.27802 0.99252 0.92421 0.16332 0.67457 0.37751 0.52345 0.65263 0.60686 0.24982 0.59031 0.48459 0.93415 0.40924 0.66088 0.47383 0.70161 0.5059 0.96093 0.32245 0.10274 0.80635 0.54377 0.95203 0.51386 0.32968 0.99347 0.27098 0.39894 0.37284 0.55843 0.39548 0.77099 0.55878 0.51612 0.81891 0.28892 0.40078 0.28413 0.30585 0.43884 0.75725 0.66482 0.87781 0.83177 0.37582 0.40789 0.03059 0.3269 0.55468 0.90992 0.57847 0.74257 0.06951 0.13196 0.28869 0.7908 0 [...]
+0.90236 0.46283 0.68645 0.5146 0.62602 0.57162 0.93704 0.27529 0.44275 0.26751 0.52854 0.15572 0.75209 0.9926 0.15632 0.42915 0.06756 0.80739 0.40834 0.38881 0.49729 0.97677 0.1403 0.97436 0.37392 0.34965 0.70031 0.36033 0.1757 0.86985 0.69234 0.30501 0.58938 0.6108 0.8754 0.49238 0.71673 0.21192 0.66694 0.5113 0.72986 0.78267 0.22962 0.84961 0.72971 0.71394 0.79251 0.86557 0.58376 0.37268 0.93304 0.11857 0.57786 0.37584 0.56614 0.36373 0.12312 0.25471 0.70928 0.02406 0.33334 0.60085 0.3 [...]
+0.26429 0.47109 0.76349 0.13624 0.87773 0.50535 0.80542 0.70173 0.89364 0.61252 0.30198 0.14893 0.23164 0.15832 0.35015 0.60271 0.5149 0.98944 0.79586 0.08232 0.66457 0.51563 0.84301 0.42157 0.81526 0.80172 0.07341 0.17843 0.81668 0.46575 0.60848 0.8504 0.29438 0.72172 0.33428 0.95937 0.32428 0.79633 0.93902 0.82929 0.68288 0.76789 0.93958 0.31289 0.51159 0.81187 0.19258 0.25559 0.73035 0.38115 0.77485 0.63034 0.36527 0.39267 0.86643 0.24647 0.49261 0.77471 0.74457 0.84728 0.54419 0.4392 [...]
+0.89682 0.62979 0.71534 0.77271 0.27038 0.28838 0.32341 0.6862 0.47496 0.88286 0.45639 0.07801 0.17524 0.81261 0.21879 0.58205 0.14958 0.07085 0.8097 0.59776 0.83284 0.33352 0.02835 0.68567 0.00643 0.04718 0.03474 0.80669 0.62043 0.68302 0.3953 0.66301 0.16652 0.06944 0.81359 0.82041 0.48502 0.25433 0.73041 0.63062 0.18665 0.49875 0.46094 0.3422 0.70675 0.20766 0.28084 0.40948 0.70822 0.41226 0.0356 0.6774 0.53678 0.12063 0.57993 0.40569 0.17214 0.47635 0.34437 0.04077 0.06309 0.05612 0. [...]
+0.56176 0.28315 0.02232 0.57649 0.39985 0.36401 0.00525 0.93692 0.03126 0.96401 0.75677 0.13959 0.96833 0.53515 0.25947 0.01474 0.75941 0.34326 0.8597 0.70181 0.32269 0.97522 0.8507 0.4475 0.10553 0.22392 0.97235 0.93593 0.52662 0.53553 0.79872 0.99362 0.83951 0.5917 0.68821 0.29024 0.557 0.95659 0.44628 0.56587 0.10252 0.11728 0.17636 0.37144 0.31531 0.8357 0.03174 0.82959 0.24458 0.77757 0.92951 0.90517 0.05904 0.00221 0.29005 0.48356 0.44699 0.2449 0.27819 0.14977 0.58384 0.9487 0.837 [...]
+0.16873 0.00069 0.07167 0.04323 0.50848 0.37965 0.90768 0.89513 0.42052 0.70273 0.78395 0.48142 0.98124 0.93173 0.35234 0.54436 0.92748 0.66741 0.56914 0.15241 0.25167 0.98049 0.77871 0.13588 0.32317 0.28727 0.37003 0.69639 0.31585 0.9686 0.00765 0.93587 0.74013 0.40443 0.93164 0.06286 0.35945 0.97672 0.02636 0.69941 0.50557 0.92149 0.77655 0.18568 0.42799 0.43786 0.23367 0.30713 0.90482 0.35617 0.1009 0.60274 0.071 0.15101 0.10926 0.33947 0.40895 0.89855 0.43546 0.63944 0.5735 0.83142 0 [...]
+0.24478 0.12465 0.16672 0.45537 0.18261 0.50836 0.67206 0.4951 0.13063 0.02639 0.23251 0.52872 0.50053 0.54369 0.46032 0.68793 0.16345 0.09857 0.7393 0.68352 0.93786 0.22856 0.57228 0.96365 0.61508 0.75285 0.57433 0.09808 0.76296 0.45588 0.98229 0.52372 0.64274 0.42256 0.27259 0.08514 0.286 0.03112 0.27445 0.13553 0.70723 0.95557 0.49622 0.86824 0.68288 0.00204 0.00387 0.21272 0.86749 0.46003 0.89186 0.43693 0.28468 0.60059 0.65553 0.81206 0.27903 0.68913 0.77942 0.28823 0.55576 0.25784  [...]
+0.06977 0.40937 0.75676 0.48657 0.07952 0.47783 0.72918 0.95991 0.18403 0.31539 0.17467 0.01675 0.35777 0.30321 0.22844 0.36735 0.32254 0.94099 0.37818 0.07764 0.85461 0.35985 0.72288 0.73785 0.57249 0.52506 0.49832 0.48886 0.85108 0.54296 0.08339 0.00896 0.56725 0.57576 0.22144 0.94452 0.99406 0.71351 0.2309 0.65373 0.47529 0.61395 0.30968 0.81055 0.18606 0.87734 0.95175 0.31361 0.06786 0.5978 0.49902 0.39194 0.93687 0.50722 0.24544 0.69149 0.37057 0.28279 0.48431 0.54806 0.72296 0.2485 [...]
+0.37541 0.95367 0.00616 0.47931 0.17148 0.92067 0.8465 0.06666 0.37875 0.6561 0.27031 0.9018 0.1742 0.3057 0.06877 0.77229 0.53185 0.05496 0.55927 0.53647 0.63612 0.78039 0.3748 0.69461 0.84161 0.87848 0.00779 0.62621 0.08636 0.9007 0.88013 0.37941 0.16728 0.33906 0.76108 0.64909 0.64325 0.7822 0.13153 0.75873 0.75339 0.47459 0.08841 0.25375 0.72223 0.04325 0.5736 0.35679 0.4025 0.60683 0.43473 0.36632 0.50676 0.62621 0.35653 0.40248 0.97222 0.5682 0.64468 0.52154 0.6902 0.79024 0.89337  [...]
+0.04359 0.65324 0.13755 0.67699 0.74997 0.04482 0.74223 0.28315 0.66268 0.48121 0.77835 0.85433 0.77854 0.84044 0.67148 0.80219 0.96687 0.77539 0.57695 0.58799 0.75855 0.78154 0.3292 0.02923 0.40869 0.18698 0.03542 0.55319 0.44417 0.15418 0.13611 0.02918 0.83655 0.36534 0.32885 0.66506 0.47615 0.20028 0.78869 0.78977 0.10295 0.24892 0.53167 0.86694 0.81721 0.59219 0.03823 0.4988 0.40849 0.22706 0.72817 0.09747 0.08193 0.60012 0.44176 0.4223 0.70421 0.15479 0.12 0.93464 0.19344 0.00939 0. [...]
+0.03165 0.54728 0.4658 0.13827 0.45501 0.58377 0.3066 0.09624 0.74316 0.88878 0.55814 0.75568 0.60641 0.46172 0.45053 0.94726 0.32761 0.16705 0.07041 0.91174 0.42686 0.90438 0.55937 0.48914 0.76041 0.05331 0.34958 0.34654 0.66527 0.80503 0.3397 0.57446 0.81374 0.85653 0.37865 0.16333 0.48394 0.92069 0.44574 0.33747 0.9857 0.69891 0.97052 0.78061 0.64535 0.76167 0.96364 0.64536 0.30799 0.84647 0.31076 0.51142 0.68044 0.72223 0.18804 0.56735 0.55432 0.72536 0.32403 0.33041 0.72444 0.96116  [...]
+0.97698 0.57625 0.48506 0.67219 0.01497 0.69048 0.39251 0.74406 0.72622 0.74981 0.25017 0.43103 0.87197 0.66298 0.48924 0.82588 0.08751 0.60602 0.48112 0.05753 0.48492 0.87685 0.87967 0.40972 0.26981 0.6252 0.85647 0.28232 0.11337 0.14197 0.64247 0.28593 0.95751 0.95941 0.0933 0.67699 0.47214 0.17102 0.70727 0.28762 0.10949 0.31633 0.44754 0.57901 0.41071 0.90212 0.97604 0.2919 0.93475 0.78403 0.54647 0.69447 0.91367 0.29645 0.06859 0.02315 0.37412 0.35368 0.36119 0.56377 0.82384 0.08432 [...]
+0.22778 0.16747 0.19007 0.70961 0.81179 0.02144 0.59843 0.73034 0.49139 0.42023 0.53837 0.18992 0.91065 0.36571 0.10867 0.08349 0.60768 0.92237 0.29341 0.62452 0.97688 0.90008 0.82874 0.68574 0.50538 0.54763 0.14952 0.51647 0.88105 0.49906 0.66268 0.92893 0.1918 0.33502 0.53601 0.86084 0.70809 0.68135 0.92699 0.6185 0.02296 0.45131 0.50496 0.64041 0.42013 0.03975 0.56417 0.49972 0.00467 0.74772 0.34932 0.00841 0.03134 0.35016 0.83748 0.00196 0.24809 0.88886 0.42369 0.53483 0.60732 0.6886 [...]
+0.67693 0.39072 0.98118 0.84675 0.24745 0.9468 0.18906 0.54597 0.65326 0.8451 0.36917 0.06817 0.2157 0.46143 0.52501 0.10458 0.71151 0.31872 0.16176 0.30203 0.82136 0.4053 0.69131 0.98614 0.8297 0.95754 0.68655 0.16671 0.48621 0.48327 0.46297 0.96616 0.35892 0.23581 0.86036 0.98484 0.14377 0.97447 0.75712 0.586 0.69893 0.9619 0.13519 0.46717 0.93028 0.85257 0.16837 0.55796 0.26516 0.98926 0.82865 0.24288 0.37609 0.59705 0.01904 0.54476 0.17529 0.98081 0.84641 0.90822 0.05613 0.07384 0.85 [...]
+0.2595 0.74563 0.56846 0.27073 0.92039 0.11737 0.10484 0.01429 0.63695 0.83625 0.63866 0.76301 0.73472 0.19217 0.38063 0.32841 0.77235 0.10935 0.01738 0.89287 0.376 0.37023 0.10555 0.4031 0.36453 0.69572 0.99202 0.04179 0.49395 0.22233 0.00149 0.84321 0.42939 0.92763 0.71393 0.32894 0.94328 0.98553 0.13171 0.97867 0.85015 0.90503 0.1794 0.40454 0.95424 0.24921 0.80021 0.06507 0.96261 0.05467 0.44282 0.20159 0.47018 0.14333 0.82085 0.47362 0.55088 0.02311 0.15744 0.53379 0.1782 0.8552 0.8 [...]
+0.28894 0.12164 0.80839 0.65398 0.19886 0.18733 0.39635 0.14214 0.86327 0.02831 0.59539 0.52656 0.12464 0.14438 0.58707 0.36493 0.35405 0.78319 0.49653 0.66835 0.39725 0.8219 0.86122 0.74874 0.02791 0.74765 0.39328 0.90294 0.28836 0.85979 0.16282 0.04002 0.8004 0.11417 0.65126 0.48046 0.98652 0.59629 0.72083 0.59684 0.53545 0.91114 0.04272 0.51476 0.38305 0.9293 0.46721 0.79137 0.26617 0.76993 0.65037 0.51543 0.0028 0.72883 0.19252 0.28782 0.35982 0.61055 0.97285 0.66834 0.31078 0.77851  [...]
+0.18316 0.69408 0.94544 0.08874 0.48031 0.33745 0.93479 0.81257 0.80545 0.86529 0.4808 0.21764 0.96193 0.40859 0.86313 0.21469 0.33256 0.04522 0.45234 0.6901 0.53728 0.51812 0.7802 0.19226 0.90234 0.21586 0.03189 0.18313 0.05152 0.82446 0.26878 0.8155 0.4777 0.47889 0.82157 0.00146 0.68056 0.3059 0.07211 0.73194 0.19901 0.79679 0.01663 0.73428 0.90411 0.34729 0.6537 0.65129 0.07728 0.40406 0.67595 0.6578 0.74213 0.04558 0.79862 0.77252 0.35563 0.6787 0.28301 0.97865 0.28899 0.39305 0.395 [...]
+0.44457 0.54171 0.86171 0.71093 0.29892 0.96532 0.94239 0.78089 0.47674 0.05334 0.5979 0.46734 0.30755 0.79083 0.979 0.04871 0.81214 0.12953 0.75184 0.32804 0.54232 0.2621 0.90813 0.27563 0.73997 0.7912 0.31525 0.58514 0.81395 0.58339 0.63678 0.75571 0.82097 0.66265 0.84255 0.46145 0.13415 0.48061 0.39181 0.21616 0.38699 0.22291 0.63248 0.66281 0.71716 0.07972 0.56505 0.84991 0.12594 0.09065 0.9176 0.79857 0.4244 0.70696 0.80107 0.87122 0.37595 0.00776 0.53922 0.67185 0.52395 0.43568 0.9 [...]
+0.73145 0.34225 0.63517 0.69774 0.17224 0.51275 0.86616 0.99741 0.23061 0.85749 0.96529 0.17692 0.82589 0.57209 0.25121 0.38012 0.58382 0.7279 0.30915 0.00421 0.03169 0.97086 0.54469 0.82351 0.74866 0.75459 0.64298 0.66204 0.79373 0.09065 0.78806 0.15645 0.58507 0.72994 0.3228 0.6311 0.73256 0.28846 0.48522 0.17219 0.59683 0.4063 0.96264 0.20309 0.52623 0.9753 0.7164 0.91843 0.83226 0.47238 0.54513 0.85574 0.50423 0.23581 0.56778 0.5179 0.79003 0.5603 0.43426 0.94116 0.71537 0.33485 0.59 [...]
+0.94552 0.34111 0.89572 0.26904 0.46888 0.85093 0.98473 0.15803 0.56666 0.93475 0.50999 0.7002 0.73069 0.51091 0.14812 0.55023 0.19781 0.50738 0.58956 0.86163 0.34378 0.16265 0.63484 0.36007 0.84121 0.0857 0.50688 0.81551 0.93684 0.84633 0.35269 0.19973 0.60828 0.61484 0.66946 0.37323 0.07445 0.07944 0.03698 0.94173 0.55379 0.35982 0.42403 0.46357 0.98181 0.29404 0.5561 0.01044 0.97086 0.60801 0.69895 0.76664 0.67636 0.49782 0.93012 0.15162 0.51398 0.53724 0.38206 0.10783 0.38678 0.01247 [...]
+0.90639 0.64936 0.97615 0.91839 0.22222 0.31617 0.67288 0.45119 0.63313 0.08122 0.86799 0.07932 0.02299 0.57973 0.81619 0.96738 0.87958 0.30599 0.88341 0.05715 0.96296 0.07007 0.84909 0.08489 0.84443 0.7072 0.16028 0.2601 0.5783 0.96586 0.83482 0.87202 0.6251 0.78337 0.18129 0.28969 0.52596 0.91486 0.0307 0.36185 0.93124 0.5716 0.02384 0.57849 0.35748 0.50403 0.66598 0.71499 0.83253 0.3323 0.05262 0.34457 0.87629 0.51586 0.80557 0.83981 0.48914 0.55706 0.80699 0.03371 0.99136 0.71949 0.2 [...]
+0.84683 0.64879 0.84139 0.39985 0.9364 0.77687 0.89299 0.14811 0.02268 0.10756 0.52102 0.25686 0.79562 0.53875 0.00778 0.18158 0.48199 0.61457 0.8686 0.91956 0.96177 0.17495 0.85258 0.47447 0.8179 0.53251 0.04495 0.87929 0.21412 0.2798 0.05324 0.7158 0.62715 0.36892 0.6184 0.33572 0.85884 0.37705 0.33759 0.81843 0.6255 0.35652 0.26979 0.88422 0.43658 0.87885 0.30259 0.39263 0.39878 0.62426 0.04811 0.71037 0.68812 0.58389 0.54079 0.79981 0.29746 0.53702 0.43534 0.47706 0.43988 0.41631 0.5 [...]
+0.62262 0.56872 0.9009 0.69167 0.10194 0.34224 0.72252 0.39661 0.90534 0.98687 0.73293 0.73975 0.11383 0.87497 0.3815 0.75787 0.52705 0.65583 0.32561 0.86431 0.05053 0.62364 0.69131 0.72784 0.23947 0.52049 0.18488 0.93342 0.91814 0.56144 0.14709 0.9689 0.58526 0.19286 0.90793 0.28155 0.367 0.3007 0.73525 0.42282 0.76674 0.24094 0.4549 0.07737 0.38655 0.27762 0.66344 0.94257 0.18537 0.22397 0.22923 0.03005 0.56794 0.30613 0.15498 0.03095 0.9883 0.15999 0.25152 0.68679 0.74805 0.17224 0.23 [...]
+0.66285 0.65838 0.64994 0.98345 0.17286 0.06689 0.31347 0.91258 0.02371 0.30362 0.90672 0.05913 0.31599 0.37545 0.56323 0.6873 0.7089 0.9491 0.26928 0.20835 0.19499 0.57267 0.91779 0.87451 0.53288 0.61609 0.1842 0.73469 0.86081 0.26321 0.88461 0.43225 0.54481 0.81971 0.60824 0.00462 0.0631 0.14953 0.63758 0.36207 0.86381 0.31981 0.02143 0.69493 0.99756 0.39544 0.94541 0.66854 0.83157 0.33831 0.41757 0.11308 0.26039 0.9806 0.16718 0.7987 0.84257 0.62898 0.33768 0.77171 0.00338 0.07964 0.2 [...]
+0.70624 0.62775 0.68373 0.06326 0.55247 0.88132 0.83277 0.52617 0.56593 0.6727 0.65377 0.82394 0.08851 0.96271 0.91013 0.35245 0.81419 0.39864 0.37261 0.69386 0.23584 0.44063 0.78485 0.5558 0.2943 0.55321 0.17619 0.63591 0.65599 0.50875 0.23064 0.58088 0.03078 0.7733 0.6217 0.90127 0.88223 0.95282 0.05587 0.62465 0.68469 0.64035 0.69274 0.46244 0.33934 0.8979 0.97526 0.10932 0.46712 0.07864 0.75064 0.81215 0.9947 0.69751 0.92713 0.87759 0.24985 0.84854 0.78274 0.33678 0.80309 0.74608 0.6 [...]
+0.47722 0.25967 0.22902 0.55732 0.66968 0.00678 0.20499 0.98296 0.53934 0.36757 0.66671 0.3156 0.82923 0.96916 0.89791 0.18994 0.4941 0.93469 0.15355 0.66228 0.62721 0.56284 0.86776 0.97384 0.64581 0.68044 0.85367 0.41398 0.04679 0.05312 0.4056 0.06815 0.14554 0.85884 0.89361 0.67436 0.8015 0.10181 0.42553 0.08278 0.43022 0.19993 0.36988 0.10688 0.1663 0.92907 0.91043 0.60616 0.22276 0.22596 0.72032 0.30383 0.11877 0.88034 0.6863 0.8388 0.60861 0.22497 0.52007 0.49821 0.2507 0.12132 0.51 [...]
+0.72672 0.99658 0.30419 0.14017 0.29501 0.92217 0.0375 0.81959 0.00947 0.72183 0.37731 0.18833 0.7603 0.26254 0.31531 0.33694 0.85164 0.25567 0.78417 0.95244 0.85071 0.12869 0.30344 0.46377 0.14384 0.94528 0.27613 0.62146 0.22243 0.58326 0.12334 0.08691 0.42502 0.39352 0.11856 0.53566 0.17297 0.24082 0.31245 0.65976 0.62698 0.7237 0.82846 0.02375 0.73739 0.27412 0.99049 0.74994 0.74647 0.05954 0.45335 0.32653 0.61743 0.53324 0.19058 0.26866 0.37468 0.83217 0.98237 0.63291 0.25546 0.06517 [...]
+0.61619 0.62921 0.6593 0.0549 0.73956 0.43844 0.89597 0.85184 0.83966 0.34965 0.50039 0.74288 0.00247 0.77171 0.73489 0.00116 0.7361 0.77579 0.62417 0.06319 0.82771 0.52428 0.60449 0.28065 0.02846 0.81395 0.50697 0.29602 0.98553 0.34188 0.73848 0.06272 0.60462 0.29024 0.00614 0.09314 0.05472 0.97103 0.23184 0.55201 0.00115 0.48277 0.43176 0.89746 0.87494 0.40528 0.83632 0.72112 0.59401 0.42243 0.39704 0.98448 0.5071 0.03881 0.55318 0.56586 0.24919 0.4423 0.13065 0.16981 0.67771 0.60666 0 [...]
+0.7759 0.31394 0.56672 0.60115 0.27247 0.22548 0.03445 0.17872 0.45994 0.66567 0.37764 0.84052 0.5306 0.82427 0.01949 0.97246 0.15692 0.92871 0.26773 0.02905 0.97378 0.53944 0.80063 0.48403 0.22188 0.56103 0.00955 0.76596 0.28021 0.1382 0.98743 0.73219 0.4688 0.53824 0.72698 0.66075 0.48161 0.13989 3e-05 0.44666 0.66315 0.05482 0.53274 0.7291 0.69697 0.7968 0.86443 0.72748 0.91766 0.88011 0.06215 0.4659 0.65012 0.25625 0.69653 0.95726 0.51345 0.36144 0.68449 0.33613 0.01211 0.26993 0.100 [...]
+0.95899 0.80046 0.10216 0.63112 0.17692 0.81286 0.33474 0.87197 0.75172 0.77395 0.89206 0.51793 0.20505 0.27252 0.35089 0.00972 0.73592 0.7145 0.37884 0.24151 0.65619 0.98075 0.91549 0.82784 0.48207 0.24491 0.45614 0.37295 0.75361 0.55514 0.12647 0.30445 0.15648 0.60596 0.40698 0.42731 0.41017 0.08243 0.08839 0.80517 0.38855 0.19769 0.14429 0.54092 0.47954 0.20287 0.34152 0.42225 0.06507 0.96408 0.89509 0.53426 0.53869 0.72687 0.61208 0.55819 0.10198 0.1221 0.26713 0.70748 0.87263 0.4866 [...]
+0.97138 0.07938 0.0413 0.05321 0.87269 0.0118 0.9886 0.75172 0.20454 0.70079 0.72452 0.62772 0.24761 0.0682 0.78415 0.4345 0.75643 0.75785 0.61391 0.92457 0.365 0.33214 0.93662 0.71134 0.92882 0.42421 0.73143 0.71683 0.17723 0.67744 0.02856 0.66221 0.96267 0.89715 0.61806 0.80557 0.47199 0.96275 0.56529 0.56845 0.70478 0.94846 0.77453 0.99971 0.05804 0.38812 0.17294 0.88662 0.5557 0.80089 0.21495 0.16815 0.98661 0.35795 0.28759 0.61878 0.52037 0.79836 0.61734 0.16963 0.47596 0.04798 0.31 [...]
+0.75066 0.28695 0.84864 0.12044 0.44423 0.40414 0.76631 0.42516 0.84974 0.43172 0.66253 0.40825 0.1705 0.65917 0.90685 0.82986 0.03768 0.8279 0.2112 0.88331 0.38379 0.94434 0.85648 0.98115 0.99095 0.88446 0.81435 0.87626 0.50718 0.97095 0.23164 0.84895 0.56444 0.21994 0.58996 0.52896 0.22746 0.52527 0.25527 0.44294 0.32172 0.41584 0.70973 0.64415 0.36492 0.22458 0.74018 0.2512 0.33431 0.8809 0.18012 0.01372 0.22753 0.69683 0.8071 0.56352 0.91631 0.94377 0.45778 0.915 0.61313 0.80931 0.50 [...]
+0.52683 0.40169 0.98024 0.67721 0.66012 0.80655 0.00324 0.9419 0.88658 0.31096 0.66549 0.45933 0.6542 0.51935 0.55369 0.75482 0.14318 0.76253 0.22142 0.83843 0.31025 0.51191 0.64416 0.2532 0.8964 0.15293 0.25528 0.79048 0.00265 0.87293 0.38474 0.88138 0.52366 0.90143 0.12525 0.10673 0.87497 0.54698 0.07641 0.38227 0.26121 0.36143 0.77951 0.73081 0.74113 0.70827 0.5045 0.27074 0.92846 0.81756 0.40878 0.72577 0.07444 0.00038 0.59497 0.19608 0.3367 0.27448 0.52947 0.31781 0.90165 0.33455 0. [...]
+0.94316 0.33037 0.82167 0.7071 0.5724 0.29646 0.03515 0.38611 0.84448 0.44902 0.11881 0.9084 0.49566 0.2183 0.81756 0.7426 0.79907 0.85309 0.34335 0.65691 0.98928 0.19797 0.53658 0.91818 0.2324 0.61056 0.84417 0.74002 0.27189 0.77744 0.39028 0.62403 0.74579 0.53406 0.4395 0.66901 0.31233 0.17295 0.9519 0.36791 0.14165 0.95198 0.63486 0.87679 0.72805 0.86915 0.70675 0.1748 0.90787 0.20353 0.56016 0.86489 0.1734 0.51226 0.80038 0.13463 0.25702 0.23545 0.23743 0.39472 0.63804 0.01112 0.0555 [...]
+0.95166 0.65859 0.04473 0.04511 0.95669 0.62684 0.41207 0.68853 0.93893 0.28934 0.44881 0.17427 0.11674 0.53524 0.03938 0.74234 0.35499 0.63227 0.38815 0.5655 0.82873 0.56537 0.96803 0.37849 0.29808 0.16333 0.19786 0.51483 0.96576 0.58803 0.05705 0.4782 0.54588 0.52493 0.58605 0.06916 0.46359 0.47811 0.84842 0.23596 0.83563 0.45848 0.0147 0.65273 0.48781 0.29213 0.76346 0.42346 0.73226 0.47413 0.86291 0.43316 0.15497 0.28279 0.87384 0.07079 0.29048 0.73551 0.5836 0.63793 0.14923 0.4768 0 [...]
+0.14209 0.49628 0.07485 0.35588 0.80042 0.93253 0.23065 0.38362 0.17125 0.57749 0.73636 0.06794 0.32226 0.95421 0.33868 0.99416 0.56647 0.94481 0.99377 0.00279 0.72604 0.2154 0.37132 0.84153 0.7713 0.15799 0.47607 0.776 0.16436 0.11657 0.08636 0.52243 0.84794 0.1393 0.75922 0.67665 0.76023 0.53054 0.68491 0.99895 0.85145 0.60676 0.91273 0.58776 0.03565 0.95577 0.67167 0.55238 0.60401 0.6266 0.12129 0.12676 0.12301 0.45419 0.67672 0.97456 0.35749 0.33835 0.85162 0.13855 0.92009 0.83884 0. [...]
+0.27165 0.33619 0.53435 0.25662 0.30448 0.55222 0.58537 0.35313 0.78356 0.36635 0.87903 0.08276 0.46191 0.81297 0.11545 0.09863 0.93359 0.90925 0.69013 0.17701 0.63065 0.9928 0.78972 0.89256 0.30274 0.72215 0.27534 0.64048 0.5236 0.23936 0.11811 0.25422 0.9722 0.53156 0.61721 0.98216 0.03434 0.28411 0.24409 0.23445 0.33765 0.88887 0.33903 0.73053 0.32119 0.61111 0.04565 0.10871 0.87931 0.58274 0.87508 0.56991 0.68152 0.9983 0.89895 0.00358 0.62439 0.29647 0.4536 0.25915 0.38109 0.29161 0 [...]
+0.53698 0.06903 0.99237 0.31373 0.90987 0.80954 0.7823 0.51269 0.36908 0.42218 0.7995 0.82201 0.69045 0.22417 0.35239 0.10799 0.74406 0.65334 0.968 0.34089 0.13284 0.36719 0.64687 0.05088 0.99705 0.5153 0.2598 0.84277 0.66379 0.44324 0.35234 0.31951 0.46453 0.42762 0.34322 0.22479 0.23721 0.53518 0.09625 0.69475 0.32311 0.53288 0.38091 0.5248 0.5221 0.02644 0.10016 0.21198 0.50544 0.02724 0.76676 0.73951 0.39348 0.77301 0.7844 0.69052 0.96739 0.18138 0.27885 0.31256 0.72094 0.27787 0.613 [...]
+0.61744 0.34602 0.58538 0.33242 0.79745 0.09969 0.15674 0.97445 0.11652 0.31036 0.81416 0.01041 0.46834 0.89256 0.29665 0.24752 0.68149 0.8131 0.59325 0.87158 0.3613 0.34115 0.54304 0.08788 0.63359 0.05474 0.38066 0.24634 0.52328 0.43749 0.05536 0.39782 0.25874 0.35834 0.52641 0.14186 0.52128 0.68798 0.44623 0.7927 0.20318 0.03475 0.12412 0.25231 0.96525 0.47635 0.9262 0.96684 0.81983 0.43729 0.69488 0.01843 0.61917 0.58634 0.89854 0.83075 0.74568 0.8464 0.42833 0.87467 0.55696 0.81993 0 [...]
+0.7714 0.17998 0.43548 0.50654 0.67427 0.11036 0.98798 0.37634 0.32426 0.28298 0.39598 0.49434 0.93289 0.13222 0.39358 0.06809 0.46747 0.09388 0.18047 0.3424 0.3784 0.8049 0.85745 0.09993 0.07098 0.58713 0.88361 0.95419 0.575 0.75831 0.21819 0.96916 0.99782 0.54334 0.08175 0.44751 0.68281 0.61045 0.59187 0.3942 0.14755 0.35762 0.39116 0.04053 0.34537 0.05063 0.84209 0.14219 0.1926 0.0604 0.98559 0.31666 0.03427 0.92232 0.0581 0.60299 0.59969 0.00436 0.76351 0.12765 0.51378 0.74093 0.4536 [...]
+0.83443 0.13143 0.19428 0.63714 0.24234 0.13698 0.23638 0.82097 0.93098 0.28072 0.13287 0.87348 0.77115 0.63985 0.18167 0.50099 0.76581 0.30258 0.92975 0.17426 0.85336 0.82446 0.96927 0.59181 0.21288 0.08734 0.59523 0.4293 0.89776 0.32882 0.52724 0.87034 0.35962 0.22385 0.57101 0.84979 0.70741 0.39365 0.60245 0.63276 0.46852 0.2228 0.26218 0.10266 0.68128 0.99081 0.63701 0.65796 0.65027 0.81965 0.01453 0.87434 0.93197 0.76898 0.53244 0.37799 0.17302 0.11233 0.5468 0.88094 0.20776 0.61791 [...]
+0.57637 0.6924 0.55539 0.8195 0.98889 0.72678 0.96419 0.84895 0.368 0.20435 0.62211 0.27014 0.28741 0.47009 0.83246 0.93328 0.47487 0.49669 0.88623 0.26719 0.60604 0.05437 0.29534 0.33518 0.70848 0.11611 0.19692 0.50953 0.15877 0.18292 0.69916 0.88228 0.7397 0.52161 0.33384 0.87154 0.26066 0.53179 0.68979 0.46743 0.87221 0.84038 0.17862 0.74877 0.77541 0.17019 0.4206 0.4291 0.00288 0.64129 0.37315 0.69789 0.79597 0.35961 0.73766 0.81604 0.83476 0.60223 0.73559 0.3198 0.6612 0.9152 0.6853 [...]
+0.03655 0.7371 0.52007 0.60978 0.67813 0.04027 0.98791 0.26379 0.55875 0.73176 0.18929 0.48023 0.90852 0.02023 0.56967 0.70726 0.15346 0.62253 0.97798 0.43064 0.61247 0.92604 0.79171 0.98982 0.24233 0.70898 0.08004 0.64152 0.47931 0.85966 0.86108 0.16197 0.87255 0.18047 0.22383 0.53971 0.11939 0.54187 0.06942 0.77266 0.77748 0.24744 0.33363 0.49613 0.02231 0.04013 0.22645 0.42252 0.79376 0.36385 0.26668 0.79604 0.95393 0.8146 0.7262 0.82146 0.78432 0.64796 0.58493 0.96706 0.01685 0.64298 [...]
+0.91953 0.5297 0.46138 0.07136 0.05946 0.93829 0.92729 0.12833 0.77207 0.43268 0.21626 0.61978 0.3571 0.00269 0.13417 0.66082 0.40445 0.59021 0.04255 0.78155 0.84413 0.5945 0.94472 0.62895 0.30428 0.6725 0.67954 0.7369 0.45054 0.86106 0.03611 0.7723 0.42662 0.44563 0.65656 0.1479 0.674 0.17858 0.23718 0.48527 0.533 0.44911 0.41774 0.06277 0.16067 0.92244 0.68214 0.1791 0.87968 0.57037 0.09478 0.60339 0.45593 0.11029 0.47186 0.67596 0.79164 0.59329 0.89177 0.86488 0.8986 0.1605 0.69462 0. [...]
+0.46228 0.38928 0.74292 0.1771 0.31132 0.35203 0.41947 0.93848 0.74262 0.25642 0.76443 0.63404 0.73195 0.58275 0.23271 0.76126 0.44802 0.06997 0.72788 0.95666 0.87291 0.99932 0.80693 0.2947 0.83403 0.77318 0.75807 0.21409 0.403 0.11732 0.45565 0.16163 0.70613 0.34963 0.4953 0.95496 0.43055 0.93867 0.64159 0.59268 0.96305 0.1657 0.39104 0.84531 0.37654 0.38918 0.75 0.49492 0.02719 0.64783 0.0676 0.09398 0.10462 0.15549 0.85284 0.532 0.64641 0.11366 0.79988 0.85552 0.66673 0.27076 0.90021  [...]
+0.50504 0.78893 0.16023 0.42305 0.41116 0.37588 0.38792 0.00707 0.49879 0.75395 0.68845 0.74468 0.03201 0.97996 0.73097 0.42513 0.80979 0.0299 0.3881 0.07424 0.87765 0.09237 0.28501 0.34176 0.58513 0.11505 0.77874 0.30583 0.84588 0.47195 0.85815 0.96074 0.69759 0.4216 0.85381 0.86087 0.88606 0.34309 0.7448 0.30931 0.41083 0.6177 0.89048 0.86849 0.8449 0.98806 0.21999 0.63646 0.66404 0.68201 0.87369 0.84447 0.38498 0.75565 0.3367 0.08296 0.33644 0.59221 0.8626 0.7146 0.4224 0.54117 0.3009 [...]
+0.21148 0.15945 0.71988 0.89765 0.49791 0.2545 0.0665 0.38757 0.58916 0.07899 0.75584 0.32136 0.60488 0.29877 0.35474 0.37195 0.51376 0.73286 0.49374 0.74833 0.29655 0.6884 0.49042 0.58255 0.43874 0.92663 0.03006 0.91177 0.00564 0.95128 0.98346 0.3446 0.67128 0.76328 0.61593 0.69903 0.14769 0.27279 0.27023 0.37476 0.50463 0.56123 0.47667 0.80984 0.17469 0.2871 0.71529 0.92279 0.15929 0.48772 0.91069 0.56601 0.53012 0.26914 0.82786 0.25777 0.09139 0.02617 0.8897 0.69701 0.5928 0.80109 0.2 [...]
+0.87829 0.29925 0.11291 0.93421 0.84994 0.40347 0.01175 0.20713 0.36507 0.05672 0.48126 0.3261 0.44701 0.86849 0.76743 0.55431 0.51985 0.29413 0.59227 0.55296 0.76165 0.46799 0.0002 0.1654 0.54325 0.73887 0.94781 0.85613 0.09561 0.60503 0.72157 0.34038 0.07386 0.89482 0.50717 0.25694 0.20811 0.61684 0.47926 0.24137 0.58701 0.84107 0.06761 0.51443 0.2902 0.97067 0.85318 0.94733 0.6665 0.57228 0.26527 0.42252 0.66842 0.15301 0.04714 0.10992 0.22078 0.69717 0.8685 0.70138 0.55578 0.09825 0. [...]
+0.48659 0.96598 0.74839 0.15312 0.31515 0.62548 0.47266 0.95987 0.48056 0.96828 0.75018 0.80906 0.41143 0.08656 0.58575 0.73999 0.15898 0.87073 0.59478 0.19635 0.52232 0.91898 0.20177 0.39569 0.8947 0.02435 0.8228 0.69161 0.29746 0.08681 0.10271 0.48385 0.66923 0.30307 0.94956 0.43892 0.80098 0.76629 0.06137 0.37804 0.71687 0.091 0.21582 0.35167 0.15589 0.99087 0.1349 0.90462 0.10792 0.70214 0.28387 0.22814 0.9816 0.70966 0.67327 0.95328 0.95199 0.67701 0.02134 0.83944 0.72448 0.45963 0. [...]
+0.80647 0.76656 0.56785 0.21044 0.95932 0.20001 0.58347 0.04708 0.29331 0.61981 0.88069 0.40781 0.35578 0.81703 0.91804 0.429 0.88661 0.13774 0.97888 0.74169 0.97548 0.66489 0.88902 0.37712 0.07363 0.94205 0.71605 0.99367 0.71815 0.12281 0.98148 0.19462 0.71753 0.43254 0.25235 0.76594 0.34255 0.10413 0.33996 0.79763 0.00524 0.57697 0.132 0.31982 0.76787 0.49806 0.62437 0.57614 0.29343 0.40965 0.33026 0.71862 0.39082 0.12204 0.02167 0.0271 0.73449 0.16819 0.71839 0.5411 0.85352 0.30529 0. [...]
+0.70944 0.20617 0.83109 0.73644 0.61271 0.71572 0.53752 0.14668 0.00133 0.99868 0.97408 0.81023 0.42868 0.69921 0.66774 0.31069 0.37037 0.19605 0.64056 0.35261 0.6825 0.39856 0.91242 0.3239 0.2618 0.41401 0.70915 0.92899 0.59758 0.42298 0.9323 0.9569 0.21622 0.81264 0.85989 0.33333 0.79667 0.38388 0.07181 0.34668 0.48475 0.20283 0.2014 0.58208 0.29133 0.37353 0.70438 0.25656 0.40295 0.78912 0.0093 0.10776 0.45257 0.69795 0.06083 0.11375 0.86939 0.55569 0.16442 0.80207 0.10233 0.28216 0.0 [...]
+0.73924 0.8271 0.31867 0.1933 0.90208 0.21217 0.95858 0.44288 0.87119 0.5651 0.30957 0.02859 0.85394 0.8722 0.41946 0.59447 0.84599 0.45566 0.06399 0.17224 0.18329 0.72492 0.70057 0.28201 0.04355 0.83252 0.28674 0.71295 0.27802 0.06034 0.57826 0.33718 0.07724 0.0999 0.73895 0.66188 0.55424 0.61581 0.47241 0.06777 0.97444 0.26151 0.75297 0.00334 0.33293 0.34933 0.40181 0.74894 0.06092 0.96132 0.77956 0.89911 0.47624 0.65023 0.1925 0.53562 0.04979 0.15555 0.02886 0.95681 0.02329 0.68237 0. [...]
+0.82724 0.32653 0.48643 0.07912 0.63932 0.61831 0.22528 0.52398 0.25845 0.86451 0.9602 0.88902 0.78718 0.04713 0.74561 0.723 0.79288 0.1274 0.54107 0.44745 0.18961 0.62183 0.48131 0.46927 0.5515 0.57222 0.90839 0.23394 0.53291 0.81777 0.03636 0.17242 0.42345 0.86261 0.08272 0.65431 0.34399 0.17717 0.24833 0.27879 0.20917 0.07133 0.00451 0.80957 0.00723 0.2287 0.37693 0.10676 0.45295 0.81824 0.50165 0.57549 0.34606 0.38337 0.11425 0.22755 0.49754 0.01639 0.02777 0.60614 0.34752 0.24796 0. [...]
+0.9524 0.92 0.19973 0.36717 0.26248 0.02057 0.67794 0.19122 0.57754 0.42556 0.50212 0.93051 0.68515 0.45255 0.24874 0.86969 0.25207 0.35028 0.1302 0.72691 0.12178 0.903 0.32643 0.32863 0.53985 0.56706 0.5226 0.99631 0.1945 0.99085 0.40771 0.19102 0.53187 0.15392 0.59842 0.62642 0.81041 0.06607 0.93378 0.75344 0.10973 0.19223 0.03981 0.78683 0.86868 0.94598 0.32318 0.24738 0.08664 0.28735 0.04171 0.63779 0.67035 0.68155 0.77396 0.31695 0.04799 0.03299 0.17392 0.67004 0.76733 0.28863 0.592 [...]
+0.13547 0.79763 0.47372 0.96148 0.73221 0.34019 0.43882 0.32892 0.20348 0.21424 0.06701 0.65201 0.71789 0.02365 0.81951 0.66646 0.23054 0.80983 0.7545 0.03605 0.17938 0.94679 0.15511 0.70038 0.41055 0.01208 0.20292 0.79748 0.43948 0.17442 0.41992 0.3123 0.90004 0.64456 0.04752 0.86393 0.42612 0.62965 0.26062 0.62566 0.71887 0.19378 0.19441 0.30099 0.0458 0.26445 0.39612 0.49024 0.998 0.68836 0.95686 0.06017 0.13202 0.29348 0.56663 0.1655 0.4557 0.32224 0.68178 0.84715 0.65237 0.33718 0.0 [...]
+0.04293 0.28512 0.41528 0.82787 0.57238 0.4762 0.09148 0.26309 0.21643 0.62811 0.27757 0.59934 0.7952 0.28632 0.66047 0.05887 0.23616 0.08671 0.34557 0.39044 0.48995 0.76214 0.32207 0.29842 0.10922 0.91594 0.62619 0.77556 0.12603 0.15355 0.3105 0.27668 0.24177 0.61712 0.57398 0.63328 0.47524 0.70074 0.13972 0.16287 0.61395 0.64334 0.2334 0.29405 0.83288 0.94718 0.09184 0.04906 0.24938 0.78043 0.01767 0.09028 0.8951 0.17856 0.77616 0.82946 0.4762 0.63722 0.48067 0.14789 0.82136 0.67386 0. [...]
+0.66644 0.94244 0.42269 0.12485 0.31086 0.30733 0.26308 0.52264 0.47809 0.35119 0.18633 0.46704 0.73375 0.58041 0.92056 0.70225 0.28212 0.85603 0.52409 0.10164 0.02038 0.22556 0.13813 0.41449 0.58999 0.07249 0.77517 0.04429 0.96946 0.83939 0.44791 0.61115 0.97087 0.73862 0.05548 0.96634 0.65211 0.94026 0.78484 0.13397 0.14826 0.36261 0.59049 0.82136 0.2872 0.82871 0.78587 0.50859 0.82843 0.20365 0.50852 0.07752 0.567 0.678 0.08602 0.4785 0.37897 0.73547 0.72821 0.5452 0.02014 0.64838 0.2 [...]
+0.36838 0.06332 0.8741 0.78675 0.85136 0.88817 0.05326 0.93339 0.22937 0.55145 0.66142 0.72853 0.71435 0.46147 0.9413 0.12764 0.54476 0.22027 0.29367 0.96157 0.77623 0.37053 0.88184 0.00442 0.10278 0.08157 0.32856 0.40937 0.7017 0.25155 0.40038 0.77979 0.2697 0.15006 0.83389 0.1302 0.61138 0.96897 0.52076 0.85111 0.97446 0.06161 0.20023 0.68057 0.01386 0.99033 0.65801 0.99479 0.89829 0.43478 0.12356 0.7215 0.17689 0.95416 0.42345 0.37246 0.62514 0.83714 0.43796 0.43096 0.94818 0.82894 0. [...]
+0.58928 0.24044 0.56899 0.96654 0.32902 0.35897 0.36282 0.19752 0.38953 0.79212 0.76665 0.32435 0.23712 0.26066 0.96306 0.65236 0.8505 0.78054 0.26169 0.88674 0.50182 0.73641 0.59195 0.77785 0.46362 0.02627 0.61256 0.69671 0.07095 0.29436 0.21799 0.02767 0.83714 0.45874 0.41763 0.78614 0.43654 0.48487 0.14403 0.47231 0.66338 0.94207 0.78201 0.66097 0.11507 0.88885 0.87509 0.50638 0.63931 0.43936 0.26568 0.7518 0.8458 0.66739 0.74425 0.09981 0.52158 0.26776 0.214 0.4094 0.28328 0.03339 0. [...]
+0.27769 0.64366 0.97942 0.80991 0.60863 0.33675 0.29477 0.28791 0.49727 0.1725 0.86657 0.2667 0.49828 0.09863 0.22851 0.00755 0.30884 0.18046 0.96448 0.16349 0.11621 0.48285 0.69121 0.91077 0.62069 0.85682 0.93428 0.91384 0.66435 0.81742 0.41796 0.26632 0.78931 0.42624 0.78628 0.93385 0.89839 0.1927 0.51516 0.97202 0.04071 0.37849 0.0127 0.62096 0.82767 0.65127 0.68532 0.97022 0.65744 0.70547 0.02844 0.48628 0.979 0.65619 0.08066 0.11568 0.10771 0.92392 0.02627 0.39601 0.74033 0.42378 0. [...]
+0.45746 0.96762 0.24201 0.22448 0.33032 0.19939 0.88732 0.09346 0.34713 0.46188 0.99147 0.84564 0.57399 0.58129 0.70436 0.85218 0.79551 0.96319 0.80935 0.69155 0.98886 0.94277 0.58034 0.29108 0.89141 0.4603 0.28378 0.50051 0.55721 0.44545 0.06275 0.53643 0.7118 0.2434 0.44561 0.36758 0.0338 0.60421 0.98097 0.05447 0.67583 0.26747 0.22701 0.53998 0.83059 0.26875 0.25753 0.68429 0.43052 0.26092 0.32561 0.48599 0.62302 0.25353 0.70949 0.2941 0.68922 0.85031 0.29527 0.76308 0.29589 0.62259 0 [...]
+0.64075 0.33662 0.39453 0.12161 0.0135 0.49528 0.7773 0.69992 0.57267 0.57914 0.3498 0.17781 0.95688 0.2718 0.41548 0.79539 0.37317 0.09932 0.85426 0.82263 0.07305 0.73258 0.54454 0.5617 0.46182 0.13356 0.20631 0.38787 0.3175 0.60297 0.43681 0.60394 0.8303 0.72854 0.07447 0.9197 0.74875 0.6984 0.86479 0.82532 0.45601 0.73 0.02386 0.6163 0.11249 0.90861 0.02456 0.68569 0.41166 0.88081 0.05785 0.79121 0.35443 0.26199 0.19805 0.80846 0.47931 0.35871 0.82517 0.08348 0.00784 0.34797 0.15592 0 [...]
+0.61251 0.09751 0.98721 0.29179 0.76233 0.76549 0.04881 0.33459 0.17209 0.76383 0.47091 0.10727 0.45431 0.44236 0.16558 0.80498 0.49557 0.35344 0.76225 0.63627 0.87778 0.43059 0.76944 0.83142 0.45819 0.9331 0.39142 0.47603 0.54541 0.13203 0.13991 0.82257 0.95928 0.25882 0.59246 0.14167 0.57779 0.12636 0.51844 0.0475 0.5347 0.17986 0.38356 0.0208 0.67649 0.80945 0.46281 0.99674 0.51495 0.11732 0.62664 0.88694 0.76257 0.64495 0.90727 0.17567 0.31134 0.06463 0.14264 0.38413 0.79429 0.59798  [...]
+0.60638 0.18101 0.90477 0.97369 0.71852 0.19649 0.36281 0.20381 0.78474 0.15623 0.32924 0.3242 0.96917 0.33794 0.18437 0.14084 0.32587 0.61476 0.37313 0.59787 0.21747 0.63445 0.38893 0.99653 0.6254 0.87193 0.76996 0.41188 0.21254 0.06619 0.99724 0.09634 0.95808 0.08394 0.24573 0.96079 0.35963 0.09767 0.87747 0.55784 0.40183 0.30755 0.63704 0.29101 0.70602 0.0165 0.71249 0.29248 0.08307 0.57023 0.11543 0.47724 0.77548 0.39344 0.98732 0.52431 0.88222 0.57857 0.19752 0.01654 0.96308 0.80188 [...]
+0.68147 0.25771 0.61959 0.31586 0.84539 0.45303 0.43546 0.22659 0.04026 0.39278 0.87987 0.50148 0.33174 0.6841 0.2236 0.73676 0.41746 0.62875 0.40557 0.12578 0.13078 0.59509 0.83923 0.90942 0.21166 0.90026 0.97081 0.69649 0.51149 0.25743 0.86163 0.88791 0.03685 0.05645 0.9092 0.13489 0.58025 0.69327 0.90393 0.21326 0.23103 0.65894 0.65063 0.956 0.56377 0.02853 0.09846 0.85113 0.88649 0.33077 0.93097 0.43486 0.3956 0.94169 0.60956 0.78648 0.76167 0.3129 0.89558 0.39471 0.83414 0.64457 0.3 [...]
+0.25558 0.43293 0.97162 0.73528 0.20669 0.51993 0.154 0.32642 0.50648 0.89647 0.76231 0.93309 0.37314 0.80865 0.36837 0.35186 0.84685 0.44439 0.49356 0.61053 0.70709 0.01612 0.91356 0.47663 0.22316 0.6829 0.48686 0.45037 0.12995 0.93874 0.14584 0.49335 0.63199 0.50248 0.92175 0.69506 0.20436 0.20357 0.84091 0.31674 0.82431 0.80008 0.03496 0.41185 0.56373 0.26255 0.19244 0.2538 0.79143 0.03899 0.52762 0.00948 0.44089 0.83305 0.12619 0.06639 0.05945 0.50716 0.68587 0.36699 0.45244 0.11763  [...]
+0.04632 0.31146 0.80293 0.70011 0.15839 0.33551 0.01332 0.43612 0.84234 0.0521 0.44557 0.67777 0.37245 0.07784 0.79216 0.31128 0.43813 0.01829 0.49165 0.4806 0.5407 0.60516 0.31724 0.3819 0.82574 0.1068 0.99163 0.12783 0.71787 0.97219 0.08311 0.72178 0.67416 0.68253 0.14508 0.08044 0.76967 0.70608 0.28866 0.28717 0.32051 0.82067 0.1218 0.11473 0.40262 0.55438 0.14387 0.76459 0.55633 0.03163 0.54573 0.52811 0.57166 0.24743 0.871 0.49558 0.86949 0.87862 0.59114 0.2289 0.52735 0.60003 0.230 [...]
+0.64927 0.77389 0.31825 0.39038 0.11091 0.51175 0.60107 0.8604 0.60479 0.75476 0.99619 0.43941 0.76346 0.95064 0.11385 0.33252 0.73697 0.28692 0.45877 0.47538 0.51511 0.52705 0.2033 0.26541 0.50337 0.88139 0.97629 0.6353 0.30752 0.58889 0.09186 0.28384 0.32723 0.75031 0.89382 0.21091 0.26708 0.46581 0.78426 0.56789 0.24073 0.55698 0.77288 0.86796 0.62091 0.50927 0.86236 0.26737 0.17333 0.29525 0.1193 0.0214 0.79253 0.51355 0.62216 0.31953 0.84294 0.65563 0.70146 0.6707 0.66585 0.63383 0. [...]
+0.35658 0.59492 0.51064 0.5722 0.92471 0.95734 0.98683 0.94966 0.79544 0.47908 0.62779 0.85924 0.02544 0.83575 0.03602 0.72774 0.75838 0.49221 0.12163 0.15987 0.03993 0.66752 0.1603 0.20282 0.67135 0.61087 0.95646 0.14745 0.64515 0.08007 0.44354 0.88833 0.88248 0.60659 0.18093 0.60781 0.87034 0.87148 0.41186 0.98765 0.65922 0.19449 0.34306 0.84546 0.67711 0.27924 0.85806 0.58138 0.43356 0.86019 0.91163 0.11644 0.94903 0.97912 0.45359 0.88038 0.41609 0.95366 0.03013 0.42189 0.44648 0.1327 [...]
+0.02228 0.07267 0.80534 0.39554 0.79325 0.88651 0.89504 0.97673 0.01203 0.8061 0.31636 0.43061 0.71665 0.20775 0.48228 0.62759 0.00897 0.67526 0.72955 0.7638 0.58069 0.3441 0.97017 0.15971 0.24745 0.89547 0.71932 0.45137 0.41301 0.00613 0.72204 0.95852 0.292 0.21974 0.73505 0.32097 0.41243 0.45996 0.7209 0.24322 0.7412 0.89185 0.34241 0.99683 0.22823 0.75697 0.1485 0.07366 0.27603 0.16082 0.34884 0.0836 0.14298 0.75779 0.44605 0.94335 0.2676 0.82617 0.07062 0.86533 0.25083 0.96266 0.0980 [...]
+0.18249 0.87275 0.91979 0.89774 0.47578 0.15281 0.81266 0.21105 0.24867 0.16889 0.77691 0.62209 0.99111 0.09394 0.02275 0.68217 0.26727 0.74298 0.73443 0.24424 0.11938 0.96796 0.82079 0.25026 0.25897 0.28279 0.01006 0.27444 0.67603 0.56647 0.87311 0.36279 0.55161 0.87755 0.42742 0.46802 0.03392 0.45249 0.72789 0.76996 0.31305 0.53103 0.42166 0.0767 0.06148 0.69377 0.73749 0.61736 0.55362 0.18213 0.85743 0.68874 0.91761 0.7316 0.13396 0.18412 0.6829 0.96515 0.74467 0.32923 0.33621 0.36704 [...]
+0.33676 0.7823 0.51628 0.86911 0.05052 0.7681 0.20946 0.59631 0.25898 0.39168 0.48086 0.56892 0.48152 0.81998 0.27589 0.89872 0.24476 0.52161 0.01161 0.80655 0.41106 0.32324 0.85226 0.93523 0.02145 0.60869 0.6724 0.86969 0.19136 0.28578 0.0219 0.40876 0.03005 0.45707 0.48378 0.23399 0.5184 0.19499 0.77353 0.03626 0.60311 0.01254 0.4842 0.1553 0.60906 0.08682 0.35621 0.03268 0.40594 0.18645 0.63158 0.90213 0.03947 0.34506 0.0373 0.71162 0.51281 0.60837 0.27654 0.69585 0.11549 0.39087 0.73 [...]
+0.44806 0.69626 0.81413 0.01456 0.8527 0.4755 0.72582 0.07067 0.35617 0.27557 0.84887 0.38618 0.14577 0.87768 0.32034 0.22121 0.38796 0.09839 0.17017 0.95486 0.09056 0.96329 0.73469 0.94791 0.27167 0.09503 0.1945 0.63063 0.05496 0.10824 0.55734 0.31804 0.82632 0.65354 0.42433 0.75905 0.07618 0.22784 0.14222 0.10805 0.5959 0.91715 0.77363 0.61055 0.44986 0.80926 0.10267 0.93437 0.63338 0.28163 0.59092 0.84353 0.13474 0.46276 0.47298 0.3654 0.32983 0.47863 0.87934 0.0451 0.70361 0.38815 0. [...]
+0.11457 0.46865 0.75251 0.23569 0.0543 0.05798 0.22972 0.80868 0.66299 0.053 0.27148 0.84787 0.76999 0.82869 0.52427 0.15131 0.29176 0.22285 0.18176 0.44838 0.40282 0.03903 0.19445 0.49749 0.06893 0.08224 0.41241 0.39837 0.25342 0.45351 0.87048 0.82218 0.39558 0.48781 0.5515 0.16706 0.60835 0.67209 0.28042 0.23306 0.72868 0.41473 0.91267 0.32276 0.84158 0.61293 0.22156 0.19896 0.71799 0.21567 0.84177 0.91063 0.88312 0.35912 0.46527 0.56329 0.86684 0.65402 0.79457 0.72325 0.60756 0.34537  [...]
+0.23289 0.47841 0.53577 0.12605 0.58678 0.15585 0.88804 0.23187 0.74644 0.74832 0.60926 0.02371 0.57024 0.49716 0.43169 0.23854 0.05892 0.92353 0.36277 0.74657 0.18051 0.65431 0.76219 0.2383 0.61921 0.31926 0.61404 0.42947 0.73899 0.31984 0.58335 0.59947 0.24581 0.99496 0.53278 0.04642 0.15375 0.80357 0.47942 0.27807 0.36892 0.03234 0.76866 0.19095 0.43515 0.59145 0.71222 0.90546 0.20891 0.23059 0.18036 0.68181 0.94645 0.35771 0.87004 0.26772 0.62462 0.56531 0.70683 0.59839 0.00528 0.402 [...]
+0.18196 0.58216 0.12242 0.00831 0.15024 0.45659 0.28996 0.39005 0.71731 0.5416 0.53022 0.52253 0.04526 0.02431 0.88551 0.61011 0.24834 0.77148 0.04434 0.37732 0.88679 0.26115 0.30148 0.75235 0.79955 0.47069 0.18467 0.7581 0.55985 0.52957 0.74721 0.22663 0.47779 0.40594 0.55534 0.12919 0.94596 0.42516 0.04352 0.20703 0.63452 0.85951 0.64955 0.47347 0.75449 0.23642 0.86092 0.01095 0.67571 0.48084 0.62227 0.0242 0.00358 0.00455 0.21866 0.45195 0.53222 0.22497 0.63691 0.39368 0.6801 0.89285  [...]
+0.00346 0.79683 0.66315 0.21884 0.81387 0.24788 0.1016 0.2684 0.55929 0.38943 0.76333 0.68767 0.81529 0.73356 0.96883 0.71611 0.09812 0.3262 0.24865 0.18006 0.94593 0.61747 0.38131 0.48187 0.13673 0.403 0.88264 0.75629 0.03351 0.57627 0.61988 0.67741 0.97676 0.17141 0.05137 0.04794 0.4034 0.59895 0.97812 0.36281 0.80075 0.50235 0.61643 0.29662 0.17735 0.79843 0.14801 0.86336 0.11124 0.09954 0.79884 0.34135 0.52236 0.23788 0.79407 0.87387 0.43735 0.55489 0.78089 0.45344 0.21787 0.54475 0. [...]
+0.61903 0.52721 0.32746 0.29881 0.65179 0.33998 0.84376 0.05623 0.5642 0.99829 0.33326 0.43865 0.68145 0.10317 0.54321 0.78661 0.8253 0.88379 0.84581 0.84534 0.99705 0.20764 0.85387 0.99923 0.96532 0.20148 0.09108 0.60073 0.2028 0.28812 0.14615 0.55091 0.86268 0.22712 0.29421 0.54662 0.75896 0.93914 0.76339 0.98242 0.57803 0.59264 0.31096 0.5303 0.6553 0.94536 0.3115 0.98052 0.53436 0.68339 0.19194 0.12026 0.04334 0.62391 0.068 0.85379 0.21847 0.55166 0.48489 0.40004 0.36137 0.71065 0.93 [...]
+0.6637 0.86417 0.97405 0.5648 0.20365 0.10937 0.12824 0.27669 0.75465 0.77085 0.85943 0.9426 0.76303 0.31237 0.99898 0.75414 0.67761 0.81475 0.01712 0.44601 0.38417 0.14452 0.70739 0.99304 0.34334 0.45005 0.76805 0.13075 0.6876 0.75958 0.52428 0.72575 0.20716 0.58142 0.21058 0.61172 0.40737 0.89391 0.42186 0.30675 0.17243 0.74564 0.29459 0.86292 0.83104 0.37485 0.76197 0.36461 0.22194 0.83328 0.23737 0.17034 0.222 0.01113 0.83509 0.71885 0.0105 0.85107 0.85862 0.81659 0.89223 0.27661 0.0 [...]
+0.02261 0.30854 0.21846 0.65308 0.07865 0.35784 0.11289 0.36324 0.3802 0.8228 0.3217 0.49579 0.18402 0.76842 0.38401 0.53672 0.36381 0.51657 0.96357 0.40822 0.31767 0.4384 0.66754 0.56051 0.96821 0.8873 0.35056 0.15222 0.38431 0.32298 0.95441 0.17166 0.2604 0.03421 0.62196 0.6002 0.17871 0.31795 0.9727 0.57293 0.10518 0.61855 0.30924 0.2503 0.71472 0.07641 0.79242 0.29748 0.71733 0.89374 0.3123 0.87475 0.1585 0.90377 0.9065 0.97786 0.31932 0.49337 0.98632 0.98661 0.49928 0.64987 0.00761  [...]
+0.36788 0.32617 0.1095 0.79384 0.03391 0.26084 0.87541 0.10134 0.28805 0.55765 0.135 0.5602 0.04375 0.65612 0.52197 0.03935 0.51409 0.76984 0.85463 0.72583 0.19174 0.81627 0.86701 0.16759 0.79988 0.09809 0.1673 0.9743 0.66364 0.82137 0.96016 0.6558 0.23837 0.20157 0.73295 0.9026 0.83528 0.89698 0.931 0.24989 0.14679 0.69051 0.79491 0.3978 0.84803 0.56372 0.62293 0.73634 0.41047 0.92988 0.3797 0.02726 0.4758 0.02727 0.32993 0.96914 0.18511 0.35348 0.62477 0.7018 0.91889 0.22335 0.59355 0. [...]
+0.03362 0.77213 0.63633 0.73906 0.83413 0.38964 0.30597 0.50617 0.38178 0.65112 0.34642 0.04039 0.94808 0.44738 0.39568 0.29064 0.6698 0.00091 0.63078 0.34413 0.33467 0.73202 0.51812 0.07525 0.98222 0.24635 0.39552 0.68505 0.8538 0.11838 0.82121 0.5862 0.89813 0.49749 0.34654 0.85682 0.50344 0.75106 0.10522 0.26224 0.31802 0.40985 0.23806 0.32224 0.76989 0.31603 0.08065 0.14312 0.77386 0.14013 0.26946 0.69285 0.1932 0.09079 0.08254 0.49531 0.16116 0.11916 0.53565 0.95359 0.5781 0.6707 0. [...]
+0.37076 0.42007 0.487 0.57273 0.42823 0.14515 0.5801 0.58405 0.60978 0.69962 0.71579 0.54431 0.01736 0.59278 0.04551 0.87605 0.89781 0.73912 0.67315 0.83964 0.38751 0.42912 0.41806 0.5014 0.05113 0.74961 0.71925 0.3621 0.13306 0.58665 0.72939 0.85723 0.64911 0.40292 0.00772 0.23952 0.74489 0.08747 0.74038 0.53631 0.09534 0.97192 0.59377 0.21294 0.48251 0.25367 0.38502 0.95069 0.64664 0.63912 0.43834 0.49087 0.42072 0.7479 0.06666 0.67133 0.56607 0.6975 0.14418 0.49715 0.20599 0.46459 0.4 [...]
+0.18871 0.70852 0.70517 0.07325 0.74591 0.24189 0.91234 0.68962 0.63374 0.49198 0.13876 0.58465 0.70048 0.43706 0.87863 0.43194 0.67437 0.49532 0.24415 0.66211 0.06047 0.02144 0.18807 0.29505 0.38443 0.3425 0.00128 0.9033 0.80047 0.13865 0.04689 0.52883 0.57248 0.94609 0.86995 0.28301 0.59123 0.68842 0.79704 0.01177 0.02549 0.52415 0.75514 0.90799 0.16698 0.81575 0.74201 0.83631 0.73907 0.0482 0.50688 0.47776 0.8516 0.18239 0.65228 0.67496 0.28059 0.16882 0.5118 0.22439 0.32393 0.166 0.5 [...]
+0.34599 0.35844 0.81349 0.21248 0.96732 0.15808 0.88241 0.22747 0.69687 0.90218 0.96127 0.00169 0.95602 0.49017 0.08546 0.9239 0.24385 0.70445 0.04908 0.18246 0.04946 0.73615 0.49935 0.95974 0.78694 0.73154 0.29566 0.42714 0.35098 0.03764 0.96772 0.34968 0.12535 0.52885 0.30172 0.1868 0.70637 0.48472 0.72656 0.82893 0.09465 0.37762 0.90786 0.38485 0.882 0.93991 0.1377 0.09181 0.94964 0.81512 0.87436 0.22829 0.34675 0.67474 0.56819 0.95208 0.26334 0.47035 0.09248 0.49273 0.96687 0.5224 0. [...]
+0.8463 0.75598 0.62961 0.02189 0.68055 0.88291 0.13471 0.95358 0.98566 0.88258 0.55224 0.38757 0.89381 0.39797 0.51599 0.17189 0.93297 0.72373 0.56111 0.26649 0.74463 0.48356 0.85852 0.87012 0.28986 0.79901 0.4493 0.12987 0.79983 0.2654 0.20102 0.69863 0.54202 0.80912 0.20276 0.49257 0.99558 0.96501 0.10044 0.42244 0.78424 0.31706 0.95299 0.8205 0.48575 0.79548 0.45517 0.74228 0.73648 0.17308 0.07974 0.07383 0.23978 0.32433 0.43635 0.62369 0.24399 0.99024 0.68838 0.17815 0.58575 0.15634  [...]
+0.19102 0.58906 0.84111 0.51124 0.6712 0.00279 0.3741 0.44144 0.48559 0.26413 0.02656 0.1147 0.57039 0.39453 0.38584 0.25023 0.47758 0.03934 0.64733 0.84042 0.32806 0.42908 0.76844 0.78817 0.97106 0.76167 0.40321 0.98547 0.34936 0.5461 0.11637 0.43003 0.70708 0.14881 0.31874 0.89941 0.50093 0.29723 0.24197 0.64187 0.9022 0.21856 0.98082 0.67496 0.16936 0.23478 0.0213 0.39436 0.96564 0.34946 0.96727 0.37069 0.55071 0.18881 0.04322 0.50765 0.31605 0.6665 0.67895 0.91878 0.60776 0.19912 0.8 [...]
+0.23989 0.04904 0.08103 0.09381 0.84704 0.36966 0.7335 0.38367 0.93015 0.0987 0.74229 0.03367 0.50628 0.99959 0.34729 0.57787 0.57015 0.65532 0.32421 0.42951 0.81808 0.00011 0.44458 0.83753 0.61239 0.46708 0.19539 0.06433 0.65175 0.73687 0.68665 0.07426 0.01306 0.1924 0.16369 0.77916 0.91616 0.79153 0.52714 0.4244 0.69634 0.63429 0.30809 0.47186 0.02419 0.18172 0.3707 0.07765 0.88334 0.1586 0.49445 0.00974 0.52733 0.91972 0.01038 0.48084 0.60747 0.89357 0.25175 0.87157 0.38889 0.26592 0. [...]
+0.30858 0.21487 0.36945 0.95628 0.83874 0.5295 0.29623 0.70374 0.67313 0.31231 0.2538 0.74324 0.98364 0.70647 0.80347 0.26278 0.15498 0.13878 0.47884 0.04312 0.15401 0.94127 0.22218 0.98762 0.37559 0.13354 0.74731 0.66552 0.53429 0.90945 0.0138 0.02318 0.75324 0.37816 0.44886 0.65738 0.67146 0.98738 0.56609 0.45845 0.11788 0.70018 0.26262 0.62162 0.62295 0.11231 0.86413 0.58673 0.46256 0.97863 0.6162 0.79025 0.95315 0.5419 0.38206 0.97199 0.42797 0.16029 0.22842 0.97015 0.99965 0.22429 0 [...]
+0.55777 0.15426 0.82769 0.54797 0.04376 0.1718 0.51995 0.29681 0.37565 0.7321 0.50246 0.78965 0.64332 0.56311 0.18605 0.93994 0.1652 0.4274 0.04219 0.37601 0.00334 0.95338 0.98675 0.7806 0.06122 0.46461 0.27813 0.14037 0.7844 0.59786 0.24274 0.62361 0.70209 0.45763 0.63614 0.4212 0.52548 0.89051 0.34917 0.66349 0.73785 0.14888 0.68939 0.73291 0.51021 0.90544 0.80781 0.54998 0.33725 0.00408 0.09808 0.64501 0.2297 0.67785 0.13468 0.59993 0.15004 0.03825 0.73312 0.6657 0.32564 0.44607 0.225 [...]
+0.0453 0.31282 0.38839 0.15296 0.54199 0.71796 0.56749 0.55863 0.35873 0.20815 0.24912 0.70422 0.04667 0.85557 0.2166 0.84108 0.36842 0.63556 0.82144 0.02828 0.99621 0.45573 0.8419 0.15806 0.29864 0.5413 0.54375 0.2844 0.36903 0.53036 0.9023 0.28083 0.17689 0.07716 0.7531 0.54767 0.51332 0.42542 0.36646 0.73449 0.65209 0.66441 0.44174 0.52215 0.0247 0.86006 0.37729 0.54821 0.60852 0.58636 0.31765 0.59547 0.88078 0.3084 0.32663 0.01685 0.70494 0.34698 0.13309 0.87706 0.83321 0.06903 0.028 [...]
+0.72731 0.4456 0.38292 0.00235 0.0505 0.09384 0.8772 0.83214 0.0409 0.32908 0.01391 0.0037 0.09757 0.34432 0.63786 0.90236 0.51426 0.60466 0.54654 0.51019 0.22171 0.82429 0.02984 0.62901 0.94132 0.44631 0.21398 0.48135 0.14005 0.84034 0.17477 0.64847 0.76407 0.80488 0.98818 0.30849 0.11583 0.42667 0.52346 0.92794 0.35273 0.37947 0.67403 0.35373 0.23706 0.26687 0.42374 0.58292 0.8633 0.75626 0.411 0.6172 0.02091 0.40997 0.40451 0.26277 0.26179 0.46581 0.23237 0.49073 0.90114 0.14412 0.850 [...]
+0.59176 0.64362 0.00971 0.78549 0.19119 0.7072 0.96973 0.38264 0.52218 0.01686 0.88995 0.72245 0.49607 0.8717 0.97275 0.0475 0.71273 0.041 0.26402 0.16771 0.70899 0.37404 0.2587 0.44158 0.93671 0.71692 0.46621 0.49854 0.88392 0.08125 0.57852 0.3014 0.38934 0.10454 0.6426 0.05897 0.19493 0.90384 0.8983 0.16735 0.5228 0.85092 0.95313 0.44631 0.17169 0.45857 0.08932 0.93432 0.40073 0.64753 0.9374 0.2335 0.12978 0.19756 0.23353 0.97751 0.70261 0.61918 0.53983 0.72798 0.32754 0.57488 0.753 0. [...]
+0.02118 0.30779 0.74819 0.40435 0.89065 0.16701 0.98016 0.64809 0.21948 0.11408 0.18059 0.05513 0.38392 0.88116 0.59506 0.61834 0.5912 0.29972 0.89359 0.00089 0.91141 0.74221 0.0477 0.4459 0.78551 0.98066 0.6149 0.53376 0.30093 0.91692 0.92172 0.46843 0.2045 0.8966 0.16621 0.10704 0.35931 0.13181 0.14247 0.79695 0.65361 0.3965 0.71686 0.86677 0.34941 0.60453 0.04017 0.10283 0.15742 0.89998 0.91946 0.57425 0.48696 0.1059 0.55064 0.78713 0.52428 0.00797 0.91132 0.42988 0.91754 0.46022 0.13 [...]
+0.33199 0.80711 0.90999 0.13228 0.36328 0.44595 0.64201 0.90853 0.30698 0.6181 0.94529 0.71759 0.28397 0.98946 0.47253 0.09111 0.42047 0.84502 0.82334 0.20007 0.83393 0.59237 0.68026 0.43352 0.46617 0.97768 0.5828 0.916 0.38007 0.5555 0.53132 0.12915 0.83087 0.93212 0.47045 0.39258 0.55463 0.10678 0.80819 0.86821 0.74963 0.78821 0.75016 0.19144 0.40809 0.1838 0.39223 0.73684 0.74793 0.65072 0.50892 0.83918 0.32382 0.37395 0.33141 0.28114 0.53176 0.62229 0.30612 0.2459 0.69295 0.08103 0.6 [...]
+0.98865 0.52058 0.12336 0.7574 0.44066 0.29584 0.50581 0.80033 0.24761 0.6745 0.52086 0.9042 0.56323 0.1479 0.49824 0.20043 0.7886 0.85591 0.14502 0.52539 0.78298 0.80991 0.91091 0.98854 0.84479 0.13173 0.55877 0.22352 0.89518 0.37896 0.23185 0.20612 0.54126 0.86102 0.76932 0.08334 0.29914 0.86189 0.01155 0.99166 0.70739 0.52048 0.33026 0.12803 0.80007 0.27105 0.54899 0.33585 0.8312 0.41528 0.00527 0.23168 0.53724 0.0058 0.60759 0.74056 0.6973 0.04652 0.68425 0.74981 0.72969 0.63283 0.48 [...]
+0.08862 0.11805 0.67001 0.74426 0.07364 0.68168 0.67918 0.83471 0.06207 0.4337 0.69448 0.64247 0.24072 0.67357 0.2982 0.88914 0.10609 0.35534 0.61535 0.38855 0.10408 0.6201 0.20503 0.76438 0.75791 0.43939 0.63469 0.55031 0.68621 0.70686 0.97922 0.07506 0.64899 0.34986 0.74556 0.94493 0.02417 0.73466 0.51652 0.05509 0.36195 0.50176 0.99699 0.04617 0.65955 0.51939 0.69656 0.26522 0.40247 0.00114 0.89831 0.67303 0.93049 0.63292 0.72753 0.89787 0.09995 0.21482 0.59139 0.23264 0.92924 0.45522 [...]
+0.27372 0.26424 0.53488 0.30254 0.89178 0.41571 0.75603 0.27837 0.69454 0.29691 0.33192 0.72423 0.84528 0.31913 0.08401 0.35334 0.65012 0.36115 0.1045 0.39255 0.00261 0.7337 0.39595 0.56538 0.59187 0.72834 0.41122 0.60689 0.62198 0.53284 0.6751 0.47253 0.27989 0.59673 0.67947 0.02433 0.88569 0.24216 0.42557 0.00842 0.18576 0.32994 0.61229 0.31444 0.00272 0.8625 0.34885 0.28715 0.39519 0.73659 0.88971 0.13153 0.33694 0.59473 0.42364 0.46502 0.11766 0.05727 0.78433 0.34799 0.15021 0.63882  [...]
+0.28823 0.45424 0.14038 0.71869 0.88503 0.87176 0.32272 0.63884 0.89657 0.01677 0.70815 0.91788 0.84501 0.28469 0.6202 0.00653 0.26761 0.11508 0.63349 0.33407 0.55602 0.31707 0.51139 0.20161 0.10545 0.49502 0.9528 0.1204 0.45904 0.41484 0.75301 0.08291 0.3135 0.144 0.49356 0.73365 0.64721 0.39716 0.27639 0.51363 0.89832 0.95039 0.72617 0.98553 0.65515 0.219 0.12177 0.63386 0.67572 0.36362 0.9984 0.86048 0.18271 0.38767 0.13058 0.01936 0.43348 0.77087 0.10762 0.52935 0.31233 0.88074 0.713 [...]
+0.39127 0.89094 0.32699 0.35725 0.42756 0.58599 0.7162 0.36913 0.67402 0.28811 0.32842 0.54076 0.83116 0.50903 0.42368 0.03101 0.23818 0.92197 0.14102 0.22778 0.97633 0.6299 0.75684 0.60081 0.11342 0.05837 0.74616 0.20643 0.46925 0.22763 0.09707 0.18753 0.38665 0.41509 0.64195 0.90947 0.93022 0.87232 0.88215 0.78612 0.29944 0.32797 0.77961 0.2231 0.00579 0.82881 0.42285 0.71241 0.63705 0.03227 0.02543 0.37849 0.95035 0.35507 0.60971 0.23225 0.21333 0.90284 0.57535 0.33134 0.47981 0.66785 [...]
+0.16056 0.75418 0.10768 0.68097 0.5972 0.28344 0.96141 0.81926 0.46871 0.18097 0.68838 0.00802 0.95738 0.8992 0.05522 0.22977 0.6421 0.80457 0.28441 0.73464 0.03141 0.92529 0.40899 0.61586 0.15876 0.00382 0.43644 0.85962 0.73653 0.31404 0.78692 0.83347 0.86771 0.75178 0.96611 0.74617 0.81198 0.55587 0.17774 0.57553 0.33671 0.02621 0.58303 0.47279 0.44428 0.76583 0.04415 0.67136 0.42891 0.30655 0.95438 0.63515 0.6008 0.72051 0.66967 0.07709 0.55254 0.5257 0.32811 0.90666 0.7446 0.88735 0. [...]
+0.28984 0.92513 0.47669 0.26356 0.70645 0.79728 0.1513 0.63668 0.49022 0.37846 0.44273 0.20037 0.27686 0.88399 0.30725 0.99189 0.27378 0.40891 0.05107 0.79925 0.41117 0.92714 0.36544 0.02976 0.79526 0.33226 0.58821 0.80825 0.7117 0.93048 0.24284 0.41307 0.37966 0.75487 0.5427 0.52693 0.54146 0.50335 0.40855 0.49002 0.946 0.62605 0.34671 0.70539 0.4882 0.1181 0.70444 0.33632 0.59918 0.54773 0.61236 0.55698 0.44571 0.13706 0.84458 0.19325 0.00974 0.77078 0.40368 0.12621 0.63467 0.08794 0.1 [...]
+0.00889 0.30488 0.77981 0.58075 0.94032 0.76664 0.78916 0.99276 0.24558 0.49506 0.3357 0.82153 0.41498 0.59333 0.80628 0.74407 0.5747 0.32157 0.89347 0.78911 0.01004 0.1049 0.66553 0.07983 0.34113 0.67277 0.48741 0.61922 0.14833 0.48315 0.56359 0.47609 0.30312 0.02389 0.3113 0.00456 0.66115 0.58167 0.46801 0.58153 0.33575 0.8227 0.51918 0.63385 0.03084 0.25938 0.91985 0.67941 0.83929 0.80266 0.32042 0.64575 0.9181 0.88401 0.62478 0.77132 0.91334 0.67287 0.32153 0.81216 0.36239 0.03696 0. [...]
+0.18405 0.0847 0.05262 0.46116 0.57875 0.15766 0.62644 0.52412 0.97023 0.60112 0.46507 0.61009 0.54546 0.94884 0.84337 0.22553 0.95534 0.93797 0.33614 0.43995 0.8285 0.943 0.16951 0.35585 0.85288 0.80347 0.94892 0.15121 0.15598 0.04935 0.65485 0.88794 0.88787 0.46014 0.0098 0.74764 0.8027 0.73799 0.6313 0.71763 0.4849 0.89019 0.53181 0.61981 0.92191 0.03634 0.78231 0.96226 0.8149 0.55635 0.2616 0.50962 0.42276 0.6835 0.93607 0.62577 0.07158 0.52389 0.95495 0.44377 0.42886 0.04486 0.80966 [...]
+0.88583 0.80614 0.0463 0.71176 0.09406 0.59781 0.52637 0.12662 0.75866 0.70537 0.87716 0.68023 0.87374 0.66993 0.55855 0.3538 0.70913 0.17316 0.77082 0.3357 0.15972 0.37926 0.99433 0.21172 0.1079 0.8534 0.95846 0.95028 0.23405 0.08332 0.22529 0.23148 0.88906 0.60082 0.09735 0.78571 0.14855 0.33778 0.51879 0.84528 0.33317 0.64929 0.42832 0.95091 0.01566 0.47462 0.18209 0.41637 0.0357 0.95573 0.61554 0.52076 0.6726 0.19464 0.71826 0.37277 0.55465 0.97705 0.8838 0.45808 0.17643 0.3079 0.553 [...]
+0.1484 0.701 0.73562 0.16862 0.15084 0.15004 0.06965 0.36511 0.13825 0.196 0.93108 0.99659 0.74277 0.18181 0.39931 0.36598 0.95092 0.32853 0.3431 0.73162 0.01631 0.83984 0.18107 0.62837 0.64815 0.13342 0.13466 0.2031 0.76268 0.09006 0.44824 0.94704 0.84858 0.32416 0.35042 0.38637 0.49175 0.39983 0.09554 0.16726 0.43573 0.4289 0.25107 0.02952 0.00895 0.00075 0.29385 0.53801 0.25332 0.13156 0.41905 0.74023 0.4159 0.71401 0.13601 0.41831 0.55248 0.61461 0.869 0.06841 0.63824 0.65496 0.86859 [...]
+0.49097 0.92718 0.06054 0.27967 0.15576 0.59477 0.89905 0.78132 0.03269 0.15942 0.75177 0.2068 0.11651 0.22207 0.67068 0.51471 0.19631 0.31908 0.993 0.46904 0.60436 0.81742 0.8219 0.93514 0.38267 0.36561 0.64334 0.33796 0.36571 0.8669 0.48641 0.27445 0.42325 0.73109 0.23854 0.79228 0.533 0.54183 0.47874 0.83892 0.85302 0.65071 0.77102 0.84089 0.96726 0.17704 0.36737 0.46764 0.70143 0.68927 0.98714 0.28628 0.72559 0.98791 0.47092 0.55482 0.5021 0.16047 0.58157 0.45815 0.38135 0.4398 0.786 [...]
+0.18033 0.79168 0.42996 0.57363 0.15933 0.63101 0.3403 0.70145 0.64658 0.10262 0.84145 0.41222 0.69765 0.89559 0.6347 0.52805 0.93962 0.65411 0.35048 0.53729 0.85756 0.35109 0.72609 0.86766 0.95292 0.64122 0.84676 0.3882 0.41158 0.57315 0.87266 0.43233 0.5014 0.5266 0.20801 0.33996 0.33636 0.76172 0.67164 0.55176 0.86639 0.25754 0.33566 0.14899 0.05554 0.25546 0.42246 0.45871 0.20508 0.18636 0.34621 0.60861 0.95738 0.87049 0.33235 0.81001 0.51992 0.12122 0.33232 0.30099 0.62914 0.00117 0 [...]
+0.56947 0.78782 0.75658 0.86153 0.77706 0.36908 0.40703 0.05508 0.15603 0.97321 0.74139 0.86883 0.65547 0.31544 0.77209 0.81978 0.70405 0.37265 0.49122 0.93462 0.06584 0.65468 0.53105 0.88598 0.02087 0.87751 0.97403 0.04596 0.60082 0.2917 0.42714 0.05633 0.88407 0.81681 0.15382 0.44614 0.34381 0.46123 0.67038 0.57537 0.99066 0.05185 0.66949 0.32931 0.91271 0.05527 0.19991 0.66457 0.135 0.97827 0.85124 0.05602 0.2964 0.7279 0.0019 0.77907 0.80015 0.5313 0.66238 0.66714 0.29939 0.13189 0.0 [...]
+0.00249 0.3314 0.34233 0.33143 0.32184 0.91857 0.84542 0.49718 0.58067 0.12783 0.4736 0.88517 0.36034 0.72933 0.70071 0.89802 0.34863 0.70549 0.48071 0.13106 0.25655 0.11977 0.15 0.96079 0.14397 0.79932 0.53515 0.99047 0.17677 0.81931 0.7824 0.36554 0.32914 0.45344 0.76221 0.08055 0.29076 0.85943 0.89409 0.23959 0.30115 0.98505 0.54859 0.08709 0.84278 0.43546 0.13956 0.83368 0.33373 0.05532 0.42671 0.17473 0.65359 0.11871 0.25938 0.16843 0.15085 0.30246 0.76449 0.36909 0.29115 0.46398 0. [...]
+0.33882 0.23561 0.19131 0.62787 0.62433 0.4794 0.3274 0.46628 0.63901 0.93663 0.61478 0.8861 0.58135 0.06947 0.18129 0.80357 0.48034 0.63666 0.35628 0.38466 0.87658 0.57785 0.19392 0.34534 0.22303 0.28478 0.18577 0.54345 0.38737 0.19023 0.5751 0.87936 0.23331 0.58192 0.31912 0.18327 0.27614 0.83536 0.28566 0.09655 0.51403 0.18569 0.41471 0.65002 0.20524 0.35238 0.09048 0.09748 0.7663 0.48813 0.51709 0.9632 0.0058 0.77277 0.27473 0.34067 0.68766 0.33561 0.00247 0.81244 0.31601 0.75877 0.2 [...]
+0.36993 0.97222 0.25911 0.31096 0.50929 0.77082 0.4923 0.73348 0.86492 0.29606 0.14482 0.33258 0.10586 0.89766 0.82488 0.66321 0.09401 0.23292 0.66598 0.94337 0.30977 0.15727 0.51491 0.86553 0.31304 0.80295 0.86574 0.62708 0.85883 0.46769 0.97036 0.8673 0.43669 0.40486 0.45278 0.17495 0.33633 0.10656 0.79254 0.05422 0.33782 0.99819 0.86113 0.98788 0.05474 0.85069 0.57155 0.65116 0.71406 0.64466 0.05744 0.5472 0.47174 0.9634 0.31931 0.72078 0.47232 0.56036 0.18045 0.15907 0.76565 0.44771  [...]
+0.17907 0.48335 0.26858 0.25468 0.67427 0.34676 0.54188 0.54453 0.52799 0.83111 0.61132 0.9097 0.68203 0.70366 0.02137 0.14524 0.76562 0.78799 0.7755 0.27733 0.87563 0.73129 0.57081 0.37096 0.87576 0.75611 0.65144 0.73966 0.47864 0.00065 0.00778 0.85071 0.26745 0.28727 0.89209 0.04772 0.69639 0.53544 0.06283 0.96341 0.97509 0.07171 0.60403 0.77947 0.64667 0.82562 0.01629 0.19685 0.72757 0.73712 0.62201 0.81768 0.77229 0.89432 0.32104 0.02396 0.97 0.96964 0.62824 0.03715 0.24342 0.84762 0 [...]
+0.83577 0.58262 0.9399 0.40401 0.67728 0.35374 0.21595 0.55052 0.87883 0.09617 0.2114 0.77392 0.74305 0.41188 0.65989 0.04158 0.3579 0.46244 0.11795 0.26997 0.1055 0.8631 0.36305 0.70961 0.89328 0.46603 0.3481 0.81281 0.76559 0.45221 0.28434 0.22763 0.64481 0.67153 0.49132 0.69277 0.25064 0.38717 0.67394 0.6966 0.74849 0.61241 0.62868 0.09542 0.78213 0.83565 0.82515 0.18331 0.95763 0.57054 0.90524 0.33504 0.49919 0.57947 0.34677 0.75337 0.8612 0.46007 0.98266 0.31423 0.25765 0.3946 0.268 [...]
+0.43834 0.71962 0.60957 0.56028 0.55884 0.97946 0.48557 0.48104 0.50188 0.72211 0.32489 0.31743 0.73665 0.78697 0.03508 0.40826 0.13235 0.13585 0.23199 0.29476 0.69371 0.19194 0.20315 0.32401 0.07558 0.34556 0.09664 0.3621 0.11756 0.12196 0.13513 0.22014 0.35619 0.40911 0.9531 0.90067 0.56074 0.98053 0.97136 0.28012 0.31594 0.84532 0.73939 0.41841 0.67101 0.91411 0.94097 0.50737 0.6692 0.53358 0.61436 0.79371 0.92381 0.24873 0.19596 0.9218 0.75949 0.31607 0.91012 0.52437 0.87799 0.23502  [...]
+0.9523 0.12535 0.40004 0.79697 0.07022 0.39474 0.8552 0.68715 0.15549 0.63318 0.91249 0.47118 0.70188 0.00901 0.97768 0.57737 0.27367 0.47092 0.88517 0.1456 0.54667 0.94485 0.91906 0.99756 0.91582 0.525 0.73226 0.14726 0.56238 0.15291 0.87568 0.55717 0.50963 0.46824 0.95579 0.2578 0.74262 0.42495 0.69536 0.55589 0.80258 0.97582 0.48615 0.35382 0.01162 0.70994 0.59319 0.584 0.43777 0.10471 0.93068 0.38287 0.89508 0.60593 0.4494 0.36724 0.33364 0.23584 0.7911 0.05599 0.83 0.13395 0.74242 0 [...]
+0.28808 0.69658 0.48852 0.01887 0.26698 0.50532 0.01929 0.16437 0.09371 0.19342 0.33042 0.23547 0.74863 0.95765 0.27219 0.8856 0.20705 0.96867 0.17334 0.51035 0.26556 0.89234 0.28115 0.12693 0.58702 0.37918 0.75385 0.10215 0.45249 0.75587 0.93119 0.81565 0.18598 0.01977 0.22098 0.91322 0.97727 0.9865 0.38822 0.75656 0.30585 0.26523 0.17646 0.04175 0.72473 0.02244 0.36893 0.8518 0.31481 0.9281 0.96066 0.68621 0.53591 0.39509 0.42631 0.08909 0.46254 0.22674 0.1811 0.0974 0.96364 0.67434 0. [...]
+0.3273 0.74567 0.44921 0.41751 0.50384 0.41986 0.89827 0.61027 0.35535 0.45469 0.97942 0.56434 0.71656 0.14091 0.62921 0.45311 0.86738 0.37617 0.43489 0.08371 0.13108 0.68725 0.44495 0.49765 0.86171 0.14665 0.63094 0.2842 0.203 0.89439 0.12308 0.88798 0.67915 0.96382 0.42757 0.38485 0.44128 0.66272 0.76025 0.61447 0.10625 0.26525 0.42918 0.50394 0.19802 0.21776 0.63284 0.70735 0.63356 0.79876 0.99785 0.38779 0.83007 0.99542 0.57397 0.58674 0.41347 0.4687 0.66571 0.87226 0.38137 0.87057 0 [...]
+0.73705 0.40411 0.13809 0.02476 0.91323 0.09871 0.91184 0.09686 0.94208 0.40785 0.51018 0.97541 0.1301 0.28757 0.49387 0.67543 0.1134 0.54351 0.336 0.45132 0.10694 0.54223 0.76357 0.95393 0.18923 0.20594 0.37197 0.64229 0.4133 0.01329 0.1863 0.10311 0.92641 0.47211 0.05583 0.43604 0.48985 0.3737 0.15236 0.75494 0.88056 0.66395 0.03262 0.15931 0.33993 0.14356 0.10898 0.39356 0.27841 0.60838 0.80916 0.75603 0.30437 0.615 0.07664 0.85118 0.60691 0.3119 0.11467 0.69422 0.03487 0.01918 0.4804 [...]
+0.4581 0.99127 0.15772 0.95857 0.36119 0.32896 0.44877 0.25291 0.11436 0.21126 0.61651 0.06303 0.5996 0.93818 0.1976 0.02051 0.26453 0.70848 0.29609 0.9417 0.2971 0.59331 0.85792 0.62884 0.86655 0.29659 0.0565 0.84074 0.991 0.89667 0.70607 0.76544 0.27619 0.87554 0.93407 0.9089 0.78132 0.90047 0.76511 0.1453 0.26318 0.08062 0.57275 0.03823 0.91981 0.66575 0.40201 0.57129 0.85253 0.14022 0.35129 0.08181 0.40713 0.54252 0.26378 0.86779 0.69626 0.17268 0.71812 0.52369 0.78928 0.58475 0.2871 [...]
+0.06949 0.92909 0.67772 0.90403 0.28648 0.43304 0.48706 0.63085 0.48744 0.78974 0.97259 0.40937 0.31487 0.48799 0.3831 0.44037 0.03621 0.62001 0.55272 0.98955 0.14681 0.47606 0.18943 0.59924 0.78728 0.85039 0.28304 0.81363 0.9356 0.15415 0.74667 0.02985 0.60258 0.9052 0.41847 0.34773 0.83622 0.57489 0.21462 0.40787 0.00597 0.33978 0.23413 0.97304 0.48626 0.50683 0.36058 0.03015 0.58608 0.01214 0.86448 0.31619 0.73391 0.25055 0.79169 0.38515 0.48197 0.39995 0.10383 0.40496 0.24017 0.61636 [...]
+0.14185 0.80965 0.95789 0.47589 0.1372 0.51799 0.23026 0.8216 0.07635 0.97705 0.15342 0.83395 0.74504 0.54547 0.64147 0.76347 0.34126 0.38971 0.52885 0.12739 0.4288 0.84874 0.1744 0.05005 0.12832 0.78779 0.7794 0.13228 0.21198 0.96533 0.84861 0.31558 0.08952 0.27142 0.17646 0.08669 0.67592 0.67479 0.15131 0.93615 0.27585 0.36694 0.76339 0.54386 0.97525 0.33344 0.40219 0.22218 0.19386 0.55178 0.00627 0.5762 0.90626 0.09377 0.18075 0.60554 0.85946 0.51229 0.07194 0.83603 0.84855 0.50072 0. [...]
+0.35518 0.23634 0.51988 0.87165 0.78379 0.97665 0.69089 0.70422 0.20238 0.16454 0.53464 0.10919 0.54806 0.95369 0.24597 0.19896 0.66525 0.26277 0.94625 0.57922 0.19491 0.1081 0.21105 0.29287 0.2924 0.86427 0.41872 0.51729 0.6874 0.92617 0.13393 0.05882 0.75286 0.50414 0.43271 0.77527 0.13255 0.79754 0.35259 0.5902 0.0449 0.89382 0.71834 0.17355 0.50114 0.92283 0.90919 0.67351 0.82318 0.55588 0.13113 0.88229 0.24031 0.9516 0.85566 0.86624 0.14851 0.37791 0.49193 0.63 0.23545 0.3219 0.9931 [...]
+0.83386 0.76883 0.56479 0.15958 0.99461 0.64179 0.60128 0.25754 0.48499 0.21477 0.21203 0.99834 0.67542 0.09469 0.38606 0.48155 0.72611 0.55157 0.44468 0.47292 0.09262 0.30873 0.13902 0.54851 0.10404 0.12214 0.98402 0.78711 0.65491 0.0178 0.5089 0.45249 0.87553 0.20475 0.54219 0.97123 0.28076 0.62848 0.69662 0.24122 0.27243 0.81608 0.56259 0.52747 0.2914 0.86312 0.98578 0.48771 0.53078 0.32469 0.67192 0.94633 0.22773 0.83938 0.001 0.13085 0.96293 0.01736 0.58079 0.24314 0.78074 0.59171 0 [...]
+0.86448 0.10481 0.65164 0.6374 0.68247 0.74084 0.92292 0.80298 0.48521 0.49883 0.9316 0.87583 0.36716 0.8584 0.04285 0.93958 0.43937 0.33895 0.48548 0.82152 0.63747 0.9998 0.52727 0.90071 0.01841 0.7421 0.84905 0.66886 0.49842 0.31449 0.77664 0.97709 0.08925 0.08546 0.03662 0.13005 0.01456 0.45565 0.28135 0.30221 0.48502 0.66969 0.63493 0.79819 0.55578 0.85202 0.25878 0.90484 0.34013 0.81779 0.17367 0.61364 0.37063 0.02366 0.53171 0.82229 0.60964 0.18956 0.55043 0.37346 0.17583 0.08805 0 [...]
+0.47332 0.85032 0.6262 0.94723 0.45314 0.96216 0.06723 0.78984 0.0148 0.60108 0.20294 0.49822 0.32869 0.11349 0.55409 0.48389 0.51853 0.31799 0.15581 0.76593 0.28866 0.50414 0.45032 0.49725 0.67862 0.72695 0.5945 0.62912 0.2341 0.16464 0.20891 0.32208 0.95436 0.81894 0.80781 0.43541 0.58278 0.26577 0.75792 0.09418 0.57943 0.27001 0.56092 0.86448 0.59755 0.49223 0.53348 0.45429 0.63698 0.00661 0.56106 0.87368 0.3134 0.95744 0.30976 0.62676 0.19052 0.02661 0.21846 0.7016 0.52054 0.58986 0. [...]
+0.40631 0.75171 0.23446 0.0961 0.44503 0.61151 0.0796 0.83414 0.71958 0.73884 0.46454 0.95698 0.69621 0.50838 0.73796 0.89779 0.4496 0.84665 0.94346 0.40685 0.03517 0.47894 0.0882 0.12651 0.60162 0.44489 0.48872 0.41749 0.91595 0.93569 0.61864 0.92285 0.44157 0.44593 0.94046 0.11095 0.75671 0.5092 0.95064 0.19548 0.16578 0.6437 0.41217 0.8805 0.18716 0.18831 0.96756 0.94086 0.58957 0.06116 0.81338 0.44327 0.93642 0.69002 0.88436 0.47767 0.52576 0.30022 0.05453 0.77932 0.62615 0.80404 0.7 [...]
+0.675 0.83928 0.99319 0.9597 0.63701 0.27487 0.45838 0.25019 0.69465 0.86684 0.58956 0.20176 0.94578 0.32057 0.78721 0.23967 0.76187 0.75488 0.71693 0.45401 0.11421 0.80115 0.4516 0.56029 0.976 0.87179 0.81647 0.95251 0.79856 0.38564 0.44392 0.90131 0.10595 0.52075 0.86367 0.19872 0.40186 0.95393 0.73951 0.4131 0.12435 0.8697 0.04629 0.9493 0.67303 0.2102 0.67878 0.93523 0.47563 0.12377 0.99622 0.42869 0.73196 0.2814 0.56663 0.8499 0.07298 0.59776 0.52282 0.40839 0.84161 0.4584 0.96915 0 [...]
+0.5348 0.17492 0.25712 0.25125 0.8518 0.55494 0.86328 0.30611 0.89033 0.51887 0.33169 0.61068 0.79582 0.01597 0.72594 0.5905 0.19068 0.58524 0.95833 0.87766 0.69521 0.18901 0.85981 0.44974 0.07072 0.48137 0.14631 0.44734 0.34892 0.97579 0.39925 0.60814 0.18081 0.41847 0.02598 0.43457 0.0708 0.13236 0.71645 0.29235 0.9221 0.5849 0.36471 0.12697 0.33827 0.18319 0.14927 0.69659 0.08869 0.24563 0.09162 0.38467 0.12055 0.40206 0.45854 0.48291 0.79125 0.00559 0.00786 0.14463 0.06053 0.74095 0. [...]
+0.1672 0.19216 0.81105 0.00126 0.39191 0.80236 0.86634 0.63801 0.88579 0.41532 0.14679 0.94706 0.95442 0.11068 0.28084 0.74434 0.42787 0.40257 0.57456 0.09002 0.26822 0.18678 0.28806 0.44499 0.88739 0.39487 0.76593 0.53265 0.99312 0.29367 0.07088 0.34832 0.67078 0.69205 0.4509 0.23696 0.8594 0.92385 0.25349 0.40104 0.85333 0.59685 0.22025 0.21931 0.69146 0.94134 0.73623 0.57599 0.29354 0.66973 0.29697 0.3183 0.00585 0.7654 0.2235 0.84579 0.53892 0.23533 0.50093 0.11916 0.73986 0.29907 0. [...]
+0.07244 0.49068 0.54505 0.24867 0.35718 0.93226 0.2941 0.90117 0.26698 0.50926 0.00618 0.65902 0.46861 0.21145 0.69316 0.31684 0.13438 0.77593 0.75771 0.17045 0.19879 0.26662 0.4204 0.75668 0.47183 0.84749 0.713 0.34195 0.66012 0.132 0.35403 0.02483 0.22106 0.06361 0.18513 0.81108 0.37078 0.96734 0.60364 0.64483 0.80013 0.17464 0.79456 0.86672 0.61897 0.01655 0.32708 0.45784 0.86797 0.50273 0.53358 0.07199 0.93036 0.17535 0.81605 0.66828 0.75073 0.16388 0.41055 0.75785 0.43071 0.46301 0. [...]
+0.50758 0.02679 0.78952 0.92415 0.77021 0.03145 0.44748 0.41894 0.06652 0.65057 0.26427 0.2205 0.52547 0.57771 0.01958 0.99434 0.67924 0.74832 0.03028 0.81502 0.76822 0.22017 0.11622 0.9717 0.90377 0.43961 0.44976 0.43294 0.99615 0.49034 0.8745 0.02527 0.36398 0.05125 0.11984 0.0928 0.62526 0.41685 0.99761 0.89992 0.16096 0.18856 0.66157 0.22198 0.33948 0.66487 0.59368 0.89864 0.38687 0.36474 0.93338 0.00049 0.82957 0.4029 0.92093 0.12167 0.92863 0.35123 0.81865 0.79925 0.76216 0.81637 0 [...]
+0.46695 0.56532 0.63519 0.86252 0.38508 0.47233 0.09364 0.03032 0.72585 0.02891 0.85141 0.04592 0.07182 0.61915 0.37279 0.50077 0.2424 0.10016 0.84006 0.31554 0.76127 0.21726 0.97095 0.73896 0.84149 0.51727 0.27562 0.65407 0.74911 0.68821 0.20496 0.96712 0.66863 0.79575 0.25101 0.80064 0.73055 0.5067 0.85525 0.467 0.9067 0.05118 0.88643 0.94288 0.98279 0.29085 0.21235 0.1347 0.05427 0.84643 0.417 0.19413 0.45006 0.63959 0.48405 0.11355 0.69446 0.58392 0.76976 0.10021 0.55203 0.63355 0.17 [...]
+0.9519 0.60984 0.80801 0.02844 0.17682 0.34397 0.97539 0.06919 0.94143 0.99888 0.23549 0.04037 0.55684 0.90659 0.11476 0.91533 0.95684 0.3563 0.64102 0.32832 0.14079 0.02829 0.09555 0.16976 0.26173 0.14658 0.04668 0.6332 0.0285 0.57147 0.04503 0.41098 0.23433 0.9906 0.23997 0.94122 0.98711 0.41797 0.24988 0.56043 0.58072 0.66785 0.20963 0.587 0.69106 0.07251 0.18752 0.46096 0.97848 0.22731 0.27335 0.49999 0.95094 0.06163 0.99599 0.37765 0.59078 0.86345 0.25388 0.0478 0.3651 0.66349 0.252 [...]
+0.46953 0.32387 0.25954 0.16628 0.88343 0.91543 0.92008 0.75423 0.8979 0.04277 0.26171 0.56769 0.987 0.28475 0.50805 0.84253 0.68236 0.47585 0.25795 0.51058 0.19275 0.66252 0.9281 0.63572 0.97972 0.59366 0.91862 0.65996 0.46175 0.61396 0.78425 0.0903 0.16327 0.99543 0.92929 0.43505 0.86305 0.96633 0.57494 0.1811 0.80655 0.62796 0.15515 0.19785 0.03312 0.38411 0.90617 0.53063 0.47692 0.53155 0.69851 0.43517 0.3647 0.68804 0.71545 0.34043 0.28646 0.8197 0.14333 0.96065 0.45085 0.13608 0.38 [...]
+0.40213 0.88236 0.80623 0.35045 0.26191 0.98543 0.67521 0.3087 0.26784 0.66062 0.94497 0.26877 0.69362 0.59072 0.06627 0.82318 0.77615 0.67806 0.3207 0.13423 0.58026 0.66383 0.5799 0.48726 0.33969 0.94051 0.9188 0.71755 0.56867 0.47539 0.68409 0.60286 0.19329 0.44145 0.57614 0.34344 0.14571 0.52508 0.7923 0.33665 0.20064 0.57051 0.48695 0.16603 0.23563 0.6456 0.20315 0.28479 0.2076 0.44269 0.86515 0.78817 0.03158 0.67916 0.61161 0.25757 0.45156 0.90936 0.5754 0.19705 0.11865 0.07431 0.91 [...]
+0.90687 0.40077 0.75645 0.48786 0.82323 0.44878 0.9879 0.42482 0.16447 0.77143 0.91309 0.18003 0.67909 0.01617 0.47544 0.94834 0.28697 0.18061 0.70635 0.76082 0.67297 0.23792 0.82347 0.80668 0.03212 0.55774 0.90385 0.13711 0.48786 0.33994 0.69836 0.52539 0.11379 0.57701 0.90309 0.43021 0.22263 0.63989 0.04885 0.33703 0.3707 0.99566 0.37087 0.36182 0.76345 0.79035 0.16411 0.45366 0.47013 0.26983 0.71497 0.79096 0.89389 0.2891 0.27302 0.21123 0.73138 0.32217 0.67862 0.43491 0.18199 0.25085 [...]
+0.40332 0.16834 0.73607 0.23733 0.37983 0.73463 0.977 0.88285 0.33749 0.40111 0.66541 0.6114 0.52157 0.49069 0.65111 0.28975 0.59884 0.11091 0.29302 0.18399 0.33409 0.08625 0.94463 0.78578 0.52247 0.91843 0.779 0.39644 0.87604 0.95972 0.0746 0.07664 0.95737 0.16832 0.43429 0.36708 0.63544 0.16645 0.66574 0.12611 0.96502 0.41089 0.49422 0.39148 0.29157 0.55521 0.34462 0.12747 0.40372 0.37669 0.15719 0.87304 0.26999 0.63069 0.71409 0.69776 0.76752 0.27434 0.71076 0.90315 0.84513 0.33861 0. [...]
+0.88122 0.79476 0.62086 0.83609 0.06513 0.50149 0.31641 0.12016 0.46549 0.08501 0.92387 0.8675 0.13411 0.61746 0.54571 0.45694 0.59491 0.21551 0.44308 0.37471 0.98095 0.84321 0.85779 0.80297 0.16838 0.71331 0.56411 0.28646 0.85883 0.28261 0.08334 0.58549 0.23067 0.67242 0.5164 0.95049 0.9528 0.3388 0.68167 0.40459 0.15742 0.50435 0.36399 0.46841 0.82196 0.02739 0.32708 0.34081 0.07688 0.68224 0.90624 0.15696 0.90418 0.99922 0.92609 0.37971 0.95467 0.02974 0.08066 0.44392 0.15877 0.44711  [...]
+0.42884 0.44146 0.91379 0.58062 0.04584 0.11654 0.20243 0.09615 0.01124 0.30844 0.02826 0.93377 0.76717 0.03858 0.21364 0.2872 0.44479 0.68084 0.50913 0.39253 0.34846 0.00636 0.332 0.52286 0.69557 0.83067 0.44485 0.32712 0.51417 0.50775 0.62837 0.9448 0.08001 0.48457 0.15803 0.2531 0.63958 0.02513 0.98474 0.59624 0.06566 0.22505 0.61535 0.07921 0.33862 0.5196 0.75451 0.60266 0.71934 0.01245 0.41138 0.47693 0.10899 0.44598 0.17916 0.44673 0.60404 0.31941 0.43445 0.64422 0.68633 0.67198 0. [...]
+0.30876 0.82845 0.24825 0.07156 0.35377 0.48866 0.95212 0.02008 0.34563 0.81036 0.94321 0.251 0.28227 0.49372 0.24635 0.65857 0.38668 0.2159 0.77935 0.21256 0.59928 0.28208 0.82342 0.96983 0.48456 0.93321 0.20639 0.45734 0.25957 0.239 0.24952 0.88832 0.32178 0.3709 0.92035 0.98831 0.29889 0.90292 0.25941 0.92916 0.48127 0.96992 0.89114 0.16464 0.23717 0.65866 0.54317 0.14525 0.27841 0.28269 0.50279 0.20179 0.46681 0.74958 0.38964 0.74536 0.71186 0.26743 0.43254 0.16446 0.02442 0.41508 0. [...]
+0.97263 0.56083 0.72605 0.79696 0.65958 0.47283 0.41549 0.34839 0.59908 0.43955 0.67124 0.50363 0.05841 0.14546 0.12261 0.84412 0.88969 0.39219 0.72051 0.21427 0.7145 0.28523 0.31247 0.35942 0.25123 0.61954 0.56067 0.46809 0.7922 0.41942 0.44977 0.5968 0.41771 0.38017 0.23143 0.96548 0.01644 0.29608 0.78859 0.97415 0.45804 0.15547 0.74616 0.29328 0.49182 0.14 0.8679 0.07464 0.35245 0.54699 0.6954 0.71503 0.46285 0.27321 0.20633 0.63583 0.88907 0.92343 0.60843 0.99479 0.11261 0.22335 0.89 [...]
+0.03162 0.73444 0.36038 0.51714 0.76337 0.25623 0.33229 0.30538 0.59938 0.39006 0.81054 0.53764 0.62626 0.07187 0.20031 0.23985 0.0249 0.90285 0.45922 0.21858 0.84576 0.91801 0.87291 0.97275 0.69324 0.68269 0.42546 0.14129 0.62025 0.24055 0.61885 0.10086 0.35831 0.02174 0.66795 0.6329 0.79824 0.89817 0.94106 0.79336 0.52645 0.97036 0.28432 0.89382 0.07257 0.5704 0.62282 0.26227 0.03252 0.73036 0.88207 0.76052 0.53311 0.35337 0.18198 0.93833 0.76993 0.7418 0.99866 0.4826 0.13695 0.53526 0 [...]
+0.32496 0.02405 0.37798 0.56727 0.27853 0.71671 0.26244 0.69242 0.79522 0.65592 0.84165 0.56079 0.20164 0.4685 0.54292 0.36535 0.26001 0.98322 0.7463 0.64171 0.43195 0.99737 0.63682 0.11519 0.94993 0.81412 0.73075 0.12745 0.52904 0.32434 0.75626 0.19359 0.19476 0.18034 0.67736 0.35112 0.23741 0.21937 0.19109 0.57811 0.74925 0.02485 0.80819 0.33071 0.90705 0.82835 0.97305 0.21107 0.00545 0.89546 0.11431 0.92955 0.83308 0.39653 0.37136 0.54292 0.45404 0.69981 0.71519 0.735 0.00275 0.64381  [...]
+0.39022 0.89748 0.28633 0.67883 0.80621 0.3883 0.35245 0.62983 0.59982 0.85934 0.68213 0.94516 0.12437 0.9924 0.24818 0.51005 0.46572 0.64286 0.65337 0.14879 0.37615 0.70835 0.75273 0.28961 0.40309 0.53136 0.79195 0.42829 0.8564 0.29646 0.36991 0.29386 0.48831 0.28306 0.27774 0.64951 0.23142 0.73218 0.96087 0.36061 0.87187 0.24599 0.31902 0.93028 0.91873 0.82553 0.92604 0.4038 0.73339 0.98922 0.3863 0.06598 0.78233 0.23323 0.37136 0.353 0.1928 0.49202 0.59737 0.93491 0.21839 0.4867 0.965 [...]
+0.03323 0.56542 0.74239 0.54998 0.95022 0.29141 0.83829 0.88087 0.64626 0.24348 0.64971 0.3079 0.77924 0.08788 0.39015 0.12239 0.71396 0.79859 0.06698 0.85693 0.79989 0.36424 0.3377 0.60945 0.67649 0.7492 0.56998 0.23338 0.1526 0.68438 0.61032 0.15645 0.52961 0.97344 0.53439 0.42395 0.62914 0.99827 0.14021 0.01802 0.85349 0.20322 0.06192 0.28163 0.71262 0.33142 0.1134 0.48016 0.10908 0.11715 0.70364 0.27468 0.36189 0.59619 0.68159 0.04056 0.97898 0.89544 0.39219 0.16024 0.8417 0.79925 0. [...]
+0.44324 0.70651 0.62043 0.83673 0.83803 0.16527 0.11819 0.61305 0.14241 0.91287 0.16222 0.98145 0.39032 0.69839 0.63046 0.25778 0.72768 0.37997 0.06196 0.0562 0.37858 0.07254 0.70513 0.01828 0.31751 0.32172 0.57644 0.20857 0.08581 0.74563 0.63452 0.93757 0.70469 0.57378 0.53965 0.85017 0.29266 0.40691 0.66754 0.38995 0.84974 0.43943 0.63609 0.65605 0.07783 0.24298 0.59573 0.16784 0.82357 0.26303 0.54577 0.87332 0.83984 0.78937 0.29361 0.76777 0.31962 0.13962 0.06163 0.17324 0.65103 0.210 [...]
+0.10407 0.21321 0.53995 0.84074 0.89572 0.06121 0.49115 0.86627 0.19809 0.54102 0.87216 0.90423 0.10674 0.0017 0.57534 0.86464 0.533 0.11727 0.0267 0.6799 0.11901 0.90926 0.60424 0.88934 0.83907 0.64918 0.20975 0.96866 0.94381 0.18488 0.20528 0.13281 0.5473 0.39493 0.69892 0.98082 0.55684 0.23024 0.3288 0.74198 0.68546 0.34362 0.12869 0.70078 0.35977 0.35292 0.46805 0.46086 0.39535 0.18414 0.95324 0.7404 0.26113 0.25453 0.90559 0.40242 0.8669 0.40562 0.45918 0.69464 0.60573 0.39125 0.577 [...]
+0.35337 0.85751 0.69514 0.00563 0.67804 0.07268 0.55464 0.40712 0.80689 0.73132 0.78107 0.31863 0.07437 0.69558 0.95417 0.12366 0.12274 0.66951 0.32388 0.73489 0.33703 0.62063 0.00089 0.13914 0.38014 0.63919 0.75068 0.95699 0.2124 0.52858 0.20826 0.32764 0.1495 0.56856 0.24064 0.2928 0.77893 0.47914 0.54004 0.78192 0.60231 0.04306 0.48623 0.15396 0.87941 0.16702 0.39861 0.92145 0.41542 0.4385 0.40408 0.93392 0.66891 0.52393 0.7755 0.12513 0.22514 0.31881 0.83875 0.47949 0.53589 0.03151 0 [...]
+0.82046 0.24932 0.9026 0.88437 0.79737 0.51897 0.83807 0.54638 0.64646 0.40965 0.83851 0.63599 0.58865 0.88465 0.33703 0.63368 0.54999 0.64847 0.30644 0.71498 0.1785 0.29924 0.9917 0.15583 0.20634 0.72687 0.46431 0.2835 0.08339 0.48326 0.78573 0.27834 0.41323 0.68956 0.36608 0.70879 0.20161 0.33235 0.43189 0.20909 0.17681 0.17619 0.58815 0.00983 0.97478 0.0102 0.16762 0.63071 0.99602 0.32037 0.77568 0.05341 0.36641 0.32195 0.26267 0.00268 0.79658 0.30567 0.03909 0.51378 0.59444 0.19253 0 [...]
+0.30418 0.21796 0.39115 0.42882 0.0079 0.01216 0.11126 0.06228 0.63898 0.44968 0.30582 0.84 0.79206 0.75467 0.66072 0.04386 0.84699 0.08027 0.37715 0.51502 0.85219 0.9372 0.03295 0.74943 0.82937 0.7319 0.63866 0.95825 0.59531 0.2243 0.41347 0.52251 0.59017 0.10275 0.33116 0.07829 0.87349 0.05972 0.81331 0.51309 0.704 0.57131 0.55224 0.90342 0.56065 0.60739 0.87849 0.29875 0.11719 0.71121 0.25387 0.33299 0.32813 0.26782 0.42219 0.35294 0.04691 0.41529 0.92869 0.60103 0.6549 0.0685 0.01566 [...]
+0.2595 0.10925 0.90806 0.16309 0.86282 0.58512 0.09103 0.41142 0.52721 0.10492 0.99793 0.22164 0.35009 0.13816 0.4327 0.74274 0.42516 0.43175 0.09958 0.43714 0.00896 0.78973 0.14232 0.20771 0.86924 0.35518 0.39344 0.05131 0.29086 0.1863 0.10775 0.81927 0.07408 0.24526 0.15012 0.30814 0.32348 0.37864 0.53797 0.71543 0.04359 0.25631 0.31921 0.45714 0.0831 0.08846 0.0663 0.0107 0.82989 0.52148 0.37853 0.76115 0.98792 0.54507 0.33109 0.51258 0.51957 0.74619 0.73245 0.72945 0.77766 0.57002 0. [...]
+0.7866 0.25591 0.81771 0.09061 0.09962 0.8876 0.56812 0.74048 0.52886 0.09067 0.47307 0.54605 0.83658 0.91792 0.94654 0.10369 0.74542 0.83451 0.7707 0.93997 0.30065 0.59924 0.80748 0.97939 0.06546 0.3416 0.09599 0.71797 0.63736 0.46573 0.4037 0.48138 0.90918 0.48535 0.87692 0.89549 0.40031 0.22669 0.1971 0.44532 0.23068 0.62362 0.389 0.94162 0.65487 0.29379 0.47277 0.57922 0.30768 0.38073 0.09926 0.44085 0.43031 0.17045 0.13012 0.41774 0.42819 0.99895 0.70079 0.37294 0.60677 0.8165 0.191 [...]
+0.51909 0.54682 0.07267 0.76577 0.90758 0.30589 0.84972 0.54915 0.91231 0.07568 0.78321 0.99724 0.34767 0.69029 0.77116 0.92801 0.42918 0.59139 0.01344 0.39989 0.09069 0.61429 0.82586 0.47254 0.35 0.75145 0.6861 0.6824 0.58656 0.18241 0.34358 0.59504 0.09468 0.48819 0.42672 0.72777 0.17775 0.52328 0.27484 0.70252 0.02242 0.01359 0.25757 0.50579 0.71912 0.63485 0.75811 0.29319 0.02163 0.51876 0.01404 0.02674 0.97403 0.58563 0.1413 0.23457 0.7223 0.85397 0.65592 0.61982 0.55671 0.02328 0.7 [...]
+0.67269 0.61202 0.7708 0.35496 0.28436 0.93118 0.43021 0.80414 0.95058 0.61178 0.69984 0.20245 0.07139 0.91486 0.4783 0.65523 0.27018 0.17856 0.50402 0.08656 0.18118 0.84212 0.06792 0.53538 0.03462 0.71385 0.59111 0.71649 0.00838 0.03675 0.93022 0.29381 0.8488 0.32726 0.60473 0.36456 0.56251 0.98913 0.29645 0.97971 0.29025 0.96086 0.34021 0.60634 0.16041 0.05879 0.04789 0.78578 0.69964 0.74066 0.11776 0.25902 0.00476 0.88021 0.7504 0.84461 0.48118 0.09873 0.00156 0.68028 0.58887 0.71565  [...]
+0.57474 0.28574 0.94241 0.20855 0.33864 0.79448 0.99247 0.87761 0.95492 0.74673 0.38023 0.41964 0.59719 0.00971 0.01228 0.47453 0.65118 0.9325 0.72868 0.87529 0.2814 0.22918 0.44393 0.35761 0.97051 0.8971 0.89838 0.38423 0.16307 0.48149 0.4827 0.59225 0.37772 0.76983 0.50394 0.63921 0.31799 0.60176 0.15023 0.10674 0.98941 0.61916 0.87868 0.35477 0.41243 0.02606 0.86812 0.4527 0.76951 0.07892 0.6581 0.00111 0.437 0.29145 0.99351 0.83854 0.08142 0.32448 0.19148 0.38508 0.58629 0.72121 0.95 [...]
+0.92212 0.87924 0.95509 0.8534 0.32234 0.72159 0.34188 0.87287 0.50744 0.31682 0.93764 0.50786 0.15863 0.88308 0.94825 0.88219 0.8877 0.93275 0.66045 0.78804 0.51244 0.88488 0.01513 0.54955 0.66047 0.43832 0.27907 0.06775 0.11556 0.99405 0.25263 0.85163 0.42924 0.93441 0.08985 0.34947 0.89307 0.2393 0.88304 0.40652 0.04104 0.95398 0.15479 0.93751 0.27084 0.79568 0.28398 0.42018 0.89994 0.19363 0.42465 0.76659 0.49658 0.69838 0.79559 0.30001 0.1018 0.43457 0.42897 0.29333 0.14095 0.86375  [...]
+0.04327 0.70887 0.27712 0.32431 0.68479 0.62547 0.37103 0.50176 0.26759 0.00933 0.50235 0.30516 0.72071 0.38273 0.29003 0.30525 0.27812 0.88275 0.15962 0.65489 0.8755 0.29171 0.0767 0.19035 0.58058 0.29734 0.42552 0.77631 0.27917 0.71708 0.71934 0.10299 0.25734 0.47222 0.98669 0.39728 0.82961 0.05094 0.5937 0.13858 0.61446 0.22622 0.65622 0.72369 0.46202 0.0282 0.48105 0.51274 0.52112 0.51737 0.37487 0.43701 0.49852 0.39454 0.93662 0.69873 0.29443 0.95236 0.4752 0.55395 0.62927 0.8545 0. [...]
+0.55596 0.28057 0.96332 0.77157 0.88435 0.23057 0.2674 0.94519 0.92013 0.68118 0.93135 0.21139 0.45755 0.90284 0.47 0.6678 0.8454 0.79332 0.44052 0.70747 0.78919 0.17151 0.00853 0.70107 0.28775 0.17035 0.33006 0.0852 0.56293 0.74309 0.84738 0.86539 0.20096 0.27573 0.51405 0.59209 0.95573 0.84747 0.21998 0.19465 0.71698 0.83298 0.1732 0.18109 0.84879 0.58632 0.03203 0.14051 0.12461 0.46554 0.52059 0.83901 0.87228 0.13024 0.51882 0.9994 0.65573 0.0043 0.20034 0.02395 0.12421 0.65096 0.8454 [...]
+0.53111 0.74893 0.74552 0.21564 0.67363 0.57859 0.43838 0.32566 0.77507 0.67441 0.19023 0.70066 0.9943 0.76842 0.24349 0.65734 0.40847 0.3672 0.44111 0.05227 0.87474 0.03364 0.24684 0.04259 0.89633 0.82696 0.49091 0.95987 0.40342 0.97404 0.6987 0.00219 0.75558 0.28883 0.40974 0.50605 0.49437 0.05095 0.06194 0.05957 0.16046 0.26825 0.95277 0.02719 0.52237 0.42779 0.34053 0.68909 0.46992 0.83883 0.72144 0.45034 0.01451 0.00944 0.76986 0.32419 0.23916 0.28658 0.1321 0.76534 0.60665 0.85877  [...]
+0.21886 0.88524 0.93268 0.5153 0.93614 0.03896 0.12813 0.79028 0.3589 0.80603 0.83435 0.12075 0.03016 0.13875 0.39505 0.67705 0.54017 0.44268 0.58495 0.71411 0.28721 0.61812 0.43755 0.33437 0.71275 0.84762 0.16005 0.46089 0.01815 0.41989 0.21255 0.18688 0.07718 0.76762 0.39954 0.34114 0.15461 0.67617 0.93389 0.27784 0.62145 0.32176 0.41064 0.40323 0.14874 0.63946 0.27086 0.76001 0.45673 0.88703 0.59593 0.12754 0.21584 0.60753 0.10799 0.83808 0.49858 0.89154 0.34904 0.98282 0.51616 0.6417 [...]
+0.20785 0.58222 0.81022 0.22794 0.87631 0.42088 0.30563 0.6956 0.83646 0.87105 0.66694 0.39311 0.86832 0.49432 0.08136 0.66957 0.47052 0.31706 0.0052 0.05077 0.287 0.51501 0.91033 0.99876 0.78656 0.22405 0.01399 0.02106 0.79419 0.37484 0.54771 0.87748 0.65574 0.79287 0.40304 0.01615 0.3996 0.41904 0.29454 0.92357 0.33729 0.35941 0.46399 0.07178 0.8987 0.5843 0.0947 0.03203 0.96052 0.11316 0.88694 0.10593 0.48601 0.1729 0.51765 0.67918 0.9066 0.64457 0.82209 0.4495 0.936 0.10893 0.83529 0 [...]
+0.56184 0.08747 0.8812 0.38314 0.58992 0.81122 0.60183 0.88436 0.53841 0.25121 0.47863 0.09811 0.58035 0.48763 0.32874 0.1681 0.70819 0.28249 0.02094 0.68527 0.26654 0.47326 0.88697 0.15387 0.53596 0.90558 0.38893 0.35034 0.047 0.73827 0.20492 0.34463 0.9788 0.37997 0.02844 0.25488 0.65421 0.66667 0.99245 0.16742 0.12896 0.99819 0.53631 0.7388 0.03322 0.31746 0.57811 0.20718 0.08544 0.53386 0.27314 0.53922 0.19485 0.80908 0.01918 0.3199 0.22269 0.96548 0.03719 0.59871 0.66715 0.7136 0.00 [...]
+0.13878 0.09984 0.85577 0.09442 0.15934 0.00086 0.43591 0.75315 0.97293 0.09548 0.66363 0.46384 0.93677 0.8349 0.38579 0.5365 0.58013 0.67452 0.28528 0.57643 0.80713 0.01022 0.5422 0.30303 0.27769 0.34258 0.67861 0.44731 0.60747 0.92271 0.32413 0.69083 0.59444 0.02105 0.82511 0.90103 0.1213 0.43901 0.94102 0.05662 0.61547 0.74266 0.89266 0.82506 0.00043 0.10684 0.64175 0.99877 0.84001 0.51093 0.7728 0.54938 0.6999 0.50995 0.10657 0.16237 0.67945 0.00471 0.44584 0.74744 0.54167 0.97662 0. [...]
+0.4876 0.90038 0.68631 0.03458 0.81837 0.03916 0.88675 0.47894 0.51679 0.32072 0.07738 0.11915 0.14515 0.30273 0.78681 0.32088 0.84616 0.09914 0.93733 0.1853 0.51755 0.98549 0.90619 0.42878 0.3002 0.12689 0.37005 0.10145 0.03561 0.70084 0.70857 0.90134 0.67658 0.92561 0.51554 0.06836 0.94309 0.07578 0.01355 0.72841 0.87669 0.79337 0.54623 0.54849 0.78359 0.47775 0.80352 0.89814 0.8267 0.25928 0.75311 0.70962 0.60078 0.13795 0.76879 0.94265 0.77726 0.89899 0.5909 0.89243 0.83253 0.28304 0 [...]
+0.77151 0.34006 0.39376 0.56515 0.99721 0.64927 0.64458 0.18229 0.7994 0.00349 0.88505 0.44057 0.75703 0.39249 0.14506 0.34689 0.66231 0.38773 0.54386 0.49214 0.86622 0.85684 0.32831 0.1446 0.19045 0.75635 0.93901 0.29967 0.58391 0.20944 0.95599 0.05872 0.89263 0.67874 0.7068 0.97848 0.23402 0.20209 0.88926 0.34864 0.34149 0.91615 0.65874 0.34924 0.76369 0.34171 0.22447 0.93289 0.62239 0.38198 0.77131 0.10837 0.68459 0.99965 0.32865 0.93389 0.84766 0.94253 0.36896 0.88592 0.01381 0.03825 [...]
+0.65736 0.12803 0.40548 0.53104 0.19092 0.80182 0.73033 0.61024 0.09035 0.98177 0.14463 0.50613 0.79424 0.29843 0.7572 0.24641 0.16608 0.97644 0.21934 0.20193 0.5576 0.28594 0.16919 0.67149 0.8237 0.69042 0.51543 0.78832 0.69855 0.29176 0.53112 0.83637 0.85284 0.61916 0.6281 0.83026 0.8842 0.32228 0.22106 0.64343 0.88029 0.21152 0.6551 0.59347 0.28546 0.04718 0.17631 0.55233 0.41306 0.99077 0.04633 0.34401 0.10814 0.96939 0.02414 0.7479 0.8333 0.50871 0.13842 0.60576 0.42455 0.17921 0.45 [...]
+0.13718 0.76864 0.17187 0.05983 0.27637 0.20647 0.58315 0.14415 0.22428 0.14438 0.6726 0.87941 0.61474 0.57416 0.07009 0.24075 0.42233 0.71295 0.48447 0.81966 0.19075 0.04844 0.42558 0.21917 0.14199 0.27963 0.33881 0.75942 0.25545 0.34968 0.35965 0.56743 0.40671 0.98413 0.22381 0.63682 0.08147 0.63677 0.81892 0.74007 0.99329 0.7736 0.79413 0.64208 0.31952 0.64525 0.767 0.52493 0.37595 0.85637 0.50962 0.43477 0.96973 0.03014 0.90578 0.04561 0.63841 0.31832 0.89735 0.08325 0.37592 0.56771  [...]
+0.73962 0.64205 0.92371 0.62007 0.09261 0.46994 0.11892 0.16388 0.44262 0.50993 0.36015 0.75666 0.31552 0.34017 0.99213 0.39102 0.58739 0.42121 0.76934 0.1284 0.45992 0.31528 0.08818 0.48778 0.52917 0.63105 0.86824 0.46052 0.73315 0.9826 0.69361 0.82681 0.23801 0.16287 0.10404 0.24762 0.5471 0.76464 0.6531 0.69106 0.8506 0.71204 0.30424 0.80056 0.50173 0.25459 0.3071 0.30744 0.46376 0.97944 0.69383 0.39219 0.04435 0.86377 0.93619 0.25746 0.08846 0.12987 0.48501 0.24516 0.05044 0.07312 0. [...]
+0.20571 0.50033 0.33751 0.97357 0.60478 0.70806 0.5882 0.32314 0.28239 0.48403 0.18603 0.13913 0.16728 0.10832 0.1637 0.08671 0.04031 0.23061 0.4881 0.12154 0.53121 0.95801 0.54265 0.81206 0.52957 0.12455 0.57559 0.81216 0.79685 0.4662 0.86123 0.84487 0.91735 0.11331 0.36395 0.3098 0.47031 0.59358 0.37846 0.68289 0.26722 0.28351 0.54022 0.70285 0.70565 0.90217 0.53901 0.44804 0.4119 0.7865 0.90515 0.59326 0.38812 0.19558 0.30942 0.68252 0.84428 0.69391 0.09865 0.78783 0.52725 0.76936 0.2 [...]
+0.7369 0.07579 0.53606 0.5582 0.86299 0.36802 0.96453 0.61637 0.7185 0.97657 0.61431 0.86396 0.73836 0.65237 0.55431 0.75253 0.33033 0.94558 0.17467 0.31067 0.40087 0.58665 0.74099 0.47149 0.98887 0.15869 0.64965 0.77855 0.21668 0.97073 0.22516 0.1342 0.20209 0.51573 0.90569 0.31836 0.03003 0.79163 0.41427 0.96344 0.63978 0.59988 0.48076 0.17771 0.07146 0.77202 0.74676 0.61829 0.8839 0.1969 0.31924 0.9727 0.59739 0.21453 0.6275 0.55551 0.90386 0.0736 0.30474 0.25369 0.67259 0.02112 0.826 [...]
+0.03448 0.48938 0.46086 0.06261 0.52665 0.5675 0.20405 0.58567 0.88702 0.82531 0.43466 0.98744 0.61726 0.57591 0.19434 0.79897 0.35038 0.77762 0.80898 0.38756 0.18263 0.78336 0.76338 0.87522 0.28583 0.76446 0.49476 0.48404 0.45504 0.68427 0.4088 0.09496 0.76546 0.656 0.02734 0.31282 0.26701 0.71091 0.16083 0.71505 0.39782 0.70284 0.29839 0.64619 0.31346 0.28953 0.07902 0.28834 0.47678 0.22264 0.51664 0.77182 0.03752 0.70336 0.02544 0.52584 0.81906 0.97261 0.466 0.03546 0.67076 0.68688 0. [...]
+0.13171 0.80273 0.78004 0.33272 0.67053 0.19979 0.67195 0.13139 0.76332 0.16417 0.45183 0.35227 0.50361 0.12325 0.88721 0.10862 0.06997 0.74782 0.51089 0.16593 0.10712 0.76097 0.70254 0.79887 0.73757 0.13447 0.87854 0.09355 0.01384 0.18816 0.35816 0.03546 0.13911 0.24915 0.47376 0.38959 0.87246 0.34294 0.04683 0.64722 0.48397 0.76138 0.05232 0.99607 0.50191 0.49236 0.39273 0.4214 0.8916 0.57946 0.48973 0.63618 0.38779 0.18282 0.02303 0.15748 0.87579 0.10289 0.59457 0.73829 0.45567 0.6651 [...]
+0.98606 0.46572 0.30936 0.7388 0.808 0.14493 0.97017 0.91507 0.53125 0.38809 0.70591 0.96765 0.85625 0.77797 0.61126 0.43531 0.89631 0.45658 0.85451 0.74104 0.08227 0.00683 0.62002 0.25074 0.44794 0.33008 0.68047 0.59562 0.63389 0.84896 0.79048 0.32401 0.64197 0.63308 0.62619 0.57105 0.80151 0.20868 0.37705 0.10583 0.12662 0.16691 0.48806 0.29097 0.94045 0.62869 0.95679 0.9892 0.32282 0.98925 0.8617 0.35262 0.16089 0.45232 0.9744 0.27173 0.29165 0.77329 0.44254 0.26393 0.25987 0.53056 0. [...]
+0.39268 0.97382 0.86699 0.72923 0.33914 0.80649 0.71725 0.27296 0.36874 0.51759 0.92876 0.06289 0.48074 0.61341 0.46439 0.5006 0.03277 0.00319 0.84511 0.755 0.85208 0.65184 0.2939 0.50293 0.18832 0.59553 0.90047 0.70289 0.76483 0.25953 0.03757 0.68536 0.26468 0.37118 0.63905 0.74092 0.01958 0.19142 0.30891 0.28507 0.6616 0.60742 0.09885 0.4195 0.76038 0.17111 0.84965 0.53889 0.25865 0.61226 0.0386 0.62853 0.663 0.70255 0.06428 0.87945 0.52062 0.57877 0.79549 0.79949 0.68308 0.38783 0.586 [...]
+0.69389 0.65958 0.12089 0.15096 0.26429 0.21743 0.97323 0.59373 0.49884 0.27894 0.41632 0.66605 0.74654 0.65488 0.96017 0.11088 0.3313 0.93068 0.99256 0.13995 0.68792 0.2894 0.09011 0.74499 0.08965 0.4429 0.19539 0.39387 0.77578 0.7953 0.50692 0.22944 0.66016 0.10356 0.77563 0.48461 0.93256 0.75037 0.34342 0.70208 0.86826 0.53564 0.80582 0.20652 0.86524 0.43048 0.08951 0.05176 0.32621 0.17963 0.38652 0.60327 0.09351 0.00227 0.86573 0.53376 0.75736 0.43001 0.1828 0.48339 0.45759 0.87252 0 [...]
+0.51962 0.34458 0.25174 0.40133 0.48386 0.35227 0.6126 0.00246 0.81269 0.61861 0.65 0.75885 0.50216 0.45466 0.61424 0.92364 0.10804 0.60485 0.01925 0.68022 0.52717 0.34591 0.66597 0.86344 0.90302 0.53752 0.43963 0.82754 0.30118 0.03639 0.8426 0.96215 0.86508 0.20828 0.41199 0.45575 0.21073 0.10209 0.76789 0.66773 0.26822 0.26039 0.19358 0.10439 0.13834 0.9244 0.42059 0.43985 0.48506 0.21276 0.91423 0.34996 0.3298 0.4032 0.20221 0.96089 0.45037 0.72455 0.96027 0.39941 0.83324 0.07809 0.11 [...]
+0.10421 0.21712 0.01313 0.10184 0.10849 0.11424 0.6247 0.45181 0.59045 0.12968 0.89134 0.15896 0.20176 0.57849 0.13536 0.74216 0.0988 0.77067 0.89972 0.70395 0.5795 0.29696 0.67384 0.7997 0.64871 0.09653 0.95465 0.83893 0.49924 0.86862 0.92114 0.57718 0.31181 0.92854 0.91639 0.50327 0.336 0.44744 0.30716 0.24744 0.11001 0.97786 0.78708 0.52822 0.38264 0.01996 0.49878 0.69513 0.20124 0.38563 0.97604 0.42445 0.05208 0.24605 0.30567 0.88435 0.86006 0.5078 0.03691 0.75312 0.52018 0.02602 0.4 [...]
+0.6085 0.21586 0.73018 0.84494 0.55714 0.16768 0.7946 0.13282 0.16504 0.7386 0.37051 0.65989 0.52921 0.87396 0.99917 0.02246 0.44587 0.12453 0.94071 0.64485 0.21588 0.54768 0.59122 0.25005 0.94437 0.84426 0.50632 0.50014 0.76259 0.33281 0.64752 0.42754 0.63823 0.64634 0.75065 0.89223 0.62765 0.73072 0.90542 0.0033 0.37929 0.77403 0.67453 0.3114 0.5535 0.24553 0.95679 0.13858 0.06306 0.80607 0.62155 0.15217 0.45081 0.22612 0.9969 0.61823 0.43846 0.4373 0.85833 0.64942 0.28295 0.24839 0.12 [...]
+0.81554 0.36433 0.4258 0.62741 0.62066 0.85409 0.36737 0.38149 0.94313 0.43527 0.68821 0.55278 0.96928 0.59075 0.09395 0.7633 0.94575 0.03816 0.91004 0.9427 0.0875 0.07224 0.84616 0.52105 0.38506 0.43034 0.1766 0.51317 0.7558 0.22271 0.20142 0.99181 0.20321 0.85945 0.80619 0.62367 0.80986 0.97727 0.18503 0.10278 0.92857 0.34188 0.44605 0.07755 0.7577 0.79632 0.09677 0.65026 0.17204 0.48784 0.95741 0.00478 0.89715 0.99393 0.21358 0.95264 0.07531 0.60782 0.95051 0.25681 0.59919 0.41255 0.3 [...]
+0.80965 0.57955 0.73105 0.05519 0.44348 0.59956 0.78241 0.17943 0.31369 0.04873 0.9605 0.5844 0.82583 0.93524 0.74733 0.22713 0.08093 0.38912 0.99956 0.17859 0.13021 0.18301 0.12275 0.02555 0.82706 0.49546 0.72027 0.48573 0.13105 0.63145 0.52543 0.84094 0.63066 0.85288 0.27562 0.63446 0.70372 0.39318 0.11746 0.25349 0.0558 0.62151 0.76586 0.66962 0.27094 0.49956 0.48385 0.29437 0.69554 0.67934 0.73246 0.20849 0.23628 0.64238 0.78275 0.57687 0.56309 0.15878 0.80956 0.7375 0.7124 0.23915 0 [...]
+0.2597 0.49161 0.41056 0.54441 0.82916 0.10014 0.11702 0.32466 0.79833 0.83086 0.54211 0.84925 0.57996 0.24486 0.15189 0.97791 0.14585 0.67685 0.85347 0.07565 0.42763 0.35043 0.4155 0.85172 0.65108 0.58947 0.15003 0.68929 0.25369 0.4386 0.2942 0.18093 0.96905 0.49635 0.14177 0.62129 0.38773 0.47517 0.78176 0.09324 0.43277 0.68443 0.36081 0.47447 0.03022 0.63966 0.52966 0.97904 0.014 0.61876 0.16847 0.69521 0.98479 0.55268 0.89796 0.13877 0.57344 0.07796 0.10409 0.05141 0.46407 0.85851 0. [...]
+0.10266 0.42485 0.81415 0.59262 0.51941 0.08971 0.35338 0.41753 0.33464 0.86987 0.86641 0.21812 0.63743 0.74928 0.47721 0.91933 0.37816 0.63024 0.63098 0.74864 0.45063 0.42577 0.60787 0.92269 0.29133 0.01708 0.64006 0.38549 0.94757 0.46507 0.72839 0.96411 0.91409 0.13983 0.22664 0.43506 0.41893 0.09087 0.09082 0.17484 0.93956 0.68211 0.44117 0.29353 0.82213 0.75226 0.28252 0.34567 0.82466 0.21672 0.14062 0.39617 0.07617 0.55738 0.97353 0.32258 0.9352 0.20288 0.84003 0.16531 0.97583 0.326 [...]
+0.7738 0.61731 0.46662 0.35464 0.25239 0.00979 0.43625 0.61281 0.88677 0.28057 0.13368 0.6647 0.57109 0.05556 0.01605 0.08743 0.42404 0.31755 0.37007 0.88503 0.31562 0.91911 0.50111 0.09278 0.83444 0.33018 0.73388 0.09967 0.90458 0.06318 0.88375 0.14946 0.5919 0.74286 0.84664 0.99852 0.50951 0.69722 0.03622 0.75295 0.61257 0.17174 0.56107 0.57608 0.68859 0.16571 0.27505 0.86017 0.76135 0.46165 0.42712 0.37718 0.17228 0.07571 0.65463 0.10943 0.84636 0.09928 0.5737 0.49782 0.24603 0.48544  [...]
+0.34292 0.01108 0.99156 0.56993 0.63978 0.68705 0.95469 0.71884 0.48846 0.78702 0.50629 0.88853 0.00285 0.86903 0.95105 0.64328 0.02046 0.72986 0.50075 0.78314 0.27094 0.82129 0.34548 0.38443 0.13492 0.77699 0.34098 0.40316 0.47653 0.15957 0.21245 0.38809 0.31532 0.93399 0.16867 0.5208 0.37931 0.16844 0.04104 0.85856 0.82331 0.23206 0.04308 0.10915 0.06296 0.23849 0.53777 0.27869 0.38134 0.11285 0.83196 0.80307 0.57124 0.26773 0.2914 0.37002 0.81925 0.77177 0.1477 0.67174 0.4498 0.58824  [...]
+0.53323 0.72147 0.562 0.30628 0.66183 0.81119 0.32724 0.60873 0.10104 0.3761 0.55501 0.52608 0.96588 0.30719 0.68295 0.95193 0.58887 0.44848 0.25107 0.66191 0.43717 0.67673 0.55081 0.96654 0.08593 0.2598 0.36046 0.54607 0.48588 0.64288 0.5586 0.93812 0.9347 0.4791 0.35173 0.24165 0.49715 0.76888 0.18214 0.15429 0.60368 0.05707 0.72837 0.56029 0.41851 0.77112 0.77229 0.34604 0.24982 0.52669 0.43985 0.10142 0.12066 0.24378 0.48016 0.02514 0.9165 0.93988 0.13035 0.16224 0.19589 0.42024 0.79 [...]
+0.57483 0.07684 0.71534 0.09257 0.08972 0.97981 0.6382 0.82772 0.31861 0.75707 0.91326 0.99422 0.29455 0.32876 0.58452 0.63366 0.4876 0.51775 0.34193 0.31697 0.17035 0.30051 0.45749 0.9943 0.83749 0.66935 0.16025 0.68132 0.19021 0.27784 0.45675 0.32631 0.76815 0.05931 0.39871 0.36689 0.57272 0.20741 0.98116 0.06834 0.57302 0.13286 0.20764 0.11367 0.10999 0.54057 0.69607 0.07801 0.60018 0.41839 0.95511 0.05278 0.32982 0.88181 0.8287 0.78731 0.26951 0.16968 0.22468 0.51704 0.52321 0.13603  [...]
+0.27481 0.73981 0.7488 0.25893 0.19774 0.73852 0.37275 0.5982 0.93409 0.17079 0.41259 0.63466 0.04688 0.65377 0.53718 0.86909 0.18158 0.60393 0.83944 0.95992 0.66314 0.70167 0.99193 0.15246 0.42449 0.51501 0.22709 0.25697 0.79842 0.58632 0.78737 0.86167 0.21448 0.55538 0.501 0.58648 0.24603 0.97376 0.91526 0.43688 0.43083 0.37315 0.79225 0.25407 0.65988 0.63201 0.0722 0.35126 0.32178 0.52145 0.65602 0.96999 0.33498 0.58446 0.85092 0.04675 0.94306 0.13369 0.1722 0.67885 0.47551 0.89153 0. [...]
+0.38826 0.14172 0.23324 0.82076 0.96317 0.12162 0.74498 0.22887 0.98053 0.67326 0.37011 0.95345 0.70481 0.21404 0.81001 0.2164 0.97419 0.31908 0.41226 0.47721 0.63377 0.42822 0.71024 0.09746 0.91326 0.40128 0.44569 0.97826 0.16496 0.01214 0.16618 0.3297 0.894 0.74691 0.44618 0.27933 0.44666 0.62049 0.38349 0.92753 0.96368 0.08191 0.09927 0.89175 0.02258 0.33579 0.68284 0.07606 0.43353 0.40557 0.58037 0.61269 0.06009 0.62009 0.60783 0.41593 0.31 0.26014 0.54873 0.92611 0.31348 0.6811 0.45 [...]
+0.63361 0.26028 0.59476 0.55909 0.51067 0.65349 0.68292 0.79135 0.74518 0.0942 0.344 0.72806 0.94908 0.59614 0.34466 0.17429 0.26342 0.74643 0.32298 0.99716 0.27957 0.66056 0.34253 0.35404 0.79013 0.96973 0.197 0.61302 0.88327 0.25632 0.07381 0.31733 0.04419 0.10897 0.54729 0.70853 0.20357 0.54256 0.24977 0.42534 0.8487 0.74708 0.69236 0.01256 0.69357 0.57943 0.25861 0.59145 0.60562 0.69567 0.54561 0.44643 0.29627 0.20103 0.60712 0.59725 0.64424 0.65785 0.21592 0.83212 0.86362 0.8286 0.1 [...]
+0.80187 0.32318 0.02586 0.69059 0.18446 0.13573 0.05887 0.77312 0.62742 0.73771 0.87508 0.5009 0.17813 0.99692 0.01799 0.43325 0.89608 0.08174 0.81504 0.64318 0.40464 0.18876 0.14096 0.77936 0.99618 0.18213 0.437 0.37565 0.36897 0.78584 0.37458 0.54385 0.24973 0.903 0.38199 0.59937 0.72529 0.04068 0.65326 0.93247 0.53033 0.93113 0.63434 0.52074 0.78667 0.93649 0.98141 0.5757 0.14613 0.81591 0.40906 0.07539 0.31373 0.27362 0.54135 0.11468 0.76841 0.09381 0.88027 0.26849 0.3812 0.20185 0.0 [...]
+0.63433 0.82014 0.51719 0.59044 0.64331 0.46836 0.15987 0.76753 0.50951 0.60029 0.55231 0.91832 0.63063 0.20064 0.61254 0.54202 0.9745 0.18716 0.54985 0.47432 0.35577 0.56648 0.14635 0.59221 0.22904 0.59185 0.30725 0.51525 0.07943 0.0763 0.56987 0.61354 0.04113 0.96625 0.60566 0.86886 0.67669 0.70821 0.82162 0.29527 0.15379 0.03928 0.1714 0.17269 0.98841 0.28565 0.94298 0.06271 0.85159 0.57622 0.89536 0.80802 0.01139 0.45237 0.07799 0.35368 0.73832 0.97868 0.85262 0.63086 0.79441 0.5044  [...]
+0.23654 0.73514 0.53334 0.4212 0.92373 0.32946 0.79048 0.38842 0.43112 0.33276 0.57848 0.99578 0.62817 0.19705 0.14998 0.99569 0.86977 0.34899 0.85099 0.42354 0.42462 0.37243 0.43894 0.56308 0.07125 0.9601 0.49152 0.34831 0.43052 0.31479 0.61792 0.65122 0.23554 0.6654 0.17985 0.37229 0.87224 0.99894 0.69886 0.35033 0.72944 0.55999 0.75201 0.83051 0.58726 0.40898 0.23517 0.6582 0.82625 0.11103 0.44032 0.51459 0.57957 0.37916 0.02841 0.03066 0.91601 0.10498 0.54286 0.99746 0.71868 0.70731  [...]
+0.27155 0.47611 0.39393 0.9502 0.68504 0.12961 0.45986 0.64559 0.50209 0.9335 0.09406 0.89602 0.05994 0.88128 0.35277 0.50242 0.16235 0.50257 0.49686 0.2646 0.62051 0.51148 0.69023 0.7906 0.99657 0.69941 0.72026 0.07608 0.76019 0.43586 0.46258 0.09813 0.80283 0.31414 0.78078 0.33994 0.67345 0.21226 0.62291 0.23153 0.24868 0.69212 0.49242 0.3917 0.86769 0.96098 0.6069 0.81673 0.8466 0.7201 0.12301 0.65337 0.42075 0.16094 0.38906 0.18508 0.90112 0.20676 0.91405 0.15364 0.40511 0.32554 0.77 [...]
+0.78273 0.42049 0.6817 0.72947 0.85931 0.10495 0.18915 0.79146 0.30079 0.63525 0.35506 0.20369 0.22485 0.10421 0.43562 0.53132 0.23033 0.65823 0.74902 0.84552 0.40393 0.36336 0.01057 0.37134 0.65789 0.56486 0.06733 0.5316 0.56441 0.19329 0.12906 0.46458 0.44314 0.35022 0.66856 0.96019 0.03993 0.37278 0.16366 0.32577 0.09361 0.53659 0.30897 0.49714 0.70687 0.55698 0.04026 0.97281 0.80252 0.32779 0.03689 0.84326 0.67254 0.25432 0.94331 0.87198 0.41724 0.80997 0.51113 0.70308 0.34873 0.4863 [...]
+0.01767 0.18173 0.94795 0.86176 0.99772 0.97232 0.71315 0.55023 0.41966 0.73385 0.53013 0.40165 0.04527 0.50389 0.28825 0.40002 0.72603 0.36112 0.10406 0.97086 0.66384 0.16508 0.79868 0.54912 0.50855 0.30831 0.61058 0.88997 0.13627 0.02053 0.09798 0.88451 0.54385 0.29896 0.77211 0.17652 0.89602 0.00631 0.86202 0.30162 0.35029 0.78809 0.31402 0.69152 0.8614 0.70354 0.08108 0.86256 0.17697 0.08876 0.80961 0.13051 0.68745 0.30948 0.29865 0.41161 0.02391 0.73462 0.27427 0.24129 0.99369 0.975 [...]
+0.36648 0.18773 0.69028 0.76408 0.3213 0.74812 0.86262 0.97188 0.96022 0.23871 0.32527 0.07251 0.40044 0.63917 0.40356 0.3372 0.36647 0.26303 0.92214 0.92916 0.37588 0.77602 0.75998 0.101 0.62141 0.95992 0.90775 0.63774 0.66648 0.28254 0.46794 0.96136 0.05916 0.68362 0.69534 0.82957 0.10004 0.41112 0.38731 0.18058 0.00288 0.96298 0.2562 0.3264 0.13865 0.33444 0.72843 0.413 0.92682 0.36625 0.58493 0.09915 0.74958 0.95099 0.06944 0.54033 0.21812 0.60448 0.26374 0.67623 0.97108 0.98552 0.46 [...]
+0.5865 0.32884 0.58855 0.37586 0.5499 0.47077 0.64958 0.47933 0.84331 0.79669 0.31582 0.61387 0.66096 0.6932 0.51597 0.68407 0.74059 0.61132 0.02176 0.70329 0.45362 0.63368 0.84131 0.80861 0.02503 0.65233 0.54847 0.84802 0.52489 0.78241 0.06523 0.54778 0.89745 0.20604 0.55554 0.74268 0.03847 0.3164 0.49604 0.09688 0.19266 0.41578 0.75082 0.03755 0.87862 0.49583 0.05656 0.74927 0.75671 0.22704 0.65069 0.69287 0.26906 0.07902 0.22204 0.69588 0.49424 0.3357 0.14899 0.90494 0.3377 0.50545 0. [...]
+0.88474 0.14322 0.47319 0.24937 0.21268 0.96021 0.19209 0.23445 0.04161 0.69744 0.63557 0.82738 0.52598 0.93053 0.25089 0.07532 0.34783 0.76775 0.08123 0.52667 0.56652 0.92743 0.04064 0.65007 0.35931 0.43457 0.6214 0.42601 0.83763 0.33966 0.27743 0.18221 0.54776 0.5961 0.37446 0.82168 0.29044 0.18684 0.10729 0.89425 0.40563 0.36202 0.99746 0.06225 0.7983 0.23013 0.02968 0.25081 0.65957 0.383 0.19066 0.85145 0.33045 0.62253 0.53132 0.27079 0.52604 0.79528 0.55982 0.36637 0.55718 0.5918 0. [...]
+0.86723 0.34514 0.91926 0.23447 0.56922 0.48483 0.59001 0.67963 0.48298 0.60036 0.4491 0.80093 0.5466 0.54657 0.96584 0.81968 0.01373 0.24192 0.35976 0.47795 0.51082 0.20841 0.35394 0.16221 0.00361 0.81679 0.31317 0.08864 0.50924 0.29414 0.20765 0.883 0.3155 0.7134 0.08759 0.43412 0.35502 0.80728 0.8614 0.30345 0.71611 0.68755 0.1029 0.91097 0.70727 0.96094 0.47001 0.57333 0.15268 0.62057 0.5556 0.65202 0.7559 0.13867 0.68577 0.22678 0.28088 0.1701 0.03396 0.32433 0.00935 0.07699 0.09743 [...]
+0.62697 0.80301 0.80822 0.76106 0.98943 0.41394 0.34867 0.72072 0.95908 0.0918 0.62516 0.50395 0.10101 0.58095 0.31734 0.73692 0.20999 0.05994 0.41569 0.90099 0.87751 0.66661 0.68784 0.35534 0.27542 0.91386 0.41357 0.07914 0.23736 0.86201 0.26024 0.91586 0.65566 0.03763 0.06858 0.30547 0.62527 0.50122 0.22675 0.32476 0.056 0.82925 0.26991 0.21845 0.30611 0.06833 0.82526 0.68132 0.63978 0.76781 0.7581 0.28527 0.07774 0.79633 0.63638 0.72067 0.62319 0.47876 0.72618 0.13937 0.19297 0.30137  [...]
+0.94522 0.87625 0.0818 0.21692 0.25708 0.78394 0.26935 0.09136 0.40354 0.20753 0.62067 0.92915 0.08892 0.68579 0.27203 0.68629 0.64762 0.93282 0.17166 0.83955 0.34257 0.87084 0.70001 0.73021 0.27014 0.71618 0.38578 0.16266 0.61653 0.46862 0.08344 0.05107 0.14868 0.22016 0.43957 0.34513 0.77163 0.47805 0.56116 0.30896 0.98473 0.37202 0.20786 0.62056 0.08522 0.46719 0.61171 0.61226 0.06565 0.93289 0.83992 0.8504 0.7446 0.50691 0.28135 0.11848 0.92189 0.21629 0.65723 0.398 0.61227 0.57042 0 [...]
+0.74185 0.22789 0.5072 0.9987 0.19777 0.90234 0.5443 0.25962 0.96155 0.37516 0.44587 0.94697 0.09488 0.14888 0.12861 0.49958 0.66124 0.08726 0.57904 0.52661 0.50945 0.24305 0.26217 0.23666 0.0806 0.37252 0.89671 0.45743 0.79986 0.77043 0.99609 0.19423 0.57215 0.80401 0.38432 0.3038 0.97457 0.19511 0.11391 0.67783 0.68401 0.80507 0.37776 0.71104 0.60883 0.22267 0.86978 0.18801 0.28112 0.63053 0.23136 0.62787 0.344 0.69723 0.07704 0.29497 0.24427 0.9664 0.112 0.43072 0.01161 0.44634 0.2921 [...]
+0.55475 0.81956 0.05512 0.37756 0.41704 0.41463 0.43218 0.00435 0.31268 0.72092 0.33241 0.46871 0.10381 0.2499 0.7406 0.57697 0.10302 0.79808 0.8399 0.67032 0.50108 0.30586 0.7941 0.56676 0.69154 0.51771 0.85436 0.2853 0.81089 0.60325 0.82934 0.01319 0.32457 0.16726 0.31206 0.89414 0.45333 0.04997 0.78111 0.22834 0.60384 0.62116 0.93983 0.79862 0.48014 0.7473 0.56386 0.02108 0.68454 0.08391 0.7374 0.16533 0.58383 0.20328 0.97903 0.22837 0.48613 0.30833 0.74874 0.32875 0.86879 0.56775 0.5 [...]
+0.92555 0.37029 0.78488 0.83075 0.11537 0.20106 0.46786 0.5234 0.40603 0.3838 0.71953 0.31364 0.13257 0.59184 0.69592 0.61781 0.00239 0.21652 0.89774 0.56839 0.42641 0.88134 0.93816 0.84255 0.20403 0.04906 0.00434 0.2716 0.32014 0.35548 0.95749 0.84488 0.44502 0.85755 0.07305 0.58855 0.66125 0.63704 0.19908 0.56869 0.91405 0.92272 0.19007 0.3271 0.89872 0.78319 0.28863 0.56516 0.41133 0.24759 0.3021 0.41616 0.05847 0.91526 0.58197 0.63658 0.63943 0.72948 0.55298 0.52622 0.90928 0.5475 0. [...]
+0.41699 0.99726 0.29036 0.47696 0.70663 0.93361 0.36649 0.92168 0.18904 0.31038 0.82421 0.86678 0.99431 0.37919 0.5685 0.62059 0.65934 0.47414 0.77129 0.19991 0.29399 0.52996 0.06508 0.22236 0.01484 0.29641 0.41984 0.3508 0.96105 0.39464 0.76685 0.3474 0.43847 0.02153 0.51981 0.63537 0.71619 0.16897 0.31229 0.87305 0.71479 0.09434 0.67519 0.11578 0.2258 0.74659 0.59972 0.85004 0.75374 0.12406 0.78563 0.09019 0.69506 0.19878 0.89853 0.38268 0.37212 0.77711 0.28932 0.50257 0.32936 0.85379  [...]
+0.72526 0.53898 0.3737 0.42695 0.392 0.90378 0.36537 0.90751 0.13055 0.39768 0.58965 0.60929 0.60671 0.79585 0.64087 0.21142 0.93253 0.61348 0.85087 0.74006 0.9982 0.61524 0.51452 0.3074 0.62102 0.02248 0.84701 0.82881 0.29815 0.70574 0.59477 0.45665 0.45757 0.12498 0.63859 0.72802 0.06132 0.64162 0.01213 0.81395 0.69564 0.83261 0.85374 0.81079 0.61458 0.42945 0.77065 0.77166 0.59178 0.82412 0.52129 0.78538 0.3465 0.1814 0.46826 0.08529 0.55136 0.09718 0.25207 0.55656 0.83666 0.40427 0.6 [...]
+0.42029 0.08712 0.13777 0.59165 0.45689 0.91422 0.40879 0.78671 0.43035 0.27994 0.28677 0.48719 0.07643 0.13792 0.94029 0.56325 0.2758 0.85748 0.59865 0.05955 0.23494 0.50836 0.02162 0.48475 0.41635 0.40248 0.95606 0.42358 0.32077 0.11672 0.93658 0.8644 0.92933 0.05411 0.07513 0.15159 0.86178 0.55479 0.9739 0.57905 0.30507 0.13134 0.14543 0.34957 0.0942 0.26137 0.004 0.01207 0.4319 0.23997 0.33418 0.13149 0.14485 0.76277 0.75366 0.99364 0.71598 0.55366 0.36341 0.74717 0.36689 0.91364 0.2 [...]
+0.06657 0.41946 0.34405 0.97075 0.62208 0.08608 0.41297 0.3874 0.1734 0.03659 0.71746 0.23024 0.61722 0.92311 0.96406 0.34265 0.7454 0.08099 0.23617 0.26025 0.12002 0.14029 0.69534 0.19968 0.7334 0.93629 0.85496 0.18536 0.55911 0.30429 0.45513 0.92689 0.62151 0.3314 0.35734 0.65993 0.0892 0.46357 0.59309 0.6018 0.15582 0.42856 0.83745 0.31328 0.55174 0.30946 0.94724 0.67421 0.39324 0.36704 0.4539 0.85614 0.87681 0.94184 0.9743 0.8562 0.4618 0.35073 0.36975 0.91533 0.15642 0.20471 0.58543 [...]
+0.0056 0.06271 0.06326 0.33486 0.26749 0.01081 0.0195 0.28365 0.42844 0.7617 0.29845 0.5569 0.36875 0.50254 0.66259 0.21979 0.43786 0.10024 0.57046 0.14639 0.48742 0.99572 0.02417 0.14034 0.26188 0.30076 0.6226 0.42182 0.71866 0.07451 0.70525 0.35638 0.27225 0.77113 0.3336 0.29716 0.70961 0.32221 0.4602 0.73886 0.99514 0.85462 0.28754 0.27202 0.06298 0.58867 0.69455 0.31453 0.98109 0.46863 0.17186 0.21144 0.76717 0.33887 0.52263 0.99289 0.04395 0.27494 0.51006 0.73887 0.76724 0.67869 0.7 [...]
+0.62844 0.79668 0.77778 0.53098 0.30349 0.62376 0.3388 0.65718 0.23066 0.3189 0.16897 0.28642 0.99847 0.33106 0.49546 0.91093 0.49638 0.2015 0.94241 0.97186 0.54412 0.45039 0.15352 0.11355 0.80168 0.08824 0.79157 0.39165 0.52167 0.24692 0.71549 0.60454 0.17239 0.44586 0.35998 0.95723 0.81692 0.84982 0.39092 0.42573 0.97796 0.08839 0.13081 0.73966 0.2015 0.10912 0.73724 0.69999 0.7639 0.03084 0.61441 0.54556 0.98514 0.08094 0.39484 0.75468 0.40477 0.80458 0.59688 0.39026 0.88668 0.43847 0 [...]
+0.9043 0.6134 0.88807 0.77982 0.91825 0.32686 0.37712 0.63422 0.93923 0.26773 0.25715 0.05216 0.98524 0.75179 0.07703 0.31776 0.69095 0.71537 0.50663 0.28505 0.8034 0.40153 0.09332 0.80741 0.11322 0.93524 0.4749 0.33263 0.25088 0.11605 0.38064 0.01779 0.58612 0.37667 0.41981 0.68184 0.65516 0.70387 0.98669 0.3224 0.95574 0.18272 0.01214 0.74587 0.25256 0.6602 0.42512 0.36439 0.4615 0.71047 0.5644 0.75907 0.23777 0.86059 0.0407 0.15735 0.04282 0.92232 0.99877 0.55366 0.94371 0.32204 0.725 [...]
+0.7415 0.79933 0.30901 0.75732 0.7534 0.93728 0.31377 0.56132 0.16926 0.57124 0.33707 0.4421 0.40478 0.27446 0.36246 0.87331 0.92884 0.79839 0.85431 0.71989 0.0597 0.6283 0.02662 0.92749 0.94709 0.25081 0.79324 0.26544 0.91189 0.88969 0.1868 0.29657 0.7623 0.70886 0.30152 0.87879 0.32036 0.33968 0.68241 0.37263 0.96145 0.22352 0.47194 0.75549 0.6093 0.22968 0.46419 0.85612 0.87882 0.50831 0.22688 0.64821 0.57059 0.07409 0.04748 0.09407 0.60756 0.885 0.45823 0.5746 0.92574 0.96712 0.76658 [...]
+0.78303 0.73838 0.52223 0.05102 0.76573 0.60758 0.27198 0.59855 0.71626 0.55011 0.22226 0.49715 0.22648 0.31723 0.70594 0.03505 0.37548 0.01522 0.28682 0.49574 0.01304 0.45732 0.45105 0.69085 0.75601 0.9582 0.73917 0.07521 0.91608 0.4924 0.29191 0.65099 0.78378 0.73683 0.28938 0.32506 0.63176 0.5295 0.97977 0.75056 0.26719 0.04534 0.24457 0.72698 0.25522 0.9108 0.93972 0.05971 0.05948 0.60174 0.66724 0.50139 0.33186 0.02941 0.92276 0.11203 0.20106 0.98609 0.31498 0.83537 0.06926 0.68704  [...]
+0.77806 0.23938 0.53218 0.81936 0.29073 0.54666 0.04988 0.18684 0.93923 0.36013 0.48399 0.83986 0.35154 0.30232 0.38915 0.61435 0.84783 0.16756 0.29972 0.83269 0.89857 0.44402 0.68855 0.71324 0.89156 0.21638 0.67817 0.61477 0.56574 0.90731 0.14709 0.06577 0.93626 0.43365 0.19711 0.85401 0.78453 0.59988 0.5014 0.27206 0.34932 0.6957 0.27422 0.37213 0.87093 0.50171 0.02224 0.86486 0.06054 0.27667 0.55069 0.63771 0.63391 0.35475 0.1677 0.84023 0.91577 0.60018 0.43935 0.50268 0.4361 0.61894  [...]
+0.95199 0.30873 0.70991 0.68296 0.22071 0.80905 0.40881 0.95396 0.15733 0.00264 0.03996 0.62082 0.96191 0.78605 0.20551 0.66739 0.34126 0.54956 0.05558 0.04234 0.61629 0.37119 0.38962 0.436 0.02781 0.01503 0.80609 0.92542 0.38583 0.87038 0.47753 0.82993 0.78137 0.46141 0.78394 0.37067 0.0128 0.39207 0.86698 0.82359 0.72011 0.96876 0.5291 0.35488 0.90069 0.71416 0.66776 0.3331 0.8893 0.17037 0.05666 0.45129 0.42971 0.25749 0.9159 0.2798 0.01043 0.56243 0.58133 0.20703 0.35905 0.001 0.9158 [...]
+0.37691 0.79011 0.69663 0.91707 0.92309 0.51055 0.11064 0.45999 0.00533 0.98275 0.45457 0.45612 0.70497 0.76879 0.70956 0.15625 0.20182 0.3339 0.65631 0.34689 0.96881 0.54091 0.55498 0.57515 0.60781 0.04219 0.31779 0.27895 0.48626 0.66141 0.60948 0.0783 0.13974 0.33224 0.56453 0.31703 0.26212 0.73818 0.4313 0.39405 0.98742 0.09068 0.81664 0.39138 0.52869 0.53833 0.79051 0.78176 0.70727 0.36661 0.38956 0.78509 0.1652 0.9442 0.99774 0.43597 0.26903 0.67984 0.47183 0.25325 0.94467 0.49641 0 [...]
+0.02772 0.96724 0.48165 0.8004 0.10378 0.84224 0.48877 0.52714 0.8867 0.43709 0.45111 0.35368 0.19402 0.96542 0.34115 0.62626 0.62954 0.61449 0.15543 0.94522 0.76592 0.3405 0.0351 0.66442 0.41692 0.95293 0.42331 0.20066 0.27462 0.4038 0.87735 0.31657 0.51149 0.39431 0.21932 0.98491 0.91701 0.16398 0.12937 0.55195 0.15402 0.50019 0.85905 0.41212 0.41241 0.31435 0.32742 0.94591 0.44577 0.2235 0.01945 0.60461 0.82808 0.00921 0.15962 0.21519 0.80863 0.64781 0.81919 0.76934 0.04336 0.46985 0. [...]
+0.72846 0.33202 0.32332 0.02868 0.9155 0.97215 0.08699 0.06784 0.86561 0.4737 0.55575 0.1204 0.39674 0.12119 0.97481 0.71886 0.59186 0.5794 0.33716 0.03846 0.7697 0.1011 0.26322 0.37312 0.05544 0.494 0.81202 0.88961 0.06738 0.74424 0.8519 0.81938 0.48395 0.01053 0.58166 0.15584 0.3946 0.58883 0.40763 0.93639 0.68892 0.60889 0.95887 0.31628 0.07758 0.46171 0.64631 0.84038 0.07247 0.34137 0.00234 0.07531 0.33739 0.88076 0.21803 0.35145 0.23425 0.2434 0.62756 0.82135 0.49356 0.605 0.24199 0 [...]
+0.79803 0.31284 0.89317 0.82228 0.31252 0.28192 0.53763 0.3138 0.63107 0.92652 0.82655 0.19994 0.45152 0.32114 0.50242 0.85116 0.16376 0.49185 0.75364 0.81767 0.45328 0.62582 0.54694 0.39378 0.63038 0.58869 0.58754 0.93375 0.26078 0.93074 0.19421 0.96743 0.48646 0.87116 0.151 0.52046 0.36658 0.27215 0.64334 0.24218 0.64084 0.25151 0.46264 0.77381 0.07062 0.31165 0.82197 0.2131 0.95332 0.53807 0.44365 0.51935 0.07145 0.04878 0.79107 0.7155 0.33005 0.93222 0.01896 0.69818 0.88602 0.89538 0 [...]
+0.15913 0.80458 0.63116 0.1055 0.76605 0.17739 0.86101 0.73737 0.11618 0.77768 0.63945 0.42185 0.32641 0.44 0.63778 0.75046 0.00272 0.60107 0.3221 0.40163 0.80498 0.86073 0.59464 0.02984 0.69817 0.92432 0.86609 0.01915 0.06939 0.56669 0.95658 0.99553 0.623 0.5664 0.90456 0.61479 0.05416 0.34194 0.2751 0.49643 0.34371 0.74779 0.08734 0.27383 0.9664 0.8301 0.64402 0.59975 0.82752 0.39857 0.40727 0.98674 0.27079 0.07025 0.57501 0.41961 0.71743 0.33864 0.57926 0.75892 0.47795 0.02019 0.74847 [...]
+0.38603 0.12026 0.7943 0.95704 0.46132 0.8674 0.64697 0.68606 0.17055 0.06478 0.49046 0.7653 0.65736 0.81072 0.55854 0.51543 0.07814 0.63301 0.30351 0.25194 0.57746 0.71346 0.82518 0.90723 0.47574 0.52524 0.47288 0.59062 0.02592 0.29577 0.29069 0.98161 0.73783 0.91303 0.13558 0.31467 0.09037 0.93588 0.52382 0.86769 0.44794 0.11755 0.29466 0.19819 0.13568 0.52543 0.03901 0.51813 0.21698 0.29558 0.77116 0.23515 0.49137 0.12483 0.2186 0.16365 0.4151 0.16626 0.90386 0.41082 0.35355 0.40924 0 [...]
+0.28958 0.39039 0.31818 0.15441 0.28399 0.2078 0.89346 0.37491 0.34533 0.87191 0.86209 0.77957 0.63988 0.04778 0.28597 0.13334 0.09868 0.23637 0.69152 0.31329 0.36196 0.31946 0.24284 0.80557 0.72827 0.03248 0.26179 0.75744 0.4779 0.14934 0.01458 0.51487 0.72058 0.36409 0.16971 0.19568 0.03669 0.44132 0.26594 0.54982 0.43957 0.5106 0.34543 0.92509 0.52177 0.35825 0.20677 0.43889 0.31076 0.11166 0.9016 0.68768 0.46363 0.44897 0.25784 0.73516 0.53161 0.68117 0.78515 0.87939 0.51728 0.67685  [...]
+0.69045 0.72241 0.8183 0.99228 0.58134 0.48477 0.47914 0.72697 0.17749 0.31546 0.69201 0.28238 0.35711 0.78022 0.71035 0.85922 0.42642 0.23925 0.65503 0.03443 0.19332 0.69104 0.37473 0.02908 0.78844 0.21061 0.18657 0.52086 0.98261 0.19187 0.29052 0.56305 0.15898 0.56314 0.70801 0.33517 0.83376 0.09129 0.08951 0.1635 0.70158 0.3084 0.71741 0.70614 0.69225 0.27314 0.34497 0.51448 0.44695 0.07389 0.31201 0.17144 0.60372 0.54459 0.2745 0.20343 0.81526 0.18743 0.71979 0.12827 0.22131 0.94938  [...]
+0.86281 0.9269 0.78648 0.4148 0.33673 0.25028 0.00927 0.74159 0.50876 0.39758 0.16536 0.31607 0.54106 0.6851 0.81302 0.37591 0.77094 0.55117 0.61836 0.8453 0.24805 0.32231 0.67193 0.58083 0.84989 0.67466 0.27109 0.46935 0.96655 0.88414 0.8674 0.00201 0.73976 0.18029 0.05709 0.78363 0.91232 0.17444 0.79387 0.26742 0.80608 0.38909 0.03176 0.25161 0.04077 0.20491 0.23565 0.57116 0.84174 0.10592 0.84779 0.58989 0.97478 0.08611 0.44861 0.8018 0.83891 0.73309 0.82018 0.59644 0.72673 0.99091 0. [...]
+0.31443 0.59974 0.0873 0.51807 0.368 0.29798 0.81073 0.84595 0.14567 0.18683 0.02399 0.8655 0.70785 0.13185 0.08098 0.89654 0.99625 0.68896 0.0179 0.97083 0.42392 0.92509 0.94515 0.67049 0.86168 0.76555 0.35273 0.83259 0.40084 0.81201 0.91809 0.81902 0.5 0.76451 0.828 0.16768 0.64051 0.37775 0.90737 0.73813 0.57233 0.19944 0.07946 0.91299 0.9803 0.78322 0.10682 0.65724 0.78391 0.08388 0.89677 0.63585 0.87279 0.76674 0.46489 0.47953 0.26166 0.21832 0.20866 0.0894 0.98301 0.74232 0.95208 0 [...]
+0.44209 0.80221 0.67777 0.16565 0.60623 0.191 0.33792 0.14079 0.02094 0.43349 0.61408 0.58493 0.17374 0.97498 0.6065 0.80324 0.06478 0.97266 0.81995 0.49735 0.54869 0.31272 0.54308 0.35407 0.25735 0.22381 0.9471 0.11424 0.64325 0.91565 0.8822 0.20289 0.8691 0.15824 0.68702 0.2408 0.52634 0.50181 0.54182 0.30788 0.10909 0.44156 0.96869 0.05323 0.78212 0.06644 0.55169 0.72908 0.89248 0.6663 0.67668 0.32629 0.62706 0.19958 0.84853 0.94251 0.02287 0.14303 0.31919 0.13787 0.95033 0.30642 0.07 [...]
+0.73153 0.345 0.62614 0.8551 0.76063 0.56231 0.77627 0.15278 0.62309 0.23935 0.63581 0.15621 0.65787 0.63165 0.06753 0.156 0.14806 0.73494 0.79711 0.99734 0.51965 0.4985 0.48137 0.97143 0.12279 0.11278 0.17371 0.90336 0.31707 0.97139 0.76275 0.73616 0.22825 0.6068 0.52313 0.72339 0.98398 0.40268 0.33512 0.43513 0.81784 0.037 0.96051 0.4553 0.70287 0.60886 0.51427 0.27054 0.78147 0.17103 0.28918 0.5902 0.13484 0.45657 0.11767 0.39379 0.67848 0.05241 0.48397 0.76438 0.09061 0.61294 0.35388 [...]
+0.91623 0.18559 0.71007 0.9749 0.74473 0.91693 0.12761 0.13844 0.96777 0.88134 0.55924 0.70313 0.7677 0.18281 0.48712 0.49365 0.08056 0.49663 0.86884 0.67154 0.87881 0.04434 0.01312 0.42752 0.14238 0.3304 0.63514 0.91826 0.30624 0.71613 0.458 0.71627 0.19562 0.72274 0.61615 0.24823 0.08298 0.11672 0.63182 0.79952 0.69741 0.22212 0.45491 0.68658 0.98179 0.7878 0.8015 0.12774 0.38838 0.38426 0.60568 0.31837 0.44033 0.76178 0.92982 0.15386 0.21036 0.46427 0.66588 0.85693 0.03576 0.8877 0.73 [...]
+0.7641 0.31581 0.48062 0.56847 0.63767 0.07914 0.84826 0.71109 0.50806 0.02556 0.12891 0.562 0.23149 0.94167 0.73896 0.72672 0.19848 0.03522 0.88216 0.97674 0.17386 0.91569 0.42881 0.74451 0.37475 0.20603 0.94039 0.80813 0.34023 0.72126 0.69402 0.11776 0.72842 0.82569 0.56883 0.13677 0.28805 0.09239 0.32954 0.52977 0.12738 0.76572 0.89258 0.5745 0.79529 0.71745 0.4629 0.45809 0.49049 0.86851 0.86749 0.706 0.63884 0.59212 0.67829 0.9226 0.14832 0.14474 0.57939 0.6344 0.72425 0.00471 0.918 [...]
+0.67597 0.00994 0.93812 0.17205 0.70167 0.04514 0.88239 0.61249 0.30948 0.94786 0.95552 0.81718 0.56356 0.45154 0.7966 0.48411 0.4841 0.97472 0.30377 0.06014 0.1114 0.45575 0.03684 0.42498 0.15574 0.33186 0.56468 0.71506 0.8899 0.6398 0.49753 0.25259 0.86355 0.96767 0.36824 0.06737 0.73784 0.01345 0.38447 0.06845 0.71067 0.68437 0.15222 0.58521 0.859 0.46842 0.86883 0.19662 0.3032 0.1675 0.6087 0.90818 0.16578 0.62343 0.37018 0.0099 0.56376 0.03887 0.45852 0.24022 0.08674 0.72684 0.97305 [...]
+0.7149 0.20734 0.1773 0.97669 0.71772 0.36272 0.95826 0.97707 0.29826 0.79756 0.37746 0.05699 0.66542 0.76691 0.94339 0.55208 0.09833 0.6667 0.75248 0.10642 0.72223 0.82177 0.62853 0.993 0.36104 0.86031 0.23929 0.50835 0.5204 0.46596 0.13975 0.4685 0.13129 0.39483 0.08387 0.91401 0.93873 0.58877 0.60652 0.52217 0.24604 0.70358 0.989 0.51169 0.3887 0.57767 0.2909 0.47914 0.49747 0.82736 0.75347 0.25268 0.22844 0.82315 0.96834 0.94565 0.36331 0.71336 0.42076 0.901 0.42595 0.58641 0.13754 0 [...]
+0.71578 0.77051 0.81384 0.83138 0.46505 0.69842 0.01746 0.72137 0.04957 0.3887 0.54226 0.87647 0.59031 0.53603 0.64904 0.44412 0.54697 0.85783 0.91786 0.0523 0.09001 0.57969 0.60015 0.71428 0.80739 0.74679 0.864 0.20569 0.37569 0.51684 0.77786 0.01782 0.62287 0.36667 0.31127 0.49158 0.5164 0.37554 0.0414 0.72038 0.47365 0.78289 0.98588 0.60486 0.56242 0.11171 0.3438 0.36878 0.76782 0.20488 0.90694 0.75934 0.68645 0.91932 0.76497 0.89175 0.62603 0.71258 0.55878 0.95805 0.60757 0.51668 0.3 [...]
+0.95547 0.02275 0.34381 0.30681 0.21863 0.18949 0.73591 0.46263 0.23774 0.67501 0.54005 0.74406 0.86836 0.4362 0.8181 0.44957 0.60052 0.43921 0.08723 0.68972 0.10384 0.36135 0.77816 0.65572 0.20401 0.23633 0.627 0.34661 0.16385 0.85907 0.72909 0.72213 0.29167 0.49983 0.20618 0.33654 0.342 0.25118 0.75918 0.19785 0.53191 0.77563 0.371 0.28817 0.9945 0.54481 0.80352 0.57572 0.45463 0.87744 0.37598 0.76887 0.12445 0.09336 0.79036 0.72601 0.1796 0.39747 0.47021 0.35475 0.92442 0.23868 0.8207 [...]
+0.75919 0.07889 0.619 0.1977 0.63441 0.09465 0.88976 0.7822 0.74212 0.24959 0.73686 0.05483 0.17901 0.20305 0.99417 0.58981 0.85024 0.26143 0.12642 0.42034 0.90531 0.44892 0.23064 0.91636 0.51821 0.95982 0.93887 0.33273 0.67432 0.44656 0.06539 0.34905 0.10713 0.68279 0.39866 0.14028 0.56392 0.6849 0.34042 0.2126 0.54194 0.86588 0.5381 0.25165 0.75266 0.26855 0.19687 0.69503 0.28346 0.13424 0.4388 0.93865 0.90699 0.03584 0.49658 0.56198 0.22985 0.40577 0.58249 0.60626 0.08085 0.33097 0.90 [...]
+0.84776 0.04822 0.90937 0.10122 0.85762 0.68214 0.73526 0.66956 0.21629 0.25255 0.80673 0.26068 0.56848 0.07846 0.1642 0.12573 0.48654 0.75303 0.47644 0.94625 0.4005 0.38849 0.65767 0.33705 0.14749 0.74375 0.98172 0.14925 0.37543 0.98442 0.88674 0.93935 0.44625 0.55194 0.96126 0.39657 0.39784 0.39484 0.00538 0.91189 0.05545 0.46617 0.41319 0.43942 0.30465 0.63906 0.42715 0.04077 0.16922 0.78253 0.28478 0.21918 0.5221 0.63243 0.25215 0.7519 0.93778 0.39299 0.73269 0.30685 0.06148 0.03145  [...]
+0.82167 0.25472 0.71455 0.92431 0.03126 0.01649 0.78869 0.25496 0.53703 0.88726 0.4542 0.93253 0.13661 0.12205 0.21683 0.12682 0.26901 0.43328 0.1354 0.26882 0.0002 0.87291 0.34258 0.52535 0.26786 0.53945 0.42783 0.88116 0.33019 0.6332 0.69964 0.35808 0.21696 0.80461 0.04718 0.30704 0.9754 0.79137 0.23684 0.18253 0.05233 0.5666 0.15612 0.25462 0.02352 0.79011 0.38905 0.20071 0.75554 0.53359 0.32873 0.3157 0.43448 0.74885 0.57408 0.76321 0.54977 0.5133 0.61989 0.71044 0.23879 0.42896 0.29 [...]
+0.44775 0.14334 0.77807 0.81755 0.18014 0.81917 0.69863 0.14055 0.153 0.02251 0.44078 0.45172 0.29758 0.34279 0.84669 0.41621 0.60622 0.29562 0.14747 0.00796 0.00104 0.65946 0.1105 0.55308 0.76965 0.66155 0.33752 0.08775 0.98418 0.24282 0.5835 0.08332 0.39451 0.39677 0.38961 0.56111 0.37295 0.76099 0.47789 0.44022 0.87792 0.7758 0.51635 0.10859 0.33532 0.62691 0.60359 0.26554 0.40349 0.58266 0.89421 0.0522 0.79496 0.31164 0.60835 0.48697 0.14153 0.39682 0.52911 0.1335 0.3548 0.96942 0.12 [...]
+0.04696 0.35373 0.74835 0.91848 0.57443 0.87605 0.20547 0.61252 0.51429 0.10863 0.30084 0.23699 0.63216 0.05914 0.91451 0.93616 0.68829 0.77941 0.94881 0.63954 0.51915 0.23544 0.88742 0.61699 0.84455 0.54991 0.61288 0.01995 0.95106 0.18275 0.33277 0.21735 0.58984 0.66004 0.98293 0.4848 0.0765 0.90501 0.42629 0.38387 0.46741 0.2169 0.1852 0.22876 0.373 0.41719 0.87898 0.12213 0.57747 0.25508 0.15244 0.61874 0.62497 0.74164 0.91265 0.52312 0.1571 0.0259 0.40757 0.58295 0.38871 0.8502 0.056 [...]
+0.9959 0.77887 0.68311 0.09781 0.50572 0.7166 0.90246 0.72524 0.42747 0.66314 0.89466 0.63072 0.21678 0.52096 0.10638 0.89147 0.84174 0.15081 0.58861 0.47685 0.72524 0.88557 0.61205 0.99926 0.42382 0.97863 0.12116 0.03325 0.47358 0.53688 0.57397 0.03103 0.096 0.71777 0.41078 0.29478 0.06922 0.15444 0.06917 0.87907 0.23909 0.05574 0.71818 0.39741 0.93205 0.75767 0.81635 0.70089 0.65176 0.94922 0.66719 0.03642 0.21034 0.21451 0.17572 0.75045 0.8494 0.52468 0.59696 0.35061 0.67771 0.15593 0 [...]
+0.1919 0.47835 0.54898 0.9943 0.22274 0.9482 0.85088 0.43173 0.32846 0.24439 0.14515 0.69753 0.8887 0.60968 0.41119 0.06516 0.46753 0.77871 0.66946 0.39242 0.98961 0.32635 0.23771 0.62118 0.83095 0.65328 0.53128 0.27659 0.72546 0.07024 0.39598 0.76975 0.33294 0.3416 0.74669 0.53686 0.56576 0.42304 0.4954 0.16928 0.34045 0.65862 0.00958 0.41189 0.13255 0.37325 0.49546 0.52459 0.81501 0.56686 0.19822 0.09142 0.37996 0.74661 0.08587 0.32026 0.49963 0.47447 0.39506 0.49493 0.8943 0.21651 0.4 [...]
+0.14265 0.06724 0.41029 0.31735 0.08561 0.61967 0.09926 0.97622 0.16231 0.42347 0.43958 0.63897 0.12284 0.60449 0.87471 0.9435 0.40526 0.53533 0.49296 0.38817 0.29521 0.85126 0.12995 0.21472 0.03992 0.18822 0.47135 0.90292 0.58156 0.05663 0.45004 0.09678 0.95229 0.0843 0.12737 0.82309 0.01993 0.72523 0.72876 0.81899 0.52793 0.86691 0.10826 0.79341 0.89056 0.60849 0.16319 0.56547 0.61414 0.65008 0.47212 0.88555 0.8227 0.02446 0.03233 0.51447 0.4457 0.09327 0.57272 0.26714 0.84315 0.68413  [...]
+0.05172 0.32769 0.90359 0.66075 0.41823 0.24634 0.95285 0.63928 0.54889 0.32733 0.05493 0.68442 0.35788 0.3207 0.34557 0.29639 0.14423 0.25131 0.89775 0.682 0.19748 0.51896 0.08859 0.96864 0.2797 0.36312 0.41854 0.23927 0.36276 0.83564 0.48963 0.70515 0.31705 0.36817 0.03975 0.84279 0.20182 0.65672 0.44172 0.06392 0.42903 0.04213 0.04885 0.5664 0.26599 0.11385 0.53441 0.81042 0.09122 0.22208 0.70313 0.76745 0.42741 0.62333 0.55652 0.4402 0.60242 0.21215 0.1705 0.63982 0.702 0.41534 0.791 [...]
+0.1492 0.36811 0.24437 0.89965 0.44562 0.07384 0.54693 0.2608 0.64076 0.00359 0.49988 0.2035 0.32251 0.96863 0.23595 0.06825 0.61917 0.14293 0.12375 0.98344 0.73148 0.76797 0.49671 0.99738 0.34515 0.51995 0.5199 0.75769 0.24063 0.54405 0.49032 0.93804 0.158 0.2621 0.18716 0.90165 0.47891 0.78208 0.36704 0.42571 0.8508 0.04315 0.67344 0.21979 0.84074 0.53156 0.81113 0.93747 0.40536 0.85741 0.95139 0.39207 0.09721 0.64701 0.13817 0.69592 0.55412 0.47586 0.31613 0.30423 0.23725 0.21363 0.48 [...]
+0.79674 0.92092 0.61445 0.10763 0.49936 0.40264 0.58336 0.37461 0.99746 0.50029 0.33688 0.23198 0.98006 0.93879 0.45243 0.0284 0.76628 0.4389 0.61579 0.52096 0.84372 0.0726 0.90195 0.3712 0.06834 0.39896 0.26621 0.75441 0.0027 0.00559 0.80492 0.92001 0.73297 0.88205 0.68907 0.53837 0.80562 0.53025 0.73228 0.21145 0.81963 0.55405 0.10761 0.82065 0.33185 0.53082 0.31627 0.0994 0.92765 0.96459 0.10989 0.80077 0.86668 0.22756 0.59223 0.91036 0.61373 0.752 0.45759 0.83064 0.12288 0.48573 0.59 [...]
+0.80036 0.68327 0.49825 0.41002 0.40837 0.52842 0.26552 0.06056 0.88213 0.92656 0.15896 0.60398 0.7365 0.5042 0.37902 0.45003 0.75984 0.03415 0.87421 0.14563 0.20276 0.54312 0.51363 0.39529 0.29396 0.45888 0.24806 0.20987 0.9311 0.83435 0.29996 0.60547 0.34079 0.40271 0.2603 0.02613 0.0222 0.44798 0.17734 0.19861 0.43547 0.92961 0.00492 0.56741 0.37157 0.79252 0.13628 0.59412 0.88189 0.76501 0.48249 0.99338 0.7276 0.35631 0.4929 0.86524 0.08344 0.61752 0.44705 0.99695 0.41822 0.48731 0.6 [...]
+0.40842 0.69024 0.35548 0.96683 0.6614 0.4713 0.69516 0.99895 0.77789 0.22375 0.93494 0.12967 0.44879 0.73378 0.06833 0.51584 0.25478 0.55137 0.69981 0.81169 0.44554 0.58512 0.84885 0.66495 0.25151 0.96969 0.60811 0.69768 0.8514 0.21086 0.29799 0.84257 0.10748 0.30459 0.58951 0.66659 0.95388 0.00775 0.07047 0.22521 0.42376 0.97234 0.60067 0.69786 0.56396 0.99729 0.37969 0.83628 0.7632 0.67013 0.15697 0.65195 0.06282 0.76809 0.50286 0.99027 0.63591 0.50662 0.29217 0.20768 0.23477 0.87319  [...]
+0.76694 0.68192 0.27503 0.71657 0.74462 0.00985 0.91794 0.97689 0.05768 0.83681 0.73736 0.99175 0.14872 0.60553 0.07311 0.76511 0.65919 0.11445 0.81482 0.63215 0.96328 0.49877 0.11806 0.37177 0.57413 0.07102 0.76941 0.51523 0.77042 0.21825 0.96187 0.78838 0.02465 0.58121 0.8585 0.66516 0.0247 0.18769 0.35967 0.97773 0.50372 0.6164 0.49203 0.68 0.46782 0.78326 0.53521 0.6173 0.47397 0.99898 0.35458 0.30482 0.38549 0.20302 0.17966 0.45765 0.20103 0.29826 0.97645 0.02873 0.73096 0.83721 0.9 [...]
+0.04837 0.63301 0.21241 0.79226 0.95044 0.73024 0.30701 0.71049 0.49169 0.28109 0.46003 0.67964 0.8434 0.33263 0.12264 0.21944 0.20606 0.55393 0.67165 0.79858 0.40713 0.44692 0.20432 0.76345 0.90558 0.80845 0.58706 0.92967 0.66269 0.416 0.59285 0.05503 0.00974 0.35846 0.36232 0.36585 0.73485 0.05683 0.2247 0.81652 0.14026 0.64293 0.33885 0.3445 0.90356 0.33977 0.04219 0.42091 0.69357 0.63295 0.14166 0.04619 0.03997 0.70542 0.45189 0.79501 0.22412 0.08849 0.86391 0.15103 0.99355 0.67345 0 [...]
+0.88968 0.60647 0.6384 0.95283 0.99071 0.39312 0.29908 0.72508 0.98107 0.81606 0.71944 0.80331 0.51892 0.39355 0.68138 0.033 0.23641 0.25405 0.8755 0.18324 0.12903 0.50239 0.6349 0.93828 0.66386 0.11626 0.92372 0.53058 0.36206 0.35943 0.14859 0.29855 0.77776 0.33429 0.77699 0.52782 0.09908 0.02947 0.1805 0.14584 0.19934 0.579 0.44482 0.01367 0.72164 0.29441 0.41901 0.02644 0.59615 0.63102 0.07777 0.21823 0.30342 0.55284 0.43724 0.38378 0.62005 0.16667 0.16725 0.24855 0.99 0.12399 0.96895 [...]
+0.32862 0.14224 0.12 0.45044 0.63194 0.86488 0.42214 0.45294 0.24987 0.35954 0.36475 0.68001 0.6152 0.63959 0.92643 0.80339 0.17373 0.75452 0.72187 0.40225 0.90475 0.16122 0.23559 0.51736 0.31035 0.61863 0.4113 0.33122 0.90626 0.92641 0.02418 0.37604 0.42085 0.21767 0.64862 0.62071 0.78554 0.36473 0.18428 0.98517 0.85843 0.94235 0.353 0.78921 0.84289 0.63958 0.94695 0.23133 0.47219 0.15703 0.52491 0.85653 0.96458 0.92682 0.19567 0.37869 0.77299 0.85044 0.09342 0.78967 0.05203 0.61064 0.5 [...]
+0.48088 0.16744 0.32094 0.69485 0.04676 0.0685 0.86677 0.35318 0.2672 0.63887 0.605 0.1342 0.11729 0.82514 0.32095 0.66566 0.76997 0.08735 0.26588 0.7662 0.95625 0.14823 0.81581 0.38624 0.936 0.68583 0.8042 0.68984 0.59568 0.57263 0.45465 0.42074 0.03033 0.89155 0.38173 0.77239 0.57601 0.21114 0.91095 0.69291 0.27029 0.20736 0.71259 0.24165 0.11073 0.9905 0.25678 0.22683 0.42713 0.3514 0.06852 0.96238 0.96676 0.6625 0.9065 0.9285 0.78021 0.37623 0.4243 0.62223 0.17675 0.57203 0.0741 0.98 [...]
+0.9577 0.80522 0.66019 0.46584 0.54692 0.26425 0.10284 0.03997 0.40285 0.03076 0.6246 0.96989 0.83383 0.88833 0.56917 0.98499 0.36811 0.9254 0.89115 0.19559 0.40515 0.90252 0.86094 0.70787 0.04605 0.20725 0.73276 0.80283 0.48784 0.91001 0.72645 0.72861 0.56539 0.24939 0.77107 0.15029 0.25019 0.3982 0.1304 0.78632 0.94364 0.88296 0.52336 0.95422 0.19168 0.64179 0.61571 0.91129 0.03839 0.18221 0.14364 0.6999 0.93849 0.91277 0.39077 0.04882 0.36222 0.86385 0.7535 0.63478 0.27444 0.38309 0.2 [...]
+0.50601 0.7439 0.99452 0.77584 0.33993 0.71261 0.66636 0.46618 0.90296 0.72495 0.38127 0.38133 0.77165 0.66536 0.19969 0.61742 0.24429 0.07537 0.86095 0.55016 0.71319 0.61123 0.30335 0.81436 0.01729 0.22855 0.48926 0.66063 0.33964 0.61161 0.92721 0.91749 0.35436 0.82296 0.41714 0.31272 0.38676 0.38657 0.05936 0.23419 0.68942 0.98981 0.00021 0.15106 0.55843 0.24364 0.36671 0.26127 0.79532 0.35559 0.08332 0.13145 0.52411 0.53121 0.66643 0.58716 0.65275 0.04948 0.70121 0.08934 0.78131 0.545 [...]
+0.91231 0.63767 0.00506 0.28607 0.19614 0.48013 0.05263 0.09614 0.51001 0.46738 0.56333 0.20754 0.47478 0.14373 0.49006 0.32107 0.0706 0.62874 0.40322 0.1111 0.10915 0.83422 0.30892 0.19422 0.57294 0.70436 0.28271 0.95537 0.61156 0.82019 0.5089 0.59744 0.95333 0.28625 0.96813 0.61086 0.00442 0.58233 0.09127 0.17516 0.06343 0.74113 0.69824 0.51673 0.71014 0.58946 0.77779 0.14563 0.63768 0.43097 0.86862 0.01166 0.12634 0.9337 0.43972 0.28983 0.24292 0.97858 0.3368 0.64596 0.62956 0.21466 0 [...]
+0.02192 0.84919 0.78434 0.81334 0.8816 0.07266 0.91924 0.26509 0.35968 0.90131 0.60132 0.08066 0.15897 0.0075 0.69019 0.48838 0.60834 0.79218 0.03808 0.97079 0.26212 0.48313 0.20162 0.64371 0.74245 0.04917 0.68644 0.23804 0.03936 0.0451 0.65598 0.46518 0.30896 0.80501 0.1865 0.0792 0.87211 0.398 0.00576 0.02365 0.34446 0.7441 0.51789 0.7746 0.68579 0.56288 0.85522 0.65885 0.27887 0.4303 0.74096 0.22369 0.03324 0.41199 0.62768 0.01372 0.49743 0.92926 0.41159 0.00032 0.2758 0.69055 0.81742 [...]
+0.96612 0.54042 0.64233 0.66201 0.22188 0.43645 0.76308 0.82783 0.18324 0.91973 0.27626 0.12288 0.6729 0.47582 0.48613 0.63296 0.27365 0.08683 0.36493 0.45813 0.77398 0.6947 0.12898 0.42669 0.07441 0.58339 0.6181 0.74674 0.28322 0.27583 0.24852 0.40993 0.96874 0.97527 0.28599 0.68863 0.56104 0.62518 0.85697 0.17572 0.24723 0.76799 0.56726 0.93588 0.89842 0.02221 0.28733 0.10523 0.25049 0.65419 0.52529 0.48953 0.35627 0.63604 0.95358 0.66293 0.19106 0.94134 0.88817 0.85097 0.1173 0.78699  [...]
+0.10554 0.01831 0.40472 0.48378 0.49293 0.61977 0.87415 0.38983 0.6734 0.94135 0.95262 0.63896 0.58528 0.16841 0.95784 0.35951 0.97278 0.75266 0.37454 0.3326 0.39518 0.61532 0.38964 0.08586 0.62979 0.67716 0.12926 0.81877 0.71391 0.47198 0.04036 0.03859 0.56482 0.26113 0.10738 0.09602 0.00748 0.98767 0.07693 0.86157 0.63067 0.53834 0.0768 0.33043 0.13412 0.74578 0.73379 0.9926 0.31591 0.01378 0.24366 0.16669 0.46334 0.83138 0.66298 0.17112 0.76984 0.09901 0.04374 0.93612 0.80735 0.77383  [...]
+0.66154 0.78401 0.89135 0.29459 0.48367 0.55874 0.67928 0.54319 0.32657 0.26 0.99104 0.98988 0.17954 0.57886 0.85892 0.09085 0.33573 0.92423 0.10685 0.45292 0.85563 0.30166 0.31784 0.82458 0.56198 0.62525 0.12085 0.96222 0.01263 0.98703 0.44127 0.6899 0.3044 0.67191 0.32726 0.8236 0.53941 0.15669 0.28074 0.83477 0.84421 0.33773 0.89603 0.97573 0.57485 0.51482 0.04429 0.39916 0.26904 0.8341 0.54526 0.18025 0.13149 0.12513 0.47079 0.15737 0.27183 0.94196 0.06959 0.37391 0.52493 0.68656 0.0 [...]
+0.50579 0.70707 0.51616 0.0814 0.76263 0.36461 0.88711 0.19822 0.93426 0.45722 0.33946 0.66578 0.50219 0.22043 0.97986 0.66306 0.39676 0.11145 0.9092 0.97995 0.56014 0.28152 0.37938 0.49059 0.54931 0.29328 0.17388 0.91741 0.32787 0.31764 0.38548 0.21353 0.74369 0.45258 0.09946 0.3895 0.20692 0.02804 0.89562 0.17826 0.05382 0.19833 0.32876 0.62764 0.08566 0.13004 0.12509 0.66796 0.64397 0.61349 0.91918 0.65527 0.20772 0.30675 0.06776 0.04889 0.03766 0.39342 0.14736 0.3714 0.86244 0.27514  [...]
+0.89673 0.32486 0.93177 0.45316 0.70432 0.52045 0.06805 0.65365 0.94154 0.9602 0.14319 0.40774 0.96896 0.63705 0.14404 0.74865 0.8847 0.67989 0.38937 0.77133 0.90808 0.35143 0.4292 0.27633 0.93225 0.29966 0.54449 0.55443 0.29689 0.012 0.2037 0.79055 0.49372 0.13125 0.6649 0.42924 0.55535 0.74015 0.31343 0.01601 0.34004 0.26428 0.11419 0.07187 0.47381 0.13413 0.08102 0.83238 0.87073 0.41371 0.82005 0.95422 0.46456 0.03909 0.01326 0.53858 0.63405 0.63425 0.69542 0.65788 0.28049 0.86981 0.5 [...]
+0.26666 0.52851 0.90239 0.61732 0.56516 0.08031 0.29176 0.6098 0.98535 0.69089 0.25206 0.99468 0.73164 0.68854 0.46772 0.93757 0.50111 0.47432 0.61323 0.27512 0.26545 0.47705 0.5753 0.22868 0.43936 0.51153 0.86241 0.03078 0.13826 0.49597 0.69276 0.47342 0.21258 0.38893 0.42948 0.16281 0.15207 0.50279 0.10679 0.19651 0.48926 0.37871 0.55068 0.48239 0.79961 0.5703 0.14877 0.40973 0.19942 0.12176 0.46772 0.2775 0.19428 0.04914 0.72662 0.0493 0.58383 0.47532 0.92688 0.31673 0.19448 0.1923 0. [...]
+0.01111 0.38429 0.67335 0.71146 0.27334 0.54189 0.00994 0.74523 0.32381 0.75234 0.09929 0.60251 0.3113 0.81239 0.75915 0.27466 0.84925 0.83336 0.75979 0.06185 0.97194 0.49305 0.84202 0.98265 0.17997 0.10145 0.84612 0.51327 0.76695 0.56316 0.22846 0.69821 0.2812 0.74943 0.43758 0.87697 0.39583 0.31347 0.76546 0.83486 0.80362 0.70129 0.88385 0.42205 0.20239 0.5476 0.89825 0.21207 0.1266 0.50336 0.89884 0.88657 0.89772 0.99289 0.72547 0.08183 0.59177 0.84066 0.93291 0.22996 0.97074 0.53742  [...]
+0.55957 0.57446 0.05864 0.2656 0.95552 0.9254 0.16234 0.40184 0.22112 0.18328 0.29197 0.47542 0.45252 0.02188 0.57363 0.59392 0.75146 0.09192 0.89662 0.49137 0.37585 0.30698 0.7284 0.81507 0.17841 0.3595 0.52658 0.58424 0.12117 0.47577 0.63973 0.52348 0.67365 0.21384 0.69295 0.08516 0.71368 0.85299 0.49359 0.8699 0.77824 0.34305 0.7673 0.39026 0.86841 0.83665 0.6749 0.74727 0.01924 0.35488 0.20345 0.18743 0.14054 0.09161 0.07225 0.70681 0.29269 0.76726 0.39674 0.62258 0.32466 0.28769 0.5 [...]
+0.984 0.58049 0.46668 0.96075 0.28197 0.05803 0.31774 0.86364 0.68979 0.56273 0.12024 0.01461 0.67519 0.77031 0.36315 0.90381 0.40218 0.89005 0.35206 0.37045 0.16417 0.62641 0.37276 0.00056 0.94404 0.49496 0.30016 0.49435 0.28233 0.6612 0.08493 0.69906 0.27196 0.18979 0.40795 0.1331 0.48842 0.80416 0.12482 0.52897 0.54804 0.20646 0.14009 0.3188 0.88798 0.01175 0.02254 0.12287 0.55937 0.60296 0.27346 0.75864 0.08769 0.4696 0.07715 0.74612 0.10905 0.25016 0.71865 0.15075 0.29512 0.40844 0. [...]
+0.73 0.48643 0.74386 0.47152 0.44309 0.37622 0.26113 0.1976 0.50402 0.72595 0.11769 0.73226 0.89858 0.19638 0.71975 0.52453 0.10956 0.2012 0.30214 0.32333 0.77072 0.03191 0.94354 0.46521 0.69738 0.65444 0.73983 0.43385 0.99447 0.92543 0.36544 0.41422 0.55606 0.23641 0.25988 0.38387 0.18482 0.73334 0.65318 0.27293 0.72059 0.28973 0.74255 0.27573 0.59557 0.61021 0.63274 0.71268 0.72714 0.04571 0.41969 0.89638 0.10884 0.48828 0.29764 0.82202 0.35812 0.61636 0.09281 0.04526 0.33886 0.3994 0. [...]
+0.86088 0.71468 0.53778 0.86112 0.2748 0.57635 0.95016 0.82811 0.32867 0.61065 0.83118 0.88555 0.44286 0.07364 0.63076 0.12851 0.67296 0.38794 0.43203 0.02248 0.56881 0.92973 0.47593 0.51586 0.1323 0.60685 0.62397 0.60069 0.92708 0.3804 0.59232 0.89882 0.60024 0.89455 0.96504 0.23431 0.99177 0.81171 0.42136 0.25924 0.11352 0.10869 0.65774 0.53511 0.55095 0.6546 0.17614 0.64773 0.45879 0.15439 0.64232 0.37108 0.80265 0.89919 0.98649 0.65646 0.47587 0.93717 0.3947 0.87783 0.23641 0.33185 0 [...]
+0.60326 0.09288 0.94256 0.94544 0.45367 0.35402 0.63085 0.01819 0.65133 0.83616 0.56952 0.48954 0.09902 0.12707 0.23909 0.45194 0.96926 0.64828 0.06295 0.34766 0.81943 0.93196 0.14842 0.39685 0.59952 0.11229 0.09868 0.52659 0.51274 0.74568 0.19612 0.63536 0.11139 0.45833 0.64183 0.30345 0.226 0.41135 0.7148 0.82212 0.18426 0.68522 0.39597 0.88479 0.76756 0.16071 0.21548 0.55703 0.34239 0.05538 0.54628 0.91449 0.16253 0.18745 0.21808 0.77764 0.13835 0.61749 0.84673 0.68487 0.76899 0.24857 [...]
+0.16321 0.78525 0.28695 0.61622 0.87381 0.77416 0.17353 0.84698 0.70763 0.91696 0.53632 0.2594 0.56453 0.93484 0.20234 0.8874 0.46538 0.98831 0.5713 0.66262 0.33865 0.45435 0.88499 0.01636 0.84922 0.6132 0.26065 0.51963 0.78909 0.94703 0.34258 0.34743 0.85351 0.73242 0.78175 0.29261 0.51511 0.47387 0.34375 0.85294 0.91998 0.93474 0.17367 0.06296 0.20192 0.40781 0.25922 0.85713 0.64597 0.27144 0.14366 0.37932 0.66227 0.11192 0.51134 0.12582 0.36757 0.43941 0.58105 0.92991 0.26536 0.85803  [...]
+0.14623 0.77413 0.34008 0.30906 0.31094 0.84327 0.16424 0.55828 0.55811 0.76308 0.45149 0.98718 0.09843 0.43081 0.51489 0.01519 0.07002 0.03777 0.73571 0.19023 0.27169 0.52831 0.41626 0.99013 0.87787 0.12117 0.76458 0.86952 0.87143 0.85042 0.7305 0.72039 0.18234 0.95975 0.47848 0.24264 0.88159 0.70107 0.17971 0.41362 0.04756 0.77169 0.26826 0.57445 0.16436 0.03708 0.35756 0.22966 0.52427 0.34262 0.07778 0.81054 0.91896 0.10947 0.00849 0.89132 0.46198 0.72736 0.37285 0.47809 0.74458 0.174 [...]
+0.1995 0.04871 0.42219 0.84261 0.56563 0.08323 0.25203 0.40246 0.55683 0.92246 0.9027 0.06147 0.10802 0.08655 0.13721 0.98758 0.13516 0.81179 0.58401 0.02255 0.88147 0.82172 0.43794 0.92456 0.30448 0.89235 0.23905 0.76642 0.57604 0.69734 0.44465 0.76227 0.56938 0.23457 0.72509 0.4519 0.96032 0.25647 0.21458 0.08703 0.55513 0.21709 0.75246 0.36732 0.60408 0.29798 0.79477 0.05405 0.52621 0.68359 0.48365 0.92204 0.58593 0.26963 0.97874 0.55915 0.69772 0.28355 0.6766 0.17307 0.6177 0.84064 0 [...]
+0.5408 0.86555 0.71631 0.13479 0.30448 0.73174 0.10813 0.79097 0.4004 0.50571 0.27932 0.34076 0.44121 0.60029 0.84762 0.08218 0.51434 0.63864 0.01156 0.98336 0.19549 0.75303 0.50109 0.40558 0.83305 0.85555 0.0811 0.65658 0.8917 0.62166 0.25135 0.22685 0.72056 0.57143 0.54499 0.44312 0.97782 0.00835 0.5465 0.40347 0.31831 0.03973 0.08112 0.83104 0.94419 0.70976 0.08771 0.33182 0.90038 0.35405 0.57152 0.14721 0.50044 0.1502 0.9302 0.40269 0.99423 0.52581 0.00444 0.50295 0.08486 0.3663 0.19 [...]
+0.63078 0.7458 0.71886 0.36838 0.94621 0.3293 0.69576 0.73279 0.97179 0.75927 0.44792 0.89368 0.30331 0.28683 0.11109 0.38705 0.29327 0.77326 0.93169 0.60701 0.50581 0.24977 0.77859 0.2507 0.66165 0.13441 0.57164 0.11819 0.33265 0.75546 0.42192 0.21128 0.41062 0.00329 0.75875 0.36315 0.70296 0.86759 0.09882 0.30013 0.46562 0.85156 0.30227 0.08427 0.18493 0.04087 0.433 0.08673 0.72275 0.89951 0.12128 0.57628 0.93074 0.66319 0.97084 0.4573 0.18488 0.74206 0.5621 0.27538 0.68143 0.86965 0.6 [...]
+0.33045 0.8625 0.55293 0.78781 0.84811 0.23397 0.65431 0.85797 0.67709 0.43778 0.99544 0.05298 0.89599 0.68241 0.95023 0.05265 0.05549 0.05758 0.11383 0.39269 0.57218 0.04871 0.70805 0.45748 0.34135 0.90945 0.50258 0.51164 0.66349 0.8118 0.0278 0.67509 0.7408 0.13958 0.52372 0.18213 0.59587 0.45699 0.19213 0.16983 0.45038 0.8963 0.08292 0.53236 0.35767 0.35332 0.23665 0.26881 0.15254 0.00953 0.8755 0.98777 0.33217 0.36349 0.81863 0.08395 0.82206 0.48905 0.17529 0.07066 0.10536 0.22705 0. [...]
+0.35247 0.15856 0.93742 0.56426 0.19876 0.76839 0.60979 0.93089 0.21985 0.18749 0.40911 0.59942 0.2369 0.48717 0.11561 0.35886 0.07636 0.49685 0.15815 0.48182 0.38364 0.19387 0.58748 0.56311 0.97725 0.7903 0.20553 0.38319 0.92285 0.49664 0.7817 0.54972 0.0961 0.44467 0.92213 0.99646 0.73324 0.18161 0.19352 0.90098 0.65211 0.42461 0.59288 0.1499 0.79991 0.93636 0.15346 0.94199 0.33951 0.67991 0.11858 0.04797 0.44042 0.06436 0.29745 0.04831 0.30627 0.90851 0.56109 0.83208 0.14895 0.52382 0 [...]
+0.24563 0.70137 0.94469 0.51965 0.70553 0.29894 0.69998 0.93542 0.82513 0.02351 0.98621 0.17461 0.04239 0.37439 0.83582 0.80434 0.73167 0.34396 0.91048 0.72331 0.93651 0.10632 0.54618 0.65261 0.31244 0.81462 0.01183 0.83254 0.82041 0.09931 0.1371 0.83751 0.25839 0.57417 0.38263 0.38337 0.09825 0.96634 0.69201 0.09011 0.184 0.55386 0.12493 0.3833 0.19725 0.9853 0.33853 0.20241 0.87244 0.59684 0.50986 0.60329 0.00291 0.03342 0.9093 0.36946 0.92098 0.6024 0.24396 0.04742 0.30444 0.06372 0.5 [...]
+0.54371 0.06768 0.81595 0.27954 0.88607 0.1788 0.38622 0.21588 0.25157 0.29342 0.96109 0.01739 0.25669 0.40757 0.56864 0.99459 0.67342 0.37362 0.49749 0.43818 0.32016 0.76993 0.92723 0.52986 0.17924 0.84508 0.80564 0.70348 0.54945 0.1841 0.84831 0.368 0.18772 0.94131 0.22914 0.13108 0.20797 0.17689 0.63865 0.10553 0.06143 0.70727 0.67538 0.20206 0.59772 0.6568 0.94268 0.45001 0.13162 0.5701 0.43582 0.06928 0.72034 0.39532 0.42942 0.13115 0.10948 0.41079 0.98229 0.99168 0.0778 0.80225 0.5 [...]
+0.8314 0.78142 0.1704 0.5427 0.32597 0.87721 0.8705 0.90222 0.47005 0.72726 0.22494 0.93778 0.11528 0.19569 0.49558 0.50217 0.29312 0.09116 0.01069 0.42773 0.7983 0.01842 0.38192 0.65363 0.30978 0.28436 0.27295 0.94967 0.65867 0.48575 0.531 0.69964 0.53865 0.42416 0.44013 0.78639 0.84623 0.71095 0.68334 0.0749 0.56801 0.63601 0.94594 0.43728 0.91224 0.18359 0.16238 0.00794 0.07211 0.92292 0.9716 0.06168 0.25792 0.80798 0.48434 0.37516 0.00745 0.23932 0.01915 0.31709 0.06237 0.82251 0.542 [...]
+0.53795 0.95623 0.68589 0.16686 0.51296 0.65206 0.23446 0.82073 0.41226 0.72954 0.63223 0.52207 0.43244 0.9995 0.88252 0.612 0.95376 0.89086 0.77931 0.40059 0.65515 0.32494 0.92465 0.82973 0.16574 0.02918 0.76093 0.67725 0.19517 0.40412 0.53914 0.74965 0.85547 0.38602 0.49747 0.73305 0.31983 0.54686 0.11312 0.21026 0.59693 0.05514 0.37439 0.3576 0.22697 0.73849 0.10809 0.22358 0.82929 0.01713 0.33839 0.05023 0.70057 0.4386 0.88804 0.68873 0.15659 0.75507 0.57022 0.94984 0.46386 0.30229 0 [...]
+0.80535 0.03659 0.54327 0.37415 0.47784 0.04104 0.57543 0.35368 0.22531 0.12673 0.3303 0.43603 0.27307 0.22317 0.5144 0.62443 0.21171 0.61775 0.93455 0.94617 0.86804 0.77651 0.99958 0.50996 0.20583 0.17369 0.79899 0.86594 0.959 0.57886 0.05897 0.22064 0.30998 0.49423 0.28506 0.17495 0.50296 0.18922 0.17772 0.59182 0.36721 0.94558 0.21894 0.29887 0.39843 0.21337 0.79335 0.72016 0.67591 0.17975 0.61329 0.60529 0.66377 0.42385 0.5058 0.27958 0.70278 0.95675 0.48251 0.9702 0.689 0.92602 0.09 [...]
+0.03649 0.16403 0.31154 0.683 0.44745 0.57806 0.42443 0.02782 0.84905 0.72223 0.68197 0.1115 0.88433 0.72303 0.67147 0.36919 0.72598 0.79731 0.48148 0.81965 0.16093 0.56642 0.48526 0.14497 0.37841 0.94424 0.16809 0.34682 0.03925 0.93538 0.72177 0.30802 0.24918 0.08096 0.89818 0.5746 0.37834 0.31836 0.25089 0.94389 0.38619 0.77395 0.51187 0.81673 0.20159 0.17184 0.09535 0.44021 0.64327 0.24062 0.42114 0.81473 0.50123 0.5762 0.69682 0.96339 0.95587 0.82453 0.53605 0.54533 0.20005 0.00189 0 [...]
+0.88414 0.81855 0.26292 0.04147 0.87436 0.7601 0.37921 0.79494 0.04551 0.54483 0.63574 0.3448 0.02027 0.72081 0.50336 0.01532 0.36447 0.15448 0.27087 0.64532 0.8217 0.04385 0.08375 0.35207 0.22519 0.55964 0.17002 0.68277 0.99022 0.28464 0.48465 0.27105 0.83704 0.7035 0.09366 0.12848 0.08234 0.70367 0.41935 0.37946 0.03131 0.11049 0.72605 0.12159 0.81176 0.71185 0.76057 0.95114 0.84617 0.73515 0.48392 0.45474 0.88511 0.4323 0.29413 0.36077 0.81765 0.00017 0.49959 0.71952 0.07167 0.75896 0 [...]
+0.26872 0.62749 0.29065 0.22416 0.19239 0.13788 0.14905 0.65525 0.97879 0.22602 0.51327 0.79704 0.78041 0.77395 0.04262 0.54138 0.69784 0.65394 0.49749 0.43983 0.91944 0.18076 0.45397 0.7714 0.05717 0.29603 0.24878 0.92179 0.22945 0.28668 0.59419 0.06418 0.27858 0.79702 0.22906 0.98064 0.17739 0.06079 0.64236 0.55951 0.45213 0.39896 0.30341 0.47919 0.24723 0.55339 0.56894 0.83907 0.83279 0.84772 0.9926 0.65426 0.45667 0.83813 0.96063 0.88439 0.48835 0.22176 0.75676 0.48937 0.69592 0.1019 [...]
+0.67112 0.93421 0.07637 0.51717 0.09241 0.93895 0.93716 0.34274 0.11726 0.25729 0.56942 0.01386 0.16643 0.35364 0.2276 0.77373 0.5227 0.83773 0.9588 0.41586 0.2174 0.16282 0.29708 0.71791 0.29811 0.75808 0.03334 0.44956 0.75519 0.8382 0.38975 0.6051 0.57167 0.07811 0.04441 0.00407 0.34646 0.29696 0.09697 0.28442 0.90935 0.57094 0.4414 0.68274 0.85223 0.21379 0.03614 0.57176 0.95191 0.3859 0.85137 0.80823 0.51205 0.0021 0.02956 0.51869 0.2298 0.04471 0.77978 0.76688 0.23076 0.50825 0.0336 [...]
+0.92672 0.33862 0.02744 0.64224 0.05991 0.76774 0.13477 0.39593 0.00596 0.83747 0.50218 0.0932 0.57309 0.05167 0.78645 0.68038 0.65191 0.73315 0.90904 0.14213 0.36176 0.99298 0.15829 0.66826 0.92968 0.97363 0.96188 0.72601 0.34551 0.57751 0.2438 0.78898 0.73042 0.53772 0.94555 0.37305 0.27073 0.11191 0.57079 0.24952 0.85364 0.57838 0.11844 0.93007 0.86145 0.56155 0.14012 0.32019 0.95212 0.75935 0.53007 0.4494 0.87312 0.25352 0.02775 0.15562 0.68182 0.777 0.73099 0.42199 0.27762 0.12005 0 [...]
+0.17018 0.27237 0.01877 0.63437 0.59695 0.66112 0.86248 0.64528 0.68231 0.74038 0.48897 0.46905 0.73132 0.79248 0.88181 0.67334 0.39198 0.73555 0.71477 0.54372 0.44384 0.62871 0.97315 0.6086 0.75445 0.79981 0.44802 0.87343 0.73591 0.63477 0.2388 0.40097 0.18687 0.13415 0.64432 0.17528 0.51473 0.35193 0.2186 0.26713 0.43429 0.56766 0.08839 0.9722 0.43059 0.51882 0.01408 0.49596 0.5915 0.38694 0.25553 0.45791 0.58551 0.69647 0.38179 0.12722 0.09923 0.30139 0.97537 0.90461 0.68679 0.64243 0 [...]
+0.25098 0.10566 0.29373 0.26907 0.53372 0.5766 0.42364 0.48159 0.31435 0.191 0.21168 0.85043 0.72281 0.28512 0.63867 0.33534 0.68364 0.32387 0.54925 0.20505 0.11473 0.56553 0.26505 0.87227 0.55077 0.30583 0.8891 0.39032 0.47592 0.30987 0.75545 0.88805 0.11782 0.74285 0.8948 0.46994 0.25406 0.27105 0.11874 0.95414 0.41127 0.04444 0.49365 0.53464 0.27407 0.01947 0.91881 0.37177 0.15307 0.12811 0.3683 0.66427 0.76707 0.78594 0.95473 0.41748 0.15976 0.55207 0.65929 0.22611 0.29192 0.60939 0. [...]
+0.27323 0.1389 0.80072 0.4225 0.91836 0.42622 0.96348 0.32134 0.75578 0.86425 0.97271 0.26535 0.07173 0.42492 0.5334 0.51915 0.26351 0.82577 0.36572 0.03947 0.93586 0.69024 0.04801 0.00013 0.92987 0.86259 0.79697 0.89067 0.7387 0.08252 0.799 0.19803 0.23029 0.27325 0.69358 0.68834 0.32095 0.69534 0.38061 0.98416 0.98743 0.69305 0.1455 0.71959 0.20064 0.12213 0.95194 0.68953 0.38849 0.0194 0.79383 0.15286 0.19675 0.12035 0.95659 0.7627 0.25764 0.5046 0.79761 0.32719 0.55525 0.08873 0.6953 [...]
+0.32853 0.15423 0.33038 0.74276 0.15804 0.5075 0.20597 0.42412 0.34559 0.80305 0.42589 0.78764 0.62737 0.95525 0.95792 0.12881 0.23194 0.12505 0.01151 0.42444 0.91847 0.55135 0.72587 0.99331 0.53146 0.29204 0.61263 0.45308 0.96417 0.72172 0.92586 0.09498 0.9171 0.72223 0.83252 0.01787 0.76457 0.14529 0.95095 0.50298 0.772 0.56218 0.0421 0.27001 0.36132 0.73343 0.62122 0.66263 0.98597 0.73171 0.32972 0.008 0.68902 0.56738 0.1494 0.20585 0.3069 0.0948 0.14538 0.87102 0.55278 0.48137 0.2712 [...]
+0.59237 0.08852 0.38171 0.62086 0.38291 0.61273 0.86853 0.2796 0.09562 0.79927 0.80832 0.41612 0.59388 0.56114 0.00629 0.6605 0.15567 0.33833 0.94465 0.00283 0.97177 0.14586 0.64935 0.43958 0.36019 0.90069 0.24596 0.70268 0.00079 0.3124 0.22097 0.83799 0.78516 0.60071 0.28288 0.75559 0.918 0.69188 0.72094 0.58247 0.17023 0.04701 0.10827 0.32029 0.2818 0.75773 0.27238 0.21291 0.46682 0.78112 0.16749 0.32977 0.87029 0.95719 0.74563 0.37288 0.45679 0.01406 0.10586 0.55721 0.18803 0.3193 0.6 [...]
+0.79604 0.77137 0.42864 0.42689 0.41799 0.92748 0.3001 0.51 0.62559 0.61015 0.42786 0.41896 0.42975 0.11986 0.17058 0.48047 0.14285 0.61688 0.67431 0.11617 0.30051 0.5096 0.62216 0.95402 0.7458 0.58181 0.73092 0.24205 0.36446 0.0635 0.34874 0.16606 0.36519 0.49764 0.30997 0.20884 0.967 0.13841 0.73071 0.26218 0.88762 0.41005 0.83257 0.11788 0.14498 0.2256 0.05872 0.2087 0.47424 0.6271 0.10863 0.47075 0.2097 0.20039 0.59802 0.41171 0.83023 0.79004 0.064 0.71116 0.94413 0.0369 0.23203 0.77 [...]
+0.13529 0.70452 0.81959 0.46568 0.74146 0.96418 0.59031 0.05059 0.76701 0.99624 0.71179 0.39414 0.49143 0.24391 0.2675 0.99994 0.93603 0.55314 0.96474 0.42495 0.54976 0.8145 0.68628 0.7069 0.5223 0.99738 0.86373 0.28571 0.06133 0.00192 0.1238 0.34862 0.03927 0.6891 0.21482 0.07541 0.66012 0.75378 0.89032 0.21155 0.06713 0.76691 0.13051 0.61039 0.19847 0.95489 0.26459 0.30983 0.25815 0.31904 0.2484 0.2045 0.5959 0.5991 0.71757 0.44477 0.14811 0.82388 0.96347 0.57783 0.29503 0.31967 0.6045 [...]
+0.73017 0.42035 0.13379 0.643 0.87419 0.34334 0.20805 0.2046 0.56159 0.78436 0.14979 0.44545 0.41809 0.34601 0.50181 0.83407 0.13736 0.79913 0.41903 0.68509 0.91242 0.43174 0.42441 0.22284 0.45736 0.18874 0.35441 0.41378 0.23294 0.88488 0.90213 0.0272 0.50485 0.65559 0.50839 0.17589 0.03076 0.26536 0.8976 0.93537 0.68038 0.9611 0.2364 0.92806 0.44169 0.88628 0.31212 0.21524 0.22396 0.8734 0.80639 0.37113 0.40339 0.78105 0.49758 0.51817 0.43722 0.13735 0.97612 0.62637 0.59953 0.52843 0.29 [...]
+0.09853 0.58646 0.03074 0.40978 0.95769 0.01551 0.85322 0.85127 0.14333 0.85808 0.98194 0.29425 0.70852 0.53819 0.30375 0.18734 0.47169 0.42526 0.28412 0.95688 0.88449 0.82597 0.16134 0.45525 0.24588 0.36785 0.805 0.75662 0.58621 0.87191 0.44621 0.68322 0.6912 0.55934 0.40474 0.22766 0.53927 0.20049 0.89608 0.34594 0.50814 0.88365 0.09143 0.43119 0.18936 0.01245 0.00475 0.82918 0.95159 0.93939 0.7375 0.07875 0.87401 0.97229 0.12272 0.89303 0.10311 0.61174 0.44301 0.44988 0.40604 0.50728  [...]
+0.30395 0.49738 0.6364 0.64516 0.60012 0.81627 0.79071 0.01924 0.49273 0.33219 0.51582 0.49855 0.38547 0.92034 0.58541 0.56903 0.63724 0.11756 0.73924 0.23907 0.40196 0.70654 0.8366 0.38948 0.6432 0.49468 0.82919 0.76305 0.43713 0.77457 0.12039 0.73681 0.24297 0.63924 0.13794 0.1092 0.47907 0.1783 0.27722 0.01899 0.05479 0.04343 0.20249 0.99265 0.96224 0.60522 0.64053 0.65368 0.31222 0.60886 0.44828 0.67202 0.01934 0.20462 0.01944 0.54823 0.76758 0.47849 0.55573 0.16226 0.00891 0.93808 0 [...]
+0.12508 0.33205 0.10094 0.80014 0.50767 0.42362 0.77016 0.64253 0.96319 0.57131 0.25288 0.21742 0.24635 0.25345 0.64533 0.51651 0.47233 0.90555 0.86816 0.97238 0.76614 0.414 0.52455 0.13227 0.50736 0.2759 0.25586 0.53746 0.60231 0.27557 0.6214 0.76467 0.28719 0.36578 0.16441 0.40902 0.67985 0.43012 0.06666 0.66224 0.28228 0.00434 0.64915 0.78587 0.91745 0.33505 0.9572 0.95257 0.31772 0.20231 0.44822 0.98721 0.43273 0.53682 0.74731 0.96214 0.15913 0.71674 0.81328 0.26515 0.11186 0.58212 0 [...]
+0.14075 0.40722 0.1924 0.82903 0.87948 0.02715 0.43934 0.19923 0.96376 0.69646 0.59254 0.81892 0.47279 0.8676 0.44854 0.10522 0.61114 0.45499 0.552 0.8344 0.57514 0.77931 0.8613 0.79819 0.78104 0.80899 0.5132 0.36808 0.89164 0.72769 0.81484 0.3958 0.94233 0.16854 0.58713 0.33507 0.92289 0.97787 0.74113 0.18422 0.89475 0.61925 0.5881 0.94625 0.69312 0.86007 0.29486 0.99024 0.27665 0.80011 0.02291 0.40023 0.27306 0.52882 0.47988 0.42483 0.93051 0.51754 0.5318 0.7361 0.16454 0.23689 0.43931 [...]
+0.43612 0.18104 0.45493 0.36975 0.66441 0.7084 0.31793 0.559 0.82745 0.8473 0.51388 0.37841 0.13776 0.69679 0.59814 0.47951 0.08764 0.61824 0.93265 0.49812 0.56521 0.32104 0.09418 0.14081 0.66417 0.98571 0.20371 0.83542 0.32509 0.38973 0.07177 0.22556 0.81356 0.26081 0.33814 0.87813 0.94942 0.42693 0.96657 0.81327 0.96318 0.36069 0.80962 0.68897 0.94073 0.91671 0.41101 0.52411 0.51163 0.26287 0.79915 0.68872 0.6146 0.29421 0.75836 0.15987 0.71655 0.20763 0.06532 0.92644 0.54605 0.97714 0 [...]
+0.48141 0.54389 0.048 0.54109 0.46562 0.20834 0.64773 0.23427 0.44533 0.67838 0.32225 0.5675 0.55371 0.99232 0.68278 0.54931 0.01222 0.76341 0.34407 0.16676 0.24446 0.16931 0.4312 0.5313 0.88494 0.06842 0.79257 0.66799 0.30673 0.3873 0.55998 0.96345 0.20445 0.03885 0.52886 0.98067 0.5606 0.23235 0.88696 0.53163 0.77061 0.10193 0.30155 0.20435 0.44305 0.6013 0.82255 0.48501 0.56785 0.62569 0.19468 0.91804 0.34293 0.81701 0.35937 0.753 0.35326 0.64216 0.33843 0.36674 0.87143 0.54257 0.9056 [...]
+0.33497 0.8126 0.02283 0.47791 0.40845 0.39387 0.19853 0.1535 0.01537 0.75348 0.63119 0.47256 0.27727 0.91354 0.78906 0.69968 0.18097 0.98702 0.87043 0.23268 0.44101 0.60761 0.06154 0.74229 0.57258 0.33321 0.26731 0.57144 0.33312 0.60058 0.78856 0.7714 0.43497 0.74678 0.46434 0.58426 0.76573 0.85608 0.55936 0.10438 0.65169 0.88085 0.62299 0.75176 0.10015 0.5391 0.99977 0.95539 0.75518 0.28607 0.19187 0.45641 0.31337 0.34417 0.54143 0.37581 0.11288 0.37803 0.77692 0.14462 0.93521 0.87071  [...]
+0.15556 0.52733 0.11851 0.13206 0.14194 0.76606 0.35783 0.04981 0.31328 0.61458 0.25514 0.78701 0.69134 0.50942 0.98653 0.92596 0.7336 0.46613 0.14536 0.62021 0.24228 0.1745 0.7542 0.11779 0.49121 0.11931 0.33963 0.08848 0.82169 0.0253 0.6955 0.10371 0.23881 0.42119 0.69902 0.49542 0.58754 0.78869 0.56053 0.51424 0.29405 0.65481 0.63254 0.22019 0.81412 0.26402 0.14217 0.97752 0.42745 0.77845 0.17818 0.01196 0.5513 0.90869 0.61351 0.84848 0.66659 0.05459 0.98406 0.12868 0.66924 0.79341 0. [...]
+0.10758 0.93596 0.71763 0.7609 0.09837 0.41433 0.20551 0.69768 0.24158 0.8876 0.47219 0.35213 0.78804 0.92118 0.50717 0.97386 0.67551 0.83739 0.29553 0.41333 0.71132 0.55758 0.84194 0.07321 0.35029 0.34818 0.02306 0.64378 0.9628 0.94011 0.34015 0.84474 0.66475 0.99795 0.80739 0.25652 0.13388 0.39018 0.30776 0.31443 0.82129 0.53541 0.31939 0.68997 0.05459 0.05046 0.27155 0.05269 0.43268 0.77965 0.29478 0.51274 0.50617 0.52086 0.52178 0.226 0.94766 0.85713 0.02652 0.28505 0.02228 0.37672 0 [...]
+0.05168 0.59869 0.07929 0.19612 0.51229 0.2196 0.96667 0.3771 0.55763 0.02304 0.0825 0.96943 0.14458 0.84738 0.74577 0.89844 0.1494 0.42504 0.07877 0.71356 0.87426 0.77659 0.58974 0.98819 0.51618 0.47694 0.37104 0.75119 0.08396 0.12934 0.3088 0.45778 0.31182 0.94166 0.86528 0.8539 0.3913 0.48896 0.0077 0.66816 0.72408 0.00631 0.1997 0.76408 0.26461 0.90463 0.81004 0.37243 0.56927 0.90703 0.20881 0.14701 0.11904 0.09244 0.07103 0.31947 0.33125 0.7905 0.75336 0.90829 0.58825 0.2459 0.6526  [...]
+0.59393 0.85894 0.33133 0.89565 0.8523 0.66425 0.45324 0.07127 0.86961 0.2253 0.3787 0.57978 0.46098 0.47878 0.33444 0.4987 0.33922 0.25241 0.79876 0.40559 0.95658 0.62103 0.87563 0.839 0.68141 0.71218 0.05238 0.92257 0.44787 0.40419 0.74332 0.90575 0.77999 0.04347 0.53433 0.65342 0.11071 0.10669 0.28515 0.53148 0.60652 0.82583 0.24901 0.25017 0.32132 0.47885 0.42881 0.66797 0.53847 0.81046 0.77093 0.67904 0.53231 0.93748 0.42577 0.34635 0.07208 0.72832 0.8049 0.56216 0.71086 0.59111 0.4 [...]
+0.49021 0.60612 0.57934 0.29528 0.33637 0.0992 0.2919 0.85861 0.89831 0.07614 0.78729 0.39165 0.08588 0.88588 0.27522 0.07607 0.41509 0.25018 0.9225 0.46589 0.42202 0.1179 0.06901 0.80513 0.21806 0.73769 0.01703 0.45644 0.42784 0.99101 0.08847 0.51768 0.47458 0.29922 0.97717 0.71879 0.02265 0.49985 0.92376 0.28674 0.00405 0.57438 0.22669 0.77276 0.94295 0.18487 0.89 0.98365 0.77628 0.81163 0.24087 0.25985 0.46025 0.18714 0.18221 0.64405 0.39692 0.86268 0.24145 0.71779 0.07651 0.97623 0.5 [...]
+0.1038 0.30402 0.79556 0.00541 0.73543 0.30607 0.76575 0.23138 0.56413 0.76594 0.29089 0.35093 0.06705 0.5932 0.87125 0.87236 0.31499 0.60029 0.34344 0.53949 0.32032 0.73553 0.39228 0.02083 0.42369 0.23578 0.39593 0.79861 0.63094 0.37183 0.40384 0.99147 0.44077 0.22648 0.07126 0.00442 0.02334 0.86558 0.63809 0.98386 0.70047 0.60205 0.07512 0.09228 0.36404 0.01089 0.27061 0.56263 0.60643 0.11872 0.03362 0.99381 0.27291 0.17155 0.29289 0.04885 0.83362 0.36752 0.78859 0.08692 0.69582 0.2668 [...]
+0.90559 0.52467 0.10364 0.78027 0.06588 0.85614 0.60637 0.88635 0.18387 0.34173 0.9479 0.04202 0.30891 0.45285 0.24834 0.99045 0.93489 0.75016 0.07467 0.49495 0.95241 0.55502 0.9491 0.54802 0.17122 0.48552 0.76951 0.33801 0.076 0.15247 0.02959 0.05022 0.87588 0.2128 0.11815 0.53089 0.51176 0.8962 0.82173 0.02343 0.51309 0.09822 0.06627 0.66083 0.88887 0.59148 0.04903 0.003 0.68874 0.31159 0.13297 0.21649 0.21321 0.3088 0.35219 0.73992 0.30622 0.00785 0.21371 0.55735 0.34949 0.57348 0.721 [...]
+0.48348 0.78776 0.91121 0.44155 0.36598 0.1325 0.61579 0.27506 0.02389 0.85047 0.75416 0.32767 0.2166 0.53042 0.51639 0.71523 0.30895 0.03169 0.53105 0.49232 0.54591 0.42789 0.57378 0.45775 0.38856 0.30721 0.65895 0.52937 0.05122 0.27208 0.74398 0.85526 0.55957 0.40466 0.98501 0.23092 0.6595 0.79089 0.7721 0.65298 0.42764 0.7747 0.56391 0.29074 0.09537 0.1654 0.91632 0.35578 0.54488 0.57693 0.54396 0.50368 0.29459 0.46193 0.77034 0.90255 0.72586 0.59618 0.91661 0.49415 0.35461 0.34897 0. [...]
+0.68795 0.10702 0.02484 0.76701 0.70081 0.16247 0.92092 0.18047 0.10813 0.56316 0.48358 0.3775 0.83235 0.26681 0.53748 0.7402 0.63646 0.27672 0.25008 0.17515 0.03532 0.17644 0.52873 0.88138 0.9587 0.96344 0.03362 0.00052 0.61416 0.69507 0.02841 0.57617 0.73662 0.05805 0.06397 0.41162 0.67347 0.2535 0.80275 0.39192 0.98601 0.01505 0.0402 0.68563 0.29573 0.18228 0.68185 0.38915 0.40364 0.12544 0.11766 0.39168 0.93852 0.92505 0.73263 0.03976 0.94512 0.05495 0.71434 0.15534 0.70979 0.12536 0 [...]
+0.17548 0.49517 0.23802 0.89639 0.96223 0.39948 0.37079 0.99718 0.33833 0.60328 0.42998 0.70572 0.115 0.39015 0.10948 0.23143 0.01249 0.43625 0.41403 0.11703 0.15622 0.36706 0.05025 0.80444 0.43318 0.25404 0.25436 0.22257 0.43486 0.34926 0.81472 0.73312 0.74577 0.73024 0.87941 0.55748 0.49282 0.83847 0.43329 0.59187 0.82506 0.68264 0.05789 0.72089 0.20828 0.25987 0.56164 0.62816 0.48282 0.56455 0.19567 0.70114 0.93129 0.83493 0.68226 0.35767 0.86012 0.61664 0.95598 0.48873 0.90156 0.5489 [...]
+0.95311 0.53621 0.30797 0.24926 0.96305 0.28405 0.28629 0.48831 0.4188 0.57074 0.20921 0.85936 0.94593 0.36165 0.19219 0.47838 0.17224 0.57301 0.74012 0.22295 0.42269 0.54278 0.67307 0.74459 0.71517 0.74524 0.98833 0.0694 0.30498 0.45181 0.94628 0.68942 0.64121 0.26471 0.11023 0.22493 0.70254 0.02126 0.18986 0.5789 0.89389 0.78785 0.17351 0.79769 0.12287 0.24184 0.35646 0.12513 0.39979 0.89186 0.04501 0.17726 0.73006 0.65658 0.86691 0.60747 0.05178 0.86657 0.62686 0.6299 0.90884 0.22 0.3 [...]
+0.86646 0.65883 0.38922 0.42517 0.57545 0.54329 0.59106 0.01264 0.10385 0.99691 0.79062 0.66265 0.18638 0.48003 0.93067 0.35521 0.54811 0.74653 0.29212 0.53559 0.26352 0.05784 0.45704 0.70757 0.12079 0.05638 0.11794 0.18041 0.16001 0.85895 0.42488 0.35263 0.07588 0.85431 0.27301 0.80958 0.23717 0.49978 0.29967 0.35969 0.25033 0.91038 0.77234 0.95491 0.0501 0.80477 0.82178 0.70147 0.0436 0.33806 0.14126 0.7527 0.36708 0.53661 0.81594 0.57697 0.67787 0.31571 0.07226 0.94778 0.83844 0.63196 [...]
+0.93823 0.06296 0.98762 0.56385 0.51072 0.06014 0.06589 0.27876 0.92862 0.60388 0.77532 0.31805 0.91835 0.46315 0.9215 0.59175 0.70516 0.43986 0.29644 0.80881 0.77974 0.17129 0.83252 0.45776 0.12215 0.83965 0.53604 0.32824 0.05214 0.28906 0.25876 0.29527 0.12709 0.844 0.44036 0.80712 0.73021 0.55348 0.92107 0.36226 0.77313 0.88225 0.57505 0.9692 0.52572 0.39919 0.23194 0.48212 0.23459 0.89883 0.08075 0.73773 0.68679 0.19619 0.46804 0.70792 0.16917 0.28164 0.04778 0.81526 0.50013 0.48488  [...]
+0.74555 0.91277 0.59908 0.38308 0.92024 0.357 0.15658 0.79055 0.07681 0.64245 0.90081 0.3472 0.84387 0.0478 0.33736 0.32696 0.04907 0.67969 0.33404 0.83259 0.89636 0.67241 0.95152 0.65506 0.90599 0.28451 0.51005 0.01806 0.60413 0.23208 0.63762 0.43654 0.15811 0.20983 0.58039 0.41778 0.54569 0.69296 0.45614 0.73668 0.32655 0.65533 0.82572 0.4883 0.90441 0.5184 0.94058 0.86702 0.16579 0.31216 0.23215 0.63929 0.29556 0.40867 0.464 0.6309 0.71405 0.06441 0.34127 0.23223 0.39328 0.95423 0.623 [...]
+0.29438 0.02315 0.995 0.72256 0.48384 0.3012 0.58459 0.7579 0.74988 0.32386 0.53603 0.11773 0.70601 0.60829 0.58867 0.23808 0.16716 0.33443 0.45238 0.82997 0.95538 0.52666 0.3802 0.31951 0.38053 0.84771 0.93679 0.66436 0.41225 0.30566 0.35339 0.10954 0.91505 0.48468 0.97025 0.27122 0.60285 0.80322 0.9601 0.56783 0.03177 0.41115 0.86296 0.68599 0.21245 0.13561 0.85927 0.42663 0.61245 0.76391 0.59161 0.58046 0.69695 0.30494 0.79557 0.12472 0.23567 0.90149 0.41403 0.59465 0.59213 0.28561 0. [...]
+0.27527 0.38918 0.02167 0.83474 0.33516 0.57713 0.41541 0.37122 0.97414 0.34323 0.19396 0.30742 0.05208 0.45939 0.79903 0.60534 0.48966 0.93021 0.69566 0.55395 0.15294 0.46262 0.16748 0.76032 0.03055 0.42232 0.26154 0.53405 0.87382 0.67344 0.43622 0.93107 0.212 0.75599 0.05667 0.85987 0.59051 0.57666 0.87981 0.3171 0.69218 0.94277 0.39543 0.80303 0.85928 0.3922 0.63366 0.55304 0.65387 0.26995 0.83201 0.91107 0.54036 0.40681 0.49626 0.32674 0.46455 0.07509 0.39177 0.11582 0.17197 0.25774  [...]
+0.17574 0.76709 0.26549 0.01336 0.66602 0.57064 0.8601 0.77999 0.71008 0.06019 0.77486 0.22239 0.23245 0.61295 0.86751 0.20774 0.36196 0.71029 0.58744 0.162 0.43295 0.73531 0.33923 0.03685 0.57801 0.36772 0.71178 0.16539 0.30935 0.62553 0.89722 0.07927 0.07713 0.76388 0.19222 0.08979 0.79251 0.45136 0.10742 0.02868 0.65264 0.31974 0.22962 0.22429 0.82434 0.71995 0.47875 0.47763 0.3686 0.93374 0.8623 0.08708 0.1662 0.87956 0.43649 0.61553 0.44158 0.92162 0.66163 0.40731 0.63185 0.36918 0. [...]
+0.75091 0.94864 0.7742 0.45834 0.9987 0.78631 0.63426 0.40733 0.88167 0.0782 0.64089 0.95513 0.79198 0.21605 0.28495 0.52163 0.67456 0.34841 0.76812 0.53914 0.2865 0.20526 0.2658 0.26393 0.21816 0.03102 0.04841 0.98104 0.17054 0.07475 0.99892 0.26837 0.79754 0.27527 0.51886 0.83585 0.78769 0.79386 0.05371 0.32186 0.80242 0.58144 0.72634 0.1399 0.20181 0.7893 0.01447 0.87573 0.42918 0.95566 0.39103 0.77338 0.05305 0.7902 0.84302 0.6817 0.16149 0.70162 0.81292 0.20744 0.22252 0.3307 0.0306 [...]
+0.34954 0.89431 0.33559 0.56274 0.11206 0.23684 0.09528 0.84131 0.32886 0.2846 0.39979 0.42708 0.68308 0.79525 0.09454 0.06441 0.22676 0.98645 0.35535 0.4262 0.94559 0.58281 0.42632 0.49557 0.24 0.98146 0.16711 0.5799 0.21494 0.34679 0.04471 0.22456 0.17027 0.26631 0.24034 0.47156 0.94877 0.59678 0.76543 0.10079 0.40197 0.90549 0.04249 0.42873 0.41423 0.2594 0.66653 0.73693 0.93859 0.84165 0.1809 0.95377 0.233 0.08013 0.19723 0.18986 0.42595 0.60968 0.72535 0.44388 0.03735 0.33717 0.7052 [...]
+0.44688 0.98489 0.89542 0.36794 0.54074 0.33196 0.33887 0.14666 0.06241 0.55295 0.69007 0.94166 0.37021 0.39071 0.40937 0.10872 0.8095 0.73014 0.36361 0.06647 0.95223 0.18439 0.13433 0.68968 0.40222 0.07212 0.7301 0.4051 0.89512 0.64649 0.22103 0.37139 0.13181 0.99465 0.8082 0.0136 0.61448 0.99477 0.68302 0.85116 0.54399 0.68503 0.16311 0.47601 0.08963 0.69995 0.45165 0.02811 0.06837 0.92734 0.24342 0.50719 0.25509 0.29177 0.74105 0.81861 0.26947 0.91904 0.92903 0.52357 0.90102 0.08507 0 [...]
+0.9878 0.7194 0.26981 0.30641 0.93331 0.10676 0.40416 0.40983 0.53577 0.32688 0.16868 0.62506 0.13628 0.01257 0.33465 0.42257 0.55763 0.40978 0.64511 0.24092 0.44881 0.5552 0.55052 0.20505 0.01732 0.41692 0.31803 0.58972 0.1775 0.08364 0.88647 0.90271 0.01026 0.86997 0.16498 0.61124 0.94383 0.98903 0.75511 0.96911 0.05831 0.13904 0.38128 0.05628 0.85706 0.36289 0.39191 0.53402 0.44999 0.35725 0.97953 0.69144 0.20928 0.31784 0.45705 0.90811 0.54054 0.59507 0.37452 0.1014 0.26104 0.7271 0. [...]
+0.38489 0.81984 0.61144 0.0129 0.3564 0.05657 0.63319 0.8747 0.49825 0.23796 0.71952 0.85787 0.44513 0.85966 0.52578 0.79826 0.5558 0.68418 0.52658 0.65267 0.99787 0.90082 0.50097 0.96879 0.02051 0.50502 0.58156 0.37371 0.74233 0.21821 0.97535 0.8748 0.23723 0.79523 0.0162 0.73733 0.42075 0.56324 0.72004 0.64798 0.38227 0.6343 0.88834 0.96038 0.31565 0.72474 0.92098 0.4541 0.66453 0.12863 0.6838 0.25367 0.32888 0.68033 0.10013 0.77878 0.04197 0.73875 0.42198 0.27439 0.06843 0.45544 0.527 [...]
+0.87799 0.84646 0.30962 0.69975 0.02367 0.52196 0.29777 0.12608 0.74889 0.04087 0.32115 0.31325 0.4848 0.49494 0.04749 0.2145 0.09837 0.09839 0.00941 0.73943 0.99391 0.0021 0.15516 0.79961 0.74496 0.62744 0.1548 0.21921 0.07617 0.87304 0.58473 0.18913 0.61881 0.46353 0.39063 0.41417 0.83207 0.41908 0.26587 0.29612 0.91115 0.40465 0.70969 0.85159 0.36531 0.06147 0.93368 0.50402 0.33026 0.29089 0.54133 0.79848 0.02109 0.1207 0.6781 0.02583 0.54892 0.87485 0.69121 0.48335 0.41519 0.12761 0. [...]
+0.11446 0.03288 0.22438 0.30613 0.61449 0.45319 0.22205 0.83798 0.19985 0.92786 0.10708 0.70932 0.03869 0.98612 0.62327 0.94318 0.54816 0.08851 0.66577 0.79 0.93665 0.38531 0.57174 0.10763 0.69017 0.05098 0.59716 0.54733 0.23797 0.2847 0.88866 0.30416 0.36121 0.12736 0.83841 0.26404 0.40461 0.95793 0.42981 0.67622 0.25198 0.9639 0.71557 0.96873 0.17927 0.94573 0.14337 0.48008 0.16018 0.89365 0.79336 0.11586 0.81048 0.61698 0.93982 0.08114 0.6558 0.60954 0.05796 0.20479 0.24256 0.68626 0. [...]
+0.46846 0.16982 0.46932 0.2073 0.67269 0.58268 0.76806 0.83357 0.29903 0.63234 0.48384 0.01799 0.02066 0.13444 0.45424 0.89018 0.69807 0.88368 0.16301 0.99255 0.97553 0.91555 0.53573 0.07773 0.21781 0.52757 0.57213 0.80603 0.05458 0.33434 0.64649 0.19592 0.44132 0.23996 0.2694 0.51498 0.14197 0.66172 0.0151 0.54286 0.48634 0.51907 0.11264 0.6255 0.22449 0.92851 0.32176 0.00103 0.89217 0.85113 0.19004 0.75644 0.56847 0.01268 0.77978 0.96183 0.07342 0.86456 0.57386 0.54294 0.81261 0.15805  [...]
+0.24669 0.48828 0.48021 0.57463 0.5053 0.03897 0.74241 0.31261 0.88626 0.17954 0.39745 0.29325 0.82798 0.86567 0.12378 0.62557 0.46108 0.60637 0.75402 0.75375 0.95007 0.56989 0.33712 0.23841 0.1887 0.56211 0.06431 0.82546 0.28135 0.17798 0.36707 0.43063 0.10818 0.09472 0.03024 0.16594 0.53894 0.34287 0.50077 0.28455 0.66534 0.59918 0.74575 0.6657 0.5374 0.07465 0.40974 0.73223 0.51395 0.34168 0.70884 0.27689 0.89262 0.26985 0.91585 0.7728 0.2235 0.08104 0.97069 0.2097 0.37644 0.59641 0.5 [...]
+0.28572 0.40281 0.07315 0.73552 0.64652 0.73069 0.54728 0.4878 0.02041 0.13331 0.40499 0.54815 0.24564 0.36773 0.58796 0.80418 0.36345 0.20461 0.38896 0.80596 0.62225 0.48189 0.29207 0.09556 0.33871 0.0978 0.99603 0.46081 0.79889 0.56503 0.10561 0.79643 0.21518 0.91086 0.58826 0.81569 0.78133 0.51977 0.61146 0.74834 0.12171 0.44585 0.63675 0.93861 0.95124 0.75857 0.23242 0.72578 0.807 0.12309 0.45942 0.35571 0.39042 0.54672 0.40684 0.47178 0.65486 0.34067 0.41334 0.65159 0.19329 0.45726  [...]
+0.99376 0.77885 0.31526 0.8597 0.7354 0.74132 0.76899 0.54023 0.93059 0.80663 0.69355 0.45769 0.52591 0.91212 0.16798 0.83248 0.68813 0.06865 0.32215 0.02652 0.65177 0.99582 0.94974 0.97535 0.39834 0.71425 0.90782 0.83129 0.15366 0.52044 0.80694 0.00729 0.38499 0.68112 0.52428 0.84637 0.39329 0.71851 0.29232 0.28036 0.22582 0.93897 0.7561 0.02365 0.07327 0.16363 0.25168 0.92849 0.80887 0.43434 0.28324 0.78162 0.5656 0.3745 0.0669 0.35321 0.34923 0.84777 0.93339 0.75415 0.36152 0.55321 0. [...]
+0.71859 0.29135 0.29417 0.30703 0.26066 0.5093 0.71061 0.76955 0.3502 0.45984 0.30008 0.81188 0.53061 0.1894 0.84942 0.39006 0.57952 0.52506 0.93599 0.27966 0.89501 0.9992 0.97895 0.20365 0.2283 0.97939 0.03488 0.56694 0.012 0.50323 0.3837 0.57168 0.84583 0.09465 0.9707 0.15667 0.20671 0.89509 0.62794 0.44362 0.45552 0.91192 0.98316 0.77885 0.91043 0.7363 0.34602 0.76677 0.76664 0.30634 0.45883 0.12323 0.82804 0.65433 0.67628 0.9922 0.75819 0.19065 0.02624 0.39695 0.83928 0.41431 0.35527 [...]
+0.58333 0.11949 0.32617 0.01856 0.59902 0.4213 0.7943 0.32657 0.40199 0.77665 0.501 0.62158 0.24975 0.35709 0.44596 0.18657 0.38572 0.20379 0.02052 0.14014 0.27879 0.41899 0.75811 0.00474 0.54516 0.18899 0.31135 0.08667 0.92397 0.54295 0.50217 0.51771 0.62757 0.97587 0.69338 0.65709 0.07825 0.29948 0.22431 0.91134 0.63635 0.53158 0.10074 0.6376 0.44381 0.45525 0.89222 0.61497 0.29431 0.68628 0.47998 0.1942 0.43276 0.0211 0.38229 0.31021 0.63288 0.71673 0.51151 0.48883 0.45774 0.50017 0.0 [...]
+0.74988 0.05399 0.45692 0.96705 0.18767 0.77128 0.57745 0.70853 0.0589 0.36056 0.63782 0.66512 0.93232 0.76584 0.32794 0.2557 0.59714 0.69061 0.73722 0.59839 0.93543 0.42786 0.99754 0.93913 0.49425 0.40482 0.79102 0.3118 0.42594 0.37858 0.35954 0.53325 0.06051 0.37292 0.37117 0.90912 0.13585 0.49388 0.19354 0.79769 0.07667 0.33716 0.59441 0.50395 0.01284 0.59387 0.66361 0.94196 0.25236 0.38787 0.2053 0.74998 0.38301 0.08222 0.23932 0.50441 0.41819 0.56959 0.84266 0.39156 0.1395 0.41596 0 [...]
+0.93686 0.37975 0.0291 0.57559 0.62125 0.1799 0.32137 0.6416 0.3126 0.58393 0.14017 0.29105 0.80141 0.24063 0.04249 0.26523 0.36224 0.81021 0.35969 0.13713 0.5584 0.06539 0.59542 0.3232 0.29605 0.3256 0.65183 0.37027 0.27838 0.19356 0.3665 0.59957 0.04102 0.1057 0.56226 0.50769 0.12994 0.97554 0.24515 0.33811 0.98695 0.81486 0.1489 0.36168 0.18456 0.31411 0.47024 0.36368 0.04708 0.33991 0.21676 0.35837 0.3771 0.94846 0.02902 0.35944 0.17978 0.38767 0.28274 0.01116 0.30739 0.71141 0.45667 [...]
+0.94432 0.58736 0.72581 0.61802 0.56434 0.73485 0.91115 0.74718 0.45685 0.4087 0.33129 0.73023 0.02894 0.57612 0.40665 0.59354 0.87549 0.44888 0.49257 0.14984 0.02578 0.90195 0.12406 0.82865 0.88616 0.12513 0.70599 0.14144 0.60809 0.86959 0.89016 0.40779 0.52393 0.94959 0.15705 0.87848 0.81238 0.32781 0.2849 0.47511 0.47394 0.47764 0.54092 0.82099 0.97305 0.86954 0.28747 0.13684 0.74067 0.65872 0.24953 0.30633 0.96856 0.08067 0.3111 0.02408 0.82708 0.09683 0.74631 0.87741 0.79698 0.65062 [...]
+0.47412 0.85569 0.20126 0.17111 0.55504 0.85234 0.89555 0.00815 0.27608 0.47491 0.34493 0.34187 0.4664 0.64334 0.09654 0.83067 0.68725 0.26481 0.03462 0.01293 0.43591 0.52081 0.18207 0.86317 0.41549 0.21078 0.71754 0.23323 0.48086 0.40485 0.71711 0.93091 0.17121 0.19094 0.97946 0.81276 0.58356 0.30778 0.96411 0.88802 0.90988 0.85669 0.8019 0.57852 0.63664 0.13526 0.1154 0.55806 0.6466 0.87673 0.38647 0.22281 0.93818 0.64917 0.60494 0.30541 0.62016 0.46417 0.88926 0.98177 0.19506 0.73304  [...]
+0.33804 0.56363 0.6644 0.19921 0.12598 0.90186 0.84385 0.12632 0.4816 0.77322 0.98581 0.69406 0.15793 0.93102 0.44028 0.47109 0.95226 0.67961 0.12841 0.18216 0.96055 0.40824 0.10201 0.15564 0.42701 0.72754 0.07516 0.06405 0.29315 0.47764 0.72642 0.15563 0.32026 0.37206 0.60738 0.17103 0.05998 0.48189 0.81094 0.49702 0.57575 0.36873 0.58709 0.94312 0.25962 0.31934 0.47865 0.0307 0.14859 0.00142 0.86669 0.02241 0.11136 0.85011 0.41699 0.49443 0.569 0.66595 0.42437 0.28347 0.60923 0.25868 0 [...]
+0.98075 0.07967 0.13866 0.36836 0.46896 0.9748 0.2125 0.64942 0.78759 0.71368 0.28488 0.51344 0.2404 0.01514 0.97548 0.84753 0.579 0.56813 0.863 0.85608 0.8317 0.49004 0.49221 0.52616 0.10185 0.31609 0.09169 0.30661 0.63763 0.99433 0.78121 0.83211 0.83771 0.21374 0.75959 0.68452 0.23828 0.67635 0.05516 0.53148 0.13596 0.1306 0.05559 0.63084 0.22995 0.16827 0.19456 0.62525 0.68706 0.71431 0.81572 0.34144 0.08143 0.67975 0.6272 0.286 0.22335 0.25268 0.46919 0.7275 0.10533 0.98546 0.99865 0 [...]
+0.29553 0.02383 0.61429 0.05443 0.91371 0.77805 0.15029 0.19567 0.86015 0.9369 0.42451 0.58628 0.55162 0.59914 0.61619 0.60116 0.37121 0.69086 0.83038 0.54957 0.85722 0.68289 0.84831 0.34228 0.19604 0.96749 0.50625 0.59733 0.19886 0.43343 0.50401 0.31246 0.5679 0.79993 0.71455 0.07848 0.41521 0.3627 0.19361 0.29887 0.12234 0.92966 0.46771 0.44941 0.69807 0.18465 0.5832 0.5225 0.7179 0.29129 0.62548 0.5948 0.10713 0.77818 0.87996 0.98318 0.57692 0.60385 0.38036 0.81557 0.69308 0.66291 0.8 [...]
+0.70647 0.94273 0.43633 0.96989 0.20174 0.36927 0.44292 0.55252 0.06098 0.39963 0.4114 0.35305 0.69685 0.51756 0.0556 0.38601 0.3666 0.34502 0.70703 0.97865 0.58817 0.02442 0.43015 0.01592 0.37916 0.38537 0.92831 0.50034 0.22464 0.46626 0.27731 0.9112 0.84178 0.40823 0.59699 0.32865 0.89353 0.43141 0.86108 0.5313 0.05535 0.46702 0.1901 0.71089 0.12038 0.59055 0.62743 0.98573 0.54305 0.03841 0.38093 0.61149 0.64957 0.97647 0.92546 0.46913 0.45079 0.45722 0.22024 0.61531 0.44006 0.52721 0. [...]
+0.1095 0.94214 0.64739 0.28708 0.07413 0.53254 0.11257 0.93836 0.44519 0.42739 0.90032 0.38878 0.36571 0.51811 0.47407 0.9776 0.04572 0.39377 0.38687 0.89092 0.93431 0.45972 0.12741 0.72415 0.13932 0.54242 0.41798 0.99113 0.2011 0.3311 0.80475 0.88329 0.08679 0.54257 0.46875 0.36262 0.41713 0.11207 0.0091 0.96857 0.55469 0.31262 0.57455 0.25237 0.5993 0.98466 0.99618 0.09423 0.02554 0.6637 0.54245 0.94668 0.0949 0.15441 0.20242 0.77896 0.43987 0.55415 0.17199 0.78344 0.75599 0.33449 0.81 [...]
+0.71557 0.72592 0.61385 0.07463 0.12548 0.72869 0.06917 0.29919 0.68514 0.52864 0.12173 0.05238 0.74661 0.65972 0.81532 0.2677 0.93104 0.46652 0.24623 0.44254 0.07507 0.58282 0.19026 0.30976 0.37042 0.23527 0.08925 0.84096 0.66051 0.20762 0.1981 0.70698 0.51791 0.90429 0.31843 0.23568 0.44769 0.60979 0.88413 0.75973 0.07588 0.0892 0.07679 0.80204 0.38724 0.78263 0.8377 0.25306 0.24477 0.21703 0.52726 0.49561 0.23332 0.42549 0.51542 0.32559 0.08997 0.57283 0.92133 0.97207 0.87767 0.41427  [...]
+0.66578 0.35791 0.09176 0.87243 0.07134 0.82997 0.61103 0.96695 0.61847 0.75932 0.66254 0.59977 0.55749 0.64658 0.08133 0.99514 0.89069 0.49442 0.80773 0.47479 0.69544 0.49295 0.39648 0.84809 0.80043 0.08733 0.49066 0.20329 0.02118 0.63745 0.94558 0.30658 0.83699 0.46934 0.31211 0.40751 0.10457 0.34247 0.49317 0.57314 0.1775 0.616 0.47555 0.57547 0.45783 0.50935 0.40633 0.56099 0.29233 0.04619 0.96425 0.54209 0.56434 0.5538 0.64873 0.73596 0.95188 0.24693 0.80373 0.97669 0.97939 0.34027  [...]
+0.48519 0.27596 0.7821 0.09298 0.63963 0.32728 0.14852 0.53704 0.24241 0.56229 0.71189 0.45476 0.59141 0.15055 0.1013 0.24755 0.64414 0.07699 0.33052 0.45343 0.83828 0.75328 0.38246 0.0147 0.12681 0.15417 0.56743 0.34062 0.29458 0.60765 0.10683 0.24176 0.77634 0.3788 0.15973 0.0706 0.2327 0.21862 0.49214 0.21354 0.33695 0.1206 0.50222 0.23164 0.19457 0.50751 0.80652 0.30492 0.93696 0.78094 0.95998 0.60092 0.41434 0.6 0.60332 0.7851 0.6997 0.04966 0.03398 0.60237 0.46645 0.03139 0.77615 0 [...]
+0.39297 0.73434 0.10929 0.69998 0.38166 0.85668 0.73734 0.73318 0.22945 0.57357 0.60054 0.36985 0.06715 0.29355 0.33164 0.14665 0.29452 0.16406 0.57255 0.70322 0.70339 0.69788 0.37207 0.89735 0.69723 0.52281 0.05123 0.90399 0.01029 0.5123 0.9108 0.12611 0.42916 0.91782 0.53601 0.38765 0.04059 0.2798 0.26876 0.71852 0.39703 0.63736 0.68033 0.43994 0.8665 0.3842 0.61731 0.11802 0.25224 0.98201 0.84145 0.06113 0.71801 0.81754 0.62358 0.70116 0.80502 0.59507 0.3012 0.52877 0.41278 0.16505 0. [...]
+0.71293 0.00122 0.30217 0.1112 0.34051 0.94699 0.50343 0.57146 0.01227 0.64512 0.34636 0.72146 0.5456 0.28277 0.76819 0.34623 0.48231 0.53881 0.63781 0.25436 0.28944 0.70621 0.80121 0.63209 0.65405 0.10491 0.40905 0.02882 0.90518 0.40256 0.87034 0.57841 0.69396 0.61036 0.26502 0.15421 0.05298 0.09605 0.09523 0.9619 0.77274 0.89434 0.11175 0.35019 0.23726 0.86822 0.84369 0.91055 0.16414 0.19371 0.38449 0.81949 0.74498 0.64649 0.2392 0.16853 0.00302 0.32266 0.76938 0.90857 0.14237 0.99773  [...]
+0.41438 0.42415 0.73592 0.71638 0.23302 0.49979 0.90003 0.46398 0.64229 0.5249 0.63725 0.91653 0.77841 0.96883 0.69449 0.74887 0.62601 0.49556 0.57573 0.73672 0.50392 0.48989 0.27013 0.92305 0.24178 0.66202 0.42715 0.78867 0.9887 0.0415 0.30826 0.75372 0.27402 0.32232 0.45281 0.99651 0.65019 0.8452 0.1253 0.14415 0.73039 0.993 0.60682 0.8895 0.44139 0.3075 0.87297 0.61274 0.8145 0.01074 0.5085 0.82221 0.35558 0.60857 0.37406 0.30519 0.03554 0.34935 0.98986 0.85438 0.56454 0.47078 0.78647 [...]
+0.11459 0.99586 0.97436 0.72704 0.97543 0.39496 0.79695 0.30401 0.54463 0.80364 0.14306 0.9718 0.32584 0.5572 0.95138 0.64879 0.13331 0.50781 0.81496 0.56975 0.81757 0.65894 0.75234 0.49206 0.00139 0.5273 0.66419 0.00376 0.85228 0.43775 0.4466 0.03382 0.92257 0.15234 0.28105 0.54669 0.80205 0.58904 0.9632 0.71533 0.88213 0.10588 0.90206 0.68364 0.39087 0.09858 0.47411 0.14097 0.60257 0.61828 0.36484 0.04042 0.86666 0.63286 0.06259 0.02487 0.18743 0.00553 0.85067 0.62108 0.09535 0.03308 0 [...]
+0.50266 0.37714 0.97934 0.56653 0.13076 0.76352 0.9434 0.20879 0.08726 0.39628 0.95774 0.23483 0.35175 0.21983 0.98315 0.73074 0.0041 0.83958 0.15006 0.49837 0.51022 0.52759 0.36649 0.4649 0.43991 0.35537 0.85647 0.53556 0.29389 0.54338 0.86645 0.55546 0.95218 0.58756 0.89421 0.67282 0.35912 0.54032 0.23022 0.49995 0.87616 0.4373 0.73284 0.62179 0.1631 0.49482 0.74298 0.6971 0.45964 0.91858 0.48977 0.09784 0.73976 0.95407 0.35155 0.63542 0.39045 0.81997 0.75402 0.28185 0.59404 0.24942 0. [...]
+0.40792 0.43608 0.52295 0.53217 0.26868 0.98574 0.03584 0.71785 0.73916 0.88471 0.11608 0.35261 0.65897 0.94383 0.83971 0.37641 0.61646 0.95974 0.29196 0.43298 0.78822 0.37801 0.24955 0.61639 0.85383 0.43427 0.20533 0.92427 0.54793 0.33184 0.00973 0.3951 0.3219 0.69376 0.14108 0.48056 0.90827 0.55304 0.84306 0.27516 0.57492 0.30903 0.93075 0.11182 0.47246 0.44074 0.18555 0.58995 0.3735 0.61206 0.11984 0.91053 0.19909 0.5214 0.41628 0.53589 0.33981 0.26862 0.8363 0.37479 0.16615 0.86805 0 [...]
+0.86936 0.97189 0.04284 0.46044 0.83184 0.37475 0.21801 0.8366 0.6241 0.91092 0.83889 0.08034 0.15544 0.19241 0.28654 0.25267 0.14364 0.71488 0.45384 0.9026 0.73484 0.12666 0.60429 0.06391 0.86764 0.40157 0.18895 0.42432 0.56857 0.91602 0.48208 0.07405 0.74606 0.12613 0.81404 0.11896 0.19427 0.88894 0.1941 0.90949 0.17774 0.41992 0.99469 0.64459 0.8042 0.65785 0.59228 0.15835 0.8662 0.32045 0.2549 0.88367 0.30247 0.69418 0.46484 0.95444 0.14433 0.8434 0.56326 0.67051 0.39952 0.12838 0.27 [...]
+0.74789 0.80894 0.51593 0.6901 0.13383 0.95525 0.15579 0.31309 0.08644 0.55126 0.83785 0.7828 0.47385 0.43766 0.43499 0.80129 0.82634 0.39406 0.08797 0.20739 0.65399 0.47661 0.09044 0.52595 0.56561 0.35874 0.02846 0.30545 0.23529 0.14808 0.55207 0.361 0.46553 0.59699 0.9543 0.68173 0.45947 0.66611 0.4758 0.25066 0.47847 0.0331 0.18891 0.8767 0.79866 0.9814 0.74929 0.99865 0.77896 0.84406 0.85438 0.31751 0.22521 0.49005 0.66829 0.26788 0.64533 0.29849 0.67258 0.19329 0.44041 0.93999 0.890 [...]
+0.96085 0.12956 0.36343 0.89994 0.5193 0.11863 0.00436 0.84079 0.02011 0.61796 0.25627 0.37937 0.43292 0.38447 0.61289 0.64301 0.59861 0.68639 0.17661 0.0058 0.20009 0.08861 0.62897 0.64358 0.28113 0.78509 0.14141 0.88765 0.91888 0.0751 0.91335 0.14358 0.03467 0.39905 0.88302 0.09226 0.8887 0.63657 0.1922 0.26727 0.50444 0.23904 0.86506 0.22164 0.11191 0.8398 0.56415 0.24851 0.14052 0.86238 0.34389 0.63011 0.1386 0.02051 0.88316 0.66823 0.33502 0.40365 0.69264 0.00799 0.65635 0.60153 0.4 [...]
+0.75898 0.83103 0.46752 0.27957 0.36592 0.91728 0.16791 0.79639 0.62231 0.46755 0.10967 0.82283 0.40469 0.28059 0.48325 0.89971 0.49498 0.65577 0.11451 0.8945 0.86688 0.06185 0.00238 0.80963 0.1558 0.22972 0.33023 0.50901 0.33087 0.89789 0.68636 0.09966 0.60557 0.7662 0.1933 0.16043 0.66757 0.0804 0.38791 0.28873 0.17949 0.01089 0.36486 0.53987 0.65707 0.96561 0.12891 0.1634 0.31564 0.61633 0.4626 0.64454 0.01998 0.15695 0.20361 0.15758 0.26559 0.20196 0.60375 0.84543 0.35064 0.52231 0.1 [...]
+0.2048 0.50148 0.03003 0.59581 0.04157 0.2481 0.27281 0.12087 0.03143 0.65939 0.1836 0.28289 0.31468 0.47663 0.82315 0.66843 0.02501 0.83957 0.85672 0.19418 0.23939 0.17088 0.58498 0.14473 0.58897 0.26546 0.29701 0.75412 0.41038 0.97555 0.2654 0.85596 0.88661 0.24286 0.65447 0.08188 0.36368 0.47453 0.56152 0.37253 0.08145 0.85534 0.08557 0.5451 0.42724 0.76628 0.43425 0.73568 0.73206 0.89467 0.44596 0.48003 0.62803 0.64579 0.55713 0.75213 0.97783 0.70176 0.47572 0.76211 0.17195 0.60589 0 [...]
+0.70339 0.33759 0.39857 0.87841 0.53871 0.17101 0.64573 0.89804 0.56715 0.62904 0.17344 0.35298 0.15758 0.35369 0.1325 0.54507 0.0184 0.54779 0.88807 0.76996 0.13429 0.23934 0.20192 0.41605 0.9726 0.65873 0.40108 0.09982 0.21834 0.73892 0.22066 0.74291 0.7026 0.03236 0.78551 0.26118 0.38285 0.228 0.86368 0.41167 0.59105 0.63512 0.72564 0.73352 0.52212 0.12832 0.01416 0.1261 0.51254 0.46221 0.45866 0.48584 0.92823 0.42098 0.06474 0.67989 0.42383 0.55292 0.15391 0.26687 0.47408 0.04265 0.5 [...]
+0.29898 0.69876 0.68229 0.46381 0.94059 0.62292 0.97125 0.76895 0.49896 0.03838 0.36743 0.07406 0.60584 0.27314 0.77816 0.87009 0.98457 0.93941 0.63201 0.01911 0.94945 0.91616 0.53252 0.66658 0.1318 0.78609 0.84031 0.23819 0.13603 0.83173 0.94541 0.87412 0.56217 0.41702 0.3233 0.49015 0.39717 0.38258 0.6244 0.7229 0.29723 0.98634 0.16982 0.44968 0.11799 0.78623 0.6466 0.52185 0.44241 0.76787 0.2283 0.81528 0.39637 0.89496 0.62493 0.28937 0.36838 0.94939 0.19255 0.25554 0.27965 0.15514 0. [...]
+0.25359 0.98568 0.32276 0.42193 0.36351 0.08062 0.2061 0.73417 0.46818 0.43526 0.33314 0.4471 0.40335 0.81565 0.85027 0.93743 0.34553 0.90514 0.84964 0.81128 0.98463 0.35726 0.96447 0.17297 0.39825 0.04047 0.47701 0.99603 0.14849 0.59921 0.93702 0.53898 0.59234 0.47654 0.22118 0.31353 0.79495 0.11992 0.69537 0.5887 0.20023 0.00217 0.95106 0.26943 0.45325 0.49841 0.38787 0.67174 0.65418 0.2225 0.17428 0.33509 0.79429 0.56143 0.5147 0.94649 0.31217 0.71462 0.97534 0.90026 0.03154 0.80885 0 [...]
+0.85695 0.00764 0.43581 0.34091 0.03369 0.08601 0.55625 0.79232 0.87956 0.41804 0.85664 0.16536 0.05968 0.70227 0.41275 0.58834 0.93988 0.82902 0.22884 0.68855 0.82365 0.16731 0.22126 0.16266 0.07329 0.41859 0.30728 0.85561 0.76312 0.64873 0.48982 0.69818 0.1588 0.53432 0.04321 0.74284 0.58921 0.63834 0.67225 0.90747 0.4367 0.57023 0.19428 0.92967 0.99948 0.38385 0.59965 0.48017 0.06134 0.09266 0.60203 0.47339 0.55316 0.42919 0.3369 0.01502 0.49498 0.30463 0.02488 0.41304 0.2149 0.18179  [...]
+0.63067 0.30323 0.95202 0.7744 0.79329 0.2967 0.57933 0.66998 0.73706 0.83938 0.7307 0.52943 0.0855 0.94542 0.5458 0.91836 0.43212 0.07452 0.13258 0.11503 0.38582 0.29432 0.50652 0.06905 0.05967 0.24941 0.9469 0.40999 0.34811 0.60146 0.44407 0.33933 0.8098 0.2976 0.41928 0.28361 0.99675 0.27399 0.27389 0.858 0.62509 0.33237 0.80145 0.20627 0.11927 0.53093 0.61288 0.61758 0.69683 0.17186 0.05354 0.34074 0.79054 0.67311 0.51675 0.48775 0.87047 0.64132 0.0486 0.55453 0.08869 0.80955 0.59269 [...]
+0.25972 0.84684 0.65766 0.29017 0.92433 0.82366 0.21567 0.61447 0.95112 0.39197 0.08495 0.49366 0.36952 0.25299 0.53919 0.4193 0.98622 0.49472 0.6523 0.57708 0.45142 0.68491 0.3243 0.42495 0.17502 0.07869 0.08988 0.97595 0.12065 0.85249 0.97427 0.63639 0.13063 0.62193 0.45866 0.0918 0.83966 0.55793 0.81812 0.01863 0.26009 0.26623 0.92878 0.4765 0.67891 0.64396 0.14372 0.06795 0.50771 0.06981 0.01192 0.09578 0.74834 0.2832 0.87627 0.23177 0.97899 0.94652 0.76609 0.54408 0.82174 0.18245 0. [...]
+0.28228 0.57299 0.25396 0.17382 0.59169 0.26192 0.98212 0.20463 0.85309 0.4136 0.809 0.67304 0.81888 0.25133 0.73075 0.66141 0.10607 0.28302 0.74111 0.91045 0.51181 0.01165 0.15865 0.99811 0.81896 0.41441 0.89657 0.73744 0.18035 0.14688 0.27966 0.46983 0.7144 0.08115 0.97622 0.14373 0.01656 0.28123 0.82167 0.35226 0.99112 0.39551 0.79719 0.44097 0.08073 0.19346 0.3018 0.55266 0.66304 0.78901 0.97912 0.01942 0.51656 0.53741 0.73551 0.40068 0.91204 0.84868 0.5862 0.52447 0.86317 0.18648 0. [...]
+0.59159 0.07185 0.60502 0.80969 0.43383 0.11526 0.28434 0.91773 0.37518 0.05841 0.80535 0.63638 0.79531 0.52572 0.09694 0.17671 0.73848 0.04648 0.73729 0.67322 0.12678 0.89834 0.35431 0.9311 0.88201 0.26325 0.32944 0.76265 0.4067 0.69436 0.48334 0.5889 0.55107 0.16513 0.64377 0.2126 0.06968 0.604 0.57894 0.95438 0.53849 0.42301 0.03765 0.02939 0.35167 0.10093 0.11739 0.65783 0.71869 0.509 0.02156 0.18765 0.4371 0.64182 0.31948 0.85495 0.57652 0.95288 0.26225 0.4242 0.63186 0.6315 0.2985  [...]
+0.44543 0.75327 0.69578 0.17692 0.87278 0.05178 0.67378 0.33538 0.04809 0.12639 0.27936 0.64404 0.86145 0.47211 0.97608 0.60244 0.67515 0.21522 0.12614 0.0829 0.89398 0.7686 0.48971 0.19999 0.02593 0.49915 0.03354 0.68502 0.55737 0.91248 0.94655 0.492 0.14559 0.86136 0.2327 0.45065 0.77892 0.50756 0.80663 0.29135 0.13585 0.03419 0.33517 0.19 0.0316 0.38598 0.24132 0.55864 0.89583 0.00379 0.1832 0.2275 0.95356 0.55894 0.56801 0.32705 0.13187 0.19147 0.9792 0.79615 0.62182 0.29457 0.17492  [...]
+0.48101 0.96927 0.85508 0.84988 0.86792 0.82818 0.07967 0.17552 0.09088 0.28829 0.57672 0.43578 0.65786 0.57451 0.5841 0.66931 0.44957 0.90414 0.88323 0.90254 0.89413 0.64663 0.85262 0.6036 0.82808 0.24759 0.20179 0.84417 0.2111 0.63891 0.93637 0.42567 0.8381 0.73949 0.65836 0.64729 0.80845 0.50569 0.40124 0.54644 0.03061 0.58069 0.6077 0.17146 0.50876 0.62611 0.9055 0.01957 0.48046 0.05705 0.99489 0.69401 0.21098 0.90937 0.44132 0.01497 0.71345 0.84312 0.94766 0.63341 0.43469 0.28747 0. [...]
+0.72326 0.10311 0.5714 0.41587 0.43497 0.22922 0.60025 0.86347 0.63225 0.40236 0.97157 0.91836 0.15691 0.45748 0.9711 0.18903 0.94686 0.04016 0.52583 0.90429 0.07888 0.12601 0.21725 0.50527 0.92261 0.62263 0.99008 0.65767 0.68403 0.64784 0.27424 0.96339 0.20222 0.02121 0.82529 0.34196 0.85177 0.59565 0.39737 0.40784 0.59711 0.66125 0.27465 0.18303 0.17542 0.02781 0.76332 0.28599 0.47464 0.45949 0.92918 0.68346 0.72336 0.45448 0.93278 0.02364 0.00241 0.73528 0.41618 0.87946 0.97855 0.3111 [...]
+0.4457 0.98488 0.23953 0.35272 0.5934 0.57135 0.50996 0.1819 0.36568 0.95206 0.60876 0.14083 0.49099 0.44465 0.51693 0.58031 0.40237 0.06211 0.23888 0.79256 0.8437 0.10341 0.66018 0.59769 0.24261 0.55149 0.38172 0.83268 0.51292 0.54379 0.5852 0.57977 0.85159 0.44994 0.05603 0.82786 0.48838 0.9772 0.34629 0.47371 0.25478 0.49786 0.1479 0.80883 0.99874 0.83912 0.94622 0.79933 0.03364 0.14701 0.99837 0.53984 0.73381 0.68132 0.17624 0.7824 0.56029 0.84586 0.06189 0.93397 0.86462 0.75877 0.73 [...]
+0.92277 0.57209 0.48502 0.9046 0.78904 0.56284 0.20197 0.22198 0.05741 0.79198 0.27433 0.14719 0.4904 0.68973 0.16609 0.18645 0.16437 0.16394 0.64989 0.295 0.36618 0.36483 0.89864 0.74556 0.66071 0.23654 0.60162 0.43685 0.85336 0.24309 0.66255 0.40015 0.11457 0.15751 0.91203 0.15164 0.09154 0.52817 0.87083 0.04874 0.30687 0.1075 0.23963 0.96529 0.26769 0.35269 0.43924 0.93031 0.69222 0.00745 0.09495 0.45665 0.22312 0.90046 0.40573 0.93348 0.43154 0.77757 0.16759 0.71227 0.49096 0.84207 0 [...]
+0.27362 0.2035 0.01197 0.483 0.97184 0.20695 0.89883 0.59476 0.98775 0.76909 0.7726 0.72163 0.07342 0.13757 0.90116 0.23013 0.50788 0.53456 0.92449 0.73286 0.42567 0.43921 0.39835 0.08088 0.44882 0.97303 0.08132 0.69004 0.32918 0.44968 0.71781 0.16382 0.22754 0.97873 0.93673 0.05216 0.42505 0.61016 0.62622 0.40207 0.40198 0.27208 0.72925 0.21668 0.7914 0.64167 0.03914 0.75187 0.38613 0.68847 0.59207 0.33927 0.37685 0.71042 0.69982 0.86831 0.33713 0.80993 0.59282 0.66267 0.29683 0.85548 0 [...]
+0.27333 0.84111 0.36718 0.62684 0.32779 0.96959 0.32369 0.97305 0.64461 0.65145 0.59656 0.87373 0.64622 0.11926 0.23707 0.50153 0.28902 0.72387 0.6133 0.8225 0.78273 0.9374 0.11982 0.97821 0.8945 0.31514 0.56905 0.36921 0.21338 0.45638 0.88657 0.21878 0.16434 0.83196 0.1628 0.53191 0.08491 0.96981 0.37302 0.58778 0.9001 0.97662 0.576 0.94831 0.69019 0.27333 0.85529 0.98923 0.5847 0.98342 0.57818 0.54632 0.74422 0.02378 0.02064 0.4495 0.5818 0.69204 0.48206 0.86682 0.01174 0.84042 0.14611 [...]
+0.07867 0.12119 0.84472 0.31217 0.42476 0.54229 0.65843 0.22411 0.93508 0.80965 0.75214 0.33485 0.92495 0.24692 0.49803 0.64336 0.28913 0.37485 0.55499 0.48994 0.15436 0.74827 0.73495 0.75183 0.61444 0.60512 0.50942 0.32759 0.95886 0.56101 0.22451 0.97039 0.72919 0.39191 0.91864 0.54321 0.34829 0.12168 0.63422 0.21445 0.75303 0.91078 0.01082 0.44177 0.60676 0.78616 0.77122 0.16883 0.47446 0.1102 0.06134 0.52968 0.01091 0.52836 0.17388 0.31986 0.65069 0.73552 0.82999 0.63608 0.44992 0.803 [...]
+0.21686 0.38683 0.54045 0.56334 0.5927 0.58694 0.35011 0.6131 0.6993 0.54258 0.05797 0.56451 0.64504 0.22629 0.20746 0.91831 0.95966 0.43836 0.43414 0.27886 0.79613 0.59485 0.03383 0.48379 0.52804 0.3993 0.77044 0.12992 0.9948 0.66618 0.43704 0.85785 0.66488 0.89793 0.53533 0.41019 0.65529 0.47184 0.06494 0.37588 0.56459 0.89125 0.53392 0.50775 0.01077 0.49584 0.07495 0.06526 0.4689 0.61879 0.20611 0.84123 0.79635 0.59452 0.12562 0.87248 0.14214 0.21024 0.14624 0.45962 0.71993 0.17212 0. [...]
+0.57219 0.44451 0.61814 0.85839 0.49781 0.79836 0.23321 0.44695 0.17834 0.66088 0.86544 0.98714 0.57345 0.90465 0.26641 0.63917 0.22537 0.42114 0.62551 0.27771 0.03876 0.99613 0.12665 0.7534 0.17773 0.97245 0.03114 0.7701 0.95485 0.69054 0.98361 0.16918 0.34215 0.47387 0.77605 0.76003 0.82722 0.31655 0.94392 0.3363 0.27887 0.01806 0.67384 0.88651 0.12318 0.95998 0.92968 0.87293 0.884 0.39912 0.45541 0.10473 0.9533 0.26644 0.24344 0.33058 0.24612 0.97846 0.87762 0.86597 0.27075 0.38406 0. [...]
+0.42867 0.18314 0.46555 0.60893 0.14751 0.6732 0.71514 0.31017 0.85306 0.13596 0.9698 0.35585 0.5694 0.1441 0.89357 0.00976 0.19126 0.81718 0.38163 0.40472 0.95728 0.13443 0.24726 0.4808 0.02439 0.11449 0.76441 0.93472 0.1535 0.18823 0.89423 0.52597 0.01231 0.90121 0.49201 0.43355 0.84404 0.13363 0.93721 0.38449 0.86587 0.89926 0.09235 0.39944 0.19588 0.32111 0.27339 0.58679 0.16527 0.60575 0.58703 0.81859 0.21791 0.09854 0.01272 0.37814 0.96219 0.30338 0.96472 0.98337 0.72511 0.76415 0. [...]
+0.90915 0.76577 0.01663 0.00498 0.06247 0.97587 0.98428 0.53171 0.43109 0.42607 0.63593 0.56521 0.66037 0.05485 0.36384 0.86146 0.09212 0.69378 0.81927 0.85085 0.19772 0.64368 0.14813 0.15624 0.62322 0.00643 0.79556 0.04969 0.48165 0.97342 0.25965 0.46503 0.34535 0.8853 0.01035 0.4893 0.24565 0.07637 0.94225 0.7651 0.68513 0.97446 0.20534 0.72376 0.40136 0.2561 0.34132 0.66696 0.07947 0.39767 0.33481 0.04012 0.47901 0.45676 0.54195 0.64402 0.10756 0.50293 0.3243 0.42104 0.48127 0.02566 0 [...]
+0.50024 0.88848 0.11641 0.15806 0.30098 0.25992 0.83509 0.26296 0.47088 0.45188 0.35958 0.14708 0.7605 0.85673 0.99305 0.0067 0.41703 0.15302 0.65 0.44601 0.40774 0.39114 0.91144 0.69674 0.18255 0.70111 0.52316 0.8325 0.8265 0.82006 0.08905 0.00616 0.86968 0.66245 0.33326 0.09036 0.82144 0.77728 0.35153 0.18345 0.62998 0.63309 0.29766 0.389 0.06697 0.79351 0.40416 0.049 0.42271 0.57705 0.43629 0.98235 0.30912 0.13668 0.90675 0.95238 0.5877 0.15929 0.0501 0.20956 0.57903 0.46219 0.76005 0 [...]
+0.02682 0.14153 0.7702 0.88776 0.80738 0.81332 0.83965 0.94107 0.32636 0.10755 0.1287 0.36224 0.06691 0.8354 0.67114 0.70358 0.13109 0.06574 0.80546 0.58261 0.20482 0.39535 0.14752 0.96264 0.997 0.25722 0.61324 0.3162 0.31062 0.37112 0.43548 0.31626 0.44891 0.2379 0.72128 0.10917 0.60472 0.26598 0.88374 0.65936 0.76888 0.35135 0.9965 0.23757 0.56533 0.48261 0.05824 0.29555 0.34439 0.55846 0.23308 0.59815 0.90522 0.33642 0.61142 0.84271 0.28114 0.38551 0.70925 0.32838 0.69045 0.24622 0.46 [...]
+0.70814 0.05779 0.05357 0.05137 0.92649 0.62809 0.33581 0.28188 0.81665 0.34031 0.38609 0.51192 0.52959 0.55955 0.20577 0.38069 0.91113 0.33392 0.54515 0.89464 0.2157 0.76504 0.22359 0.86958 0.10421 0.01416 0.87659 0.85185 0.49931 0.81587 0.6227 0.58237 0.29225 0.02758 0.34226 0.11554 0.98766 0.29303 0.51859 0.52641 0.98193 0.34176 0.3153 0.35923 0.35764 0.04161 0.91912 0.63377 0.10923 0.71929 0.28738 0.10674 0.86048 0.01382 0.7483 0.52232 0.29071 0.46175 0.53435 0.21448 0.22557 0.86527  [...]
+0.3502 0.12447 0.0178 0.90948 0.08983 0.51701 0.08286 0.13622 0.93966 0.63833 0.96293 0.02345 0.74866 0.93196 0.23193 0.13256 0.77316 0.11813 0.03903 0.60349 0.21987 0.53475 0.29618 0.05168 0.10171 0.94079 0.78314 0.15498 0.74017 0.15774 0.26403 0.57181 0.5087 0.64846 0.64004 0.17219 0.71652 0.39656 0.65167 0.25191 0.7245 0.4975 0.26379 0.96669 0.87782 0.66406 0.74444 0.25857 0.67593 0.209 0.19912 0.53407 0.29567 0.62362 0.87227 0.93955 0.44409 0.19244 0.29568 0.93274 0.37913 0.67859 0.6 [...]
+0.4078 0.06959 0.8829 0.06702 0.65535 0.87379 0.38406 0.81789 0.6039 0.09166 0.2841 0.14419 0.02205 0.74361 0.75491 0.23082 0.48814 0.72919 0.70757 0.52979 0.66432 0.73271 0.41892 0.30444 0.05039 0.61118 0.48104 0.8475 0.97684 0.08722 0.15297 0.46921 0.84064 0.04698 0.52658 0.98879 0.5695 0.12911 0.54448 0.96604 0.31921 0.06478 0.16837 0.58572 0.68748 0.88681 0.04453 0.8638 0.55904 0.59568 0.66267 0.55568 0.87564 0.5777 0.44372 0.25527 0.68667 0.99748 0.61444 0.04168 0.8228 0.35742 0.770 [...]
+0.56415 0.96303 0.66323 0.44059 0.58398 0.57534 0.42879 0.8562 0.84009 0.24144 0.29703 0.499 0.47144 0.64478 0.27604 0.43558 0.89804 0.80728 0.63819 0.11292 0.17251 0.58826 0.75192 0.32173 0.51939 0.56519 0.19613 0.75561 0.62833 0.40463 0.22485 0.24903 0.63874 0.74629 0.96627 0.02077 0.47105 0.87799 0.46615 0.37283 0.00026 0.56896 0.7492 0.25285 0.56389 0.39696 0.60527 0.09736 0.59238 0.60999 0.22144 0.94561 0.78653 0.60386 0.262 0.55306 0.82186 0.30958 0.45616 0.41882 0.15624 0.94289 0. [...]
+0.49924 0.08725 0.02394 0.51135 0.40786 0.97065 0.3334 0.75234 0.08863 0.67679 0.25863 0.52377 0.0516 0.44578 0.2581 0.14099 0.39056 0.14307 0.47456 0.95654 0.00999 0.11527 0.82547 0.56096 0.50161 0.74861 0.08585 0.4656 0.05068 0.32993 0.23046 0.54016 0.21965 0.07115 0.15894 0.00285 0.29498 0.73454 0.71802 0.51759 0.68252 0.25803 0.78842 0.76918 0.50011 0.15551 0.29277 0.70044 0.84503 0.19734 0.95295 0.96693 0.44977 0.8868 0.16856 0.3361 0.73277 0.76445 0.17289 0.31664 0.57256 0.72433 0. [...]
+0.21336 0.74715 0.37997 0.34787 0.67726 0.2505 0.01387 0.31949 0.78424 0.89067 0.66581 0.72652 0.53178 0.36794 0.84605 0.87765 0.08697 0.69777 0.38524 0.44556 0.62675 0.21142 0.26409 0.63633 0.04413 0.14446 0.41908 0.05557 0.29658 0.03132 0.7405 0.058 0.49955 0.72714 0.54023 0.83835 0.44054 0.62213 0.1298 0.57743 0.62889 0.67331 0.36459 0.60674 0.3514 0.37582 0.79859 0.0529 0.21305 0.69069 0.21975 0.98037 0.57297 0.03802 0.89581 0.90872 0.33663 0.7246 0.19633 0.42652 0.12642 0.6084 0.218 [...]
+0.92555 0.14319 0.63986 0.55243 0.26514 0.44471 0.88019 0.64871 0.17392 0.68811 0.05384 0.17352 0.1531 0.86819 0.04043 0.35525 0.17811 0.15379 0.25458 0.64869 0.33785 0.74002 0.53151 0.37834 0.91346 0.50219 0.7338 0.81001 0.90565 0.19547 0.39776 0.62465 0.95322 0.87748 0.10401 0.27476 0.18094 0.41276 0.47615 0.9426 0.15956 0.92007 0.42019 0.80386 0.71463 0.72617 0.41758 0.71879 0.37904 0.87957 0.99105 0.38254 0.98326 0.96074 0.27573 0.95566 0.00832 0.7776 0.6326 0.88816 0.06246 0.59177 0 [...]
+0.38623 0.28873 0.49184 0.17059 0.47079 0.98857 0.63277 0.82651 0.38318 0.82217 0.2441 0.37201 0.41833 0.29389 0.87479 0.42819 0.50357 0.8189 0.28386 0.64161 0.40763 0.28569 0.36731 0.14217 0.22062 0.75119 0.51498 0.37079 0.44696 0.32655 0.16164 0.73813 0.83364 0.29344 0.18069 0.53535 0.87347 0.9127 0.50343 0.79618 0.51775 0.5583 0.86571 0.38772 0.70816 0.79042 0.11897 0.41502 0.94588 0.12763 0.03279 0.02571 0.7977 0.61964 0.77107 0.65317 0.11657 0.25698 0.09102 0.40981 0.18524 0.53814 0 [...]
+0.13957 0.87274 0.15153 0.5209 0.21362 0.86401 0.01566 0.18546 0.2426 0.0088 0.1404 0.49716 0.5399 0.66 0.3694 0.68458 0.81315 0.32495 0.25764 0.76728 0.11924 0.42696 0.06097 0.6886 0.65578 0.19261 0.22425 0.20905 0.37978 0.25241 0.58859 0.45824 0.86106 0.68447 0.44958 0.49704 0.67569 0.56632 0.30116 0.09871 0.70878 0.24655 0.31419 0.08177 0.32759 0.00513 0.04892 0.75554 0.00682 0.64484 0.50752 0.95052 0.87982 0.97038 0.71832 0.17028 0.25583 0.07704 0.25063 0.03824 0.46981 0.33922 0.2851 [...]
+0.21655 0.46584 0.55004 0.49793 0.32013 0.34435 0.87957 0.2763 0.72825 0.73085 0.7776 0.26957 0.22474 0.74748 0.84589 0.90142 0.57828 0.7664 0.42522 0.19699 0.69804 0.37807 0.48609 0.54701 0.3553 0.53928 0.71832 0.52924 0.51656 0.73888 0.77282 0.6778 0.79778 0.07909 0.65424 0.62621 0.50598 0.57047 0.74204 0.03398 0.31293 0.4419 0.23041 0.98874 0.10764 0.90577 0.10465 0.71839 0.41625 0.78286 0.81161 0.46736 0.01077 0.62058 0.53062 0.76308 0.26644 0.76428 0.93083 0.63568 0.78791 0.61132 0. [...]
+0.75043 0.57887 0.44618 0.77504 0.20775 0.96256 0.4636 0.60451 0.48893 0.74378 0.51692 0.93307 0.09935 0.22102 0.4867 0.80951 0.20136 0.55081 0.25749 0.56015 0.17731 0.54155 0.89242 0.76447 0.37736 0.74516 0.17368 0.81312 0.88723 0.95007 0.10126 0.50595 0.3046 0.88618 0.90787 0.0445 0.28127 0.28438 0.44138 0.88406 0.5743 0.93496 0.86335 0.61309 0.6656 0.99608 0.92969 0.70283 0.8696 0.58176 0.69631 0.30411 0.57542 0.15245 0.69776 0.74561 0.78319 0.58374 0.98882 0.95985 0.73806 0.61096 0.2 [...]
+0.46771 0.36418 0.32706 0.56114 0.71985 0.31179 0.16865 0.07044 0.32981 0.74006 0.91976 0.39988 0.38613 0.11888 0.34506 0.39824 0.73962 0.37347 0.34867 0.36424 0.33455 0.56667 0.4072 0.12626 0.64928 0.75237 0.45925 0.48648 0.766 0.74795 0.87716 0.24626 0.61881 0.80268 0.21302 0.482 0.82639 0.18729 0.86483 0.51791 0.59571 0.29602 0.6281 0.51989 0.2691 0.09968 0.36563 0.4199 0.95712 0.87376 0.84088 0.01531 0.39352 0.64089 0.87027 0.95396 0.16647 0.54005 0.93395 0.18824 0.55326 0.50341 0.22 [...]
+0.71241 0.7521 0.47737 0.58392 0.18786 0.25359 0.5628 0.16413 0.15073 0.14109 0.91101 0.28432 0.02238 0.63933 0.67549 0.36421 0.11828 0.6852 0.25268 0.84003 0.42211 0.94758 0.64491 0.46438 0.52066 0.17168 0.61385 0.07595 0.21491 0.23341 0.54162 0.24371 0.25277 0.36225 0.24999 0.35151 0.56324 0.55523 0.92108 0.97056 0.82076 0.05791 0.77823 0.85556 0.1176 0.75926 0.35397 0.27201 0.62707 0.56922 0.00305 0.46944 0.54654 0.58425 0.08253 0.83731 0.22237 0.88866 0.84348 0.38188 0.83352 0.91434  [...]
+0.41908 0.57415 0.51879 0.58364 0.99046 0.01233 0.38997 0.49404 0.36748 0.47602 0.98223 0.83093 0.25237 0.95743 0.34751 0.45535 0.39372 0.89154 0.48823 0.66511 0.62605 0.15269 0.11584 0.28846 0.59991 0.08901 0.08724 0.16143 0.58616 0.66947 0.57155 0.51106 0.14959 0.97128 0.49247 0.12609 0.92237 0.73131 0.69767 0.00923 0.17897 0.09174 0.54356 0.20382 0.67547 0.37335 0.18408 0.16725 0.99088 0.11406 0.6615 0.95839 0.02722 0.55319 0.82263 0.5993 0.7027 0.92349 0.85165 0.1966 0.10325 0.76282  [...]
+0.26225 0.28473 0.51995 0.7886 0.92393 0.46468 0.15648 0.9576 0.68991 0.47298 0.49628 0.3713 0.74429 0.33203 0.10608 0.70288 0.89764 0.86424 0.79868 0.50957 0.77678 0.76513 0.59539 0.74538 0.75723 0.22339 0.2532 0.63945 0.36076 0.63201 0.14128 0.25313 0.34545 0.86448 0.59786 0.69461 0.97314 0.15412 0.19492 0.14047 0.84574 0.4386 0.10042 0.69164 0.44158 0.65942 0.95049 0.26546 0.23071 0.62379 0.07804 0.31393 0.67727 0.02067 0.6537 0.44676 0.79606 0.57926 0.37987 0.09666 0.42157 0.02913 0. [...]
+0.77461 0.88752 0.52115 0.43958 0.48255 0.04685 0.17184 0.94347 0.09502 0.67245 0.15912 0.66969 0.15256 0.81664 0.9204 0.1726 0.28734 0.65063 0.5381 0.08569 0.06441 0.9103 0.97707 0.52342 0.95013 0.55448 0.799 0.56988 0.97923 0.53269 0.08999 0.61968 0.72427 0.56886 6e-05 0.65202 0.86278 0.57187 0.23868 0.1994 0.86588 0.93194 0.03312 0.93031 0.92145 0.51598 0.75268 0.52403 0.08343 0.5907 0.07887 0.97691 0.35585 0.1651 0.03645 0.73531 0.83547 0.10653 0.1279 0.65757 0.85056 0.86854 0.21297  [...]
+0.21349 0.81934 0.21367 0.31981 0.43317 0.40901 0.13581 0.17907 0.23337 0.55935 0.04173 0.20334 0.11304 0.67786 0.08299 0.76195 0.59247 0.5285 0.36956 0.76715 0.97118 0.02823 0.98791 0.24136 0.0465 0.12408 0.35373 0.97852 0.14351 0.89551 0.21504 0.21154 0.42462 0.59127 0.86073 0.74945 0.28319 0.41636 0.40359 0.68272 0.03 0.64152 0.32257 0.99239 0.45566 0.60056 0.62951 0.55885 0.72289 0.64578 0.10045 0.26845 0.69209 0.56948 0.4168 0.04173 0.79684 0.97128 0.38454 0.37355 0.30167 0.14122 0. [...]
+0.83083 0.35274 0.04569 0.22007 0.93323 0.97599 0.22871 0.94085 0.57804 0.69072 0.03712 0.86893 0.49304 0.5576 0.86899 0.99513 0.81329 0.86843 0.99599 0.51546 0.86483 0.0339 0.11531 0.72893 0.05414 0.06723 0.70915 0.41994 0.3872 0.94021 0.12489 0.91405 0.86596 0.09685 0.52615 0.33512 0.78775 0.72825 0.31017 0.26489 0.73562 0.27895 0.36091 0.9742 0.71629 0.93952 0.97635 0.12377 0.74309 0.34283 0.18339 0.7582 0.87835 0.70852 0.52724 0.79549 0.33304 0.08279 0.87032 0.71461 0.08136 0.06451 0 [...]
+0.38769 0.02667 0.4731 0.70302 0.20146 0.64389 0.7269 0.36061 0.42231 0.58644 0.18601 0.26914 0.5581 0.86455 0.81535 0.76549 0.58256 0.17057 0.88439 0.62196 0.33952 0.81247 0.7877 0.59786 0.82621 0.4717 0.7956 0.7058 0.1289 0.98684 0.54901 0.23557 0.84014 0.84582 0.76848 0.62325 0.68959 0.53049 0.44633 0.65685 0.86724 0.84832 0.81551 0.89061 0.73457 0.26389 0.57067 0.51283 0.90901 0.55198 0.69465 0.30983 0.10427 0.34189 0.49707 0.25953 0.07323 0.02995 0.93579 0.16787 0.12745 0.62281 0.96 [...]
+0.73602 0.46173 0.00979 0.33604 0.18299 0.45951 0.6633 0.08171 0.98379 0.92458 0.22917 0.44051 0.77667 0.00705 0.01281 0.87052 0.35171 0.1267 0.08044 0.55115 0.38924 0.89143 0.66681 0.56664 0.42209 0.05469 0.70799 0.53465 0.81751 0.78751 0.11308 0.15317 0.635 0.06282 0.4105 0.58418 0.5659 0.83141 0.24925 0.18224 0.68505 0.77999 0.67251 0.28432 0.19129 0.82772 0.47656 0.52554 0.389 0.43455 0.78106 0.99377 0.80217 0.82492 0.02434 0.35568 0.4588 0.7889 0.66891 0.61364 0.89247 0.76605 0.6025 [...]
+0.27795 0.23045 0.55012 0.25675 0.18915 0.92648 0.81006 0.90695 0.17661 0.94475 0.93255 0.29206 0.84933 0.96239 0.65668 0.1665 0.40249 0.11329 0.81033 0.84712 0.6933 0.67574 0.0702 0.47081 0.25306 0.99313 0.22171 0.87693 0.09918 0.4257 0.39027 0.14908 0.33158 0.91164 0.45604 0.53015 0.63943 0.56012 0.0631 0.05532 0.89535 0.99668 0.2637 0.20747 0.84759 0.59963 0.47532 0.03828 0.83637 0.56887 0.30659 0.21232 0.52194 0.02494 0.17676 0.62249 0.46974 0.39015 0.41438 0.26447 0.47144 0.54221 0. [...]
+0.30296 0.43473 0.33917 0.43736 0.00112 0.62449 0.57336 0.93752 0.22689 0.24658 0.60999 0.42488 0.21939 0.84563 0.24862 0.94902 0.64133 0.43494 0.93223 0.69247 0.65256 0.10652 0.01775 0.69327 0.64611 0.88997 0.79892 0.92085 0.62546 0.34683 0.96964 0.50083 0.66278 0.5165 0.06735 0.99484 0.66726 0.72651 0.98341 0.07384 0.8637 0.9416 0.79227 0.39339 0.01582 0.02379 0.71129 0.64007 0.11374 0.28028 0.68396 0.75804 0.53127 0.1126 0.51365 0.81507 0.76949 0.81585 0.69007 0.35795 0.22214 0.53606  [...]
+0.10734 0.97714 0.55195 0.04129 0.83041 0.02692 0.35977 0.90593 0.66524 0.12488 0.28299 0.65919 0.1941 0.03659 0.27152 0.90184 0.67151 0.38382 0.77057 0.59135 0.45884 0.95284 0.31892 0.536 0.53516 0.99395 0.78718 0.30208 0.84103 0.91456 0.4673 0.19025 0.11018 0.51576 0.69778 0.32192 0.03662 0.52794 0.33296 0.73568 0.16268 0.09312 0.72991 0.3856 0.62328 0.76601 0.54218 0.23653 0.86315 0.71836 0.36552 0.57775 0.03387 0.45179 0.76518 0.43525 0.24043 0.97279 0.58529 0.29366 0.72352 0.50773 0 [...]
+0.11814 0.03547 0.52647 0.46636 0.02709 0.8181 0.09435 0.04618 0.94682 0.78394 0.44611 0.07713 0.88371 0.5053 0.05016 0.70641 0.97683 0.17251 0.872 0.76925 0.47056 0.34244 0.34288 0.42092 0.69902 0.38953 0.23664 0.85481 0.02668 0.68878 0.28314 0.12522 0.12381 0.12259 0.28294 0.28026 0.56737 0.04129 0.9375 0.90594 0.7351 0.13901 0.10892 0.62445 0.77356 0.2724 0.30557 0.07825 0.76661 0.52764 0.29359 0.26393 0.65728 0.97593 0.26864 0.18664 0.22826 0.69135 0.63487 0.41903 0.7398 0.10508 0.87 [...]
+0.2328 0.98756 0.62332 0.56732 0.37083 0.25867 0.56531 0.32055 0.98502 0.56534 0.22547 0.48182 0.04718 0.11008 0.08339 0.42977 0.55336 0.89679 0.74566 0.47157 0.98318 0.84944 0.02526 0.13567 0.21691 0.50991 0.5026 0.31568 0.53314 0.32688 0.95124 0.78389 0.84007 0.0577 0.19942 0.11957 0.44353 0.4121 0.03289 0.72493 0.87835 0.627 0.40129 0.51983 0.45309 0.17761 0.98436 0.51131 0.36366 0.12851 0.10152 0.39492 0.47603 0.40648 0.96663 0.07097 0.89901 0.69183 0.72243 0.10009 0.27774 0.65625 0. [...]
+0.03156 0.53082 0.97774 0.19589 0.35388 0.80656 0.20081 0.67887 0.15412 0.87452 0.35998 0.85049 0.45734 0.80772 0.88912 0.6159 0.60225 0.35089 0.64249 0.93755 0.8151 0.61403 0.06731 0.43371 0.64513 0.43271 0.80153 0.15645 0.62919 0.91709 0.35909 0.07749 0.10396 0.8492 0.08487 0.55807 0.60067 0.31194 0.73431 0.5893 0.51041 0.99421 0.82527 0.69644 0.72487 0.13895 0.85533 0.62562 0.3656 0.75837 0.838 0.50009 0.27554 0.72508 0.81677 0.3324 0.25771 0.45198 0.18878 0.78137 0.78469 0.23775 0.47 [...]
+0.3859 0.55728 0.40788 0.82823 0.70884 0.57358 0.07935 0.14776 0.05589 0.57698 0.26081 0.5912 0.00628 0.73224 0.59617 0.78372 0.87852 0.61938 0.75316 0.48589 0.05639 0.87774 0.00978 0.86772 0.40734 0.01804 0.42609 0.02886 0.787 0.62968 0.56387 0.82453 0.74365 0.89004 0.35882 0.02665 0.73693 0.88062 0.44324 0.64292 0.79792 0.01562 0.32428 0.21803 0.08329 0.59065 0.27883 0.97493 0.42641 0.35506 0.10058 0.50423 0.9522 0.97807 0.29856 0.58776 0.48723 0.65524 0.5505 0.9758 0.20336 0.75533 0.7 [...]
+0.32604 0.7148 0.9208 0.24162 0.50832 0.20377 0.22405 0.51382 0.23708 0.74025 0.43449 0.17396 0.83087 0.59994 0.86776 0.41512 0.33342 0.82915 0.20061 0.02471 0.43666 0.59046 0.97999 0.73572 0.34431 0.15833 0.44673 0.05947 0.56416 0.73861 0.41076 0.67471 0.12935 0.02183 0.13337 0.82812 0.33841 0.29183 0.76316 0.683 0.03426 0.57156 0.79309 0.86044 0.14908 0.09374 0.31134 0.87189 0.83409 0.26214 0.09504 0.52801 0.36058 0.01592 0.82281 0.43269 0.59337 0.86219 0.64937 0.67894 0.86976 0.09502  [...]
+0.42437 0.25289 0.54017 0.54525 0.68179 0.69113 0.7473 0.01737 0.13827 0.18336 0.84181 0.29874 0.96523 0.9463 0.44571 0.99995 0.76722 0.1611 0.62486 0.21229 0.67533 0.9728 0.36875 0.93014 0.05273 0.87108 0.36117 0.45972 0.64182 0.75104 0.1045 0.5444 0.36596 0.69246 0.69165 0.27049 0.636 0.94357 0.48084 0.69244 0.25908 0.75598 0.45462 0.54404 0.55214 0.641 0.47861 0.76854 0.28489 0.58704 0.68226 0.08253 0.35587 0.50208 0.78238 0.09587 0.93464 0.1538 0.48271 0.67799 0.66188 0.70131 0.97839 [...]
+0.84671 0.55774 0.42089 0.86225 0.15938 0.61185 0.61085 0.02859 0.31664 0.47014 0.04914 0.78371 0.54949 0.72965 0.416 0.5835 0.01105 0.21962 0.49213 0.15347 0.91897 0.69178 0.54172 0.4197 0.47211 0.77352 0.60459 0.07162 0.17606 0.76443 0.64992 0.28849 0.13737 0.12056 0.2135 0.10392 0.45962 0.65706 0.53479 0.98918 0.39077 0.85581 0.76489 0.5434 0.9925 0.22079 0.92774 0.87979 0.41943 0.31533 0.70986 0.12527 0.85438 0.58455 0.22356 0.6489 0.93091 0.31296 0.87162 0.67703 0.16528 0.76186 0.85 [...]
+0.07789 0.00606 0.55351 0.3384 0.31837 0.79351 0.98368 0.11277 0.88348 0.03551 0.33152 0.67624 0.00011 0.2803 0.70736 0.84836 0.30471 0.45881 0.64842 0.35255 0.03289 0.18315 0.04101 0.55309 0.14538 0.99957 0.77818 0.62337 0.77474 0.217 0.82574 0.87144 0.41614 0.23632 0.83685 0.15217 0.69853 0.99415 0.72405 0.13126 0.89091 0.59445 0.89446 0.33559 0.2052 0.54293 0.34833 0.60958 0.14678 0.66305 0.88795 0.89458 0.27485 0.59277 0.17393 0.81522 0.22358 0.77369 0.92956 0.59474 0.55067 0.47264 0 [...]
+0.73739 0.09997 0.12844 0.42047 0.31171 0.14199 0.32405 0.11766 0.71944 0.88185 0.97293 0.68397 0.0174 0.83711 0.69952 0.8136 0.323 0.61651 0.7892 0.34904 0.09284 0.87991 0.91021 0.55742 0.37932 0.11702 0.23986 0.13598 0.0399 0.18931 0.8928 0.29269 0.83832 0.14087 0.96813 0.06076 0.53086 0.67695 0.2239 0.53367 0.62578 0.2216 0.3912 0.23853 0.46282 0.37771 0.04429 0.51126 0.60539 0.99962 0.28898 0.49657 0.61688 0.87098 0.09706 0.8867 0.87994 0.52276 0.34786 0.11715 0.68981 0.11175 0.83494 [...]
+0.13541 0.50265 0.74125 0.81397 0.37472 0.4246 0.85972 0.59856 0.66466 0.32387 0.15787 0.74364 0.01742 0.00058 0.91171 0.11155 0.74971 0.96393 0.79881 0.2191 0.26909 0.80197 0.40681 0.53973 0.02221 0.37832 0.41491 0.92778 0.48416 0.81076 0.82118 0.28778 0.56054 0.21308 0.92878 0.61027 0.25459 0.58602 0.04714 0.25437 0.71871 0.48015 0.39035 0.5748 0.50508 0.59638 0.32533 0.17957 0.31901 0.5581 0.72702 0.04073 0.49716 0.82452 0.7372 0.34647 0.99872 0.34941 0.54363 0.4857 0.67308 0.15991 0. [...]
+0.96114 0.692 0.57695 0.32655 0.98053 0.92967 0.31942 0.86586 0.79034 0.59102 0.80847 0.80425 0.475 0.00992 0.70749 0.91679 0.4146 0.74856 0.11892 0.57952 0.64874 0.11782 0.03143 0.30817 0.61925 0.50935 0.88455 0.58201 0.23908 0.74017 0.64217 0.08053 0.24253 0.87846 0.00541 0.44282 0.10974 0.16156 0.76636 0.04114 0.81795 0.01925 0.84741 0.75165 0.44106 0.5423 0.15098 0.46156 0.71523 0.70914 0.90001 0.29248 0.60312 0.84582 0.74872 0.04788 0.58069 0.9275 0.19644 0.9644 0.48089 0.14806 0.30 [...]
+0.81776 0.44732 0.55252 0.30129 0.84118 0.38214 0.97511 0.48012 0.07452 0.2078 0.93663 0.7465 0.14422 0.47703 0.17897 0.54757 0.08676 0.32731 0.38924 0.56971 0.20043 0.5828 0.28902 0.49305 0.83343 0.24011 0.65642 0.25515 0.39 0.05727 0.80439 0.79069 0.13639 0.9822 0.23051 0.65423 0.38871 0.24932 0.44214 0.35013 0.15428 0.04304 0.1755 0.65818 0.30746 0.88085 0.03184 0.49467 0.83426 0.59402 0.68027 0.65982 0.67122 0.38174 0.5058 0.5492 0.59955 0.79303 0.88333 0.50197 0.19762 0.48033 0.5613 [...]
+0.78602 0.94929 0.41552 0.83275 0.48075 0.21594 0.73901 0.41358 0.76396 0.54421 0.97645 0.28826 0.75149 0.95924 0.05447 0.5715 0.70528 0.39746 0.67188 0.95611 0.39788 0.6071 0.08481 0.97654 0.55835 0.30779 0.783 0.24577 0.32056 0.54486 0.54522 0.74822 0.85741 0.94153 0.21077 0.97547 0.11055 0.29461 0.13713 0.92831 0.38796 0.28932 0.69793 0.58279 0.39737 0.36076 0.97432 0.01008 0.65501 0.99042 0.12264 0.0144 0.66433 0.98526 0.42435 0.56901 0.34078 0.79891 0.8865 0.52707 0.0066 0.99924 0.5 [...]
+0.18721 0.62777 0.87631 0.99945 0.68559 0.38095 0.23066 0.35127 0.36499 0.10066 0.6226 0.45804 0.74151 0.87416 0.02877 0.38773 0.25928 0.97825 0.5673 0.81833 0.8065 0.25031 0.32467 0.27338 0.42641 0.80733 0.10873 0.92842 0.55496 0.64081 0.0682 0.20656 0.86294 0.12192 0.53908 0.87364 0.74646 0.1634 0.63528 0.80801 0.49284 0.07686 0.76945 0.67989 0.20559 0.11243 0.8735 0.98389 0.85459 0.11977 0.55345 0.71219 0.53303 0.09315 0.09968 0.4525 0.96934 0.70706 0.19421 0.26982 0.45798 0.71104 0.4 [...]
+0.36571 0.47759 0.80729 0.60187 0.48248 0.11255 0.74878 0.27399 0.86033 0.20577 0.26279 0.17537 0.80602 0.9909 0.12696 0.77716 0.97042 0.47142 0.04567 0.01379 0.53147 0.87377 0.81032 0.70047 0.26916 0.96902 0.57634 0.76445 0.20739 0.38028 0.80801 0.41935 0.14279 0.61403 0.7979 0.43922 0.99038 0.16154 0.57493 0.96526 0.34517 0.74946 0.57673 0.71514 0.25517 0.60641 0.04086 0.61717 0.93809 0.84142 0.58091 0.92218 0.13751 0.16724 0.04835 0.16032 0.05949 0.50654 0.26013 0.37194 0.05468 0.2324 [...]
+0.06006 0.64714 0.84436 0.84809 0.74029 0.81643 0.85713 0.05098 0.66174 0.35001 0.61803 0.52119 0.49997 0.35727 0.0924 0.99843 0.71093 0.1363 0.23882 0.94705 0.48051 0.70666 0.21819 0.84303 0.36813 0.0135 0.28814 0.47623 0.36424 0.18746 0.69849 0.19471 0.74856 0.45706 0.59349 0.2665 0.16813 0.6022 0.52127 0.39013 0.30264 0.60042 0.65245 0.76908 0.7439 0.75316 0.86349 0.23681 0.76442 0.10949 0.48661 0.79794 0.69343 0.10917 0.45903 0.87161 0.55543 0.98648 0.27862 0.42282 0.56439 0.13289 0. [...]
+0.62549 0.59671 0.78171 0.27719 0.83743 0.45842 0.06909 0.51608 0.64272 0.9944 0.47393 0.31887 0.35461 0.77013 0.56498 0.36748 0.37981 0.36841 0.49515 0.91029 0.78855 0.34324 0.20318 0.54586 0.38942 0.40743 0.94102 0.94693 0.39792 0.60354 0.2009 0.90673 0.22292 0.82134 0.24809 0.76409 0.65738 0.40834 0.40751 0.9081 0.22765 0.0519 0.48046 0.66316 0.4229 0.94506 0.44187 0.73474 0.19368 0.69396 0.08291 0.13526 0.86987 0.81046 0.89933 0.82164 0.78977 0.27896 0.89975 0.78452 0.06279 0.35306 0 [...]
+0.91331 0.7962 0.53711 0.94627 0.14844 0.93549 0.37156 0.46232 0.64167 0.02754 0.37302 0.21746 0.39367 0.10315 0.79927 0.83415 0.7619 0.82517 0.41697 0.63422 0.09343 0.1839 0.96655 0.63876 0.17868 0.43349 0.78875 0.6297 0.1984 0.99626 0.0678 0.23817 0.05351 0.29668 0.11304 0.60062 0.50278 0.06496 0.48391 0.70611 0.26653 0.86358 0.37177 0.55622 0.22894 0.54951 0.37723 0.97763 0.4648 0.85516 0.76192 0.10716 0.24336 0.65268 0.90788 0.01293 0.23172 0.94718 0.6876 0.02924 0.40594 0.53887 0.49 [...]
+0.42924 0.01572 0.18126 0.78119 0.36599 0.27321 0.95959 0.71429 0.39627 0.18304 0.97154 0.05976 0.82229 0.42628 0.96419 0.03811 0.95528 0.25781 0.76017 0.03556 0.21502 0.33363 0.43879 0.07291 0.82116 0.88562 0.27823 0.51317 0.83985 0.56853 0.22525 0.99072 0.76523 0.67077 0.26409 0.19608 0.13459 0.61177 0.48762 0.05341 0.84407 0.19356 0.66973 0.56592 0.37569 0.88667 0.28328 0.21343 0.23202 0.4204 0.13761 0.16488 0.69632 0.98888 0.91556 0.09574 0.60134 0.83527 0.96934 0.96811 0.86425 0.640 [...]
+0.28788 0.23908 0.86864 0.14523 0.70808 0.73128 0.8122 0.78767 0.99502 0.73991 0.30973 0.96035 0.25325 0.29618 0.84683 0.29502 0.59631 0.09417 0.56332 0.80287 0.88945 0.40029 0.67428 0.91941 0.55438 0.9589 0.57036 0.17335 0.08363 0.95182 0.80217 0.10867 0.3162 0.89483 0.39251 0.57203 0.69671 0.87838 0.19019 0.52758 0.21738 0.10016 0.75205 0.6161 0.09076 0.68439 0.09412 0.40605 0.5807 0.9975 0.89421 0.37725 0.11627 0.04776 0.12681 0.88206 0.23944 0.10872 0.59057 0.38277 0.82427 0.9696 0.9 [...]
+0.42069 0.94062 0.13846 0.09839 0.7844 0.06706 0.03793 0.10995 0.31963 0.15366 0.83842 0.04088 0.814 0.42049 0.91307 0.94807 0.99564 0.51968 0.10599 0.7338 0.88052 0.40141 0.299 0.60134 0.15417 0.69147 0.28176 0.22533 0.62629 0.20422 0.50057 0.32747 0.06652 0.5395 0.78768 0.82117 0.81113 0.47038 0.85873 0.86826 0.1153 0.95336 0.7209 0.33269 0.72934 0.20553 0.73021 0.90654 0.68943 0.65958 0.6339 0.4018 0.17883 0.79885 0.65089 0.57168 0.22243 0.14689 0.90328 0.89335 0.46063 0.78906 0.75647 [...]
+0.21163 0.52939 0.44153 0.97807 0.61568 0.6513 0.78573 0.10173 0.87799 0.71173 0.29017 0.87718 0.89898 0.54559 0.89436 0.50521 0.64538 0.31275 0.54879 0.53217 0.01384 0.91265 0.1228 0.31387 0.1876 0.46235 0.60141 0.34736 0.16673 0.2065 0.29187 0.53454 0.484 0.95307 0.62674 0.06029 0.43023 0.42022 0.01654 0.21386 0.68227 0.33328 0.74451 0.42643 0.47011 0.51183 0.5191 0.86521 0.23986 0.55676 0.13322 0.65374 0.13645 0.41439 0.76959 0.82409 0.63897 0.25088 0.01063 0.27086 0.38955 0.24511 0.9 [...]
+0.15031 0.9681 0.40889 0.043 0.8883 0.40351 0.99669 0.17419 0.16751 0.16099 0.98526 0.78338 0.95975 0.82895 0.38329 0.55923 0.89047 0.92686 0.6253 0.88937 0.47603 0.24777 0.90526 0.40146 0.49785 0.18741 0.2466 0.54637 0.54012 0.48695 0.14188 0.30632 0.80844 0.66339 0.8281 0.68651 0.46344 0.03816 0.57299 0.99094 0.12186 0.96228 0.0684 0.76152 0.32602 0.53952 0.18096 0.89319 0.52983 0.90476 0.82644 0.74448 0.78034 0.75783 0.12563 0.45883 0.70761 0.1536 0.00951 0.70636 0.39996 0.12648 0.871 [...]
+0.99724 0.56911 0.41901 0.52756 0.30502 0.30157 0.60004 0.17917 0.09866 0.60589 0.77885 0.28599 0.89606 0.3119 0.81405 0.60787 0.18131 0.31071 0.10415 0.65917 0.93986 0.43073 0.86191 0.17136 0.37368 0.72581 0.28645 0.43149 0.21064 0.12778 0.1866 0.57583 0.37112 0.11563 0.0486 0.8866 0.26548 0.12867 0.47846 0.10245 0.65478 0.21879 0.80532 0.67915 0.41794 0.88647 0.27168 0.97906 0.03464 0.0377 0.88177 0.16401 0.6204 0.07285 0.37572 0.75719 0.59636 0.47515 0.52058 0.27258 0.55373 0.78601 0. [...]
+0.7593 0.09 0.90777 0.77296 0.64326 0.31757 0.4901 0.3094 0.17875 0.29926 0.90875 0.74249 0.35846 0.36889 0.04972 0.82852 0.22799 0.92533 0.11324 0.92292 0.61844 0.07004 0.3472 0.002 0.37307 0.14356 0.99528 0.09433 0.00368 0.79058 0.3733 0.65883 0.08514 0.4529 0.77456 0.94796 0.62382 0.43325 0.91745 0.38031 0.79689 0.13539 0.96665 0.85662 0.51957 0.61915 0.80513 0.56362 0.46335 0.78346 0.93839 0.33335 0.54334 0.86401 0.34937 0.79369 0.45913 0.13558 0.6603 0.37284 0.92248 0.87325 0.64742  [...]
+0.14432 0.69989 0.29864 0.59372 0.60796 0.14817 0.70061 0.98632 0.76921 0.71117 0.62485 0.16146 0.70226 0.21568 0.3335 0.16592 0.53993 0.7708 0.39733 0.7935 0.17229 0.4574 0.95173 0.04886 0.25149 0.52207 0.37972 0.91724 0.36117 0.49883 0.77888 0.18242 0.32057 0.74622 0.66048 0.34766 0.9819 0.05485 0.56502 0.89037 0.02544 0.76286 0.66118 0.22366 0.47259 0.64223 0.54333 0.16902 0.5198 0.18955 0.34042 0.02032 0.47149 0.84927 0.94491 0.0651 0.32635 0.0366 0.03302 0.85893 0.40395 0.16372 0.29 [...]
+0.46304 0.31686 0.09826 0.70275 0.04396 0.5575 0.91783 0.02152 0.46334 0.3753 0.43439 0.03013 0.35848 0.33084 0.54961 0.51599 0.01386 0.16311 0.22391 0.56088 0.2349 0.41647 0.11377 0.033 0.99861 0.96839 0.17222 0.81671 0.8551 0.58609 0.14281 0.22462 0.73049 0.14448 0.50479 0.54329 0.44302 0.84478 0.71332 0.56343 0.47189 0.88249 0.26968 0.46301 0.79943 0.90398 0.97002 0.10541 0.90782 0.70754 0.96891 0.84895 0.95132 0.74788 0.93859 0.42145 0.94323 0.22521 0.2418 0.78981 0.17527 0.76129 0.8 [...]
+0.84918 0.24946 0.83342 0.75483 0.52398 0.66853 0.15519 0.67392 0.4051 0.56688 0.2243 0.31774 0.38902 0.46438 0.31011 0.77529 0.0597 0.19834 0.86213 0.54171 0.45995 0.06512 0.86877 0.44895 0.67385 0.98699 0.40158 0.34347 0.08364 0.23674 0.88137 0.24516 0.45265 0.67568 0.95389 0.49066 0.27583 0.01526 0.16546 0.03672 0.55279 0.0306 0.187 0.22371 0.24007 0.89507 0.99884 0.16538 0.0211 0.17032 0.35817 0.20862 0.23185 0.23803 0.4919 0.53301 0.29828 0.39943 0.85532 0.37319 0.31096 0.7742 0.167 [...]
+0.72135 0.1776 0.28626 0.24966 0.85797 0.49362 0.03657 0.38073 0.24993 0.67497 0.84981 0.85046 0.15866 0.29893 0.37282 0.68659 0.27997 0.89187 0.72966 0.56996 0.83058 0.18638 0.77833 0.46409 0.12781 0.5832 0.61302 0.21859 0.21729 0.51362 0.23154 0.41352 0.20524 0.36528 0.96093 0.78112 0.55438 0.16747 0.81527 0.66389 0.62344 0.31048 0.64694 0.57685 0.67106 0.49811 0.40731 0.54065 0.59254 0.11632 0.62516 0.6877 0.23966 0.54794 0.88446 0.25325 0.06081 0.97248 0.59577 0.85383 0.16113 0.76799 [...]
+0.14543 0.26278 0.77504 0.31984 0.13105 0.51721 0.76964 0.11196 0.07341 0.14716 0.11066 0.73981 0.9846 0.6949 0.65513 0.21527 0.99917 0.24936 0.83124 0.79287 0.29182 0.56975 0.20623 0.27583 0.31429 0.20105 0.83653 0.86432 0.77578 0.98958 0.10032 0.8451 0.90457 0.36341 0.40849 0.24355 0.28255 0.73967 0.10594 0.62456 0.93252 0.6847 0.31975 0.09156 0.02518 0.73943 0.2001 0.75098 0.38092 0.67548 0.26876 0.06932 0.35347 0.158 0.39604 0.9229 0.82314 0.65695 0.91874 0.99331 0.46664 0.09545 0.38 [...]
+0.5873 0.83249 0.88415 0.33833 0.05286 0.35813 0.32791 0.69462 0.20048 0.84503 0.55646 0.05775 0.96176 0.76573 0.69836 0.37597 0.99617 0.96671 0.67774 0.51425 0.7293 0.45191 0.70118 0.40505 0.97447 0.33743 0.64239 0.63334 0.76473 0.88479 0.87529 0.80806 0.01408 0.38665 0.08229 0.26651 0.94509 0.83547 0.69531 0.1058 0.03005 0.17942 0.80863 0.55477 0.6211 0.91967 0.02789 0.09353 0.37629 0.75016 0.62396 0.20531 0.74348 0.1297 0.0488 0.71021 0.69952 0.42377 0.57086 0.97048 0.13687 0.54415 0. [...]
+0.56778 0.32462 0.43375 0.87968 0.94941 0.17718 0.01229 0.24853 0.07895 0.20152 0.41657 0.73271 0.41259 0.43458 0.04032 0.62517 0.86761 0.26536 0.29744 0.44519 0.91037 0.30023 0.31416 0.3517 0.39032 0.91617 0.04015 0.62347 0.45676 0.75688 0.26094 0.33856 0.71968 0.14434 0.62037 0.26136 0.1528 0.81328 0.84773 0.76827 0.01385 0.42675 0.65025 0.99394 0.86487 0.39516 0.21678 0.87486 0.61932 0.1828 0.35497 0.98002 0.2765 0.66367 0.4237 0.66886 0.64278 0.08222 0.59086 0.33857 0.20733 0.11395 0 [...]
+0.16982 0.49372 0.49919 0.86881 0.29878 0.2948 0.98248 0.46294 0.19913 0.42071 0.61313 0.98954 0.42619 0.38158 0.10041 0.79288 0.95988 0.83634 0.25817 0.01832 0.79418 0.81091 0.55294 0.56185 0.82567 0.99861 0.95246 0.4805 0.86765 0.64437 0.32807 0.17582 0.44185 0.79278 0.53138 0.67101 0.49167 0.92906 0.64333 0.26589 0.00252 0.22355 0.49782 0.86936 0.24831 0.56699 0.37211 0.28212 0.94021 0.29913 0.74149 0.6162 0.23527 0.22521 0.74163 0.86082 0.55601 0.74601 0.95976 0.42042 0.76987 0.28406 [...]
+0.6922 0.12757 0.8193 0.11203 0.91076 0.91761 0.24972 0.04508 0.52542 0.64847 0.18077 0.97957 0.68995 0.71891 0.94981 0.81123 0.83735 0.03163 0.65196 0.92575 0.66901 0.04043 0.93051 0.84475 0.29818 0.7947 0.46689 0.96831 0.17996 0.12766 0.41432 0.4895 0.39863 0.46677 0.19277 0.25494 0.4004 0.56415 0.38118 0.62825 0.03113 0.1204 0.74993 0.29529 0.4295 0.99581 0.06141 0.91115 0.78294 0.96888 0.02631 0.00865 0.98856 0.72838 0.86952 0.09492 0.88683 0.74004 0.38547 0.60524 0.60393 0.79585 0.4 [...]
+0.40271 0.47559 0.4406 0.86939 0.36229 0.70863 0.39996 0.44171 0.8974 0.41074 0.12654 0.51985 0.30274 0.54302 0.7003 0.66041 0.3831 0.07802 0.02384 0.40116 0.92148 0.6791 0.69426 0.56237 0.63338 0.78592 0.1307 0.92788 0.10484 0.40587 0.91199 0.44001 0.03835 0.93804 0.87598 0.69511 0.01262 0.53273 0.44293 0.30118 0.52618 0.53001 0.69579 0.84749 0.68353 0.81063 0.19435 0.40415 0.74285 0.76256 0.13853 0.64615 0.75843 0.90396 0.43533 0.62836 0.41027 0.486 0.66204 0.74381 0.34423 0.81963 0.29 [...]
+0.82396 0.89119 0.34014 0.6194 0.93901 0.88509 0.02166 0.7875 0.32215 0.15732 0.90509 0.99497 0.04218 0.29807 0.95214 0.86078 0.59935 0.07632 0.89421 0.89448 0.60714 0.30214 0.39775 0.99675 0.85505 0.62545 0.25616 0.76789 0.36039 0.50776 0.71929 0.50533 0.1273 0.90315 0.34345 0.13254 0.18714 0.97637 0.03413 0.22446 0.60471 0.96998 0.04843 0.5852 0.45376 0.48622 0.036 0.46874 0.25039 0.04955 0.09727 0.62708 0.0037 0.46376 0.79298 0.39804 0.61598 0.48533 0.14152 0.89875 0.10711 0.50933 0.3 [...]
+0.33897 0.23692 0.48072 0.47127 0.38096 0.16773 0.74521 0.80202 0.70991 0.22772 0.7158 0.46206 0.93301 0.48501 0.82953 0.57205 0.05995 0.22638 0.12457 0.43118 0.81226 0.35961 0.44053 0.02821 0.42001 0.48086 0.68045 0.74511 0.36034 0.60999 0.96212 0.39798 0.60614 0.9111 0.81324 0.80584 0.75603 0.46895 0.95773 0.1289 0.65616 0.87168 0.23114 0.2731 0.10257 0.11921 0.4392 0.38221 0.64755 0.98505 0.50488 0.78698 0.28439 0.7103 0.25144 0.44558 0.21126 0.61776 0.19236 0.02339 0.35706 0.25713 0. [...]
+0.34464 0.24706 0.89843 0.35862 0.19467 0.16406 0.10676 0.73485 0.33294 0.09092 0.16938 0.53302 0.95858 0.3954 0.72597 0.39349 0.29425 0.76807 0.62152 0.32935 0.83492 0.12505 0.77384 0.81247 0.79218 0.102 0.97482 0.78816 0.63601 0.50014 0.5209 0.69915 0.28426 0.72566 0.31126 0.18408 0.17442 0.15913 0.66089 0.33968 0.47559 0.91487 0.62448 0.7828 0.17129 0.99071 0.46249 0.71139 0.44511 0.19849 0.25983 0.38277 0.57478 0.75959 0.83755 0.62109 0.95774 0.25207 0.53864 0.88112 0.14599 0.04127 0 [...]
+0.19104 0.05484 0.78499 0.49502 0.02816 0.07987 0.48414 0.37884 0.98592 0.48593 0.24242 0.6457 0.88387 0.68202 0.31994 0.56271 0.26669 0.86824 0.02286 0.18414 0.3686 0.74584 0.73873 0.1985 0.36905 0.90577 0.25128 0.08662 0.01656 0.77057 0.78841 0.13159 0.31135 0.65999 0.2821 0.45762 0.9503 0.33542 0.20493 0.39652 0.32834 0.93398 0.63024 0.65482 0.57139 0.43764 0.88876 0.0589 0.96846 0.59677 0.99178 0.39812 0.88978 0.17266 0.83681 0.12129 0.05445 0.79538 0.12922 0.90666 0.13163 0.36013 0. [...]
+0.38903 0.85971 0.79063 0.51079 0.34427 0.36123 0.91602 0.89045 0.11531 0.03734 0.71255 0.18458 0.81833 0.39593 0.94472 0.39615 0.56393 0.99795 0.00138 0.37458 0.79016 0.48601 0.86029 0.66688 0.78388 0.27396 0.87004 0.45553 0.38881 0.7497 0.90176 0.79465 0.51585 0.37021 0.13624 0.30472 0.69356 0.85102 0.32664 0.47475 0.39994 0.38324 0.81473 0.12984 0.47392 0.20737 0.7844 0.65011 0.38621 0.17139 0.31314 0.2416 0.56206 0.61431 0.59012 0.15285 0.51547 0.87717 0.1599 0.1472 0.66747 0.55046 0 [...]
+0.1467 0.38309 0.05901 0.30124 0.78806 0.99364 0.37032 0.64534 0.75639 0.87164 0.32012 0.86703 0.94165 0.68628 0.18865 0.88889 0.85473 0.94432 0.76115 0.89518 0.28184 0.14193 0.27084 0.20818 0.47312 0.1752 0.13968 0.53989 0.26248 0.37 0.00806 0.17768 0.58021 0.02396 0.92703 0.77364 0.4633 0.44805 0.40946 0.93714 0.59215 0.83081 0.225 0.94356 0.99167 0.80388 0.36141 0.88454 0.89728 0.638 0.82038 0.68997 0.13258 0.57615 0.98834 0.29941 0.15629 0.00321 0.53491 0.55717 0.61617 0.29611 0.9538 [...]
+0.21123 0.44645 0.36686 0.41566 0.82646 0.19372 0.93013 0.70102 0.99323 0.69571 0.42839 0.72881 0.17689 0.29471 0.32304 0.87914 0.53233 0.4384 0.57531 0.70931 0.73314 0.56551 0.75505 0.12191 0.00065 0.862 0.48363 0.11966 0.98571 0.68824 0.97101 0.41721 0.43089 0.80592 0.99308 0.60696 0.27162 0.63766 0.53044 0.62705 0.74065 0.10226 0.34862 0.55307 0.52994 0.00107 0.26502 0.08748 0.73633 0.57387 0.85982 0.1924 0.57193 0.00539 0.5164 0.447 0.61771 0.52449 0.68353 0.53697 0.10786 0.28029 0.3 [...]
+0.65229 0.35204 0.51981 0.88329 0.66626 0.37912 0.23329 0.49184 0.03253 0.98089 0.04781 0.33895 0.73757 0.61661 0.95124 0.27196 0.88288 0.58312 0.14374 0.47186 0.07083 0.59103 0.35826 0.88988 0.46943 0.49769 0.77347 0.74029 0.23794 0.13341 0.81867 0.15144 0.48152 0.81956 0.90753 0.06692 0.64927 0.66995 0.81606 0.02587 0.86675 0.00831 0.61599 0.88712 0.22229 0.43593 0.18147 0.53399 0.94624 0.85262 0.60039 0.16246 0.23935 0.47475 0.36691 0.08842 0.36657 0.49499 0.03991 0.19712 0.08481 0.05 [...]
+0.8958 0.34433 0.24266 0.69631 0.29011 0.07763 0.8776 0.08178 0.39201 0.5822 0.51835 0.44402 0.05986 0.14843 0.81083 0.10195 0.00401 0.32672 0.22979 0.54839 0.08413 0.39618 0.61511 0.20209 0.1514 0.93967 0.54072 0.21036 0.49132 0.60606 0.45017 0.38693 0.72124 0.71833 0.48556 0.48519 0.49965 0.99966 0.28217 0.21505 0.14988 0.67031 0.18513 0.19321 0.10894 0.26026 0.9772 0.46093 0.37344 0.11122 0.9485 0.25407 0.60495 0.56838 0.40801 0.7857 0.8071 0.68962 0.2396 0.34575 0.40122 0.61585 0.942 [...]
+0.30543 0.00498 0.31165 0.17996 0.90126 0.01489 0.07776 0.15121 0.31187 0.31267 0.86079 0.44323 0.30866 0.61221 0.94742 0.1609 0.95857 0.43895 0.85504 0.20639 0.2015 0.6316 0.16729 0.94334 0.15582 0.8659 0.28865 0.12037 0.33094 0.10159 0.21524 0.86036 0.20923 0.18324 0.32907 0.41809 0.9853 0.22598 0.15145 0.18768 0.5815 0.45199 0.19178 0.91134 0.92305 0.74984 0.56757 0.25962 0.12475 0.3966 0.26038 0.5888 0.6186 0.24545 0.3254 0.98613 0.2831 0.19226 0.48049 0.9632 0.82934 0.64248 0.08533  [...]
+0.06507 0.70332 0.4323 0.64924 0.02786 0.69219 0.99708 0.62379 0.69123 0.62303 0.14084 0.09783 0.82408 0.71671 0.78065 0.37083 0.11715 0.33479 0.43466 0.99436 0.95447 0.14913 0.67426 0.7542 0.23299 0.8091 0.9856 0.17836 0.85449 0.85442 0.25085 0.38038 0.45172 0.80575 0.38616 0.12591 0.34246 0.97059 0.97529 0.03342 0.41081 0.30165 0.01416 0.64878 0.83075 0.10914 0.68153 0.6791 0.57458 0.04997 0.60469 0.38111 0.09568 0.34857 0.01854 0.4628 0.48341 0.39066 0.6398 0.96382 0.31376 0.82561 0.9 [...]
+0.91278 0.63241 0.83871 0.52433 0.578 0.28228 0.80428 0.48611 0.69783 0.92116 0.84506 0.5813 0.65353 0.78589 0.21356 0.22852 0.49665 0.24019 0.33982 0.42333 0.44732 0.19585 0.94297 0.50193 0.45557 0.52007 0.64467 0.57702 0.82702 0.72539 0.10602 0.48531 0.93168 0.51644 0.57484 0.59419 0.08327 0.75665 0.49009 0.14286 0.80783 0.98116 0.60038 0.26084 0.25577 0.79408 0.09575 0.23688 0.54122 0.08619 0.75116 0.76635 0.794 0.50073 0.85623 0.11708 0.80802 0.51513 0.0775 0.74918 0.32383 0.38114 0. [...]
+0.61662 0.59933 0.54529 0.76028 0.66759 0.28027 0.26896 0.01711 0.81399 0.3457 0.96706 0.00357 0.56458 0.81079 0.88927 0.25899 0.3719 0.03367 0.11418 0.33945 0.11474 0.32405 0.85468 0.62075 0.2906 0.54253 0.39652 0.34868 0.9001 0.52842 0.05205 0.05751 0.85899 0.31457 0.22758 0.0279 0.41664 0.58365 0.58721 0.81172 0.15051 0.51651 0.02441 0.68608 0.83798 0.86151 0.11932 0.71157 0.78008 0.00579 0.62403 0.3956 0.80736 0.41276 0.10119 0.73394 0.24063 0.37483 0.24057 0.63324 0.01943 0.58225 0. [...]
+0.25165 0.22768 0.27163 0.46653 0.87683 0.02402 0.8941 0.83444 0.89147 0.04571 0.31705 0.70074 0.07713 0.15412 0.73448 0.15272 0.78631 0.97428 0.81665 0.23012 0.5988 0.52454 0.68614 0.85291 0.73391 0.22249 0.72239 0.80143 0.14966 0.31341 0.79501 0.69464 0.18963 0.1961 0.67403 0.72231 0.37686 0.71913 0.11478 0.02543 0.07662 0.91866 0.8886 0.85518 0.08263 0.21722 0.70503 0.60185 0.53744 0.7649 0.33678 0.93297 0.89766 0.70874 0.3922 0.83618 0.8747 0.03422 0.93895 0.43065 0.86482 0.32542 0.4 [...]
+0.42267 0.09086 0.84102 0.39962 0.92897 0.22646 0.47745 0.62993 0.52792 0.87215 0.20432 0.09603 0.5856 0.06798 0.69083 0.73517 0.03486 0.89098 0.33608 0.89111 0.46041 0.19739 0.61532 0.1611 0.4909 0.38397 0.0822 0.11292 0.8417 0.61995 0.93648 0.53275 0.03518 0.00521 0.06478 0.88496 0.58118 0.32431 0.86733 0.02635 0.7324 0.78358 0.80084 0.85265 0.55209 0.86167 0.78579 0.96866 0.95941 0.79766 0.47434 0.22006 0.69413 0.64036 0.05555 0.84152 0.92547 0.44072 0.32576 0.02691 0.75109 0.28605 0. [...]
+0.19841 0.9733 0.21071 0.24164 0.97136 0.33886 0.9214 0.13266 0.1139 0.12488 0.79107 0.96996 0.60172 0.48858 0.47702 0.10787 0.46511 0.29463 0.05079 0.08251 0.70008 0.17203 0.15909 0.58504 0.97598 0.50901 0.76384 0.4063 0.45062 0.42274 0.89142 0.92393 0.07005 0.66618 0.29838 0.67968 0.97194 0.44401 0.49939 0.5011 0.60568 0.65589 0.01275 0.5006 0.50679 0.48571 0.30699 0.0301 0.16537 0.1212 0.42529 0.56989 0.65898 0.24537 0.78463 0.53773 0.37843 0.57275 0.23035 0.23879 0.58311 0.512 0.0572 [...]
+0.2596 0.06069 0.90133 0.59587 0.93498 0.51458 0.94126 0.63852 0.55775 0.49928 0.72768 0.65635 0.15717 0.77785 0.8802 0.06983 0.73343 0.07406 0.3031 0.49942 0.66983 0.75403 0.60628 0.97744 0.6985 0.73751 0.06084 0.9499 0.0023 0.69492 0.40826 0.8524 0.28971 0.15855 0.29711 0.29855 0.81195 0.47782 0.56228 0.13557 0.13974 0.23612 0.84544 0.6284 0.05504 0.29127 0.51011 0.08232 0.18907 0.63848 0.64151 0.63744 0.31046 0.69403 0.89004 0.15836 0.85281 0.9806 0.66152 0.50674 0.42234 0.20322 0.252 [...]
+0.65201 0.09178 0.62228 0.0429 0.81459 0.57106 0.07297 0.30905 0.7647 0.08471 0.29001 0.04823 0.04533 0.6266 0.42808 0.79652 0.74056 0.24328 0.24849 0.80449 0.31566 0.9508 0.77165 0.78744 0.09235 0.05198 0.30525 0.68491 0.06725 0.84964 0.33252 0.23065 0.40901 0.50441 0.12678 0.45749 0.33056 0.21415 0.21547 0.47555 0.7883 0.79982 0.19452 0.16293 0.95323 0.96318 0.76462 0.80378 0.7347 0.22885 0.12758 0.93851 0.58843 0.10437 0.23703 0.79012 0.7938 0.64958 0.08244 0.74478 0.21708 0.86626 0.7 [...]
+0.73945 0.14637 0.35596 0.32107 0.85179 0.8567 0.02421 0.13763 0.6896 0.45031 0.47831 0.86695 0.00637 0.20132 0.14478 0.56834 0.56943 0.47281 0.70602 0.44239 0.9751 0.12911 0.02695 0.55562 0.03745 0.24186 0.18906 0.58853 0.86915 0.0543 0.60363 0.73469 0.34578 0.19689 0.9726 0.27302 0.63561 0.56514 0.28309 0.65667 0.62222 0.00703 0.79662 0.44749 0.327 0.29919 0.32358 0.92276 0.6098 0.14569 0.1791 0.41179 0.32416 0.36094 0.51059 0.58791 0.68235 0.42243 0.34656 0.90444 0.15341 0.96816 0.395 [...]
+0.23158 0.81419 0.40593 0.20194 0.56118 0.06597 0.70927 0.32548 0.75892 0.00373 0.40175 0.18896 0.51981 0.53728 0.97221 0.61382 0.63003 0.12263 0.17103 0.16754 0.85616 0.51178 0.65553 0.73167 0.04516 0.49677 0.81889 0.76169 0.91836 0.03276 0.31719 0.02337 0.14489 0.03355 0.2094 0.06677 0.84338 0.83803 0.83846 0.5201 0.26393 0.49384 0.13421 0.71669 0.02984 0.66755 0.67362 0.24914 0.1033 0.3473 0.77473 0.73928 0.8486 0.33849 0.53622 0.29039 0.20941 0.35288 0.77953 0.62314 0.34482 0.2038 0. [...]
+0.28067 0.79077 0.14115 0.64964 0.28134 0.57216 0.58947 0.83073 0.55405 0.65101 0.28525 0.27988 0.75777 0.18325 0.37275 0.76987 0.67235 0.13293 0.26284 0.97385 0.30979 0.63101 0.00546 0.7238 0.29333 0.1669 0.08587 0.78681 0.40946 0.29914 0.95561 0.87086 0.68586 0.24352 0.38234 0.22639 0.29534 0.61537 0.98332 0.39313 0.18682 0.89602 0.87413 0.27158 0.03221 0.92194 0.71092 0.31444 0.04561 0.5574 0.0522 0.08319 0.63868 0.77222 0.11405 0.64494 0.30146 0.74304 0.94826 0.64195 0.29967 0.644 0. [...]
+0.74173 0.78398 0.52346 0.06352 0.60671 0.64118 0.14498 0.74207 0.50271 0.38416 0.34158 0.93526 0.84559 0.31768 0.2265 0.48647 0.42297 0.81645 0.24367 0.92959 0.06465 0.9028 0.1912 0.22934 0.73092 0.8166 0.05081 0.74486 0.88121 0.60037 0.8019 0.1553 0.30281 0.27108 0.55948 0.50547 0.10009 0.55363 0.8193 0.5664 0.37001 0.67924 0.08983 0.26978 0.37423 0.50611 0.33881 0.37133 0.54213 0.12871 0.85825 0.64784 0.85016 0.22785 0.1312 0.09199 0.75866 0.56169 0.03679 0.7276 0.74733 0.52481 0.1825 [...]
+0.77024 0.94638 0.76333 0.87685 0.09842 0.58799 0.41065 0.44991 0.46211 0.00823 0.46399 0.75054 0.10124 0.98976 0.47378 0.84073 0.51248 0.28901 0.6141 0.72773 0.5338 0.59161 0.30788 0.68519 0.70554 0.69447 0.70005 0.58945 0.05152 0.91112 0.74554 0.29447 0.55896 0.60994 0.10454 0.22069 0.42795 0.46349 0.22799 0.47796 0.70996 0.2777 0.34053 0.76283 0.79072 0.15138 0.63709 0.05972 0.859 0.10919 0.3059 0.89576 0.82973 0.58164 0.92024 0.51782 0.73589 0.31719 0.63264 0.60915 0.95232 0.29147 0. [...]
+0.186 0.68039 0.84378 0.89842 0.75319 0.71587 0.94377 0.14034 0.66891 0.80546 0.49579 0.44138 0.76019 0.34293 0.72322 0.73688 0.71192 0.18088 0.74538 0.07967 0.51913 0.84175 0.73985 0.07049 0.45443 0.00563 0.09708 0.79969 0.30909 0.64062 0.43318 0.93953 0.39134 0.52384 0.31143 0.79182 0.74511 0.46745 0.52877 0.71778 0.23398 0.93945 0.34509 0.98151 0.60801 0.63777 0.07801 0.8003 0.00812 0.74793 0.49156 0.89925 0.35305 0.04282 0.06535 0.97049 0.15458 0.76779 0.17323 0.6061 0.9564 0.17635 0 [...]
+0.51931 0.43948 0.86327 0.35277 0.47074 0.42117 0.58604 0.02681 0.67691 0.03394 0.2685 0.97783 0.33896 0.11225 0.77295 0.56309 0.06876 0.63733 0.78959 0.10362 0.30258 0.13047 0.10146 0.47802 0.40545 0.01861 0.89848 0.81099 0.78728 0.13272 0.90835 0.88532 0.91732 0.01426 0.91887 0.93072 0.79471 0.11506 0.87655 0.65665 0.20116 0.35602 0.71306 0.02224 0.50474 0.77143 0.52029 0.66029 0.71474 0.60111 0.70132 0.67444 0.36946 0.24851 0.55548 0.83978 0.20388 0.38954 0.11494 0.45801 0.8547 0.4329 [...]
+0.12012 0.33943 0.30117 0.69724 0.45468 0.33814 0.28406 0.10112 0.19077 0.8641 0.28525 0.15541 0.9372 0.12253 0.52403 0.59068 0.76267 0.99486 0.5064 0.53519 0.45216 0.08077 0.89403 0.84972 0.16851 0.97828 0.50787 0.88341 0.0541 0.06358 0.48292 0.72708 0.71565 0.63742 0.78961 0.21035 0.22875 0.26247 0.99442 0.64008 0.02603 0.85868 0.37463 0.58124 0.59538 0.20495 0.21111 0.1637 0.82426 0.40356 0.75907 0.60333 0.39497 0.24762 0.90629 0.01519 0.59865 0.08956 0.35356 0.85865 0.45791 0.57435 0 [...]
+0.29301 0.4408 0.5708 0.14983 0.94861 0.28156 0.72256 0.83612 0.46466 0.23019 0.71784 0.36191 0.60037 0.19997 0.10507 0.95627 0.74849 0.93337 0.7983 0.17002 0.45175 0.99455 0.10998 0.8901 0.25121 0.08034 0.79496 0.06546 0.05639 0.19073 0.3033 0.11946 0.7202 0.83637 0.76963 0.332 0.96352 0.04444 0.54398 0.13867 0.18904 0.96875 0.56056 0.00809 0.10737 0.95257 0.44604 0.38821 0.03802 0.5463 0.98362 0.00715 0.57769 0.21573 0.24966 0.39787 0.93815 0.82632 0.15971 0.20522 0.09322 0.8176 0.2598 [...]
+0.83934 0.55172 0.96894 0.829 0.90303 0.10846 0.31693 0.01108 0.63985 0.60115 0.33671 0.83441 0.68693 0.71208 0.27539 0.67179 0.46119 0.62686 0.8995 0.87406 0.46092 0.76173 0.89462 0.82062 0.17962 0.11078 0.61558 0.2791 0.40528 0.18923 0.43824 0.17734 0.77538 0.86891 0.58966 0.76517 0.31057 0.28483 0.50551 0.53616 0.73759 0.89259 0.38969 0.55767 0.10682 0.92846 0.02589 0.64916 0.38274 0.8673 0.04282 0.33462 0.0069 0.38983 0.19624 0.54557 0.3424 0.43371 0.76326 0.3245 0.78531 0.90105 0.26 [...]
+0.79729 0.76941 0.96483 0.58898 0.41311 0.03121 0.01641 0.29242 0.04037 0.17691 0.69084 0.74587 0.49038 0.92378 0.10021 0.66081 0.59896 0.72267 0.38706 0.09221 0.01831 0.32452 0.1611 0.08339 0.4911 0.32201 0.12075 0.58419 0.57419 0.23126 0.95269 0.0539 0.50237 0.95408 0.57888 0.17981 0.87202 0.42836 0.85731 0.85676 0.86736 0.94987 0.70361 0.60717 0.65346 0.49973 0.94486 0.02973 0.61249 0.23616 0.73572 0.36735 0.82579 0.35763 0.06972 0.75126 0.48575 0.77599 0.32056 0.45895 0.34359 0.44159 [...]
+0.52984 0.54217 0.48208 0.88615 0.95665 0.56248 0.90998 0.29333 0.74168 0.77028 0.00624 0.47998 0.91832 0.66646 0.61253 0.65958 0.39888 0.1123 0.12201 0.28328 0.42557 0.49346 0.06377 0.91448 0.42444 0.99917 0.99814 0.12223 0.50641 0.99907 0.77924 0.61395 0.24721 0.82975 0.12402 0.32695 0.28947 0.03717 0.85734 0.27231 0.45028 0.82611 0.3314 0.02018 0.29471 0.89064 0.08732 0.1142 0.92705 0.66653 0.11562 0.97136 0.84464 0.80052 0.64935 0.21408 0.68362 0.63272 0.14817 0.86939 0.63889 0.96669 [...]
+0.85743 0.88469 0.28352 0.49077 0.80035 0.80725 0.42462 0.52633 0.86742 0.98445 0.46969 0.92481 0.34634 0.41432 0.27007 0.59055 0.38127 0.71613 0.49002 0.56785 0.96493 0.71788 0.99534 0.52875 0.54219 0.11508 0.22838 0.6816 0.73424 0.58977 0.47324 0.6702 0.82823 0.1216 0.19965 0.119 0.46639 0.88506 0.32799 0.90747 0.19426 0.48295 0.22115 0.81487 0.17777 0.54795 0.87334 0.41311 0.77872 0.43941 0.21369 0.35858 0.11769 0.38705 0.22359 0.11874 0.0906 0.30362 0.44122 0.12771 0.21459 0.20088 0. [...]
+0.09352 0.27033 0.11885 0.38419 0.36368 0.55322 0.44742 0.14503 0.85167 0.61102 0.75635 0.38585 0.52005 0.9812 0.38135 0.84124 0.7831 0.7354 0.11248 0.95386 0.47203 0.01572 0.13279 0.76322 0.30759 0.9383 0.83852 0.06591 0.97148 0.86063 0.63826 0.01768 0.81467 0.64021 0.82328 0.08777 0.82605 0.4496 0.01087 0.38099 0.10364 0.84356 0.56486 0.35853 0.10539 0.03678 0.54847 0.62037 0.64134 0.19667 0.61322 0.94813 0.14057 0.15789 0.9588 0.19836 0.40165 0.78355 0.44124 0.24807 0.42628 0.517 0.43 [...]
+0.49628 0.25961 0.84742 0.96916 0.36733 0.83172 0.35273 0.55458 0.48878 0.6131 0.58084 0.53679 0.56581 0.97774 0.50689 0.55804 0.09925 0.37206 0.30594 0.56868 0.2903 0.1344 0.06644 0.44659 0.44906 0.45336 0.15954 0.05568 0.1083 0.12255 0.7433 0.87955 0.10321 0.73206 0.86537 0.3933 0.41885 0.84334 0.09678 0.26914 0.67063 0.55853 0.60951 0.27683 0.8603 0.0233 0.01796 0.33669 0.8149 0.24378 0.85783 0.8991 0.59959 0.08272 0.72028 0.43741 0.49989 0.21417 0.01556 0.38585 0.81855 0.58632 0.8577 [...]
+0.47349 0.8828 0.07038 0.84378 0.03391 0.58868 0.25305 0.50498 0.43322 0.98457 0.89305 0.53151 0.78192 0.78716 0.53492 0.52115 0.84336 0.14424 0.81804 0.08935 0.08724 0.33269 0.89218 0.99553 0.78666 0.31033 0.7941 0.24454 0.48416 0.93221 0.81344 0.25859 0.74524 0.54074 0.67547 0.62423 0.78725 0.69487 0.68784 0.78879 0.22129 0.04618 0.69124 0.01249 0.6178 0.40577 0.09384 0.1146 0.30023 0.99553 0.02469 0.02966 0.71544 0.97964 0.74822 0.898 0.86405 0.3249 0.5395 0.87634 0.1715 0.02122 0.550 [...]
+0.08894 0.83377 0.71445 0.88692 0.73866 0.88509 0.25792 0.60432 0.29501 0.60734 0.43911 0.87153 0.2943 0.36555 0.51936 0.62168 0.6538 0.5469 0.39796 0.0812 0.25902 0.17066 0.16066 0.05856 0.15494 0.97221 0.26396 0.47751 0.41418 0.45674 0.5418 0.55772 0.0903 0.62289 0.15658 0.6624 0.05847 0.48142 0.69197 0.85982 0.80962 0.48183 0.02957 0.50343 0.64062 0.56983 0.9504 0.66544 0.51441 0.77116 0.0992 0.28036 0.6032 0.05509 0.41685 0.85073 0.43825 0.66878 0.07463 0.93399 0.51383 0.25661 0.7420 [...]
+0.87987 0.37853 0.03895 0.52383 0.53367 0.98655 0.81749 0.08556 0.79622 0.2593 0.66936 0.88909 0.14389 0.18303 0.31925 0.38334 0.63371 0.82645 0.24838 0.44553 0.18158 0.08271 0.12211 0.08774 0.87883 0.40775 0.70234 0.91618 0.48509 0.82181 0.50504 0.56325 0.03714 0.7193 0.50453 0.30028 0.97366 0.64942 0.74036 0.92824 0.51803 0.63648 0.47714 0.10022 0.51666 0.18901 0.63739 0.11545 0.33523 0.71366 0.36775 0.40344 0.25273 0.20154 0.92625 0.54131 0.2474 0.75372 0.64159 0.08234 0.82494 0.09479 [...]
+0.03479 0.73156 0.49269 0.17602 0.62727 0.96274 0.67282 0.07381 0.29662 0.46088 0.90761 0.3912 0.57805 0.00454 0.43663 0.57076 0.32242 0.87732 0.0448 0.0608 0.71501 0.078 0.9946 0.97521 0.09486 0.52262 0.17928 0.4009 0.23611 0.84351 0.96552 0.70623 0.16817 0.22848 0.13237 0.91769 0.3626 0.73932 0.57234 0.35941 0.93144 0.85029 0.15722 0.36402 0.36941 0.68756 0.9158 0.85647 0.88713 0.87014 0.60378 0.71879 0.65777 0.03161 0.45475 0.50258 0.49666 0.08615 0.09788 0.67421 0.68332 0.80667 0.806 [...]
+0.09059 0.54011 0.27478 0.93319 0.07509 0.19845 0.0009 0.9902 0.25404 0.70962 0.66272 0.11641 0.46008 0.35899 0.63018 0.56712 0.2188 0.72287 0.24352 0.51856 0.9743 0.52943 0.48142 0.90731 0.3901 0.26545 0.0008 0.67336 0.33736 0.87069 0.22472 0.74947 0.21999 0.1306 0.31023 0.94794 0.98646 0.44908 0.9595 0.22838 0.2761 0.96244 0.18479 0.51069 0.27045 0.39282 0.79091 0.42838 0.17343 0.76842 0.82189 0.40194 0.42266 0.75948 0.16888 0.90375 0.66316 0.03612 0.64987 0.59927 0.80452 0.63766 0.676 [...]
+0.13464 0.80467 0.25233 0.67439 0.47438 0.27759 0.91373 0.92723 0.09414 0.44751 0.56952 0.48344 0.68914 0.51746 0.79186 0.54059 0.40132 0.16624 0.91401 0.71531 0.95231 0.16192 0.66724 0.83369 0.95738 0.54127 0.32898 0.79717 0.01723 0.2483 0.05297 0.40958 0.98893 0.80196 0.37789 0.42855 0.56103 0.21043 0.55898 0.21921 0.80545 0.27782 0.00686 0.98184 0.22178 0.69737 0.15984 0.81538 0.19671 0.01419 0.5532 0.04608 0.99633 0.9283 0.37456 0.30237 0.52353 0.83089 0.98766 0.81766 0.73739 0.24534 [...]
+0.55871 0.34754 0.48691 0.20959 0.11927 0.89632 0.75809 0.16117 0.93364 0.29962 0.95964 0.02939 0.2796 0.28 0.96778 0.27531 0.56186 0.64829 0.28052 0.08448 0.47824 0.29156 0.21222 0.72097 0.48756 0.35065 0.93993 0.23406 0.65958 0.15521 0.61516 0.49595 0.29357 0.53379 0.73093 0.7381 0.00901 0.66325 0.97176 0.46811 0.94415 0.04253 0.62319 0.54845 0.58461 0.96151 0.96909 0.14346 0.6134 0.59562 0.12226 0.00192 0.34932 0.89453 0.28962 0.17832 0.00058 0.24562 0.70548 0.40368 0.7013 0.90903 0.5 [...]
+0.13344 0.98287 0.13414 0.78188 0.15321 0.29413 0.02884 0.42107 0.81327 0.30428 0.6487 0.16815 0.42756 0.41306 0.4452 0.60883 0.02104 0.08367 0.17071 0.65563 0.18456 0.85008 0.59398 0.87289 0.10143 0.44112 0.9599 0.92354 0.03973 0.23552 0.54429 0.39946 0.63253 0.9536 0.8384 0.18186 0.64409 0.22771 0.20601 0.94079 0.15254 0.69494 0.7834 0.70933 0.63082 0.94738 0.41622 0.52837 0.25858 0.89225 0.99913 0.5425 0.62974 0.81614 0.16841 0.16059 0.00965 0.20026 0.14814 0.86028 0.19001 0.57277 0.7 [...]
+0.75393 0.38804 0.27986 0.62523 0.69114 0.32439 0.68156 0.65199 0.21972 0.35349 0.78959 0.1578 0.55025 0.03988 0.3961 0.17712 0.25314 0.95423 0.8502 0.24531 0.23448 0.27034 0.94684 0.12091 0.59735 0.92212 0.15215 0.2729 0.19976 0.47043 0.21841 0.16108 0.18887 0.46928 0.66467 0.18067 0.02403 0.37143 0.48917 0.28485 0.70741 0.51052 0.66144 0.67324 0.37298 0.67981 0.27641 0.35161 0.57837 0.1579 0.92535 0.52433 0.26148 0.83965 0.92964 0.04112 0.45746 0.66122 0.50392 0.72877 0.73973 0.51057 0 [...]
+0.10972 0.0367 0.7549 0.75332 0.96487 0.00055 0.72017 0.82119 0.7435 0.71572 0.77951 0.61231 0.21464 0.35356 0.77663 0.54706 0.0949 0.56564 0.18319 0.16363 0.54141 0.30689 0.01536 0.77225 0.42002 0.54567 0.07322 0.85775 0.75301 0.70607 0.21389 0.3358 0.70246 0.37065 0.02153 0.04842 0.90119 0.41294 0.04438 0.85652 0.11187 0.12747 0.24995 0.8071 0.56278 0.05964 0.08409 0.862 0.72467 0.60193 0.73271 0.0467 0.34744 0.69656 0.73068 0.20585 0.92208 0.03903 0.19508 0.9551 0.5197 0.45891 0.10264 [...]
+0.19921 0.32393 0.1664 0.38967 0.17244 0.41487 0.89046 0.54289 0.65261 0.2884 0.03564 0.66331 0.8909 0.28045 0.62045 0.37418 0.92007 0.17777 0.36269 0.45208 0.98459 0.56578 0.77851 0.31993 0.55422 0.51035 0.51307 0.33112 0.45238 0.33233 0.37505 0.95296 0.92025 0.1096 0.38867 0.77228 0.46268 0.60092 0.40695 0.08591 0.12879 0.39189 0.85632 0.72558 0.92719 0.76077 0.68352 0.21694 0.38994 0.53387 0.05223 0.8617 0.02664 0.65223 0.89559 0.87773 0.24792 0.03621 0.77706 0.05211 0.56402 0.60137 0 [...]
+0.67284 0.6307 0.61011 0.62325 0.93686 0.33074 0.45557 0.1407 0.50318 0.10722 0.51756 0.78808 0.13793 0.34381 0.39047 0.32095 0.50297 0.32483 0.3731 0.23069 0.51064 0.17627 0.32736 0.47855 0.69256 0.94636 0.87244 0.3592 0.66178 0.7288 0.98414 0.36448 0.74394 0.3576 0.29899 0.93198 0.79413 0.02887 0.42228 0.44561 0.16425 0.24308 0.02232 0.68773 0.10759 0.45976 0.77458 0.90292 0.43344 0.67734 0.2873 0.77097 0.90572 0.77487 0.61981 0.93492 0.33314 0.34939 0.36465 0.95005 0.59301 0.79561 0.7 [...]
+0.74591 0.98422 0.46617 0.80775 0.09717 0.13022 0.19672 0.24347 0.76826 0.53835 0.50413 0.52687 0.34783 0.79916 0.83705 0.60599 0.62883 0.41011 0.90478 0.96912 0.57486 0.29957 0.02319 0.39096 0.92489 0.5589 0.82756 0.79733 0.96099 0.3323 0.28718 0.97273 0.45972 0.07911 0.62587 0.42126 0.04045 0.02964 0.27338 0.26874 0.26199 0.85313 0.06167 0.14546 0.9444 0.70062 0.85511 0.53337 0.71402 0.50912 0.58344 0.6347 0.1655 0.77963 0.43375 0.18339 0.80664 0.20304 0.18863 0.71211 0.16833 0.15812 0 [...]
+0.55464 0.56044 0.19747 0.26625 0.27617 0.53336 0.08715 0.3895 0.18969 0.23989 0.35849 0.52609 0.01809 0.42155 0.91093 0.37647 0.40702 0.52839 0.37619 0.02122 0.18428 0.91291 0.33934 0.89195 0.50799 0.60343 0.20318 0.97203 0.40005 0.90673 0.64304 0.87463 0.42716 0.89704 0.47439 0.6911 0.37036 0.03411 0.09439 0.99336 0.13933 0.15933 0.2232 0.63557 0.89016 0.14628 0.99713 0.19746 0.61282 0.84302 0.82433 0.07019 0.01547 0.71737 0.28945 0.63259 0.75077 0.45513 0.44085 0.68985 0.68787 0.47728 [...]
+0.69217 0.61526 0.9908 0.31393 0.30332 0.53585 0.25923 0.85838 0.83584 0.6067 0.78919 0.54114 0.80766 0.63694 0.61449 0.30135 0.45034 0.96043 0.53827 0.89035 0.31555 0.7806 0.86437 0.70559 0.41857 0.67339 0.63833 0.40147 0.94584 0.65748 0.0131 0.89715 0.27511 0.20679 0.81338 0.31157 0.02396 0.30076 0.16006 0.45928 0.36826 0.95314 0.57231 0.57758 0.69528 0.02648 0.28677 0.91792 0.41449 0.92471 0.89002 0.54494 0.70588 0.84879 0.4404 0.92223 0.06035 0.13028 0.05023 0.7297 0.42825 0.70602 0. [...]
+0.22582 0.19085 0.72177 0.73088 0.95939 0.54503 0.20069 0.91119 0.23615 0.49898 0.33219 0.60137 0.05029 0.94376 0.18469 0.80372 0.09516 0.8238 0.78024 0.91873 0.56704 0.86149 0.05598 0.25232 0.21682 0.31102 0.52112 0.78877 0.25454 0.85832 0.20349 0.0572 0.43343 0.00249 0.08037 0.87369 0.73833 0.43929 0.25355 0.76141 0.88258 0.48898 0.73722 0.85178 0.40112 0.1101 0.76397 0.10765 0.57601 0.48835 0.33235 0.7426 0.87958 0.26031 0.45079 0.30693 0.60824 0.44615 0.95081 0.52342 0.58081 0.25284  [...]
+0.60062 0.06364 0.42551 0.06148 0.3015 0.96608 0.7743 0.77851 0.34672 0.63751 0.49952 0.44149 0.49582 0.91069 0.52644 0.30692 0.81128 0.4676 0.36701 0.94936 0.3262 0.5751 0.08164 0.69572 0.27776 0.59541 0.51208 0.85664 0.42955 0.65366 0.86454 0.46069 0.72791 0.52505 0.65961 0.64397 0.07095 0.76073 0.63371 0.88508 0.23983 0.30134 0.08325 0.78768 0.91472 0.07053 0.42644 0.6255 0.8569 0.61792 0.49082 0.36059 0.43397 0.15635 0.93796 0.88877 0.20684 0.83193 0.90106 0.75838 0.16631 0.40667 0.6 [...]
+0.9872 0.79144 0.8592 0.82875 0.59735 0.77854 0.2562 0.86456 0.17043 0.81104 0.21656 0.07723 0.54755 0.74595 0.21709 0.72475 0.97448 0.08549 0.51431 0.07031 0.92492 0.43978 0.55631 0.54099 0.95749 0.84871 0.77992 0.07494 0.36613 0.08692 0.04135 0.26118 0.01603 0.378 0.75041 0.02096 0.7667 0.68708 0.54354 0.2651 0.22196 0.49356 0.86115 0.94196 0.67141 0.47989 0.42918 0.29202 0.68189 0.73719 0.54869 0.26725 0.29187 0.99262 0.92251 0.17621 0.02101 0.3803 0.90542 0.75165 0.157 0.2144 0.59816 [...]
+0.95979 0.43349 0.98566 0.98431 0.78154 0.15846 0.94384 0.60507 0.21788 0.87553 0.75864 0.60812 0.04421 0.3124 0.56133 0.56078 0.16698 0.48025 0.5233 0.87505 0.20839 0.14194 0.39851 0.2027 0.5798 0.06314 0.61723 0.23864 0.43942 0.95128 0.35768 0.88893 0.40372 0.69019 0.18182 0.05988 0.73846 0.91317 0.25272 0.85554 0.37624 0.12234 0.8587 0.81299 0.1854 0.18614 0.85776 0.81228 0.36838 0.07313 0.54362 0.35362 0.05567 0.25897 0.71674 0.26018 0.91168 0.86349 0.25186 0.63059 0.95255 0.32287 0. [...]
+0.22954 0.1195 0.7653 0.77356 0.61583 0.09027 0.68725 0.23233 0.59686 0.49146 0.32337 0.69371 0.45822 0.95966 0.25222 0.69599 0.104 0.28156 0.01992 0.3042 0.39781 0.20232 0.99384 0.66186 0.88046 0.74426 0.02351 0.04995 0.41411 0.73469 0.39949 0.24558 0.28127 0.33131 0.98644 0.69257 0.23739 0.7657 0.2776 0.66142 0.27488 0.445 0.07628 0.50737 0.45177 0.13556 0.08189 0.94861 0.58673 0.97681 0.89038 0.17588 0.6222 0.17457 0.72808 0.97688 0.6995 0.30755 0.24482 0.7603 0.33351 0.95534 0.3386 0 [...]
+0.25792 0.63826 0.98209 0.60702 0.98826 0.62679 0.91484 0.8502 0.43131 0.47974 0.49845 0.93198 0.51849 0.1094 0.15859 0.31649 0.99271 0.89159 0.36099 0.72143 0.27065 0.44502 0.18599 0.5428 0.58977 0.39794 0.12763 0.85842 0.06345 0.06648 0.75583 0.75487 0.88379 0.81342 0.46645 0.47308 0.29186 0.62338 0.11185 0.94994 0.02827 0.98491 0.23027 0.05258 0.91189 0.37229 0.60019 0.01756 0.95932 0.78172 0.86501 0.80909 0.92248 0.53612 0.70888 0.92957 0.57551 0.99049 0.37033 0.85768 0.93287 0.95663 [...]
+0.36856 0.74475 0.73588 0.13012 0.65088 0.77785 0.03716 0.59085 0.35307 0.70067 0.80631 0.63147 0.97122 0.22004 0.52029 0.20383 0.37281 0.9851 0.8994 0.20382 0.82971 0.21964 0.5328 0.47772 0.15536 0.90857 0.99268 0.45612 0.77909 0.24579 0.22835 0.25613 0.10612 0.84631 0.3848 0.06455 0.98128 0.54474 0.08329 0.48466 0.80509 0.65699 0.26181 0.58458 0.77257 0.14493 0.22355 0.37618 0.81683 0.25189 0.12178 0.97236 0.17138 0.40559 0.11647 0.53599 0.6075 0.02983 0.14032 0.87442 0.98679 0.4168 0. [...]
+0.0816 0.51598 0.3439 0.92918 0.64222 0.47483 0.20118 0.37331 0.27331 0.99062 0.76113 0.58579 0.15527 0.8719 0.37396 0.47953 0.33148 0.71362 0.45893 0.60228 0.21117 0.07718 0.92887 0.48244 0.33186 0.94214 0.0166 0.59272 0.68849 0.4761 0.89884 0.54659 0.96371 0.49725 0.41727 0.01978 0.33452 0.26684 0.33137 0.05194 0.87871 0.90731 0.65863 0.96831 0.19486 0.8802 0.62565 0.92342 0.13387 0.12243 0.40475 0.93727 0.45369 0.31179 0.7096 0.93259 0.53144 0.27213 0.84614 0.17314 0.90423 0.31768 0.9 [...]
+0.45601 0.90149 0.69099 0.66293 0.87432 0.92359 0.69611 0.30422 0.30257 0.15688 0.05714 0.93996 0.35674 0.6929 0.7221 0.25993 0.0177 0.54728 0.9344 0.57836 0.14804 0.3641 0.34029 0.59104 0.2955 0.67969 0.20257 0.45941 0.20403 0.36674 0.62408 0.93736 0.48197 0.94526 0.61778 0.43018 0.85336 0.48317 0.78765 0.62514 0.04129 0.35003 0.60558 0.58441 0.02136 0.11546 0.29022 0.57249 0.23991 0.76092 0.71141 0.67934 0.4676 0.13383 0.75469 0.58566 0.56144 0.02749 0.82041 0.77954 0.33864 0.8746 0.74 [...]
+0.99372 0.78927 0.181 0.12688 0.53083 0.68184 0.98729 0.72357 0.53989 0.89021 0.16231 0.91252 0.18115 0.68174 0.78516 0.76526 0.55763 0.02967 0.52767 0.71871 0.0741 0.85223 0.52358 0.06648 0.62002 0.61397 0.78834 0.76117 0.96986 0.69534 0.00157 0.49748 0.2964 0.27311 0.67536 0.39612 0.9728 0.7597 0.07737 0.96324 0.87415 0.45161 0.44088 0.79107 0.99834 0.13157 0.51701 0.11128 0.02857 0.10322 0.58894 0.85791 0.03713 0.11842 0.98712 0.04614 0.37751 0.86379 0.05325 0.04716 0.05944 0.24672 0. [...]
+0.5315 0.14601 0.78499 0.90311 0.87004 0.4396 0.0897 0.97212 0.80671 0.92948 0.5306 0.33305 0.36891 0.38962 0.85294 0.78887 0.49948 0.38472 0.3204 0.73219 0.22784 0.71317 0.30336 0.60312 0.9898 0.16275 0.15205 0.88267 0.63715 0.14706 0.40063 0.73366 0.28887 0.47667 0.82649 0.10949 0.82263 0.43481 0.43872 0.54548 0.21189 0.71623 0.08802 0.77379 0.7248 0.98894 0.37611 0.69801 0.53736 0.89 0.26173 0.68876 0.96971 0.28296 0.7566 0.85876 0.43484 0.95107 0.57873 0.04467 0.6893 0.26331 0.25303  [...]
+0.13021 0.15357 0.79185 0.81851 0.45168 0.49025 0.67434 0.99851 0.54252 0.13697 0.2792 0.67407 0.70041 0.40186 0.73875 0.45282 0.00237 0.95952 0.32164 0.03816 0.01364 0.47123 0.54098 0.86259 0.66031 0.02534 0.51478 0.17902 0.18421 0.64131 0.42731 0.09452 0.29231 0.15162 0.4127 0.75084 0.85038 0.72825 0.06087 0.45138 0.94462 0.75411 0.33603 0.6479 0.69858 0.29637 0.96584 0.54667 0.9378 0.33573 0.22147 0.96483 0.48562 0.49076 0.14601 0.70068 0.77867 0.56101 0.99148 0.88275 0.94598 0.82692  [...]
+0.51747 0.38597 0.3968 0.25847 0.84426 0.59117 0.64493 0.37322 0.00121 0.13006 0.55607 0.12391 0.9569 0.94648 0.36403 0.08085 0.12662 0.06749 0.53228 0.46884 0.36201 0.25018 0.64618 0.36497 0.13925 0.95668 0.85869 0.31187 0.45388 0.02809 0.38294 0.96423 0.07713 0.00928 0.65227 0.86895 0.73201 0.21735 0.51591 0.09843 0.87012 0.45532 0.04199 0.9158 0.84022 0.12753 0.60606 0.57302 0.62379 0.13479 0.23977 0.66425 0.89074 0.8185 0.63759 0.24781 0.95375 0.36297 0.80342 0.70806 0.78825 0.51035  [...]
+0.02103 0.13003 0.77571 0.42523 0.04562 0.20127 0.51338 0.81343 0.86596 0.77343 0.81822 0.02282 0.30953 0.36555 0.64208 0.60174 0.94641 0.14207 0.94553 0.41129 0.52649 0.04366 0.98303 0.00048 0.88943 0.19516 0.86608 0.69968 0.66832 0.64253 0.6907 0.98617 0.85895 0.92421 0.48326 0.4045 0.36047 0.4071 0.84385 0.15833 0.72087 0.19808 0.70295 0.28876 0.1922 0.93251 0.87386 0.78475 0.64997 0.52041 0.95405 0.93732 0.24181 0.35982 0.27281 0.31277 0.90163 0.33816 0.37563 0.99539 0.50792 0.3262 0 [...]
+0.85592 0.56375 0.10082 0.32561 0.63742 0.06369 0.5291 0.83967 0.0971 0.88247 0.96091 0.39206 0.25904 0.10958 0.78774 0.00326 0.92069 0.36186 0.10318 0.19837 0.34141 0.87198 0.39275 0.1701 0.56143 0.47824 0.11784 0.83457 0.11474 0.90729 0.48233 0.10113 0.59506 0.05195 0.33693 0.41317 0.15715 0.06518 0.03357 0.89237 0.64599 0.81733 0.43339 0.91818 0.28533 0.79026 0.8008 0.78604 0.24492 0.54528 0.69241 0.37676 0.13224 0.54286 0.06705 0.27844 0.53564 0.41297 0.27883 0.23217 0.3609 0.82647 0 [...]
+0.76312 0.47628 0.34262 0.01809 0.83321 0.08123 0.21934 0.1377 0.36847 0.8048 0.4445 0.17869 0.7837 0.36777 0.38338 0.67032 0.52907 0.01831 0.31703 0.97513 0.3472 0.90252 0.01994 0.55352 0.49743 0.89105 0.42496 0.47314 0.06254 0.55979 0.42602 0.30294 0.04304 0.03686 0.34478 0.71703 0.24625 0.13617 0.85445 0.17963 0.77652 0.70537 0.39833 0.26498 0.46166 0.25524 0.81115 0.47747 0.53267 0.11448 0.13482 0.72704 0.58603 0.18644 0.07815 0.44993 0.70402 0.63372 0.84286 0.42661 0.38727 0.60612 0 [...]
+0.44622 0.30095 0.39444 0.68273 0.76093 0.41319 0.19971 0.02709 0.68576 0.33537 0.5817 0.81382 0.06909 0.82501 0.15901 0.12825 0.81774 0.39806 0.89047 0.72752 0.03326 0.10497 0.01778 0.90926 0.23813 0.26142 0.7792 0.08926 0.49311 0.81588 0.91203 0.26758 0.68478 0.02311 0.73103 0.31748 0.26853 0.24317 0.16928 0.08346 0.10602 0.87219 0.77437 0.70509 0.40969 0.6329 0.8793 0.39744 0.18471 0.3091 0.6106 0.5416 0.30077 0.29784 0.19656 0.5588 0.48019 0.68497 0.05951 0.12015 0.30069 0.14985 0.76 [...]
+0.22752 0.83308 0.98557 0.54648 0.84328 0.96821 0.11247 0.85854 0.82602 0.39651 0.86161 0.35072 0.00564 0.6272 0.83479 0.64248 0.42922 0.19029 0.12885 0.962 0.46054 0.64776 0.30404 0.81553 0.40923 0.01 0.42036 0.48639 0.12826 0.57083 0.57328 0.87896 0.99529 0.28405 0.41615 0.80329 0.17856 0.24088 0.59058 0.15999 0.37789 0.91726 0.55708 0.66447 0.97396 0.42374 0.0573 0.33819 0.72386 0.68764 0.76692 0.30608 0.04228 0.46074 0.99495 0.37176 0.94357 0.60633 0.43792 0.85922 0.10583 0.18469 0.1 [...]
+0.32325 0.62023 0.24672 0.57683 0.30472 0.71286 0.37618 0.13635 0.45476 0.31015 0.96302 0.59064 0.09185 0.50633 0.37182 0.94059 0.20073 0.66493 0.87472 0.7594 0.65032 0.58052 0.36299 0.25195 0.04161 0.29482 0.02816 0.84183 0.42821 0.4573 0.51601 0.71858 0.66213 0.97807 0.49876 0.71962 0.99411 0.25973 0.15112 0.87534 0.44087 0.37917 0.22901 0.80202 0.73085 0.17031 0.08257 0.14157 0.52342 0.77368 0.1636 0.06984 0.41708 0.03095 0.49174 0.10071 0.99659 0.75417 0.41239 0.26007 0.78428 0.365 0 [...]
+0.39611 0.45889 0.91861 0.63559 0.59992 0.43606 0.42939 0.68709 0.77728 0.99206 0.85592 0.8499 0.88823 0.75028 0.80865 0.13519 0.30862 0.647 0.45579 0.93526 0.41044 0.4434 0.05507 0.99539 0.53144 0.02448 0.60916 0.72226 0.43503 0.32489 0.97005 0.10168 0.11395 0.10265 0.06784 0.93188 0.21363 0.02232 0.22695 0.53861 0.97083 0.54896 0.97041 0.28609 0.29249 0.24268 0.70972 0.77151 0.95706 0.33906 0.64266 0.75848 0.51098 0.97837 0.95697 0.48037 0.03932 0.96392 0.13943 0.71976 0.15347 0.32086  [...]
+0.9903 0.57981 0.59589 0.14258 0.74175 0.54308 0.7137 0.88634 0.16678 0.47485 0.57647 0.85672 0.24158 0.32704 0.65029 0.92766 0.97462 0.48351 0.38176 0.3762 0.40649 0.23711 0.08962 0.89683 0.22121 0.80482 0.51454 0.03619 0.15226 0.1246 0.65166 0.24367 0.59154 0.26073 0.08225 0.08872 0.58615 0.08686 0.99203 0.31843 0.27643 0.49491 0.56025 0.80305 0.61095 0.61934 0.98123 0.57563 0.79713 0.85778 0.07184 0.63764 0.55737 0.04082 0.24345 0.63737 0.90549 0.52843 0.66215 0.47606 0.58141 0.84784  [...]
+0.94991 0.86202 0.69689 0.95173 0.40511 0.33906 0.74459 0.13467 0.39892 0.29782 0.30443 0.23341 0.17128 0.30928 0.03548 0.03394 0.93163 0.36278 0.05813 0.38706 0.39321 0.25065 0.7608 0.88937 0.33015 0.06648 0.45489 0.53945 0.69726 0.41993 0.77908 0.77716 0.17753 0.81503 0.65645 0.41274 0.08245 0.08796 0.44547 0.95681 0.2161 0.06478 0.31742 0.13964 0.30831 0.28187 0.04806 0.5727 0.35032 0.33664 0.93674 0.94958 0.31231 0.99596 0.57181 0.13522 0.1602 0.59939 0.55323 0.82073 0.38283 0.81133  [...]
+0.97749 0.7896 0.68034 0.20741 0.21756 0.98443 0.32327 0.58069 0.06017 0.99776 0.77138 0.97979 0.79485 0.22767 0.82394 0.35754 0.37346 0.00919 0.80163 0.62715 0.75956 0.68631 0.24818 0.281 0.60339 0.46562 0.15241 0.99614 0.28538 0.92859 0.38077 0.35824 0.96839 0.18945 0.06802 0.98746 0.16909 0.23215 0.29514 0.8069 0.3574 0.46613 0.76392 0.24689 0.46493 0.71591 0.88835 0.54349 0.3377 0.21071 0.65917 0.71131 0.14986 0.9773 0.69831 0.71852 0.80359 0.56297 0.36259 0.36771 0.59892 0.11934 0.8 [...]
+0.95296 0.92314 0.04387 0.05947 0.76642 0.19517 0.4005 0.07132 0.87913 0.96353 0.87779 0.62908 0.16056 0.24128 0.18503 0.12437 0.65745 0.77918 0.84013 0.07318 0.6277 0.85432 0.82947 0.03364 0.25822 0.21619 0.87156 0.88709 0.20272 0.71484 0.29366 0.47196 0.04451 0.18811 0.24582 0.93388 0.63248 0.41549 0.7362 0.97885 0.24753 0.7642 0.30178 0.82244 0.44271 0.45456 0.13539 0.64312 0.49115 0.82552 0.69161 0.4729 0.48692 0.16115 0.77827 0.52375 0.0174 0.14102 0.10773 0.49742 0.43856 0.03259 0. [...]
+0.64215 0.89459 0.70114 0.87902 0.50023 0.11344 0.56148 0.04335 0.56341 0.23904 0.31026 0.8124 0.73194 0.14069 0.83274 0.38429 0.18298 0.16743 0.87129 0.91558 0.60349 0.23044 0.98718 0.09845 0.75608 0.80171 0.9231 0.5025 0.99089 0.8449 0.49845 0.98485 0.18507 0.15701 0.45596 0.60734 0.35174 0.62295 0.37927 0.21494 0.61081 0.9762 0.63527 0.81228 0.66983 0.61572 0.33991 0.6339 0.18991 0.49455 0.25245 0.55179 0.47934 0.34919 0.46981 0.00433 0.59124 0.20375 0.87673 0.48572 0.90758 0.68313 0. [...]
+0.56797 0.34008 0.76318 0.87249 0.91532 0.42723 0.90186 0.87588 0.84222 0.84521 0.12582 0.73161 0.5905 0.49551 0.81489 0.94743 0.01777 0.71707 0.04483 0.80608 0.38214 0.8463 0.38216 0.89591 0.21008 0.39655 0.34869 0.77636 0.16462 0.29816 0.06449 0.2933 0.15063 0.48386 0.96143 0.21688 0.8832 0.50609 0.18075 0.62273 0.25095 0.01307 0.90531 0.19092 0.6182 0.92389 0.78663 0.87964 0.88268 0.15538 0.63745 0.03883 0.42736 0.19611 0.80852 0.87132 0.10545 0.25718 0.23363 0.77869 0.13359 0.95947 0 [...]
+0.82335 0.39915 0.84088 0.05043 0.264 0.62593 0.32861 0.61616 0.14846 0.0452 0.51747 0.3448 0.68958 0.85471 0.6564 0.32927 0.61853 0.06053 0.02699 0.66084 0.41317 0.3049 0.51782 0.73907 0.97688 0.08116 0.67995 0.75035 0.08633 0.93004 0.23879 0.08484 0.84898 0.7974 0.09554 0.75232 0.02131 0.58273 0.02239 0.08654 0.67163 0.11921 0.87298 0.22971 0.52136 0.68072 0.57738 0.14716 0.908 0.01997 0.61892 0.68659 0.12608 0.75962 0.75124 0.18776 0.97491 0.55686 0.10086 0.83211 0.2031 0.0786 0.89278 [...]
+0.1888 0.41419 0.80735 0.57817 0.60759 0.8942 0.50269 0.92289 0.82404 0.04151 0.36155 0.27443 0.42027 0.32291 0.59778 0.81657 0.91919 0.3606 0.64515 0.26573 0.5173 0.63252 0.34944 0.83247 0.57844 0.41192 0.00067 0.61362 0.78671 0.65134 0.50409 0.14719 0.43419 0.47895 0.01534 0.6863 0.95529 0.73202 0.23234 0.33261 0.73241 0.11973 0.09261 0.6888 0.49128 0.96703 0.27317 0.51375 0.74779 0.34739 0.68916 0.37474 0.95428 0.13018 0.37302 0.7053 0.67797 0.75174 0.99212 0.70386 0.95554 0.31312 0.4 [...]
+0.30538 0.64892 0.55833 0.58024 0.20477 0.51176 0.15323 0.77759 0.03773 0.98415 0.77618 0.76256 0.24923 0.15598 0.30552 0.66625 0.36837 0.87623 0.05685 0.87054 0.60891 0.87649 0.56508 0.42781 0.4383 0.33924 0.39448 0.55714 0.79631 0.46769 0.36354 0.61307 0.1184 0.37344 0.94801 0.54194 0.55329 0.36659 0.94925 0.39357 0.84657 0.75396 0.50189 0.55909 0.01827 0.95623 0.07833 0.57454 0.4984 0.38471 0.63651 0.31261 0.37554 0.54717 0.88655 0.23199 0.1165 0.5558 0.22648 0.90692 0.92087 0.7049 0. [...]
+0.93031 0.03443 0.33367 0.73198 0.24101 0.7187 0.54357 0.17208 0.40945 0.00817 0.14221 0.23179 0.15309 0.13659 0.70686 0.34366 0.32708 0.69066 0.78483 0.4241 0.05468 0.15056 0.54877 0.36255 0.54083 0.37216 0.75717 0.7618 0.59877 0.16521 0.75643 0.64194 0.08916 0.53614 0.11965 0.36878 0.89044 0.11517 0.81374 0.77883 0.98283 0.49385 0.87258 0.19857 0.22909 0.69636 0.9876 0.3586 0.0199 0.48605 0.74851 0.86655 0.83684 0.20949 0.34562 0.83028 0.58221 0.50961 0.8039 0.15202 0.21142 0.984 0.948 [...]
+0.84906 0.88999 0.09001 0.28402 0.64234 0.938 0.21021 0.31342 0.81109 0.8694 0.02766 0.23936 0.42326 0.90316 0.72122 0.0272 0.43663 0.57437 0.50302 0.86681 0.77622 0.7086 0.81004 0.23018 0.17284 0.70414 0.2705 0.72264 0.19645 0.69369 0.47582 0.17223 0.00389 0.07631 0.99451 0.10669 0.06909 0.20756 0.10022 0.54859 0.0958 0.77851 0.24793 0.48217 0.33185 0.6051 0.83237 0.12976 0.80405 0.32571 0.59549 0.29181 0.06971 0.68084 0.80486 0.16669 0.69625 0.61094 0.30763 0.2235 0.62141 0.8892 0.501  [...]
+0.56722 0.59813 0.27258 0.75032 0.61748 0.31027 0.02958 0.66817 0.88137 0.58901 0.66058 0.67224 0.52482 0.75042 0.51346 0.19812 0.0069 0.04114 0.96183 0.00705 0.05146 0.41092 0.61314 0.13259 0.64721 0.94098 0.78658 0.6382 0.80264 0.44794 0.47094 0.60939 0.28772 0.9132 0.04194 0.15276 0.69368 0.02055 0.96475 0.75987 0.81749 0.74155 0.81143 0.05167 0.68028 0.55114 0.69367 0.45554 0.59931 0.51126 0.0678 0.47883 0.52588 0.8392 0.24668 0.90323 0.66985 0.09373 0.30794 0.02973 0.34418 0.59438 0 [...]
+0.10379 0.80273 0.64092 0.41845 0.11198 0.73804 0.81123 0.08729 0.5703 0.60187 0.58149 0.56724 0.04151 0.43657 0.98506 0.51869 0.13746 0.9147 0.67117 0.22143 0.6189 0.29163 0.93229 0.92473 0.86892 0.7572 0.37487 0.37358 0.95252 0.81204 0.592 0.82079 0.72749 0.31085 0.5532 0.4461 0.22491 0.3538 0.51824 0.05232 0.46024 0.91668 0.764 0.84751 0.45383 0.98879 0.99694 0.63554 0.3065 0.62592 0.49616 0.86312 0.03961 0.28876 0.33454 0.50645 0.22566 0.78514 0.51138 0.12009 0.94122 0.06142 0.304 0. [...]
+0.34289 0.06981 0.10597 0.76354 0.46138 0.65954 0.45158 0.63167 0.01696 0.47606 0.38098 0.92762 0.71698 0.52507 0.54318 0.94722 0.10642 0.12975 0.0639 0.118 0.09142 0.49003 0.52089 0.0047 0.13019 0.32076 0.77266 0.4242 0.67189 0.13577 0.81311 0.35026 0.80947 0.8525 0.00037 0.01008 0.99202 0.87685 0.14187 0.62184 0.88358 0.24598 0.65791 0.19015 0.33434 0.18243 0.12683 0.8086 0.38538 0.82031 0.02767 0.50123 0.32822 0.18602 0.13724 0.99258 0.51881 0.64615 0.53406 0.9593 0.1709 0.76042 0.292 [...]
+0.17717 0.26267 0.95851 0.01158 0.85817 0.39393 0.87696 0.13554 0.16605 0.95316 0.0683 0.38004 0.38887 0.6644 0.45102 0.79403 0.49133 0.69952 0.78372 0.45109 0.44397 0.10273 0.42704 0.53337 0.07723 0.95513 0.04386 0.56544 0.61775 0.68072 0.96989 0.09037 0.879 0.72106 0.41251 0.13653 0.36119 0.24646 0.27841 0.22799 0.26702 0.83759 0.62604 0.17695 0.48964 0.53965 0.73804 0.77576 0.35551 0.77352 0.75451 0.07831 0.765 0.04981 0.83465 0.83264 0.04422 0.80086 0.17854 0.83955 0.87635 0.93202 0. [...]
+0.69691 0.74152 0.80677 0.27071 0.44958 0.81098 0.26548 0.3841 0.37484 0.44523 0.30451 0.37498 0.22098 0.75349 0.30799 0.01644 0.81271 0.10411 0.69448 0.6682 0.38921 0.52464 0.88074 0.72837 0.06382 0.73546 0.85957 0.86124 0.48689 0.97671 0.25031 0.27322 0.23102 0.71427 0.45465 0.39514 0.64041 0.4982 0.0446 0.56759 0.1629 0.95192 0.42943 0.53088 0.16888 0.09142 0.00585 0.10248 0.99077 0.62062 0.33034 0.66176 0.46674 0.24191 0.33293 0.98686 0.35401 0.47837 0.575 0.91242 0.30441 0.30946 0.2 [...]
+0.21548 0.14947 0.96533 0.28348 0.7198 0.22343 0.40917 0.3433 0.04113 0.99969 0.27868 0.3347 0.91589 0.16118 0.20092 0.83649 0.6339 0.00074 0.67839 0.67801 0.15674 0.07221 0.16628 0.23978 0.83737 0.66654 0.69089 0.02729 0.80947 0.09448 0.83574 0.77983 0.30326 0.25708 0.12077 0.64887 0.85448 0.47097 0.179 0.84402 0.42267 0.00318 0.06918 0.67238 0.11072 0.78341 0.60363 0.15275 0.49624 0.01657 0.08297 0.02014 0.85101 0.69195 0.04738 0.62517 0.13261 0.25381 0.15083 0.02084 0.00732 0.13356 0. [...]
+0.46457 0.05199 0.67981 0.91728 0.09088 0.78168 0.59726 0.4175 0.03792 0.99482 0.39588 0.81023 0.90244 0.46995 0.78951 0.58626 0.83137 0.43032 0.95347 0.47752 0.8287 0.64575 0.12985 0.51379 0.79757 0.61755 0.59667 0.73649 0.85484 0.30649 0.49034 0.96338 0.9122 0.17586 0.54618 0.98433 0.90174 0.75284 0.68677 0.38685 0.31672 0.87195 0.71651 0.79065 0.77224 0.01877 0.81445 0.00454 0.27054 0.03608 0.51944 0.97408 0.45627 0.90256 0.0136 0.81011 0.29564 0.1383 0.60786 0.06884 0.98381 0.16158 0 [...]
+0.0352 0.73907 0.07832 0.12603 0.70045 0.19086 0.41544 0.43122 0.28806 0.90188 0.10849 0.90839 0.61039 0.6868 0.9569 0.9604 0.21371 0.17163 0.42695 0.02797 0.35825 0.13757 0.91286 0.18043 0.06611 0.74589 0.62299 0.78444 0.73016 0.08263 0.6631 0.44479 0.86018 0.32322 0.03575 0.11759 0.54713 0.99118 0.069 0.95933 0.57042 0.50184 0.70676 0.06241 0.70958 0.08527 0.38057 0.12963 0.46354 0.26146 0.83491 0.9953 0.05208 0.34648 0.63346 0.99905 0.14729 0.21381 0.33809 0.97506 0.96427 0.8259 0.446 [...]
+0.07032 0.25694 0.6019 0.69384 0.02889 0.14617 0.15022 0.47003 0.09851 0.06602 0.26838 0.40584 0.90371 0.89788 0.99547 0.94171 0.84704 0.7964 0.00185 0.6263 0.65887 0.3534 0.30988 0.27481 0.5531 0.80839 0.36079 0.42858 0.83002 0.65305 0.5613 0.32256 0.40243 0.92073 0.50341 0.74069 0.4507 0.45434 0.91198 0.81707 0.41087 0.527 0.82058 0.0483 0.39304 0.63468 0.07101 0.83498 0.28372 0.9593 0.60617 0.5706 0.74169 0.34655 0.41719 0.06424 0.12684 0.69721 0.76324 0.85872 0.28866 0.9649 0.21659 0 [...]
+0.83808 0.98897 0.01819 0.81421 0.54908 0.96802 0.31467 0.64347 0.1963 0.89267 0.78791 0.6942 0.22131 0.04503 0.74652 0.76846 0.58673 0.54925 0.11507 0.4527 0.0129 0.53979 0.42841 0.15229 0.67891 0.76272 0.38824 0.08801 0.82571 0.91658 0.74957 0.01677 0.43588 0.51189 0.68108 0.1926 0.73347 0.87406 0.52136 0.84613 0.91472 0.80656 0.9259 0.87153 0.06811 0.24534 0.21609 0.63936 0.46614 0.97172 0.65077 0.56427 0.13208 0.80023 0.87696 0.09371 0.68769 0.6951 0.3001 0.3529 0.65972 0.46575 0.960 [...]
+0.44455 0.73216 0.05001 0.60169 0.3477 0.40861 0.55252 0.17536 0.92151 0.0307 0.82405 0.2804 0.58071 0.69849 0.81115 0.65439 0.66296 0.99248 0.9468 0.95959 0.42686 0.61299 0.59524 0.79282 0.57803 0.68562 0.46375 0.12844 0.21652 0.07319 0.1111 0.24817 0.86721 0.64568 0.54316 0.95678 0.00286 0.20029 0.90196 0.33333 0.99367 0.20622 0.42049 0.31729 0.68334 0.63379 0.34523 0.54375 0.65199 0.6528 0.50515 0.20019 0.52935 0.72192 0.69333 0.50645 0.01716 0.77431 0.08366 0.66659 0.16844 0.49869 0. [...]
+0.2633 0.35086 0.08368 0.24355 0.26436 0.75294 0.94009 0.05519 0.19065 0.59256 0.23546 0.06302 0.67891 0.67444 0.3416 0.44825 0.19883 0.69592 0.11134 0.00394 0.03198 0.85523 0.33858 0.79493 0.30732 0.71468 0.61779 0.20967 0.35722 0.53484 0.89719 0.58976 0.34076 0.2934 0.99882 0.24168 0.48258 0.59203 0.09444 0.67372 0.89078 0.70341 0.83817 0.5985 0.73304 0.19967 0.51825 0.18356 0.36128 0.14025 0.54638 0.2095 0.91273 0.79502 0.04732 0.86902 0.12533 0.09996 0.61511 0.15937 0.73437 0.13853 0 [...]
+0.10704 0.06269 0.11505 0.96652 0.19423 0.18009 0.96147 0.45928 0.76214 0.74664 0.31579 0.41924 0.12135 0.12824 0.71162 0.28325 0.90202 0.94668 0.02869 0.94335 0.4587 0.56433 0.05392 0.15504 0.04731 0.2065 0.30957 0.96489 0.38534 0.39038 0.90207 0.83786 0.09864 0.20785 0.83073 0.42154 0.06312 0.49129 0.29787 0.64233 0.41583 0.85433 0.66433 0.56466 0.52015 0.4379 0.51523 0.90673 0.23845 0.91229 0.12651 0.10294 0.91793 0.52686 0.8538 0.61096 0.7885 0.63454 0.86457 0.71568 0.53693 0.57563 0 [...]
+0.75724 0.78493 0.31312 0.3734 0.09914 0.58353 0.54872 0.36138 0.49576 0.95802 0.71476 0.25089 0.36967 0.87468 0.49944 0.44634 0.48739 0.69036 0.13486 0.97602 0.80991 0.19631 0.73315 0.27156 0.03494 0.88579 0.61256 0.001 0.63401 0.71623 0.24926 0.51375 0.12353 0.84938 0.26062 0.04911 0.65443 0.16062 0.15304 0.0862 0.85508 0.48841 0.27636 0.60373 0.32153 0.23523 0.58642 0.81551 0.84115 0.09161 0.45277 0.87649 0.06932 0.79639 0.23277 0.76932 0.81679 0.21 0.61121 0.44153 0.26396 0.48785 0.5 [...]
+0.08654 0.32613 0.09222 0.92627 0.48487 0.89011 0.4753 0.82686 0.48122 0.75895 0.10331 0.89039 0.28082 0.3092 0.04575 0.38966 0.42389 0.36393 0.06223 0.44498 0.30176 0.21451 0.69424 0.08494 0.49272 0.09421 0.57494 0.19418 0.2568 0.51802 0.82195 0.22783 0.57188 0.69334 0.40602 0.6844 0.85882 0.71753 0.33914 0.17984 0.18012 0.19031 0.11239 0.45974 0.02705 0.38123 0.97769 0.35223 0.22451 0.37042 0.76398 0.65812 0.95636 0.14019 0.42558 0.3899 0.60958 0.14468 0.70169 0.91563 0.95234 0.86339 0 [...]
+0.41325 0.50713 0.49999 0.13196 0.53172 0.26911 0.92811 0.45429 0.28848 0.94917 0.19659 0.49986 0.87053 0.59056 0.76028 0.61109 0.03408 0.5449 0.67678 0.49247 0.8247 0.45014 0.01041 0.84494 0.14064 0.32126 0.98424 0.83907 0.73931 0.02468 0.14599 0.12301 0.47758 0.00484 0.74249 0.44697 0.56247 0.42328 0.7425 0.42886 0.48474 0.90381 0.60176 0.86971 0.93747 0.33453 0.43752 0.814 0.47608 0.42172 0.91952 0.59612 0.78983 0.53554 0.56526 0.87922 0.61706 0.47139 0.00071 0.20329 0.19238 0.83683 0 [...]
+0.81162 0.70336 0.35652 0.65735 0.80718 0.25381 0.55653 0.18534 0.48978 0.19358 0.52343 0.9795 0.15518 0.50499 0.99363 0.97282 0.89967 0.81965 0.62036 0.18967 0.8166 0.48348 0.45298 0.68073 0.00982 0.72586 0.14512 0.10788 0.86264 0.92521 0.87527 0.57483 0.20689 0.86694 0.33527 0.28318 0.77917 0.15651 0.12984 0.46379 0.2446 0.54596 0.41631 0.93986 0.22316 0.54079 0.16272 0.17911 0.8573 0.37972 0.85723 0.44989 0.41157 0.59846 0.04275 0.78767 0.17334 0.50055 0.81646 0.3837 0.22087 0.43777 0 [...]
+0.11376 0.11175 0.9872 0.176 0.81131 0.35246 0.72059 0.88953 0.67403 0.30692 0.74278 0.90179 0.88951 0.28263 0.99065 0.82017 0.86175 0.10479 0.58856 0.9535 0.00541 0.85853 0.4186 0.64745 0.68393 0.62745 0.71925 0.23904 0.74586 0.85568 0.39168 0.65018 0.95883 0.80733 0.74071 0.42242 0.98324 0.66176 0.4893 0.57922 0.98719 0.2203 0.98938 0.4648 0.31482 0.94328 0.90162 0.93095 0.08062 0.89174 0.57961 0.59322 0.93065 0.27051 0.89786 0.32394 0.28892 0.46692 0.97214 0.73106 0.02658 0.40369 0.67 [...]
+0.78062 0.10939 0.88397 0.22028 0.76025 0.82691 0.81926 0.33829 0.92449 0.91705 0.66328 0.09549 0.63765 0.21057 0.87849 0.3684 0.6509 0.46002 0.51444 0.11816 0.80641 0.23471 0.86193 0.18911 0.62851 0.79885 0.08547 0.779 0.6443 0.74103 0.08135 0.71833 0.2554 0.78888 0.80122 0.07006 0.12252 0.32148 0.87419 0.01489 0.40608 0.16079 0.72772 0.30271 0.36434 0.45247 0.34166 0.69114 0.02747 0.34532 0.86114 0.68955 0.91292 0.7026 0.24379 0.33507 0.07248 0.33037 0.14498 0.98016 0.28107 0.5676 0.99 [...]
+0.24265 0.7326 0.11779 0.84577 0.63197 0.99752 0.33875 0.73648 0.88301 0.20497 0.62194 0.16998 0.94929 0.29456 0.0371 0.86568 0.76601 0.56482 0.03451 0.71273 0.87969 0.81928 0.03223 0.35732 0.50456 0.49592 0.36901 0.79193 0.25576 0.79728 0.28308 0.30884 0.39503 0.12472 0.68887 0.82055 0.52297 0.40262 0.86594 0.42974 0.75566 0.51689 0.13023 0.78775 0.71241 0.20072 0.45057 0.90258 0.74093 0.40927 0.247 0.40492 0.16991 0.76054 0.82772 0.35623 0.87972 0.205 0.15749 0.18957 0.0563 0.85867 0.0 [...]
+0.66256 0.37898 0.58872 0.41743 0.56247 0.3367 0.62434 0.20267 0.35314 0.32903 0.73241 0.62267 0.13648 0.96869 0.44862 0.80856 0.75812 0.35483 0.09178 0.60267 0.90712 0.69272 0.65448 0.1646 0.45125 0.86998 0.24712 0.19516 0.69366 0.88951 0.38755 0.50912 0.22448 0.68036 0.28577 0.93485 0.92081 0.90712 0.22445 0.63153 0.27217 0.99872 0.72286 0.80257 0.72845 0.83318 0.95851 0.42225 0.4274 0.76407 0.66246 0.86704 0.07723 0.41524 0.14699 0.32181 0.37293 0.87067 0.03144 0.36162 0.0358 0.65675  [...]
+0.60568 0.31342 0.36968 0.67743 0.38608 0.15013 0.33469 0.49337 0.93642 0.2102 0.41492 0.47708 0.04504 0.91987 0.92175 0.21528 0.31275 0.75372 0.53075 0.27148 0.82849 0.76297 0.29979 0.634 0.84138 0.29935 0.08311 0.57037 0.91315 0.64949 0.99652 0.06319 0.69883 0.12107 0.34725 0.37665 0.46045 0.37758 0.78443 0.31884 0.82419 0.24168 0.06356 0.58676 0.16907 0.4121 0.50416 0.73608 0.27969 0.66344 0.27442 0.94527 0.36172 0.33831 0.99421 0.95549 0.35943 0.96335 0.74939 0.20058 0.46583 0.71597  [...]
+0.89068 0.88021 0.97131 0.96631 0.29766 0.48552 0.71459 0.27433 0.58534 0.41456 0.00378 0.42735 0.77412 0.21812 0.92924 0.63244 0.32188 0.89807 0.61371 0.64238 0.47809 0.56618 0.97016 0.63519 0.80211 0.80267 0.23032 0.33593 0.33144 0.52728 0.10969 0.48897 0.71833 0.56415 0.5212 0.79922 0.2839 0.74002 0.1281 0.10306 0.91587 0.95686 0.24463 0.26854 0.1971 0.85109 0.14429 0.14611 0.69932 0.7217 0.78692 0.56275 0.74811 0.67178 0.54393 0.25235 0.1902 0.66099 0.97845 0.92482 0.2992 0.28666 0.3 [...]
+0.91018 0.87048 0.09992 0.45564 0.62912 0.54833 0.58998 0.363 0.89125 0.03075 0.13986 0.06456 0.71925 0.43396 0.69085 0.96766 0.46758 0.99549 0.62531 0.39002 0.5312 0.7998 0.79783 0.41527 0.17221 0.82617 0.32232 0.84927 0.29164 0.02885 0.8871 0.32589 0.92709 0.6532 0.1626 0.09026 0.48138 0.18917 0.44867 0.73389 0.75775 0.6185 0.9742 0.44955 0.88977 0.4638 0.64111 0.25048 0.62175 0.45346 0.27503 0.86682 0.7888 0.9849 0.10704 0.7416 0.10282 0.64924 0.20668 0.21178 0.56294 0.26345 0.26255 0 [...]
+0.05895 0.44009 0.57956 0.67708 0.96172 0.83589 0.15397 0.85326 0.85197 0.81069 0.88833 0.26462 0.8624 0.84696 0.21588 0.07002 0.09338 0.42998 0.8701 0.84501 0.37988 0.5139 0.37743 0.50857 0.35155 0.55559 0.73779 0.54224 0.05486 0.03666 0.68834 0.18841 0.52679 0.05542 0.78316 0.7095 0.85908 0.43246 0.82796 0.85378 0.42488 0.4794 0.03632 0.72758 0.41381 0.81842 0.28826 0.89334 0.78457 0.66411 0.70295 0.77503 0.69298 0.2263 0.7477 0.68847 0.16814 0.40894 0.60196 0.64366 0.82043 0.84056 0.2 [...]
+0.7844 0.2802 0.13842 0.78462 0.19852 0.45506 0.71595 0.72554 0.09212 0.50241 0.73106 0.40432 0.69684 0.07511 0.90206 0.70585 0.92847 0.89917 0.39239 0.07457 0.86243 0.30771 0.69279 0.66648 0.02048 0.85369 0.35621 0.15122 0.90374 0.24787 0.45235 0.69079 0.38752 0.30215 0.53717 0.18865 0.92896 0.6102 0.76763 0.67905 0.74193 0.26675 0.5623 0.41086 0.76768 0.91031 0.26068 0.57434 0.15602 0.21918 0.07121 0.87443 0.24128 0.89286 0.96059 0.1638 0.59938 0.82902 0.32097 0.86542 0.90502 0.62051 0 [...]
+0.59986 0.85707 0.38409 0.51527 0.66243 0.60932 0.02517 0.59201 0.14682 0.88061 0.99988 0.40582 0.73761 0.00329 0.71027 0.42391 0.54464 0.71535 0.87206 0.02857 0.47527 0.87671 0.98191 0.06902 0.01867 0.53192 0.36571 0.49212 0.16456 0.70601 0.08307 0.83667 0.95787 0.00071 0.07988 0.89586 0.03653 0.4492 0.03752 0.71829 0.57195 0.3117 0.36992 0.61841 0.89858 0.68814 0.31616 0.98027 0.28778 0.63222 0.43583 0.10583 0.96758 0.02134 0.16537 0.60962 0.80064 0.6386 0.71583 0.75472 0.21966 0.60466 [...]
+0.7933 0.72328 0.90683 0.61263 0.93474 0.56854 0.07963 0.37959 0.43604 0.11091 0.00049 0.56533 0.2216 0.16809 0.69612 0.97365 0.10302 0.18095 0.6579 0.01853 0.25956 0.74813 0.78873 0.24043 0.86697 0.22325 0.95931 0.88187 0.90514 0.27387 0.93525 0.47603 0.23827 0.88183 0.19399 0.80223 0.51909 0.51883 0.93799 0.61449 0.13085 0.03039 0.47775 0.27386 0.28823 0.70316 0.22933 0.99688 0.08895 0.94698 0.23188 0.80598 0.42838 0.73478 0.00121 0.08346 0.73537 0.42334 0.77707 0.91094 0.44963 0.75134 [...]
+0.16699 0.61967 0.44313 0.04031 0.87367 0.44477 0.35076 0.22056 0.97741 0.76129 0.25838 0.56109 0.44619 0.93835 0.35731 0.67855 0.57938 0.8385 0.02729 0.53754 0.38543 0.55113 0.25725 0.92377 0.12248 0.16796 0.04155 0.67578 0.37678 0.07805 0.85834 0.21209 0.03637 0.41427 0.65688 0.42885 0.46759 0.84386 0.70197 0.41884 0.00697 0.71203 0.89348 0.29324 0.84537 0.49638 0.6433 0.90859 0.30711 0.75322 0.30646 0.32583 0.59577 0.89676 0.12326 0.98408 0.05404 0.26509 0.12231 0.22104 0.95748 0.4024 [...]
+0.08464 0.28642 0.93237 0.76693 0.11279 0.7265 0.41618 0.47242 0.49059 0.16773 0.18209 0.30094 0.34946 0.48085 0.60009 0.41412 0.42907 0.44735 0.93935 0.95127 0.6078 0.05483 0.49668 0.15158 0.42794 0.17333 0.8801 0.80082 0.14888 0.81405 0.27499 0.9257 0.5322 0.72593 0.04519 0.30875 0.98572 0.55554 0.09198 0.71966 0.59239 0.78597 0.1973 0.97336 0.04825 0.59578 0.9248 0.15042 0.30858 0.45218 0.80711 0.30374 0.40377 0.17214 0.25179 0.72856 0.4398 0.78493 0.45371 0.47431 0.11287 0.50784 0.64 [...]
+0.48291 0.7323 0.3991 0.83019 0.59167 0.22359 0.16042 0.21349 0.24337 0.57747 0.38085 0.59961 0.5554 0.05976 0.42038 0.6422 0.45252 0.4836 0.57147 0.03924 0.67549 0.91341 0.68954 0.97864 0.23003 0.14725 0.34 0.4018 0.9454 0.21522 0.74022 0.4602 0.75199 0.91642 0.79868 0.27566 0.32746 0.08417 0.68186 0.0982 0.16232 0.97545 0.43617 0.84987 0.71697 0.79798 0.00986 0.41591 0.78577 0.53458 0.89402 0.80803 0.58164 0.44614 0.9844 0.29451 0.21735 0.06133 0.88528 0.18991 0.38076 0.19758 0.09061 0 [...]
+0.45065 0.67879 0.91334 0.33498 0.2309 0.14179 0.20925 0.53814 0.23735 0.01521 0.74928 0.78561 0.37708 0.64473 0.55861 0.0199 0.61391 0.36948 0.98828 0.67244 0.65464 0.68514 0.8018 0.50252 0.33861 0.99818 0.97407 0.44421 0.78532 0.18448 0.61636 0.168 0.05516 0.16551 0.29822 0.70446 0.38425 0.59438 0.21431 0.86555 0.04151 0.86486 0.52412 0.63352 0.86481 0.4929 0.69015 0.22106 0.32673 0.43515 0.73115 0.35114 0.29321 0.35115 0.10768 0.52191 0.76153 0.1626 0.64925 0.47451 0.20822 0.29536 0.8 [...]
+0.26014 0.68476 0.03478 0.83721 0.1862 0.76258 0.07158 0.69991 0.51644 0.55985 0.49763 0.45212 0.02476 0.71985 0.60904 0.52966 0.03756 0.52913 0.51257 0.04536 0.53227 0.39411 0.22251 0.34017 0.54915 0.04862 0.00764 0.72202 0.25179 0.98024 0.10002 0.1426 0.2159 0.37963 0.13861 0.01954 0.02896 0.01279 0.88975 0.83412 0.89338 0.95702 0.33809 0.43224 0.12075 0.29552 0.14159 0.74638 0.86218 0.56805 0.74509 0.63999 0.1567 0.23333 0.17974 0.60025 0.78848 0.57935 0.79651 0.70913 0.32941 0.04694  [...]
+0.31213 0.99693 0.97392 0.47853 0.85893 0.03723 0.23797 0.111 0.04023 0.29444 0.66792 0.58305 0.12463 0.71651 0.58288 0.85956 0.54018 0.91354 0.26521 0.50555 0.6186 0.87286 0.75483 0.08655 0.48342 0.83254 0.28967 0.72679 0.33438 0.13823 0.91666 0.46571 0.50361 0.86205 0.31035 0.21168 0.74286 0.57976 0.19511 0.55296 0.57727 0.22854 0.48479 0.28697 0.5245 0.96223 0.7356 0.06369 0.64431 0.6548 0.95624 0.31457 0.04998 0.53832 0.22824 0.92113 0.72628 0.52331 0.46361 0.72603 0.7569 0.44304 0.6 [...]
+0.19103 0.90163 0.48945 0.93409 0.92802 0.32752 0.31927 0.15305 0.16114 0.00307 0.89337 0.20593 0.39927 0.89468 0.93254 0.74224 0.18507 0.66176 0.15508 0.89678 0.5678 0.13619 0.94451 0.32073 0.44161 0.95142 0.47083 0.19531 0.98929 0.30932 0.65509 0.12958 0.03631 0.77902 0.47013 0.03553 0.72626 0.15809 0.23968 0.19303 0.45991 0.07926 0.11791 0.04923 0.98011 0.65017 0.89137 0.26663 0.45606 0.96379 0.71426 0.0753 0.11425 0.0013 0.35578 0.06524 0.68663 0.08184 0.54101 0.48681 0.09517 0.70594 [...]
+0.96586 0.15368 0.66243 0.7624 0.61965 0.83227 0.68732 0.3616 0.33148 0.27968 0.21465 0.2437 0.80184 0.00694 0.02779 0.26439 0.13172 0.16326 0.34684 0.64464 0.15253 0.48484 0.44369 0.85669 0.81268 0.99421 0.12607 0.88158 0.09535 0.74934 0.92569 0.08937 0.2156 0.22887 0.92047 0.88787 0.58224 0.30222 0.05791 0.39204 0.49293 0.47973 0.95385 0.93244 0.24103 0.98959 0.73282 0.07697 0.57368 0.4566 0.71505 0.59432 0.38784 0.77421 0.12229 0.58465 0.69642 0.26862 0.39254 0.09616 0.83421 0.78255 0 [...]
+0.71047 0.26665 0.80915 0.67711 0.64396 0.77951 0.31977 0.64772 0.27834 0.11924 0.99817 0.50471 0.55935 0.09734 0.71756 0.00054 0.69257 0.4753 0.5193 0.18163 0.45446 0.8035 0.1446 0.6611 0.548 0.53066 0.53592 0.14215 0.85016 0.56747 0.666 0.70921 0.30545 0.65027 0.1717 0.23117 0.91551 0.93295 0.34397 0.27241 0.44812 0.01665 0.4321 0.83841 0.61255 0.78198 0.71662 0.71587 0.10914 0.89849 0.14412 0.99421 0.33171 0.98945 0.7059 0.65358 0.11513 0.06567 0.98789 0.79735 0.19012 0.52958 0.06197  [...]
+0.97428 0.9342 0.25219 0.33068 0.27619 0.00712 0.999 0.88138 0.66651 0.19077 0.9758 0.44074 0.25415 0.49531 0.17231 0.0636 0.19177 0.30271 0.12744 0.33613 0.26807 0.63987 0.37502 0.13168 0.05306 0.31367 0.7706 0.93881 0.61041 0.66904 0.04793 0.3294 0.87732 0.53453 0.46372 0.86642 0.1253 0.20176 0.66949 0.25928 0.1403 0.624 0.02796 0.55564 0.85928 0.87301 0.4154 0.85105 0.94377 0.55534 0.32156 0.0637 0.19784 0.1908 0.72428 0.6017 0.62422 0.42294 0.57891 0.79995 0.30429 0.96719 0.93279 0.3 [...]
+0.13987 0.63811 0.32325 0.21564 0.96766 0.29177 0.35354 0.38815 0.21605 0.2976 0.15958 0.981 0.90098 0.70041 0.46397 0.27341 0.48874 0.80573 0.80422 0.57731 0.13613 0.3975 0.99052 0.70011 0.24687 0.84408 0.23082 0.17813 0.16926 0.40638 0.05823 0.97775 0.34888 0.11765 0.47899 0.18101 0.00407 0.57888 0.95097 0.15436 0.26188 0.43756 0.59625 0.11282 0.14359 0.57375 0.46326 0.08793 0.37149 0.52057 0.38091 0.09435 0.48905 0.76363 0.23564 0.65742 0.13407 0.39348 0.05454 0.68127 0.13987 0.49112  [...]
+0.51086 0.28467 0.78802 0.07858 0.54943 0.61436 0.51777 0.4923 0.54923 0.88577 0.87542 0.95869 0.12405 0.90791 0.37316 0.6649 0.89563 0.96087 0.08938 0.09529 0.75677 0.59316 0.64345 0.50215 0.20135 0.04139 0.24533 0.55173 0.3563 0.13594 0.88214 0.9589 0.19569 0.45728 0.13299 0.77838 0.94958 0.84709 0.67238 0.53687 0.73216 0.59738 0.28136 0.29849 0.66686 0.89135 0.30572 0.0248 0.14936 0.37378 0.78712 0.14101 0.24451 0.02246 0.40576 0.54509 0.74045 0.241 0.88399 0.32917 0.12166 0.20022 0.5 [...]
+0.4192 0.81321 0.87057 0.20141 0.52082 0.03509 0.61414 0.50053 0.73034 0.76734 0.74432 0.46631 0.18826 0.55309 0.53368 0.03097 0.20787 0.92358 0.92789 0.89635 0.24492 0.74015 0.89315 0.15541 0.41261 0.472 0.91784 0.07324 0.11478 0.00806 0.17414 0.82292 0.22992 0.04025 0.78751 0.72909 0.93119 0.94202 0.57365 0.33126 0.5264 0.06396 0.09578 0.98361 0.73235 0.09252 0.88959 0.46476 0.89534 0.21348 0.96577 0.90554 0.72493 0.7473 0.84944 0.53002 0.0709 0.0296 0.23384 0.89422 0.05684 0.41105 0.6 [...]
+0.38487 0.58659 0.86782 0.23453 0.00262 0.08709 0.35013 0.45869 0.35164 0.83279 0.24711 0.83599 0.69829 0.22885 0.92815 0.34786 0.5905 0.98982 0.09247 0.31321 0.58921 0.62916 0.71559 0.26423 0.99797 0.66984 0.99556 0.98228 0.63359 0.06051 0.07842 0.15626 0.75131 0.11328 0.22683 0.97736 0.15876 0.32131 0.36763 0.4854 0.5158 0.7323 0.81066 0.73181 0.05638 0.58091 0.06745 0.26048 0.04398 0.9098 0.77034 0.38893 0.09615 0.69217 0.27203 0.03386 0.61731 0.2878 0.47533 0.30762 0.69787 0.18502 0. [...]
+0.46371 0.39665 0.8476 0.1653 0.09396 0.49767 0.05884 0.51267 0.24829 0.64112 0.46283 0.19508 0.22043 0.62849 0.22775 0.75906 0.42843 0.04851 0.58553 0.3308 0.89459 0.16787 0.88217 0.55538 0.66046 0.40529 0.36776 0.22722 0.52581 0.04601 0.22639 0.24911 0.20078 0.07378 0.78555 0.07242 0.06771 0.2697 0.63372 0.83632 0.22681 0.92876 0.97765 0.81696 0.78323 0.94664 0.46057 0.26288 0.53474 0.54553 0.66832 0.96278 0.05413 0.88115 0.40928 0.96223 0.84455 0.03519 0.36672 0.73149 0.00928 0.47274  [...]
+0.49619 0.97897 0.73023 0.33601 0.89152 0.41367 0.78084 0.50827 0.25657 0.75169 0.90183 0.73437 0.92575 0.4399 0.91433 0.31208 0.48529 0.74093 0.1047 0.43968 0.32322 0.76228 0.30105 0.64668 0.99083 0.80532 0.8709 0.04678 0.77355 0.08082 0.639 0.2604 0.93847 0.58245 0.27461 0.96767 0.37857 0.74057 0.22148 0.62488 0.50841 0.9789 0.82094 0.45708 0.54132 0.0864 0.13011 0.91475 0.95591 0.14733 0.83356 0.99299 0.31825 0.73046 0.60619 0.24015 0.97892 0.92412 0.44946 0.24171 0.89994 0.90495 0.83 [...]
+0.03672 0.29228 0.34141 0.49608 0.65906 0.19626 0.24324 0.92723 0.3125 0.47967 0.51051 0.14522 0.13683 0.72021 0.03939 0.34191 0.87001 0.94767 0.65969 0.03307 0.4515 0.50705 0.84323 0.52942 0.08773 0.35379 0.69057 0.58179 0.57498 0.32586 0.87966 0.28396 0.76608 0.87564 0.16094 0.78913 0.75499 0.39204 0.50118 0.41739 0.83448 0.41752 0.33795 0.7228 0.71465 0.29443 0.23002 0.51407 0.68917 0.00742 0.20841 0.12396 0.46337 0.69151 0.07903 0.20094 0.3627 0.20176 0.7969 0.53772 0.56413 0.14223 0 [...]
+0.97973 0.11352 0.65176 0.89939 0.28677 0.32938 0.69548 0.00865 0.84538 0.37892 0.44853 0.40841 0.78768 0.62266 0.57275 0.55655 0.10847 0.11929 0.21102 0.36714 0.8751 0.79319 0.621 0.93538 0.43854 0.30699 0.20872 0.60451 0.80265 0.09825 0.342 0.68697 0.41524 0.9022 0.21456 0.07023 0.57664 0.43185 0.42095 0.77254 0.75686 0.44959 0.49875 0.23352 0.62003 0.48919 0.54866 0.76819 0.8639 0.23244 0.40169 0.21857 0.01416 0.98099 0.25917 0.77756 0.55314 0.43252 0.84119 0.54068 0.4465 0.34844 0.64 [...]
+0.98509 0.83546 0.30158 0.93669 0.67382 0.12139 0.33514 0.14762 0.90476 0.93425 0.42606 0.71016 0.08166 0.01922 0.9316 0.10742 0.8685 0.7763 0.8654 0.55481 0.11937 0.29272 0.92639 0.37142 0.57147 0.91564 0.6541 0.20238 0.02414 0.34304 0.09601 0.50768 0.89453 0.31818 0.11283 0.31679 0.42542 0.25282 0.04161 0.59527 0.77176 0.71466 0.00366 0.02866 0.29737 0.88182 0.18939 0.46054 0.1804 0.94428 0.52336 0.94439 0.20989 0.24688 0.91308 0.89042 0.22209 0.33227 0.6022 0.35409 0.47909 0.26327 0.5 [...]
+0.51419 0.22474 0.18969 0.69613 0.48641 0.32081 0.20637 0.9749 0.66794 0.37801 0.75721 0.48103 0.77072 0.67899 0.55284 0.62961 0.66568 0.60452 0.85553 0.68526 0.48592 0.20932 0.38052 0.9232 0.46774 0.46042 0.08159 0.45314 0.96313 0.436 0.51124 0.14783 0.39797 0.73182 0.16308 0.22475 0.48489 0.33976 0.04102 0.13891 0.54639 0.21771 0.8871 0.96324 0.17509 0.93442 0.5178 0.19387 0.1065 0.39709 0.27036 0.97093 0.96212 0.88576 0.12852 0.78092 0.7698 0.22343 0.13277 0.42303 0.38867 0.10039 0.25 [...]
+0.66268 0.11185 0.82323 0.36698 0.91126 0.86102 0.89977 0.7058 0.92873 0.02855 0.29275 0.19057 0.60052 0.18103 0.92814 0.54724 0.00699 0.85538 0.78014 0.45372 0.20123 0.13253 0.94039 0.31501 0.12491 0.17405 0.38508 0.50105 0.01581 0.51238 0.38503 0.04814 0.97444 0.38493 0.24299 0.11192 0.64427 0.03862 0.95385 0.93266 0.56122 0.76545 0.50382 0.53618 0.26854 0.91166 0.96229 0.21218 0.97582 0.96168 0.25406 0.26747 0.3181 0.15256 0.44519 0.39624 0.5494 0.40898 0.84144 0.14113 0.29474 0.41398 [...]
+0.20367 0.83007 0.60481 0.46929 0.61091 0.81851 0.28898 0.44467 0.29798 0.69689 0.72304 0.9991 0.06517 0.47393 0.09179 0.73042 0.71616 0.55341 0.78971 0.55799 0.44635 0.00531 0.05639 0.00205 0.52359 0.16949 0.0704 0.08871 0.01437 0.25039 0.19471 0.02959 0.62202 0.21999 0.48501 0.60847 0.71531 0.19624 0.41036 0.51579 0.04465 0.0126 0.89814 0.46561 0.69669 0.2369 0.92493 0.46095 0.7821 0.60636 0.46963 0.15359 0.52615 0.90674 0.08288 0.81309 0.17714 0.90081 0.77329 0.7619 0.34593 0.24401 0. [...]
+0.54784 0.01285 0.91404 0.34597 0.13948 0.61287 0.37538 0.65048 0.08136 0.61705 0.0544 0.22491 0.87627 0.73013 0.97664 0.85087 0.15271 0.98075 0.96429 0.74212 0.98106 0.45688 0.80254 0.19567 0.2561 0.6348 0.52504 0.53079 0.6886 0.0939 0.70355 0.53628 0.64471 0.08491 0.26518 0.06154 0.02297 0.52421 0.7415 0.12084 0.66146 0.9681 0.61492 0.60031 0.36545 0.07915 0.83147 0.58225 0.18725 0.59365 0.86369 0.3557 0.98911 0.19884 0.5753 0.98213 0.78198 0.34283 0.14488 0.87029 0.62451 0.75714 0.193 [...]
+0.92016 0.58382 0.84238 0.82267 0.3443 0.57102 0.08629 0.45526 0.59551 0.2173 0.60102 0.40947 0.45562 0.05202 0.03894 0.59769 0.67836 0.95194 0.12115 0.73232 0.05027 0.20903 0.56842 0.7963 0.83474 0.2751 0.44374 0.51521 0.05756 0.09651 0.01578 0.15162 0.5223 0.47891 0.06069 0.97143 0.07985 0.28777 0.94994 0.04842 0.07123 0.50179 0.23606 0.21512 0.60294 0.0718 0.58429 0.95377 0.1833 0.68787 0.64938 0.33608 0.11488 0.82009 0.5796 0.38398 0.76438 0.26907 0.02484 0.45172 0.74351 0.52469 0.63 [...]
+
+357.76682 15.52643 15.41456 15.39743 15.30751 15.12317 15.10437 15.05586 14.97215 14.95116 14.89953 14.80868 14.79457 14.72332 14.68542 14.63161 14.54824 14.51876 14.43473 14.39619 14.35533 14.34099 14.28227 14.25196 14.224 14.18849 14.14696 14.08621 14.07092 14.0478 13.99554 13.96481 13.95052 13.91731 13.88834 13.80712 13.78927 13.72949 13.71821 13.64984 13.61477 13.60514 13.53211 13.51086 13.47732 13.45874 13.43397 13.40933 13.37311 13.31796 13.30471 13.24929 13.22562 13.21887 13.1798  [...]
diff --git a/examples/testdata/svd/wiki.example b/examples/testdata/svd/wiki.example
new file mode 100644
index 0000000..2299989
--- /dev/null
+++ b/examples/testdata/svd/wiki.example
@@ -0,0 +1,7 @@
+4 4
+4 1 -2 2
+1 2 0 1
+-2 0 3 -2
+2 1 -2 -1
+
+6.84462111  2.26853141  2.19751698  1.08436446
\ No newline at end of file
diff --git a/examples/testdata/svd/wiki.qr.example b/examples/testdata/svd/wiki.qr.example
new file mode 100644
index 0000000..3b7027c
--- /dev/null
+++ b/examples/testdata/svd/wiki.qr.example
@@ -0,0 +1,6 @@
+3 3
+12 -51 4
+6 167 -68
+-4 24 -41
+
+190.56724372 32.85688323 13.69492038
\ No newline at end of file
diff --git a/examples/tutorial/CMakeLists.txt b/examples/tutorial/CMakeLists.txt
index 1de176b..eacf4eb 100644
--- a/examples/tutorial/CMakeLists.txt
+++ b/examples/tutorial/CMakeLists.txt
@@ -1,27 +1,116 @@
-foreach(tut bandwidth-reduction blas1 custom-kernels custom-context fft viennacl-info)
+
+
+
+#
+# Part 1: Tutorials which work without OpenCL as well:
+#
+foreach(tut bandwidth-reduction blas1 scheduler wrap-host-buffer)
    add_executable(${tut} ${tut}.cpp)
-   target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+   if (ENABLE_OPENCL)
+     target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+     set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+   endif (ENABLE_OPENCL)
 endforeach()
 
 if(ENABLE_UBLAS)
    include_directories(${Boost_INCLUDE_DIRS})
-   foreach(tut amg blas2 blas3 iterative iterative-ublas matrix-range qr spai sparse structured-matrices vector-range)
+   foreach(tut blas2 blas3 iterative-ublas lanczos least-squares matrix-range power-iter qr sparse vector-range)
       add_executable(${tut} ${tut}.cpp)
-      target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+      target_link_libraries(${tut} ${Boost_LIBRARIES})
+      if (ENABLE_OPENCL)
+        target_link_libraries(${tut} ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+        set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+      endif (ENABLE_OPENCL)
    endforeach()
 endif()
 
 if(ENABLE_EIGEN)
    include_directories(${EIGEN_INCLUDE_DIR})
-   add_executable(iterative-eigen iterative-eigen.cpp)
-   add_executable(eigen-with-viennacl
-      eigen-with-viennacl.cpp)
-   target_link_libraries(eigen-with-viennacl ${OPENCL_LIBRARIES})
+   add_executable(iterative-eigen     iterative-eigen.cpp)
+   add_executable(eigen-with-viennacl eigen-with-viennacl.cpp)
+   if (ENABLE_OPENCL)
+     target_link_libraries(eigen-with-viennacl ${OPENCL_LIBRARIES})
+     set_target_properties(eigen-with-viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+   endif (ENABLE_OPENCL)
 endif()
 
 if(ENABLE_MTL4)
    foreach(tut iterative-mtl4 mtl4-with-viennacl)
-      add_executable(${tut} ${tut}.cpp)
-      target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+     add_executable(${tut} ${tut}.cpp)
+     if (ENABLE_OPENCL)
+       target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+       set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+     endif (ENABLE_OPENCL)
    endforeach()
 endif()
+
+
+
+#
+# Part 2: Tutorials which work only with OpenCL enabled:
+#
+if (ENABLE_OPENCL)
+  foreach(tut custom-kernels custom-context fft viennacl-info)
+    add_executable(${tut} ${tut}.cpp)
+    target_link_libraries(${tut} ${OPENCL_LIBRARIES})
+    set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+  endforeach()
+
+  if(ENABLE_UBLAS)
+    include_directories(${Boost_INCLUDE_DIRS})
+    foreach(tut amg iterative multithreaded multithreaded_cg spai structured-matrices)
+        add_executable(${tut} ${tut}.cpp)
+        target_link_libraries(${tut} ${Boost_LIBRARIES})
+        if (ENABLE_OPENCL)
+          target_link_libraries(${tut} ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+          set_target_properties(${tut} PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+        endif (ENABLE_OPENCL)
+    endforeach()
+  endif()
+endif (ENABLE_OPENCL)
+
+
+#
+# Part 3: Tutorials with CUDA support
+#
+
+if (ENABLE_CUDA)
+  foreach(tut blas1 wrap-cuda-buffer)
+    cuda_add_executable(${tut}-cuda ${tut}.cu)
+  endforeach()
+
+  if(ENABLE_UBLAS)
+    include_directories(${Boost_INCLUDE_DIRS})
+    foreach(tut blas2 blas3 iterative least-squares matrix-range power-iter qr sparse vector-range)
+        cuda_add_executable(${tut}-cuda ${tut}.cu)
+        target_link_libraries(${tut}-cuda ${Boost_LIBRARIES})
+    endforeach()
+  endif()
+
+endif (ENABLE_CUDA)
+
+#
+# Part 4: Tutorials on using the shared C-library
+#
+
+include_directories(${PROJECT_SOURCE_DIR}/libviennacl/include/)
+
+if(ENABLE_CUDA)
+  if(ENABLE_OPENCL)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DVIENNACL_WITH_OPENCL") #set flags before setting executable!
+    cuda_add_executable(libviennacl-tutorial libviennacl.cu)
+    #set_target_properties(libviennacl-tutorial PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -DVIENNACL_WITH_CUDA")
+    target_link_libraries(libviennacl-tutorial viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    cuda_add_executable(libviennacl-tutorial libviennacl.cu)
+    target_link_libraries(libviennacl-tutorial viennacl)
+  endif(ENABLE_OPENCL)
+else(ENABLE_CUDA)
+  add_executable(libviennacl-tutorial libviennacl.cpp)
+  if(ENABLE_OPENCL)
+    set_target_properties(libviennacl-tutorial PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(libviennacl-tutorial viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    target_link_libraries(libviennacl-tutorial viennacl)
+  endif(ENABLE_OPENCL)
+endif(ENABLE_CUDA)
diff --git a/examples/tutorial/CMakeLists.txt~ b/examples/tutorial/CMakeLists.txt~
deleted file mode 100644
index 8772248..0000000
--- a/examples/tutorial/CMakeLists.txt~
+++ /dev/null
@@ -1,32 +0,0 @@
-foreach(tut bandwidth-reduction blas1 custom-kernels custom-context fft viennacl-info)
-   add_executable(${tut} ${tut}.cpp)
-   target_link_libraries(${tut} ${OPENCL_LIBRARIES})
-endforeach()
-
-if(ENABLE_UBLAS)
-   include_directories(${Boost_INCLUDE_DIRS})
-   foreach(tut amg blas2 blas3 iterative iterative-ublas matrix-range qr spai sparse structured-matrices vector-range)
-      add_executable(${tut} ${tut}.cpp)
-      target_link_libraries(${tut} ${OPENCL_LIBRARIES})
-   endforeach()
-endif()
-
-if(ENABLE_EIGEN)
-   include_directories(${EIGEN_INCLUDE_DIR})
-   add_executable(iterative-eigen iterative-eigen.cpp)
-   add_executable(eigen-with-viennacl
-      eigen-with-viennacl.cpp)
-   target_link_libraries(eigen-with-viennacl ${OPENCL_LIBRARIES})
-endif()
-
-if(ENABLE_MTL4)
-   foreach(tut iterative-mtl4 mtl4-with-viennacl)
-      add_executable(${tut} ${tut}.cpp)
-      target_link_libraries(${tut} ${OPENCL_LIBRARIES})
-   endforeach()
-endif()
-
-IF(CMAKE_COMPILER_IS_GNUCXX)
-   #ADD_DEFINITIONS(-Wall -pedantic -O0 -g)
-   ADD_DEFINITIONS(-Wall -pedantic -O3)
-ENDIF(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/examples/tutorial/Random.hpp b/examples/tutorial/Random.hpp
index 93d37ca..3e5439f 100644
--- a/examples/tutorial/Random.hpp
+++ b/examples/tutorial/Random.hpp
@@ -1,52 +1,52 @@
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#ifndef _RANDOM_HPP_
-#define _RANDOM_HPP_
-
-#include <time.h>
-#include <stdlib.h>
-
-inline void init()
-{
-	static bool init = false;
-	if (!init)
-	{
-		srand( (unsigned int)time(NULL) );
-		init = true;
-	}
-}
-
-template<class TYPE>
-TYPE random();
-
-template<>
-double random<double>()
-{
-  init();
-  return static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
-}
-
-template<>
-float random<float>()
-{
-  init();
-  return static_cast<float>(random<double>());
-}
-
-#endif
-
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#ifndef _RANDOM_HPP_
+#define _RANDOM_HPP_
+
+#include <time.h>
+#include <stdlib.h>
+
+inline void init()
+{
+  static bool init = false;
+  if (!init)
+  {
+    srand( (unsigned int)time(NULL) );
+    init = true;
+  }
+}
+
+template<class TYPE>
+TYPE random();
+
+template<>
+double random<double>()
+{
+  init();
+  return static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
+}
+
+template<>
+float random<float>()
+{
+  init();
+  return static_cast<float>(random<double>());
+}
+
+#endif
+
diff --git a/examples/tutorial/amg.cpp b/examples/tutorial/amg.cpp
old mode 100755
new mode 100644
index e7b8212..a054730
--- a/examples/tutorial/amg.cpp
+++ b/examples/tutorial/amg.cpp
@@ -1,19 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+
+/*
+*
+*   Tutorial: Algebraic multigrid preconditioner (only available with the OpenCL backend, experimental)
+*
+*/
+
+
+
 #ifndef NDEBUG     //without NDEBUG the performance of sparse ublas matrices is poor.
  #define NDEBUG
 #endif
@@ -21,7 +31,7 @@
 #include <boost/numeric/ublas/matrix_sparse.hpp>
 #include <boost/numeric/ublas/operation_sparse.hpp>
 
-#define VIENNACL_HAVE_UBLAS 1
+#define VIENNACL_WITH_UBLAS 1
 
 #define SOLVER_ITERS 2500
 //#define SCALAR float
@@ -52,7 +62,7 @@ void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType co
 {
   VectorType result(rhs);
   VectorType residual(rhs);
-  
+
   result = viennacl::linalg::solve(matrix, rhs, solver, precond);
   residual -= viennacl::linalg::prod(matrix, result);
   std::cout << "  > Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl;
@@ -63,8 +73,8 @@ void run_solver(MatrixType const & matrix, VectorType const & rhs, VectorType co
 
 template <typename ScalarType>
 void run_amg(viennacl::linalg::cg_tag & cg_solver,
-             boost::numeric::ublas::vector<ScalarType> & ublas_vec,
-             boost::numeric::ublas::vector<ScalarType> & ublas_result,
+             boost::numeric::ublas::vector<ScalarType> & /*ublas_vec*/,
+             boost::numeric::ublas::vector<ScalarType> & /*ublas_result*/,
              boost::numeric::ublas::compressed_matrix<ScalarType> & ublas_matrix,
              viennacl::vector<ScalarType> & vcl_vec,
              viennacl::vector<ScalarType> & vcl_result,
@@ -72,35 +82,35 @@ void run_amg(viennacl::linalg::cg_tag & cg_solver,
              std::string info,
              viennacl::linalg::amg_tag & amg_tag)
 {
-  
+
   viennacl::linalg::amg_precond<boost::numeric::ublas::compressed_matrix<ScalarType> > ublas_amg = viennacl::linalg::amg_precond<boost::numeric::ublas::compressed_matrix<ScalarType> > (ublas_matrix, amg_tag);
   boost::numeric::ublas::vector<ScalarType> avgstencil;
   unsigned int coarselevels = amg_tag.get_coarselevels();
-  
+
   std::cout << "-- CG with AMG preconditioner, " << info << " --" << std::endl;
-  
-  std::cout << " * Setup phase (ublas types)..." << std::endl;      
-  
+
+  std::cout << " * Setup phase (ublas types)..." << std::endl;
+
   // Coarse level measure might have been changed during setup. Reload!
   ublas_amg.tag().set_coarselevels(coarselevels);
   ublas_amg.setup();
 
   std::cout << " * Operator complexity: " << ublas_amg.calc_complexity(avgstencil) << std::endl;
-  
+
   amg_tag.set_coarselevels(coarselevels);
   viennacl::linalg::amg_precond<viennacl::compressed_matrix<ScalarType> > vcl_amg = viennacl::linalg::amg_precond<viennacl::compressed_matrix<ScalarType> > (vcl_compressed_matrix, amg_tag);
-  std::cout << " * Setup phase (ViennaCL types)..." << std::endl;      
+  std::cout << " * Setup phase (ViennaCL types)..." << std::endl;
   vcl_amg.tag().set_coarselevels(coarselevels);
   vcl_amg.setup();
-    
-  std::cout << " * CG solver (ublas types)..." << std::endl;         
-  run_solver(ublas_matrix, ublas_vec, ublas_result, cg_solver, ublas_amg);   
-  
-  std::cout << " * CG solver (ViennaCL types)..." << std::endl;         
+
+  std::cout << " * CG solver (ublas types)..." << std::endl;
+  //run_solver(ublas_matrix, ublas_vec, ublas_result, cg_solver, ublas_amg);
+
+  std::cout << " * CG solver (ViennaCL types)..." << std::endl;
   run_solver(vcl_compressed_matrix, vcl_vec, vcl_result, cg_solver, vcl_amg);
 
 }
-  
+
 int main()
 {
   //
@@ -110,9 +120,27 @@ int main()
   std::cout << "----------------------------------------------" << std::endl;
   std::cout << "               Device Info" << std::endl;
   std::cout << "----------------------------------------------" << std::endl;
-  
+
+#ifdef VIENNACL_WITH_OPENCL
+  // Optional: Customize OpenCL backend
+  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+  std::vector<viennacl::ocl::device> const & devices = pf.devices();
+
+  // Optional: Set first device to first context:
+  viennacl::ocl::setup_context(0, devices[0]);
+
+  // Optional: Set second device for second context (use the same device for the second context if only one device available):
+  if (devices.size() > 1)
+    viennacl::ocl::setup_context(1, devices[1]);
+  else
+    viennacl::ocl::setup_context(1, devices[0]);
+
   std::cout << viennacl::ocl::current_device().info() << std::endl;
-  
+  viennacl::context ctx(viennacl::ocl::get_context(1));
+#else
+  viennacl::context ctx;
+#endif
+
   typedef float    ScalarType;  // feel free to change this to double if supported by your device
 
 
@@ -121,46 +149,34 @@ int main()
   //
   boost::numeric::ublas::vector<ScalarType> ublas_vec, ublas_result;
   boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
+
   viennacl::linalg::cg_tag cg_solver;
   viennacl::linalg::amg_tag amg_tag;
   viennacl::linalg::amg_precond<boost::numeric::ublas::compressed_matrix<ScalarType> > ublas_amg;
-    
+
   // Read matrix
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
   if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
   {
     std::cout << "Error reading Matrix file" << std::endl;
     return EXIT_FAILURE;
   }
-  
+
   // Set up rhs and result vector
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", ublas_vec))
-  #else
   if (!readVectorFromFile("../examples/testdata/rhs65025.txt", ublas_vec))
-  #endif
   {
     std::cout << "Error reading RHS file" << std::endl;
     return 0;
   }
 
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ublas_result))
-  #else
   if (!readVectorFromFile("../examples/testdata/result65025.txt", ublas_result))
-  #endif
   {
     std::cout << "Error reading Result file" << std::endl;
     return 0;
   }
-  
-  viennacl::vector<ScalarType> vcl_vec(ublas_vec.size());
-  viennacl::vector<ScalarType> vcl_result(ublas_vec.size());
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec.size(), ublas_vec.size());
+
+  viennacl::vector<ScalarType> vcl_vec(ublas_vec.size(), ctx);
+  viennacl::vector<ScalarType> vcl_result(ublas_vec.size(), ctx);
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(ublas_vec.size(), ublas_vec.size(), ctx);
 
   // Copy to GPU
   viennacl::copy(ublas_matrix, vcl_compressed_matrix);
@@ -172,10 +188,10 @@ int main()
   //
   std::cout << "-- CG solver (CPU, no preconditioner) --" << std::endl;
   run_solver(ublas_matrix, ublas_vec, ublas_result, cg_solver, viennacl::linalg::no_precond());
-  
-  std::cout << "-- CG solver (GPU, no preconditioner) --" << std::endl;   
+
+  std::cout << "-- CG solver (GPU, no preconditioner) --" << std::endl;
   run_solver(vcl_compressed_matrix, vcl_vec, vcl_result, cg_solver, viennacl::linalg::no_precond());
-  
+
   //
   // With AMG Preconditioner RS+DIRECT
   //
@@ -188,49 +204,49 @@ int main()
                                       3,    // postsmoothing steps
                                       0);   // number of coarse levels to be used (0: automatically use as many as reasonable)
   run_amg (cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "RS COARSENING, DIRECT INTERPOLATION", amg_tag);
-  
+
   //
   // With AMG Preconditioner RS+CLASSIC
   //
   amg_tag = viennacl::linalg::amg_tag(VIENNACL_AMG_COARSE_RS, VIENNACL_AMG_INTERPOL_CLASSIC, 0.25, 0.2, 0.67, 3, 3, 0);
   run_amg ( cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "RS COARSENING, CLASSIC INTERPOLATION", amg_tag);
-  
+
   //
   // With AMG Preconditioner ONEPASS+DIRECT
   //
   amg_tag = viennacl::linalg::amg_tag(VIENNACL_AMG_COARSE_ONEPASS, VIENNACL_AMG_INTERPOL_DIRECT,0.25, 0.2, 0.67, 3, 3, 0);
   run_amg (cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "ONEPASS COARSENING, DIRECT INTERPOLATION", amg_tag);
-  
+
   //
   // With AMG Preconditioner RS0+DIRECT
   //
   amg_tag = viennacl::linalg::amg_tag(VIENNACL_AMG_COARSE_RS0, VIENNACL_AMG_INTERPOL_DIRECT, 0.25, 0.2, 0.67, 3, 3, 0);
   run_amg (cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "RS0 COARSENING, DIRECT INTERPOLATION", amg_tag);
-  
+
   //
   // With AMG Preconditioner RS3+DIRECT
   //
   amg_tag = viennacl::linalg::amg_tag(VIENNACL_AMG_COARSE_RS3, VIENNACL_AMG_INTERPOL_DIRECT, 0.25, 0.2, 0.67, 3, 3, 0);
   run_amg (cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "RS3 COARSENING, DIRECT INTERPOLATION", amg_tag);
-  
+
   //
   // With AMG Preconditioner AG
   //
   amg_tag = viennacl::linalg::amg_tag(VIENNACL_AMG_COARSE_AG, VIENNACL_AMG_INTERPOL_AG, 0.08, 0, 0.67, 3, 3, 0);
   run_amg (cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "AG COARSENING, AG INTERPOLATION", amg_tag);
-  
+
   //
   // With AMG Preconditioner SA
   //
   amg_tag = viennacl::linalg::amg_tag(VIENNACL_AMG_COARSE_AG, VIENNACL_AMG_INTERPOL_SA, 0.08, 0.67, 0.67, 3, 3, 0);
   run_amg (cg_solver, ublas_vec, ublas_result, ublas_matrix, vcl_vec, vcl_result, vcl_compressed_matrix, "AG COARSENING, SA INTERPOLATION",amg_tag);
-  
-  
+
+
   //
   //  That's it.
   //
   std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
+
   return EXIT_SUCCESS;
 }
 
diff --git a/examples/tutorial/bandwidth-reduction.cpp b/examples/tutorial/bandwidth-reduction.cpp
index 5f0a057..b8ea1ea 100644
--- a/examples/tutorial/bandwidth-reduction.cpp
+++ b/examples/tutorial/bandwidth-reduction.cpp
@@ -1,19 +1,25 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*   Tutorial: Matrix bandwidth reduction algorithms
+*/
+
+
 #include <iostream>
 #include <fstream>
 #include <string>
@@ -25,10 +31,6 @@
 
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
-/*
-*   Tutorial: Matrix bandwidth reduction algorithms
-*/
-
 
 //
 // Part 1: Helper functions
@@ -39,15 +41,11 @@
 std::vector< std::map<int, double> > reorder_matrix(std::vector< std::map<int, double> > const & matrix, std::vector<int> const & r)
 {
     std::vector< std::map<int, double> > matrix2(r.size());
-    std::vector<std::size_t> r2(r.size());
-    
-    for (std::size_t i = 0; i < r.size(); i++)
-        r2[r[i]] = i;
 
     for (std::size_t i = 0; i < r.size(); i++)
-        for (std::map<int, double>::const_iterator it = matrix[r[i]].begin();  it != matrix[r[i]].end(); it++)
-            matrix2[i][r2[it->first]] = it->second;
-    
+      for (std::map<int, double>::const_iterator it = matrix[i].begin();  it != matrix[i].end(); it++)
+        matrix2[r[i]][r[it->first]] = it->second;
+
     return matrix2;
 }
 
@@ -55,47 +53,67 @@ std::vector< std::map<int, double> > reorder_matrix(std::vector< std::map<int, d
 int calc_bw(std::vector< std::map<int, double> > const & matrix)
 {
     int bw = 0;
-    
+
     for (std::size_t i = 0; i < matrix.size(); i++)
-        for (std::map<int, double>::const_iterator it = matrix[i].begin();  it != matrix[i].end(); it++)
-            bw = std::max(bw, std::abs(static_cast<int>(i - it->first)));
-    
+    {
+      int min_index = static_cast<int>(matrix.size());
+      int max_index = 0;
+      for (std::map<int, double>::const_iterator it = matrix[i].begin();  it != matrix[i].end(); it++)
+      {
+        if (it->first > max_index)
+          max_index = it->first;
+        if (it->first < min_index)
+          min_index = it->first;
+      }
+
+      if (max_index > min_index) //row isn't empty
+        bw = std::max(bw, max_index - min_index);
+    }
+
     return bw;
 }
 
 
 // Calculate the bandwidth of a reordered matrix
-int calc_reordered_bw(std::vector< std::map<int, double> > const & matrix,  std::vector<int> const & r)
+template <typename IndexT>
+int calc_reordered_bw(std::vector< std::map<int, double> > const & matrix,  std::vector<IndexT> const & r)
 {
-    std::vector<int> r2(r.size());
     int bw = 0;
-    
-    for (std::size_t i = 0; i < r.size(); i++)
-        r2[r[i]] = i;
 
     for (std::size_t i = 0; i < r.size(); i++)
-        for (std::map<int, double>::const_iterator it = matrix[r[i]].begin();  it != matrix[r[i]].end(); it++)
-            bw = std::max(bw, std::abs(static_cast<int>(i - r2[it->first])));
-    
+    {
+      int min_index = static_cast<int>(matrix.size());
+      int max_index = 0;
+      for (std::map<int, double>::const_iterator it = matrix[i].begin();  it != matrix[i].end(); it++)
+      {
+        if (r[it->first] > max_index)
+          max_index = r[it->first];
+        if (r[it->first] < min_index)
+          min_index = r[it->first];
+      }
+      if (max_index > min_index)
+        bw = std::max(bw, max_index - min_index);
+    }
+
     return bw;
 }
 
 
 // Generates a random permutation by Knuth shuffle algorithm
-// reference: http://en.wikipedia.org/wiki/Knuth_shuffle 
+// reference: http://en.wikipedia.org/wiki/Knuth_shuffle
 //  (URL taken on July 2nd, 2011)
 std::vector<int> generate_random_reordering(int n)
 {
     std::vector<int> r(n);
     int tmp;
     int j;
-    
+
     for (int i = 0; i < n; i++)
         r[i] = i;
-    
+
     for (int i = 0; i < n - 1; i++)
     {
-        j = i + static_cast<std::size_t>((static_cast<double>(rand()) / static_cast<double>(RAND_MAX)) * (n - 1 - i));
+        j = i + static_cast<int>((static_cast<double>(rand()) / static_cast<double>(RAND_MAX)) * (n - 1 - i));
         if (j != i)
         {
             tmp = r[i];
@@ -103,7 +121,7 @@ std::vector<int> generate_random_reordering(int n)
             r[j] = tmp;
         }
     }
-    
+
     return r;
 }
 
@@ -121,7 +139,7 @@ std::vector< std::map<int, double> > gen_3d_mesh_matrix(int l, int m, int n, boo
     int ind;
     int ind1;
     int ind2;
-    
+
     s = l * m * n;
     matrix.resize(s);
     for (int i = 0; i < l; i++)
@@ -131,9 +149,9 @@ std::vector< std::map<int, double> > gen_3d_mesh_matrix(int l, int m, int n, boo
             for (int k = 0; k < n; k++)
             {
                 ind = i + l * j + l * m * k;
-                
+
                 matrix[ind][ind] = 1.0;
-                
+
                 if (i > 0)
                 {
                     ind2 = ind - 1;
@@ -152,7 +170,7 @@ std::vector< std::map<int, double> > gen_3d_mesh_matrix(int l, int m, int n, boo
                     matrix[ind][ind2] = 1.0;
                     matrix[ind2][ind] = 1.0;
                 }
-                
+
                 if (tri)
                 {
                     if (i < l - 1 && j < m - 1)
@@ -204,7 +222,7 @@ std::vector< std::map<int, double> > gen_3d_mesh_matrix(int l, int m, int n, boo
             }
         }
     }
-    
+
     return matrix;
 }
 
@@ -215,21 +233,21 @@ std::vector< std::map<int, double> > gen_3d_mesh_matrix(int l, int m, int n, boo
 
 
 
-int main(int argc, char *argv[])
+int main(int, char **)
 {
   srand(42);
   std::cout << "-- Generating matrix --" << std::endl;
-  std::size_t dof_per_dim = 64;   //number of grid points per coordinate direction
-  std::size_t n = dof_per_dim * dof_per_dim * dof_per_dim; //total number of unknowns
+  int dof_per_dim = 64;   //number of grid points per coordinate direction
+  int n = dof_per_dim * dof_per_dim * dof_per_dim; //total number of unknowns
   std::vector< std::map<int, double> > matrix = gen_3d_mesh_matrix(dof_per_dim, dof_per_dim, dof_per_dim, false);  //If last parameter is 'true', a tetrahedral grid instead of a hexahedral grid is used.
-  
+
   //
   // Shuffle the generated matrix
   //
   std::vector<int> r = generate_random_reordering(n);
   std::vector< std::map<int, double> > matrix2 = reorder_matrix(matrix, r);
-  
-  
+
+
   //
   // Print some statistics:
   //
@@ -242,8 +260,9 @@ int main(int argc, char *argv[])
   //
   std::cout << "-- Cuthill-McKee algorithm --" << std::endl;
   r = viennacl::reorder(matrix2, viennacl::cuthill_mckee_tag());
+  r = viennacl::reorder(matrix2, viennacl::cuthill_mckee_tag());
   std::cout << " * Reordered bandwidth: " << calc_reordered_bw(matrix2, r) << std::endl;
-  
+
   //
   // Reorder using advanced Cuthill-McKee algorithm
   //
@@ -252,18 +271,18 @@ int main(int argc, char *argv[])
   std::size_t gmax = 1;
   r = viennacl::reorder(matrix2, viennacl::advanced_cuthill_mckee_tag(a, gmax));
   std::cout << " * Reordered bandwidth: " << calc_reordered_bw(matrix2, r) << std::endl;
-  
+
   //
   // Reorder using Gibbs-Poole-Stockmeyer algorithm
   //
   std::cout << "-- Gibbs-Poole-Stockmeyer algorithm --" << std::endl;
   r = viennacl::reorder(matrix2, viennacl::gibbs_poole_stockmeyer_tag());
   std::cout << " * Reordered bandwidth: " << calc_reordered_bw(matrix2, r) << std::endl;
-    
+
   //
   //  That's it.
   //
   std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-    
+
   return EXIT_SUCCESS;
 }
diff --git a/examples/tutorial/blas1.cpp b/examples/tutorial/blas1.cpp
index e9c52d2..34df64a 100644
--- a/examples/tutorial/blas1.cpp
+++ b/examples/tutorial/blas1.cpp
@@ -1,205 +1,216 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-// include necessary system headers
-#include <iostream>
-
-//include basic scalar and vector types of ViennaCL
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-
-//include the generic inner product functions of ViennaCL
-#include "viennacl/linalg/inner_prod.hpp"
-
-//include the generic norm functions of ViennaCL
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-
-/*
-*   Tutorial: BLAS level 1 functionality
-*/
-
-int main()
-{
-  //Change this type definition to double if your gpu supports that
-  typedef float       ScalarType;
-  
-  /////////////////////////////////////////////////
-  ///////////// Scalar operations /////////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Define a few CPU scalars:
-  //
-  ScalarType s1 = ScalarType(3.1415926);   //note: writing ScalarType s1 = 3.1415926; leads to warnings with some compilers if ScalarType is 'float'.
-  ScalarType s2 = ScalarType(2.71763);
-  ScalarType s3 = ScalarType(42.0);
-  
-  //
-  // ViennaCL scalars are defined in the same way:
-  //  
-  //std::cout << "Creating a few scalars..." << std::endl;
-  viennacl::scalar<ScalarType> vcl_s1;
-  viennacl::scalar<ScalarType> vcl_s2 = ScalarType(1.0);
-  viennacl::scalar<ScalarType> vcl_s3 = ScalarType(1.0);
-
-  //
-  // CPU scalars can be transparently assigned to GPU scalars and vice versa:
-  //
-  std::cout << "Copying a few scalars..." << std::endl;
-  vcl_s1 = s1;
-  s2 = vcl_s2;
-  vcl_s3 = s3;
-  
-  //
-  // Operations between GPU scalars work just as for CPU scalars:
-  // (Note that such single compute kernels on the GPU are considerably slower than on the CPU)
-  //
-  
-  std::cout << "Manipulating a few scalars..." << std::endl;
-  s1 += s2;
-  vcl_s1 += vcl_s2;
-  
-  s1 *= s2;
-  vcl_s1 *= vcl_s2;
-  
-  s1 -= s2;
-  vcl_s1 -= vcl_s2;
-
-  s1 /= s2;
-  vcl_s1 /= vcl_s2;
-
-  s1 = s2 + s3;
-  vcl_s1 = vcl_s2 + vcl_s3;
-  
-  s1 = s2 + s3 * s2 - s3 / s1;
-  vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;
-  
-  
-  //
-  // Operations can also be mixed:
-  //
-
-  vcl_s1 = s1 * vcl_s2 + s3 - vcl_s3;
-  
-  
-  //
-  // Output stream is overloaded as well:
-  //
-  
-  std::cout << "CPU scalar s3: " << s3 << std::endl;
-  std::cout << "GPU scalar vcl_s3: " << vcl_s3 << std::endl;
-
-  
-  /////////////////////////////////////////////////
-  ///////////// Vector operations /////////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Define a few vectors (from STL and plain C) and viennacl::vectors
-  //
-  std::vector<ScalarType>      std_vec1(10);
-  std::vector<ScalarType>      std_vec2(10);
-  ScalarType                   plain_vec3[10];  //plain C array
-
-  viennacl::vector<ScalarType> vcl_vec1(10);
-  viennacl::vector<ScalarType> vcl_vec2(10);
-  viennacl::vector<ScalarType> vcl_vec3(10);
-
-  //
-  // Let us fill the CPU vectors with random values:
-  // (random<> is a helper function from Random.hpp)
-  //
-  
-  for (unsigned int i = 0; i < 10; ++i)
-  {
-    std_vec1[i] = random<ScalarType>(); 
-    vcl_vec2(i) = random<ScalarType>();  //also works for GPU vectors, but is MUCH slower (approx. factor 10.000) than the CPU analogue
-    plain_vec3[i] = random<ScalarType>(); 
-  }
-  
-  //
-  // Copy the CPU vectors to the GPU vectors and vice versa
-  //
-  viennacl::copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //either the STL way
-  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin()); //either the STL way
-  viennacl::copy(vcl_vec2, std_vec2);                                 //using the short hand notation for objects that provide .begin() and .end() members
-  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), plain_vec3);       //copy to plain C vector
-
-  // 
-  // Also partial copies by providing the corresponding iterators are possible:
-  //
-  viennacl::copy(std_vec1.begin() + 4, std_vec1.begin() + 8, vcl_vec1.begin() + 4);   //cpu to gpu
-  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, vcl_vec2.begin() + 1);   //gpu to gpu
-  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, std_vec1.begin() + 1);   //gpu to cpu
-
-  //
-  // Compute the inner product of two GPU vectors and write the result to either CPU or GPU
-  //
-  
-  vcl_s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
-  s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
-  s2 = viennacl::linalg::inner_prod(std_vec1, std_vec2); //inner prod can also be used with std::vector (computations are carried out on CPU then)
-
-  //
-  // Compute norms:
-  //
-
-  s1 = viennacl::linalg::norm_1(vcl_vec1);
-  vcl_s2 = viennacl::linalg::norm_2(vcl_vec2);
-  s3 = viennacl::linalg::norm_inf(vcl_vec3);
-
-  //
-  // Plane rotation of two vectors:
-  //
-
-  viennacl::linalg::plane_rotation(vcl_vec1, vcl_vec2, 1.1f, 2.3f);
-
-  //
-  // Use viennacl::vector via the overloaded operators just as you would write it on paper:
-  //
-  
-  //simple expression:
-  vcl_vec1 = vcl_s1 * vcl_vec2 / vcl_s3;
-  
-  //more complicated expression:
-  vcl_vec1 = vcl_vec2 / vcl_s3 + vcl_s2 * (vcl_vec1 - vcl_s2 * vcl_vec2);
-
-  //
-  // Swap the content of two vectors without a temporary vector:
-  //
-
-  viennacl::swap(vcl_vec1, vcl_vec2);  //swaps all entries in memory
-  viennacl::fast_swap(vcl_vec1, vcl_vec2); //swaps OpenCL memory handles only
-  
-  //
-  // The vectors can also be cleared directly:
-  //
-  vcl_vec1.clear();
-  vcl_vec2.clear();
-  
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-
-  return EXIT_SUCCESS;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: BLAS level 1 functionality (blas1.cpp and blas1.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+int main()
+{
+  //Change this type definition to double if your gpu supports that
+  typedef float       ScalarType;
+
+  /////////////////////////////////////////////////
+  ///////////// Scalar operations /////////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Define a few CPU scalars:
+  //
+  ScalarType s1 = ScalarType(3.1415926);   //note: writing ScalarType s1 = 3.1415926; leads to warnings with some compilers if ScalarType is 'float'.
+  ScalarType s2 = ScalarType(2.71763);
+  ScalarType s3 = ScalarType(42.0);
+
+  //
+  // ViennaCL scalars are defined in the same way:
+  //
+  //std::cout << "Creating a few scalars..." << std::endl;
+  viennacl::scalar<ScalarType> vcl_s1;
+  viennacl::scalar<ScalarType> vcl_s2 = ScalarType(1.0);
+  viennacl::scalar<ScalarType> vcl_s3 = ScalarType(1.0);
+
+  //
+  // CPU scalars can be transparently assigned to GPU scalars and vice versa:
+  //
+  std::cout << "Copying a few scalars..." << std::endl;
+  vcl_s1 = s1;
+  s2 = vcl_s2;
+  vcl_s3 = s3;
+
+  //
+  // Operations between GPU scalars work just as for CPU scalars:
+  // (Note that such single compute kernels on the GPU are considerably slower than on the CPU)
+  //
+
+  std::cout << "Manipulating a few scalars..." << std::endl;
+  std::cout << "operator +=" << std::endl;
+  s1 += s2;
+  vcl_s1 += vcl_s2;
+
+  /*std::cout << "operator *=" << std::endl;
+  s1 *= s2;
+  vcl_s1 *= vcl_s2;
+
+  std::cout << "operator -=" << std::endl;
+  s1 -= s2;
+  vcl_s1 -= vcl_s2;
+
+  std::cout << "operator /=" << std::endl;
+  s1 /= s2;
+  vcl_s1 /= vcl_s2;*/
+
+  std::cout << "operator +" << std::endl;
+  s1 = s2 + s3;
+  vcl_s1 = vcl_s2 + vcl_s3;
+
+  std::cout << "multiple operators" << std::endl;
+  s1 = s2 + s3 * s2 - s3 / s1;
+  vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;
+
+
+  //
+  // Operations can also be mixed:
+  //
+
+  std::cout << "mixed operations" << std::endl;
+  vcl_s1 = s1 * vcl_s2 + s3 - vcl_s3;
+
+
+  //
+  // Output stream is overloaded as well:
+  //
+
+  std::cout << "CPU scalar s3: " << s3 << std::endl;
+  std::cout << "GPU scalar vcl_s3: " << vcl_s3 << std::endl;
+
+
+  /////////////////////////////////////////////////
+  ///////////// Vector operations /////////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Define a few vectors (from STL and plain C) and viennacl::vectors
+  //
+  std::vector<ScalarType>      std_vec1(10);
+  std::vector<ScalarType>      std_vec2(10);
+  ScalarType                   plain_vec3[10];  //plain C array
+
+  viennacl::vector<ScalarType> vcl_vec1(10);
+  viennacl::vector<ScalarType> vcl_vec2(10);
+  viennacl::vector<ScalarType> vcl_vec3(10);
+
+  //
+  // Let us fill the CPU vectors with random values:
+  // (random<> is a helper function from Random.hpp)
+  //
+
+  for (unsigned int i = 0; i < 10; ++i)
+  {
+    std_vec1[i] = random<ScalarType>();
+    vcl_vec2(i) = random<ScalarType>();  //also works for GPU vectors, but is MUCH slower (approx. factor 10.000) than the CPU analogue
+    plain_vec3[i] = random<ScalarType>();
+  }
+
+  //
+  // Copy the CPU vectors to the GPU vectors and vice versa
+  //
+  viennacl::copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //either the STL way
+  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin()); //either the STL way
+  viennacl::copy(vcl_vec2, std_vec2);                                 //using the short hand notation for objects that provide .begin() and .end() members
+  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), plain_vec3);       //copy to plain C vector
+
+  //
+  // Also partial copies by providing the corresponding iterators are possible:
+  //
+  viennacl::copy(std_vec1.begin() + 4, std_vec1.begin() + 8, vcl_vec1.begin() + 4);   //cpu to gpu
+  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, vcl_vec2.begin() + 1);   //gpu to gpu
+  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, std_vec1.begin() + 1);   //gpu to cpu
+
+  //
+  // Compute the inner product of two GPU vectors and write the result to either CPU or GPU
+  //
+
+  vcl_s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
+  s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
+  s2 = viennacl::linalg::inner_prod(std_vec1, std_vec2); //inner prod can also be used with std::vector (computations are carried out on CPU then)
+
+  //
+  // Compute norms:
+  //
+
+  s1 = viennacl::linalg::norm_1(vcl_vec1);
+  vcl_s2 = viennacl::linalg::norm_2(vcl_vec2);
+  s3 = viennacl::linalg::norm_inf(vcl_vec3);
+
+  //
+  // Plane rotation of two vectors:
+  //
+
+  viennacl::linalg::plane_rotation(vcl_vec1, vcl_vec2, 1.1f, 2.3f);
+
+  //
+  // Use viennacl::vector via the overloaded operators just as you would write it on paper:
+  //
+
+  //simple expression:
+  vcl_vec1 = vcl_s1 * vcl_vec2 / vcl_s3;
+
+  //more complicated expression:
+  vcl_vec1 = vcl_vec2 / vcl_s3 + vcl_s2 * (vcl_vec1 - vcl_s2 * vcl_vec2);
+
+  //
+  // Swap the content of two vectors without a temporary vector:
+  //
+
+  viennacl::swap(vcl_vec1, vcl_vec2);  //swaps all entries in memory
+  viennacl::fast_swap(vcl_vec1, vcl_vec2); //swaps OpenCL memory handles only
+
+  //
+  // The vectors can also be cleared directly:
+  //
+  vcl_vec1.clear();
+  vcl_vec2.clear();
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/blas1.cpp b/examples/tutorial/blas1.cu
similarity index 89%
copy from examples/tutorial/blas1.cpp
copy to examples/tutorial/blas1.cu
index e9c52d2..34df64a 100644
--- a/examples/tutorial/blas1.cpp
+++ b/examples/tutorial/blas1.cu
@@ -1,205 +1,216 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-// include necessary system headers
-#include <iostream>
-
-//include basic scalar and vector types of ViennaCL
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-
-//include the generic inner product functions of ViennaCL
-#include "viennacl/linalg/inner_prod.hpp"
-
-//include the generic norm functions of ViennaCL
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-
-/*
-*   Tutorial: BLAS level 1 functionality
-*/
-
-int main()
-{
-  //Change this type definition to double if your gpu supports that
-  typedef float       ScalarType;
-  
-  /////////////////////////////////////////////////
-  ///////////// Scalar operations /////////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Define a few CPU scalars:
-  //
-  ScalarType s1 = ScalarType(3.1415926);   //note: writing ScalarType s1 = 3.1415926; leads to warnings with some compilers if ScalarType is 'float'.
-  ScalarType s2 = ScalarType(2.71763);
-  ScalarType s3 = ScalarType(42.0);
-  
-  //
-  // ViennaCL scalars are defined in the same way:
-  //  
-  //std::cout << "Creating a few scalars..." << std::endl;
-  viennacl::scalar<ScalarType> vcl_s1;
-  viennacl::scalar<ScalarType> vcl_s2 = ScalarType(1.0);
-  viennacl::scalar<ScalarType> vcl_s3 = ScalarType(1.0);
-
-  //
-  // CPU scalars can be transparently assigned to GPU scalars and vice versa:
-  //
-  std::cout << "Copying a few scalars..." << std::endl;
-  vcl_s1 = s1;
-  s2 = vcl_s2;
-  vcl_s3 = s3;
-  
-  //
-  // Operations between GPU scalars work just as for CPU scalars:
-  // (Note that such single compute kernels on the GPU are considerably slower than on the CPU)
-  //
-  
-  std::cout << "Manipulating a few scalars..." << std::endl;
-  s1 += s2;
-  vcl_s1 += vcl_s2;
-  
-  s1 *= s2;
-  vcl_s1 *= vcl_s2;
-  
-  s1 -= s2;
-  vcl_s1 -= vcl_s2;
-
-  s1 /= s2;
-  vcl_s1 /= vcl_s2;
-
-  s1 = s2 + s3;
-  vcl_s1 = vcl_s2 + vcl_s3;
-  
-  s1 = s2 + s3 * s2 - s3 / s1;
-  vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;
-  
-  
-  //
-  // Operations can also be mixed:
-  //
-
-  vcl_s1 = s1 * vcl_s2 + s3 - vcl_s3;
-  
-  
-  //
-  // Output stream is overloaded as well:
-  //
-  
-  std::cout << "CPU scalar s3: " << s3 << std::endl;
-  std::cout << "GPU scalar vcl_s3: " << vcl_s3 << std::endl;
-
-  
-  /////////////////////////////////////////////////
-  ///////////// Vector operations /////////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Define a few vectors (from STL and plain C) and viennacl::vectors
-  //
-  std::vector<ScalarType>      std_vec1(10);
-  std::vector<ScalarType>      std_vec2(10);
-  ScalarType                   plain_vec3[10];  //plain C array
-
-  viennacl::vector<ScalarType> vcl_vec1(10);
-  viennacl::vector<ScalarType> vcl_vec2(10);
-  viennacl::vector<ScalarType> vcl_vec3(10);
-
-  //
-  // Let us fill the CPU vectors with random values:
-  // (random<> is a helper function from Random.hpp)
-  //
-  
-  for (unsigned int i = 0; i < 10; ++i)
-  {
-    std_vec1[i] = random<ScalarType>(); 
-    vcl_vec2(i) = random<ScalarType>();  //also works for GPU vectors, but is MUCH slower (approx. factor 10.000) than the CPU analogue
-    plain_vec3[i] = random<ScalarType>(); 
-  }
-  
-  //
-  // Copy the CPU vectors to the GPU vectors and vice versa
-  //
-  viennacl::copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //either the STL way
-  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin()); //either the STL way
-  viennacl::copy(vcl_vec2, std_vec2);                                 //using the short hand notation for objects that provide .begin() and .end() members
-  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), plain_vec3);       //copy to plain C vector
-
-  // 
-  // Also partial copies by providing the corresponding iterators are possible:
-  //
-  viennacl::copy(std_vec1.begin() + 4, std_vec1.begin() + 8, vcl_vec1.begin() + 4);   //cpu to gpu
-  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, vcl_vec2.begin() + 1);   //gpu to gpu
-  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, std_vec1.begin() + 1);   //gpu to cpu
-
-  //
-  // Compute the inner product of two GPU vectors and write the result to either CPU or GPU
-  //
-  
-  vcl_s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
-  s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
-  s2 = viennacl::linalg::inner_prod(std_vec1, std_vec2); //inner prod can also be used with std::vector (computations are carried out on CPU then)
-
-  //
-  // Compute norms:
-  //
-
-  s1 = viennacl::linalg::norm_1(vcl_vec1);
-  vcl_s2 = viennacl::linalg::norm_2(vcl_vec2);
-  s3 = viennacl::linalg::norm_inf(vcl_vec3);
-
-  //
-  // Plane rotation of two vectors:
-  //
-
-  viennacl::linalg::plane_rotation(vcl_vec1, vcl_vec2, 1.1f, 2.3f);
-
-  //
-  // Use viennacl::vector via the overloaded operators just as you would write it on paper:
-  //
-  
-  //simple expression:
-  vcl_vec1 = vcl_s1 * vcl_vec2 / vcl_s3;
-  
-  //more complicated expression:
-  vcl_vec1 = vcl_vec2 / vcl_s3 + vcl_s2 * (vcl_vec1 - vcl_s2 * vcl_vec2);
-
-  //
-  // Swap the content of two vectors without a temporary vector:
-  //
-
-  viennacl::swap(vcl_vec1, vcl_vec2);  //swaps all entries in memory
-  viennacl::fast_swap(vcl_vec1, vcl_vec2); //swaps OpenCL memory handles only
-  
-  //
-  // The vectors can also be cleared directly:
-  //
-  vcl_vec1.clear();
-  vcl_vec2.clear();
-  
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-
-  return EXIT_SUCCESS;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: BLAS level 1 functionality (blas1.cpp and blas1.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+int main()
+{
+  //Change this type definition to double if your gpu supports that
+  typedef float       ScalarType;
+
+  /////////////////////////////////////////////////
+  ///////////// Scalar operations /////////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Define a few CPU scalars:
+  //
+  ScalarType s1 = ScalarType(3.1415926);   //note: writing ScalarType s1 = 3.1415926; leads to warnings with some compilers if ScalarType is 'float'.
+  ScalarType s2 = ScalarType(2.71763);
+  ScalarType s3 = ScalarType(42.0);
+
+  //
+  // ViennaCL scalars are defined in the same way:
+  //
+  //std::cout << "Creating a few scalars..." << std::endl;
+  viennacl::scalar<ScalarType> vcl_s1;
+  viennacl::scalar<ScalarType> vcl_s2 = ScalarType(1.0);
+  viennacl::scalar<ScalarType> vcl_s3 = ScalarType(1.0);
+
+  //
+  // CPU scalars can be transparently assigned to GPU scalars and vice versa:
+  //
+  std::cout << "Copying a few scalars..." << std::endl;
+  vcl_s1 = s1;
+  s2 = vcl_s2;
+  vcl_s3 = s3;
+
+  //
+  // Operations between GPU scalars work just as for CPU scalars:
+  // (Note that such single compute kernels on the GPU are considerably slower than on the CPU)
+  //
+
+  std::cout << "Manipulating a few scalars..." << std::endl;
+  std::cout << "operator +=" << std::endl;
+  s1 += s2;
+  vcl_s1 += vcl_s2;
+
+  /*std::cout << "operator *=" << std::endl;
+  s1 *= s2;
+  vcl_s1 *= vcl_s2;
+
+  std::cout << "operator -=" << std::endl;
+  s1 -= s2;
+  vcl_s1 -= vcl_s2;
+
+  std::cout << "operator /=" << std::endl;
+  s1 /= s2;
+  vcl_s1 /= vcl_s2;*/
+
+  std::cout << "operator +" << std::endl;
+  s1 = s2 + s3;
+  vcl_s1 = vcl_s2 + vcl_s3;
+
+  std::cout << "multiple operators" << std::endl;
+  s1 = s2 + s3 * s2 - s3 / s1;
+  vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;
+
+
+  //
+  // Operations can also be mixed:
+  //
+
+  std::cout << "mixed operations" << std::endl;
+  vcl_s1 = s1 * vcl_s2 + s3 - vcl_s3;
+
+
+  //
+  // Output stream is overloaded as well:
+  //
+
+  std::cout << "CPU scalar s3: " << s3 << std::endl;
+  std::cout << "GPU scalar vcl_s3: " << vcl_s3 << std::endl;
+
+
+  /////////////////////////////////////////////////
+  ///////////// Vector operations /////////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Define a few vectors (from STL and plain C) and viennacl::vectors
+  //
+  std::vector<ScalarType>      std_vec1(10);
+  std::vector<ScalarType>      std_vec2(10);
+  ScalarType                   plain_vec3[10];  //plain C array
+
+  viennacl::vector<ScalarType> vcl_vec1(10);
+  viennacl::vector<ScalarType> vcl_vec2(10);
+  viennacl::vector<ScalarType> vcl_vec3(10);
+
+  //
+  // Let us fill the CPU vectors with random values:
+  // (random<> is a helper function from Random.hpp)
+  //
+
+  for (unsigned int i = 0; i < 10; ++i)
+  {
+    std_vec1[i] = random<ScalarType>();
+    vcl_vec2(i) = random<ScalarType>();  //also works for GPU vectors, but is MUCH slower (approx. factor 10.000) than the CPU analogue
+    plain_vec3[i] = random<ScalarType>();
+  }
+
+  //
+  // Copy the CPU vectors to the GPU vectors and vice versa
+  //
+  viennacl::copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //either the STL way
+  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin()); //either the STL way
+  viennacl::copy(vcl_vec2, std_vec2);                                 //using the short hand notation for objects that provide .begin() and .end() members
+  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), plain_vec3);       //copy to plain C vector
+
+  //
+  // Also partial copies by providing the corresponding iterators are possible:
+  //
+  viennacl::copy(std_vec1.begin() + 4, std_vec1.begin() + 8, vcl_vec1.begin() + 4);   //cpu to gpu
+  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, vcl_vec2.begin() + 1);   //gpu to gpu
+  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, std_vec1.begin() + 1);   //gpu to cpu
+
+  //
+  // Compute the inner product of two GPU vectors and write the result to either CPU or GPU
+  //
+
+  vcl_s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
+  s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
+  s2 = viennacl::linalg::inner_prod(std_vec1, std_vec2); //inner prod can also be used with std::vector (computations are carried out on CPU then)
+
+  //
+  // Compute norms:
+  //
+
+  s1 = viennacl::linalg::norm_1(vcl_vec1);
+  vcl_s2 = viennacl::linalg::norm_2(vcl_vec2);
+  s3 = viennacl::linalg::norm_inf(vcl_vec3);
+
+  //
+  // Plane rotation of two vectors:
+  //
+
+  viennacl::linalg::plane_rotation(vcl_vec1, vcl_vec2, 1.1f, 2.3f);
+
+  //
+  // Use viennacl::vector via the overloaded operators just as you would write it on paper:
+  //
+
+  //simple expression:
+  vcl_vec1 = vcl_s1 * vcl_vec2 / vcl_s3;
+
+  //more complicated expression:
+  vcl_vec1 = vcl_vec2 / vcl_s3 + vcl_s2 * (vcl_vec1 - vcl_s2 * vcl_vec2);
+
+  //
+  // Swap the content of two vectors without a temporary vector:
+  //
+
+  viennacl::swap(vcl_vec1, vcl_vec2);  //swaps all entries in memory
+  viennacl::fast_swap(vcl_vec1, vcl_vec2); //swaps OpenCL memory handles only
+
+  //
+  // The vectors can also be cleared directly:
+  //
+  vcl_vec1.clear();
+  vcl_vec2.clear();
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/blas2.cpp b/examples/tutorial/blas2.cpp
index 9b57ef4..8ee2d53 100644
--- a/examples/tutorial/blas2.cpp
+++ b/examples/tutorial/blas2.cpp
@@ -1,258 +1,263 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "viennacl/linalg/prod.hpp"       //generic matrix-vector product
-#include "viennacl/linalg/norm_2.hpp"     //generic l2-norm for vectors
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-/*
-*   Tutorial: BLAS level 2 functionality
-*   
-*/
-
-using namespace boost::numeric;
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs(12);
-  for (unsigned int i = 0; i < rhs.size(); ++i)
-    rhs(i) = random<ScalarType>();
-  ublas::vector<ScalarType> rhs2 = rhs;
-  ublas::vector<ScalarType> result = ublas::zero_vector<ScalarType>(10);
-  ublas::vector<ScalarType> result2 = result;
-  ublas::vector<ScalarType> rhs_trans = rhs;
-  rhs_trans.resize(result.size(), true);
-  ublas::vector<ScalarType> result_trans = ublas::zero_vector<ScalarType>(rhs.size());
-
-  
-  ublas::matrix<ScalarType> matrix(result.size(),rhs.size());
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < matrix.size1(); ++i)
-    for (unsigned int j = 0; j < matrix.size2(); ++j)
-      matrix(i,j) = random<ScalarType>();
-    
-  //
-  // Use some plain STL types:
-  //
-  std::vector< ScalarType > stl_result(result.size());
-  std::vector< ScalarType > stl_rhs(rhs.size());
-  std::vector< std::vector<ScalarType> > stl_matrix(result.size());
-  for (unsigned int i=0; i < result.size(); ++i)
-  {
-    stl_matrix[i].resize(rhs.size());
-    for (unsigned int j = 0; j < matrix.size2(); ++j)
-    {
-      stl_rhs[j] = rhs[j];
-      stl_matrix[i][j] = matrix(i,j);
-    }
-  }
-
-  //
-  // Set up some ViennaCL objects
-  //
-  viennacl::vector<ScalarType> vcl_rhs(rhs.size());
-  viennacl::vector<ScalarType> vcl_result(result.size()); 
-  viennacl::matrix<ScalarType> vcl_matrix(result.size(), rhs.size());
-  viennacl::matrix<ScalarType> vcl_matrix2(result.size(), rhs.size());
-
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  viennacl::copy(matrix, vcl_matrix);     //copy from ublas dense matrix type to ViennaCL type
-
-  //
-  // Some basic matrix operations
-  //
-  vcl_matrix2 = vcl_matrix;
-  vcl_matrix2 += vcl_matrix;
-  vcl_matrix2 -= vcl_matrix;
-  vcl_matrix2 = vcl_matrix2 + vcl_matrix;
-  vcl_matrix2 = vcl_matrix2 - vcl_matrix;
-  
-  viennacl::scalar<ScalarType> vcl_3(3.0);
-  vcl_matrix2 *= ScalarType(2.0);
-  vcl_matrix2 /= ScalarType(2.0);
-  vcl_matrix2 *= vcl_3;
-  vcl_matrix2 /= vcl_3;
-
-  //
-  // A matrix can be cleared directly:
-  //
-  vcl_matrix.clear();
-  
-  viennacl::copy(stl_matrix, vcl_matrix); //alternative: copy from STL vector< vector<> > type to ViennaCL type
-
-  //for demonstration purposes (no effect):
-  viennacl::copy(vcl_matrix, matrix); //copy back from ViennaCL to ublas type.
-  viennacl::copy(vcl_matrix, stl_matrix); //copy back from ViennaCL to STL type.
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix vector products /////////////
-  /////////////////////////////////////////////////
-  
-  
-  //
-  // Compute matrix-vector products
-  //
-  std::cout << "----- Matrix-Vector product -----" << std::endl;
-  result = ublas::prod(matrix, rhs);                            //the ublas way
-  stl_result = viennacl::linalg::prod(stl_matrix, stl_rhs);     //using STL
-  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);     //the ViennaCL way
-  
-  //
-  // Compute transposed matrix-vector products
-  //
-  std::cout << "----- Transposed Matrix-Vector product -----" << std::endl;
-  result_trans = prod(trans(matrix), rhs_trans);
-  
-  viennacl::vector<ScalarType> vcl_rhs_trans(rhs_trans.size());
-  viennacl::vector<ScalarType> vcl_result_trans(result_trans.size()); 
-  viennacl::copy(rhs_trans.begin(), rhs_trans.end(), vcl_rhs_trans.begin());
-  vcl_result_trans = viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans);
-  
-  
-  
-  /////////////////////////////////////////////////
-  //////////////// Direct solver  /////////////////
-  /////////////////////////////////////////////////
-  
-  
-  //
-  // Setup suitable matrices
-  //
-  ublas::matrix<ScalarType> tri_matrix(10,10);
-  for (size_t i=0; i<tri_matrix.size1(); ++i)
-  {
-    for (size_t j=0; j<i; ++j)
-      tri_matrix(i,j) = 0.0;
-
-    for (size_t j=i; j<tri_matrix.size2(); ++j)
-      tri_matrix(i,j) = matrix(i,j);
-  }
-  
-  viennacl::matrix<ScalarType> vcl_tri_matrix(tri_matrix.size1(), tri_matrix.size2());
-  viennacl::copy(tri_matrix, vcl_tri_matrix);
-  
-  rhs.resize(tri_matrix.size1(), true);
-  rhs2.resize(tri_matrix.size1(), true);
-  vcl_rhs.resize(tri_matrix.size1(), true);
-  
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  vcl_result.resize(10);
-
-  
-  //
-  // Triangular solver
-  //
-  std::cout << "----- Upper Triangular solve -----" << std::endl;
-  result = ublas::solve(tri_matrix, rhs, ublas::upper_tag());                                    //ublas
-  vcl_result = viennacl::linalg::solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
-  
-  //
-  // Inplace variants of the above
-  //
-  ublas::inplace_solve(tri_matrix, rhs, ublas::upper_tag());                                //ublas
-  viennacl::linalg::inplace_solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
-  
-
-  //
-  // Set up a full system for LU solver:
-  // 
-  std::cout << "----- LU factorization -----" << std::endl;
-  size_t lu_dim = 300;
-  ublas::matrix<ScalarType> square_matrix(lu_dim, lu_dim);
-  ublas::vector<ScalarType> lu_rhs(lu_dim);
-  viennacl::matrix<ScalarType> vcl_square_matrix(lu_dim, lu_dim);
-  viennacl::vector<ScalarType> vcl_lu_rhs(lu_dim);
-
-  for (size_t i=0; i<lu_dim; ++i)
-    for (size_t j=0; j<lu_dim; ++j)
-      square_matrix(i,j) = random<ScalarType>();
-
-  //put some more weight on diagonal elements:
-  for (size_t j=0; j<lu_dim; ++j)
-  {
-    square_matrix(j,j) += 10.0;
-    lu_rhs(j) = random<ScalarType>();
-  }
-    
-  viennacl::copy(square_matrix, vcl_square_matrix);
-  viennacl::copy(lu_rhs, vcl_lu_rhs);
-  viennacl::linalg::lu_factorize(vcl_square_matrix);
-  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
-  viennacl::copy(square_matrix, vcl_square_matrix);
-  viennacl::copy(lu_rhs, vcl_lu_rhs);
-
-  
-  //
-  // ublas:
-  //
-  ublas::lu_factorize(square_matrix);
-  ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
-  ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
-
-
-  //
-  // ViennaCL:
-  //
-  viennacl::linalg::lu_factorize(vcl_square_matrix);
-  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: BLAS level 2 functionality (blas2.cpp and blas2.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"       //generic matrix-vector product
+#include "viennacl/linalg/norm_2.hpp"     //generic l2-norm for vectors
+#include "viennacl/linalg/lu.hpp"         //LU substitution routines
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+using namespace boost::numeric;
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs(12);
+  for (unsigned int i = 0; i < rhs.size(); ++i)
+    rhs(i) = random<ScalarType>();
+  ublas::vector<ScalarType> rhs2 = rhs;
+  ublas::vector<ScalarType> result = ublas::zero_vector<ScalarType>(10);
+  ublas::vector<ScalarType> result2 = result;
+  ublas::vector<ScalarType> rhs_trans = rhs;
+  rhs_trans.resize(result.size(), true);
+  ublas::vector<ScalarType> result_trans = ublas::zero_vector<ScalarType>(rhs.size());
+
+
+  ublas::matrix<ScalarType> matrix(result.size(),rhs.size());
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < matrix.size1(); ++i)
+    for (unsigned int j = 0; j < matrix.size2(); ++j)
+      matrix(i,j) = random<ScalarType>();
+
+  //
+  // Use some plain STL types:
+  //
+  std::vector< ScalarType > stl_result(result.size());
+  std::vector< ScalarType > stl_rhs(rhs.size());
+  std::vector< std::vector<ScalarType> > stl_matrix(result.size());
+  for (unsigned int i=0; i < result.size(); ++i)
+  {
+    stl_matrix[i].resize(rhs.size());
+    for (unsigned int j = 0; j < matrix.size2(); ++j)
+    {
+      stl_rhs[j] = rhs[j];
+      stl_matrix[i][j] = matrix(i,j);
+    }
+  }
+
+  //
+  // Set up some ViennaCL objects
+  //
+  viennacl::vector<ScalarType> vcl_rhs(rhs.size());
+  viennacl::vector<ScalarType> vcl_result(result.size());
+  viennacl::matrix<ScalarType> vcl_matrix(result.size(), rhs.size());
+  viennacl::matrix<ScalarType> vcl_matrix2(result.size(), rhs.size());
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(matrix, vcl_matrix);     //copy from ublas dense matrix type to ViennaCL type
+
+  //
+  // Some basic matrix operations
+  //
+  vcl_matrix2 = vcl_matrix;
+  vcl_matrix2 += vcl_matrix;
+  vcl_matrix2 -= vcl_matrix;
+  vcl_matrix2 = vcl_matrix2 + vcl_matrix;
+  vcl_matrix2 = vcl_matrix2 - vcl_matrix;
+
+  viennacl::scalar<ScalarType> vcl_3(3.0);
+  vcl_matrix2 *= ScalarType(2.0);
+  vcl_matrix2 /= ScalarType(2.0);
+  vcl_matrix2 *= vcl_3;
+  vcl_matrix2 /= vcl_3;
+
+  //
+  // A matrix can be cleared directly:
+  //
+  vcl_matrix.clear();
+
+  viennacl::copy(stl_matrix, vcl_matrix); //alternative: copy from STL vector< vector<> > type to ViennaCL type
+
+  //for demonstration purposes (no effect):
+  viennacl::copy(vcl_matrix, matrix); //copy back from ViennaCL to ublas type.
+  viennacl::copy(vcl_matrix, stl_matrix); //copy back from ViennaCL to STL type.
+
+  /////////////////////////////////////////////////
+  //////////// Matrix vector products /////////////
+  /////////////////////////////////////////////////
+
+
+  //
+  // Compute matrix-vector products
+  //
+  std::cout << "----- Matrix-Vector product -----" << std::endl;
+  result = ublas::prod(matrix, rhs);                            //the ublas way
+  stl_result = viennacl::linalg::prod(stl_matrix, stl_rhs);     //using STL
+  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);     //the ViennaCL way
+
+  //
+  // Compute transposed matrix-vector products
+  //
+  std::cout << "----- Transposed Matrix-Vector product -----" << std::endl;
+  result_trans = prod(trans(matrix), rhs_trans);
+
+  viennacl::vector<ScalarType> vcl_rhs_trans(rhs_trans.size());
+  viennacl::vector<ScalarType> vcl_result_trans(result_trans.size());
+  viennacl::copy(rhs_trans.begin(), rhs_trans.end(), vcl_rhs_trans.begin());
+  vcl_result_trans = viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans);
+
+
+
+  /////////////////////////////////////////////////
+  //////////////// Direct solver  /////////////////
+  /////////////////////////////////////////////////
+
+
+  //
+  // Setup suitable matrices
+  //
+  ublas::matrix<ScalarType> tri_matrix(10,10);
+  for (std::size_t i=0; i<tri_matrix.size1(); ++i)
+  {
+    for (std::size_t j=0; j<i; ++j)
+      tri_matrix(i,j) = 0.0;
+
+    for (std::size_t j=i; j<tri_matrix.size2(); ++j)
+      tri_matrix(i,j) = matrix(i,j);
+  }
+
+  viennacl::matrix<ScalarType> vcl_tri_matrix = viennacl::identity_matrix<ScalarType>(tri_matrix.size1());
+  viennacl::copy(tri_matrix, vcl_tri_matrix);
+
+  rhs.resize(tri_matrix.size1(), true);
+  rhs2.resize(tri_matrix.size1(), true);
+  vcl_rhs.resize(tri_matrix.size1(), true);
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  vcl_result.resize(10);
+
+
+  //
+  // Triangular solver
+  //
+  std::cout << "----- Upper Triangular solve -----" << std::endl;
+  result = ublas::solve(tri_matrix, rhs, ublas::upper_tag());                                    //ublas
+  vcl_result = viennacl::linalg::solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
+
+  //
+  // Inplace variants of the above
+  //
+  ublas::inplace_solve(tri_matrix, rhs, ublas::upper_tag());                                //ublas
+  viennacl::linalg::inplace_solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
+
+
+  //
+  // Set up a full system for LU solver:
+  //
+  std::cout << "----- LU factorization -----" << std::endl;
+  std::size_t lu_dim = 300;
+  ublas::matrix<ScalarType> square_matrix(lu_dim, lu_dim);
+  ublas::vector<ScalarType> lu_rhs(lu_dim);
+  viennacl::matrix<ScalarType> vcl_square_matrix(lu_dim, lu_dim);
+  viennacl::vector<ScalarType> vcl_lu_rhs(lu_dim);
+
+  for (std::size_t i=0; i<lu_dim; ++i)
+    for (std::size_t j=0; j<lu_dim; ++j)
+      square_matrix(i,j) = random<ScalarType>();
+
+  //put some more weight on diagonal elements:
+  for (std::size_t j=0; j<lu_dim; ++j)
+  {
+    square_matrix(j,j) += 10.0;
+    lu_rhs(j) = random<ScalarType>();
+  }
+
+  viennacl::copy(square_matrix, vcl_square_matrix);
+  viennacl::copy(lu_rhs, vcl_lu_rhs);
+  viennacl::linalg::lu_factorize(vcl_square_matrix);
+  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
+  viennacl::copy(square_matrix, vcl_square_matrix);
+  viennacl::copy(lu_rhs, vcl_lu_rhs);
+
+
+  //
+  // ublas:
+  //
+  ublas::lu_factorize(square_matrix);
+  ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
+  ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
+
+
+  //
+  // ViennaCL:
+  //
+  viennacl::linalg::lu_factorize(vcl_square_matrix);
+  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/blas2.cpp b/examples/tutorial/blas2.cu
similarity index 88%
copy from examples/tutorial/blas2.cpp
copy to examples/tutorial/blas2.cu
index 9b57ef4..8ee2d53 100644
--- a/examples/tutorial/blas2.cpp
+++ b/examples/tutorial/blas2.cu
@@ -1,258 +1,263 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "viennacl/linalg/prod.hpp"       //generic matrix-vector product
-#include "viennacl/linalg/norm_2.hpp"     //generic l2-norm for vectors
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-/*
-*   Tutorial: BLAS level 2 functionality
-*   
-*/
-
-using namespace boost::numeric;
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs(12);
-  for (unsigned int i = 0; i < rhs.size(); ++i)
-    rhs(i) = random<ScalarType>();
-  ublas::vector<ScalarType> rhs2 = rhs;
-  ublas::vector<ScalarType> result = ublas::zero_vector<ScalarType>(10);
-  ublas::vector<ScalarType> result2 = result;
-  ublas::vector<ScalarType> rhs_trans = rhs;
-  rhs_trans.resize(result.size(), true);
-  ublas::vector<ScalarType> result_trans = ublas::zero_vector<ScalarType>(rhs.size());
-
-  
-  ublas::matrix<ScalarType> matrix(result.size(),rhs.size());
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < matrix.size1(); ++i)
-    for (unsigned int j = 0; j < matrix.size2(); ++j)
-      matrix(i,j) = random<ScalarType>();
-    
-  //
-  // Use some plain STL types:
-  //
-  std::vector< ScalarType > stl_result(result.size());
-  std::vector< ScalarType > stl_rhs(rhs.size());
-  std::vector< std::vector<ScalarType> > stl_matrix(result.size());
-  for (unsigned int i=0; i < result.size(); ++i)
-  {
-    stl_matrix[i].resize(rhs.size());
-    for (unsigned int j = 0; j < matrix.size2(); ++j)
-    {
-      stl_rhs[j] = rhs[j];
-      stl_matrix[i][j] = matrix(i,j);
-    }
-  }
-
-  //
-  // Set up some ViennaCL objects
-  //
-  viennacl::vector<ScalarType> vcl_rhs(rhs.size());
-  viennacl::vector<ScalarType> vcl_result(result.size()); 
-  viennacl::matrix<ScalarType> vcl_matrix(result.size(), rhs.size());
-  viennacl::matrix<ScalarType> vcl_matrix2(result.size(), rhs.size());
-
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  viennacl::copy(matrix, vcl_matrix);     //copy from ublas dense matrix type to ViennaCL type
-
-  //
-  // Some basic matrix operations
-  //
-  vcl_matrix2 = vcl_matrix;
-  vcl_matrix2 += vcl_matrix;
-  vcl_matrix2 -= vcl_matrix;
-  vcl_matrix2 = vcl_matrix2 + vcl_matrix;
-  vcl_matrix2 = vcl_matrix2 - vcl_matrix;
-  
-  viennacl::scalar<ScalarType> vcl_3(3.0);
-  vcl_matrix2 *= ScalarType(2.0);
-  vcl_matrix2 /= ScalarType(2.0);
-  vcl_matrix2 *= vcl_3;
-  vcl_matrix2 /= vcl_3;
-
-  //
-  // A matrix can be cleared directly:
-  //
-  vcl_matrix.clear();
-  
-  viennacl::copy(stl_matrix, vcl_matrix); //alternative: copy from STL vector< vector<> > type to ViennaCL type
-
-  //for demonstration purposes (no effect):
-  viennacl::copy(vcl_matrix, matrix); //copy back from ViennaCL to ublas type.
-  viennacl::copy(vcl_matrix, stl_matrix); //copy back from ViennaCL to STL type.
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix vector products /////////////
-  /////////////////////////////////////////////////
-  
-  
-  //
-  // Compute matrix-vector products
-  //
-  std::cout << "----- Matrix-Vector product -----" << std::endl;
-  result = ublas::prod(matrix, rhs);                            //the ublas way
-  stl_result = viennacl::linalg::prod(stl_matrix, stl_rhs);     //using STL
-  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);     //the ViennaCL way
-  
-  //
-  // Compute transposed matrix-vector products
-  //
-  std::cout << "----- Transposed Matrix-Vector product -----" << std::endl;
-  result_trans = prod(trans(matrix), rhs_trans);
-  
-  viennacl::vector<ScalarType> vcl_rhs_trans(rhs_trans.size());
-  viennacl::vector<ScalarType> vcl_result_trans(result_trans.size()); 
-  viennacl::copy(rhs_trans.begin(), rhs_trans.end(), vcl_rhs_trans.begin());
-  vcl_result_trans = viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans);
-  
-  
-  
-  /////////////////////////////////////////////////
-  //////////////// Direct solver  /////////////////
-  /////////////////////////////////////////////////
-  
-  
-  //
-  // Setup suitable matrices
-  //
-  ublas::matrix<ScalarType> tri_matrix(10,10);
-  for (size_t i=0; i<tri_matrix.size1(); ++i)
-  {
-    for (size_t j=0; j<i; ++j)
-      tri_matrix(i,j) = 0.0;
-
-    for (size_t j=i; j<tri_matrix.size2(); ++j)
-      tri_matrix(i,j) = matrix(i,j);
-  }
-  
-  viennacl::matrix<ScalarType> vcl_tri_matrix(tri_matrix.size1(), tri_matrix.size2());
-  viennacl::copy(tri_matrix, vcl_tri_matrix);
-  
-  rhs.resize(tri_matrix.size1(), true);
-  rhs2.resize(tri_matrix.size1(), true);
-  vcl_rhs.resize(tri_matrix.size1(), true);
-  
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  vcl_result.resize(10);
-
-  
-  //
-  // Triangular solver
-  //
-  std::cout << "----- Upper Triangular solve -----" << std::endl;
-  result = ublas::solve(tri_matrix, rhs, ublas::upper_tag());                                    //ublas
-  vcl_result = viennacl::linalg::solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
-  
-  //
-  // Inplace variants of the above
-  //
-  ublas::inplace_solve(tri_matrix, rhs, ublas::upper_tag());                                //ublas
-  viennacl::linalg::inplace_solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
-  
-
-  //
-  // Set up a full system for LU solver:
-  // 
-  std::cout << "----- LU factorization -----" << std::endl;
-  size_t lu_dim = 300;
-  ublas::matrix<ScalarType> square_matrix(lu_dim, lu_dim);
-  ublas::vector<ScalarType> lu_rhs(lu_dim);
-  viennacl::matrix<ScalarType> vcl_square_matrix(lu_dim, lu_dim);
-  viennacl::vector<ScalarType> vcl_lu_rhs(lu_dim);
-
-  for (size_t i=0; i<lu_dim; ++i)
-    for (size_t j=0; j<lu_dim; ++j)
-      square_matrix(i,j) = random<ScalarType>();
-
-  //put some more weight on diagonal elements:
-  for (size_t j=0; j<lu_dim; ++j)
-  {
-    square_matrix(j,j) += 10.0;
-    lu_rhs(j) = random<ScalarType>();
-  }
-    
-  viennacl::copy(square_matrix, vcl_square_matrix);
-  viennacl::copy(lu_rhs, vcl_lu_rhs);
-  viennacl::linalg::lu_factorize(vcl_square_matrix);
-  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
-  viennacl::copy(square_matrix, vcl_square_matrix);
-  viennacl::copy(lu_rhs, vcl_lu_rhs);
-
-  
-  //
-  // ublas:
-  //
-  ublas::lu_factorize(square_matrix);
-  ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
-  ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
-
-
-  //
-  // ViennaCL:
-  //
-  viennacl::linalg::lu_factorize(vcl_square_matrix);
-  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: BLAS level 2 functionality (blas2.cpp and blas2.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"       //generic matrix-vector product
+#include "viennacl/linalg/norm_2.hpp"     //generic l2-norm for vectors
+#include "viennacl/linalg/lu.hpp"         //LU substitution routines
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+using namespace boost::numeric;
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs(12);
+  for (unsigned int i = 0; i < rhs.size(); ++i)
+    rhs(i) = random<ScalarType>();
+  ublas::vector<ScalarType> rhs2 = rhs;
+  ublas::vector<ScalarType> result = ublas::zero_vector<ScalarType>(10);
+  ublas::vector<ScalarType> result2 = result;
+  ublas::vector<ScalarType> rhs_trans = rhs;
+  rhs_trans.resize(result.size(), true);
+  ublas::vector<ScalarType> result_trans = ublas::zero_vector<ScalarType>(rhs.size());
+
+
+  ublas::matrix<ScalarType> matrix(result.size(),rhs.size());
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < matrix.size1(); ++i)
+    for (unsigned int j = 0; j < matrix.size2(); ++j)
+      matrix(i,j) = random<ScalarType>();
+
+  //
+  // Use some plain STL types:
+  //
+  std::vector< ScalarType > stl_result(result.size());
+  std::vector< ScalarType > stl_rhs(rhs.size());
+  std::vector< std::vector<ScalarType> > stl_matrix(result.size());
+  for (unsigned int i=0; i < result.size(); ++i)
+  {
+    stl_matrix[i].resize(rhs.size());
+    for (unsigned int j = 0; j < matrix.size2(); ++j)
+    {
+      stl_rhs[j] = rhs[j];
+      stl_matrix[i][j] = matrix(i,j);
+    }
+  }
+
+  //
+  // Set up some ViennaCL objects
+  //
+  viennacl::vector<ScalarType> vcl_rhs(rhs.size());
+  viennacl::vector<ScalarType> vcl_result(result.size());
+  viennacl::matrix<ScalarType> vcl_matrix(result.size(), rhs.size());
+  viennacl::matrix<ScalarType> vcl_matrix2(result.size(), rhs.size());
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(matrix, vcl_matrix);     //copy from ublas dense matrix type to ViennaCL type
+
+  //
+  // Some basic matrix operations
+  //
+  vcl_matrix2 = vcl_matrix;
+  vcl_matrix2 += vcl_matrix;
+  vcl_matrix2 -= vcl_matrix;
+  vcl_matrix2 = vcl_matrix2 + vcl_matrix;
+  vcl_matrix2 = vcl_matrix2 - vcl_matrix;
+
+  viennacl::scalar<ScalarType> vcl_3(3.0);
+  vcl_matrix2 *= ScalarType(2.0);
+  vcl_matrix2 /= ScalarType(2.0);
+  vcl_matrix2 *= vcl_3;
+  vcl_matrix2 /= vcl_3;
+
+  //
+  // A matrix can be cleared directly:
+  //
+  vcl_matrix.clear();
+
+  viennacl::copy(stl_matrix, vcl_matrix); //alternative: copy from STL vector< vector<> > type to ViennaCL type
+
+  //for demonstration purposes (no effect):
+  viennacl::copy(vcl_matrix, matrix); //copy back from ViennaCL to ublas type.
+  viennacl::copy(vcl_matrix, stl_matrix); //copy back from ViennaCL to STL type.
+
+  /////////////////////////////////////////////////
+  //////////// Matrix vector products /////////////
+  /////////////////////////////////////////////////
+
+
+  //
+  // Compute matrix-vector products
+  //
+  std::cout << "----- Matrix-Vector product -----" << std::endl;
+  result = ublas::prod(matrix, rhs);                            //the ublas way
+  stl_result = viennacl::linalg::prod(stl_matrix, stl_rhs);     //using STL
+  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);     //the ViennaCL way
+
+  //
+  // Compute transposed matrix-vector products
+  //
+  std::cout << "----- Transposed Matrix-Vector product -----" << std::endl;
+  result_trans = prod(trans(matrix), rhs_trans);
+
+  viennacl::vector<ScalarType> vcl_rhs_trans(rhs_trans.size());
+  viennacl::vector<ScalarType> vcl_result_trans(result_trans.size());
+  viennacl::copy(rhs_trans.begin(), rhs_trans.end(), vcl_rhs_trans.begin());
+  vcl_result_trans = viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans);
+
+
+
+  /////////////////////////////////////////////////
+  //////////////// Direct solver  /////////////////
+  /////////////////////////////////////////////////
+
+
+  //
+  // Setup suitable matrices
+  //
+  ublas::matrix<ScalarType> tri_matrix(10,10);
+  for (std::size_t i=0; i<tri_matrix.size1(); ++i)
+  {
+    for (std::size_t j=0; j<i; ++j)
+      tri_matrix(i,j) = 0.0;
+
+    for (std::size_t j=i; j<tri_matrix.size2(); ++j)
+      tri_matrix(i,j) = matrix(i,j);
+  }
+
+  viennacl::matrix<ScalarType> vcl_tri_matrix = viennacl::identity_matrix<ScalarType>(tri_matrix.size1());
+  viennacl::copy(tri_matrix, vcl_tri_matrix);
+
+  rhs.resize(tri_matrix.size1(), true);
+  rhs2.resize(tri_matrix.size1(), true);
+  vcl_rhs.resize(tri_matrix.size1(), true);
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  vcl_result.resize(10);
+
+
+  //
+  // Triangular solver
+  //
+  std::cout << "----- Upper Triangular solve -----" << std::endl;
+  result = ublas::solve(tri_matrix, rhs, ublas::upper_tag());                                    //ublas
+  vcl_result = viennacl::linalg::solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
+
+  //
+  // Inplace variants of the above
+  //
+  ublas::inplace_solve(tri_matrix, rhs, ublas::upper_tag());                                //ublas
+  viennacl::linalg::inplace_solve(vcl_tri_matrix, vcl_rhs, viennacl::linalg::upper_tag());  //ViennaCL
+
+
+  //
+  // Set up a full system for LU solver:
+  //
+  std::cout << "----- LU factorization -----" << std::endl;
+  std::size_t lu_dim = 300;
+  ublas::matrix<ScalarType> square_matrix(lu_dim, lu_dim);
+  ublas::vector<ScalarType> lu_rhs(lu_dim);
+  viennacl::matrix<ScalarType> vcl_square_matrix(lu_dim, lu_dim);
+  viennacl::vector<ScalarType> vcl_lu_rhs(lu_dim);
+
+  for (std::size_t i=0; i<lu_dim; ++i)
+    for (std::size_t j=0; j<lu_dim; ++j)
+      square_matrix(i,j) = random<ScalarType>();
+
+  //put some more weight on diagonal elements:
+  for (std::size_t j=0; j<lu_dim; ++j)
+  {
+    square_matrix(j,j) += 10.0;
+    lu_rhs(j) = random<ScalarType>();
+  }
+
+  viennacl::copy(square_matrix, vcl_square_matrix);
+  viennacl::copy(lu_rhs, vcl_lu_rhs);
+  viennacl::linalg::lu_factorize(vcl_square_matrix);
+  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
+  viennacl::copy(square_matrix, vcl_square_matrix);
+  viennacl::copy(lu_rhs, vcl_lu_rhs);
+
+
+  //
+  // ublas:
+  //
+  ublas::lu_factorize(square_matrix);
+  ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
+  ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
+
+
+  //
+  // ViennaCL:
+  //
+  viennacl::linalg::lu_factorize(vcl_square_matrix);
+  viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/blas3.cpp b/examples/tutorial/blas3.cpp
index db17e25..9a59433 100644
--- a/examples/tutorial/blas3.cpp
+++ b/examples/tutorial/blas3.cpp
@@ -1,190 +1,188 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//disable debug mechanisms to have a fair comparison with ublas:
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-#include "../benchmarks/benchmark-utils.hpp"
-
-/*
-*   Tutorial: BLAS level 3 functionality
-*   
-*/
-
-#define BLAS3_MATRIX_SIZE   700
-
-using namespace boost::numeric;
-
-int main()
-{
-  typedef float     ScalarType;
-
-  Timer timer;
-  double exec_time;
-
-  //
-  // Set up some ublas objects
-  //
-  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-
-  //
-  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
-  //
-  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-    {
-      ublas_A(i,j) = random<ScalarType>();
-      stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
-    }
-
-  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
-    {
-      ublas_B(i,j) = random<ScalarType>();
-      stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
-    }
-
-  //
-  // Set up some ViennaCL objects
-  //
-  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this is you wish to use GPUs only
-  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix-matrix products /////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Compute reference product using ublas:
-  //
-  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
-  timer.start();
-  ublas_C = ublas::prod(ublas_A, ublas_B);
-  exec_time = timer.get();
-  std::cout << " - Execution time: " << exec_time << std::endl;
-  
-  //
-  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
-  //
-  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
-  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    //viennacl::copy(ublas_A, vcl_A);
-    //viennacl::copy(ublas_B, vcl_B);
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    
-    //
-    // Verify the result
-    //
-    //viennacl::copy(vcl_C, ublas_C1);
-    viennacl::fast_copy(vcl_C, &(stl_C[0]));
-    for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
-      for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
-        ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
-
-    std::cout << " - Checking result... ";
-    bool check_ok = true;
-    for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    {
-      for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-      {
-        if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
-        {
-          check_ok = false;
-          break;
-        }
-      }
-      if (!check_ok)
-        break;
-    }
-    if (check_ok)
-      std::cout << "[OK]" << std::endl << std::endl;
-    else
-      std::cout << "[FAILED]" << std::endl << std::endl;
-      
-  }
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  return EXIT_SUCCESS;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: BLAS level 3 functionality (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//disable debug mechanisms to have a fair comparison with ublas:
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+#include "../benchmarks/benchmark-utils.hpp"
+
+#define BLAS3_MATRIX_SIZE   400
+
+using namespace boost::numeric;
+
+
+#ifndef VIENNACL_WITH_OPENCL
+  struct dummy
+  {
+    std::size_t size() const { return 1; }
+  };
+#endif
+
+
+int main()
+{
+  typedef float     ScalarType;
+
+  Timer timer;
+  double exec_time;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
+    {
+      ublas_A(i,j) = random<ScalarType>();
+    }
+
+  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
+    {
+      ublas_B(i,j) = random<ScalarType>();
+    }
+
+  //
+  // Set up some ViennaCL objects
+  //
+  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this is you wish to use GPUs only
+  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  /////////////////////////////////////////////////
+  //////////// Matrix-matrix products /////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Compute reference product using ublas:
+  //
+  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
+  timer.start();
+  ublas_C = ublas::prod(ublas_A, ublas_B);
+  exec_time = timer.get();
+  std::cout << " - Execution time: " << exec_time << std::endl;
+
+  //
+  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+  //
+  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+#else
+  dummy devices;
+#endif
+
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    timer.start();
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+
+    //
+    // Verify the result
+    //
+    viennacl::copy(vcl_C, ublas_C1);
+
+    std::cout << " - Checking result... ";
+    bool check_ok = true;
+    for (std::size_t i = 0; i < ublas_A.size1(); ++i)
+    {
+      for (std::size_t j = 0; j < ublas_A.size2(); ++j)
+      {
+        if ( std::fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
+        {
+          check_ok = false;
+          break;
+        }
+      }
+      if (!check_ok)
+        break;
+    }
+    if (check_ok)
+      std::cout << "[OK]" << std::endl << std::endl;
+    else
+      std::cout << "[FAILED]" << std::endl << std::endl;
+
+  }
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/blas3.cpp b/examples/tutorial/blas3.cu
similarity index 70%
copy from examples/tutorial/blas3.cpp
copy to examples/tutorial/blas3.cu
index db17e25..9a59433 100644
--- a/examples/tutorial/blas3.cpp
+++ b/examples/tutorial/blas3.cu
@@ -1,190 +1,188 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//disable debug mechanisms to have a fair comparison with ublas:
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-#include "../benchmarks/benchmark-utils.hpp"
-
-/*
-*   Tutorial: BLAS level 3 functionality
-*   
-*/
-
-#define BLAS3_MATRIX_SIZE   700
-
-using namespace boost::numeric;
-
-int main()
-{
-  typedef float     ScalarType;
-
-  Timer timer;
-  double exec_time;
-
-  //
-  // Set up some ublas objects
-  //
-  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-
-  //
-  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
-  //
-  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-    {
-      ublas_A(i,j) = random<ScalarType>();
-      stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
-    }
-
-  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
-    {
-      ublas_B(i,j) = random<ScalarType>();
-      stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
-    }
-
-  //
-  // Set up some ViennaCL objects
-  //
-  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this is you wish to use GPUs only
-  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix-matrix products /////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Compute reference product using ublas:
-  //
-  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
-  timer.start();
-  ublas_C = ublas::prod(ublas_A, ublas_B);
-  exec_time = timer.get();
-  std::cout << " - Execution time: " << exec_time << std::endl;
-  
-  //
-  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
-  //
-  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
-  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    //viennacl::copy(ublas_A, vcl_A);
-    //viennacl::copy(ublas_B, vcl_B);
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    
-    //
-    // Verify the result
-    //
-    //viennacl::copy(vcl_C, ublas_C1);
-    viennacl::fast_copy(vcl_C, &(stl_C[0]));
-    for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
-      for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
-        ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
-
-    std::cout << " - Checking result... ";
-    bool check_ok = true;
-    for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    {
-      for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-      {
-        if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
-        {
-          check_ok = false;
-          break;
-        }
-      }
-      if (!check_ok)
-        break;
-    }
-    if (check_ok)
-      std::cout << "[OK]" << std::endl << std::endl;
-    else
-      std::cout << "[FAILED]" << std::endl << std::endl;
-      
-  }
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  return EXIT_SUCCESS;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: BLAS level 3 functionality (blas3.cpp and blas3.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//disable debug mechanisms to have a fair comparison with ublas:
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+#include "../benchmarks/benchmark-utils.hpp"
+
+#define BLAS3_MATRIX_SIZE   400
+
+using namespace boost::numeric;
+
+
+#ifndef VIENNACL_WITH_OPENCL
+  struct dummy
+  {
+    std::size_t size() const { return 1; }
+  };
+#endif
+
+
+int main()
+{
+  typedef float     ScalarType;
+
+  Timer timer;
+  double exec_time;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
+    {
+      ublas_A(i,j) = random<ScalarType>();
+    }
+
+  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
+    {
+      ublas_B(i,j) = random<ScalarType>();
+    }
+
+  //
+  // Set up some ViennaCL objects
+  //
+  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this if you wish to use GPUs only
+  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  /////////////////////////////////////////////////
+  //////////// Matrix-matrix products /////////////
+  /////////////////////////////////////////////////
+
+  //
+  // Compute reference product using ublas:
+  //
+  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
+  timer.start();
+  ublas_C = ublas::prod(ublas_A, ublas_B);
+  exec_time = timer.get();
+  std::cout << " - Execution time: " << exec_time << std::endl;
+
+  //
+  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+  //
+  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+#else
+  dummy devices;
+#endif
+
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+#endif
+
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    timer.start();
+    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+    viennacl::backend::finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+
+    //
+    // Verify the result
+    //
+    viennacl::copy(vcl_C, ublas_C1);
+
+    std::cout << " - Checking result... ";
+    bool check_ok = true;
+    for (std::size_t i = 0; i < ublas_A.size1(); ++i)
+    {
+      for (std::size_t j = 0; j < ublas_A.size2(); ++j)
+      {
+        if ( std::fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
+        {
+          check_ok = false;
+          break;
+        }
+      }
+      if (!check_ok)
+        break;
+    }
+    if (check_ok)
+      std::cout << "[OK]" << std::endl << std::endl;
+    else
+      std::cout << "[FAILED]" << std::endl << std::endl;
+
+  }
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/blas3range.cpp b/examples/tutorial/blas3range.cpp
index d7e376e..1ec5f36 100644
--- a/examples/tutorial/blas3range.cpp
+++ b/examples/tutorial/blas3range.cpp
@@ -1,211 +1,213 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//disable debug mechanisms to have a fair comparison with ublas:
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/matrix_proxy.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-#include "../benchmarks/benchmark-utils.hpp"
-
-/*
-*   Tutorial: BLAS level 3 functionality
-*   
-*/
-
-#define BLAS3_MATRIX_SIZE   1500
-
-using namespace boost::numeric;
-
-int main()
-{
-  typedef float     ScalarType;
-
-  Timer timer;
-  double exec_time;
-
-  //
-  // Set up some ublas objects
-  //
-  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-
-  //
-  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
-  //
-  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-    {
-      ublas_A(i,j) = random<ScalarType>();
-      stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
-    }
-
-  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
-    {
-      ublas_B(i,j) = random<ScalarType>();
-      stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
-    }
-    
-  ublas::range ublas_r1(1, BLAS3_MATRIX_SIZE-1);
-  ublas::range ublas_r2(2, BLAS3_MATRIX_SIZE-2);
-  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_A_sub(ublas_A, ublas_r1, ublas_r2);
-  ublas::matrix_range< ublas::matrix<ScalarType, ublas::column_major> >  ublas_B_sub(ublas_B, ublas_r2, ublas_r1);
-  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
-
-  //
-  // Set up some ViennaCL objects
-  //
-  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this is you wish to use GPUs only
-  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-
-  viennacl::range vcl_r1(1, BLAS3_MATRIX_SIZE-1);
-  viennacl::range vcl_r2(2, BLAS3_MATRIX_SIZE-2);
-  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_A_sub(vcl_A, vcl_r1, vcl_r2);
-  viennacl::matrix_range< viennacl::matrix<ScalarType, viennacl::column_major> >  vcl_B_sub(vcl_B, vcl_r2, vcl_r1);
-  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
-  
-  ublas_C.clear();
-  viennacl::copy(ublas_C, vcl_C);
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix-matrix products /////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Compute reference product using ublas:
-  //
-  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
-  timer.start();
-  ublas_C_sub = ublas::prod(ublas_A_sub, ublas_B_sub);
-  exec_time = timer.get();
-  std::cout << " - Execution time: " << exec_time << std::endl;
-  
-  //std::cout << ublas_C << std::endl;
-  
-  //
-  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
-  //
-  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
-  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    //viennacl::copy(ublas_A, vcl_A);
-    //viennacl::copy(ublas_B, vcl_B);
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
-
-    //std::cout << vcl_C << std::endl;
-    
-    //
-    // Verify the result
-    //
-    //viennacl::copy(vcl_C, ublas_C1);
-    viennacl::fast_copy(vcl_C, &(stl_C[0]));
-    for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
-      for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
-        ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
-
-    std::cout << " - Checking result... ";
-    bool check_ok = true;
-    for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    {
-      for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-      {
-        if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
-        {
-          check_ok = false;
-          break;
-        }
-      }
-      if (!check_ok)
-        break;
-    }
-    if (check_ok)
-      std::cout << "[OK]" << std::endl << std::endl;
-    else
-      std::cout << "[FAILED]" << std::endl << std::endl;
-      
-  }
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  return EXIT_SUCCESS;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+               
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+* 
+*   Tutorial: BLAS level 3 functionality on sub-matrices (blas3range.cpp and blas3range.cu are identical, the latter being required for compilation using CUDA nvcc)
+*   
+*/
+
+//disable debug mechanisms to have a fair comparison with ublas:
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+#include "../benchmarks/benchmark-utils.hpp"
+
+#define BLAS3_MATRIX_SIZE   1500
+
+using namespace boost::numeric;
+
+int main()
+{
+  typedef float     ScalarType;
+
+  Timer timer;
+  double exec_time;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  //
+  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
+  //
+  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
+  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
+  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
+    {
+      ublas_A(i,j) = random<ScalarType>();
+      stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
+    }
+
+  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
+    {
+      ublas_B(i,j) = random<ScalarType>();
+      stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
+    }
+    
+  ublas::range ublas_r1(1, BLAS3_MATRIX_SIZE-1);
+  ublas::range ublas_r2(2, BLAS3_MATRIX_SIZE-2);
+  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_A_sub(ublas_A, ublas_r1, ublas_r2);
+  ublas::matrix_range< ublas::matrix<ScalarType, ublas::column_major> >  ublas_B_sub(ublas_B, ublas_r2, ublas_r1);
+  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
+
+  //
+  // Set up some ViennaCL objects
+  //
+  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this if you wish to use GPUs only
+  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  viennacl::range vcl_r1(1, BLAS3_MATRIX_SIZE-1);
+  viennacl::range vcl_r2(2, BLAS3_MATRIX_SIZE-2);
+  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_A_sub(vcl_A, vcl_r1, vcl_r2);
+  viennacl::matrix_range< viennacl::matrix<ScalarType, viennacl::column_major> >  vcl_B_sub(vcl_B, vcl_r2, vcl_r1);
+  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
+  
+  ublas_C.clear();
+  viennacl::copy(ublas_C, vcl_C);
+  
+  /////////////////////////////////////////////////
+  //////////// Matrix-matrix products /////////////
+  /////////////////////////////////////////////////
+  
+  //
+  // Compute reference product using ublas:
+  //
+  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
+  timer.start();
+  ublas_C_sub = ublas::prod(ublas_A_sub, ublas_B_sub);
+  exec_time = timer.get();
+  std::cout << " - Execution time: " << exec_time << std::endl;
+  
+  //std::cout << ublas_C << std::endl;
+  
+  //
+  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+  //
+  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
+  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+
+    //viennacl::copy(ublas_A, vcl_A);
+    //viennacl::copy(ublas_B, vcl_B);
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
+    viennacl::ocl::get_queue().finish();
+    timer.start();
+    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
+    viennacl::ocl::get_queue().finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
+
+    //std::cout << vcl_C << std::endl;
+    
+    //
+    // Verify the result
+    //
+    //viennacl::copy(vcl_C, ublas_C1);
+    viennacl::fast_copy(vcl_C, &(stl_C[0]));
+    for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
+      for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
+        ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
+
+    std::cout << " - Checking result... ";
+    bool check_ok = true;
+    for (unsigned int i = 0; i < ublas_A.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < ublas_A.size2(); ++j)
+      {
+        if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
+        {
+          check_ok = false;
+          break;
+        }
+      }
+      if (!check_ok)
+        break;
+    }
+    if (check_ok)
+      std::cout << "[OK]" << std::endl << std::endl;
+    else
+      std::cout << "[FAILED]" << std::endl << std::endl;
+      
+  }
+
+  //
+  //  That's it. 
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/blas3range.cpp b/examples/tutorial/blas3range.cu
similarity index 94%
copy from examples/tutorial/blas3range.cpp
copy to examples/tutorial/blas3range.cu
index d7e376e..1ec5f36 100644
--- a/examples/tutorial/blas3range.cpp
+++ b/examples/tutorial/blas3range.cu
@@ -1,211 +1,213 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//disable debug mechanisms to have a fair comparison with ublas:
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/matrix_proxy.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-#include "../benchmarks/benchmark-utils.hpp"
-
-/*
-*   Tutorial: BLAS level 3 functionality
-*   
-*/
-
-#define BLAS3_MATRIX_SIZE   1500
-
-using namespace boost::numeric;
-
-int main()
-{
-  typedef float     ScalarType;
-
-  Timer timer;
-  double exec_time;
-
-  //
-  // Set up some ublas objects
-  //
-  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-
-  //
-  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
-  //
-  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
-
-  //
-  // Fill the matrix
-  //
-  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-    {
-      ublas_A(i,j) = random<ScalarType>();
-      stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
-    }
-
-  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
-    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
-    {
-      ublas_B(i,j) = random<ScalarType>();
-      stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
-    }
-    
-  ublas::range ublas_r1(1, BLAS3_MATRIX_SIZE-1);
-  ublas::range ublas_r2(2, BLAS3_MATRIX_SIZE-2);
-  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_A_sub(ublas_A, ublas_r1, ublas_r2);
-  ublas::matrix_range< ublas::matrix<ScalarType, ublas::column_major> >  ublas_B_sub(ublas_B, ublas_r2, ublas_r1);
-  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
-
-  //
-  // Set up some ViennaCL objects
-  //
-  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this is you wish to use GPUs only
-  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
-
-  viennacl::range vcl_r1(1, BLAS3_MATRIX_SIZE-1);
-  viennacl::range vcl_r2(2, BLAS3_MATRIX_SIZE-2);
-  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_A_sub(vcl_A, vcl_r1, vcl_r2);
-  viennacl::matrix_range< viennacl::matrix<ScalarType, viennacl::column_major> >  vcl_B_sub(vcl_B, vcl_r2, vcl_r1);
-  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
-  
-  ublas_C.clear();
-  viennacl::copy(ublas_C, vcl_C);
-  
-  /////////////////////////////////////////////////
-  //////////// Matrix-matrix products /////////////
-  /////////////////////////////////////////////////
-  
-  //
-  // Compute reference product using ublas:
-  //
-  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
-  timer.start();
-  ublas_C_sub = ublas::prod(ublas_A_sub, ublas_B_sub);
-  exec_time = timer.get();
-  std::cout << " - Execution time: " << exec_time << std::endl;
-  
-  //std::cout << ublas_C << std::endl;
-  
-  //
-  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
-  //
-  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
-  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    viennacl::ocl::current_context().switch_device(devices[i]);
-    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
-
-    //viennacl::copy(ublas_A, vcl_A);
-    //viennacl::copy(ublas_B, vcl_B);
-    viennacl::fast_copy(&(stl_A[0]),
-                        &(stl_A[0]) + stl_A.size(),
-                        vcl_A);
-    viennacl::fast_copy(&(stl_B[0]),
-                        &(stl_B[0]) + stl_B.size(),
-                        vcl_B);
-    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
-    viennacl::ocl::get_queue().finish();
-    timer.start();
-    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
-    viennacl::ocl::get_queue().finish();
-    exec_time = timer.get();
-    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
-
-    //std::cout << vcl_C << std::endl;
-    
-    //
-    // Verify the result
-    //
-    //viennacl::copy(vcl_C, ublas_C1);
-    viennacl::fast_copy(vcl_C, &(stl_C[0]));
-    for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
-      for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
-        ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
-
-    std::cout << " - Checking result... ";
-    bool check_ok = true;
-    for (unsigned int i = 0; i < ublas_A.size1(); ++i)
-    {
-      for (unsigned int j = 0; j < ublas_A.size2(); ++j)
-      {
-        if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
-        {
-          check_ok = false;
-          break;
-        }
-      }
-      if (!check_ok)
-        break;
-    }
-    if (check_ok)
-      std::cout << "[OK]" << std::endl << std::endl;
-    else
-      std::cout << "[FAILED]" << std::endl << std::endl;
-      
-  }
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  return EXIT_SUCCESS;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+               
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+* 
+*   Tutorial: BLAS level 3 functionality on sub-matrices (blas3range.cpp and blas3range.cu are identical, the latter being required for compilation using CUDA nvcc)
+*   
+*/
+
+//disable debug mechanisms to have a fair comparison with ublas:
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+#include "../benchmarks/benchmark-utils.hpp"
+
+#define BLAS3_MATRIX_SIZE   1500
+
+using namespace boost::numeric;
+
+int main()
+{
+  typedef float     ScalarType;
+
+  Timer timer;
+  double exec_time;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  //
+  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
+  //
+  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
+  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
+  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
+
+  //
+  // Fill the matrix
+  //
+  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_A.size2(); ++j)
+    {
+      ublas_A(i,j) = random<ScalarType>();
+      stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
+    }
+
+  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
+    for (unsigned int j = 0; j < ublas_B.size2(); ++j)
+    {
+      ublas_B(i,j) = random<ScalarType>();
+      stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
+    }
+    
+  ublas::range ublas_r1(1, BLAS3_MATRIX_SIZE-1);
+  ublas::range ublas_r2(2, BLAS3_MATRIX_SIZE-2);
+  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_A_sub(ublas_A, ublas_r1, ublas_r2);
+  ublas::matrix_range< ublas::matrix<ScalarType, ublas::column_major> >  ublas_B_sub(ublas_B, ublas_r2, ublas_r1);
+  ublas::matrix_range< ublas::matrix<ScalarType> >  ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
+
+  //
+  // Set up some ViennaCL objects
+  //
+  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag());  //uncomment this is you wish to use GPUs only
+  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
+
+  viennacl::range vcl_r1(1, BLAS3_MATRIX_SIZE-1);
+  viennacl::range vcl_r2(2, BLAS3_MATRIX_SIZE-2);
+  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_A_sub(vcl_A, vcl_r1, vcl_r2);
+  viennacl::matrix_range< viennacl::matrix<ScalarType, viennacl::column_major> >  vcl_B_sub(vcl_B, vcl_r2, vcl_r1);
+  viennacl::matrix_range< viennacl::matrix<ScalarType> >  vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
+  
+  ublas_C.clear();
+  viennacl::copy(ublas_C, vcl_C);
+  
+  /////////////////////////////////////////////////
+  //////////// Matrix-matrix products /////////////
+  /////////////////////////////////////////////////
+  
+  //
+  // Compute reference product using ublas:
+  //
+  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
+  timer.start();
+  ublas_C_sub = ublas::prod(ublas_A_sub, ublas_B_sub);
+  exec_time = timer.get();
+  std::cout << " - Execution time: " << exec_time << std::endl;
+  
+  //std::cout << ublas_C << std::endl;
+  
+  //
+  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
+  //
+  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
+  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+    viennacl::ocl::current_context().switch_device(devices[i]);
+    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
+
+    //viennacl::copy(ublas_A, vcl_A);
+    //viennacl::copy(ublas_B, vcl_B);
+    viennacl::fast_copy(&(stl_A[0]),
+                        &(stl_A[0]) + stl_A.size(),
+                        vcl_A);
+    viennacl::fast_copy(&(stl_B[0]),
+                        &(stl_B[0]) + stl_B.size(),
+                        vcl_B);
+    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
+    viennacl::ocl::get_queue().finish();
+    timer.start();
+    vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
+    viennacl::ocl::get_queue().finish();
+    exec_time = timer.get();
+    std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
+    std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
+
+    //std::cout << vcl_C << std::endl;
+    
+    //
+    // Verify the result
+    //
+    //viennacl::copy(vcl_C, ublas_C1);
+    viennacl::fast_copy(vcl_C, &(stl_C[0]));
+    for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
+      for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
+        ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
+
+    std::cout << " - Checking result... ";
+    bool check_ok = true;
+    for (unsigned int i = 0; i < ublas_A.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < ublas_A.size2(); ++j)
+      {
+        if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
+        {
+          check_ok = false;
+          break;
+        }
+      }
+      if (!check_ok)
+        break;
+    }
+    if (check_ok)
+      std::cout << "[OK]" << std::endl << std::endl;
+    else
+      std::cout << "[FAILED]" << std::endl << std::endl;
+      
+  }
+
+  //
+  //  That's it. 
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/custom-context.cpp b/examples/tutorial/custom-context.cpp
index 0625843..92f1f88 100644
--- a/examples/tutorial/custom-context.cpp
+++ b/examples/tutorial/custom-context.cpp
@@ -1,278 +1,284 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-#include <string>
-
-//
-// ViennaCL includes
-//
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/matrix_operations.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/prod.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-
-
-/*
-*
-*   Tutorial:  Use ViennaCL within user-defined OpenCL contexts
-*   
-*/
-
-
-
-//
-// A custom compute kernel which computes an elementwise product of two vectors
-// Input: v1 ... vector
-//        v2 ... vector
-// Output: result ... vector
-//
-// Algorithm: set result[i] <- v1[i] * v2[i]
-//            (in MATLAB notation this is something like 'result = v1 .* v2');
-//
-
-const char * my_compute_program = 
-"__kernel void elementwise_prod(\n"
-"          __global const float * vec1,\n"
-"          __global const float * vec2, \n"
-"          __global float * result,\n"
-"          unsigned int size) \n"
-"{ \n"
-"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
-"    result[i] = vec1[i] * vec2[i];\n"
-"};\n";
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  
-  /////////////////////////////////////////////////////////////////////////////////////////////////////////
-  //////////////////////// Part 1: Set up a custom context and perform a sample operation. ////////////////
-  ////////////////////////         This is rather lengthy due to the OpenCL framework.     ////////////////
-  ////////////////////////         The following does essentially the same as the          ////////////////
-  ////////////////////////         'custom_kernels'-tutorial!                               ////////////////
-  /////////////////////////////////////////////////////////////////////////////////////////////////////////
-  
-  //manually set up a custom OpenCL context:
-  std::vector<cl_device_id> device_id_array;
-  
-  //get all available devices
-  viennacl::ocl::platform pf;
-  std::cout << "Platform info: " << pf.info() << std::endl;
-  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
-  std::cout << devices[0].name() << std::endl;
-  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
-  
-  //set up context using all found devices:
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-      device_id_array.push_back(devices[i].id());
-  }
-     
-  std::cout << "Creating context..." << std::endl;
-  cl_int err;
-  cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err);
-  VIENNACL_ERR_CHECK(err);
-   
-  
-  //create two Vectors:
-  unsigned int vector_size = 10;
-  std::vector<ScalarType> vec1(vector_size);
-  std::vector<ScalarType> vec2(vector_size);
-  std::vector<ScalarType> result(vector_size);
-  
-  //
-  // fill the operands vec1 and vec2:
-  //
-  for (unsigned int i=0; i<vector_size; ++i)
-  {
-    vec1[i] = static_cast<ScalarType>(i);
-    vec2[i] = static_cast<ScalarType>(vector_size-i);
-  }
-  
-  //
-  // create memory in OpenCL context:
-  //
-  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
-  VIENNACL_ERR_CHECK(err);
-  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
-  VIENNACL_ERR_CHECK(err);
-  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
-  VIENNACL_ERR_CHECK(err);
-
-  // 
-  // create a command queue for each device:
-  // 
-  
-  std::vector<cl_command_queue> queues(devices.size());
-  for (size_t i=0; i<devices.size(); ++i)
-  {
-    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
-    VIENNACL_ERR_CHECK(err);
-  }
-  
-  // 
-  // create and build a program in the context:
-  // 
-  size_t source_len = std::string(my_compute_program).length();
-  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
-  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
-  
-/*            char buffer[1024];
-            cl_build_status status;
-            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
-            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
-            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
-            std::cout << "Log: " << buffer << std::endl;*/
-  
-  VIENNACL_ERR_CHECK(err);
-  
-  // 
-  // create a kernel from the program:
-  // 
-  const char * kernel_name = "elementwise_prod";
-  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
-  VIENNACL_ERR_CHECK(err);
-
-  
-  //
-  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
-  //
-  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
-  VIENNACL_ERR_CHECK(err);
-  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
-  VIENNACL_ERR_CHECK(err);
-  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
-  VIENNACL_ERR_CHECK(err);
-  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
-  VIENNACL_ERR_CHECK(err);
-  size_t global_size = vector_size;
-  size_t local_size = vector_size;
-  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
-  VIENNACL_ERR_CHECK(err);
-  
-  
-  //
-  // Read and output result:
-  //
-  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
-  VIENNACL_ERR_CHECK(err);
-  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
-  VIENNACL_ERR_CHECK(err);
-
-  std::cout << "vec1  : ";
-  for (size_t i=0; i<vec1.size(); ++i)
-    std::cout << vec1[i] << " ";
-  std::cout << std::endl;
-
-  std::cout << "vec2  : ";
-  for (size_t i=0; i<vec2.size(); ++i)
-    std::cout << vec2[i] << " ";
-  std::cout << std::endl;
-
-  std::cout << "result: ";
-  for (size_t i=0; i<result.size(); ++i)
-    std::cout << result[i] << " ";
-  std::cout << std::endl;
-  
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////
-  /////////////////////// Part 2: Let ViennaCL use the already created context: //////////////////////////
-  ////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-  //Tell ViennaCL to use the previously created context.
-  //This context is assigned an id '0' when using viennacl::ocl::switch_context().
-  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
-  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)
-  
-  //
-  // Proof that ViennaCL really uses the new context:
-  //
-  std::cout << "Existing context: " << my_context << std::endl;
-  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;
-
-  //
-  // Wrap existing OpenCL objects into ViennaCL:
-  //
-  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
-  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
-  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
-  viennacl::scalar<ScalarType> vcl_s = 2.0;
-
-  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
-  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
-  
-  std::cout << "vec1  : ";
-  std::cout << vcl_vec1 << std::endl;
-
-  std::cout << "vec2  : ";
-  std::cout << vcl_vec2 << std::endl;
-
-  std::cout << "result: ";
-  std::cout << vcl_result << std::endl;
-  
-  //
-  // We can also reuse the existing elementwise_prod kernel. 
-  // Therefore, we first have to make the existing program known to ViennaCL
-  // For more details on the three lines, see tutorial 'custom-kernels'
-  //
-  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
-  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
-  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod");
-  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.
-  
-  std::cout << "vec1  : ";
-  std::cout << vcl_vec1 << std::endl;
-
-  std::cout << "vec2  : ";
-  std::cout << vcl_vec2 << std::endl;
-
-  std::cout << "result: ";
-  std::cout << vcl_result << std::endl;
-  
-  
-  //
-  // Since a linear piece of memory can be interpreted in several ways, 
-  // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/
-  // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
-  //
-  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);
-  
-  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
-  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
-  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);
-
-  std::cout << "result of matrix-vector product: ";
-  std::cout << vcl_result << std::endl;
-
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Use ViennaCL within user-defined (i.e. your own) OpenCL contexts
+*
+*/
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <string>
+
+#ifndef VIENNACL_WITH_OPENCL
+  #define VIENNACL_WITH_OPENCL
+#endif
+
+//
+// ViennaCL includes
+//
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+
+
+
+//
+// A custom compute kernel which computes an elementwise product of two vectors
+// Input: v1 ... vector
+//        v2 ... vector
+// Output: result ... vector
+//
+// Algorithm: set result[i] <- v1[i] * v2[i]
+//            (in MATLAB notation this is something like 'result = v1 .* v2');
+//
+
+const char * my_compute_program =
+"__kernel void elementwise_prod(\n"
+"          __global const float * vec1,\n"
+"          __global const float * vec2, \n"
+"          __global float * result,\n"
+"          unsigned int size) \n"
+"{ \n"
+"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
+"    result[i] = vec1[i] * vec2[i];\n"
+"};\n";
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////
+  //////////////////////// Part 1: Set up a custom context and perform a sample operation. ////////////////
+  ////////////////////////         This is rather lengthy due to the OpenCL framework.     ////////////////
+  ////////////////////////         The following does essentially the same as the          ////////////////
+  ////////////////////////         'custom_kernels'-tutorial!                               ////////////////
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  //manually set up a custom OpenCL context:
+  std::vector<cl_device_id> device_id_array;
+
+  //get all available devices
+  viennacl::ocl::platform pf;
+  std::cout << "Platform info: " << pf.info() << std::endl;
+  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
+  std::cout << devices[0].name() << std::endl;
+  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;
+
+  //set up context using all found devices:
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+      device_id_array.push_back(devices[i].id());
+  }
+
+  std::cout << "Creating context..." << std::endl;
+  cl_int err;
+  cl_context my_context = clCreateContext(0, cl_uint(device_id_array.size()), &(device_id_array[0]), NULL, NULL, &err);
+  VIENNACL_ERR_CHECK(err);
+
+
+  //create two Vectors:
+  unsigned int vector_size = 10;
+  std::vector<ScalarType> vec1(vector_size);
+  std::vector<ScalarType> vec2(vector_size);
+  std::vector<ScalarType> result(vector_size);
+
+  //
+  // fill the operands vec1 and vec2:
+  //
+  for (unsigned int i=0; i<vector_size; ++i)
+  {
+    vec1[i] = static_cast<ScalarType>(i);
+    vec2[i] = static_cast<ScalarType>(vector_size-i);
+  }
+
+  //
+  // create memory in OpenCL context:
+  //
+  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
+  VIENNACL_ERR_CHECK(err);
+  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
+  VIENNACL_ERR_CHECK(err);
+  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);
+  VIENNACL_ERR_CHECK(err);
+
+  //
+  // create a command queue for each device:
+  //
+
+  std::vector<cl_command_queue> queues(devices.size());
+  for (std::size_t i=0; i<devices.size(); ++i)
+  {
+    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);
+    VIENNACL_ERR_CHECK(err);
+  }
+
+  //
+  // create and build a program in the context:
+  //
+  std::size_t source_len = std::string(my_compute_program).length();
+  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
+  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);
+
+/*            char buffer[1024];
+            cl_build_status status;
+            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
+            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
+            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
+            std::cout << "Log: " << buffer << std::endl;*/
+
+  VIENNACL_ERR_CHECK(err);
+
+  //
+  // create a kernel from the program:
+  //
+  const char * kernel_name = "elementwise_prod";
+  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);
+  VIENNACL_ERR_CHECK(err);
+
+
+  //
+  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
+  //
+  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
+  VIENNACL_ERR_CHECK(err);
+  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
+  VIENNACL_ERR_CHECK(err);
+  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
+  VIENNACL_ERR_CHECK(err);
+  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
+  VIENNACL_ERR_CHECK(err);
+  std::size_t global_size = vector_size;
+  std::size_t local_size = vector_size;
+  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
+  VIENNACL_ERR_CHECK(err);
+
+
+  //
+  // Read and output result:
+  //
+  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
+  VIENNACL_ERR_CHECK(err);
+  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);
+  VIENNACL_ERR_CHECK(err);
+
+  std::cout << "vec1  : ";
+  for (std::size_t i=0; i<vec1.size(); ++i)
+    std::cout << vec1[i] << " ";
+  std::cout << std::endl;
+
+  std::cout << "vec2  : ";
+  for (std::size_t i=0; i<vec2.size(); ++i)
+    std::cout << vec2[i] << " ";
+  std::cout << std::endl;
+
+  std::cout << "result: ";
+  for (std::size_t i=0; i<result.size(); ++i)
+    std::cout << result[i] << " ";
+  std::cout << std::endl;
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
+  /////////////////////// Part 2: Let ViennaCL use the already created context: //////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  //Tell ViennaCL to use the previously created context.
+  //This context is assigned an id '0' when using viennacl::ocl::switch_context().
+  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
+  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)
+
+  //
+  // Proof that ViennaCL really uses the new context:
+  //
+  std::cout << "Existing context: " << my_context << std::endl;
+  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;
+
+  //
+  // Wrap existing OpenCL objects into ViennaCL:
+  //
+  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
+  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
+  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
+  viennacl::scalar<ScalarType> vcl_s = 2.0;
+
+  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
+  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;
+
+  std::cout << "vec1  : ";
+  std::cout << vcl_vec1 << std::endl;
+
+  std::cout << "vec2  : ";
+  std::cout << vcl_vec2 << std::endl;
+
+  std::cout << "result: ";
+  std::cout << vcl_result << std::endl;
+
+  //
+  // We can also reuse the existing elementwise_prod kernel.
+  // Therefore, we first have to make the existing program known to ViennaCL
+  // For more details on the three lines, see tutorial 'custom-kernels'
+  //
+  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
+  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
+  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel(my_kernel, "elementwise_prod");
+  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that std::size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.
+
+  std::cout << "vec1  : ";
+  std::cout << vcl_vec1 << std::endl;
+
+  std::cout << "vec2  : ";
+  std::cout << vcl_vec2 << std::endl;
+
+  std::cout << "result: ";
+  std::cout << vcl_result << std::endl;
+
+
+  //
+  // Since a linear piece of memory can be interpreted in several ways,
+  // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1.
+  // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
+  //
+  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);
+
+  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
+  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_result is now at a different memory location (values are copied)
+  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);
+
+  std::cout << "result of matrix-vector product: ";
+  std::cout << vcl_result << std::endl;
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/custom-kernels.cpp b/examples/tutorial/custom-kernels.cpp
index 920c294..12f662f 100644
--- a/examples/tutorial/custom-kernels.cpp
+++ b/examples/tutorial/custom-kernels.cpp
@@ -1,135 +1,141 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-#include <string>
-
-//
-// ViennaCL includes
-//
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-
-
-/*
-*
-*   Tutorial:  Custom compute kernels
-*   
-*/
-
-
-//
-// Custom compute kernels which compute an elementwise product/division of two vectors
-// Input: v1 ... vector
-//        v2 ... vector
-// Output: result ... vector
-//
-// Algorithm: set result[i] <- v1[i] * v2[i]
-//            or  result[i] <- v1[i] / v2[i]
-//            (in MATLAB notation this is something like 'result = v1 .* v2' and 'result = v1 ./ v2');
-//
-const char * my_compute_program = 
-"__kernel void elementwise_prod(\n"
-"          __global const float * vec1,\n"
-"          __global const float * vec2, \n"
-"          __global float * result,\n"
-"          unsigned int size) \n"
-"{ \n"
-"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
-"    result[i] = vec1[i] * vec2[i];\n"
-"};\n\n"
-"__kernel void elementwise_div(\n"
-"          __global const float * vec1,\n"
-"          __global const float * vec2, \n"
-"          __global float * result,\n"
-"          unsigned int size) \n"
-"{ \n"
-"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
-"    result[i] = vec1[i] / vec2[i];\n"
-"};\n";
-
-int main()
-{
-  typedef float       ScalarType;
-
-  //
-  // Initialize OpenCL vectors:
-  //
-  unsigned int vector_size = 10;
-  viennacl::scalar<ScalarType>  s = 1.0; //dummy
-  viennacl::vector<ScalarType>  vec1(vector_size);
-  viennacl::vector<ScalarType>  vec2(vector_size);
-  viennacl::vector<ScalarType>  result_mul(vector_size);
-  viennacl::vector<ScalarType>  result_div(vector_size);
-
-  //
-  // fill the operands vec1 and vec2:
-  //
-  for (unsigned int i=0; i<vector_size; ++i)
-  {
-    vec1[i] = static_cast<ScalarType>(i);
-    vec2[i] = static_cast<ScalarType>(vector_size-i);
-  }
-
-  //
-  // Set up the OpenCL program given in my_compute_kernel:
-  // A program is one compilation unit and can hold many different compute kernels.
-  //
-  viennacl::ocl::program & my_prog = viennacl::ocl::current_context().add_program(my_compute_program, "my_compute_program");
-  my_prog.add_kernel("elementwise_prod");  //register elementwise product kernel
-  my_prog.add_kernel("elementwise_div");   //register elementwise division kernel
-  
-  //
-  // Now we can get the kernels from the program 'my_program'.
-  // (Note that first all kernels need to be registered via add_kernel() before get_kernel() can be called,
-  // otherwise existing references might be invalidated)
-  //
-  viennacl::ocl::kernel & my_kernel_mul = my_prog.get_kernel("elementwise_prod");
-  viennacl::ocl::kernel & my_kernel_div = my_prog.get_kernel("elementwise_div");
-  
-  //
-  // Launch the kernel with 'vector_size' threads in one work group
-  // Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary for the forth argument.
-  //
-  viennacl::ocl::enqueue(my_kernel_mul(vec1, vec2, result_mul, static_cast<cl_uint>(vec1.size())));  
-  viennacl::ocl::enqueue(my_kernel_div(vec1, vec2, result_div, static_cast<cl_uint>(vec1.size())));
-  
-  //
-  // Print the result:
-  //
-  std::cout << "        vec1: " << vec1 << std::endl;
-  std::cout << "        vec2: " << vec2 << std::endl;
-  std::cout << "vec1 .* vec2: " << result_mul << std::endl;
-  std::cout << "vec1 /* vec2: " << result_div << std::endl;
-  std::cout << "norm_2(vec1 .* vec2): " << viennacl::linalg::norm_2(result_mul) << std::endl;
-  std::cout << "norm_2(vec1 /* vec2): " << viennacl::linalg::norm_2(result_div) << std::endl;
-  
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Use user-provided OpenCL compute kernels with ViennaCL objects
+*
+*/
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <string>
+
+#ifndef VIENNACL_WITH_OPENCL
+  #define VIENNACL_WITH_OPENCL
+#endif
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+
+
+//
+// Custom compute kernels which compute an elementwise product/division of two vectors
+// Input: v1 ... vector
+//        v2 ... vector
+// Output: result ... vector
+//
+// Algorithm: set result[i] <- v1[i] * v2[i]
+//            or  result[i] <- v1[i] / v2[i]
+//            (in MATLAB notation this is something like 'result = v1 .* v2' and 'result = v1 ./ v2');
+//
+const char * my_compute_program =
+"__kernel void elementwise_prod(\n"
+"          __global const float * vec1,\n"
+"          __global const float * vec2, \n"
+"          __global float * result,\n"
+"          unsigned int size) \n"
+"{ \n"
+"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
+"    result[i] = vec1[i] * vec2[i];\n"
+"};\n\n"
+"__kernel void elementwise_div(\n"
+"          __global const float * vec1,\n"
+"          __global const float * vec2, \n"
+"          __global float * result,\n"
+"          unsigned int size) \n"
+"{ \n"
+"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
+"    result[i] = vec1[i] / vec2[i];\n"
+"};\n";
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Initialize OpenCL vectors:
+  //
+  unsigned int vector_size = 10;
+  viennacl::scalar<ScalarType>  s = 1.0; //dummy
+  viennacl::vector<ScalarType>  vec1(vector_size);
+  viennacl::vector<ScalarType>  vec2(vector_size);
+  viennacl::vector<ScalarType>  result_mul(vector_size);
+  viennacl::vector<ScalarType>  result_div(vector_size);
+
+  //
+  // fill the operands vec1 and vec2:
+  //
+  for (unsigned int i=0; i<vector_size; ++i)
+  {
+    vec1[i] = static_cast<ScalarType>(i);
+    vec2[i] = static_cast<ScalarType>(vector_size-i);
+  }
+
+  //
+  // Set up the OpenCL program given in my_compute_program:
+  // A program is one compilation unit and can hold many different compute kernels.
+  //
+  viennacl::ocl::program & my_prog = viennacl::ocl::current_context().add_program(my_compute_program, "my_compute_program");
+  // Note: Releases older than ViennaCL 1.5.0 required calls to add_kernel(). This is no longer needed, the respective interface has been removed.
+
+  //
+  // Now we can get the kernels from the program 'my_prog'.
+  // (Since ViennaCL 1.5.0 kernels are extracted automatically when the program is built,
+  // so no prior registration via add_kernel() is required before calling get_kernel())
+  //
+  viennacl::ocl::kernel & my_kernel_mul = my_prog.get_kernel("elementwise_prod");
+  viennacl::ocl::kernel & my_kernel_div = my_prog.get_kernel("elementwise_div");
+
+  //
+  // Launch the kernel with 'vector_size' threads in one work group
+  // Note that std::size_t might differ between host and device. Thus, a cast to cl_uint is necessary for the fourth argument.
+  //
+  viennacl::ocl::enqueue(my_kernel_mul(vec1, vec2, result_mul, static_cast<cl_uint>(vec1.size())));
+  viennacl::ocl::enqueue(my_kernel_div(vec1, vec2, result_div, static_cast<cl_uint>(vec1.size())));
+
+  //
+  // Print the result:
+  //
+  std::cout << "        vec1: " << vec1 << std::endl;
+  std::cout << "        vec2: " << vec2 << std::endl;
+  std::cout << "vec1 .* vec2: " << result_mul << std::endl;
+  std::cout << "vec1 /* vec2: " << result_div << std::endl;
+  std::cout << "norm_2(vec1 .* vec2): " << viennacl::linalg::norm_2(result_mul) << std::endl;
+  std::cout << "norm_2(vec1 /* vec2): " << viennacl::linalg::norm_2(result_div) << std::endl;
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/eigen-with-viennacl.cpp b/examples/tutorial/eigen-with-viennacl.cpp
index d501b53..0660ae4 100644
--- a/examples/tutorial/eigen-with-viennacl.cpp
+++ b/examples/tutorial/eigen-with-viennacl.cpp
@@ -1,7 +1,8 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
@@ -14,6 +15,12 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial:  Shows how to exchange data between ViennaCL and Eigen (http://eigen.tuxfamily.org/) objects.
+*   
+*/
+
 //
 // include necessary system headers
 //
@@ -28,7 +35,7 @@
 //
 // IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
 //
-#define VIENNACL_HAVE_EIGEN 1
+#define VIENNACL_WITH_EIGEN 1
 
 //
 // ViennaCL includes
@@ -117,13 +124,13 @@ void run_test()
   //
   Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_sparsemat(6, 5);
   Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_sparsemat2(6, 5);
-  eigen_sparsemat.startFill(5*2);
-  eigen_sparsemat.fill(0,0) = 2.0;   eigen_sparsemat.fill(0,1) = -1.0;
-  eigen_sparsemat.fill(1,1) = 2.0;   eigen_sparsemat.fill(1,2) = -1.0;
-  eigen_sparsemat.fill(2,2) = -1.0;  eigen_sparsemat.fill(2,3) = -1.0;
-  eigen_sparsemat.fill(3,3) = 2.0;   eigen_sparsemat.fill(3,4) = -1.0;
-  eigen_sparsemat.fill(5,4) = -1.0;
-  eigen_sparsemat.endFill();
+  eigen_sparsemat.reserve(5*2);
+  eigen_sparsemat.insert(0,0) = 2.0;   eigen_sparsemat.insert(0,1) = -1.0;
+  eigen_sparsemat.insert(1,1) = 2.0;   eigen_sparsemat.insert(1,2) = -1.0;
+  eigen_sparsemat.insert(2,2) = -1.0;  eigen_sparsemat.insert(2,3) = -1.0;
+  eigen_sparsemat.insert(3,3) = 2.0;   eigen_sparsemat.insert(3,4) = -1.0;
+  eigen_sparsemat.insert(5,4) = -1.0;
+  //eigen_sparsemat.endFill();
   
   //
   // Create and fill a few vectors from the Eigen library:
@@ -195,7 +202,9 @@ int main(int, char *[])
   std::cout << "----------------------------------------------" << std::endl;
   run_test<float>();
   
+#ifdef VIENNACL_HAVE_OPENCL   
   if( viennacl::ocl::current_device().double_support() )
+#endif
   {
     std::cout << "----------------------------------------------" << std::endl;
     std::cout << "## Double precision" << std::endl;
diff --git a/examples/tutorial/fft.cpp b/examples/tutorial/fft.cpp
index 6150625..c558333 100644
--- a/examples/tutorial/fft.cpp
+++ b/examples/tutorial/fft.cpp
@@ -1,20 +1,28 @@
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: FFT functionality (experimental)
+*
+*/
+
+
 // include necessary system headers
 #include <iostream>
 #include <vector>
@@ -29,19 +37,15 @@
 // include FFT routines
 #include "viennacl/fft.hpp"
 
-/*
-*   Tutorial: FFT functionality (experimental in ViennaCL 1.2.x)
-*/
-
-int main() 
+int main()
 {
   // Change this type definition to double if your gpu supports that
   typedef float       ScalarType;
-  
+
   // Create vectors of eight complex values (represented as pairs of floating point values: [real_0, imag_0, real_1, imag_1, etc.])
-  viennacl::vector<ScalarType> input_vec(16);  
-  viennacl::vector<ScalarType> output_vec(16); 
-  
+  viennacl::vector<ScalarType> input_vec(16);
+  viennacl::vector<ScalarType> output_vec(16);
+
   // Fill with values (use viennacl::copy() for larger data!)
   for (std::size_t i=0; i<input_vec.size(); ++i)
   {
@@ -50,28 +54,28 @@ int main()
     else
       input_vec(i) = 0;                // odd indices represent imaginary part
   }
-  
+
   // Print the vector
   std::cout << "input_vec: " << input_vec << std::endl;
-  
+
   // Compute FFT and store result in 'output_vec'
   std::cout << "Computing FFT..." << std::endl;
   viennacl::fft(input_vec, output_vec);
-  
+
   // Compute FFT and store result directly in 'input_vec'
   viennacl::inplace_fft(input_vec);
-  
+
   // Print result
   std::cout << "input_vec: " << input_vec << std::endl;
   std::cout << "output_vec: " << output_vec << std::endl;
 
   std::cout << "Computing inverse FFT..." << std::endl;
   viennacl::ifft(input_vec, output_vec); // either store result into output_vec
-  viennacl::inplace_ifft(input_vec);     // or compute in-place  
-  
+  viennacl::inplace_ifft(input_vec);     // or compute in-place
+
   std::cout << "input_vec: " << input_vec << std::endl;
   std::cout << "output_vec: " << output_vec << std::endl;
-  
+
   //
   //  That's it.
   //
diff --git a/examples/tutorial/iterative-eigen.cpp b/examples/tutorial/iterative-eigen.cpp
index f9f7255..c955d80 100644
--- a/examples/tutorial/iterative-eigen.cpp
+++ b/examples/tutorial/iterative-eigen.cpp
@@ -1,19 +1,26 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial:  Use of the iterative solvers in ViennaCL with Eigen (http://eigen.tuxfamily.org/)
+*
+*/
+
 //
 // include necessary system headers
 //
@@ -31,7 +38,7 @@
 #include <Eigen/Sparse>
 
 // Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
-#define VIENNACL_HAVE_EIGEN 1
+#define VIENNACL_WITH_EIGEN 1
 
 //
 // ViennaCL includes
@@ -52,69 +59,58 @@
 int main(int, char *[])
 {
   typedef float ScalarType;
-  
+
   Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_matrix(65025, 65025);
   Eigen::VectorXf eigen_rhs;
   Eigen::VectorXf eigen_result;
   Eigen::VectorXf ref_result;
   Eigen::VectorXf residual;
-  
+
   //
   // Read system from file
   //
-  eigen_matrix.startFill(65025 * 7);
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(eigen_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
+  std::cout << "Reading matrix (this might take some time)..." << std::endl;
+  eigen_matrix.reserve(65025 * 7);
   if (!viennacl::io::read_matrix_market_file(eigen_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
   {
     std::cout << "Error reading Matrix file" << std::endl;
     return 0;
   }
-  eigen_matrix.endFill();
-  //std::cout << "done reading matrix" << std::endl;
+  //eigen_matrix.endFill();
+  std::cout << "Done: reading matrix" << std::endl;
 
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", eigen_rhs))
-  #else
   if (!readVectorFromFile("../examples/testdata/rhs65025.txt", eigen_rhs))
-  #endif
   {
     std::cout << "Error reading RHS file" << std::endl;
     return 0;
   }
-  
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
+
   if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
   {
     std::cout << "Error reading Result file" << std::endl;
     return 0;
   }
-  
+
   //CG solver:
   std::cout << "----- Running CG -----" << std::endl;
   eigen_result = viennacl::linalg::solve(eigen_matrix, eigen_rhs, viennacl::linalg::cg_tag());
-  
+
   residual = eigen_matrix * eigen_result - eigen_rhs;
   std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(eigen_rhs) << std::endl;
 
   //BiCGStab solver:
   std::cout << "----- Running BiCGStab -----" << std::endl;
   eigen_result = viennacl::linalg::solve(eigen_matrix, eigen_rhs, viennacl::linalg::bicgstab_tag());
-  
+
   residual = eigen_matrix * eigen_result - eigen_rhs;
   std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(eigen_rhs) << std::endl;
 
   //GMRES solver:
   std::cout << "----- Running GMRES -----" << std::endl;
   eigen_result = viennacl::linalg::solve(eigen_matrix, eigen_rhs, viennacl::linalg::gmres_tag());
-  
+
   residual = eigen_matrix * eigen_result - eigen_rhs;
   std::cout << "Relative residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(eigen_rhs) << std::endl;
-  
+
 }
 
diff --git a/examples/tutorial/iterative-mtl4.cpp b/examples/tutorial/iterative-mtl4.cpp
index 5a05d4b..306807f 100644
--- a/examples/tutorial/iterative-mtl4.cpp
+++ b/examples/tutorial/iterative-mtl4.cpp
@@ -1,7 +1,8 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
@@ -14,6 +15,12 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial:  Use of the iterative solvers in ViennaCL with MTL4 (http://www.mtl4.org/)
+*   
+*/
+
 //
 // include necessary system headers
 //
@@ -28,7 +35,7 @@
 #include <boost/numeric/itl/itl.hpp>
 
 // Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
-#define VIENNACL_HAVE_MTL4 1
+#define VIENNACL_WITH_MTL4 1
 
 //
 // ViennaCL includes
@@ -44,48 +51,6 @@
 #include "Random.hpp"
 #include "vector-io.hpp"
 
-//
-// The following function is just a hel
-//
-/*template <typename MTLMatrixType>
-void read_system(MTLMatrixType & matrix)
-{
-  typedef typename MTLMatrixType::value_type    value_type;
-  
-  std::vector<std::map<unsigned int, value_type> >  stl_matrix(mtl::num_rows(matrix), mtl::num_cols(matrix));
-  
-  viennacl::tools::sparse_matrix_adapter<value_type> adapted_stl_matrix(stl_matrix);
-  
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(adapted_stl_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(adapted_stl_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-
-  // Now shift to MTL matrix:
-  
-  mtl::matrix::inserter< MTLMatrixType >  ins(matrix);
-  typedef typename mtl::Collection<MTLMatrixType>::value_type  ValueType;
-  
-  typedef viennacl::tools::sparse_matrix_adapter<value_type>::iterator1  Iterator1;
-  for (Iterator1 it = adapted_stl_matrix.begin1();
-                 it != adapted_stl_matrix.end1();
-               ++it)
-  {
-    for (typename Iterator1::iterator it2 = it.begin();
-                                      it2 != it.end();
-                                    ++it2)
-    {
-      ins(it2.index1(), it2.index2() << ValueType(*it);
-    }
-  }
-} */
-
-
 
 int main(int, char *[])
 {
@@ -95,7 +60,7 @@ int main(int, char *[])
   mtl4_matrix.change_dim(65025, 65025);
   set_to_zero(mtl4_matrix);  
   
-  mtl::dense_vector<ScalarType> mtl4_rhs(65025, 0.0);
+  mtl::dense_vector<ScalarType> mtl4_rhs(65025, 1.0);
   mtl::dense_vector<ScalarType> mtl4_result(65025, 0.0);
   mtl::dense_vector<ScalarType> mtl4_ref_result(65025, 0.0);
   mtl::dense_vector<ScalarType> mtl4_residual(65025, 0.0);
@@ -104,22 +69,8 @@ int main(int, char *[])
   // Read system from file
   //
 
-  #ifdef _MSC_VER
-  mtl::io::matrix_market_istream("../../examples/testdata/mat65k.mtx") >> mtl4_matrix;
-  #else
   mtl::io::matrix_market_istream("../examples/testdata/mat65k.mtx") >> mtl4_matrix;
-  #endif
     
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", mtl4_ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", mtl4_ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  
   //
   //CG solver:
   //
diff --git a/examples/tutorial/iterative-ublas.cpp b/examples/tutorial/iterative-ublas.cpp
index 24e52b8..ad6a87e 100644
--- a/examples/tutorial/iterative-ublas.cpp
+++ b/examples/tutorial/iterative-ublas.cpp
@@ -1,156 +1,156 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// Necessary to obtain a suitable performance in ublas
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-//
-// ViennaCL includes
-//
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-/*
-*
-*   Tutorial:  Iterative solvers without OpenCL
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs;
-  ublas::vector<ScalarType> rhs2;
-  ublas::vector<ScalarType> ref_result;
-  ublas::vector<ScalarType> result;
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
-  //
-  // Read system from file
-  //
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading matrix" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-  #else
-  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading rhs" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading result" << std::endl;
-
-  
-  //
-  // set up ILUT preconditioners for ViennaCL and ublas objects. Other preconditioners can also be used (see manual)
-  // 
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  
-  //
-  // Conjugate gradient solver:
-  //
-  std::cout << "----- CG Test -----" << std::endl;
-
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
-
-  
-  //
-  // Stabilized BiConjugate gradient solver:
-  //
-  std::cout << "----- BiCGStab Test -----" << std::endl;
-
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
-  
-  //
-  // GMRES solver:
-  //
-  std::cout << "----- GMRES Test -----" << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
-
-  //
-  //  That's it. 
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Use of the iterative solvers in ViennaCL with Boost.uBLAS
+*
+*/
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// Necessary to obtain a suitable performance in ublas
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/bicgstab.hpp"
+#include "viennacl/linalg/gmres.hpp"
+#include "viennacl/io/matrix_market.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+using namespace boost::numeric;
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs;
+  ublas::vector<ScalarType> rhs2;
+  ublas::vector<ScalarType> ref_result;
+  ublas::vector<ScalarType> result;
+  ublas::compressed_matrix<ScalarType> ublas_matrix;
+
+  //
+  // Read system from file
+  //
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading matrix" << std::endl;
+
+  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading rhs" << std::endl;
+
+  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
+  {
+    std::cout << "Error reading Result file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading result" << std::endl;
+
+
+  //
+  // set up ILUT preconditioners for ViennaCL and ublas objects. Other preconditioners can also be used (see manual)
+  //
+  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+
+  //
+  // Conjugate gradient solver:
+  //
+  std::cout << "----- CG Test -----" << std::endl;
+
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
+  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
+  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilu0);
+  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_block_ilu0);
+  std::cout << "Residual norm: " << norm_2(prod(ublas_matrix, result) - rhs) << std::endl;
+
+  //
+  // Stabilized BiConjugate gradient solver:
+  //
+  std::cout << "----- BiCGStab Test -----" << std::endl;
+
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilu0); //with preconditioner
+
+  //
+  // GMRES solver:
+  //
+  std::cout << "----- GMRES Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilu0);//with preconditioner
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/iterative.cpp b/examples/tutorial/iterative.cpp
index 6b2ed06..1efde9d 100644
--- a/examples/tutorial/iterative.cpp
+++ b/examples/tutorial/iterative.cpp
@@ -1,237 +1,235 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// Necessary to obtain a suitable performance in ublas
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-
-/*
-*
-*   Tutorial:  Iterative solvers
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs;
-  ublas::vector<ScalarType> rhs2;
-  ublas::vector<ScalarType> ref_result;
-  ublas::vector<ScalarType> result;
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
-  //
-  // Read system from file
-  //
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading matrix" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-  #else
-  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading rhs" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading result" << std::endl;
-
-  //
-  // Set up some ViennaCL objects
-  //
-  size_t vcl_size = rhs.size();
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
-  viennacl::vector<ScalarType> vcl_rhs(vcl_size); 
-  viennacl::vector<ScalarType> vcl_result(vcl_size);
-  viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
-  
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
-  
-  
-  //
-  // Transfer ublas-matrix to GPU:
-  //
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  
-  //
-  // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
-  //
-  std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
-  for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
-       iter1 != ublas_matrix.end1();
-       ++iter1)
-  {
-    for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
-         iter2 != iter1.end();
-         ++iter2)
-         stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
-  }
-  viennacl::copy(stl_matrix, vcl_coordinate_matrix);
-  viennacl::copy(vcl_coordinate_matrix, stl_matrix);
-  
-  //
-  // set up ILUT preconditioners for ViennaCL and ublas objects:
-  // 
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
-
-  //
-  // set up Jacobi preconditioners for ViennaCL and ublas objects:
-  // 
-  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
-  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
-  
-  //
-  // Conjugate gradient solver:
-  //
-  std::cout << "----- CG Test -----" << std::endl;
-  
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
-
-  
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
-  
-  //
-  // Stabilized BiConjugate gradient solver:
-  //
-  std::cout << "----- BiCGStab Test -----" << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
-
-  
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
-  
-  //
-  // GMRES solver:
-  //
-  std::cout << "----- GMRES Test -----" << std::endl;
-  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs when using Stream SDK v2.1." << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
-
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
-
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Iterative solvers in ViennaCL (iterative.cpp and iterative.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// Necessary to obtain a suitable performance in ublas
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/jacobi_precond.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/bicgstab.hpp"
+#include "viennacl/linalg/gmres.hpp"
+#include "viennacl/io/matrix_market.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+
+using namespace boost::numeric;
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs;
+  ublas::vector<ScalarType> rhs2;
+  ublas::vector<ScalarType> ref_result;
+  ublas::vector<ScalarType> result;
+  ublas::compressed_matrix<ScalarType> ublas_matrix;
+
+  //
+  // Read system from file
+  //
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading matrix" << std::endl;
+
+  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading rhs" << std::endl;
+
+  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
+  {
+    std::cout << "Error reading Result file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading result" << std::endl;
+
+  //
+  // Set up some ViennaCL objects
+  //
+  std::size_t vcl_size = rhs.size();
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
+  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
+  viennacl::vector<ScalarType> vcl_rhs(vcl_size);
+  viennacl::vector<ScalarType> vcl_result(vcl_size);
+  viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
+
+
+  //
+  // Transfer ublas-matrix to GPU:
+  //
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+
+  //
+  // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
+  //
+  std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
+  for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
+       iter1 != ublas_matrix.end1();
+       ++iter1)
+  {
+    for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
+         iter2 != iter1.end();
+         ++iter2)
+         stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
+  }
+  viennacl::copy(stl_matrix, vcl_coordinate_matrix);
+  viennacl::copy(vcl_coordinate_matrix, stl_matrix);
+
+  //
+  // set up ILUT preconditioners for ublas and ViennaCL objects:
+  //
+  std::cout << "Setting up preconditioners for uBLAS-matrix..." << std::endl;
+  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+
+  std::cout << "Setting up preconditioners for ViennaCL-matrix..." << std::endl;
+  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+
+  //
+  // set up Jacobi preconditioners for ViennaCL and ublas objects:
+  //
+  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
+  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
+
+  //
+  // Conjugate gradient solver:
+  //
+  std::cout << "----- CG Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
+
+
+  //
+  // for ViennaCL objects:
+  //
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
+
+  //
+  // Stabilized BiConjugate gradient solver:
+  //
+  std::cout << "----- BiCGStab Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
+
+
+  //
+  // for ViennaCL objects:
+  //
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
+
+  //
+  // GMRES solver:
+  //
+  std::cout << "----- GMRES Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
+
+  //
+  // for ViennaCL objects:
+  //
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/iterative.cpp b/examples/tutorial/iterative.cu
similarity index 84%
copy from examples/tutorial/iterative.cpp
copy to examples/tutorial/iterative.cu
index 6b2ed06..1efde9d 100644
--- a/examples/tutorial/iterative.cpp
+++ b/examples/tutorial/iterative.cu
@@ -1,237 +1,235 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-//
-// Necessary to obtain a suitable performance in ublas
-#ifndef NDEBUG
- #define NDEBUG
-#endif
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-
-/*
-*
-*   Tutorial:  Iterative solvers
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs;
-  ublas::vector<ScalarType> rhs2;
-  ublas::vector<ScalarType> ref_result;
-  ublas::vector<ScalarType> result;
-  ublas::compressed_matrix<ScalarType> ublas_matrix;
-  
-  //
-  // Read system from file
-  //
-  #ifdef _MSC_VER
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-  #else
-  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
-  #endif
-  {
-    std::cout << "Error reading Matrix file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading matrix" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-  #else
-  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
-  #endif
-  {
-    std::cout << "Error reading RHS file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading rhs" << std::endl;
-
-  #ifdef _MSC_VER
-  if (!readVectorFromFile("../../examples/testdata/result65025.txt", ref_result))
-  #else
-  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
-  #endif
-  {
-    std::cout << "Error reading Result file" << std::endl;
-    return 0;
-  }
-  //std::cout << "done reading result" << std::endl;
-
-  //
-  // Set up some ViennaCL objects
-  //
-  size_t vcl_size = rhs.size();
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
-  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
-  viennacl::vector<ScalarType> vcl_rhs(vcl_size); 
-  viennacl::vector<ScalarType> vcl_result(vcl_size);
-  viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
-  
-  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-  viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
-  
-  
-  //
-  // Transfer ublas-matrix to GPU:
-  //
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  
-  //
-  // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
-  //
-  std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
-  for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
-       iter1 != ublas_matrix.end1();
-       ++iter1)
-  {
-    for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
-         iter2 != iter1.end();
-         ++iter2)
-         stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
-  }
-  viennacl::copy(stl_matrix, vcl_coordinate_matrix);
-  viennacl::copy(vcl_coordinate_matrix, stl_matrix);
-  
-  //
-  // set up ILUT preconditioners for ViennaCL and ublas objects:
-  // 
-  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
-  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
-
-  //
-  // set up Jacobi preconditioners for ViennaCL and ublas objects:
-  // 
-  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
-  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
-  
-  //
-  // Conjugate gradient solver:
-  //
-  std::cout << "----- CG Test -----" << std::endl;
-  
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
-
-  
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
-  
-  //
-  // Stabilized BiConjugate gradient solver:
-  //
-  std::cout << "----- BiCGStab Test -----" << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
-
-  
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
-  
-  //
-  // GMRES solver:
-  //
-  std::cout << "----- GMRES Test -----" << std::endl;
-  std::cout << " ATTENTION: Please be aware that GMRES may not work on ATI GPUs when using Stream SDK v2.1." << std::endl;
-
-  //
-  // for ublas objects:
-  //
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
-  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
-
-  //
-  // for ViennaCL objects:
-  //
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
-  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
-
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Iterative solvers in ViennaCL (iterative.cpp and iterative.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// Necessary to obtain a suitable performance in ublas
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/jacobi_precond.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/bicgstab.hpp"
+#include "viennacl/linalg/gmres.hpp"
+#include "viennacl/io/matrix_market.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+
+using namespace boost::numeric;
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs;
+  ublas::vector<ScalarType> rhs2;
+  ublas::vector<ScalarType> ref_result;
+  ublas::vector<ScalarType> result;
+  ublas::compressed_matrix<ScalarType> ublas_matrix;
+
+  //
+  // Read system from file
+  //
+  if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading matrix" << std::endl;
+
+  if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
+  {
+    std::cout << "Error reading RHS file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading rhs" << std::endl;
+
+  if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
+  {
+    std::cout << "Error reading Result file" << std::endl;
+    return 0;
+  }
+  //std::cout << "done reading result" << std::endl;
+
+  //
+  // Set up some ViennaCL objects
+  //
+  std::size_t vcl_size = rhs.size();
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix;
+  viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix;
+  viennacl::vector<ScalarType> vcl_rhs(vcl_size);
+  viennacl::vector<ScalarType> vcl_result(vcl_size);
+  viennacl::vector<ScalarType> vcl_ref_result(vcl_size);
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
+
+
+  //
+  // Transfer ublas-matrix to GPU:
+  //
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+
+  //
+  // alternative way: via STL. Sparse matrix as std::vector< std::map< unsigned int, ScalarType> >
+  //
+  std::vector< std::map< unsigned int, ScalarType> > stl_matrix(rhs.size());
+  for (ublas::compressed_matrix<ScalarType>::iterator1 iter1 = ublas_matrix.begin1();
+       iter1 != ublas_matrix.end1();
+       ++iter1)
+  {
+    for (ublas::compressed_matrix<ScalarType>::iterator2 iter2 = iter1.begin();
+         iter2 != iter1.end();
+         ++iter2)
+         stl_matrix[iter2.index1()][static_cast<unsigned int>(iter2.index2())] = *iter2;
+  }
+  viennacl::copy(stl_matrix, vcl_coordinate_matrix);
+  viennacl::copy(vcl_coordinate_matrix, stl_matrix);
+
+  //
+  // set up ILUT preconditioners for ublas and ViennaCL objects:
+  //
+  std::cout << "Setting up preconditioners for uBLAS-matrix..." << std::endl;
+  viennacl::linalg::ilut_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilut(ublas_matrix, viennacl::linalg::ilut_tag());
+  viennacl::linalg::ilu0_precond< ublas::compressed_matrix<ScalarType> >    ublas_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+  viennacl::linalg::block_ilu_precond< ublas::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          ublas_block_ilu0(ublas_matrix, viennacl::linalg::ilu0_tag());
+
+  std::cout << "Setting up preconditioners for ViennaCL-matrix..." << std::endl;
+  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilut(vcl_compressed_matrix, viennacl::linalg::ilut_tag());
+  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<ScalarType> > vcl_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+  viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<ScalarType>,
+                                       viennacl::linalg::ilu0_tag>          vcl_block_ilu0(vcl_compressed_matrix, viennacl::linalg::ilu0_tag());
+
+  //
+  // set up Jacobi preconditioners for ViennaCL and ublas objects:
+  //
+  viennacl::linalg::jacobi_precond< ublas::compressed_matrix<ScalarType> >    ublas_jacobi(ublas_matrix, viennacl::linalg::jacobi_tag());
+  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<ScalarType> > vcl_jacobi(vcl_compressed_matrix, viennacl::linalg::jacobi_tag());
+
+  //
+  // Conjugate gradient solver:
+  //
+  std::cout << "----- CG Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag());
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_ilut);
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::cg_tag(1e-6, 20), ublas_jacobi);
+
+
+  //
+  // for ViennaCL objects:
+  //
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_ilut);
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag(1e-6, 20), vcl_jacobi);
+
+  //
+  // Stabilized BiConjugate gradient solver:
+  //
+  std::cout << "----- BiCGStab Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag());          //without preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_ilut); //with preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), ublas_jacobi); //with preconditioner
+
+
+  //
+  // for ViennaCL objects:
+  //
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag());   //without preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_ilut); //with preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::bicgstab_tag(1e-6, 20), vcl_jacobi); //with preconditioner
+
+  //
+  // GMRES solver:
+  //
+  std::cout << "----- GMRES Test -----" << std::endl;
+
+  //
+  // for ublas objects:
+  //
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_ilut);//with preconditioner
+  result = viennacl::linalg::solve(ublas_matrix, rhs, viennacl::linalg::gmres_tag(1e-6, 20), ublas_jacobi);//with preconditioner
+
+  //
+  // for ViennaCL objects:
+  //
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag());   //without preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_ilut);//with preconditioner
+  vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::gmres_tag(1e-6, 20), vcl_jacobi);//with preconditioner
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return 0;
+}
+
diff --git a/examples/tutorial/lanczos.cpp b/examples/tutorial/lanczos.cpp
new file mode 100644
index 0000000..9ac94e9
--- /dev/null
+++ b/examples/tutorial/lanczos.cpp
@@ -0,0 +1,85 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Calculation of eigenvalues using Lanczos' method (lanczos.cpp and lanczos.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+// include necessary system headers
+#include <iostream>
+
+#ifndef NDEBUG
+  #define NDEBUG
+#endif
+
+#define VIENNACL_WITH_UBLAS
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+
+#include "viennacl/linalg/lanczos.hpp"
+#include "viennacl/io/matrix_market.hpp"
+// Some helper functions for this tutorial:
+#include <iostream>
+#include <fstream>
+#include <limits>
+#include <string>
+#include <iomanip>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/matrix_expression.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/vector_expression.hpp>
+
+
+
+template <typename MatrixType>
+std::vector<double> initEig(MatrixType const & A)
+{
+  viennacl::linalg::lanczos_tag ltag(0.75, 10, viennacl::linalg::lanczos_tag::partial_reorthogonalization, 1700);
+  std::vector<double> lanczos_eigenvalues = viennacl::linalg::eig(A, ltag);
+  for(std::size_t i = 0; i< lanczos_eigenvalues.size(); i++){
+          std::cout << "Eigenvalue " << i+1 << ": " << std::setprecision(10) << lanczos_eigenvalues[i] << std::endl;
+  }
+
+  return lanczos_eigenvalues;
+}
+
+
+int main()
+{
+  typedef double     ScalarType;
+
+  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
+
+  if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+
+  std::cout << "Running Lanczos algorithm (this might take a while)..." << std::endl;
+  std::vector<double> eigenvalues = initEig(ublas_A);
+}
+
diff --git a/examples/tutorial/lanczos.cu b/examples/tutorial/lanczos.cu
new file mode 100644
index 0000000..9ac94e9
--- /dev/null
+++ b/examples/tutorial/lanczos.cu
@@ -0,0 +1,85 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Calculation of eigenvalues using Lanczos' method (lanczos.cpp and lanczos.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+// include necessary system headers
+#include <iostream>
+
+#ifndef NDEBUG
+  #define NDEBUG
+#endif
+
+#define VIENNACL_WITH_UBLAS
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+
+#include "viennacl/linalg/lanczos.hpp"
+#include "viennacl/io/matrix_market.hpp"
+// Some helper functions for this tutorial:
+#include <iostream>
+#include <fstream>
+#include <limits>
+#include <string>
+#include <iomanip>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/matrix_expression.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/vector_expression.hpp>
+
+
+
+template <typename MatrixType>
+std::vector<double> initEig(MatrixType const & A)
+{
+  viennacl::linalg::lanczos_tag ltag(0.75, 10, viennacl::linalg::lanczos_tag::partial_reorthogonalization, 1700);
+  std::vector<double> lanczos_eigenvalues = viennacl::linalg::eig(A, ltag);
+  for(std::size_t i = 0; i< lanczos_eigenvalues.size(); i++){
+          std::cout << "Eigenvalue " << i+1 << ": " << std::setprecision(10) << lanczos_eigenvalues[i] << std::endl;
+  }
+
+  return lanczos_eigenvalues;
+}
+
+
+int main()
+{
+  typedef double     ScalarType;
+
+  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
+
+  if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+
+  std::cout << "Running Lanczos algorithm (this might take a while)..." << std::endl;
+  std::vector<double> eigenvalues = initEig(ublas_A);
+}
+
diff --git a/examples/tutorial/least-squares.cpp b/examples/tutorial/least-squares.cpp
new file mode 100644
index 0000000..9216a96
--- /dev/null
+++ b/examples/tutorial/least-squares.cpp
@@ -0,0 +1,144 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Least Squares problem for matrices from ViennaCL or Boost.uBLAS (least-squares.cpp and least-squares.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*   See Example 2 at http://tutorial.math.lamar.edu/Classes/LinAlg/QRDecomposition.aspx for a reference solution.
+*
+*/
+
+// activate ublas support in ViennaCL
+#define VIENNACL_WITH_UBLAS
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// Boost includes
+//
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/qr.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+
+/*
+*   Tutorial: Least Squares problem for matrices from ViennaCL or Boost.uBLAS
+*
+*   See Example 2 at http://tutorial.math.lamar.edu/Classes/LinAlg/QRDecomposition.aspx for a reference solution.
+*/
+
+int main (int, const char **)
+{
+  typedef float               ScalarType;     //feel free to change this to 'double' if supported by your hardware
+  typedef boost::numeric::ublas::matrix<ScalarType>              MatrixType;
+  typedef boost::numeric::ublas::vector<ScalarType>              VectorType;
+  typedef viennacl::matrix<ScalarType, viennacl::column_major>   VCLMatrixType;
+  typedef viennacl::vector<ScalarType>                           VCLVectorType;
+
+  //
+  // Create vectors and matrices with data, cf. http://tutorial.math.lamar.edu/Classes/LinAlg/QRDecomposition.aspx
+  //
+  VectorType ublas_b(4);
+  ublas_b(0) = -4;
+  ublas_b(1) =  2;
+  ublas_b(2) =  5;
+  ublas_b(3) = -1;
+
+  MatrixType ublas_A(4, 3);
+  MatrixType Q = boost::numeric::ublas::zero_matrix<ScalarType>(4, 4);
+  MatrixType R = boost::numeric::ublas::zero_matrix<ScalarType>(4, 3);
+
+  ublas_A(0, 0) =  2; ublas_A(0, 1) = -1; ublas_A(0, 2) =  1;
+  ublas_A(1, 0) =  1; ublas_A(1, 1) = -5; ublas_A(1, 2) =  2;
+  ublas_A(2, 0) = -3; ublas_A(2, 1) =  1; ublas_A(2, 2) = -4;
+  ublas_A(3, 0) =  1; ublas_A(3, 1) = -1; ublas_A(3, 2) =  1;
+
+  //
+  // Setup the matrix in ViennaCL:
+  //
+  VCLVectorType vcl_b(ublas_b.size());
+  VCLMatrixType vcl_A(ublas_A.size1(), ublas_A.size2());
+
+  viennacl::copy(ublas_b, vcl_b);
+  viennacl::copy(ublas_A, vcl_A);
+
+
+
+  //////////// Part 1: Use Boost.uBLAS for all computations ////////////////
+
+  std::cout << "--- Boost.uBLAS ---" << std::endl;
+  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A);  //computes the QR factorization
+
+  // compute modified RHS of the minimization problem:
+  // b' := Q^T b
+  viennacl::linalg::inplace_qr_apply_trans_Q(ublas_A, ublas_betas, ublas_b);
+
+  // Final step: triangular solve: Rx = b'', where b'' are the first three entries in b'
+  // We only need the upper left square part of A, which defines the upper triangular matrix R
+  boost::numeric::ublas::range ublas_range(0, 3);
+  boost::numeric::ublas::matrix_range<MatrixType> ublas_R(ublas_A, ublas_range, ublas_range);
+  boost::numeric::ublas::vector_range<VectorType> ublas_b2(ublas_b, ublas_range);
+  boost::numeric::ublas::inplace_solve(ublas_R, ublas_b2, boost::numeric::ublas::upper_tag());
+
+  std::cout << "Result: " << ublas_b2 << std::endl;
+
+  //////////// Part 2: Use ViennaCL types for BLAS 3 computations, but use Boost.uBLAS for the panel factorization ////////////////
+
+  std::cout << "--- ViennaCL (hybrid implementation)  ---" << std::endl;
+  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A);
+
+  // compute modified RHS of the minimization problem:
+  // b := Q^T b
+  viennacl::linalg::inplace_qr_apply_trans_Q(vcl_A, hybrid_betas, vcl_b);
+
+  // Final step: triangular solve: Rx = b'.
+  // We only need the upper part of A such that R is a square matrix
+  viennacl::range vcl_range(0, 3);
+  viennacl::matrix_range<VCLMatrixType> vcl_R(vcl_A, vcl_range, vcl_range);
+  viennacl::vector_range<VCLVectorType> vcl_b2(vcl_b, vcl_range);
+  viennacl::linalg::inplace_solve(vcl_R, vcl_b2, viennacl::linalg::upper_tag());
+
+  std::cout << "Result: " << vcl_b2 << std::endl;
+
+
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/least-squares.cu b/examples/tutorial/least-squares.cu
new file mode 100644
index 0000000..9216a96
--- /dev/null
+++ b/examples/tutorial/least-squares.cu
@@ -0,0 +1,144 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Least Squares problem for matrices from ViennaCL or Boost.uBLAS (least-squares.cpp and least-squares.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*   See Example 2 at http://tutorial.math.lamar.edu/Classes/LinAlg/QRDecomposition.aspx for a reference solution.
+*
+*/
+
+// activate ublas support in ViennaCL
+#define VIENNACL_WITH_UBLAS
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+//
+// Boost includes
+//
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/qr.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+
+/*
+*   Tutorial: Least Squares problem for matrices from ViennaCL or Boost.uBLAS
+*
+*   See Example 2 at http://tutorial.math.lamar.edu/Classes/LinAlg/QRDecomposition.aspx for a reference solution.
+*/
+
+int main (int, const char **)
+{
+  typedef float               ScalarType;     //feel free to change this to 'double' if supported by your hardware
+  typedef boost::numeric::ublas::matrix<ScalarType>              MatrixType;
+  typedef boost::numeric::ublas::vector<ScalarType>              VectorType;
+  typedef viennacl::matrix<ScalarType, viennacl::column_major>   VCLMatrixType;
+  typedef viennacl::vector<ScalarType>                           VCLVectorType;
+
+  //
+  // Create vectors and matrices with data, cf. http://tutorial.math.lamar.edu/Classes/LinAlg/QRDecomposition.aspx
+  //
+  VectorType ublas_b(4);
+  ublas_b(0) = -4;
+  ublas_b(1) =  2;
+  ublas_b(2) =  5;
+  ublas_b(3) = -1;
+
+  MatrixType ublas_A(4, 3);
+  MatrixType Q = boost::numeric::ublas::zero_matrix<ScalarType>(4, 4);
+  MatrixType R = boost::numeric::ublas::zero_matrix<ScalarType>(4, 3);
+
+  ublas_A(0, 0) =  2; ublas_A(0, 1) = -1; ublas_A(0, 2) =  1;
+  ublas_A(1, 0) =  1; ublas_A(1, 1) = -5; ublas_A(1, 2) =  2;
+  ublas_A(2, 0) = -3; ublas_A(2, 1) =  1; ublas_A(2, 2) = -4;
+  ublas_A(3, 0) =  1; ublas_A(3, 1) = -1; ublas_A(3, 2) =  1;
+
+  //
+  // Setup the matrix in ViennaCL:
+  //
+  VCLVectorType vcl_b(ublas_b.size());
+  VCLMatrixType vcl_A(ublas_A.size1(), ublas_A.size2());
+
+  viennacl::copy(ublas_b, vcl_b);
+  viennacl::copy(ublas_A, vcl_A);
+
+
+
+  //////////// Part 1: Use Boost.uBLAS for all computations ////////////////
+
+  std::cout << "--- Boost.uBLAS ---" << std::endl;
+  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A);  //computes the QR factorization
+
+  // compute modified RHS of the minimization problem:
+  // b' := Q^T b
+  viennacl::linalg::inplace_qr_apply_trans_Q(ublas_A, ublas_betas, ublas_b);
+
+  // Final step: triangular solve: Rx = b'', where b'' are the first three entries in b'
+  // We only need the upper left square part of A, which defines the upper triangular matrix R
+  boost::numeric::ublas::range ublas_range(0, 3);
+  boost::numeric::ublas::matrix_range<MatrixType> ublas_R(ublas_A, ublas_range, ublas_range);
+  boost::numeric::ublas::vector_range<VectorType> ublas_b2(ublas_b, ublas_range);
+  boost::numeric::ublas::inplace_solve(ublas_R, ublas_b2, boost::numeric::ublas::upper_tag());
+
+  std::cout << "Result: " << ublas_b2 << std::endl;
+
+  //////////// Part 2: Use ViennaCL types for BLAS 3 computations, but use Boost.uBLAS for the panel factorization ////////////////
+
+  std::cout << "--- ViennaCL (hybrid implementation)  ---" << std::endl;
+  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A);
+
+  // compute modified RHS of the minimization problem:
+  // b := Q^T b
+  viennacl::linalg::inplace_qr_apply_trans_Q(vcl_A, hybrid_betas, vcl_b);
+
+  // Final step: triangular solve: Rx = b'.
+  // We only need the upper part of A such that R is a square matrix
+  viennacl::range vcl_range(0, 3);
+  viennacl::matrix_range<VCLMatrixType> vcl_R(vcl_A, vcl_range, vcl_range);
+  viennacl::vector_range<VCLVectorType> vcl_b2(vcl_b, vcl_range);
+  viennacl::linalg::inplace_solve(vcl_R, vcl_b2, viennacl::linalg::upper_tag());
+
+  std::cout << "Result: " << vcl_b2 << std::endl;
+
+
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/libviennacl.cpp b/examples/tutorial/libviennacl.cpp
new file mode 100644
index 0000000..c738b1d
--- /dev/null
+++ b/examples/tutorial/libviennacl.cpp
@@ -0,0 +1,105 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Using the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "viennacl/vector.hpp"
+
+int main()
+{
+  std::size_t size = 10;
+
+  ViennaCLInt half_size = static_cast<ViennaCLInt>(size / 2);
+
+  //
+  // Part 1: Host-based execution
+  //
+
+  viennacl::vector<float> host_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
+  viennacl::vector<float> host_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));
+
+  // Create backend:
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  ViennaCLHostSswap(my_backend, half_size,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_x), 1, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_y), 0, 2);
+
+  std::cout << " --- Host ---" << std::endl;
+  std::cout << "host_x: " << host_x << std::endl;
+  std::cout << "host_y: " << host_y << std::endl;
+
+  //
+  // Part 2: CUDA-based execution
+  //
+
+#ifdef VIENNACL_WITH_CUDA
+  viennacl::vector<float> cuda_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
+  viennacl::vector<float> cuda_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));
+
+  ViennaCLCUDASswap(my_backend, half_size,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_x), 0, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_y), 1, 2);
+
+  std::cout << " --- CUDA ---" << std::endl;
+  std::cout << "cuda_x: " << cuda_x << std::endl;
+  std::cout << "cuda_y: " << cuda_y << std::endl;
+#endif
+
+  //
+  // Part 3: OpenCL-based execution
+  //
+
+#ifdef VIENNACL_WITH_OPENCL
+  long context_id = 0;
+  viennacl::vector<float> opencl_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)));
+  viennacl::vector<float> opencl_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)));
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+
+  ViennaCLOpenCLSswap(my_backend, half_size,
+                      viennacl::traits::opencl_handle(opencl_x).get(), 1, 2,
+                      viennacl::traits::opencl_handle(opencl_y).get(), 1, 2);
+
+  std::cout << " --- OpenCL ---" << std::endl;
+  std::cout << "opencl_x: " << opencl_x << std::endl;
+  std::cout << "opencl_y: " << opencl_y << std::endl;
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/libviennacl.cu b/examples/tutorial/libviennacl.cu
new file mode 100644
index 0000000..c738b1d
--- /dev/null
+++ b/examples/tutorial/libviennacl.cu
@@ -0,0 +1,105 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Using the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "viennacl/vector.hpp"
+
+int main()
+{
+  std::size_t size = 10;
+
+  ViennaCLInt half_size = static_cast<ViennaCLInt>(size / 2);
+
+  //
+  // Part 1: Host-based execution
+  //
+
+  viennacl::vector<float> host_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
+  viennacl::vector<float> host_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));
+
+  // Create backend:
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  ViennaCLHostSswap(my_backend, half_size,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_x), 1, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_y), 0, 2);
+
+  std::cout << " --- Host ---" << std::endl;
+  std::cout << "host_x: " << host_x << std::endl;
+  std::cout << "host_y: " << host_y << std::endl;
+
+  //
+  // Part 2: CUDA-based execution
+  //
+
+#ifdef VIENNACL_WITH_CUDA
+  viennacl::vector<float> cuda_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
+  viennacl::vector<float> cuda_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));
+
+  ViennaCLCUDASswap(my_backend, half_size,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_x), 0, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_y), 1, 2);
+
+  std::cout << " --- CUDA ---" << std::endl;
+  std::cout << "cuda_x: " << cuda_x << std::endl;
+  std::cout << "cuda_y: " << cuda_y << std::endl;
+#endif
+
+  //
+  // Part 3: OpenCL-based execution
+  //
+
+#ifdef VIENNACL_WITH_OPENCL
+  long context_id = 0;
+  viennacl::vector<float> opencl_x = viennacl::scalar_vector<float>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)));
+  viennacl::vector<float> opencl_y = viennacl::scalar_vector<float>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)));
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+
+  ViennaCLOpenCLSswap(my_backend, half_size,
+                      viennacl::traits::opencl_handle(opencl_x).get(), 1, 2,
+                      viennacl::traits::opencl_handle(opencl_y).get(), 1, 2);
+
+  std::cout << " --- OpenCL ---" << std::endl;
+  std::cout << "opencl_x: " << opencl_x << std::endl;
+  std::cout << "opencl_y: " << opencl_y << std::endl;
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/matrix-range.cpp b/examples/tutorial/matrix-range.cpp
index 51493dc..f101693 100644
--- a/examples/tutorial/matrix-range.cpp
+++ b/examples/tutorial/matrix-range.cpp
@@ -1,21 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: Explains the use of matrix ranges with simple BLAS level 1 and 2 operations.
+*             (matrix-range.cpp and matrix-range.cu are identical, the latter being required for compilation using CUDA nvcc)
+*/
+
+
 // activate ublas support in ViennaCL
-#define VIENNACL_HAVE_UBLAS
+#define VIENNACL_WITH_UBLAS
 
 //
 // include necessary system headers
@@ -39,20 +47,16 @@
 #include "boost/numeric/ublas/matrix_proxy.hpp"
 #include "boost/numeric/ublas/io.hpp"
 
-/*
-*   Tutorial: Use of matrix ranges
-*/
-
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
   typedef float                                           ScalarType;    //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
-  
+
   typedef viennacl::matrix<ScalarType, viennacl::row_major>    VCLMatrixType;
-  
+
   std::size_t dim_large = 5;
   std::size_t dim_small = 3;
-  
+
   //
   // Setup ublas objects and fill with data:
   //
@@ -60,24 +64,24 @@ int main (int argc, const char * argv[])
   MatrixType ublas_B(dim_small, dim_small);
   MatrixType ublas_C(dim_large, dim_small);
   MatrixType ublas_D(dim_small, dim_large);
-  
-  
+
+
   for (std::size_t i=0; i<ublas_A.size1(); ++i)
     for (std::size_t j=0; j<ublas_A.size2(); ++j)
-      ublas_A(i,j) = (i+1) + (j+1)*(i+1);
+      ublas_A(i,j) = static_cast<ScalarType>((i+1) + (j+1)*(i+1));
 
   for (std::size_t i=0; i<ublas_B.size1(); ++i)
     for (std::size_t j=0; j<ublas_B.size2(); ++j)
-      ublas_B(i,j) = (i+1) + (j+1)*(i+1);
+      ublas_B(i,j) = static_cast<ScalarType>((i+1) + (j+1)*(i+1));
 
   for (std::size_t i=0; i<ublas_C.size1(); ++i)
     for (std::size_t j=0; j<ublas_C.size2(); ++j)
-      ublas_C(i,j) = (j+2) + (j+1)*(i+1);
+      ublas_C(i,j) = static_cast<ScalarType>((j+2) + (j+1)*(i+1));
 
   for (std::size_t i=0; i<ublas_D.size1(); ++i)
     for (std::size_t j=0; j<ublas_D.size2(); ++j)
-      ublas_D(i,j) = (j+2) + (j+1)*(i+1);
-  
+      ublas_D(i,j) = static_cast<ScalarType>((j+2) + (j+1)*(i+1));
+
   //
   // Extract submatrices using the ranges in ublas
   //
@@ -96,12 +100,12 @@ int main (int argc, const char * argv[])
   VCLMatrixType vcl_B(dim_small, dim_small);
   VCLMatrixType vcl_C(dim_large, dim_small);
   VCLMatrixType vcl_D(dim_small, dim_large);
-  
+
   viennacl::copy(ublas_A, vcl_A);
   viennacl::copy(ublas_B, vcl_B);
   viennacl::copy(ublas_C, vcl_C);
   viennacl::copy(ublas_D, vcl_D);
-  
+
   //
   // Extract submatrices using the ranges in ViennaCL
   //
@@ -109,22 +113,22 @@ int main (int argc, const char * argv[])
   viennacl::range vcl_r2(dim_large - dim_small, dim_large); //the last 'dim_small' entries
   viennacl::matrix_range<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_r1, vcl_r1); //upper left part of A
   viennacl::matrix_range<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_r2, vcl_r2); //lower right part of A
-  
+
   viennacl::matrix_range<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_r1, vcl_r1); //upper left part of C
   viennacl::matrix_range<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_r1, vcl_r1); //upper left part of D
 
   //
   // Copy from ublas to submatrices and back:
   //
-  
+
   ublas_A_sub1 = ublas_B;
   viennacl::copy(ublas_B, vcl_A_sub1);
   viennacl::copy(vcl_A_sub1, ublas_B);
-  
+
   //
   // Addition:
   //
-  
+
   // range to range:
   ublas_A_sub2 += ublas_A_sub2;
   vcl_A_sub2 += vcl_A_sub2;
@@ -133,7 +137,7 @@ int main (int argc, const char * argv[])
   ublas_B += ublas_A_sub2;
   vcl_B += vcl_A_sub2;
 
-  
+
   //
   // use matrix range with matrix-matrix product:
   //
@@ -145,7 +149,7 @@ int main (int argc, const char * argv[])
   //
   std::cout << "Result ublas:    " << ublas_A << std::endl;
   std::cout << "Result ViennaCL: " << vcl_A << std::endl;
-  
+
   //
   //  That's it.
   //
diff --git a/examples/tutorial/matrix-range.cpp b/examples/tutorial/matrix-range.cu
similarity index 87%
copy from examples/tutorial/matrix-range.cpp
copy to examples/tutorial/matrix-range.cu
index 51493dc..f101693 100644
--- a/examples/tutorial/matrix-range.cpp
+++ b/examples/tutorial/matrix-range.cu
@@ -1,21 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: Explains the use of matrix ranges with simple BLAS level 1 and 2 operations.
+*             (matrix-range.cpp and matrix-range.cu are identical, the latter being required for compilation using CUDA nvcc)
+*/
+
+
 // activate ublas support in ViennaCL
-#define VIENNACL_HAVE_UBLAS
+#define VIENNACL_WITH_UBLAS
 
 //
 // include necessary system headers
@@ -39,20 +47,16 @@
 #include "boost/numeric/ublas/matrix_proxy.hpp"
 #include "boost/numeric/ublas/io.hpp"
 
-/*
-*   Tutorial: Use of matrix ranges
-*/
-
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
   typedef float                                           ScalarType;    //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
-  
+
   typedef viennacl::matrix<ScalarType, viennacl::row_major>    VCLMatrixType;
-  
+
   std::size_t dim_large = 5;
   std::size_t dim_small = 3;
-  
+
   //
   // Setup ublas objects and fill with data:
   //
@@ -60,24 +64,24 @@ int main (int argc, const char * argv[])
   MatrixType ublas_B(dim_small, dim_small);
   MatrixType ublas_C(dim_large, dim_small);
   MatrixType ublas_D(dim_small, dim_large);
-  
-  
+
+
   for (std::size_t i=0; i<ublas_A.size1(); ++i)
     for (std::size_t j=0; j<ublas_A.size2(); ++j)
-      ublas_A(i,j) = (i+1) + (j+1)*(i+1);
+      ublas_A(i,j) = static_cast<ScalarType>((i+1) + (j+1)*(i+1));
 
   for (std::size_t i=0; i<ublas_B.size1(); ++i)
     for (std::size_t j=0; j<ublas_B.size2(); ++j)
-      ublas_B(i,j) = (i+1) + (j+1)*(i+1);
+      ublas_B(i,j) = static_cast<ScalarType>((i+1) + (j+1)*(i+1));
 
   for (std::size_t i=0; i<ublas_C.size1(); ++i)
     for (std::size_t j=0; j<ublas_C.size2(); ++j)
-      ublas_C(i,j) = (j+2) + (j+1)*(i+1);
+      ublas_C(i,j) = static_cast<ScalarType>((j+2) + (j+1)*(i+1));
 
   for (std::size_t i=0; i<ublas_D.size1(); ++i)
     for (std::size_t j=0; j<ublas_D.size2(); ++j)
-      ublas_D(i,j) = (j+2) + (j+1)*(i+1);
-  
+      ublas_D(i,j) = static_cast<ScalarType>((j+2) + (j+1)*(i+1));
+
   //
   // Extract submatrices using the ranges in ublas
   //
@@ -96,12 +100,12 @@ int main (int argc, const char * argv[])
   VCLMatrixType vcl_B(dim_small, dim_small);
   VCLMatrixType vcl_C(dim_large, dim_small);
   VCLMatrixType vcl_D(dim_small, dim_large);
-  
+
   viennacl::copy(ublas_A, vcl_A);
   viennacl::copy(ublas_B, vcl_B);
   viennacl::copy(ublas_C, vcl_C);
   viennacl::copy(ublas_D, vcl_D);
-  
+
   //
   // Extract submatrices using the ranges in ViennaCL
   //
@@ -109,22 +113,22 @@ int main (int argc, const char * argv[])
   viennacl::range vcl_r2(dim_large - dim_small, dim_large); //the last 'dim_small' entries
   viennacl::matrix_range<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_r1, vcl_r1); //upper left part of A
   viennacl::matrix_range<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_r2, vcl_r2); //lower right part of A
-  
+
   viennacl::matrix_range<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_r1, vcl_r1); //upper left part of C
   viennacl::matrix_range<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_r1, vcl_r1); //upper left part of D
 
   //
   // Copy from ublas to submatrices and back:
   //
-  
+
   ublas_A_sub1 = ublas_B;
   viennacl::copy(ublas_B, vcl_A_sub1);
   viennacl::copy(vcl_A_sub1, ublas_B);
-  
+
   //
   // Addition:
   //
-  
+
   // range to range:
   ublas_A_sub2 += ublas_A_sub2;
   vcl_A_sub2 += vcl_A_sub2;
@@ -133,7 +137,7 @@ int main (int argc, const char * argv[])
   ublas_B += ublas_A_sub2;
   vcl_B += vcl_A_sub2;
 
-  
+
   //
   // use matrix range with matrix-matrix product:
   //
@@ -145,7 +149,7 @@ int main (int argc, const char * argv[])
   //
   std::cout << "Result ublas:    " << ublas_A << std::endl;
   std::cout << "Result ViennaCL: " << vcl_A << std::endl;
-  
+
   //
   //  That's it.
   //
diff --git a/examples/tutorial/mtl4-with-viennacl.cpp b/examples/tutorial/mtl4-with-viennacl.cpp
index 12f4ad1..df78bed 100644
--- a/examples/tutorial/mtl4-with-viennacl.cpp
+++ b/examples/tutorial/mtl4-with-viennacl.cpp
@@ -1,7 +1,8 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
@@ -14,6 +15,12 @@
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial:  Shows how to exchange data between ViennaCL and MTL4 (http://www.mtl4.org/) objects.
+*   
+*/
+
 //
 // include necessary system headers
 //
@@ -28,7 +35,7 @@
 //
 // Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on MTL4 objects
 //
-#define VIENNACL_HAVE_MTL4 1
+#define VIENNACL_WITH_MTL4 1
 
 //#define VIENNACL_BUILD_INFO
 //#define VIENNACL_DEBUG_ALL
@@ -164,7 +171,9 @@ int main(int, char *[])
   std::cout << "----------------------------------------------" << std::endl;
   run_test<float>();
   
+#ifdef VIENNACL_HAVE_OPENCL   
   if( viennacl::ocl::current_device().double_support() )
+#endif
   {
     std::cout << "----------------------------------------------" << std::endl;
     std::cout << "## Double precision" << std::endl;
diff --git a/examples/tutorial/multithreaded.cpp b/examples/tutorial/multithreaded.cpp
new file mode 100644
index 0000000..8a293d0
--- /dev/null
+++ b/examples/tutorial/multithreaded.cpp
@@ -0,0 +1,127 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Using ViennaCL with multiple threads, one thread per GPU
+*
+*/
+
+#ifndef VIENNACL_WITH_OPENCL
+  #define VIENNACL_WITH_OPENCL
+#endif
+
+// include necessary system headers
+#include <iostream>
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/norm_2.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+#include <boost/thread.hpp>
+
+
+template <typename NumericT>
+class worker
+{
+public:
+  worker(std::size_t tid) : thread_id_(tid) {}
+
+  void operator()()
+  {
+    std::size_t N = 6;
+
+    viennacl::context ctx(viennacl::ocl::get_context(static_cast<long>(thread_id_)));
+    viennacl::vector<NumericT> u = viennacl::scalar_vector<NumericT>(N, NumericT(1) * NumericT(thread_id_ + 1), ctx);
+    viennacl::vector<NumericT> v = viennacl::scalar_vector<NumericT>(N, NumericT(2) * NumericT(thread_id_ + 1), ctx);
+    viennacl::matrix<NumericT> A = viennacl::linalg::outer_prod(u, v);
+    viennacl::vector<NumericT> x(u);
+
+    u += v;
+    NumericT result = viennacl::linalg::norm_2(u);
+
+    std::stringstream ss;
+    ss << "Result of thread " << thread_id_ << " on device " << viennacl::ocl::get_context(static_cast<long>(thread_id_)).devices()[0].name() << ": " << result << std::endl;
+    ss << "  A: " << A << std::endl;
+    ss << "  x: " << x << std::endl;
+    message_ = ss.str();
+  }
+
+  std::string message() const { return message_; }
+
+private:
+  std::string message_;
+  std::size_t thread_id_;
+};
+
+int main()
+{
+  //Change this type definition to double if your gpu supports that
+  typedef float       ScalarType;
+
+  if (viennacl::ocl::get_platforms().size() == 0)
+  {
+    std::cerr << "Error: No platform found!" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  //
+  // Part 1: Setup first device for first context, second device for second context:
+  //
+  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+  std::vector<viennacl::ocl::device> const & devices = pf.devices();
+
+  // Set first device to first context:
+  viennacl::ocl::setup_context(0, devices[0]);
+
+  // Set second device for second context (use the same device for the second context if only one device available):
+  if (devices.size() > 1)
+    viennacl::ocl::setup_context(1, devices[1]);
+  else
+    viennacl::ocl::setup_context(1, devices[0]);
+
+  //
+  // Part 2: Now let two threads operate on two GPUs in parallel
+  //
+
+  worker<ScalarType> work_functor0(0);
+  worker<ScalarType> work_functor1(1);
+  boost::thread worker_thread_0(boost::ref(work_functor0));
+  boost::thread worker_thread_1(boost::ref(work_functor1));
+
+  worker_thread_0.join();
+  worker_thread_1.join();
+
+  std::cout << work_functor0.message() << std::endl;
+  std::cout << work_functor1.message() << std::endl;
+
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/multithreaded_cg.cpp b/examples/tutorial/multithreaded_cg.cpp
new file mode 100644
index 0000000..ab43e3b
--- /dev/null
+++ b/examples/tutorial/multithreaded_cg.cpp
@@ -0,0 +1,185 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Using ViennaCL with multiple threads for a conjugate gradient solver, one thread per GPU
+*
+*/
+
+#ifndef VIENNACL_WITH_OPENCL
+  #define VIENNACL_WITH_OPENCL
+#endif
+
+// include necessary system headers
+#include <iostream>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/io/matrix_market.hpp"
+
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/cg.hpp"
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+using namespace boost::numeric;
+
+#include <boost/thread.hpp>
+
+template <typename NumericT>
+class worker
+{
+public:
+  worker(std::size_t tid) : thread_id_(tid) {}
+
+  void operator()()
+  {
+    //
+    // Set up some ublas objects
+    //
+    ublas::vector<NumericT> rhs;
+    ublas::vector<NumericT> ref_result;
+    ublas::compressed_matrix<NumericT> ublas_matrix;
+
+    //
+    // Read system from file
+    //
+    if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
+    {
+      std::cout << "Error reading Matrix file" << std::endl;
+      return;
+    }
+
+    if (!readVectorFromFile("../examples/testdata/rhs65025.txt", rhs))
+    {
+      std::cout << "Error reading RHS file" << std::endl;
+      return;
+    }
+
+    if (!readVectorFromFile("../examples/testdata/result65025.txt", ref_result))
+    {
+      std::cout << "Error reading Result file" << std::endl;
+      return;
+    }
+
+    //
+    // Set up some ViennaCL objects in the respective context
+    //
+    viennacl::context ctx(viennacl::ocl::get_context(static_cast<long>(thread_id_)));
+
+    std::size_t vcl_size = rhs.size();
+    viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(ctx);
+    viennacl::vector<NumericT> vcl_rhs(vcl_size, ctx);
+    viennacl::vector<NumericT> vcl_ref_result(vcl_size, ctx);
+
+    viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+    viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
+
+
+    //
+    // Transfer ublas-matrix to GPU:
+    //
+    viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+
+    viennacl::vector<NumericT> vcl_result = viennacl::linalg::solve(vcl_compressed_matrix, vcl_rhs, viennacl::linalg::cg_tag());
+
+    std::stringstream ss;
+    ss << "Result of thread " << thread_id_ << " on device " << viennacl::ocl::get_context(static_cast<long>(thread_id_)).devices()[0].name() << ": " << vcl_result[0] << ", should: " << ref_result[0] << std::endl;
+    message_ = ss.str();
+  }
+
+  std::string message() const { return message_; }
+
+private:
+  std::string message_;
+  std::size_t thread_id_;
+};
+
+
+int main()
+{
+  //Change this type definition to double if your gpu supports that
+  typedef float       ScalarType;
+
+  if (viennacl::ocl::get_platforms().size() == 0)
+  {
+    std::cerr << "Error: No platform found!" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  //
+  // Part 1: Setup first device for first context, second device for second context:
+  //
+  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+  std::vector<viennacl::ocl::device> const & devices = pf.devices();
+
+  // Set first device to first context:
+  viennacl::ocl::setup_context(0, devices[0]);
+
+  // Set second device for second context (use the same device for the second context if only one device available):
+  if (devices.size() > 1)
+    viennacl::ocl::setup_context(1, devices[1]);
+  else
+    viennacl::ocl::setup_context(1, devices[0]);
+
+  //
+  // Part 2: Now let two threads operate on two GPUs in parallel
+  //
+
+  worker<ScalarType> work_functor0(0);
+  worker<ScalarType> work_functor1(1);
+  boost::thread worker_thread_0(boost::ref(work_functor0));
+  boost::thread worker_thread_1(boost::ref(work_functor1));
+
+  worker_thread_0.join();
+  worker_thread_1.join();
+
+  std::cout << work_functor0.message() << std::endl;
+  std::cout << work_functor1.message() << std::endl;
+
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/power-iter.cpp b/examples/tutorial/power-iter.cpp
new file mode 100644
index 0000000..3028ca7
--- /dev/null
+++ b/examples/tutorial/power-iter.cpp
@@ -0,0 +1,79 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Calculation of the eigenvalue with largest modulus using the power iteration method
+*             (power-iter.cpp and power-iter.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+// include necessary system headers
+#include <iostream>
+
+#ifndef NDEBUG
+  #define NDEBUG
+#endif
+
+#define VIENNACL_WITH_UBLAS
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+
+#include "viennacl/linalg/power_iter.hpp"
+#include "viennacl/io/matrix_market.hpp"
+// Some helper functions for this tutorial:
+#include <iostream>
+#include <fstream>
+#include <limits>
+#include <string>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/matrix_expression.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/vector_expression.hpp>
+
+
+
+int main()
+{
+  typedef double     ScalarType;
+
+  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
+
+  if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+
+  viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());
+  viennacl::copy(ublas_A, vcl_A);
+
+  viennacl::linalg::power_iter_tag ptag(1e-6);
+
+  std::cout << "Starting computation of eigenvalue with largest modulus (might take about a minute)..." << std::endl;
+  std::cout << "Result of power iteration with ublas matrix (single-threaded): " << viennacl::linalg::eig(ublas_A, ptag) << std::endl;
+  std::cout << "Result of power iteration with ViennaCL (OpenCL accelerated): " << viennacl::linalg::eig(vcl_A, ptag) << std::endl;
+
+}
+
diff --git a/examples/tutorial/power-iter.cu b/examples/tutorial/power-iter.cu
new file mode 100644
index 0000000..3028ca7
--- /dev/null
+++ b/examples/tutorial/power-iter.cu
@@ -0,0 +1,79 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Calculation of the eigenvalue with largest modulus using the power iteration method
+*             (power-iter.cpp and power-iter.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+// include necessary system headers
+#include <iostream>
+
+#ifndef NDEBUG
+  #define NDEBUG
+#endif
+
+#define VIENNACL_WITH_UBLAS
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+
+#include "viennacl/linalg/power_iter.hpp"
+#include "viennacl/io/matrix_market.hpp"
+// Some helper functions for this tutorial:
+#include <iostream>
+#include <fstream>
+#include <limits>
+#include <string>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/matrix_expression.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/vector_expression.hpp>
+
+
+
+int main()
+{
+  typedef double     ScalarType;
+
+  boost::numeric::ublas::compressed_matrix<ScalarType> ublas_A;
+
+  if (!viennacl::io::read_matrix_market_file(ublas_A, "../examples/testdata/mat65k.mtx"))
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return 0;
+  }
+
+  viennacl::compressed_matrix<double>  vcl_A(ublas_A.size1(), ublas_A.size2());
+  viennacl::copy(ublas_A, vcl_A);
+
+  viennacl::linalg::power_iter_tag ptag(1e-6);
+
+  std::cout << "Starting computation of eigenvalue with largest modulus (might take about a minute)..." << std::endl;
+  std::cout << "Result of power iteration with ublas matrix (single-threaded): " << viennacl::linalg::eig(ublas_A, ptag) << std::endl;
+  std::cout << "Result of power iteration with ViennaCL (OpenCL accelerated): " << viennacl::linalg::eig(vcl_A, ptag) << std::endl;
+
+}
+
diff --git a/examples/tutorial/qr.cpp b/examples/tutorial/qr.cpp
index a355809..7b7bc77 100644
--- a/examples/tutorial/qr.cpp
+++ b/examples/tutorial/qr.cpp
@@ -1,21 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+
+/*
+*
+*   Tutorial: QR factorization of matrices from ViennaCL or Boost.uBLAS (qr.cpp and qr.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
 // activate ublas support in ViennaCL
-#define VIENNACL_HAVE_UBLAS 
+#define VIENNACL_WITH_UBLAS
 
 //
 // include necessary system headers
@@ -59,12 +67,12 @@ double check(MatrixType const & qr, MatrixType const & ref)
       if (qr(i,j) != 0.0 && ref(i,j) != 0.0)
       {
         double rel_err = fabs(qr(i,j) - ref(i,j)) / fabs(ref(i,j) );
-        
+
         if (rel_err > max_error)
           max_error = rel_err;
       }
-      
-      
+
+
       if (qr(i,j) != qr(i,j))
       {
         std::cout << "!!!" << std::endl;
@@ -81,108 +89,87 @@ double check(MatrixType const & qr, MatrixType const & ref)
 }
 
 
-/*
-*   Tutorial: QR factorization of matrices from ViennaCL or Boost.uBLAS
-*/
-
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
-  typedef float               ScalarType;     //feel free to change this to 'double' if supported by your hardware
+  typedef double               ScalarType;     //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::matrix<ScalarType>        MatrixType;
-  typedef boost::numeric::ublas::vector<ScalarType>        VectorType;
   typedef viennacl::matrix<ScalarType, viennacl::column_major>        VCLMatrixType;
   typedef viennacl::vector<ScalarType>        VCLVectorType;
 
-  std::size_t rows = 36;   //number of rows in the matrix
-  std::size_t cols = 48;   //number of columns
-  
+  std::size_t rows = 113;   //number of rows in the matrix
+  std::size_t cols = 54;   //number of columns
+
   //
   // Create matrices with some data
   //
   MatrixType ublas_A(rows, cols);
   MatrixType Q(rows, rows);
   MatrixType R(rows, cols);
-  
+
   // Some random data with a bit of extra weight on the diagonal
   for (std::size_t i=0; i<rows; ++i)
   {
     for (std::size_t j=0; j<cols; ++j)
     {
-      ublas_A(i,j) = -1.0 + (i + 1)*(j+1)
+      ublas_A(i,j) = -1.0 + (i+1)*(j+1)
                      + ( (rand() % 1000) - 500.0) / 1000.0;
 
       if (i == j)
         ublas_A(i,j) += 10.0;
-                     
+
       R(i,j) = 0.0;
     }
-    
+
     for (std::size_t j=0; j<rows; ++j)
       Q(i,j) = 0.0;
   }
-  
+
   // keep initial input matrix for comparison
   MatrixType ublas_A_backup(ublas_A);
-  
-  
+
+
   //
   // Setup the matrix in ViennaCL:
   //
   VCLVectorType dummy(10);
   VCLMatrixType vcl_A(ublas_A.size1(), ublas_A.size2());
-  
+
   viennacl::copy(ublas_A, vcl_A);
-  
+
   //
   // Compute QR factorization of A. A is overwritten with Householder vectors. Coefficients are returned and a block size of 3 is used.
   // Note that at the moment the number of columns of A must be divisible by the block size
   //
 
   std::cout << "--- Boost.uBLAS ---" << std::endl;
-  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A, 12);  //computes the QR factorization
-  
+  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A);  //computes the QR factorization
+
   //
   // A check for the correct result:
   //
-  viennacl::linalg::recoverQ(ublas_A, ublas_betas, Q, R); 
+  viennacl::linalg::recoverQ(ublas_A, ublas_betas, Q, R);
   MatrixType ublas_QR = prod(Q, R);
   double ublas_error = check(ublas_QR, ublas_A_backup);
   std::cout << "Max rel error (ublas): " << ublas_error << std::endl;
-  
-  //
-  // QR factorization in ViennaCL using OpenCL only
-  //
-  std::cout << "--- ViennaCL only ---" << std::endl;
-  std::vector<ScalarType> viennacl_betas = viennacl::linalg::inplace_qr_viennacl(vcl_A, 12); //this is a OpenCL-only implementation
-  viennacl::copy(vcl_A, ublas_A);
-  
-  //
-  // A check for the correct result:
-  //
-  Q.clear(); R.clear();
-  viennacl::linalg::recoverQ(ublas_A, viennacl_betas, Q, R); 
-  double vcl_error = check(ublas_QR, ublas_A_backup);
-  std::cout << "Max rel error (ViennaCL): " << vcl_error << std::endl;
 
-  
   //
   // QR factorization in ViennaCL using Boost.uBLAS for the panel factorization
   //
   std::cout << "--- Hybrid (default) ---" << std::endl;
   viennacl::copy(ublas_A_backup, vcl_A);
-  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A, 12);
-  
+  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A);
+
 
   //
   // A check for the correct result:
   //
   viennacl::copy(vcl_A, ublas_A);
   Q.clear(); R.clear();
-  viennacl::linalg::recoverQ(ublas_A, hybrid_betas, Q, R); 
+  viennacl::linalg::recoverQ(ublas_A, hybrid_betas, Q, R);
   double hybrid_error = check(ublas_QR, ublas_A_backup);
   std::cout << "Max rel error (hybrid): " << hybrid_error << std::endl;
 
-  
+
   //
   //  That's it.
   //
diff --git a/examples/tutorial/qr.cpp b/examples/tutorial/qr.cu
similarity index 76%
copy from examples/tutorial/qr.cpp
copy to examples/tutorial/qr.cu
index a355809..7b7bc77 100644
--- a/examples/tutorial/qr.cpp
+++ b/examples/tutorial/qr.cu
@@ -1,21 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+
+/*
+*
+*   Tutorial: QR factorization of matrices from ViennaCL or Boost.uBLAS (qr.cpp and qr.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
 // activate ublas support in ViennaCL
-#define VIENNACL_HAVE_UBLAS 
+#define VIENNACL_WITH_UBLAS
 
 //
 // include necessary system headers
@@ -59,12 +67,12 @@ double check(MatrixType const & qr, MatrixType const & ref)
       if (qr(i,j) != 0.0 && ref(i,j) != 0.0)
       {
         double rel_err = fabs(qr(i,j) - ref(i,j)) / fabs(ref(i,j) );
-        
+
         if (rel_err > max_error)
           max_error = rel_err;
       }
-      
-      
+
+
       if (qr(i,j) != qr(i,j))
       {
         std::cout << "!!!" << std::endl;
@@ -81,108 +89,87 @@ double check(MatrixType const & qr, MatrixType const & ref)
 }
 
 
-/*
-*   Tutorial: QR factorization of matrices from ViennaCL or Boost.uBLAS
-*/
-
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
-  typedef float               ScalarType;     //feel free to change this to 'double' if supported by your hardware
+  typedef double               ScalarType;     //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::matrix<ScalarType>        MatrixType;
-  typedef boost::numeric::ublas::vector<ScalarType>        VectorType;
   typedef viennacl::matrix<ScalarType, viennacl::column_major>        VCLMatrixType;
   typedef viennacl::vector<ScalarType>        VCLVectorType;
 
-  std::size_t rows = 36;   //number of rows in the matrix
-  std::size_t cols = 48;   //number of columns
-  
+  std::size_t rows = 113;   //number of rows in the matrix
+  std::size_t cols = 54;   //number of columns
+
   //
   // Create matrices with some data
   //
   MatrixType ublas_A(rows, cols);
   MatrixType Q(rows, rows);
   MatrixType R(rows, cols);
-  
+
   // Some random data with a bit of extra weight on the diagonal
   for (std::size_t i=0; i<rows; ++i)
   {
     for (std::size_t j=0; j<cols; ++j)
     {
-      ublas_A(i,j) = -1.0 + (i + 1)*(j+1)
+      ublas_A(i,j) = -1.0 + (i+1)*(j+1)
                      + ( (rand() % 1000) - 500.0) / 1000.0;
 
       if (i == j)
         ublas_A(i,j) += 10.0;
-                     
+
       R(i,j) = 0.0;
     }
-    
+
     for (std::size_t j=0; j<rows; ++j)
       Q(i,j) = 0.0;
   }
-  
+
   // keep initial input matrix for comparison
   MatrixType ublas_A_backup(ublas_A);
-  
-  
+
+
   //
   // Setup the matrix in ViennaCL:
   //
   VCLVectorType dummy(10);
   VCLMatrixType vcl_A(ublas_A.size1(), ublas_A.size2());
-  
+
   viennacl::copy(ublas_A, vcl_A);
-  
+
   //
   // Compute QR factorization of A. A is overwritten with Householder vectors. Coefficients are returned and a block size of 3 is used.
   // Note that at the moment the number of columns of A must be divisible by the block size
   //
 
   std::cout << "--- Boost.uBLAS ---" << std::endl;
-  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A, 12);  //computes the QR factorization
-  
+  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A);  //computes the QR factorization
+
   //
   // A check for the correct result:
   //
-  viennacl::linalg::recoverQ(ublas_A, ublas_betas, Q, R); 
+  viennacl::linalg::recoverQ(ublas_A, ublas_betas, Q, R);
   MatrixType ublas_QR = prod(Q, R);
   double ublas_error = check(ublas_QR, ublas_A_backup);
   std::cout << "Max rel error (ublas): " << ublas_error << std::endl;
-  
-  //
-  // QR factorization in ViennaCL using OpenCL only
-  //
-  std::cout << "--- ViennaCL only ---" << std::endl;
-  std::vector<ScalarType> viennacl_betas = viennacl::linalg::inplace_qr_viennacl(vcl_A, 12); //this is a OpenCL-only implementation
-  viennacl::copy(vcl_A, ublas_A);
-  
-  //
-  // A check for the correct result:
-  //
-  Q.clear(); R.clear();
-  viennacl::linalg::recoverQ(ublas_A, viennacl_betas, Q, R); 
-  double vcl_error = check(ublas_QR, ublas_A_backup);
-  std::cout << "Max rel error (ViennaCL): " << vcl_error << std::endl;
 
-  
   //
   // QR factorization in ViennaCL using Boost.uBLAS for the panel factorization
   //
   std::cout << "--- Hybrid (default) ---" << std::endl;
   viennacl::copy(ublas_A_backup, vcl_A);
-  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A, 12);
-  
+  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A);
+
 
   //
   // A check for the correct result:
   //
   viennacl::copy(vcl_A, ublas_A);
   Q.clear(); R.clear();
-  viennacl::linalg::recoverQ(ublas_A, hybrid_betas, Q, R); 
+  viennacl::linalg::recoverQ(ublas_A, hybrid_betas, Q, R);
   double hybrid_error = check(ublas_QR, ublas_A_backup);
   std::cout << "Max rel error (hybrid): " << hybrid_error << std::endl;
 
-  
+
   //
   //  That's it.
   //
diff --git a/examples/tutorial/rand.cpp b/examples/tutorial/rand.cpp
new file mode 100644
index 0000000..ee350ff
--- /dev/null
+++ b/examples/tutorial/rand.cpp
@@ -0,0 +1,70 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Dumps random values into the supplied vector/matrix
+*
+*
+*/
+
+#define VIENNACL_DEBUG_BUILD
+
+// include necessary system headers
+#include <iostream>
+#include "viennacl/rand/gaussian.hpp"
+#include "viennacl/rand/uniform.hpp"
+#include "viennacl/matrix.hpp"
+
+
+int main(){
+
+  typedef float NumericT;
+
+  static const unsigned int size1 = 8;
+  static const unsigned int size2 = 9;
+
+  static const float sigma = 0.8;
+  static const float mu = 1.4;
+
+  static const float a = 0;
+  static const float b = 4;
+
+  //Dumps size1xsize2 observations of a N(mu,sigma) random variable.
+  viennacl::matrix<NumericT> mat = viennacl::random_matrix<NumericT>(size1, size2, viennacl::rand::gaussian_tag(mu,sigma));
+  std::cout << "------------------" << std::endl;
+  std::cout << "Dump Gaussian(" << mu << "," << sigma << ") Observations into matrix : " << std::endl;
+  std::cout << "------------------" << std::endl;
+  std::cout << mat << std::endl;
+
+  std::cout << std::endl;
+
+  //Dumps size1 observations of a U(a,b) random variable.
+  viennacl::vector<NumericT> vec = viennacl::random_vector<NumericT>(size1,viennacl::rand::uniform_tag(a,b));
+  std::cout << "------------------" << std::endl;
+  std::cout << "Uniform(" << a << "," << b << ") Observations into vector : " << std::endl;
+  std::cout << "------------------" << std::endl;
+  std::cout << vec << std::endl;
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl;
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+
+}
diff --git a/examples/tutorial/scheduler.cpp b/examples/tutorial/scheduler.cpp
new file mode 100644
index 0000000..afc6e38
--- /dev/null
+++ b/examples/tutorial/scheduler.cpp
@@ -0,0 +1,130 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial: Show how to use the scheduler
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/scheduler/execute.hpp"
+#include "viennacl/scheduler/io.hpp"
+
+int main()
+{
+  typedef float       ScalarType;
+
+  viennacl::vector<ScalarType> vcl_vec1(10);
+  viennacl::vector<ScalarType> vcl_vec2(10);
+  viennacl::vector<ScalarType> vcl_vec3(10);
+
+  //
+  // Let us fill the CPU vectors with random values:
+  // (random<> is a helper function from Random.hpp)
+  //
+
+  for (unsigned int i = 0; i < 10; ++i)
+  {
+    vcl_vec1[i] = ScalarType(i);
+    vcl_vec2[i] = ScalarType(10 - i);
+  }
+
+  //
+  // Build expression graph for the operation vcl_vec3 = vcl_vec1 + vcl_vec2
+  //
+  // This requires the following expression graph:
+  //
+  //             ( = )
+  //            /      |
+  //    vcl_vec3      ( + )
+  //                 /     |
+  //           vcl_vec1    vcl_vec2
+  //
+  // One expression node consists of two leaves and the operation connecting the two.
+  // Here we thus need two nodes: One for {vcl_vec3, = , link}, where 'link' points to the second node
+  // {vcl_vec1, +, vcl_vec2}.
+  //
+  // The following is the lowest level on which one could build the expression tree.
+  // Even for a C API one would introduce some additional convenience layer such as add_vector_float_to_lhs(...); etc.
+  //
+  typedef viennacl::scheduler::statement::container_type   NodeContainerType;   // this is just std::vector<viennacl::scheduler::statement_node>
+  NodeContainerType expression_nodes(2);                                        //container with two nodes
+
+  ////// First node //////
+
+  // specify LHS of first node, i.e. vcl_vec3:
+  expression_nodes[0].lhs.type_family  = viennacl::scheduler::VECTOR_TYPE_FAMILY;   // family of vectors
+  expression_nodes[0].lhs.subtype      = viennacl::scheduler::DENSE_VECTOR_TYPE;    // a dense vector
+  expression_nodes[0].lhs.numeric_type = viennacl::scheduler::FLOAT_TYPE;           // vector consisting of floats
+  expression_nodes[0].lhs.vector_float = &vcl_vec3;                                 // provide pointer to vcl_vec3;
+
+  // specify assignment operation for this node:
+  expression_nodes[0].op.type_family   = viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY; // this is a binary operation, so both LHS and RHS operands are important
+  expression_nodes[0].op.type          = viennacl::scheduler::OPERATION_BINARY_ASSIGN_TYPE; // assignment operation: '='
+
+  // specify RHS: Just refer to the second node:
+  expression_nodes[0].rhs.type_family  = viennacl::scheduler::COMPOSITE_OPERATION_FAMILY; // this links to another node (no need to set .subtype and .numeric_type)
+  expression_nodes[0].rhs.node_index   = 1;                                               // index of the other node
+
+  ////// Second node //////
+
+  // LHS
+  expression_nodes[1].lhs.type_family  = viennacl::scheduler::VECTOR_TYPE_FAMILY;   // family of vectors
+  expression_nodes[1].lhs.subtype      = viennacl::scheduler::DENSE_VECTOR_TYPE;    // a dense vector
+  expression_nodes[1].lhs.numeric_type = viennacl::scheduler::FLOAT_TYPE;           // vector consisting of floats
+  expression_nodes[1].lhs.vector_float = &vcl_vec1;                                 // provide pointer to vcl_vec1
+
+  // OP
+  expression_nodes[1].op.type_family   = viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY; // this is a binary operation, so both LHS and RHS operands are important
+  expression_nodes[1].op.type          = viennacl::scheduler::OPERATION_BINARY_ADD_TYPE;    // addition operation: '+'
+
+  // RHS
+  expression_nodes[1].rhs.type_family  = viennacl::scheduler::VECTOR_TYPE_FAMILY;  // family of vectors
+  expression_nodes[1].rhs.subtype      = viennacl::scheduler::DENSE_VECTOR_TYPE;   // a dense vector
+  expression_nodes[1].rhs.numeric_type = viennacl::scheduler::FLOAT_TYPE;          // vector consisting of floats
+  expression_nodes[1].rhs.vector_float = &vcl_vec2;                                // provide pointer to vcl_vec2
+
+
+  // create the full statement (aka. single line of code such as vcl_vec3 = vcl_vec1 + vcl_vec2):
+  viennacl::scheduler::statement vec_addition(expression_nodes);
+
+  // print it
+  std::cout << vec_addition << std::endl;
+
+  // run it
+  viennacl::scheduler::execute(vec_addition);
+
+  // print vectors
+  std::cout << "vcl_vec1: " << vcl_vec1 << std::endl;
+  std::cout << "vcl_vec2: " << vcl_vec2 << std::endl;
+  std::cout << "vcl_vec3: " << vcl_vec3 << std::endl;
+
+
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/spai.cpp b/examples/tutorial/spai.cpp
index 3bdd395..0545e85 100644
--- a/examples/tutorial/spai.cpp
+++ b/examples/tutorial/spai.cpp
@@ -1,27 +1,32 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: Sparse approximate inverse preconditioner (only available with the OpenCL backend, experimental)
+*
+*/
 
-//
-#define VIENNACL_HAVE_UBLAS
+// enable Boost.uBLAS support
+#define VIENNACL_WITH_UBLAS
 
 #ifndef NDEBUG
  #define NDEBUG
 #endif
-//#define VIENNACL_BUILD_INFO
 
 #include <utility>
 #include <iostream>
@@ -30,9 +35,7 @@
 #include <cmath>
 #include <algorithm>
 #include <stdio.h>
-#include <sys/time.h>
 #include <time.h>
-//#include <omp.h>
 #include "viennacl/scalar.hpp"
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
@@ -44,15 +47,9 @@
 #include "viennacl/linalg/norm_2.hpp"
 #include "viennacl/io/matrix_market.hpp"
 #include "viennacl/linalg/spai.hpp"
-//#include "qr.hpp"
-//#include "spai-static.hpp"
-//#include "viennacl/linalg/fspai.hpp"
 #include "boost/numeric/ublas/vector.hpp"
 #include "boost/numeric/ublas/matrix.hpp"
 #include "boost/numeric/ublas/io.hpp"
-#include "boost/foreach.hpp"
-#include "boost/tokenizer.hpp"
-//#include "viennacl/linalg/detail/spai/small_matrix.hpp"
 
 #include "vector-io.hpp"
 
@@ -61,49 +58,66 @@ void run_solver(MatrixType const & A, VectorType const & b, SolverTag const & so
 {
     VectorType result = viennacl::linalg::solve(A, b, solver_tag, precond);
     std::cout << " * Solver iterations: " << solver_tag.iters() << std::endl;
-    VectorType residual = viennacl::linalg::prod(A, result) - b;
+    VectorType residual = viennacl::linalg::prod(A, result);
+    residual -= b;
     std::cout << " * Rel. Residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(b) << std::endl;
 }
 
 
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
     typedef float               ScalarType;
     typedef boost::numeric::ublas::compressed_matrix<ScalarType>        MatrixType;
     typedef boost::numeric::ublas::vector<ScalarType>                   VectorType;
     typedef viennacl::compressed_matrix<ScalarType>                     GPUMatrixType;
     typedef viennacl::vector<ScalarType>                                GPUVectorType;
-  
+
+#ifdef VIENNACL_WITH_OPENCL
+  // Optional: Customize OpenCL backend
+  viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0];
+  std::vector<viennacl::ocl::device> const & devices = pf.devices();
+
+  // Optional: Set first device to first context:
+  viennacl::ocl::setup_context(0, devices[0]);
+
+  // Optional: Set second device for second context (use the same device for the second context if only one device available):
+  if (devices.size() > 1)
+    viennacl::ocl::setup_context(1, devices[1]);
+  else
+    viennacl::ocl::setup_context(1, devices[0]);
+
+  std::cout << viennacl::ocl::current_device().info() << std::endl;
+  viennacl::context ctx(viennacl::ocl::get_context(1));
+#else
+  viennacl::context ctx;
+#endif
+
     MatrixType M;
 
     //
     // Read system matrix from file
     //
-    #ifdef _MSC_VER
-    if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
-    #else
     if (!viennacl::io::read_matrix_market_file(M, "../examples/testdata/mat65k.mtx"))
-    #endif
     {
       std::cerr<<"ERROR: Could not read matrix file " << std::endl;
       exit(EXIT_FAILURE);
     }
-    
+
     std::cout << "Size of matrix: " << M.size1() << std::endl;
     std::cout << "Avg. Entries per row: " << M.nnz() / static_cast<double>(M.size1()) << std::endl;
-    
+
     //
     // Use uniform load vector:
     //
     VectorType rhs(M.size2());
-    for (size_t i=0; i<rhs.size(); ++i)
+    for (std::size_t i=0; i<rhs.size(); ++i)
       rhs(i) = 1;
 
-    GPUMatrixType  gpu_M(M.size1(), M.size2());
-    GPUVectorType  gpu_rhs(M.size1());
+    GPUMatrixType  gpu_M(M.size1(), M.size2(), ctx);
+    GPUVectorType  gpu_rhs(M.size1(), ctx);
     viennacl::copy(M, gpu_M);
     viennacl::copy(rhs, gpu_rhs);
-    
+
     ///////////////////////////////// Tests to follow /////////////////////////////
 
     viennacl::linalg::bicgstab_tag solver_tag(1e-10, 50); //for simplicity and reasonably short execution times we use only 50 iterations here
@@ -120,10 +134,11 @@ int main (int argc, const char * argv[])
     std::cout << "--- Reference 2: Pure BiCGStab on GPU ---" << std::endl;
     GPUVectorType gpu_result = viennacl::linalg::solve(gpu_M, gpu_rhs, solver_tag);
     std::cout << " * Solver iterations: " << solver_tag.iters() << std::endl;
-    GPUVectorType gpu_residual = viennacl::linalg::prod(gpu_M, gpu_result) - gpu_rhs;
+    GPUVectorType gpu_residual = viennacl::linalg::prod(gpu_M, gpu_result);
+    gpu_residual -= gpu_rhs;
     std::cout << " * Rel. Residual: " << viennacl::linalg::norm_2(gpu_residual) / viennacl::linalg::norm_2(gpu_rhs) << std::endl;
-    
-    
+
+
     //
     // Reference: ILUT preconditioner:
     //
@@ -132,35 +147,44 @@ int main (int argc, const char * argv[])
     viennacl::linalg::ilut_precond<MatrixType> ilut(M, viennacl::linalg::ilut_tag());
     std::cout << " * Iterative solver run..." << std::endl;
     run_solver(M, rhs, solver_tag, ilut);
-    
-    
+
+
     //
     // Test 1: SPAI with CPU:
     //
-    std::cout << "--- Test 1: CPU-based SPAI ---" << std::endl;  
+    std::cout << "--- Test 1: CPU-based SPAI ---" << std::endl;
     std::cout << " * Preconditioner setup..." << std::endl;
     viennacl::linalg::spai_precond<MatrixType> spai_cpu(M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
     std::cout << " * Iterative solver run..." << std::endl;
     run_solver(M, rhs, solver_tag, spai_cpu);
-    
+
     //
     // Test 2: FSPAI with CPU:
-    //      
-    std::cout << "--- Test 2: CPU-based FSPAI ---" << std::endl;  
+    //
+    std::cout << "--- Test 2: CPU-based FSPAI ---" << std::endl;
     std::cout << " * Preconditioner setup..." << std::endl;
     viennacl::linalg::fspai_precond<MatrixType> fspai_cpu(M, viennacl::linalg::fspai_tag());
     std::cout << " * Iterative solver run..." << std::endl;
     run_solver(M, rhs, solver_tag, fspai_cpu);
-    
+
     //
     // Test 3: SPAI with GPU:
-    //      
-    std::cout << "--- Test 3: GPU-based SPAI ---" << std::endl;  
+    //
+    std::cout << "--- Test 3: GPU-based SPAI ---" << std::endl;
     std::cout << " * Preconditioner setup..." << std::endl;
     viennacl::linalg::spai_precond<GPUMatrixType> spai_gpu(gpu_M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2));
     std::cout << " * Iterative solver run..." << std::endl;
     run_solver(gpu_M, gpu_rhs, solver_tag, spai_gpu);
-    
+
+    //
+    // Test 4: FSPAI with GPU:
+    //
+    std::cout << "--- Test 4: GPU-based FSPAI ---" << std::endl;
+    std::cout << " * Preconditioner setup..." << std::endl;
+    viennacl::linalg::fspai_precond<GPUMatrixType> fspai_gpu(gpu_M, viennacl::linalg::fspai_tag());
+    std::cout << " * Iterative solver run..." << std::endl;
+    run_solver(gpu_M, gpu_rhs, solver_tag, fspai_gpu);
+
     return EXIT_SUCCESS;
 }
 
diff --git a/examples/tutorial/sparse.cpp b/examples/tutorial/sparse.cpp
index 5994e19..5329d1f 100644
--- a/examples/tutorial/sparse.cpp
+++ b/examples/tutorial/sparse.cpp
@@ -1,114 +1,121 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-
-/*
-*
-*   Tutorial:  Modification of sparse matrices
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  std::size_t size = 5;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs(size, size);
-  ublas::compressed_matrix<ScalarType> ublas_matrix(size, size);
-  
-  ublas_matrix(0,0) =  2.0; ublas_matrix(0,1) = -1.0;
-  ublas_matrix(1,0) = -1.0; ublas_matrix(1,1) =  2.0; ublas_matrix(1,2) = -1.0;
-  ublas_matrix(2,1) = -1.0; ublas_matrix(2,2) =  2.0; ublas_matrix(2,3) = -1.0;
-  ublas_matrix(3,2) = -1.0; ublas_matrix(3,3) =  2.0; ublas_matrix(3,4) = -1.0;
-  ublas_matrix(4,3) = -1.0; ublas_matrix(4,4) =  2.0;
-
-  //
-  // Set up some ViennaCL objects
-  //
-  viennacl::vector<ScalarType> vcl_rhs(size); 
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(size, size);
-
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  
-  std::cout << "ublas: " << ublas_matrix << std::endl;
-
-  std::cout << "Modifying vcl_compressed_matrix a bit: " << std::endl;
-  vcl_compressed_matrix(0, 0) = 3.0;
-  vcl_compressed_matrix(2, 3) = -3.0;
-  vcl_compressed_matrix(4, 2) = -3.0;  //this is a new nonzero entry
-  vcl_compressed_matrix(4, 3) = -3.0;
-  
-  ublas::compressed_matrix<ScalarType> temp(size, size);
-  viennacl::copy(vcl_compressed_matrix, temp);
-  std::cout << "ViennaCL: " << temp << std::endl;
-  
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Handling sparse matrices (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+
+using namespace boost::numeric;
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  std::size_t size = 5;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs = ublas::scalar_vector<ScalarType>(size, ScalarType(size));
+  ublas::compressed_matrix<ScalarType> ublas_matrix(size, size);
+
+  ublas_matrix(0,0) =  2.0f; ublas_matrix(0,1) = -1.0f;
+  ublas_matrix(1,0) = -1.0f; ublas_matrix(1,1) =  2.0f; ublas_matrix(1,2) = -1.0f;
+  ublas_matrix(2,1) = -1.0f; ublas_matrix(2,2) =  2.0f; ublas_matrix(2,3) = -1.0f;
+  ublas_matrix(3,2) = -1.0f; ublas_matrix(3,3) =  2.0f; ublas_matrix(3,4) = -1.0f;
+  ublas_matrix(4,3) = -1.0f; ublas_matrix(4,4) =  2.0f;
+
+  std::cout << "ublas: " << ublas_matrix << std::endl;
+
+  //
+  // Set up some ViennaCL objects
+  //
+  viennacl::vector<ScalarType> vcl_rhs(size);
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(size, size);
+
+  viennacl::copy(rhs, vcl_rhs);
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+
+  // just get the data directly from the GPU and print it:
+  ublas::compressed_matrix<ScalarType> temp(size, size);
+  viennacl::copy(vcl_compressed_matrix, temp);
+  std::cout << "ViennaCL: " << temp << std::endl;
+
+  // now modify GPU data directly:
+  std::cout << "Modifying vcl_compressed_matrix a bit: " << std::endl;
+  vcl_compressed_matrix(0, 0) =  3.0f;
+  vcl_compressed_matrix(2, 3) = -3.0f;
+  vcl_compressed_matrix(4, 2) = -3.0f;  //this is a new nonzero entry
+  vcl_compressed_matrix(4, 3) = -3.0f;
+
+  // and print it again:
+  viennacl::copy(vcl_compressed_matrix, temp);
+  std::cout << "ViennaCL: " << temp << std::endl;
+
+  // compute matrix-vector products:
+  std::cout << "ublas: " << ublas::prod(temp, rhs) << std::endl;
+  std::cout << "ViennaCL: " << viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) << std::endl;
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/sparse.cpp b/examples/tutorial/sparse.cu
similarity index 59%
copy from examples/tutorial/sparse.cpp
copy to examples/tutorial/sparse.cu
index 5994e19..5329d1f 100644
--- a/examples/tutorial/sparse.cpp
+++ b/examples/tutorial/sparse.cu
@@ -1,114 +1,121 @@
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// include necessary system headers
-//
-#include <iostream>
-
-
-//
-// ublas includes
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/operation.hpp>
-#include <boost/numeric/ublas/operation_sparse.hpp>
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-
-// Must be set if you want to use ViennaCL algorithms on ublas objects
-#define VIENNACL_HAVE_UBLAS 1
-
-
-//
-// ViennaCL includes
-//
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/bicgstab.hpp"
-#include "viennacl/linalg/gmres.hpp"
-#include "viennacl/io/matrix_market.hpp"
-
-
-// Some helper functions for this tutorial:
-#include "Random.hpp"
-#include "vector-io.hpp"
-
-
-/*
-*
-*   Tutorial:  Modification of sparse matrices
-*   
-*/
-using namespace boost::numeric;
-
-
-int main()
-{
-  typedef float       ScalarType;
-  
-  std::size_t size = 5;
-  
-  //
-  // Set up some ublas objects
-  //
-  ublas::vector<ScalarType> rhs(size, size);
-  ublas::compressed_matrix<ScalarType> ublas_matrix(size, size);
-  
-  ublas_matrix(0,0) =  2.0; ublas_matrix(0,1) = -1.0;
-  ublas_matrix(1,0) = -1.0; ublas_matrix(1,1) =  2.0; ublas_matrix(1,2) = -1.0;
-  ublas_matrix(2,1) = -1.0; ublas_matrix(2,2) =  2.0; ublas_matrix(2,3) = -1.0;
-  ublas_matrix(3,2) = -1.0; ublas_matrix(3,3) =  2.0; ublas_matrix(3,4) = -1.0;
-  ublas_matrix(4,3) = -1.0; ublas_matrix(4,4) =  2.0;
-
-  //
-  // Set up some ViennaCL objects
-  //
-  viennacl::vector<ScalarType> vcl_rhs(size); 
-  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(size, size);
-
-  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
-  
-  std::cout << "ublas: " << ublas_matrix << std::endl;
-
-  std::cout << "Modifying vcl_compressed_matrix a bit: " << std::endl;
-  vcl_compressed_matrix(0, 0) = 3.0;
-  vcl_compressed_matrix(2, 3) = -3.0;
-  vcl_compressed_matrix(4, 2) = -3.0;  //this is a new nonzero entry
-  vcl_compressed_matrix(4, 3) = -3.0;
-  
-  ublas::compressed_matrix<ScalarType> temp(size, size);
-  viennacl::copy(vcl_compressed_matrix, temp);
-  std::cout << "ViennaCL: " << temp << std::endl;
-  
-  //
-  //  That's it.
-  //
-  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
-  
-  return 0;
-}
-
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Handling sparse matrices (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+
+using namespace boost::numeric;
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  std::size_t size = 5;
+
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs = ublas::scalar_vector<ScalarType>(size, ScalarType(size));
+  ublas::compressed_matrix<ScalarType> ublas_matrix(size, size);
+
+  ublas_matrix(0,0) =  2.0f; ublas_matrix(0,1) = -1.0f;
+  ublas_matrix(1,0) = -1.0f; ublas_matrix(1,1) =  2.0f; ublas_matrix(1,2) = -1.0f;
+  ublas_matrix(2,1) = -1.0f; ublas_matrix(2,2) =  2.0f; ublas_matrix(2,3) = -1.0f;
+  ublas_matrix(3,2) = -1.0f; ublas_matrix(3,3) =  2.0f; ublas_matrix(3,4) = -1.0f;
+  ublas_matrix(4,3) = -1.0f; ublas_matrix(4,4) =  2.0f;
+
+  std::cout << "ublas: " << ublas_matrix << std::endl;
+
+  //
+  // Set up some ViennaCL objects
+  //
+  viennacl::vector<ScalarType> vcl_rhs(size);
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(size, size);
+
+  viennacl::copy(rhs, vcl_rhs);
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+
+  // just get the data directly from the GPU and print it:
+  ublas::compressed_matrix<ScalarType> temp(size, size);
+  viennacl::copy(vcl_compressed_matrix, temp);
+  std::cout << "ViennaCL: " << temp << std::endl;
+
+  // now modify GPU data directly:
+  std::cout << "Modifying vcl_compressed_matrix a bit: " << std::endl;
+  vcl_compressed_matrix(0, 0) =  3.0f;
+  vcl_compressed_matrix(2, 3) = -3.0f;
+  vcl_compressed_matrix(4, 2) = -3.0f;  //this is a new nonzero entry
+  vcl_compressed_matrix(4, 3) = -3.0f;
+
+  // and print it again:
+  viennacl::copy(vcl_compressed_matrix, temp);
+  std::cout << "ViennaCL: " << temp << std::endl;
+
+  // compute matrix-vector products:
+  std::cout << "ublas: " << ublas::prod(temp, rhs) << std::endl;
+  std::cout << "ViennaCL: " << viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) << std::endl;
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/structured-matrices.cpp b/examples/tutorial/structured-matrices.cpp
index a31d982..5c235eb 100644
--- a/examples/tutorial/structured-matrices.cpp
+++ b/examples/tutorial/structured-matrices.cpp
@@ -1,19 +1,26 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial:  Handling structured dense matrices (experimental, only available with OpenCL backend)
+*
+*/
+
 // include necessary system headers
 #include <iostream>
 
@@ -34,16 +41,13 @@
 #include "boost/numeric/ublas/matrix_proxy.hpp"
 #include "boost/numeric/ublas/io.hpp"
 
-/*
-*   Tutorial: Structured matrices
-*/
 
-int main() 
+int main()
 {
   typedef float      ScalarType;
-    
+
   std::size_t size = 4;
-  
+
   //
   // Set up ublas objects
   //
@@ -53,8 +57,8 @@ int main()
   boost::numeric::ublas::matrix<ScalarType> ublas_toeplitz(size, size);
   boost::numeric::ublas::matrix<ScalarType> ublas_vandermonde(size, size);
 
-  for(std::size_t i = 0; i < size; i++) 
-    for(std::size_t j = 0; j < size; j++) 
+  for(std::size_t i = 0; i < size; i++)
+    for(std::size_t j = 0; j < size; j++)
     {
       ublas_circulant(i,j)   = static_cast<ScalarType>((i - j + size) % size);
       ublas_hankel(i,j)      = static_cast<ScalarType>((i + j) % (2 * size));
@@ -77,32 +81,32 @@ int main()
   viennacl::copy(ublas_hankel, vcl_hankel);
   viennacl::copy(ublas_toeplitz, vcl_toeplitz);
   viennacl::copy(ublas_vandermonde, vcl_vandermonde);
-  
+
   // fill vectors:
   for(std::size_t i = 0; i < size; i++)
   {
     ublas_vec[i] = ScalarType(i);
     vcl_vec[i] = ScalarType(i);
   }
-  
+
   //
   // Add matrices:
   //
   std::cout << "Circulant matrix before addition: " << vcl_circulant << std::endl << std::endl;
   vcl_circulant += vcl_circulant;
   std::cout << "Circulant matrix after addition: " << vcl_circulant << std::endl << std::endl;
-  
+
   //
   // Manipulate single entry
   //
   std::cout << "Hankel matrix before manipulation: " << vcl_hankel << std::endl << std::endl;
   vcl_hankel(1, 2) = ScalarType(3.14);
   std::cout << "Hankel matrix after manipulation: " << vcl_hankel << std::endl << std::endl;
-  
+
   std::cout << "Vandermonde matrix before manipulation: " << vcl_vandermonde << std::endl << std::endl;
   vcl_vandermonde(1) = ScalarType(1.1); //NOTE: Write access only via row index
   std::cout << "Vandermonde matrix after manipulation: " << vcl_vandermonde << std::endl << std::endl;
-  
+
   //
   // Compute matrix-vector product (FFT-accelerated)
   //
@@ -110,7 +114,7 @@ int main()
   std::cout << "Vector: " << vcl_vec << std::endl << std::endl;
   vcl_result = viennacl::linalg::prod(vcl_toeplitz, vcl_vec);
   std::cout << "Result of matrix-vector product: " << vcl_result << std::endl << std::endl;
-  
+
   //
   //  That's it.
   //
diff --git a/examples/tutorial/vector-io.hpp b/examples/tutorial/vector-io.hpp
index e7b9940..85a0659 100644
--- a/examples/tutorial/vector-io.hpp
+++ b/examples/tutorial/vector-io.hpp
@@ -1,172 +1,173 @@
-#ifndef VECTOR_IO_HPP_
-#define VECTOR_IO_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#include <string>
-#include <iostream>
-#include <fstream>
-
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/result_of.hpp"
-#include "viennacl/traits/size.hpp"
-
-
-template <typename MatrixType, typename ScalarType>
-void insert(MatrixType & matrix, long row, long col, ScalarType value)
-{
-  matrix(row, col) = value; 
-}
-
-#ifdef VIENNACL_HAVE_EIGEN
-template <typename ScalarType, int option>
-void insert(Eigen::SparseMatrix<ScalarType, option> & matrix, long row, long col, double value)
-{
-  matrix.fill(row, col) = value; 
-}
-#endif
-
-template <typename MatrixType>
-class my_inserter
-{
-  public:
-    my_inserter(MatrixType & mat) : mat_(mat) {} 
-    
-    void apply(long row, long col, double value)
-    {
-      insert(mat_, row, col, value);
-    }
-    
-  private:
-    MatrixType & mat_;
-};
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/matrix/inserter.hpp>
-/*template <typename ScalarType>
-void insert(mtl::compressed2D<ScalarType> & matrix, long row, long col, ScalarType value)
-{
-  typedef mtl::compressed2D<ScalarType>   MatrixType;
-  mtl::matrix::inserter<MatrixType>      ins(matrix);
-  
-  typename mtl::Collection<MatrixType>::value_type val(value);
-  ins(row, col) << val;
-  //matrix.fill(row, col) = val; 
-}*/
-
-template <typename ScalarType>
-void resize_vector(mtl::dense_vector<ScalarType> & vec, unsigned int size)
-{
-  vec.change_dim(size); 
-}
-
-template <typename ScalarType>
-class my_inserter<mtl::compressed2D<ScalarType> >
-{
-    typedef mtl::compressed2D<ScalarType>    MatrixType;
-  public:
-    my_inserter(MatrixType & mat) : mat_(mat), ins_(mat) {} 
-    
-    void apply(long row, long col, ScalarType value)
-    {
-      typename mtl::Collection<MatrixType>::value_type val(value);
-      ins_(row, col) << val;
-    }
-    
-  private:
-    MatrixType & mat_;
-    mtl::matrix::inserter<MatrixType> ins_;
-};
-#endif
-
-template <typename VectorType>
-void resize_vector(VectorType & vec, unsigned int size)
-{
-  vec.resize(size);
-}
-
-template <typename VectorType>
-bool readVectorFromFile(const std::string & filename,
-                        VectorType & vec)
-{
-  typedef typename viennacl::result_of::value_type<VectorType>::type    ScalarType;
-  
-  std::ifstream file(filename.c_str());
-
-  if (!file) return false;
-
-  unsigned int size;
-  file >> size;
-  
-  resize_vector(vec, size);
-
-  for (unsigned int i = 0; i < size; ++i)
-  {
-    ScalarType element;
-    file >> element;
-    vec[i] = element;
-  }
-
-  return true;
-}
-
-
-template <class MatrixType>
-bool readMatrixFromFile(const std::string & filename, MatrixType & matrix)
-{
-  typedef typename viennacl::result_of::value_type<MatrixType>::type    ScalarType;
-  
-  std::cout << "Reading matrix..." << std::endl;
-  
-  std::ifstream file(filename.c_str());
-
-  if (!file) return false;
-
-  std::string id;
-  file >> id;
-  if (id != "Matrix") return false;
-
-  
-  unsigned int num_rows, num_columns;
-  file >> num_rows >> num_columns;
-  if (num_rows != num_columns) return false;
-  
-  viennacl::traits::resize(matrix, num_rows, num_rows);
-
-  my_inserter<MatrixType> ins(matrix);
-  for (unsigned int row = 0; row < num_rows; ++row)
-  {
-    int num_entries;
-    file >> num_entries;
-    for (int j = 0; j < num_entries; ++j)
-    {
-      unsigned int column;
-      ScalarType element;
-      file >> column >> element;
-
-      ins.apply(row, column, element);
-      //insert(matrix, row, column, element);
-      //note: the obvious 'matrix(row, column) = element;' does not work with Eigen, hence another level of indirection
-    }
-    //std::cout << "reading of row finished" << std::endl;
-  }
-
-  return true;
-}
-
-
-#endif
+#ifndef VECTOR_IO_HPP_
+#define VECTOR_IO_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <string>
+#include <iostream>
+#include <fstream>
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+
+
+template <typename MatrixType, typename ScalarType>
+void insert(MatrixType & matrix, long row, long col, ScalarType value)
+{
+  matrix(row, col) = value;
+}
+
+#ifdef VIENNACL_HAVE_EIGEN
+template <typename ScalarType, int option>
+void insert(Eigen::SparseMatrix<ScalarType, option> & matrix, long row, long col, double value)
+{
+  matrix.fill(row, col) = value;
+}
+#endif
+
+template <typename MatrixType>
+class my_inserter
+{
+  public:
+    my_inserter(MatrixType & mat) : mat_(mat) {}
+
+    void apply(long row, long col, double value)
+    {
+      insert(mat_, row, col, value);
+    }
+
+  private:
+    MatrixType & mat_;
+};
+
+#ifdef VIENNACL_HAVE_MTL4
+#include <boost/numeric/mtl/matrix/inserter.hpp>
+/*template <typename ScalarType>
+void insert(mtl::compressed2D<ScalarType> & matrix, long row, long col, ScalarType value)
+{
+  typedef mtl::compressed2D<ScalarType>   MatrixType;
+  mtl::matrix::inserter<MatrixType>      ins(matrix);
+
+  typename mtl::Collection<MatrixType>::value_type val(value);
+  ins(row, col) << val;
+  //matrix.fill(row, col) = val;
+}*/
+
+template <typename ScalarType>
+void resize_vector(mtl::dense_vector<ScalarType> & vec, unsigned int size)
+{
+  vec.change_dim(size);
+}
+
+template <typename ScalarType>
+class my_inserter<mtl::compressed2D<ScalarType> >
+{
+    typedef mtl::compressed2D<ScalarType>    MatrixType;
+  public:
+    my_inserter(MatrixType & mat) : mat_(mat), ins_(mat) {}
+
+    void apply(long row, long col, ScalarType value)
+    {
+      typename mtl::Collection<MatrixType>::value_type val(value);
+      ins_(row, col) << val;
+    }
+
+  private:
+    MatrixType & mat_;
+    mtl::matrix::inserter<MatrixType> ins_;
+};
+#endif
+
+template <typename VectorType>
+void resize_vector(VectorType & vec, unsigned int size)
+{
+  vec.resize(size);
+}
+
+template <typename VectorType>
+bool readVectorFromFile(const std::string & filename,
+                        VectorType & vec)
+{
+  typedef typename viennacl::result_of::value_type<VectorType>::type    ScalarType;
+
+  std::ifstream file(filename.c_str());
+
+  if (!file) return false;
+
+  unsigned int size;
+  file >> size;
+
+  resize_vector(vec, size);
+
+  for (unsigned int i = 0; i < size; ++i)
+  {
+    ScalarType element;
+    file >> element;
+    vec[i] = element;
+  }
+
+  return true;
+}
+
+
+template <class MatrixType>
+bool readMatrixFromFile(const std::string & filename, MatrixType & matrix)
+{
+  typedef typename viennacl::result_of::value_type<MatrixType>::type    ScalarType;
+
+  std::cout << "Reading matrix..." << std::endl;
+
+  std::ifstream file(filename.c_str());
+
+  if (!file) return false;
+
+  std::string id;
+  file >> id;
+  if (id != "Matrix") return false;
+
+
+  unsigned int num_rows, num_columns;
+  file >> num_rows >> num_columns;
+  if (num_rows != num_columns) return false;
+
+  viennacl::traits::resize(matrix, num_rows, num_rows);
+
+  my_inserter<MatrixType> ins(matrix);
+  for (unsigned int row = 0; row < num_rows; ++row)
+  {
+    int num_entries;
+    file >> num_entries;
+    for (int j = 0; j < num_entries; ++j)
+    {
+      unsigned int column;
+      ScalarType element;
+      file >> column >> element;
+
+      ins.apply(row, column, element);
+      //insert(matrix, row, column, element);
+      //note: the obvious 'matrix(row, column) = element;' does not work with Eigen, hence another level of indirection
+    }
+    //std::cout << "reading of row finished" << std::endl;
+  }
+
+  return true;
+}
+
+
+#endif
diff --git a/examples/tutorial/vector-range.cpp b/examples/tutorial/vector-range.cpp
index 2214960..161ab55 100644
--- a/examples/tutorial/vector-range.cpp
+++ b/examples/tutorial/vector-range.cpp
@@ -1,21 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: Explains the use of vector ranges with simple BLAS level 1 and 2 operations.
+*             (vector-range.cpp and vector-range.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
 // activate ublas support in ViennaCL
-#define VIENNACL_HAVE_UBLAS
+#define VIENNACL_WITH_UBLAS
 
 //
 // include necessary system headers
@@ -38,33 +46,30 @@
 #include "boost/numeric/ublas/vector_proxy.hpp"
 #include "boost/numeric/ublas/io.hpp"
 
-/*
-*   Tutorial: Use of matrix ranges
-*/
 
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
   typedef float                                           ScalarType;    //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-  
+
   typedef viennacl::vector<ScalarType>                    VCLVectorType;
-  
+
   std::size_t dim_large = 7;
   std::size_t dim_small = 3;
-  
+
   //
   // Setup ublas objects and fill with data:
   //
   VectorType ublas_v1(dim_large);
   VectorType ublas_v2(dim_small);
-  
+
   for (std::size_t i=0; i<ublas_v1.size(); ++i)
-    ublas_v1(i) = i+1;
+    ublas_v1(i) = ScalarType(i+1);
 
   for (std::size_t i=0; i<ublas_v2.size(); ++i)
-    ublas_v2(i) = dim_large + i;
-    
-  
+    ublas_v2(i) = ScalarType(dim_large + i);
+
+
   //
   // Extract submatrices using the ranges in ublas
   //
@@ -74,18 +79,18 @@ int main (int argc, const char * argv[])
   boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub1(ublas_v1, ublas_r1); // front part of vector v_1
   boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub2(ublas_v1, ublas_r2); // center part of vector v_1
   boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub3(ublas_v1, ublas_r3); // tail of vector v_1
-  
+
 
   //
   // Setup ViennaCL objects
   //
   VCLVectorType vcl_v1(dim_large);
   VCLVectorType vcl_v2(dim_small);
-  
+
   viennacl::copy(ublas_v1, vcl_v1);
   viennacl::copy(ublas_v2, vcl_v2);
-    
-  
+
+
   //
   // Extract submatrices using the ranges in ViennaCL
   //
@@ -96,19 +101,19 @@ int main (int argc, const char * argv[])
   viennacl::vector_range<VCLVectorType>   vcl_v1_sub2(vcl_v1, vcl_r2); // center part of vector v_1
   viennacl::vector_range<VCLVectorType>   vcl_v1_sub3(vcl_v1, vcl_r3); // tail of vector v_1
 
-  
+
   //
   // Copy from ublas to submatrices and back:
   //
-  
+
   ublas_v1_sub1 = ublas_v2;
   viennacl::copy(ublas_v2, vcl_v1_sub1);
-  viennacl::copy(vcl_v1_sub1, ublas_v2);  
-  
+  viennacl::copy(vcl_v1_sub1, ublas_v2);
+
   //
   // Addition:
   //
-  
+
   ublas_v1_sub1 += ublas_v1_sub1;
   vcl_v1_sub1 += vcl_v1_sub1;
 
@@ -117,7 +122,7 @@ int main (int argc, const char * argv[])
 
   ublas_v1_sub3 += ublas_v1_sub3;
   vcl_v1_sub3 += vcl_v1_sub3;
-  
+
   //
   // print vectors:
   //
diff --git a/examples/tutorial/vector-range.cpp b/examples/tutorial/vector-range.cu
similarity index 87%
copy from examples/tutorial/vector-range.cpp
copy to examples/tutorial/vector-range.cu
index 2214960..161ab55 100644
--- a/examples/tutorial/vector-range.cpp
+++ b/examples/tutorial/vector-range.cu
@@ -1,21 +1,29 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: Explains the use of vector ranges with simple BLAS level 1 and 2 operations.
+*             (vector-range.cpp and vector-range.cu are identical, the latter being required for compilation using CUDA nvcc)
+*
+*/
+
 // activate ublas support in ViennaCL
-#define VIENNACL_HAVE_UBLAS
+#define VIENNACL_WITH_UBLAS
 
 //
 // include necessary system headers
@@ -38,33 +46,30 @@
 #include "boost/numeric/ublas/vector_proxy.hpp"
 #include "boost/numeric/ublas/io.hpp"
 
-/*
-*   Tutorial: Use of matrix ranges
-*/
 
-int main (int argc, const char * argv[])
+int main (int, const char **)
 {
   typedef float                                           ScalarType;    //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-  
+
   typedef viennacl::vector<ScalarType>                    VCLVectorType;
-  
+
   std::size_t dim_large = 7;
   std::size_t dim_small = 3;
-  
+
   //
   // Setup ublas objects and fill with data:
   //
   VectorType ublas_v1(dim_large);
   VectorType ublas_v2(dim_small);
-  
+
   for (std::size_t i=0; i<ublas_v1.size(); ++i)
-    ublas_v1(i) = i+1;
+    ublas_v1(i) = ScalarType(i+1);
 
   for (std::size_t i=0; i<ublas_v2.size(); ++i)
-    ublas_v2(i) = dim_large + i;
-    
-  
+    ublas_v2(i) = ScalarType(dim_large + i);
+
+
   //
   // Extract submatrices using the ranges in ublas
   //
@@ -74,18 +79,18 @@ int main (int argc, const char * argv[])
   boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub1(ublas_v1, ublas_r1); // front part of vector v_1
   boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub2(ublas_v1, ublas_r2); // center part of vector v_1
   boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub3(ublas_v1, ublas_r3); // tail of vector v_1
-  
+
 
   //
   // Setup ViennaCL objects
   //
   VCLVectorType vcl_v1(dim_large);
   VCLVectorType vcl_v2(dim_small);
-  
+
   viennacl::copy(ublas_v1, vcl_v1);
   viennacl::copy(ublas_v2, vcl_v2);
-    
-  
+
+
   //
   // Extract submatrices using the ranges in ViennaCL
   //
@@ -96,19 +101,19 @@ int main (int argc, const char * argv[])
   viennacl::vector_range<VCLVectorType>   vcl_v1_sub2(vcl_v1, vcl_r2); // center part of vector v_1
   viennacl::vector_range<VCLVectorType>   vcl_v1_sub3(vcl_v1, vcl_r3); // tail of vector v_1
 
-  
+
   //
   // Copy from ublas to submatrices and back:
   //
-  
+
   ublas_v1_sub1 = ublas_v2;
   viennacl::copy(ublas_v2, vcl_v1_sub1);
-  viennacl::copy(vcl_v1_sub1, ublas_v2);  
-  
+  viennacl::copy(vcl_v1_sub1, ublas_v2);
+
   //
   // Addition:
   //
-  
+
   ublas_v1_sub1 += ublas_v1_sub1;
   vcl_v1_sub1 += vcl_v1_sub1;
 
@@ -117,7 +122,7 @@ int main (int argc, const char * argv[])
 
   ublas_v1_sub3 += ublas_v1_sub3;
   vcl_v1_sub3 += vcl_v1_sub3;
-  
+
   //
   // print vectors:
   //
diff --git a/examples/tutorial/viennacl-info.cpp b/examples/tutorial/viennacl-info.cpp
index dc01c32..62d9cca 100644
--- a/examples/tutorial/viennacl-info.cpp
+++ b/examples/tutorial/viennacl-info.cpp
@@ -1,21 +1,30 @@
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
+/*
+*
+*   Tutorial: Prints information about the OpenCL backend. Requires compilation with VIENNACL_WITH_OPENCL being defined.
+*
+*
+*/
+
 // include necessary system headers
 #include <iostream>
+#include <cstdlib>
 
 //include ViennaCL headers
 #include "viennacl/ocl/device.hpp"
@@ -27,42 +36,55 @@ int main()
    //
    //  retrieve the devices
    //
-   viennacl::ocl::platform pf;
-   typedef std::vector<viennacl::ocl::device> devices_type;
-   devices_type devices = pf.devices(CL_DEVICE_TYPE_ALL);
-   
-   //
-   // print some platform info
-   //
-   std::cout << "#" << std::endl;
-   std::cout << "# ViennaCL uses the OpenCL platform from " << pf.info() << " on this machine." << std::endl;
-   std::cout << "#" << std::endl;
-   
-   //
-   //  traverse the devices and print the information
-   //
-   std::cout << "# Available Devices: " << std::endl;
-   std::cout << "# =========================================" << std::endl;
-   for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+   typedef std::vector< viennacl::ocl::platform > platforms_type;
+   platforms_type platforms = viennacl::ocl::get_platforms();
+
+   bool is_first_element = true;
+   for (platforms_type::iterator platform_iter  = platforms.begin();
+                                 platform_iter != platforms.end();
+                               ++platform_iter)
    {
-      std::cout << std::endl;
-
-      std::cout << "  -----------------------------------------" << std::endl;
-      std::cout << "  No.:              " << std::distance(devices.begin(), iter) << std::endl;
-      std::cout << "  Name:             " << iter->name() << std::endl;
-      std::cout << "  Compute Units:    " << iter->max_compute_units() << std::endl;
-      std::cout << "  Workgroup Size:   " << iter->max_workgroup_size() << std::endl;
-      std::cout << "  Global Memory:    " << iter->global_memory()/(1024*1024) << " MB" << std::endl;
-      std::cout << "  Local Memory:     " << iter->local_memory()/1024 << " KB" << std::endl;
-      std::cout << "  Max-alloc Memory: " << iter->max_allocable_memory()/(1024*1024) << " MB" << std::endl;
-      std::cout << "  Double Support:   " << iter->double_support() << std::endl;
-      std::cout << "  Driver Version:   " << iter->driver_version() << std::endl;
-      std::cout << "  -----------------------------------------" << std::endl;
+    typedef std::vector<viennacl::ocl::device> devices_type;
+    devices_type devices = platform_iter->devices(CL_DEVICE_TYPE_ALL);
+
+    //
+    // print some platform info
+    //
+    std::cout << "# =========================================" << std::endl;
+    std::cout << "#         Platform Information             " << std::endl;
+    std::cout << "# =========================================" << std::endl;
+
+    std::cout << "#" << std::endl;
+    std::cout << "# Vendor and version: " << platform_iter->info() << std::endl;
+    std::cout << "#" << std::endl;
+
+    if (is_first_element)
+    {
+      std::cout << "# ViennaCL uses this OpenCL platform by default." << std::endl;
+      is_first_element = false;
+    }
+
+
+    //
+    //  traverse the devices and print the information
+    //
+    std::cout << "# " << std::endl;
+    std::cout << "# Available Devices: " << std::endl;
+    std::cout << "# " << std::endl;
+    for(devices_type::iterator iter = devices.begin(); iter != devices.end(); iter++)
+    {
+        std::cout << std::endl;
+
+        std::cout << "  -----------------------------------------" << std::endl;
+        std::cout << iter->full_info();
+        std::cout << "  -----------------------------------------" << std::endl;
+    }
+    std::cout << std::endl;
+    std::cout << "###########################################" << std::endl;
+    std::cout << std::endl;
    }
-   std::cout << std::endl;
-   std::cout << "# =========================================" << std::endl;
 
-   return 0;
+   return EXIT_SUCCESS;
 }
 
 
diff --git a/examples/tutorial/wrap-cuda-buffer.cu b/examples/tutorial/wrap-cuda-buffer.cu
new file mode 100644
index 0000000..37ee382
--- /dev/null
+++ b/examples/tutorial/wrap-cuda-buffer.cu
@@ -0,0 +1,121 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Use ViennaCL with user-provided CUDA buffers
+*
+*/
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <cstdlib>
+#include <string>
+
+#include <cuda.h>
+
+//
+// ViennaCL includes
+//
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+//
+//  A simple CUDA kernel for the vector operation x += y
+//
+template <typename T>
+__global__ void my_inplace_add_kernel(T * vec1, T * vec2, unsigned int size)
+{
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size;
+                      i += gridDim.x * blockDim.x)
+      vec1[i] += vec2[i];
+}
+
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Part 1: Allocate some CUDA memory
+  //
+  std::size_t size = 10;
+  ScalarType *cuda_x;
+  ScalarType *cuda_y;
+
+  cudaMalloc(&cuda_x, size * sizeof(ScalarType));
+  cudaMalloc(&cuda_y, size * sizeof(ScalarType));
+
+  // Initialize with data
+  std::vector<ScalarType> host_x(size, 1.0);
+  std::vector<ScalarType> host_y(size, 2.0);
+
+  cudaMemcpy(cuda_x, &(host_x[0]), size * sizeof(ScalarType), cudaMemcpyHostToDevice);
+  cudaMemcpy(cuda_y, &(host_y[0]), size * sizeof(ScalarType), cudaMemcpyHostToDevice);
+
+  // run kernel
+  my_inplace_add_kernel<<<128, 128>>>(cuda_x, cuda_y, static_cast<unsigned int>(size));
+
+  // copy result back
+  std::vector<ScalarType> result_cuda(size);
+  cudaMemcpy(&(result_cuda[0]), cuda_x, size * sizeof(ScalarType), cudaMemcpyDeviceToHost);
+
+  std::cout << "Result with CUDA (native): ";
+  for (std::size_t i=0; i<size; ++i)
+    std::cout << result_cuda[i] << " ";
+  std::cout << std::endl;
+
+  //
+  // Part 2: Now do the same within ViennaCL
+  //
+
+  // wrap the existing CUDA buffers inside ViennaCL vectors
+  viennacl::vector<ScalarType> vcl_vec1(cuda_x, viennacl::CUDA_MEMORY, size); // Second parameter specifies that this is CUDA memory rather than host memory
+  viennacl::vector<ScalarType> vcl_vec2(cuda_y, viennacl::CUDA_MEMORY, size); // Second parameter specifies that this is CUDA memory rather than host memory
+
+  // reset values to 0 and 1, respectively
+  vcl_vec1 = viennacl::scalar_vector<ScalarType>(size, ScalarType(1.0));
+  vcl_vec2 = viennacl::scalar_vector<ScalarType>(size, ScalarType(2.0));
+
+  vcl_vec1 += vcl_vec2;
+
+  std::cout << "Result with ViennaCL: " << vcl_vec1 << std::endl;
+
+  // ViennaCL does not automatically free your buffers (you're still the owner), so don't forget to clean up :-)
+  cudaFree(cuda_x);
+  cudaFree(cuda_y);
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/examples/tutorial/wrap-host-buffer.cpp b/examples/tutorial/wrap-host-buffer.cpp
new file mode 100644
index 0000000..0f68bab
--- /dev/null
+++ b/examples/tutorial/wrap-host-buffer.cpp
@@ -0,0 +1,86 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+*
+*   Tutorial:  Use ViennaCL within user-provided memory buffers on the host
+*
+*/
+
+
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <cstdlib>
+#include <string>
+
+//
+// ViennaCL includes
+//
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+
+int main()
+{
+  typedef float       ScalarType;
+
+  //
+  // Part 1: Allocate some buffers on the host
+  //
+  std::size_t size = 10;
+
+  std::vector<ScalarType> host_x(size, 1.0);
+  std::vector<ScalarType> host_y(size, 2.0);
+
+  std::cout << "Result on host: ";
+  for (std::size_t i=0; i<size; ++i)
+    std::cout << host_x[i] + host_y[i] << " ";
+  std::cout << std::endl;
+
+  //
+  // Part 2: Now do the same computations within ViennaCL
+  //
+
+  // wrap host buffer within ViennaCL vectors:
+  viennacl::vector<ScalarType> vcl_vec1(&(host_x[0]), viennacl::MAIN_MEMORY, size); // Second parameter specifies that this is host memory rather than CUDA memory
+  viennacl::vector<ScalarType> vcl_vec2(&(host_y[0]), viennacl::MAIN_MEMORY, size); // Second parameter specifies that this is host memory rather than CUDA memory
+
+  // reset values to 0 and 1, respectively
+  vcl_vec1 = viennacl::scalar_vector<ScalarType>(size, ScalarType(1.0));
+  vcl_vec2 = viennacl::scalar_vector<ScalarType>(size, ScalarType(2.0));
+
+  vcl_vec1 += vcl_vec2;
+
+  std::cout << "Result with ViennaCL: " << vcl_vec1 << std::endl;
+
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/external/pugixml/src/pugiconfig.hpp b/external/pugixml/src/pugiconfig.hpp
deleted file mode 100644
index 6b553ae..0000000
--- a/external/pugixml/src/pugiconfig.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * pugixml parser - version 1.0
- * --------------------------------------------------------
- * Copyright (C) 2006-2010, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
- * Report bugs and download new versions at http://pugixml.org/
- *
- * This library is distributed under the MIT License. See notice at the end
- * of this file.
- *
- * This work is based on the pugxml parser, which is:
- * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
- */
-
-#ifndef HEADER_PUGICONFIG_HPP
-#define HEADER_PUGICONFIG_HPP
-
-// Uncomment this to enable wchar_t mode
-// #define PUGIXML_WCHAR_MODE
-
-// Uncomment this to disable XPath
-// #define PUGIXML_NO_XPATH
-
-// Uncomment this to disable STL
-// Note: you can't use XPath with PUGIXML_NO_STL
-// #define PUGIXML_NO_STL
-
-// Uncomment this to disable exceptions
-// Note: you can't use XPath with PUGIXML_NO_EXCEPTIONS
-// #define PUGIXML_NO_EXCEPTIONS
-
-// Set this to control attributes for public classes/functions, i.e.:
-// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
-// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
-// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
-// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
-
-#endif
-
-/**
- * Copyright (c) 2006-2010 Arseny Kapoulkine
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
diff --git a/external/pugixml/src/pugixml.cpp b/external/pugixml/src/pugixml.cpp
deleted file mode 100644
index 9132d8a..0000000
--- a/external/pugixml/src/pugixml.cpp
+++ /dev/null
@@ -1,9576 +0,0 @@
-/**
- * pugixml parser - version 1.0
- * --------------------------------------------------------
- * Copyright (C) 2006-2010, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
- * Report bugs and download new versions at http://pugixml.org/
- *
- * This library is distributed under the MIT License. See notice at the end
- * of this file.
- *
- * This work is based on the pugxml parser, which is:
- * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
- */
-
-#include "pugixml.hpp"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <setjmp.h>
-#include <wchar.h>
-
-#ifndef PUGIXML_NO_XPATH
-#	include <math.h>
-#	include <float.h>
-#endif
-
-#ifndef PUGIXML_NO_STL
-#	include <istream>
-#	include <ostream>
-#	include <string>
-#endif
-
-// For placement new
-#include <new>
-
-#ifdef _MSC_VER
-#	pragma warning(disable: 4127) // conditional expression is constant
-#	pragma warning(disable: 4324) // structure was padded due to __declspec(align())
-#	pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
-#	pragma warning(disable: 4702) // unreachable code
-#	pragma warning(disable: 4996) // this function or variable may be unsafe
-#endif
-
-#ifdef __INTEL_COMPILER
-#	pragma warning(disable: 177) // function was declared but never referenced 
-#	pragma warning(disable: 1478 1786) // function was declared "deprecated"
-#endif
-
-#ifdef __BORLANDC__
-#	pragma warn -8008 // condition is always false
-#	pragma warn -8066 // unreachable code
-#endif
-
-#ifdef __SNC__
-#	pragma diag_suppress=178 // function was declared but never referenced
-#	pragma diag_suppress=237 // controlling expression is constant
-#endif
-
-// uintptr_t
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-#	include <stdint.h>
-#else
-#	if _MSC_VER < 1300
-// No native uintptr_t in MSVC6
-typedef size_t uintptr_t;
-#	endif
-typedef unsigned __int8 uint8_t;
-typedef unsigned __int16 uint16_t;
-typedef unsigned __int32 uint32_t;
-typedef __int32 int32_t;
-#endif
-
-// Inlining controls
-#if defined(_MSC_VER) && _MSC_VER >= 1300
-#	define PUGIXML_NO_INLINE __declspec(noinline)
-#elif defined(__GNUC__)
-#	define PUGIXML_NO_INLINE __attribute__((noinline))
-#else
-#	define PUGIXML_NO_INLINE 
-#endif
-
-// Simple static assertion
-#define STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
-
-// Digital Mars C++ bug workaround for passing char loaded from memory via stack
-#ifdef __DMC__
-#	define DMC_VOLATILE volatile
-#else
-#	define DMC_VOLATILE
-#endif
-
-using namespace pugi;
-
-// Memory allocation
-namespace
-{
-	void* default_allocate(size_t size)
-	{
-		return malloc(size);
-	}
-
-	void default_deallocate(void* ptr)
-	{
-		free(ptr);
-	}
-
-	allocation_function global_allocate = default_allocate;
-	deallocation_function global_deallocate = default_deallocate;
-}
-
-// String utilities
-namespace
-{
-	// Get string length
-	size_t strlength(const char_t* s)
-	{
-		assert(s);
-
-	#ifdef PUGIXML_WCHAR_MODE
-		return wcslen(s);
-	#else
-		return strlen(s);
-	#endif
-	}
-
-	// Compare two strings
-	bool strequal(const char_t* src, const char_t* dst)
-	{
-		assert(src && dst);
-
-	#ifdef PUGIXML_WCHAR_MODE
-		return wcscmp(src, dst) == 0;
-	#else
-		return strcmp(src, dst) == 0;
-	#endif
-	}
-
-	// Compare lhs with [rhs_begin, rhs_end)
-	bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
-	{
-		for (size_t i = 0; i < count; ++i)
-			if (lhs[i] != rhs[i])
-				return false;
-	
-		return lhs[count] == 0;
-	}
-	
-#ifdef PUGIXML_WCHAR_MODE
-	// Convert string to wide string, assuming all symbols are ASCII
-	void widen_ascii(wchar_t* dest, const char* source)
-	{
-		for (const char* i = source; *i; ++i) *dest++ = *i;
-		*dest = 0;
-	}
-#endif
-}
-
-#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
-// auto_ptr-like buffer holder for exception recovery
-namespace
-{
-	struct buffer_holder
-	{
-		void* data;
-		void (*deleter)(void*);
-
-		buffer_holder(void* data, void (*deleter)(void*)): data(data), deleter(deleter)
-		{
-		}
-
-		~buffer_holder()
-		{
-			if (data) deleter(data);
-		}
-
-		void* release()
-		{
-			void* result = data;
-			data = 0;
-			return result;
-		}
-	};
-}
-#endif
-
-namespace
-{
-	static const size_t xml_memory_page_size = 32768;
-
-	static const uintptr_t xml_memory_page_alignment = 32;
-	static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
-	static const uintptr_t xml_memory_page_name_allocated_mask = 16;
-	static const uintptr_t xml_memory_page_value_allocated_mask = 8;
-	static const uintptr_t xml_memory_page_type_mask = 7;
-
-	struct xml_allocator;
-
-	struct xml_memory_page
-	{
-		static xml_memory_page* construct(void* memory)
-		{
-			if (!memory) return 0; //$ redundant, left for performance
-
-			xml_memory_page* result = static_cast<xml_memory_page*>(memory);
-
-			result->allocator = 0;
-			result->memory = 0;
-			result->prev = 0;
-			result->next = 0;
-			result->busy_size = 0;
-			result->freed_size = 0;
-
-			return result;
-		}
-
-		xml_allocator* allocator;
-
-		void* memory;
-
-		xml_memory_page* prev;
-		xml_memory_page* next;
-
-		size_t busy_size;
-		size_t freed_size;
-
-		char data[1];
-	};
-
-	struct xml_memory_string_header
-	{
-		uint16_t page_offset; // offset from page->data
-		uint16_t full_size; // 0 if string occupies whole page
-	};
-
-	struct xml_allocator
-	{
-		xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
-		{
-		}
-
-		xml_memory_page* allocate_page(size_t data_size)
-		{
-			size_t size = offsetof(xml_memory_page, data) + data_size;
-
-			// allocate block with some alignment, leaving memory for worst-case padding
-			void* memory = global_allocate(size + xml_memory_page_alignment);
-			if (!memory) return 0;
-
-			// align upwards to page boundary
-			void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(memory) + (xml_memory_page_alignment - 1)) & ~(xml_memory_page_alignment - 1));
-
-			// prepare page structure
-			xml_memory_page* page = xml_memory_page::construct(page_memory);
-
-			page->memory = memory;
-			page->allocator = _root->allocator;
-
-			return page;
-		}
-
-		static void deallocate_page(xml_memory_page* page)
-		{
-			global_deallocate(page->memory);
-		}
-
-		void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
-
-		void* allocate_memory(size_t size, xml_memory_page*& out_page)
-		{
-			if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
-
-			void* buf = _root->data + _busy_size;
-
-			_busy_size += size;
-
-			out_page = _root;
-
-			return buf;
-		}
-
-		void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
-		{
-			if (page == _root) page->busy_size = _busy_size;
-
-			assert(ptr >= page->data && ptr < page->data + page->busy_size);
-			(void)!ptr;
-
-			page->freed_size += size;
-			assert(page->freed_size <= page->busy_size);
-
-			if (page->freed_size == page->busy_size)
-			{
-				if (page->next == 0)
-				{
-					assert(_root == page);
-
-					// top page freed, just reset sizes
-					page->busy_size = page->freed_size = 0;
-					_busy_size = 0;
-				}
-				else
-				{
-					assert(_root != page);
-					assert(page->prev);
-
-					// remove from the list
-					page->prev->next = page->next;
-					page->next->prev = page->prev;
-
-					// deallocate
-					deallocate_page(page);
-				}
-			}
-		}
-
-		char_t* allocate_string(size_t length)
-		{
-			// allocate memory for string and header block
-			size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
-			
-			// round size up to pointer alignment boundary
-			size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
-
-			xml_memory_page* page;
-			xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
-
-			if (!header) return 0;
-
-			// setup header
-			ptrdiff_t page_offset = reinterpret_cast<char*>(header) - page->data;
-
-			assert(page_offset >= 0 && page_offset < (1 << 16));
-			header->page_offset = static_cast<uint16_t>(page_offset);
-
-			// full_size == 0 for large strings that occupy the whole page
-			assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
-			header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
-
-			return reinterpret_cast<char_t*>(header + 1);
-		}
-
-		void deallocate_string(char_t* string)
-		{
-			// get header
-			xml_memory_string_header* header = reinterpret_cast<xml_memory_string_header*>(string) - 1;
-
-			// deallocate
-			size_t page_offset = offsetof(xml_memory_page, data) + header->page_offset;
-			xml_memory_page* page = reinterpret_cast<xml_memory_page*>(reinterpret_cast<char*>(header) - page_offset);
-
-			// if full_size == 0 then this string occupies the whole page
-			size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
-
-			deallocate_memory(header, full_size, page);
-		}
-
-		xml_memory_page* _root;
-		size_t _busy_size;
-	};
-
-	PUGIXML_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
-	{
-		const size_t large_allocation_threshold = xml_memory_page_size / 4;
-
-		xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
-		if (!page) return 0;
-
-		if (size <= large_allocation_threshold)
-		{
-			_root->busy_size = _busy_size;
-
-			// insert page at the end of linked list
-			page->prev = _root;
-			_root->next = page;
-			_root = page;
-
-			_busy_size = size;
-		}
-		else
-		{
-			// insert page before the end of linked list, so that it is deleted as soon as possible
-			// the last page is not deleted even if it's empty (see deallocate_memory)
-			assert(_root->prev);
-
-			page->prev = _root->prev;
-			page->next = _root;
-
-			_root->prev->next = page;
-			_root->prev = page;
-		}
-
-		// allocate inside page
-		page->busy_size = size;
-
-		out_page = page;
-		return page->data;
-	}
-}
-
-namespace pugi
-{
-	/// A 'name=value' XML attribute structure.
-	struct xml_attribute_struct
-	{
-		/// Default ctor
-		xml_attribute_struct(xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
-		{
-		}
-
-		uintptr_t header;
-
-		char_t* name;	///< Pointer to attribute name.
-		char_t*	value;	///< Pointer to attribute value.
-
-		xml_attribute_struct* prev_attribute_c;	///< Previous attribute (cyclic list)
-		xml_attribute_struct* next_attribute;	///< Next attribute
-	};
-
-	/// An XML document tree node.
-	struct xml_node_struct
-	{
-		/// Default ctor
-		/// \param type - node type
-		xml_node_struct(xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
-		{
-		}
-
-		uintptr_t header;
-
-		xml_node_struct*		parent;					///< Pointer to parent
-
-		char_t*					name;					///< Pointer to element name.
-		char_t*					value;					///< Pointer to any associated string data.
-
-		xml_node_struct*		first_child;			///< First child
-		
-		xml_node_struct*		prev_sibling_c;			///< Left brother (cyclic list)
-		xml_node_struct*		next_sibling;			///< Right brother
-		
-		xml_attribute_struct*	first_attribute;		///< First attribute
-	};
-}
-
-namespace
-{
-	struct xml_document_struct: public xml_node_struct, public xml_allocator
-	{
-		xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0)
-		{
-		}
-
-		const char_t* buffer;
-	};
-
-	static inline xml_allocator& get_allocator(const xml_node_struct* node)
-	{
-		assert(node);
-
-		return *reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask)->allocator;
-	}
-}
-
-// Low-level DOM operations
-namespace
-{
-	inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
-	{
-		xml_memory_page* page;
-		void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
-
-		return new (memory) xml_attribute_struct(page);
-	}
-
-	inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
-	{
-		xml_memory_page* page;
-		void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
-
-		return new (memory) xml_node_struct(page, type);
-	}
-
-	inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
-	{
-		uintptr_t header = a->header;
-
-		if (header & xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
-		if (header & xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
-
-		alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
-	}
-
-	inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
-	{
-		uintptr_t header = n->header;
-
-		if (header & xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
-		if (header & xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
-
-		for (xml_attribute_struct* attr = n->first_attribute; attr; )
-		{
-			xml_attribute_struct* next = attr->next_attribute;
-
-			destroy_attribute(attr, alloc);
-
-			attr = next;
-		}
-
-		for (xml_node_struct* child = n->first_child; child; )
-		{
-			xml_node_struct* next = child->next_sibling;
-
-			destroy_node(child, alloc);
-
-			child = next;
-		}
-
-		alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
-	}
-
-	PUGIXML_NO_INLINE xml_node_struct* append_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
-	{
-		xml_node_struct* child = allocate_node(alloc, type);
-		if (!child) return 0;
-
-		child->parent = node;
-
-		xml_node_struct* first_child = node->first_child;
-			
-		if (first_child)
-		{
-			xml_node_struct* last_child = first_child->prev_sibling_c;
-
-			last_child->next_sibling = child;
-			child->prev_sibling_c = last_child;
-			first_child->prev_sibling_c = child;
-		}
-		else
-		{
-			node->first_child = child;
-			child->prev_sibling_c = child;
-		}
-			
-		return child;
-	}
-
-	PUGIXML_NO_INLINE xml_attribute_struct* append_attribute_ll(xml_node_struct* node, xml_allocator& alloc)
-	{
-		xml_attribute_struct* a = allocate_attribute(alloc);
-		if (!a) return 0;
-
-		xml_attribute_struct* first_attribute = node->first_attribute;
-
-		if (first_attribute)
-		{
-			xml_attribute_struct* last_attribute = first_attribute->prev_attribute_c;
-
-			last_attribute->next_attribute = a;
-			a->prev_attribute_c = last_attribute;
-			first_attribute->prev_attribute_c = a;
-		}
-		else
-		{
-			node->first_attribute = a;
-			a->prev_attribute_c = a;
-		}
-			
-		return a;
-	}
-}
-
-// Helper classes for code generation
-namespace
-{
-	struct opt_false
-	{
-		enum { value = 0 };
-	};
-
-	struct opt_true
-	{
-		enum { value = 1 };
-	};
-}
-
-// Unicode utilities
-namespace
-{
-	inline uint16_t endian_swap(uint16_t value)
-	{
-		return static_cast<uint16_t>(((value & 0xff) << 8) | (value >> 8));
-	}
-
-	inline uint32_t endian_swap(uint32_t value)
-	{
-		return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24);
-	}
-
-	struct utf8_counter
-	{
-		typedef size_t value_type;
-
-		static value_type low(value_type result, uint32_t ch)
-		{
-			// U+0000..U+007F
-			if (ch < 0x80) return result + 1;
-			// U+0080..U+07FF
-			else if (ch < 0x800) return result + 2;
-			// U+0800..U+FFFF
-			else return result + 3;
-		}
-
-		static value_type high(value_type result, uint32_t)
-		{
-			// U+10000..U+10FFFF
-			return result + 4;
-		}
-	};
-
-	struct utf8_writer
-	{
-		typedef uint8_t* value_type;
-
-		static value_type low(value_type result, uint32_t ch)
-		{
-			// U+0000..U+007F
-			if (ch < 0x80)
-			{
-				*result = static_cast<uint8_t>(ch);
-				return result + 1;
-			}
-			// U+0080..U+07FF
-			else if (ch < 0x800)
-			{
-				result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
-				result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
-				return result + 2;
-			}
-			// U+0800..U+FFFF
-			else
-			{
-				result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
-				result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
-				result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
-				return result + 3;
-			}
-		}
-
-		static value_type high(value_type result, uint32_t ch)
-		{
-			// U+10000..U+10FFFF
-			result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
-			result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
-			result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
-			result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
-			return result + 4;
-		}
-
-		static value_type any(value_type result, uint32_t ch)
-		{
-			return (ch < 0x10000) ? low(result, ch) : high(result, ch);
-		}
-	};
-
-	struct utf16_counter
-	{
-		typedef size_t value_type;
-
-		static value_type low(value_type result, uint32_t)
-		{
-			return result + 1;
-		}
-
-		static value_type high(value_type result, uint32_t)
-		{
-			return result + 2;
-		}
-	};
-
-	struct utf16_writer
-	{
-		typedef uint16_t* value_type;
-
-		static value_type low(value_type result, uint32_t ch)
-		{
-			*result = static_cast<uint16_t>(ch);
-
-			return result + 1;
-		}
-
-		static value_type high(value_type result, uint32_t ch)
-		{
-			uint32_t msh = (uint32_t)(ch - 0x10000) >> 10;
-			uint32_t lsh = (uint32_t)(ch - 0x10000) & 0x3ff;
-
-			result[0] = static_cast<uint16_t>(0xD800 + msh);
-			result[1] = static_cast<uint16_t>(0xDC00 + lsh);
-
-			return result + 2;
-		}
-
-		static value_type any(value_type result, uint32_t ch)
-		{
-			return (ch < 0x10000) ? low(result, ch) : high(result, ch);
-		}
-	};
-
-	struct utf32_counter
-	{
-		typedef size_t value_type;
-
-		static value_type low(value_type result, uint32_t)
-		{
-			return result + 1;
-		}
-
-		static value_type high(value_type result, uint32_t)
-		{
-			return result + 1;
-		}
-	};
-
-	struct utf32_writer
-	{
-		typedef uint32_t* value_type;
-
-		static value_type low(value_type result, uint32_t ch)
-		{
-			*result = ch;
-
-			return result + 1;
-		}
-
-		static value_type high(value_type result, uint32_t ch)
-		{
-			*result = ch;
-
-			return result + 1;
-		}
-
-		static value_type any(value_type result, uint32_t ch)
-		{
-			*result = ch;
-
-			return result + 1;
-		}
-	};
-
-	template <size_t size> struct wchar_selector;
-
-	template <> struct wchar_selector<2>
-	{
-		typedef uint16_t type;
-		typedef utf16_counter counter;
-		typedef utf16_writer writer;
-	};
-
-	template <> struct wchar_selector<4>
-	{
-		typedef uint32_t type;
-		typedef utf32_counter counter;
-		typedef utf32_writer writer;
-	};
-
-	typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
-	typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
-
-	template <typename Traits, typename opt_swap = opt_false> struct utf_decoder
-	{
-		static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
-		{
-			const uint8_t utf8_byte_mask = 0x3f;
-
-			while (size)
-			{
-				uint8_t lead = *data;
-
-				// 0xxxxxxx -> U+0000..U+007F
-				if (lead < 0x80)
-				{
-					result = Traits::low(result, lead);
-					data += 1;
-					size -= 1;
-
-					// process aligned single-byte (ascii) blocks
-					if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
-					{
-						while (size >= 4 && (*reinterpret_cast<const uint32_t*>(data) & 0x80808080) == 0)
-						{
-							result = Traits::low(result, data[0]);
-							result = Traits::low(result, data[1]);
-							result = Traits::low(result, data[2]);
-							result = Traits::low(result, data[3]);
-							data += 4;
-							size -= 4;
-						}
-					}
-				}
-				// 110xxxxx -> U+0080..U+07FF
-				else if ((unsigned)(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
-				{
-					result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
-					data += 2;
-					size -= 2;
-				}
-				// 1110xxxx -> U+0800-U+FFFF
-				else if ((unsigned)(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
-				{
-					result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
-					data += 3;
-					size -= 3;
-				}
-				// 11110xxx -> U+10000..U+10FFFF
-				else if ((unsigned)(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
-				{
-					result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
-					data += 4;
-					size -= 4;
-				}
-				// 10xxxxxx or 11111xxx -> invalid
-				else
-				{
-					data += 1;
-					size -= 1;
-				}
-			}
-
-			return result;
-		}
-
-		static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
-		{
-			const uint16_t* end = data + size;
-
-			while (data < end)
-			{
-				uint16_t lead = opt_swap::value ? endian_swap(*data) : *data;
-
-				// U+0000..U+D7FF
-				if (lead < 0xD800)
-				{
-					result = Traits::low(result, lead);
-					data += 1;
-				}
-				// U+E000..U+FFFF
-				else if ((unsigned)(lead - 0xE000) < 0x2000)
-				{
-					result = Traits::low(result, lead);
-					data += 1;
-				}
-				// surrogate pair lead
-				else if ((unsigned)(lead - 0xD800) < 0x400 && data + 1 < end)
-				{
-					uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
-
-					if ((unsigned)(next - 0xDC00) < 0x400)
-					{
-						result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
-						data += 2;
-					}
-					else
-					{
-						data += 1;
-					}
-				}
-				else
-				{
-					data += 1;
-				}
-			}
-
-			return result;
-		}
-
-		static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
-		{
-			const uint32_t* end = data + size;
-
-			while (data < end)
-			{
-				uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
-
-				// U+0000..U+FFFF
-				if (lead < 0x10000)
-				{
-					result = Traits::low(result, lead);
-					data += 1;
-				}
-				// U+10000..U+10FFFF
-				else
-				{
-					result = Traits::high(result, lead);
-					data += 1;
-				}
-			}
-
-			return result;
-		}
-	};
-
-	template <typename T> inline void convert_utf_endian_swap(T* result, const T* data, size_t length)
-	{
-		for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
-	}
-
-	inline void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
-	{
-		for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
-	}
-}
-
-namespace
-{	
-	enum chartype_t
-	{
-		ct_parse_pcdata = 1,	// \0, &, \r, <
-		ct_parse_attr = 2,		// \0, &, \r, ', "
-		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, tab
-		ct_space = 8,			// \r, \n, space, tab
-		ct_parse_cdata = 16,	// \0, ], >, \r
-		ct_parse_comment = 32,	// \0, -, >, \r
-		ct_symbol = 64,			// Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
-		ct_start_symbol = 128	// Any symbol > 127, a-z, A-Z, _, :
-	};
-
-	const unsigned char chartype_table[256] =
-	{
-		55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
-		0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
-		8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
-		64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
-		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
-		0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 96-111
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0, 0, 0, 0, 0,           // 112-127
-
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 128+
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
-		192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192
-	};
-
-	enum chartypex_t
-	{
-		ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
-		ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
-		ctx_start_symbol = 4,	  // Any symbol > 127, a-z, A-Z, _
-		ctx_digit = 8,			  // 0-9
-		ctx_symbol = 16			  // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
-	};
-	
-	const unsigned char chartypex_table[256] =
-	{
-		3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
-		3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
-		0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
-		24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63
-
-		0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
-		0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127
-
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
-		20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
-	};
-	
-#ifdef PUGIXML_WCHAR_MODE
-	#define IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
-#else
-	#define IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
-#endif
-
-	#define IS_CHARTYPE(c, ct) IS_CHARTYPE_IMPL(c, ct, chartype_table)
-	#define IS_CHARTYPEX(c, ct) IS_CHARTYPE_IMPL(c, ct, chartypex_table)
-
-	bool is_little_endian()
-	{
-		unsigned int ui = 1;
-
-		return *reinterpret_cast<unsigned char*>(&ui) == 1;
-	}
-
-	xml_encoding get_wchar_encoding()
-	{
-		STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
-
-		if (sizeof(wchar_t) == 2)
-			return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-		else 
-			return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-	}
-
-	xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
-	{
-		// look for BOM in first few bytes
-		if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
-		if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
-		if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
-		if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
-		if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
-
-		// look for <, <? or <?xm in various encodings
-		if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
-		if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
-		if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
-		if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
-		if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
-
-		// look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
-		if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
-		if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
-
-		// no known BOM detected, assume utf8
-		return encoding_utf8;
-	}
-
-	xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
-	{
-		// replace wchar encoding with utf implementation
-		if (encoding == encoding_wchar) return get_wchar_encoding();
-
-		// replace utf16 encoding with utf16 with specific endianness
-		if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-
-		// replace utf32 encoding with utf32 with specific endianness
-		if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-
-		// only do autodetection if no explicit encoding is requested
-		if (encoding != encoding_auto) return encoding;
-
-		// skip encoding autodetection if input buffer is too small
-		if (size < 4) return encoding_utf8;
-
-		// try to guess encoding (based on XML specification, Appendix F.1)
-		const uint8_t* data = static_cast<const uint8_t*>(contents);
-
-		DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
-
-		return guess_buffer_encoding(d0, d1, d2, d3);
-	}
-
-	bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
-	{
-		if (is_mutable)
-		{
-			out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
-		}
-		else
-		{
-			void* buffer = global_allocate(size > 0 ? size : 1);
-			if (!buffer) return false;
-
-			memcpy(buffer, contents, size);
-
-			out_buffer = static_cast<char_t*>(buffer);
-		}
-
-		out_length = size / sizeof(char_t);
-
-		return true;
-	}
-
-#ifdef PUGIXML_WCHAR_MODE
-	inline bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
-	{
-		return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
-		       (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
-	}
-
-	bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
-	{
-		const char_t* data = static_cast<const char_t*>(contents);
-	
-		if (is_mutable)
-		{
-			out_buffer = const_cast<char_t*>(data);
-		}
-		else
-		{
-			out_buffer = static_cast<char_t*>(global_allocate(size > 0 ? size : 1));
-			if (!out_buffer) return false;
-		}
-
-		out_length = size / sizeof(char_t);
-
-		convert_wchar_endian_swap(out_buffer, data, out_length);
-
-		return true;
-	}
-
-	bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
-	{
-		const uint8_t* data = static_cast<const uint8_t*>(contents);
-
-		// first pass: get length in wchar_t units
-		out_length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
-
-		// allocate buffer of suitable length
-		out_buffer = static_cast<char_t*>(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
-		if (!out_buffer) return false;
-
-		// second pass: convert utf8 input to wchar_t
-		wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
-		wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, out_begin);
-
-		assert(out_end == out_begin + out_length);
-		(void)!out_end;
-
-		return true;
-	}
-
-	template <typename opt_swap> bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
-	{
-		const uint16_t* data = static_cast<const uint16_t*>(contents);
-		size_t length = size / sizeof(uint16_t);
-
-		// first pass: get length in wchar_t units
-		out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, length, 0);
-
-		// allocate buffer of suitable length
-		out_buffer = static_cast<char_t*>(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
-		if (!out_buffer) return false;
-
-		// second pass: convert utf16 input to wchar_t
-		wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
-		wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, length, out_begin);
-
-		assert(out_end == out_begin + out_length);
-		(void)!out_end;
-
-		return true;
-	}
-
-	template <typename opt_swap> bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
-	{
-		const uint32_t* data = static_cast<const uint32_t*>(contents);
-		size_t length = size / sizeof(uint32_t);
-
-		// first pass: get length in wchar_t units
-		out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, length, 0);
-
-		// allocate buffer of suitable length
-		out_buffer = static_cast<char_t*>(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
-		if (!out_buffer) return false;
-
-		// second pass: convert utf32 input to wchar_t
-		wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
-		wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, length, out_begin);
-
-		assert(out_end == out_begin + out_length);
-		(void)!out_end;
-
-		return true;
-	}
-
-	bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
-	{
-		// get native encoding
-		xml_encoding wchar_encoding = get_wchar_encoding();
-
-		// fast path: no conversion required
-		if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
-
-		// only endian-swapping is required
-		if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
-
-		// source encoding is utf8
-		if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
-
-		// source encoding is utf16
-		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
-		{
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-
-			return (native_encoding == encoding) ?
-				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
-				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
-		}
-
-		// source encoding is utf32
-		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
-		{
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-
-			return (native_encoding == encoding) ?
-				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
-				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
-		}
-
-		assert(!"Invalid encoding");
-		return false;
-	}
-#else
-	template <typename opt_swap> bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
-	{
-		const uint16_t* data = static_cast<const uint16_t*>(contents);
-		size_t length = size / sizeof(uint16_t);
-
-		// first pass: get length in utf8 units
-		out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, length, 0);
-
-		// allocate buffer of suitable length
-		out_buffer = static_cast<char_t*>(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
-		if (!out_buffer) return false;
-
-		// second pass: convert utf16 input to utf8
-		uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
-		uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, length, out_begin);
-
-		assert(out_end == out_begin + out_length);
-		(void)!out_end;
-
-		return true;
-	}
-
-	template <typename opt_swap> bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
-	{
-		const uint32_t* data = static_cast<const uint32_t*>(contents);
-		size_t length = size / sizeof(uint32_t);
-
-		// first pass: get length in utf8 units
-		out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, length, 0);
-
-		// allocate buffer of suitable length
-		out_buffer = static_cast<char_t*>(global_allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
-		if (!out_buffer) return false;
-
-		// second pass: convert utf32 input to utf8
-		uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
-		uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, length, out_begin);
-
-		assert(out_end == out_begin + out_length);
-		(void)!out_end;
-
-		return true;
-	}
-
-	bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
-	{
-		// fast path: no conversion required
-		if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
-
-		// source encoding is utf16
-		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
-		{
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-
-			return (native_encoding == encoding) ?
-				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
-				convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
-		}
-
-		// source encoding is utf32
-		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
-		{
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-
-			return (native_encoding == encoding) ?
-				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
-				convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
-		}
-
-		assert(!"Invalid encoding");
-		return false;
-	}
-#endif
-
-	size_t as_utf8_begin(const wchar_t* str, size_t length)
-	{
-		STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
-
-		// get length in utf8 characters
-		return sizeof(wchar_t) == 2 ?
-			utf_decoder<utf8_counter>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, 0) :
-			utf_decoder<utf8_counter>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, 0);
-    }
-
-    void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
-    {
-		STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
-
-        // convert to utf8
-        uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
-        uint8_t* end = sizeof(wchar_t) == 2 ?
-            utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(str), length, begin) :
-            utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(str), length, begin);
-    
-        assert(begin + size == end);
-        (void)!end;
-
-		// zero-terminate
-		buffer[size] = 0;
-	}
-    
-#ifndef PUGIXML_NO_STL
-    std::string as_utf8_impl(const wchar_t* str, size_t length)
-    {
-		// first pass: get length in utf8 characters
-        size_t size = as_utf8_begin(str, length);
-
-		// allocate resulting string
-		std::string result;
-		result.resize(size);
-
-		// second pass: convert to utf8
-		if (size > 0) as_utf8_end(&result[0], size, str, length);
-
-	  	return result;
-    }
-
-	std::wstring as_wide_impl(const char* str, size_t size)
-	{
-		const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
-
-		// first pass: get length in wchar_t units
-		size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
-
-		// allocate resulting string
-		std::wstring result;
-		result.resize(length);
-
-		// second pass: convert to wchar_t
-		if (length > 0)
-		{
-			wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
-			wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
-
-			assert(begin + length == end);
-			(void)!end;
-		}
-
-		return result;
-	}
-#endif
-
-	inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
-	{
-		assert(target);
-		size_t target_length = strlength(target);
-
-		// always reuse document buffer memory if possible
-		if (!allocated) return target_length >= length;
-
-		// reuse heap memory if waste is not too great
-		const size_t reuse_threshold = 32;
-
-		return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
-	}
-
-	bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
-	{
-		size_t source_length = strlength(source);
-
-		if (source_length == 0)
-		{
-			// empty string and null pointer are equivalent, so just deallocate old memory
-			xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
-
-			if (header & header_mask) alloc->deallocate_string(dest);
-			
-			// mark the string as not allocated
-			dest = 0;
-			header &= ~header_mask;
-
-			return true;
-		}
-		else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
-		{
-			// we can reuse old buffer, so just copy the new data (including zero terminator)
-			memcpy(dest, source, (source_length + 1) * sizeof(char_t));
-			
-			return true;
-		}
-		else
-		{
-			xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
-
-			// allocate new buffer
-			char_t* buf = alloc->allocate_string(source_length + 1);
-			if (!buf) return false;
-
-			// copy the string (including zero terminator)
-			memcpy(buf, source, (source_length + 1) * sizeof(char_t));
-
-			// deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
-			if (header & header_mask) alloc->deallocate_string(dest);
-			
-			// the string is now allocated, so set the flag
-			dest = buf;
-			header |= header_mask;
-
-			return true;
-		}
-	}
-
-	struct gap
-	{
-		char_t* end;
-		size_t size;
-			
-		gap(): end(0), size(0)
-		{
-		}
-			
-		// Push new gap, move s count bytes further (skipping the gap).
-		// Collapse previous gap.
-		void push(char_t*& s, size_t count)
-		{
-			if (end) // there was a gap already; collapse it
-			{
-				// Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
-				assert(s >= end);
-				memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
-			}
-				
-			s += count; // end of current gap
-				
-			// "merge" two gaps
-			end = s;
-			size += count;
-		}
-			
-		// Collapse all gaps, return past-the-end pointer
-		char_t* flush(char_t* s)
-		{
-			if (end)
-			{
-				// Move [old_gap_end, current_pos) to [old_gap_start, ...)
-				assert(s >= end);
-				memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
-
-				return s - size;
-			}
-			else return s;
-		}
-	};
-	
-	char_t* strconv_escape(char_t* s, gap& g)
-	{
-		char_t* stre = s + 1;
-
-		switch (*stre)
-		{
-			case '#':	// &#...
-			{
-				unsigned int ucsc = 0;
-
-				if (stre[1] == 'x') // &#x... (hex code)
-				{
-					stre += 2;
-
-					char_t ch = *stre;
-
-					if (ch == ';') return stre;
-
-					for (;;)
-					{
-						if (static_cast<unsigned int>(ch - '0') <= 9)
-							ucsc = 16 * ucsc + (ch - '0');
-						else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
-							ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
-						else if (ch == ';')
-							break;
-						else // cancel
-							return stre;
-
-						ch = *++stre;
-					}
-					
-					++stre;
-				}
-				else	// &#... (dec code)
-				{
-					char_t ch = *++stre;
-
-					if (ch == ';') return stre;
-
-					for (;;)
-					{
-						if (static_cast<unsigned int>(ch - '0') <= 9)
-							ucsc = 10 * ucsc + (ch - '0');
-						else if (ch == ';')
-							break;
-						else // cancel
-							return stre;
-
-						ch = *++stre;
-					}
-					
-					++stre;
-				}
-
-			#ifdef PUGIXML_WCHAR_MODE
-				s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
-			#else
-				s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
-			#endif
-					
-				g.push(s, stre - s);
-				return stre;
-			}
-			case 'a':	// &a
-			{
-				++stre;
-
-				if (*stre == 'm') // &am
-				{
-					if (*++stre == 'p' && *++stre == ';') // &
-					{
-						*s++ = '&';
-						++stre;
-							
-						g.push(s, stre - s);
-						return stre;
-					}
-				}
-				else if (*stre == 'p') // &ap
-				{
-					if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // '
-					{
-						*s++ = '\'';
-						++stre;
-
-						g.push(s, stre - s);
-						return stre;
-					}
-				}
-				break;
-			}
-			case 'g': // &g
-			{
-				if (*++stre == 't' && *++stre == ';') // >
-				{
-					*s++ = '>';
-					++stre;
-					
-					g.push(s, stre - s);
-					return stre;
-				}
-				break;
-			}
-			case 'l': // &l
-			{
-				if (*++stre == 't' && *++stre == ';') // <
-				{
-					*s++ = '<';
-					++stre;
-						
-					g.push(s, stre - s);
-					return stre;
-				}
-				break;
-			}
-			case 'q': // &q
-			{
-				if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // "
-				{
-					*s++ = '"';
-					++stre;
-					
-					g.push(s, stre - s);
-					return stre;
-				}
-				break;
-			}
-		}
-		
-		return stre;
-	}
-
-	// Utility macro for last character handling
-	#define ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
-
-	char_t* strconv_comment(char_t* s, char_t endch)
-	{
-		gap g;
-		
-		while (true)
-		{
-			while (!IS_CHARTYPE(*s, ct_parse_comment)) ++s;
-		
-			if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
-			{
-				*s++ = '\n'; // replace first one with 0x0a
-				
-				if (*s == '\n') g.push(s, 1);
-			}
-			else if (s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')) // comment ends here
-			{
-				*g.flush(s) = 0;
-				
-				return s + (s[2] == '>' ? 3 : 2);
-			}
-			else if (*s == 0)
-			{
-				return 0;
-			}
-			else ++s;
-		}
-	}
-
-	char_t* strconv_cdata(char_t* s, char_t endch)
-	{
-		gap g;
-			
-		while (true)
-		{
-			while (!IS_CHARTYPE(*s, ct_parse_cdata)) ++s;
-			
-			if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
-			{
-				*s++ = '\n'; // replace first one with 0x0a
-				
-				if (*s == '\n') g.push(s, 1);
-			}
-			else if (s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')) // CDATA ends here
-			{
-				*g.flush(s) = 0;
-				
-				return s + 1;
-			}
-			else if (*s == 0)
-			{
-				return 0;
-			}
-			else ++s;
-		}
-	}
-	
-	typedef char_t* (*strconv_pcdata_t)(char_t*);
-		
-	template <typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
-	{
-		static char_t* parse(char_t* s)
-		{
-			gap g;
-			
-			while (true)
-			{
-				while (!IS_CHARTYPE(*s, ct_parse_pcdata)) ++s;
-					
-				if (*s == '<') // PCDATA ends here
-				{
-					*g.flush(s) = 0;
-					
-					return s + 1;
-				}
-				else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
-				{
-					*s++ = '\n'; // replace first one with 0x0a
-					
-					if (*s == '\n') g.push(s, 1);
-				}
-				else if (opt_escape::value && *s == '&')
-				{
-					s = strconv_escape(s, g);
-				}
-				else if (*s == 0)
-				{
-					return s;
-				}
-				else ++s;
-			}
-		}
-	};
-	
-	strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
-	{
-		STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20);
-
-		switch ((optmask >> 4) & 3) // get bitmask for flags (eol escapes)
-		{
-		case 0: return strconv_pcdata_impl<opt_false, opt_false>::parse;
-		case 1: return strconv_pcdata_impl<opt_false, opt_true>::parse;
-		case 2: return strconv_pcdata_impl<opt_true, opt_false>::parse;
-		case 3: return strconv_pcdata_impl<opt_true, opt_true>::parse;
-		default: return 0; // should not get here
-		}
-	}
-
-	typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
-	
-	template <typename opt_escape> struct strconv_attribute_impl
-	{
-		static char_t* parse_wnorm(char_t* s, char_t end_quote)
-		{
-			gap g;
-
-			// trim leading whitespaces
-			if (IS_CHARTYPE(*s, ct_space))
-			{
-				char_t* str = s;
-				
-				do ++str;
-				while (IS_CHARTYPE(*str, ct_space));
-				
-				g.push(s, str - s);
-			}
-
-			while (true)
-			{
-				while (!IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
-				
-				if (*s == end_quote)
-				{
-					char_t* str = g.flush(s);
-					
-					do *str-- = 0;
-					while (IS_CHARTYPE(*str, ct_space));
-				
-					return s + 1;
-				}
-				else if (IS_CHARTYPE(*s, ct_space))
-				{
-					*s++ = ' ';
-		
-					if (IS_CHARTYPE(*s, ct_space))
-					{
-						char_t* str = s + 1;
-						while (IS_CHARTYPE(*str, ct_space)) ++str;
-						
-						g.push(s, str - s);
-					}
-				}
-				else if (opt_escape::value && *s == '&')
-				{
-					s = strconv_escape(s, g);
-				}
-				else if (!*s)
-				{
-					return 0;
-				}
-				else ++s;
-			}
-		}
-
-		static char_t* parse_wconv(char_t* s, char_t end_quote)
-		{
-			gap g;
-
-			while (true)
-			{
-				while (!IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
-				
-				if (*s == end_quote)
-				{
-					*g.flush(s) = 0;
-				
-					return s + 1;
-				}
-				else if (IS_CHARTYPE(*s, ct_space))
-				{
-					if (*s == '\r')
-					{
-						*s++ = ' ';
-				
-						if (*s == '\n') g.push(s, 1);
-					}
-					else *s++ = ' ';
-				}
-				else if (opt_escape::value && *s == '&')
-				{
-					s = strconv_escape(s, g);
-				}
-				else if (!*s)
-				{
-					return 0;
-				}
-				else ++s;
-			}
-		}
-
-		static char_t* parse_eol(char_t* s, char_t end_quote)
-		{
-			gap g;
-
-			while (true)
-			{
-				while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s;
-				
-				if (*s == end_quote)
-				{
-					*g.flush(s) = 0;
-				
-					return s + 1;
-				}
-				else if (*s == '\r')
-				{
-					*s++ = '\n';
-					
-					if (*s == '\n') g.push(s, 1);
-				}
-				else if (opt_escape::value && *s == '&')
-				{
-					s = strconv_escape(s, g);
-				}
-				else if (!*s)
-				{
-					return 0;
-				}
-				else ++s;
-			}
-		}
-
-		static char_t* parse_simple(char_t* s, char_t end_quote)
-		{
-			gap g;
-
-			while (true)
-			{
-				while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s;
-				
-				if (*s == end_quote)
-				{
-					*g.flush(s) = 0;
-				
-					return s + 1;
-				}
-				else if (opt_escape::value && *s == '&')
-				{
-					s = strconv_escape(s, g);
-				}
-				else if (!*s)
-				{
-					return 0;
-				}
-				else ++s;
-			}
-		}
-	};
-
-	strconv_attribute_t get_strconv_attribute(unsigned int optmask)
-	{
-		STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
-		
-		switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
-		{
-		case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
-		case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
-		case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
-		case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
-		case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
-		case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
-		case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
-		case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
-		case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
-		case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
-		case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
-		case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
-		case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
-		case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
-		case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
-		case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
-		default: return 0; // should not get here
-		}
-	}
-
-	inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
-	{
-		xml_parse_result result;
-		result.status = status;
-		result.offset = offset;
-
-		return result;
-	}
-
-	struct xml_parser
-	{
-		xml_allocator alloc;
-		char_t* error_offset;
-		jmp_buf error_handler;
-		
-		// Parser utilities.
-		#define SKIPWS()			{ while (IS_CHARTYPE(*s, ct_space)) ++s; }
-		#define OPTSET(OPT)			( optmsk & OPT )
-		#define PUSHNODE(TYPE)		{ cursor = append_node(cursor, alloc, TYPE); if (!cursor) THROW_ERROR(status_out_of_memory, s); }
-		#define POPNODE()			{ cursor = cursor->parent; }
-		#define SCANFOR(X)			{ while (*s != 0 && !(X)) ++s; }
-		#define SCANWHILE(X)		{ while ((X)) ++s; }
-		#define ENDSEG()			{ ch = *s; *s = 0; ++s; }
-		#define THROW_ERROR(err, m)	error_offset = m, longjmp(error_handler, err)
-		#define CHECK_ERROR(err, m)	{ if (*s == 0) THROW_ERROR(err, m); }
-		
-		xml_parser(const xml_allocator& alloc): alloc(alloc), error_offset(0)
-		{
-		}
-
-		// DOCTYPE consists of nested sections of the following possible types:
-		// <!-- ... -->, <? ... ?>, "...", '...'
-		// <![...]]>
-		// <!...>
-		// First group can not contain nested groups
-		// Second group can contain nested groups of the same type
-		// Third group can contain all other groups
-		char_t* parse_doctype_primitive(char_t* s)
-		{
-			if (*s == '"' || *s == '\'')
-			{
-				// quoted string
-				char_t ch = *s++;
-				SCANFOR(*s == ch);
-				if (!*s) THROW_ERROR(status_bad_doctype, s);
-
-				s++;
-			}
-			else if (s[0] == '<' && s[1] == '?')
-			{
-				// <? ... ?>
-				s += 2;
-				SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
-				if (!*s) THROW_ERROR(status_bad_doctype, s);
-
-				s += 2;
-			}
-			else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
-			{
-				s += 4;
-				SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
-				if (!*s) THROW_ERROR(status_bad_doctype, s);
-
-				s += 4;
-			}
-			else THROW_ERROR(status_bad_doctype, s);
-
-			return s;
-		}
-
-		char_t* parse_doctype_ignore(char_t* s)
-		{
-			assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
-			s++;
-
-			while (*s)
-			{
-				if (s[0] == '<' && s[1] == '!' && s[2] == '[')
-				{
-					// nested ignore section
-					s = parse_doctype_ignore(s);
-				}
-				else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
-				{
-					// ignore section end
-					s += 3;
-
-					return s;
-				}
-				else s++;
-			}
-
-			THROW_ERROR(status_bad_doctype, s);
-
-			return s;
-		}
-
-		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
-		{
-			assert(s[0] == '<' && s[1] == '!');
-			s++;
-
-			while (*s)
-			{
-				if (s[0] == '<' && s[1] == '!' && s[2] != '-')
-				{
-					if (s[2] == '[')
-					{
-						// ignore
-						s = parse_doctype_ignore(s);
-					}
-					else
-					{
-						// some control group
-						s = parse_doctype_group(s, endch, false);
-					}
-				}
-				else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
-				{
-					// unknown tag (forbidden), or some primitive group
-					s = parse_doctype_primitive(s);
-				}
-				else if (*s == '>')
-				{
-					s++;
-
-					return s;
-				}
-				else s++;
-			}
-
-			if (!toplevel || endch != '>') THROW_ERROR(status_bad_doctype, s);
-
-			return s;
-		}
-
-		char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
-		{
-			// parse node contents, starting with exclamation mark
-			++s;
-
-			if (*s == '-') // '<!-...'
-			{
-				++s;
-
-				if (*s == '-') // '<!--...'
-				{
-					++s;
-
-					if (OPTSET(parse_comments))
-					{
-						PUSHNODE(node_comment); // Append a new node on the tree.
-						cursor->value = s; // Save the offset.
-					}
-
-					if (OPTSET(parse_eol) && OPTSET(parse_comments))
-					{
-						s = strconv_comment(s, endch);
-
-						if (!s) THROW_ERROR(status_bad_comment, cursor->value);
-					}
-					else
-					{
-						// Scan for terminating '-->'.
-						SCANFOR(s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>'));
-						CHECK_ERROR(status_bad_comment, s);
-
-						if (OPTSET(parse_comments))
-							*s = 0; // Zero-terminate this segment at the first terminating '-'.
-
-						s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
-					}
-				}
-				else THROW_ERROR(status_bad_comment, s);
-			}
-			else if (*s == '[')
-			{
-				// '<![CDATA[...'
-				if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
-				{
-					++s;
-
-					if (OPTSET(parse_cdata))
-					{
-						PUSHNODE(node_cdata); // Append a new node on the tree.
-						cursor->value = s; // Save the offset.
-
-						if (OPTSET(parse_eol))
-						{
-							s = strconv_cdata(s, endch);
-
-							if (!s) THROW_ERROR(status_bad_cdata, cursor->value);
-						}
-						else
-						{
-							// Scan for terminating ']]>'.
-							SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
-							CHECK_ERROR(status_bad_cdata, s);
-
-							*s++ = 0; // Zero-terminate this segment.
-						}
-					}
-					else // Flagged for discard, but we still have to scan for the terminator.
-					{
-						// Scan for terminating ']]>'.
-						SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
-						CHECK_ERROR(status_bad_cdata, s);
-
-						++s;
-					}
-
-					s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
-				}
-				else THROW_ERROR(status_bad_cdata, s);
-			}
-			else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
-			{
-				s -= 2;
-
-                if (cursor->parent) THROW_ERROR(status_bad_doctype, s);
-
-                char_t* mark = s + 9;
-
-				s = parse_doctype_group(s, endch, true);
-
-                if (OPTSET(parse_doctype))
-                {
-                    while (IS_CHARTYPE(*mark, ct_space)) ++mark;
-
-                    PUSHNODE(node_doctype);
-
-                    cursor->value = mark;
-
-                    assert((s[0] == 0 && endch == '>') || s[-1] == '>');
-                    s[*s == 0 ? 0 : -1] = 0;
-
-                    POPNODE();
-                }
-			}
-			else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s);
-			else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s);
-			else THROW_ERROR(status_unrecognized_tag, s);
-
-			return s;
-		}
-
-		char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
-		{
-			// load into registers
-			xml_node_struct* cursor = ref_cursor;
-			char_t ch = 0;
-
-			// parse node contents, starting with question mark
-			++s;
-
-			// read PI target
-			char_t* target = s;
-
-			if (!IS_CHARTYPE(*s, ct_start_symbol)) THROW_ERROR(status_bad_pi, s);
-
-			SCANWHILE(IS_CHARTYPE(*s, ct_symbol));
-			CHECK_ERROR(status_bad_pi, s);
-
-			// determine node type; stricmp / strcasecmp is not portable
-			bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
-
-			if (declaration ? OPTSET(parse_declaration) : OPTSET(parse_pi))
-			{
-				if (declaration)
-				{
-					// disallow non top-level declarations
-					if (cursor->parent) THROW_ERROR(status_bad_pi, s);
-
-					PUSHNODE(node_declaration);
-				}
-				else
-				{
-					PUSHNODE(node_pi);
-				}
-
-				cursor->name = target;
-
-				ENDSEG();
-
-				// parse value/attributes
-				if (ch == '?')
-				{
-					// empty node
-					if (!ENDSWITH(*s, '>')) THROW_ERROR(status_bad_pi, s);
-					s += (*s == '>');
-
-					POPNODE();
-				}
-				else if (IS_CHARTYPE(ch, ct_space))
-				{
-					SKIPWS();
-
-					// scan for tag end
-					char_t* value = s;
-
-					SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
-					CHECK_ERROR(status_bad_pi, s);
-
-					if (declaration)
-					{
-						// replace ending ? with / so that 'element' terminates properly
-						*s = '/';
-
-						// we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
-						s = value;
-					}
-					else
-					{
-						// store value and step over >
-						cursor->value = value;
-						POPNODE();
-
-						ENDSEG();
-
-						s += (*s == '>');
-					}
-				}
-				else THROW_ERROR(status_bad_pi, s);
-			}
-			else
-			{
-				// scan for tag end
-				SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
-				CHECK_ERROR(status_bad_pi, s);
-
-				s += (s[1] == '>' ? 2 : 1);
-			}
-
-			// store from registers
-			ref_cursor = cursor;
-
-			return s;
-		}
-
-		void parse(char_t* s, xml_node_struct* xmldoc, unsigned int optmsk, char_t endch)
-		{
-			strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
-			strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
-			
-			char_t ch = 0;
-			xml_node_struct* cursor = xmldoc;
-			char_t* mark = s;
-
-			while (*s != 0)
-			{
-				if (*s == '<')
-				{
-					++s;
-
-				LOC_TAG:
-					if (IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
-					{
-						PUSHNODE(node_element); // Append a new node to the tree.
-
-						cursor->name = s;
-
-						SCANWHILE(IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
-						ENDSEG(); // Save char in 'ch', terminate & step over.
-
-						if (ch == '>')
-						{
-							// end of tag
-						}
-						else if (IS_CHARTYPE(ch, ct_space))
-						{
-						LOC_ATTRIBUTES:
-						    while (true)
-						    {
-								SKIPWS(); // Eat any whitespace.
-						
-								if (IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
-								{
-									xml_attribute_struct* a = append_attribute_ll(cursor, alloc); // Make space for this attribute.
-									if (!a) THROW_ERROR(status_out_of_memory, s);
-
-									a->name = s; // Save the offset.
-
-									SCANWHILE(IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
-									CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
-
-									ENDSEG(); // Save char in 'ch', terminate & step over.
-									CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
-
-									if (IS_CHARTYPE(ch, ct_space))
-									{
-										SKIPWS(); // Eat any whitespace.
-										CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
-
-										ch = *s;
-										++s;
-									}
-									
-									if (ch == '=') // '<... #=...'
-									{
-										SKIPWS(); // Eat any whitespace.
-
-										if (*s == '"' || *s == '\'') // '<... #="...'
-										{
-											ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
-											++s; // Step over the quote.
-											a->value = s; // Save the offset.
-
-											s = strconv_attribute(s, ch);
-										
-											if (!s) THROW_ERROR(status_bad_attribute, a->value);
-
-											// After this line the loop continues from the start;
-											// Whitespaces, / and > are ok, symbols and EOF are wrong,
-											// everything else will be detected
-											if (IS_CHARTYPE(*s, ct_start_symbol)) THROW_ERROR(status_bad_attribute, s);
-										}
-										else THROW_ERROR(status_bad_attribute, s);
-									}
-									else THROW_ERROR(status_bad_attribute, s);
-								}
-								else if (*s == '/')
-								{
-									++s;
-									
-									if (*s == '>')
-									{
-										POPNODE();
-										s++;
-										break;
-									}
-									else if (*s == 0 && endch == '>')
-									{
-										POPNODE();
-										break;
-									}
-									else THROW_ERROR(status_bad_start_element, s);
-								}
-								else if (*s == '>')
-								{
-									++s;
-
-									break;
-								}
-								else if (*s == 0 && endch == '>')
-								{
-									break;
-								}
-								else THROW_ERROR(status_bad_start_element, s);
-							}
-
-							// !!!
-						}
-						else if (ch == '/') // '<#.../'
-						{
-							if (!ENDSWITH(*s, '>')) THROW_ERROR(status_bad_start_element, s);
-
-							POPNODE(); // Pop.
-
-							s += (*s == '>');
-						}
-						else if (ch == 0)
-						{
-							// we stepped over null terminator, backtrack & handle closing tag
-							--s;
-							
-							if (endch != '>') THROW_ERROR(status_bad_start_element, s);
-						}
-						else THROW_ERROR(status_bad_start_element, s);
-					}
-					else if (*s == '/')
-					{
-						++s;
-
-						char_t* name = cursor->name;
-						if (!name) THROW_ERROR(status_end_element_mismatch, s);
-						
-						while (IS_CHARTYPE(*s, ct_symbol))
-						{
-							if (*s++ != *name++) THROW_ERROR(status_end_element_mismatch, s);
-						}
-
-						if (*name)
-						{
-							if (*s == 0 && name[0] == endch && name[1] == 0) THROW_ERROR(status_bad_end_element, s);
-							else THROW_ERROR(status_end_element_mismatch, s);
-						}
-							
-						POPNODE(); // Pop.
-
-						SKIPWS();
-
-						if (*s == 0)
-						{
-							if (endch != '>') THROW_ERROR(status_bad_end_element, s);
-						}
-						else
-						{
-							if (*s != '>') THROW_ERROR(status_bad_end_element, s);
-							++s;
-						}
-					}
-					else if (*s == '?') // '<?...'
-					{
-						s = parse_question(s, cursor, optmsk, endch);
-
-						assert(cursor);
-						if ((cursor->header & xml_memory_page_type_mask) + 1 == node_declaration) goto LOC_ATTRIBUTES;
-					}
-					else if (*s == '!') // '<!...'
-					{
-						s = parse_exclamation(s, cursor, optmsk, endch);
-					}
-					else if (*s == 0 && endch == '?') THROW_ERROR(status_bad_pi, s);
-					else THROW_ERROR(status_unrecognized_tag, s);
-				}
-				else
-				{
-					mark = s; // Save this offset while searching for a terminator.
-
-					SKIPWS(); // Eat whitespace if no genuine PCDATA here.
-
-					if ((!OPTSET(parse_ws_pcdata) || mark == s) && (*s == '<' || !*s))
-					{
-						continue;
-					}
-
-					s = mark;
-							
-					if (cursor->parent)
-					{
-						PUSHNODE(node_pcdata); // Append a new node on the tree.
-						cursor->value = s; // Save the offset.
-
-						s = strconv_pcdata(s);
-								
-						POPNODE(); // Pop since this is a standalone.
-						
-						if (!*s) break;
-					}
-					else
-					{
-						SCANFOR(*s == '<'); // '...<'
-						if (!*s) break;
-						
-						++s;
-					}
-
-					// We're after '<'
-					goto LOC_TAG;
-				}
-			}
-
-			// check that last tag is closed
-			if (cursor != xmldoc) THROW_ERROR(status_end_element_mismatch, s);
-		}
-
-		static xml_parse_result parse(char_t* buffer, size_t length, xml_node_struct* root, unsigned int optmsk)
-		{
-			xml_document_struct* xmldoc = static_cast<xml_document_struct*>(root);
-
-			// store buffer for offset_debug
-			xmldoc->buffer = buffer;
-
-			// early-out for empty documents
-			if (length == 0) return make_parse_result(status_ok);
-
-			// create parser on stack
-			xml_parser parser(*xmldoc);
-
-			// save last character and make buffer zero-terminated (speeds up parsing)
-			char_t endch = buffer[length - 1];
-			buffer[length - 1] = 0;
-			
-			// perform actual parsing
-			int error = setjmp(parser.error_handler);
-
-			if (error == 0)
-			{
-				parser.parse(buffer, xmldoc, optmsk, endch);
-			}
-
-			xml_parse_result result = make_parse_result(static_cast<xml_parse_status>(error), parser.error_offset ? parser.error_offset - buffer : 0);
-			assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
-
-			// update allocator state
-			*static_cast<xml_allocator*>(xmldoc) = parser.alloc;
-
-			// since we removed last character, we have to handle the only possible false positive
-			if (result && endch == '<')
-			{
-				// there's no possible well-formed document with < at the end
-				return make_parse_result(status_unrecognized_tag, length);
-			}
-
-			return result;
-		}
-	};
-
-	// Output facilities
-	xml_encoding get_write_native_encoding()
-	{
-	#ifdef PUGIXML_WCHAR_MODE
-		return get_wchar_encoding();
-	#else
-		return encoding_utf8;
-	#endif
-	}
-
-	xml_encoding get_write_encoding(xml_encoding encoding)
-	{
-		// replace wchar encoding with utf implementation
-		if (encoding == encoding_wchar) return get_wchar_encoding();
-
-		// replace utf16 encoding with utf16 with specific endianness
-		if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-
-		// replace utf32 encoding with utf32 with specific endianness
-		if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-
-		// only do autodetection if no explicit encoding is requested
-		if (encoding != encoding_auto) return encoding;
-
-		// assume utf8 encoding
-		return encoding_utf8;
-	}
-
-#ifdef PUGIXML_WCHAR_MODE
-	size_t get_valid_length(const char_t* data, size_t length)
-	{
-		assert(length > 0);
-
-		// discard last character if it's the lead of a surrogate pair 
-		return (sizeof(wchar_t) == 2 && (unsigned)(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
-	}
-
-	size_t convert_buffer(char* result, const char_t* data, size_t length, xml_encoding encoding)
-	{
-		// only endian-swapping is required
-		if (need_endian_swap_utf(encoding, get_wchar_encoding()))
-		{
-			convert_wchar_endian_swap(reinterpret_cast<char_t*>(result), data, length);
-
-			return length * sizeof(char_t);
-		}
-	
-		// convert to utf8
-		if (encoding == encoding_utf8)
-		{
-			uint8_t* dest = reinterpret_cast<uint8_t*>(result);
-
-			uint8_t* end = sizeof(wchar_t) == 2 ?
-				utf_decoder<utf8_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(data), length, dest) :
-				utf_decoder<utf8_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(data), length, dest);
-
-			return static_cast<size_t>(end - dest);
-		}
-
-		// convert to utf16
-		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
-		{
-			uint16_t* dest = reinterpret_cast<uint16_t*>(result);
-
-			// convert to native utf16
-			uint16_t* end = utf_decoder<utf16_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(data), length, dest);
-
-			// swap if necessary
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-
-			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
-
-			return static_cast<size_t>(end - dest) * sizeof(uint16_t);
-		}
-
-		// convert to utf32
-		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
-		{
-			uint32_t* dest = reinterpret_cast<uint32_t*>(result);
-
-			// convert to native utf32
-			uint32_t* end = utf_decoder<utf32_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(data), length, dest);
-
-			// swap if necessary
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-
-			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
-
-			return static_cast<size_t>(end - dest) * sizeof(uint32_t);
-		}
-
-		assert(!"Invalid encoding");
-		return 0;
-	}
-#else
-	size_t get_valid_length(const char_t* data, size_t length)
-	{
-		assert(length > 4);
-
-		for (size_t i = 1; i <= 4; ++i)
-		{
-			uint8_t ch = static_cast<uint8_t>(data[length - i]);
-
-			// either a standalone character or a leading one
-			if ((ch & 0xc0) != 0x80) return length - i;
-		}
-
-		// there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
-		return length;
-	}
-
-	size_t convert_buffer(char* result, const char_t* data, size_t length, xml_encoding encoding)
-	{
-		if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
-		{
-			uint16_t* dest = reinterpret_cast<uint16_t*>(result);
-
-			// convert to native utf16
-			uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
-
-			// swap if necessary
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
-
-			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
-
-			return static_cast<size_t>(end - dest) * sizeof(uint16_t);
-		}
-
-		if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
-		{
-			uint32_t* dest = reinterpret_cast<uint32_t*>(result);
-
-			// convert to native utf32
-			uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
-
-			// swap if necessary
-			xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
-
-			if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
-
-			return static_cast<size_t>(end - dest) * sizeof(uint32_t);
-		}
-
-		assert(!"Invalid encoding");
-		return 0;
-	}
-#endif
-
-	class xml_buffered_writer
-	{
-		xml_buffered_writer(const xml_buffered_writer&);
-		xml_buffered_writer& operator=(const xml_buffered_writer&);
-
-	public:
-		xml_buffered_writer(xml_writer& writer, xml_encoding user_encoding): writer(writer), bufsize(0), encoding(get_write_encoding(user_encoding))
-		{
-		}
-
-		~xml_buffered_writer()
-		{
-			flush();
-		}
-
-		void flush()
-		{
-			flush(buffer, bufsize);
-			bufsize = 0;
-		}
-
-		void flush(const char_t* data, size_t size)
-		{
-			if (size == 0) return;
-
-			// fast path, just write data
-			if (encoding == get_write_native_encoding())
-				writer.write(data, size * sizeof(char_t));
-			else
-			{
-				// convert chunk
-				size_t result = convert_buffer(scratch, data, size, encoding);
-				assert(result <= sizeof(scratch));
-
-				// write data
-				writer.write(scratch, result);
-			}
-		}
-
-		void write(const char_t* data, size_t length)
-		{
-			if (bufsize + length > bufcapacity)
-			{
-				// flush the remaining buffer contents
-				flush();
-
-				// handle large chunks
-				if (length > bufcapacity)
-				{
-					if (encoding == get_write_native_encoding())
-					{
-						// fast path, can just write data chunk
-						writer.write(data, length * sizeof(char_t));
-						return;
-					}
-
-					// need to convert in suitable chunks
-					while (length > bufcapacity)
-					{
-						// get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
-						// and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
-						size_t chunk_size = get_valid_length(data, bufcapacity);
-
-						// convert chunk and write
-						flush(data, chunk_size);
-
-						// iterate
-						data += chunk_size;
-						length -= chunk_size;
-					}
-
-					// small tail is copied below
-					bufsize = 0;
-				}
-			}
-
-			memcpy(buffer + bufsize, data, length * sizeof(char_t));
-			bufsize += length;
-		}
-
-		void write(const char_t* data)
-		{
-			write(data, strlength(data));
-		}
-
-		void write(char_t d0)
-		{
-			if (bufsize + 1 > bufcapacity) flush();
-
-			buffer[bufsize + 0] = d0;
-			bufsize += 1;
-		}
-
-		void write(char_t d0, char_t d1)
-		{
-			if (bufsize + 2 > bufcapacity) flush();
-
-			buffer[bufsize + 0] = d0;
-			buffer[bufsize + 1] = d1;
-			bufsize += 2;
-		}
-
-		void write(char_t d0, char_t d1, char_t d2)
-		{
-			if (bufsize + 3 > bufcapacity) flush();
-
-			buffer[bufsize + 0] = d0;
-			buffer[bufsize + 1] = d1;
-			buffer[bufsize + 2] = d2;
-			bufsize += 3;
-		}
-
-		void write(char_t d0, char_t d1, char_t d2, char_t d3)
-		{
-			if (bufsize + 4 > bufcapacity) flush();
-
-			buffer[bufsize + 0] = d0;
-			buffer[bufsize + 1] = d1;
-			buffer[bufsize + 2] = d2;
-			buffer[bufsize + 3] = d3;
-			bufsize += 4;
-		}
-
-		void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
-		{
-			if (bufsize + 5 > bufcapacity) flush();
-
-			buffer[bufsize + 0] = d0;
-			buffer[bufsize + 1] = d1;
-			buffer[bufsize + 2] = d2;
-			buffer[bufsize + 3] = d3;
-			buffer[bufsize + 4] = d4;
-			bufsize += 5;
-		}
-
-		void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
-		{
-			if (bufsize + 6 > bufcapacity) flush();
-
-			buffer[bufsize + 0] = d0;
-			buffer[bufsize + 1] = d1;
-			buffer[bufsize + 2] = d2;
-			buffer[bufsize + 3] = d3;
-			buffer[bufsize + 4] = d4;
-			buffer[bufsize + 5] = d5;
-			bufsize += 6;
-		}
-
-		// utf8 maximum expansion: x4 (-> utf32)
-		// utf16 maximum expansion: x2 (-> utf32)
-		// utf32 maximum expansion: x1
-		enum { bufcapacity = 2048 };
-
-		char_t buffer[bufcapacity];
-		char scratch[4 * bufcapacity];
-
-		xml_writer& writer;
-		size_t bufsize;
-		xml_encoding encoding;
-	};
-
-	void write_bom(xml_writer& writer, xml_encoding encoding)
-	{
-		switch (encoding)
-		{
-		case encoding_utf8:
-			writer.write("\xef\xbb\xbf", 3);
-			break;
-
-		case encoding_utf16_be:
-			writer.write("\xfe\xff", 2);
-			break;
-
-		case encoding_utf16_le:
-			writer.write("\xff\xfe", 2);
-			break;
-
-		case encoding_utf32_be:
-			writer.write("\x00\x00\xfe\xff", 4);
-			break;
-
-		case encoding_utf32_le:
-			writer.write("\xff\xfe\x00\x00", 4);
-			break;
-
-		default:
-			assert(!"Invalid encoding");
-		}
-	}
-
-	void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
-	{
-		while (*s)
-		{
-			const char_t* prev = s;
-			
-			// While *s is a usual symbol
-			while (!IS_CHARTYPEX(*s, type)) ++s;
-		
-			writer.write(prev, static_cast<size_t>(s - prev));
-
-			switch (*s)
-			{
-				case 0: break;
-				case '&':
-					writer.write('&', 'a', 'm', 'p', ';');
-					++s;
-					break;
-				case '<':
-					writer.write('&', 'l', 't', ';');
-					++s;
-					break;
-				case '>':
-					writer.write('&', 'g', 't', ';');
-					++s;
-					break;
-				case '"':
-					writer.write('&', 'q', 'u', 'o', 't', ';');
-					++s;
-					break;
-				default: // s is not a usual symbol
-				{
-					unsigned int ch = static_cast<unsigned int>(*s++);
-					assert(ch < 32);
-
-					writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
-				}
-			}
-		}
-	}
-
-	void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
-	{
-		do
-		{
-			writer.write('<', '!', '[', 'C', 'D');
-			writer.write('A', 'T', 'A', '[');
-
-			const char_t* prev = s;
-
-			// look for ]]> sequence - we can't output it as is since it terminates CDATA
-			while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s;
-
-			// skip ]] if we stopped at ]]>, > will go to the next CDATA section
-			if (*s) s += 2;
-
-			writer.write(prev, static_cast<size_t>(s - prev));
-
-			writer.write(']', ']', '>');
-		}
-		while (*s);
-	}
-
-	void node_output_attributes(xml_buffered_writer& writer, const xml_node& node)
-	{
-		const char_t* default_name = PUGIXML_TEXT(":anonymous");
-
-		for (xml_attribute a = node.first_attribute(); a; a = a.next_attribute())
-		{
-			writer.write(' ');
-			writer.write(a.name()[0] ? a.name() : default_name);
-			writer.write('=', '"');
-
-			text_output_escaped(writer, a.value(), ctx_special_attr);
-
-			writer.write('"');
-		}
-	}
-
-	void node_output(xml_buffered_writer& writer, const xml_node& node, const char_t* indent, unsigned int flags, unsigned int depth)
-	{
-		const char_t* default_name = PUGIXML_TEXT(":anonymous");
-
-		if ((flags & format_indent) != 0 && (flags & format_raw) == 0)
-			for (unsigned int i = 0; i < depth; ++i) writer.write(indent);
-
-		switch (node.type())
-		{
-		case node_document:
-		{
-			for (xml_node n = node.first_child(); n; n = n.next_sibling())
-				node_output(writer, n, indent, flags, depth);
-			break;
-		}
-			
-		case node_element:
-		{
-			const char_t* name = node.name()[0] ? node.name() : default_name;
-
-			writer.write('<');
-			writer.write(name);
-
-			node_output_attributes(writer, node);
-
-			if (flags & format_raw)
-			{
-				if (!node.first_child())
-					writer.write(' ', '/', '>');
-				else
-				{
-					writer.write('>');
-
-					for (xml_node n = node.first_child(); n; n = n.next_sibling())
-						node_output(writer, n, indent, flags, depth + 1);
-
-					writer.write('<', '/');
-					writer.write(name);
-					writer.write('>');
-				}
-			}
-			else if (!node.first_child())
-				writer.write(' ', '/', '>', '\n');
-			else if (node.first_child() == node.last_child() && (node.first_child().type() == node_pcdata || node.first_child().type() == node_cdata))
-			{
-				writer.write('>');
-
-                if (node.first_child().type() == node_pcdata)
-                    text_output_escaped(writer, node.first_child().value(), ctx_special_pcdata);
-                else
-                    text_output_cdata(writer, node.first_child().value());
-
-				writer.write('<', '/');
-				writer.write(name);
-				writer.write('>', '\n');
-			}
-			else
-			{
-				writer.write('>', '\n');
-				
-				for (xml_node n = node.first_child(); n; n = n.next_sibling())
-					node_output(writer, n, indent, flags, depth + 1);
-
-				if ((flags & format_indent) != 0 && (flags & format_raw) == 0)
-					for (unsigned int i = 0; i < depth; ++i) writer.write(indent);
-				
-				writer.write('<', '/');
-				writer.write(name);
-				writer.write('>', '\n');
-			}
-
-			break;
-		}
-		
-		case node_pcdata:
-			text_output_escaped(writer, node.value(), ctx_special_pcdata);
-			if ((flags & format_raw) == 0) writer.write('\n');
-			break;
-
-		case node_cdata:
-			text_output_cdata(writer, node.value());
-			if ((flags & format_raw) == 0) writer.write('\n');
-			break;
-
-		case node_comment:
-			writer.write('<', '!', '-', '-');
-			writer.write(node.value());
-			writer.write('-', '-', '>');
-			if ((flags & format_raw) == 0) writer.write('\n');
-			break;
-
-		case node_pi:
-		case node_declaration:
-			writer.write('<', '?');
-			writer.write(node.name()[0] ? node.name() : default_name);
-
-			if (node.type() == node_declaration)
-			{
-				node_output_attributes(writer, node);
-			}
-			else if (node.value()[0])
-			{
-				writer.write(' ');
-				writer.write(node.value());
-			}
-
-			writer.write('?', '>');
-			if ((flags & format_raw) == 0) writer.write('\n');
-			break;
-
-		case node_doctype:
-			writer.write('<', '!', 'D', 'O', 'C');
-			writer.write('T', 'Y', 'P', 'E');
-
-            if (node.value()[0])
-            {
-                writer.write(' ');
-                writer.write(node.value());
-            }
-
-            writer.write('>');
-			if ((flags & format_raw) == 0) writer.write('\n');
-			break;
-
-		default:
-			assert(!"Invalid node type");
-		}
-	}
-
-	inline bool has_declaration(const xml_node& node)
-	{
-		for (xml_node child = node.first_child(); child; child = child.next_sibling())
-		{
-			xml_node_type type = child.type();
-
-			if (type == node_declaration) return true;
-			if (type == node_element) return false;
-		}
-
-		return false;
-	}
-
-	inline bool allow_insert_child(xml_node_type parent, xml_node_type child)
-	{
-		if (parent != node_document && parent != node_element) return false;
-		if (child == node_document || child == node_null) return false;
-		if (parent != node_document && (child == node_declaration || child == node_doctype)) return false;
-
-		return true;
-	}
-
-	void recursive_copy_skip(xml_node& dest, const xml_node& source, const xml_node& skip)
-	{
-		assert(dest.type() == source.type());
-
-		switch (source.type())
-		{
-		case node_element:
-		{
-			dest.set_name(source.name());
-
-			for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
-				dest.append_attribute(a.name()).set_value(a.value());
-
-			for (xml_node c = source.first_child(); c; c = c.next_sibling())
-			{
-				if (c == skip) continue;
-
-				xml_node cc = dest.append_child(c.type());
-				assert(cc);
-
-				recursive_copy_skip(cc, c, skip);
-			}
-
-			break;
-		}
-
-		case node_pcdata:
-		case node_cdata:
-		case node_comment:
-        case node_doctype:
-			dest.set_value(source.value());
-			break;
-
-		case node_pi:
-			dest.set_name(source.name());
-			dest.set_value(source.value());
-			break;
-
-		case node_declaration:
-		{
-			dest.set_name(source.name());
-
-			for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
-				dest.append_attribute(a.name()).set_value(a.value());
-
-			break;
-		}
-
-		default:
-			assert(!"Invalid node type");
-		}
-	}
-
-	// we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
-	xml_parse_status get_file_size(FILE* file, size_t& out_result)
-	{
-	#if defined(_MSC_VER) && _MSC_VER >= 1400
-		// there are 64-bit versions of fseek/ftell, let's use them
-		typedef __int64 length_type;
-
-		_fseeki64(file, 0, SEEK_END);
-		length_type length = _ftelli64(file);
-		_fseeki64(file, 0, SEEK_SET);
-	#elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && !defined(__STRICT_ANSI__)
-		// there are 64-bit versions of fseek/ftell, let's use them
-		typedef off64_t length_type;
-
-		fseeko64(file, 0, SEEK_END);
-		length_type length = ftello64(file);
-		fseeko64(file, 0, SEEK_SET);
-	#else
-		// if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway.
-		typedef long length_type;
-
-		fseek(file, 0, SEEK_END);
-		length_type length = ftell(file);
-		fseek(file, 0, SEEK_SET);
-	#endif
-
-		// check for I/O errors
-		if (length < 0) return status_io_error;
-		
-		// check for overflow
-		size_t result = static_cast<size_t>(length);
-
-		if (static_cast<length_type>(result) != length) return status_out_of_memory;
-
-		// finalize
-		out_result = result;
-
-		return status_ok;
-	}
-
-	xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
-	{
-		if (!file) return make_parse_result(status_file_not_found);
-
-		// get file size (can result in I/O errors)
-		size_t size = 0;
-		xml_parse_status size_status = get_file_size(file, size);
-
-		if (size_status != status_ok)
-		{
-			fclose(file);
-			return make_parse_result(size_status);
-		}
-		
-		// allocate buffer for the whole file
-		char* contents = static_cast<char*>(global_allocate(size > 0 ? size : 1));
-
-		if (!contents)
-		{
-			fclose(file);
-			return make_parse_result(status_out_of_memory);
-		}
-
-		// read file in memory
-		size_t read_size = fread(contents, 1, size, file);
-		fclose(file);
-
-		if (read_size != size)
-		{
-			global_deallocate(contents);
-			return make_parse_result(status_io_error);
-		}
-		
-		return doc.load_buffer_inplace_own(contents, size, options, encoding);
-	}
-
-#ifndef PUGIXML_NO_STL
-	template <typename T> xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
-	{
-		// get length of remaining data in stream
-		typename std::basic_istream<T>::pos_type pos = stream.tellg();
-		stream.seekg(0, std::ios::end);
-		std::streamoff length = stream.tellg() - pos;
-		stream.seekg(pos);
-
-		if (stream.fail() || pos < 0) return make_parse_result(status_io_error);
-
-		// guard against huge files
-		size_t read_length = static_cast<size_t>(length);
-
-		if (static_cast<std::streamsize>(read_length) != length || length < 0) return make_parse_result(status_out_of_memory);
-
-		// read stream data into memory (guard against stream exceptions with buffer holder)
-		buffer_holder buffer(global_allocate((read_length > 0 ? read_length : 1) * sizeof(T)), global_deallocate);
-		if (!buffer.data) return make_parse_result(status_out_of_memory);
-
-		stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
-
-		// read may set failbit | eofbit in case gcount() is less than read_length (i.e. line ending conversion), so check for other I/O errors
-		if (stream.bad()) return make_parse_result(status_io_error);
-
-		// load data from buffer
-		size_t actual_length = static_cast<size_t>(stream.gcount());
-		assert(actual_length <= read_length);
-
-		return doc.load_buffer_inplace_own(buffer.release(), actual_length * sizeof(T), options, encoding);
-	}
-#endif
-
-#if defined(_MSC_VER) || defined(__BORLANDC__) || defined(__MINGW32__)
-	FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
-	{
-		return _wfopen(path, mode);
-	}
-#else
-	char* convert_path_heap(const wchar_t* str)
-	{
-		assert(str);
-
-		// first pass: get length in utf8 characters
-		size_t length = wcslen(str);
-        size_t size = as_utf8_begin(str, length);
-
-		// allocate resulting string
-		char* result = static_cast<char*>(global_allocate(size + 1));
-		if (!result) return 0;
-
-		// second pass: convert to utf8
-        as_utf8_end(result, size, str, length);
-
-	  	return result;
-	}
-
-	FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
-	{
-		// there is no standard function to open wide paths, so our best bet is to try utf8 path
-		char* path_utf8 = convert_path_heap(path);
-		if (!path_utf8) return 0;
-
-		// convert mode to ASCII (we mirror _wfopen interface)
-		char mode_ascii[4] = {0};
-		for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast<char>(mode[i]);
-
-		// try to open the utf8 path
-		FILE* result = fopen(path_utf8, mode_ascii);
-
-		// free dummy buffer
-		global_deallocate(path_utf8);
-
-		return result;
-	}
-#endif
-}
-
-namespace pugi
-{
-	xml_writer_file::xml_writer_file(void* file): file(file)
-	{
-	}
-
-	void xml_writer_file::write(const void* data, size_t size)
-	{
-		fwrite(data, size, 1, static_cast<FILE*>(file));
-	}
-
-#ifndef PUGIXML_NO_STL
-	xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream): narrow_stream(&stream), wide_stream(0)
-	{
-	}
-
-	xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream): narrow_stream(0), wide_stream(&stream)
-	{
-	}
-
-	void xml_writer_stream::write(const void* data, size_t size)
-	{
-		if (narrow_stream)
-		{
-			assert(!wide_stream);
-			narrow_stream->write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(size));
-		}
-		else
-		{
-			assert(wide_stream);
-			assert(size % sizeof(wchar_t) == 0);
-
-			wide_stream->write(reinterpret_cast<const wchar_t*>(data), static_cast<std::streamsize>(size / sizeof(wchar_t)));
-		}
-	}
-#endif
-
-	xml_tree_walker::xml_tree_walker(): _depth(0)
-	{
-	}
-	
-	xml_tree_walker::~xml_tree_walker()
-	{
-	}
-
-	int xml_tree_walker::depth() const
-	{
-		return _depth;
-	}
-
-	bool xml_tree_walker::begin(xml_node&)
-	{
-		return true;
-	}
-
-	bool xml_tree_walker::end(xml_node&)
-	{
-		return true;
-	}
-
-	xml_attribute::xml_attribute(): _attr(0)
-	{
-	}
-
-	xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr)
-	{
-	}
-
-	xml_attribute::operator xml_attribute::unspecified_bool_type() const
-	{
-      	return _attr ? &xml_attribute::_attr : 0;
-   	}
-
-   	bool xml_attribute::operator!() const
-   	{
-   		return !_attr;
-   	}
-
-	bool xml_attribute::operator==(const xml_attribute& r) const
-	{
-		return (_attr == r._attr);
-	}
-	
-	bool xml_attribute::operator!=(const xml_attribute& r) const
-	{
-		return (_attr != r._attr);
-	}
-
-	bool xml_attribute::operator<(const xml_attribute& r) const
-	{
-		return (_attr < r._attr);
-	}
-	
-	bool xml_attribute::operator>(const xml_attribute& r) const
-	{
-		return (_attr > r._attr);
-	}
-	
-	bool xml_attribute::operator<=(const xml_attribute& r) const
-	{
-		return (_attr <= r._attr);
-	}
-	
-	bool xml_attribute::operator>=(const xml_attribute& r) const
-	{
-		return (_attr >= r._attr);
-	}
-
-   	xml_attribute xml_attribute::next_attribute() const
-   	{
-    	return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute();
-   	}
-
-    xml_attribute xml_attribute::previous_attribute() const
-    {
-    	return _attr && _attr->prev_attribute_c->next_attribute ? xml_attribute(_attr->prev_attribute_c) : xml_attribute();
-    }
-
-	int xml_attribute::as_int() const
-	{
-		if (!_attr || !_attr->value) return 0;
-
-	#ifdef PUGIXML_WCHAR_MODE
-		return (int)wcstol(_attr->value, 0, 10);
-	#else
-		return (int)strtol(_attr->value, 0, 10);
-	#endif
-	}
-
-	unsigned int xml_attribute::as_uint() const
-	{
-		if (!_attr || !_attr->value) return 0;
-
-	#ifdef PUGIXML_WCHAR_MODE
-		return (unsigned int)wcstoul(_attr->value, 0, 10);
-	#else
-		return (unsigned int)strtoul(_attr->value, 0, 10);
-	#endif
-	}
-
-	double xml_attribute::as_double() const
-	{
-		if (!_attr || !_attr->value) return 0;
-
-	#ifdef PUGIXML_WCHAR_MODE
-		return wcstod(_attr->value, 0);
-	#else
-		return strtod(_attr->value, 0);
-	#endif
-	}
-
-	float xml_attribute::as_float() const
-	{
-		if (!_attr || !_attr->value) return 0;
-
-	#ifdef PUGIXML_WCHAR_MODE
-		return (float)wcstod(_attr->value, 0);
-	#else
-		return (float)strtod(_attr->value, 0);
-	#endif
-	}
-
-	bool xml_attribute::as_bool() const
-	{
-		if (!_attr || !_attr->value) return false;
-
-		// only look at first char
-		char_t first = *_attr->value;
-
-		// 1*, t* (true), T* (True), y* (yes), Y* (YES)
-		return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y');
-	}
-
-	bool xml_attribute::empty() const
-	{
-		return !_attr;
-	}
-
-	const char_t* xml_attribute::name() const
-	{
-		return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT("");
-	}
-
-	const char_t* xml_attribute::value() const
-	{
-		return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT("");
-	}
-
-    size_t xml_attribute::hash_value() const
-    {
-        return static_cast<size_t>(reinterpret_cast<uintptr_t>(_attr) / sizeof(xml_attribute_struct));
-    }
-
-	xml_attribute_struct* xml_attribute::internal_object() const
-	{
-        return _attr;
-	}
-
-	xml_attribute& xml_attribute::operator=(const char_t* rhs)
-	{
-		set_value(rhs);
-		return *this;
-	}
-	
-	xml_attribute& xml_attribute::operator=(int rhs)
-	{
-		set_value(rhs);
-		return *this;
-	}
-
-	xml_attribute& xml_attribute::operator=(unsigned int rhs)
-	{
-		set_value(rhs);
-		return *this;
-	}
-
-	xml_attribute& xml_attribute::operator=(double rhs)
-	{
-		set_value(rhs);
-		return *this;
-	}
-	
-	xml_attribute& xml_attribute::operator=(bool rhs)
-	{
-		set_value(rhs);
-		return *this;
-	}
-
-	bool xml_attribute::set_name(const char_t* rhs)
-	{
-		if (!_attr) return false;
-		
-		return strcpy_insitu(_attr->name, _attr->header, xml_memory_page_name_allocated_mask, rhs);
-	}
-		
-	bool xml_attribute::set_value(const char_t* rhs)
-	{
-		if (!_attr) return false;
-
-		return strcpy_insitu(_attr->value, _attr->header, xml_memory_page_value_allocated_mask, rhs);
-	}
-
-	bool xml_attribute::set_value(int rhs)
-	{
-		char buf[128];
-		sprintf(buf, "%d", rhs);
-	
-	#ifdef PUGIXML_WCHAR_MODE
-		char_t wbuf[128];
-		widen_ascii(wbuf, buf);
-
-		return set_value(wbuf);
-	#else
-		return set_value(buf);
-	#endif
-	}
-
-	bool xml_attribute::set_value(unsigned int rhs)
-	{
-		char buf[128];
-		sprintf(buf, "%u", rhs);
-
-	#ifdef PUGIXML_WCHAR_MODE
-		char_t wbuf[128];
-		widen_ascii(wbuf, buf);
-
-		return set_value(wbuf);
-	#else
-		return set_value(buf);
-	#endif
-	}
-
-	bool xml_attribute::set_value(double rhs)
-	{
-		char buf[128];
-		sprintf(buf, "%g", rhs);
-
-	#ifdef PUGIXML_WCHAR_MODE
-		char_t wbuf[128];
-		widen_ascii(wbuf, buf);
-
-		return set_value(wbuf);
-	#else
-		return set_value(buf);
-	#endif
-	}
-	
-	bool xml_attribute::set_value(bool rhs)
-	{
-		return set_value(rhs ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
-	}
-
-#ifdef __BORLANDC__
-	bool operator&&(const xml_attribute& lhs, bool rhs)
-	{
-		return (bool)lhs && rhs;
-	}
-
-	bool operator||(const xml_attribute& lhs, bool rhs)
-	{
-		return (bool)lhs || rhs;
-	}
-#endif
-
-	xml_node::xml_node(): _root(0)
-	{
-	}
-
-	xml_node::xml_node(xml_node_struct* p): _root(p)
-	{
-	}
-	
-	xml_node::operator xml_node::unspecified_bool_type() const
-	{
-      	return _root ? &xml_node::_root : 0;
-   	}
-
-   	bool xml_node::operator!() const
-   	{
-   		return !_root;
-   	}
-
-	xml_node::iterator xml_node::begin() const
-	{
-		return iterator(_root ? _root->first_child : 0, _root);
-	}
-
-	xml_node::iterator xml_node::end() const
-	{
-		return iterator(0, _root);
-	}
-	
-	xml_node::attribute_iterator xml_node::attributes_begin() const
-	{
-		return attribute_iterator(_root ? _root->first_attribute : 0, _root);
-	}
-
-	xml_node::attribute_iterator xml_node::attributes_end() const
-	{
-		return attribute_iterator(0, _root);
-	}
-
-	bool xml_node::operator==(const xml_node& r) const
-	{
-		return (_root == r._root);
-	}
-
-	bool xml_node::operator!=(const xml_node& r) const
-	{
-		return (_root != r._root);
-	}
-
-	bool xml_node::operator<(const xml_node& r) const
-	{
-		return (_root < r._root);
-	}
-	
-	bool xml_node::operator>(const xml_node& r) const
-	{
-		return (_root > r._root);
-	}
-	
-	bool xml_node::operator<=(const xml_node& r) const
-	{
-		return (_root <= r._root);
-	}
-	
-	bool xml_node::operator>=(const xml_node& r) const
-	{
-		return (_root >= r._root);
-	}
-
-	bool xml_node::empty() const
-	{
-		return !_root;
-	}
-	
-	const char_t* xml_node::name() const
-	{
-		return (_root && _root->name) ? _root->name : PUGIXML_TEXT("");
-	}
-
-	xml_node_type xml_node::type() const
-	{
-		return _root ? static_cast<xml_node_type>((_root->header & xml_memory_page_type_mask) + 1) : node_null;
-	}
-	
-	const char_t* xml_node::value() const
-	{
-		return (_root && _root->value) ? _root->value : PUGIXML_TEXT("");
-	}
-	
-	xml_node xml_node::child(const char_t* name) const
-	{
-		if (!_root) return xml_node();
-
-		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
-			if (i->name && strequal(name, i->name)) return xml_node(i);
-
-		return xml_node();
-	}
-
-	xml_attribute xml_node::attribute(const char_t* name) const
-	{
-		if (!_root) return xml_attribute();
-
-		for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
-			if (i->name && strequal(name, i->name))
-				return xml_attribute(i);
-		
-		return xml_attribute();
-	}
-	
-	xml_node xml_node::next_sibling(const char_t* name) const
-	{
-		if (!_root) return xml_node();
-		
-		for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
-			if (i->name && strequal(name, i->name)) return xml_node(i);
-
-		return xml_node();
-	}
-
-	xml_node xml_node::next_sibling() const
-	{
-		if (!_root) return xml_node();
-		
-		if (_root->next_sibling) return xml_node(_root->next_sibling);
-		else return xml_node();
-	}
-
-	xml_node xml_node::previous_sibling(const char_t* name) const
-	{
-		if (!_root) return xml_node();
-		
-		for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c)
-			if (i->name && strequal(name, i->name)) return xml_node(i);
-
-		return xml_node();
-	}
-
-	xml_node xml_node::previous_sibling() const
-	{
-		if (!_root) return xml_node();
-		
-		if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c);
-		else return xml_node();
-	}
-
-	xml_node xml_node::parent() const
-	{
-		return _root ? xml_node(_root->parent) : xml_node();
-	}
-
-	xml_node xml_node::root() const
-	{
-		if (!_root) return xml_node();
-
-		xml_memory_page* page = reinterpret_cast<xml_memory_page*>(_root->header & xml_memory_page_pointer_mask);
-
-		return xml_node(static_cast<xml_document_struct*>(page->allocator));
-	}
-
-	const char_t* xml_node::child_value() const
-	{
-		if (!_root) return PUGIXML_TEXT("");
-		
-		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
-		{
-			xml_node_type type = static_cast<xml_node_type>((i->header & xml_memory_page_type_mask) + 1);
-
-			if (i->value && (type == node_pcdata || type == node_cdata))
-				return i->value;
-		}
-
-		return PUGIXML_TEXT("");
-	}
-
-	const char_t* xml_node::child_value(const char_t* name) const
-	{
-		return child(name).child_value();
-	}
-
-	xml_attribute xml_node::first_attribute() const
-	{
-		return _root ? xml_attribute(_root->first_attribute) : xml_attribute();
-	}
-
-	xml_attribute xml_node::last_attribute() const
-	{
-		return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute();
-	}
-
-	xml_node xml_node::first_child() const
-	{
-		return _root ? xml_node(_root->first_child) : xml_node();
-	}
-
-	xml_node xml_node::last_child() const
-	{
-		return _root && _root->first_child ? xml_node(_root->first_child->prev_sibling_c) : xml_node();
-	}
-
-	bool xml_node::set_name(const char_t* rhs)
-	{
-		switch (type())
-		{
-		case node_pi:
-		case node_declaration:
-		case node_element:
-			return strcpy_insitu(_root->name, _root->header, xml_memory_page_name_allocated_mask, rhs);
-
-		default:
-			return false;
-		}
-	}
-		
-	bool xml_node::set_value(const char_t* rhs)
-	{
-		switch (type())
-		{
-		case node_pi:
-		case node_cdata:
-		case node_pcdata:
-		case node_comment:
-        case node_doctype:
-			return strcpy_insitu(_root->value, _root->header, xml_memory_page_value_allocated_mask, rhs);
-
-		default:
-			return false;
-		}
-	}
-
-	xml_attribute xml_node::append_attribute(const char_t* name)
-	{
-		if (type() != node_element && type() != node_declaration) return xml_attribute();
-		
-		xml_attribute a(append_attribute_ll(_root, get_allocator(_root)));
-		a.set_name(name);
-		
-		return a;
-	}
-
-	xml_attribute xml_node::prepend_attribute(const char_t* name)
-	{
-		if (type() != node_element && type() != node_declaration) return xml_attribute();
-		
-		xml_attribute a(allocate_attribute(get_allocator(_root)));
-		if (!a) return xml_attribute();
-
-		a.set_name(name);
-		
-        xml_attribute_struct* head = _root->first_attribute;
-
-		if (head)
-        {
-            a._attr->prev_attribute_c = head->prev_attribute_c;
-            head->prev_attribute_c = a._attr;
-        }
-        else
-            a._attr->prev_attribute_c = a._attr;
-		
-		a._attr->next_attribute = head;
-        _root->first_attribute = a._attr;
-				
-		return a;
-	}
-
-	xml_attribute xml_node::insert_attribute_before(const char_t* name, const xml_attribute& attr)
-	{
-		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
-		
-		// check that attribute belongs to *this
-		xml_attribute_struct* cur = attr._attr;
-
-		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c;
-
-		if (cur != _root->first_attribute) return xml_attribute();
-
-		xml_attribute a(allocate_attribute(get_allocator(_root)));
-		if (!a) return xml_attribute();
-
-		a.set_name(name);
-
-		if (attr._attr->prev_attribute_c->next_attribute)
-			attr._attr->prev_attribute_c->next_attribute = a._attr;
-		else
-			_root->first_attribute = a._attr;
-		
-		a._attr->prev_attribute_c = attr._attr->prev_attribute_c;
-		a._attr->next_attribute = attr._attr;
-		attr._attr->prev_attribute_c = a._attr;
-				
-		return a;
-	}
-
-	xml_attribute xml_node::insert_attribute_after(const char_t* name, const xml_attribute& attr)
-	{
-		if ((type() != node_element && type() != node_declaration) || attr.empty()) return xml_attribute();
-		
-		// check that attribute belongs to *this
-		xml_attribute_struct* cur = attr._attr;
-
-		while (cur->prev_attribute_c->next_attribute) cur = cur->prev_attribute_c;
-
-		if (cur != _root->first_attribute) return xml_attribute();
-
-		xml_attribute a(allocate_attribute(get_allocator(_root)));
-		if (!a) return xml_attribute();
-
-		a.set_name(name);
-
-		if (attr._attr->next_attribute)
-			attr._attr->next_attribute->prev_attribute_c = a._attr;
-		else
-			_root->first_attribute->prev_attribute_c = a._attr;
-		
-		a._attr->next_attribute = attr._attr->next_attribute;
-		a._attr->prev_attribute_c = attr._attr;
-		attr._attr->next_attribute = a._attr;
-
-		return a;
-	}
-
-	xml_attribute xml_node::append_copy(const xml_attribute& proto)
-	{
-		if (!proto) return xml_attribute();
-
-		xml_attribute result = append_attribute(proto.name());
-		result.set_value(proto.value());
-
-		return result;
-	}
-
-	xml_attribute xml_node::prepend_copy(const xml_attribute& proto)
-	{
-		if (!proto) return xml_attribute();
-
-		xml_attribute result = prepend_attribute(proto.name());
-		result.set_value(proto.value());
-
-		return result;
-	}
-
-	xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr)
-	{
-		if (!proto) return xml_attribute();
-
-		xml_attribute result = insert_attribute_after(proto.name(), attr);
-		result.set_value(proto.value());
-
-		return result;
-	}
-
-	xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr)
-	{
-		if (!proto) return xml_attribute();
-
-		xml_attribute result = insert_attribute_before(proto.name(), attr);
-		result.set_value(proto.value());
-
-		return result;
-	}
-
-	xml_node xml_node::append_child(xml_node_type type)
-	{
-		if (!allow_insert_child(this->type(), type)) return xml_node();
-		
-		xml_node n(append_node(_root, get_allocator(_root), type));
-
-		if (type == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
-
-		return n;
-	}
-
-	xml_node xml_node::prepend_child(xml_node_type type)
-	{
-		if (!allow_insert_child(this->type(), type)) return xml_node();
-		
-		xml_node n(allocate_node(get_allocator(_root), type));
-		if (!n) return xml_node();
-
-        n._root->parent = _root;
-
-        xml_node_struct* head = _root->first_child;
-
-		if (head)
-        {
-            n._root->prev_sibling_c = head->prev_sibling_c;
-            head->prev_sibling_c = n._root;
-        }
-        else
-            n._root->prev_sibling_c = n._root;
-		
-		n._root->next_sibling = head;
-        _root->first_child = n._root;
-				
-		if (type == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
-
-		return n;
-	}
-
-	xml_node xml_node::insert_child_before(xml_node_type type, const xml_node& node)
-	{
-		if (!allow_insert_child(this->type(), type)) return xml_node();
-		if (!node._root || node._root->parent != _root) return xml_node();
-	
-		xml_node n(allocate_node(get_allocator(_root), type));
-		if (!n) return xml_node();
-
-		n._root->parent = _root;
-		
-		if (node._root->prev_sibling_c->next_sibling)
-			node._root->prev_sibling_c->next_sibling = n._root;
-		else
-			_root->first_child = n._root;
-		
-		n._root->prev_sibling_c = node._root->prev_sibling_c;
-		n._root->next_sibling = node._root;
-		node._root->prev_sibling_c = n._root;
-
-		if (type == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
-
-		return n;
-	}
-
-	xml_node xml_node::insert_child_after(xml_node_type type, const xml_node& node)
-	{
-		if (!allow_insert_child(this->type(), type)) return xml_node();
-		if (!node._root || node._root->parent != _root) return xml_node();
-	
-		xml_node n(allocate_node(get_allocator(_root), type));
-		if (!n) return xml_node();
-
-		n._root->parent = _root;
-	
-		if (node._root->next_sibling)
-			node._root->next_sibling->prev_sibling_c = n._root;
-		else
-			_root->first_child->prev_sibling_c = n._root;
-		
-		n._root->next_sibling = node._root->next_sibling;
-		n._root->prev_sibling_c = node._root;
-		node._root->next_sibling = n._root;
-
-		if (type == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
-
-		return n;
-	}
-
-    xml_node xml_node::append_child(const char_t* name)
-    {
-        xml_node result = append_child(node_element);
-
-        result.set_name(name);
-
-        return result;
-    }
-
-    xml_node xml_node::prepend_child(const char_t* name)
-    {
-        xml_node result = prepend_child(node_element);
-
-        result.set_name(name);
-
-        return result;
-    }
-
-    xml_node xml_node::insert_child_after(const char_t* name, const xml_node& node)
-    {
-        xml_node result = insert_child_after(node_element, node);
-
-        result.set_name(name);
-
-        return result;
-    }
-
-    xml_node xml_node::insert_child_before(const char_t* name, const xml_node& node)
-    {
-        xml_node result = insert_child_before(node_element, node);
-
-        result.set_name(name);
-
-        return result;
-    }
-
-	xml_node xml_node::append_copy(const xml_node& proto)
-	{
-		xml_node result = append_child(proto.type());
-
-		if (result) recursive_copy_skip(result, proto, result);
-
-		return result;
-	}
-
-	xml_node xml_node::prepend_copy(const xml_node& proto)
-	{
-		xml_node result = prepend_child(proto.type());
-
-		if (result) recursive_copy_skip(result, proto, result);
-
-		return result;
-	}
-
-	xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node)
-	{
-		xml_node result = insert_child_after(proto.type(), node);
-
-		if (result) recursive_copy_skip(result, proto, result);
-
-		return result;
-	}
-
-	xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node)
-	{
-		xml_node result = insert_child_before(proto.type(), node);
-
-		if (result) recursive_copy_skip(result, proto, result);
-
-		return result;
-	}
-
-	bool xml_node::remove_attribute(const char_t* name)
-	{
-		return remove_attribute(attribute(name));
-	}
-
-	bool xml_node::remove_attribute(const xml_attribute& a)
-	{
-		if (!_root || !a._attr) return false;
-
-		// check that attribute belongs to *this
-		xml_attribute_struct* attr = a._attr;
-
-		while (attr->prev_attribute_c->next_attribute) attr = attr->prev_attribute_c;
-
-		if (attr != _root->first_attribute) return false;
-
-		if (a._attr->next_attribute) a._attr->next_attribute->prev_attribute_c = a._attr->prev_attribute_c;
-		else if (_root->first_attribute) _root->first_attribute->prev_attribute_c = a._attr->prev_attribute_c;
-		
-		if (a._attr->prev_attribute_c->next_attribute) a._attr->prev_attribute_c->next_attribute = a._attr->next_attribute;
-		else _root->first_attribute = a._attr->next_attribute;
-
-		destroy_attribute(a._attr, get_allocator(_root));
-
-		return true;
-	}
-
-	bool xml_node::remove_child(const char_t* name)
-	{
-		return remove_child(child(name));
-	}
-
-	bool xml_node::remove_child(const xml_node& n)
-	{
-		if (!_root || !n._root || n._root->parent != _root) return false;
-
-		if (n._root->next_sibling) n._root->next_sibling->prev_sibling_c = n._root->prev_sibling_c;
-		else if (_root->first_child) _root->first_child->prev_sibling_c = n._root->prev_sibling_c;
-		
-		if (n._root->prev_sibling_c->next_sibling) n._root->prev_sibling_c->next_sibling = n._root->next_sibling;
-		else _root->first_child = n._root->next_sibling;
-        
-        destroy_node(n._root, get_allocator(_root));
-
-		return true;
-	}
-
-	xml_node xml_node::find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const
-	{
-		if (!_root) return xml_node();
-		
-		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
-			if (i->name && strequal(name, i->name))
-			{
-				for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
-					if (strequal(attr_name, a->name) && strequal(attr_value, a->value))
-						return xml_node(i);
-			}
-
-		return xml_node();
-	}
-
-	xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const
-	{
-		if (!_root) return xml_node();
-		
-		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
-			for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
-				if (strequal(attr_name, a->name) && strequal(attr_value, a->value))
-					return xml_node(i);
-
-		return xml_node();
-	}
-
-#ifndef PUGIXML_NO_STL
-	string_t xml_node::path(char_t delimiter) const
-	{
-		string_t path;
-
-		xml_node cursor = *this; // Make a copy.
-		
-		path = cursor.name();
-
-		while (cursor.parent())
-		{
-			cursor = cursor.parent();
-			
-			string_t temp = cursor.name();
-			temp += delimiter;
-			temp += path;
-			path.swap(temp);
-		}
-
-		return path;
-	}
-#endif
-
-	xml_node xml_node::first_element_by_path(const char_t* path, char_t delimiter) const
-	{
-		xml_node found = *this; // Current search context.
-
-		if (!_root || !path || !path[0]) return found;
-
-		if (path[0] == delimiter)
-		{
-			// Absolute path; e.g. '/foo/bar'
-			found = found.root();
-			++path;
-		}
-
-		const char_t* path_segment = path;
-
-		while (*path_segment == delimiter) ++path_segment;
-
-		const char_t* path_segment_end = path_segment;
-
-		while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
-
-		if (path_segment == path_segment_end) return found;
-
-		const char_t* next_segment = path_segment_end;
-
-		while (*next_segment == delimiter) ++next_segment;
-
-		if (*path_segment == '.' && path_segment + 1 == path_segment_end)
-			return found.first_element_by_path(next_segment, delimiter);
-		else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end)
-			return found.parent().first_element_by_path(next_segment, delimiter);
-		else
-		{
-			for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
-			{
-				if (j->name && strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
-				{
-					xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
-
-					if (subsearch) return subsearch;
-				}
-			}
-
-			return xml_node();
-		}
-	}
-
-	bool xml_node::traverse(xml_tree_walker& walker)
-	{
-		walker._depth = -1;
-		
-		xml_node arg_begin = *this;
-		if (!walker.begin(arg_begin)) return false;
-
-		xml_node cur = first_child();
-				
-		if (cur)
-		{
-			++walker._depth;
-
-			do 
-			{
-				xml_node arg_for_each = cur;
-				if (!walker.for_each(arg_for_each))
-					return false;
-						
-				if (cur.first_child())
-				{
-					++walker._depth;
-					cur = cur.first_child();
-				}
-				else if (cur.next_sibling())
-					cur = cur.next_sibling();
-				else
-				{
-					// Borland C++ workaround
-					while (!cur.next_sibling() && cur != *this && (bool)cur.parent())
-					{
-						--walker._depth;
-						cur = cur.parent();
-					}
-						
-					if (cur != *this)
-						cur = cur.next_sibling();
-				}
-			}
-			while (cur && cur != *this);
-		}
-
-		assert(walker._depth == -1);
-
-		xml_node arg_end = *this;
-		return walker.end(arg_end);
-	}
-
-    size_t xml_node::hash_value() const
-    {
-        return static_cast<size_t>(reinterpret_cast<uintptr_t>(_root) / sizeof(xml_node_struct));
-    }
-
-	xml_node_struct* xml_node::internal_object() const
-	{
-        return _root;
-	}
-
-	void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
-	{
-		if (!_root) return;
-
-		xml_buffered_writer buffered_writer(writer, encoding);
-
-		node_output(buffered_writer, *this, indent, flags, depth);
-	}
-
-#ifndef PUGIXML_NO_STL
-	void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
-	{
-		xml_writer_stream writer(stream);
-
-		print(writer, indent, flags, encoding, depth);
-	}
-
-	void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const
-	{
-		xml_writer_stream writer(stream);
-
-		print(writer, indent, flags, encoding_wchar, depth);
-	}
-#endif
-
-	ptrdiff_t xml_node::offset_debug() const
-	{
-		xml_node_struct* r = root()._root;
-
-		if (!r) return -1;
-
-		const char_t* buffer = static_cast<xml_document_struct*>(r)->buffer;
-
-		if (!buffer) return -1;
-
-		switch (type())
-		{
-		case node_document:
-			return 0;
-
-		case node_element:
-		case node_declaration:
-		case node_pi:
-			return (_root->header & xml_memory_page_name_allocated_mask) ? -1 : _root->name - buffer;
-
-		case node_pcdata:
-		case node_cdata:
-		case node_comment:
-		case node_doctype:
-			return (_root->header & xml_memory_page_value_allocated_mask) ? -1 : _root->value - buffer;
-
-		default:
-			return -1;
-		}
-	}
-
-#ifdef __BORLANDC__
-	bool operator&&(const xml_node& lhs, bool rhs)
-	{
-		return (bool)lhs && rhs;
-	}
-
-	bool operator||(const xml_node& lhs, bool rhs)
-	{
-		return (bool)lhs || rhs;
-	}
-#endif
-
-	xml_node_iterator::xml_node_iterator()
-	{
-	}
-
-	xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent())
-	{
-	}
-
-	xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
-	{
-	}
-
-	bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const
-	{
-		return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
-	}
-	
-	bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const
-	{
-		return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
-	}
-
-	xml_node& xml_node_iterator::operator*()
-	{
-		assert(_wrap._root);
-		return _wrap;
-	}
-
-	xml_node* xml_node_iterator::operator->()
-	{
-		assert(_wrap._root);
-		return &_wrap;
-	}
-
-	const xml_node_iterator& xml_node_iterator::operator++()
-	{
-		assert(_wrap._root);
-		_wrap._root = _wrap._root->next_sibling;
-		return *this;
-	}
-
-	xml_node_iterator xml_node_iterator::operator++(int)
-	{
-		xml_node_iterator temp = *this;
-		++*this;
-		return temp;
-	}
-
-	const xml_node_iterator& xml_node_iterator::operator--()
-	{
-		_wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child();
-		return *this;
-	}
-
-	xml_node_iterator xml_node_iterator::operator--(int)
-	{
-		xml_node_iterator temp = *this;
-		--*this;
-		return temp;
-	}
-
-	xml_attribute_iterator::xml_attribute_iterator()
-	{
-	}
-
-	xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent)
-	{
-	}
-
-	xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
-	{
-	}
-
-	bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const
-	{
-		return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root;
-	}
-	
-	bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const
-	{
-		return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root;
-	}
-
-	xml_attribute& xml_attribute_iterator::operator*()
-	{
-		assert(_wrap._attr);
-		return _wrap;
-	}
-
-	xml_attribute* xml_attribute_iterator::operator->()
-	{
-		assert(_wrap._attr);
-		return &_wrap;
-	}
-
-	const xml_attribute_iterator& xml_attribute_iterator::operator++()
-	{
-		assert(_wrap._attr);
-		_wrap._attr = _wrap._attr->next_attribute;
-		return *this;
-	}
-
-	xml_attribute_iterator xml_attribute_iterator::operator++(int)
-	{
-		xml_attribute_iterator temp = *this;
-		++*this;
-		return temp;
-	}
-
-	const xml_attribute_iterator& xml_attribute_iterator::operator--()
-	{
-		_wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute();
-		return *this;
-	}
-
-	xml_attribute_iterator xml_attribute_iterator::operator--(int)
-	{
-		xml_attribute_iterator temp = *this;
-		--*this;
-		return temp;
-	}
-
-    xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto)
-    {
-    }
-
-    xml_parse_result::operator bool() const
-    {
-        return status == status_ok;
-    }
-
-	const char* xml_parse_result::description() const
-	{
-		switch (status)
-		{
-		case status_ok: return "No error";
-
-		case status_file_not_found: return "File was not found";
-		case status_io_error: return "Error reading from file/stream";
-		case status_out_of_memory: return "Could not allocate memory";
-		case status_internal_error: return "Internal error occurred";
-
-		case status_unrecognized_tag: return "Could not determine tag type";
-
-		case status_bad_pi: return "Error parsing document declaration/processing instruction";
-		case status_bad_comment: return "Error parsing comment";
-		case status_bad_cdata: return "Error parsing CDATA section";
-		case status_bad_doctype: return "Error parsing document type declaration";
-		case status_bad_pcdata: return "Error parsing PCDATA section";
-		case status_bad_start_element: return "Error parsing start element tag";
-		case status_bad_attribute: return "Error parsing element attribute";
-		case status_bad_end_element: return "Error parsing end element tag";
-		case status_end_element_mismatch: return "Start-end tags mismatch";
-
-		default: return "Unknown error";
-		}
-	}
-
-	xml_document::xml_document(): _buffer(0)
-	{
-		create();
-	}
-
-	xml_document::~xml_document()
-	{
-		destroy();
-	}
-
-	void xml_document::reset()
-	{
-		destroy();
-		create();
-	}
-
-    void xml_document::reset(const xml_document& proto)
-    {
-        reset();
-
-        for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling())
-            append_copy(cur);
-    }
-
-	void xml_document::create()
-	{
-		// initialize sentinel page
-		STATIC_ASSERT(offsetof(xml_memory_page, data) + sizeof(xml_document_struct) + xml_memory_page_alignment <= sizeof(_memory));
-
-		// align upwards to page boundary
-		void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (xml_memory_page_alignment - 1)) & ~(xml_memory_page_alignment - 1));
-
-		// prepare page structure
-		xml_memory_page* page = xml_memory_page::construct(page_memory);
-
-		page->busy_size = xml_memory_page_size;
-
-		// allocate new root
-		_root = new (page->data) xml_document_struct(page);
-		_root->prev_sibling_c = _root;
-
-		// setup sentinel page
-		page->allocator = static_cast<xml_document_struct*>(_root);
-	}
-
-	void xml_document::destroy()
-	{
-		// destroy static storage
-		if (_buffer)
-		{
-			global_deallocate(_buffer);
-			_buffer = 0;
-		}
-
-		// destroy dynamic storage, leave sentinel page (it's in static memory)
-		if (_root)
-		{
-			xml_memory_page* root_page = reinterpret_cast<xml_memory_page*>(_root->header & xml_memory_page_pointer_mask);
-			assert(root_page && !root_page->prev && !root_page->memory);
-
-			// destroy all pages
-			for (xml_memory_page* page = root_page->next; page; )
-			{
-				xml_memory_page* next = page->next;
-
-				xml_allocator::deallocate_page(page);
-
-				page = next;
-			}
-
-			// cleanup root page
-			root_page->allocator = 0;
-			root_page->next = 0;
-			root_page->busy_size = root_page->freed_size = 0;
-
-			_root = 0;
-		}
-	}
-
-#ifndef PUGIXML_NO_STL
-	xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
-	{
-		reset();
-
-		return load_stream_impl(*this, stream, options, encoding);
-	}
-
-	xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
-	{
-		reset();
-
-		return load_stream_impl(*this, stream, options, encoding_wchar);
-	}
-#endif
-
-	xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
-	{
-		// Force native encoding (skip autodetection)
-	#ifdef PUGIXML_WCHAR_MODE
-		xml_encoding encoding = encoding_wchar;
-	#else
-		xml_encoding encoding = encoding_utf8;
-	#endif
-
-		return load_buffer(contents, strlength(contents) * sizeof(char_t), options, encoding);
-	}
-
-	xml_parse_result xml_document::load_file(const char* path, unsigned int options, xml_encoding encoding)
-	{
-		reset();
-
-		FILE* file = fopen(path, "rb");
-
-		return load_file_impl(*this, file, options, encoding);
-	}
-
-	xml_parse_result xml_document::load_file(const wchar_t* path, unsigned int options, xml_encoding encoding)
-	{
-		reset();
-
-		FILE* file = open_file_wide(path, L"rb");
-
-		return load_file_impl(*this, file, options, encoding);
-	}
-
-	xml_parse_result xml_document::load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own)
-	{
-		reset();
-
-		// check input buffer
-		assert(contents || size == 0);
-
-		// get actual encoding
-		xml_encoding buffer_encoding = get_buffer_encoding(encoding, contents, size);
-
-		// get private buffer
-		char_t* buffer = 0;
-		size_t length = 0;
-
-		if (!convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return make_parse_result(status_out_of_memory);
-		
-		// delete original buffer if we performed a conversion
-		if (own && buffer != contents && contents) global_deallocate(contents);
-
-		// parse
-		xml_parse_result res = xml_parser::parse(buffer, length, _root, options);
-
-		// remember encoding
-		res.encoding = buffer_encoding;
-
-		// grab onto buffer if it's our buffer, user is responsible for deallocating contens himself
-		if (own || buffer != contents) _buffer = buffer;
-
-		return res;
-	}
-
-	xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
-	{
-		return load_buffer_impl(const_cast<void*>(contents), size, options, encoding, false, false);
-	}
-
-	xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
-	{
-		return load_buffer_impl(contents, size, options, encoding, true, false);
-	}
-		
-	xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
-	{
-		return load_buffer_impl(contents, size, options, encoding, true, true);
-	}
-
-	void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
-	{
-		if (flags & format_write_bom) write_bom(writer, get_write_encoding(encoding));
-
-		xml_buffered_writer buffered_writer(writer, encoding);
-
-		if (!(flags & format_no_declaration) && !has_declaration(*this))
-		{
-			buffered_writer.write(PUGIXML_TEXT("<?xml version=\"1.0\"?>"));
-			if (!(flags & format_raw)) buffered_writer.write('\n');
-		}
-
-		node_output(buffered_writer, *this, indent, flags, 0);
-	}
-
-#ifndef PUGIXML_NO_STL
-	void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
-	{
-		xml_writer_stream writer(stream);
-
-		save(writer, indent, flags, encoding);
-	}
-
-	void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
-	{
-		xml_writer_stream writer(stream);
-
-		save(writer, indent, flags, encoding_wchar);
-	}
-#endif
-
-	bool xml_document::save_file(const char* path, const char_t* indent, unsigned int flags, xml_encoding encoding) const
-	{
-		FILE* file = fopen(path, "wb");
-		if (!file) return false;
-
-		xml_writer_file writer(file);
-		save(writer, indent, flags, encoding);
-
-		fclose(file);
-
-		return true;
-	}
-
-	bool xml_document::save_file(const wchar_t* path, const char_t* indent, unsigned int flags, xml_encoding encoding) const
-	{
-		FILE* file = open_file_wide(path, L"wb");
-		if (!file) return false;
-
-		xml_writer_file writer(file);
-		save(writer, indent, flags, encoding);
-
-		fclose(file);
-
-		return true;
-	}
-
-    xml_node xml_document::document_element() const
-    {
-		for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
-			if ((i->header & xml_memory_page_type_mask) + 1 == node_element)
-                return xml_node(i);
-
-        return xml_node();
-    }
-
-#ifndef PUGIXML_NO_STL
-	std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
-	{
-		assert(str);
-
-        return as_utf8_impl(str, wcslen(str));
-	}
-
-	std::string PUGIXML_FUNCTION as_utf8(const std::wstring& str)
-	{
-        return as_utf8_impl(str.c_str(), str.size());
-	}
-	
-	std::wstring PUGIXML_FUNCTION as_wide(const char* str)
-	{
-		assert(str);
-
-        return as_wide_impl(str, strlen(str));
-	}
-	
-	std::wstring PUGIXML_FUNCTION as_wide(const std::string& str)
-	{
-        return as_wide_impl(str.c_str(), str.size());
-	}
-#endif
-
-    void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
-    {
-    	global_allocate = allocate;
-    	global_deallocate = deallocate;
-    }
-
-    allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
-    {
-    	return global_allocate;
-    }
-
-    deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
-    {
-    	return global_deallocate;
-    }
-}
-
-#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
-namespace std
-{
-	// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
-	std::bidirectional_iterator_tag _Iter_cat(const xml_node_iterator&)
-	{
-		return std::bidirectional_iterator_tag();
-	}
-
-	std::bidirectional_iterator_tag _Iter_cat(const xml_attribute_iterator&)
-	{
-		return std::bidirectional_iterator_tag();
-	}
-}
-#endif
-
-#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
-namespace std
-{
-	// Workarounds for (non-standard) iterator category detection
-	std::bidirectional_iterator_tag __iterator_category(const xml_node_iterator&)
-	{
-		return std::bidirectional_iterator_tag();
-	}
-
-	std::bidirectional_iterator_tag __iterator_category(const xml_attribute_iterator&)
-	{
-		return std::bidirectional_iterator_tag();
-	}
-}
-#endif
-
-#ifndef PUGIXML_NO_XPATH
-
-// STL replacements
-namespace
-{
-	struct equal_to
-	{
-		template <typename T> bool operator()(const T& lhs, const T& rhs) const
-		{
-			return lhs == rhs;
-		}
-	};
-
-	struct not_equal_to
-	{
-		template <typename T> bool operator()(const T& lhs, const T& rhs) const
-		{
-			return lhs != rhs;
-		}
-	};
-
-	struct less
-	{
-		template <typename T> bool operator()(const T& lhs, const T& rhs) const
-		{
-			return lhs < rhs;
-		}
-	};
-
-	struct less_equal
-	{
-		template <typename T> bool operator()(const T& lhs, const T& rhs) const
-		{
-			return lhs <= rhs;
-		}
-	};
-
-	template <typename T> void swap(T& lhs, T& rhs)
-	{
-		T temp = lhs;
-		lhs = rhs;
-		rhs = temp;
-	}
-
-	template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
-	{
-		I result = begin;
-
-		for (I it = begin + 1; it != end; ++it)
-			if (pred(*it, *result))
-				result = it;
-
-		return result;
-	}
-
-	template <typename I> void reverse(I begin, I end)
-	{
-		while (begin + 1 < end) swap(*begin++, *--end);
-	}
-
-	template <typename I> I unique(I begin, I end)
-	{
-		// fast skip head
-		while (begin + 1 < end && *begin != *(begin + 1)) begin++;
-
-		if (begin == end) return begin;
-
-		// last written element
-		I write = begin++; 
-
-		// merge unique elements
-		while (begin != end)
-		{
-			if (*begin != *write)
-				*++write = *begin++;
-			else
-				begin++;
-		}
-
-		// past-the-end (write points to live element)
-		return write + 1;
-	}
-
-	template <typename I> void copy_backwards(I begin, I end, I target)
-	{
-		while (begin != end) *--target = *--end;
-	}
-
-	template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
-	{
-		assert(begin != end);
-
-		for (I it = begin + 1; it != end; ++it)
-		{
-			T val = *it;
-
-			if (pred(val, *begin))
-			{
-				// move to front
-				copy_backwards(begin, it, it + 1);
-				*begin = val;
-			}
-			else
-			{
-				I hole = it;
-
-				// move hole backwards
-				while (pred(val, *(hole - 1)))
-				{
-					*hole = *(hole - 1);
-					hole--;
-				}
-
-				// fill hole with element
-				*hole = val;
-			}
-		}
-	}
-
-	// std variant for elements with ==
-	template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
-	{
-		I eqbeg = middle, eqend = middle + 1;
-
-		// expand equal range
-		while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
-		while (eqend != end && *eqend == *eqbeg) ++eqend;
-
-		// process outer elements
-		I ltend = eqbeg, gtbeg = eqend;
-
-		for (;;)
-		{
-			// find the element from the right side that belongs to the left one
-			for (; gtbeg != end; ++gtbeg)
-				if (!pred(*eqbeg, *gtbeg))
-				{
-					if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
-					else break;
-				}
-
-			// find the element from the left side that belongs to the right one
-			for (; ltend != begin; --ltend)
-				if (!pred(*(ltend - 1), *eqbeg))
-				{
-					if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
-					else break;
-				}
-
-			// scanned all elements
-			if (gtbeg == end && ltend == begin)
-			{
-				*out_eqbeg = eqbeg;
-				*out_eqend = eqend;
-				return;
-			}
-
-			// make room for elements by moving equal area
-			if (gtbeg == end)
-			{
-				if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
-				swap(*eqbeg, *--eqend);
-			}
-			else if (ltend == begin)
-			{
-				if (eqend != gtbeg) swap(*eqbeg, *eqend);
-				++eqend;
-				swap(*gtbeg++, *eqbeg++);
-			}
-			else swap(*gtbeg++, *--ltend);
-		}
-	}
-
-	template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
-	{
-		if (pred(*middle, *first)) swap(*middle, *first);
-		if (pred(*last, *middle)) swap(*last, *middle);
-		if (pred(*middle, *first)) swap(*middle, *first);
-	}
-
-	template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
-	{
-		if (last - first <= 40)
-		{
-			// median of three for small chunks
-			median3(first, middle, last, pred);
-		}
-		else
-		{
-			// median of nine
-			size_t step = (last - first + 1) / 8;
-
-			median3(first, first + step, first + 2 * step, pred);
-			median3(middle - step, middle, middle + step, pred);
-			median3(last - 2 * step, last - step, last, pred);
-			median3(first + step, middle, last - step, pred);
-		}
-	}
-
-	template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
-	{
-		// sort large chunks
-		while (end - begin > 32)
-		{
-			// find median element
-			I middle = begin + (end - begin) / 2;
-			median(begin, middle, end - 1, pred);
-
-			// partition in three chunks (< = >)
-			I eqbeg, eqend;
-			partition(begin, middle, end, pred, &eqbeg, &eqend);
-
-			// loop on larger half
-			if (eqbeg - begin > end - eqend)
-			{
-				sort(eqend, end, pred);
-				end = eqbeg;
-			}
-			else
-			{
-				sort(begin, eqbeg, pred);
-				begin = eqend;
-			}
-		}
-
-		// insertion sort small chunk
-		if (begin != end) insertion_sort(begin, end, pred, &*begin);
-	}
-}
-
-// Allocator used for AST and evaluation stacks
-namespace
-{
-	struct xpath_memory_block
-	{	
-		xpath_memory_block* next;
-
-		char data[4096];
-	};
-		
-	class xpath_allocator
-	{
-		xpath_memory_block* _root;
-		size_t _root_size;
-
-	public:
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		jmp_buf* error_handler;
-	#endif
-
-		xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
-		{
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			error_handler = 0;
-		#endif
-		}
-		
-		void* allocate_nothrow(size_t size)
-		{
-			const size_t block_capacity = sizeof(_root->data);
-
-			// align size so that we're able to store pointers in subsequent blocks
-			size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-
-			if (_root_size + size <= block_capacity)
-			{
-				void* buf = _root->data + _root_size;
-				_root_size += size;
-				return buf;
-			}
-			else
-			{
-				size_t block_data_size = (size > block_capacity) ? size : block_capacity;
-				size_t block_size = block_data_size + offsetof(xpath_memory_block, data);
-
-				xpath_memory_block* block = static_cast<xpath_memory_block*>(global_allocate(block_size));
-				if (!block) return 0;
-				
-				block->next = _root;
-				
-				_root = block;
-				_root_size = size;
-				
-				return block->data;
-			}
-		}
-
-		void* allocate(size_t size)
-		{
-			void* result = allocate_nothrow(size);
-
-			if (!result)
-			{
-			#ifdef PUGIXML_NO_EXCEPTIONS
-				assert(error_handler);
-				longjmp(*error_handler, 1);
-			#else
-				throw std::bad_alloc();
-			#endif
-			}
-
-			return result;
-		}
-
-		void* reallocate(void* ptr, size_t old_size, size_t new_size)
-		{
-			// align size so that we're able to store pointers in subsequent blocks
-			old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-			new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-
-			// we can only reallocate the last object
-			assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
-
-			// adjust root size so that we have not allocated the object at all
-			bool only_object = (_root_size == old_size);
-
-			if (ptr) _root_size -= old_size;
-
-			// allocate a new version (this will obviously reuse the memory if possible)
-			void* result = allocate(new_size);
-			assert(result);
-
-			// we have a new block
-			if (result != ptr && ptr)
-			{
-				// copy old data
-				assert(new_size > old_size);
-				memcpy(result, ptr, old_size);
-
-				// free the previous page if it had no other objects
-				if (only_object)
-				{
-					assert(_root->data == result);
-					assert(_root->next);
-
-					xpath_memory_block* next = _root->next->next;
-
-					if (next)
-					{
-						// deallocate the whole page, unless it was the first one
-						global_deallocate(_root->next);
-						_root->next = next;
-					}
-				}
-			}
-
-			return result;
-		}
-
-		void revert(const xpath_allocator& state)
-		{
-			// free all new pages
-			xpath_memory_block* cur = _root;
-
-			while (cur != state._root)
-			{
-				xpath_memory_block* next = cur->next;
-
-				global_deallocate(cur);
-
-				cur = next;
-			}
-
-			// restore state
-			_root = state._root;
-			_root_size = state._root_size;
-		}
-
-		void release()
-		{
-			xpath_memory_block* cur = _root;
-			assert(cur);
-
-			while (cur->next)
-			{
-				xpath_memory_block* next = cur->next;
-
-				global_deallocate(cur);
-
-				cur = next;
-			}
-		}
-	};
-
-	struct xpath_allocator_capture
-	{
-		xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
-		{
-		}
-
-		~xpath_allocator_capture()
-		{
-			_target->revert(_state);
-		}
-
-		xpath_allocator* _target;
-		xpath_allocator _state;
-	};
-
-	struct xpath_stack
-	{
-		xpath_allocator* result;
-		xpath_allocator* temp;
-	};
-
-	struct xpath_stack_data
-	{
-		xpath_memory_block blocks[2];
-		xpath_allocator result;
-		xpath_allocator temp;
-		xpath_stack stack;
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		jmp_buf error_handler;
-	#endif
-
-		xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
-		{
-			blocks[0].next = blocks[1].next = 0;
-
-			stack.result = &result;
-			stack.temp = &temp;
-
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			result.error_handler = temp.error_handler = &error_handler;
-		#endif
-		}
-
-		~xpath_stack_data()
-		{
-			result.release();
-			temp.release();
-		}
-	};
-}
-
-// String class
-namespace
-{
-	class xpath_string
-	{
-		const char_t* _buffer;
-		bool _uses_heap;
-
-		static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
-		{
-			char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
-			assert(result);
-
-			memcpy(result, string, length * sizeof(char_t));
-			result[length] = 0;
-
-			return result;
-		}
-
-		static char_t* duplicate_string(const char_t* string, xpath_allocator* alloc)
-		{
-			return duplicate_string(string, strlength(string), alloc);
-		}
-
-	public:
-		xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false)
-		{
-		}
-
-		explicit xpath_string(const char_t* str, xpath_allocator* alloc)
-		{
-			bool empty = (*str == 0);
-
-			_buffer = empty ? PUGIXML_TEXT("") : duplicate_string(str, alloc);
-			_uses_heap = !empty;
-		}
-
-		explicit xpath_string(const char_t* str, bool use_heap): _buffer(str), _uses_heap(use_heap)
-		{
-		}
-
-		xpath_string(const char_t* begin, const char_t* end, xpath_allocator* alloc)
-		{
-			assert(begin <= end);
-
-			bool empty = (begin == end);
-
-			_buffer = empty ? PUGIXML_TEXT("") : duplicate_string(begin, static_cast<size_t>(end - begin), alloc);
-			_uses_heap = !empty;
-		}
-
-		void append(const xpath_string& o, xpath_allocator* alloc)
-		{
-			// skip empty sources
-			if (!*o._buffer) return;
-
-			// fast append for constant empty target and constant source
-			if (!*_buffer && !_uses_heap && !o._uses_heap)
-			{
-				_buffer = o._buffer;
-			}
-			else
-			{
-				// need to make heap copy
-				size_t target_length = strlength(_buffer);
-				size_t source_length = strlength(o._buffer);
-				size_t length = target_length + source_length;
-
-				// allocate new buffer
-				char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (length + 1) * sizeof(char_t)));
-				assert(result);
-
-				// append first string to the new buffer in case there was no reallocation
-				if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
-
-				// append second string to the new buffer
-				memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
-				result[length] = 0;
-
-				// finalize
-				_buffer = result;
-				_uses_heap = true;
-			}
-		}
-
-		const char_t* c_str() const
-		{
-			return _buffer;
-		}
-
-		size_t length() const
-		{
-			return strlength(_buffer);
-		}
-		
-		char_t* data(xpath_allocator* alloc)
-		{
-			// make private heap copy
-			if (!_uses_heap)
-			{
-				_buffer = duplicate_string(_buffer, alloc);
-				_uses_heap = true;
-			}
-
-			return const_cast<char_t*>(_buffer);
-		}
-
-		bool empty() const
-		{
-			return *_buffer == 0;
-		}
-
-		bool operator==(const xpath_string& o) const
-		{
-			return strequal(_buffer, o._buffer);
-		}
-
-		bool operator!=(const xpath_string& o) const
-		{
-			return !strequal(_buffer, o._buffer);
-		}
-
-		bool uses_heap() const
-		{
-			return _uses_heap;
-		}
-	};
-
-	xpath_string xpath_string_const(const char_t* str)
-	{
-		return xpath_string(str, false);
-	}
-}
-
-namespace
-{
-	bool starts_with(const char_t* string, const char_t* pattern)
-	{
-		while (*pattern && *string == *pattern)
-		{
-			string++;
-			pattern++;
-		}
-
-		return *pattern == 0;
-	}
-
-	const char_t* find_char(const char_t* s, char_t c)
-	{
-	#ifdef PUGIXML_WCHAR_MODE
-		return wcschr(s, c);
-	#else
-		return strchr(s, c);
-	#endif
-	}
-
-	const char_t* find_substring(const char_t* s, const char_t* p)
-	{
-	#ifdef PUGIXML_WCHAR_MODE
-		// MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
-		return (*p == 0) ? s : wcsstr(s, p);
-	#else
-		return strstr(s, p);
-	#endif
-	}
-
-	// Converts symbol to lower case, if it is an ASCII one
-	char_t tolower_ascii(char_t ch)
-	{
-		return static_cast<unsigned int>(ch - 'A') < 26 ? static_cast<char_t>(ch | ' ') : ch;
-	}
-
-	xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
-	{
-		if (na.attribute())
-			return xpath_string_const(na.attribute().value());
-		else
-		{
-			const xml_node& n = na.node();
-
-			switch (n.type())
-			{
-			case node_pcdata:
-			case node_cdata:
-			case node_comment:
-			case node_pi:
-				return xpath_string_const(n.value());
-			
-			case node_document:
-			case node_element:
-			{
-				xpath_string result;
-
-				xml_node cur = n.first_child();
-				
-				while (cur && cur != n)
-				{
-					if (cur.type() == node_pcdata || cur.type() == node_cdata)
-						result.append(xpath_string_const(cur.value()), alloc);
-
-					if (cur.first_child())
-						cur = cur.first_child();
-					else if (cur.next_sibling())
-						cur = cur.next_sibling();
-					else
-					{
-						while (!cur.next_sibling() && cur != n)
-							cur = cur.parent();
-
-						if (cur != n) cur = cur.next_sibling();
-					}
-				}
-				
-				return result;
-			}
-			
-			default:
-				return xpath_string();
-			}
-		}
-	}
-	
-	unsigned int node_height(xml_node n)
-	{
-	    unsigned int result = 0;
-	    
-	    while (n)
-	    {
-	        ++result;
-	        n = n.parent();
-	    }
-	    
-	    return result;
-	}
-	
-	bool node_is_before(xml_node ln, unsigned int lh, xml_node rn, unsigned int rh)
-	{
-		// normalize heights
-		for (unsigned int i = rh; i < lh; i++) ln = ln.parent();
-		for (unsigned int j = lh; j < rh; j++) rn = rn.parent();
-	    
-		// one node is the ancestor of the other
-	    if (ln == rn) return lh < rh;
-	    
-		// find common ancestor
-	    while (ln.parent() != rn.parent())
-	    {
-	        ln = ln.parent();
-	        rn = rn.parent();
-	    }
-
-		// there is no common ancestor (the shared parent is null), nodes are from different documents
-		if (!ln.parent()) return ln < rn;
-
-		// determine sibling order
-        for (; ln; ln = ln.next_sibling())
-            if (ln == rn)
-                return true;
-                
-        return false;
-    }
-
-    bool node_is_ancestor(xml_node parent, xml_node node)
-    {
-    	while (node && node != parent) node = node.parent();
-
-    	return parent && node == parent;
-    }
-
-    const void* document_order(const xpath_node& xnode)
-    {
-        xml_node_struct* node = xnode.node().internal_object();
-
-        if (node)
-        {
-            if (node->name && (node->header & xml_memory_page_name_allocated_mask) == 0) return node->name;
-            if (node->value && (node->header & xml_memory_page_value_allocated_mask) == 0) return node->value;
-            return 0;
-        }
-
-        xml_attribute_struct* attr = xnode.attribute().internal_object();
-
-        if (attr)
-        {
-            if ((attr->header & xml_memory_page_name_allocated_mask) == 0) return attr->name;
-            if ((attr->header & xml_memory_page_value_allocated_mask) == 0) return attr->value;
-            return 0;
-        }
-
-		return 0;
-    }
-    
-	struct document_order_comparator
-	{
-		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
-		{
-			// optimized document order based check
-			const void* lo = document_order(lhs);
-			const void* ro = document_order(rhs);
-
-			if (lo && ro) return lo < ro;
-
-            // slow comparison
-			xml_node ln = lhs.node(), rn = rhs.node();
-
-			// compare attributes
-			if (lhs.attribute() && rhs.attribute())
-			{
-				// shared parent
-				if (lhs.parent() == rhs.parent())
-				{
-					// determine sibling order
-				    for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
-				        if (a == rhs.attribute())
-				            return true;
-				    
-				    return false;
-				}
-				
-				// compare attribute parents
-				ln = lhs.parent();
-				rn = rhs.parent();
-			}
-			else if (lhs.attribute())
-			{
-				// attributes go after the parent element
-				if (lhs.parent() == rhs.node()) return false;
-				
-				ln = lhs.parent();
-			}
-			else if (rhs.attribute())
-			{
-				// attributes go after the parent element
-				if (rhs.parent() == lhs.node()) return true;
-				
-				rn = rhs.parent();
-			}
-
-			if (ln == rn) return false;
-			
-			unsigned int lh = node_height(ln);
-			unsigned int rh = node_height(rn);
-			
-			return node_is_before(ln, lh, rn, rh);
-		}
-	};
-
-	struct duplicate_comparator
-	{
-		bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
-		{
-			if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
-			else return rhs.attribute() ? false : lhs.node() < rhs.node();
-		}
-	};
-	
-	double gen_nan()
-	{
-	#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
-		union { float f; int32_t i; } u[sizeof(float) == sizeof(int32_t) ? 1 : -1];
-		u[0].i = 0x7fc00000;
-		return u[0].f;
-	#else
-		// fallback
-		const volatile double zero = 0.0;
-		return zero / zero;
-	#endif
-	}
-	
-	bool is_nan(double value)
-	{
-	#if defined(_MSC_VER) || defined(__BORLANDC__)
-		return !!_isnan(value);
-	#elif defined(fpclassify) && defined(FP_NAN)
-		return fpclassify(value) == FP_NAN;
-	#else
-		// fallback
-		const volatile double v = value;
-		return v != v;
-	#endif
-	}
-	
-	const char_t* convert_number_to_string_special(double value)
-	{
-	#if defined(_MSC_VER) || defined(__BORLANDC__)
-		if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
-		if (_isnan(value)) return PUGIXML_TEXT("NaN");
-		return PUGIXML_TEXT("-Infinity") + (value > 0);
-	#elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
-		switch (fpclassify(value))
-		{
-		case FP_NAN:
-			return PUGIXML_TEXT("NaN");
-
-		case FP_INFINITE:
-			return PUGIXML_TEXT("-Infinity") + (value > 0);
-
-		case FP_ZERO:
-			return PUGIXML_TEXT("0");
-
-		default:
-			return 0;
-		}
-	#else
-		// fallback
-		const volatile double v = value;
-
-		if (v == 0) return PUGIXML_TEXT("0");
-		if (v != v) return PUGIXML_TEXT("NaN");
-		if (v * 2 == v) return PUGIXML_TEXT("-Infinity") + (value > 0);
-		return 0;
-	#endif
-	}
-	
-	bool convert_number_to_boolean(double value)
-	{
-		return (value != 0 && !is_nan(value));
-	}
-	
-	void truncate_zeros(char* begin, char* end)
-	{
-		while (begin != end && end[-1] == '0') end--;
-
-		*end = 0;
-	}
-
-	// gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-	void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
-	{
-		// get base values
-		int sign, exponent;
-		_ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
-
-		// truncate redundant zeros
-		truncate_zeros(buffer, buffer + strlen(buffer));
-
-		// fill results
-		*out_mantissa = buffer;
-		*out_exponent = exponent;
-	}
-#else
-	void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
-	{
-		// get a scientific notation value with IEEE DBL_DIG decimals
-		sprintf(buffer, "%.*e", DBL_DIG, value);
-		assert(strlen(buffer) < buffer_size);
-		(void)!buffer_size;
-
-		// get the exponent (possibly negative)
-		char* exponent_string = strchr(buffer, 'e');
-		assert(exponent_string);
-
-		int exponent = atoi(exponent_string + 1);
-
-		// extract mantissa string: skip sign
-		char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
-		assert(mantissa[0] != '0' && mantissa[1] == '.');
-
-		// divide mantissa by 10 to eliminate integer part
-		mantissa[1] = mantissa[0];
-		mantissa++;
-		exponent++;
-
-		// remove extra mantissa digits and zero-terminate mantissa
-		truncate_zeros(mantissa, exponent_string);
-
-		// fill results
-		*out_mantissa = mantissa;
-		*out_exponent = exponent;
-	}
-#endif
-
-	xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
-	{
-		// try special number conversion
-		const char_t* special = convert_number_to_string_special(value);
-		if (special) return xpath_string_const(special);
-
-		// get mantissa + exponent form
-		char mantissa_buffer[64];
-
-		char* mantissa;
-		int exponent;
-		convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
-
-		// make the number!
-		char_t result[512];
-		char_t* s = result;
-
-		// sign
-		if (value < 0) *s++ = '-';
-
-		// integer part
-		if (exponent <= 0)
-		{
-			*s++ = '0';
-		}
-		else
-		{
-			while (exponent > 0)
-			{
-				assert(*mantissa == 0 || (unsigned)(*mantissa - '0') <= 9);
-				*s++ = *mantissa ? *mantissa++ : '0';
-				exponent--;
-			}
-		}
-
-		// fractional part
-		if (*mantissa)
-		{
-			// decimal point
-			*s++ = '.';
-
-			// extra zeroes from negative exponent
-			while (exponent < 0)
-			{
-				*s++ = '0';
-				exponent++;
-			}
-
-			// extra mantissa digits
-			while (*mantissa)
-			{
-				assert((unsigned)(*mantissa - '0') <= 9);
-				*s++ = *mantissa++;
-			}
-		}
-
-		// zero-terminate
-		assert(s < result + sizeof(result) / sizeof(result[0]));
-		*s = 0;
-
-		return xpath_string(result, alloc);
-	}
-	
-	bool check_string_to_number_format(const char_t* string)
-	{
-		// parse leading whitespace
-		while (IS_CHARTYPE(*string, ct_space)) ++string;
-
-		// parse sign
-		if (*string == '-') ++string;
-
-		if (!*string) return false;
-
-		// if there is no integer part, there should be a decimal part with at least one digit
-		if (!IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !IS_CHARTYPEX(string[1], ctx_digit))) return false;
-
-		// parse integer part
-		while (IS_CHARTYPEX(*string, ctx_digit)) ++string;
-
-		// parse decimal part
-		if (*string == '.')
-		{
-			++string;
-
-			while (IS_CHARTYPEX(*string, ctx_digit)) ++string;
-		}
-
-		// parse trailing whitespace
-		while (IS_CHARTYPE(*string, ct_space)) ++string;
-
-		return *string == 0;
-	}
-
-	double convert_string_to_number(const char_t* string)
-	{
-		// check string format
-		if (!check_string_to_number_format(string)) return gen_nan();
-
-		// parse string
-	#ifdef PUGIXML_WCHAR_MODE
-		return wcstod(string, 0);
-	#else
-		return atof(string);
-	#endif
-	}
-
-	bool convert_string_to_number(const char_t* begin, const char_t* end, double* out_result)
-	{
-		char_t buffer[32];
-
-		size_t length = static_cast<size_t>(end - begin);
-		char_t* scratch = buffer;
-
-		if (length >= sizeof(buffer) / sizeof(buffer[0]))
-		{
-			// need to make dummy on-heap copy
-			scratch = static_cast<char_t*>(global_allocate((length + 1) * sizeof(char_t)));
-			if (!scratch) return false;
-		}
-
-		// copy string to zero-terminated buffer and perform conversion
-		memcpy(scratch, begin, length * sizeof(char_t));
-		scratch[length] = 0;
-
-		*out_result = convert_string_to_number(scratch);
-
-		// free dummy buffer
-		if (scratch != buffer) global_deallocate(scratch);
-
-		return true;
-	}
-	
-	double round_nearest(double value)
-	{
-		return floor(value + 0.5);
-	}
-
-	double round_nearest_nzero(double value)
-	{
-		// same as round_nearest, but returns -0 for [-0.5, -0]
-		// ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0)
-		return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5);
-	}
-	
-	const char_t* qualified_name(const xpath_node& node)
-	{
-		return node.attribute() ? node.attribute().name() : node.node().name();
-	}
-	
-	const char_t* local_name(const xpath_node& node)
-	{
-		const char_t* name = qualified_name(node);
-		const char_t* p = find_char(name, ':');
-		
-		return p ? p + 1 : name;
-	}
-
-	struct namespace_uri_predicate
-	{
-		const char_t* prefix;
-		size_t prefix_length;
-
-		namespace_uri_predicate(const char_t* name)
-		{
-			const char_t* pos = find_char(name, ':');
-
-			prefix = pos ? name : 0;
-			prefix_length = pos ? static_cast<size_t>(pos - name) : 0;
-		}
-
-		bool operator()(const xml_attribute& a) const
-		{
-			const char_t* name = a.name();
-
-			if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false;
-
-			return prefix ? name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0;
-		}
-	};
-
-	const char_t* namespace_uri(const xml_node& node)
-	{
-		namespace_uri_predicate pred = node.name();
-		
-		xml_node p = node;
-		
-		while (p)
-		{
-			xml_attribute a = p.find_attribute(pred);
-			
-			if (a) return a.value();
-			
-			p = p.parent();
-		}
-		
-		return PUGIXML_TEXT("");
-	}
-
-	const char_t* namespace_uri(const xml_attribute& attr, const xml_node& parent)
-	{
-		namespace_uri_predicate pred = attr.name();
-		
-		// Default namespace does not apply to attributes
-		if (!pred.prefix) return PUGIXML_TEXT("");
-		
-		xml_node p = parent;
-		
-		while (p)
-		{
-			xml_attribute a = p.find_attribute(pred);
-			
-			if (a) return a.value();
-			
-			p = p.parent();
-		}
-		
-		return PUGIXML_TEXT("");
-	}
-
-	const char_t* namespace_uri(const xpath_node& node)
-	{
-		return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node());
-	}
-
-	void normalize_space(char_t* buffer)
-	{
-		char_t* write = buffer;
-
-		for (char_t* it = buffer; *it; )
-		{
-			char_t ch = *it++;
-
-			if (IS_CHARTYPE(ch, ct_space))
-			{
-				// replace whitespace sequence with single space
-				while (IS_CHARTYPE(*it, ct_space)) it++;
-
-				// avoid leading spaces
-				if (write != buffer) *write++ = ' ';
-			}
-			else *write++ = ch;
-		}
-
-		// remove trailing space
-		if (write != buffer && IS_CHARTYPE(write[-1], ct_space)) write--;
-
-		// zero-terminate
-		*write = 0;
-	}
-
-	void translate(char_t* buffer, const char_t* from, const char_t* to)
-	{
-		size_t to_length = strlength(to);
-
-		char_t* write = buffer;
-
-		while (*buffer)
-		{
-			DMC_VOLATILE char_t ch = *buffer++;
-
-			const char_t* pos = find_char(from, ch);
-
-			if (!pos)
-				*write++ = ch; // do not process
-			else if (static_cast<size_t>(pos - from) < to_length)
-				*write++ = to[pos - from]; // replace
-		}
-
-		// zero-terminate
-		*write = 0;
-	}
-
-	struct xpath_variable_boolean: xpath_variable
-	{
-		xpath_variable_boolean(): value(false)
-		{
-		}
-
-		bool value;
-		char_t name[1];
-	};
-
-	struct xpath_variable_number: xpath_variable
-	{
-		xpath_variable_number(): value(0)
-		{
-		}
-
-		double value;
-		char_t name[1];
-	};
-
-	struct xpath_variable_string: xpath_variable
-	{
-		xpath_variable_string(): value(0)
-		{
-		}
-
-		~xpath_variable_string()
-		{
-			if (value) global_deallocate(value);
-		}
-
-		char_t* value;
-		char_t name[1];
-	};
-
-	struct xpath_variable_node_set: xpath_variable
-	{
-		xpath_node_set value;
-		char_t name[1];
-	};
-
-	const xpath_node_set dummy_node_set;
-
-	unsigned int hash_string(const char_t* str)
-	{
-		// Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
-		unsigned int result = 0;
-
-		while (*str)
-		{
-			result += static_cast<unsigned int>(*str++);
-			result += result << 10;
-			result ^= result >> 6;
-		}
-	
-		result += result << 3;
-		result ^= result >> 11;
-		result += result << 15;
-	
-		return result;
-	}
-
-	template <typename T> T* new_xpath_variable(const char_t* name)
-	{
-		size_t length = strlength(name);
-		if (length == 0) return 0; // empty variable names are invalid
-
-		// $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters
-		void* memory = global_allocate(sizeof(T) + length * sizeof(char_t));
-		if (!memory) return 0;
-
-		T* result = new (memory) T();
-
-		memcpy(result->name, name, (length + 1) * sizeof(char_t));
-
-		return result;
-	}
-
-	xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
-	{
-		switch (type)
-		{
-		case xpath_type_node_set:
-			return new_xpath_variable<xpath_variable_node_set>(name);
-
-		case xpath_type_number:
-			return new_xpath_variable<xpath_variable_number>(name);
-
-		case xpath_type_string:
-			return new_xpath_variable<xpath_variable_string>(name);
-
-		case xpath_type_boolean:
-			return new_xpath_variable<xpath_variable_boolean>(name);
-
-		default:
-			return 0;
-		}
-	}
-
-	template <typename T> void delete_xpath_variable(T* var)
-	{
-		var->~T();
-		global_deallocate(var);
-	}
-
-	void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
-	{
-		switch (type)
-		{
-		case xpath_type_node_set:
-			delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
-			break;
-
-		case xpath_type_number:
-			delete_xpath_variable(static_cast<xpath_variable_number*>(var));
-			break;
-
-		case xpath_type_string:
-			delete_xpath_variable(static_cast<xpath_variable_string*>(var));
-			break;
-
-		case xpath_type_boolean:
-			delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
-			break;
-
-		default:
-			assert(!"Invalid variable type");
-		}
-	}
-
-	xpath_variable* get_variable(xpath_variable_set* set, const char_t* begin, const char_t* end)
-	{
-		char_t buffer[32];
-
-		size_t length = static_cast<size_t>(end - begin);
-		char_t* scratch = buffer;
-
-		if (length >= sizeof(buffer) / sizeof(buffer[0]))
-		{
-			// need to make dummy on-heap copy
-			scratch = static_cast<char_t*>(global_allocate((length + 1) * sizeof(char_t)));
-			if (!scratch) return 0;
-		}
-
-		// copy string to zero-terminated buffer and perform lookup
-		memcpy(scratch, begin, length * sizeof(char_t));
-		scratch[length] = 0;
-
-		xpath_variable* result = set->get(scratch);
-
-		// free dummy buffer
-		if (scratch != buffer) global_deallocate(scratch);
-
-		return result;
-	}
-}
-
-// Internal node set class
-namespace
-{
-	xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
-	{
-		xpath_node_set::type_t order = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
-
-		if (type == xpath_node_set::type_unsorted)
-		{
-			sort(begin, end, document_order_comparator());
-
-			type = xpath_node_set::type_sorted;
-		}
-		
-		if (type != order) reverse(begin, end);
-			
-		return order;
-	}
-
-	xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
-	{
-		if (begin == end) return xpath_node();
-
-		switch (type)
-		{
-		case xpath_node_set::type_sorted:
-			return *begin;
-
-		case xpath_node_set::type_sorted_reverse:
-			return *(end - 1);
-
-		case xpath_node_set::type_unsorted:
-			return *min_element(begin, end, document_order_comparator());
-
-		default:
-			assert(!"Invalid node set type");
-			return xpath_node();
-		}
-	}
-	class xpath_node_set_raw
-	{
-		xpath_node_set::type_t _type;
-
-		xpath_node* _begin;
-		xpath_node* _end;
-		xpath_node* _eos;
-
-	public:
-		xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
-		{
-		}
-
-		xpath_node* begin() const
-		{
-			return _begin;
-		}
-
-		xpath_node* end() const
-		{
-			return _end;
-		}
-
-		bool empty() const
-		{
-			return _begin == _end;
-		}
-
-		size_t size() const
-		{
-			return static_cast<size_t>(_end - _begin);
-		}
-
-		xpath_node first() const
-		{
-			return xpath_first(_begin, _end, _type);
-		}
-
-		void push_back(const xpath_node& node, xpath_allocator* alloc)
-		{
-			if (_end == _eos)
-			{
-				size_t capacity = static_cast<size_t>(_eos - _begin);
-
-				// get new capacity (1.5x rule)
-				size_t new_capacity = capacity + capacity / 2 + 1;
-
-				// reallocate the old array or allocate a new one
-				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
-				assert(data);
-
-				// finalize
-				_begin = data;
-				_end = data + capacity;
-				_eos = data + new_capacity;
-			}
-
-			*_end++ = node;
-		}
-
-		void append(const xpath_node* begin, const xpath_node* end, xpath_allocator* alloc)
-		{
-			size_t size = static_cast<size_t>(_end - _begin);
-			size_t capacity = static_cast<size_t>(_eos - _begin);
-			size_t count = static_cast<size_t>(end - begin);
-
-			if (size + count > capacity)
-			{
-				// reallocate the old array or allocate a new one
-				xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size + count) * sizeof(xpath_node)));
-				assert(data);
-
-				// finalize
-				_begin = data;
-				_end = data + size;
-				_eos = data + size + count;
-			}
-
-			memcpy(_end, begin, count * sizeof(xpath_node));
-			_end += count;
-		}
-
-		void sort_do()
-		{
-			_type = xpath_sort(_begin, _end, _type, false);
-		}
-
-		void truncate(xpath_node* pos)
-		{
-			assert(_begin <= pos && pos <= _end);
-
-			_end = pos;
-		}
-
-		void remove_duplicates()
-		{
-			if (_type == xpath_node_set::type_unsorted)
-				sort(_begin, _end, duplicate_comparator());
-		
-			_end = unique(_begin, _end);
-		}
-
-		xpath_node_set::type_t type() const
-		{
-			return _type;
-		}
-
-		void set_type(xpath_node_set::type_t type)
-		{
-			_type = type;
-		}
-	};
-}
-
-namespace
-{
-	struct xpath_context
-	{
-		xpath_node n;
-		size_t position, size;
-
-		xpath_context(const xpath_node& n, size_t position, size_t size): n(n), position(position), size(size)
-		{
-		}
-	};
-
-	enum lexeme_t
-	{
-		lex_none = 0,
-		lex_equal,
-		lex_not_equal,
-		lex_less,
-		lex_greater,
-		lex_less_or_equal,
-		lex_greater_or_equal,
-		lex_plus,
-		lex_minus,
-		lex_multiply,
-		lex_union,
-		lex_var_ref,
-		lex_open_brace,
-		lex_close_brace,
-		lex_quoted_string,
-		lex_number,
-		lex_slash,
-		lex_double_slash,
-		lex_open_square_brace,
-		lex_close_square_brace,
-		lex_string,
-		lex_comma,
-		lex_axis_attribute,
-		lex_dot,
-		lex_double_dot,
-		lex_double_colon,
-		lex_eof
-	};
-
-	struct xpath_lexer_string
-	{
-		const char_t* begin;
-		const char_t* end;
-
-		xpath_lexer_string(): begin(0), end(0)
-		{
-		}
-
-		bool operator==(const char_t* other) const
-		{
-			size_t length = static_cast<size_t>(end - begin);
-
-			return strequalrange(other, begin, length);
-		}
-	};
-
-	class xpath_lexer
-	{
-		const char_t* _cur;
-		const char_t* _cur_lexeme_pos;
-		xpath_lexer_string _cur_lexeme_contents;
-
-		lexeme_t _cur_lexeme;
-
-	public:
-		explicit xpath_lexer(const char_t* query): _cur(query)
-		{
-			next();
-		}
-		
-		const char_t* state() const
-		{
-			return _cur;
-		}
-		
-		void next()
-		{
-			const char_t* cur = _cur;
-
-			while (IS_CHARTYPE(*cur, ct_space)) ++cur;
-
-			// save lexeme position for error reporting
-			_cur_lexeme_pos = cur;
-
-			switch (*cur)
-			{
-			case 0:
-				_cur_lexeme = lex_eof;
-				break;
-			
-			case '>':
-				if (*(cur+1) == '=')
-				{
-					cur += 2;
-					_cur_lexeme = lex_greater_or_equal;
-				}
-				else
-				{
-					cur += 1;
-					_cur_lexeme = lex_greater;
-				}
-				break;
-
-			case '<':
-				if (*(cur+1) == '=')
-				{
-					cur += 2;
-					_cur_lexeme = lex_less_or_equal;
-				}
-				else
-				{
-					cur += 1;
-					_cur_lexeme = lex_less;
-				}
-				break;
-
-			case '!':
-				if (*(cur+1) == '=')
-				{
-					cur += 2;
-					_cur_lexeme = lex_not_equal;
-				}
-				else
-				{
-					_cur_lexeme = lex_none;
-				}
-				break;
-
-			case '=':
-				cur += 1;
-				_cur_lexeme = lex_equal;
-
-				break;
-			
-			case '+':
-				cur += 1;
-				_cur_lexeme = lex_plus;
-
-				break;
-
-			case '-':
-				cur += 1;
-				_cur_lexeme = lex_minus;
-
-				break;
-
-			case '*':
-				cur += 1;
-				_cur_lexeme = lex_multiply;
-
-				break;
-
-			case '|':
-				cur += 1;
-				_cur_lexeme = lex_union;
-
-				break;
-			
-			case '$':
-				cur += 1;
-
-				if (IS_CHARTYPEX(*cur, ctx_start_symbol))
-				{
-					_cur_lexeme_contents.begin = cur;
-
-					while (IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
-
-					if (cur[0] == ':' && IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
-					{
-						cur++; // :
-
-						while (IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
-					}
-
-					_cur_lexeme_contents.end = cur;
-				
-					_cur_lexeme = lex_var_ref;
-				}
-				else
-				{
-					_cur_lexeme = lex_none;
-				}
-
-				break;
-
-			case '(':
-				cur += 1;
-				_cur_lexeme = lex_open_brace;
-
-				break;
-
-			case ')':
-				cur += 1;
-				_cur_lexeme = lex_close_brace;
-
-				break;
-			
-			case '[':
-				cur += 1;
-				_cur_lexeme = lex_open_square_brace;
-
-				break;
-
-			case ']':
-				cur += 1;
-				_cur_lexeme = lex_close_square_brace;
-
-				break;
-
-			case ',':
-				cur += 1;
-				_cur_lexeme = lex_comma;
-
-				break;
-
-			case '/':
-				if (*(cur+1) == '/')
-				{
-					cur += 2;
-					_cur_lexeme = lex_double_slash;
-				}
-				else
-				{
-					cur += 1;
-					_cur_lexeme = lex_slash;
-				}
-				break;
-		
-			case '.':
-				if (*(cur+1) == '.')
-				{
-					cur += 2;
-					_cur_lexeme = lex_double_dot;
-				}
-				else if (IS_CHARTYPEX(*(cur+1), ctx_digit))
-				{
-					_cur_lexeme_contents.begin = cur; // .
-
-					++cur;
-
-					while (IS_CHARTYPEX(*cur, ctx_digit)) cur++;
-
-					_cur_lexeme_contents.end = cur;
-					
-					_cur_lexeme = lex_number;
-				}
-				else
-				{
-					cur += 1;
-					_cur_lexeme = lex_dot;
-				}
-				break;
-
-			case '@':
-				cur += 1;
-				_cur_lexeme = lex_axis_attribute;
-
-				break;
-
-			case '"':
-			case '\'':
-			{
-				char_t terminator = *cur;
-
-				++cur;
-
-				_cur_lexeme_contents.begin = cur;
-				while (*cur && *cur != terminator) cur++;
-				_cur_lexeme_contents.end = cur;
-				
-				if (!*cur)
-					_cur_lexeme = lex_none;
-				else
-				{
-					cur += 1;
-					_cur_lexeme = lex_quoted_string;
-				}
-
-				break;
-			}
-
-			case ':':
-				if (*(cur+1) == ':')
-				{
-					cur += 2;
-					_cur_lexeme = lex_double_colon;
-				}
-				else
-				{
-					_cur_lexeme = lex_none;
-				}
-				break;
-
-			default:
-				if (IS_CHARTYPEX(*cur, ctx_digit))
-				{
-					_cur_lexeme_contents.begin = cur;
-
-					while (IS_CHARTYPEX(*cur, ctx_digit)) cur++;
-				
-					if (*cur == '.')
-					{
-						cur++;
-
-						while (IS_CHARTYPEX(*cur, ctx_digit)) cur++;
-					}
-
-					_cur_lexeme_contents.end = cur;
-
-					_cur_lexeme = lex_number;
-				}
-				else if (IS_CHARTYPEX(*cur, ctx_start_symbol))
-				{
-					_cur_lexeme_contents.begin = cur;
-
-					while (IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
-
-					if (cur[0] == ':')
-					{
-						if (cur[1] == '*') // namespace test ncname:*
-						{
-							cur += 2; // :*
-						}
-						else if (IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
-						{
-							cur++; // :
-
-							while (IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
-						}
-					}
-
-					_cur_lexeme_contents.end = cur;
-				
-					_cur_lexeme = lex_string;
-				}
-				else
-				{
-					_cur_lexeme = lex_none;
-				}
-			}
-
-			_cur = cur;
-		}
-
-		lexeme_t current() const
-		{
-			return _cur_lexeme;
-		}
-
-		const char_t* current_pos() const
-		{
-			return _cur_lexeme_pos;
-		}
-
-		const xpath_lexer_string& contents() const
-		{
-			assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);
-
-			return _cur_lexeme_contents;
-		}
-	};
-
-	enum ast_type_t
-	{
-		ast_op_or,						// left or right
-		ast_op_and,						// left and right
-		ast_op_equal,					// left = right
-		ast_op_not_equal, 				// left != right
-		ast_op_less,					// left < right
-		ast_op_greater,					// left > right
-		ast_op_less_or_equal,			// left <= right
-		ast_op_greater_or_equal,		// left >= right
-		ast_op_add,						// left + right
-		ast_op_subtract,				// left - right
-		ast_op_multiply,				// left * right
-		ast_op_divide,					// left / right
-		ast_op_mod,						// left % right
-		ast_op_negate,					// left - right
-		ast_op_union,					// left | right
-		ast_predicate,					// apply predicate to set; next points to next predicate
-		ast_filter,						// select * from left where right
-		ast_filter_posinv,				// select * from left where right; proximity position invariant
-		ast_string_constant,			// string constant
-		ast_number_constant,			// number constant
-		ast_variable,					// variable
-		ast_func_last,					// last()
-		ast_func_position,				// position()
-		ast_func_count,					// count(left)
-		ast_func_id,					// id(left)
-		ast_func_local_name_0,			// local-name()
-		ast_func_local_name_1,			// local-name(left)
-		ast_func_namespace_uri_0,		// namespace-uri()
-		ast_func_namespace_uri_1,		// namespace-uri(left)
-		ast_func_name_0,				// name()
-		ast_func_name_1,				// name(left)
-		ast_func_string_0,				// string()
-		ast_func_string_1,				// string(left)
-		ast_func_concat,				// concat(left, right, siblings)
-		ast_func_starts_with,			// starts_with(left, right)
-		ast_func_contains,				// contains(left, right)
-		ast_func_substring_before,		// substring-before(left, right)
-		ast_func_substring_after,		// substring-after(left, right)
-		ast_func_substring_2,			// substring(left, right)
-		ast_func_substring_3,			// substring(left, right, third)
-		ast_func_string_length_0,		// string-length()
-		ast_func_string_length_1,		// string-length(left)
-		ast_func_normalize_space_0,		// normalize-space()
-		ast_func_normalize_space_1,		// normalize-space(left)
-		ast_func_translate,				// translate(left, right, third)
-		ast_func_boolean,				// boolean(left)
-		ast_func_not,					// not(left)
-		ast_func_true,					// true()
-		ast_func_false,					// false()
-		ast_func_lang,					// lang(left)
-		ast_func_number_0,				// number()
-		ast_func_number_1,				// number(left)
-		ast_func_sum,					// sum(left)
-		ast_func_floor,					// floor(left)
-		ast_func_ceiling,				// ceiling(left)
-		ast_func_round,					// round(left)
-		ast_step,						// process set left with step
-		ast_step_root					// select root node
-	};
-
-	enum axis_t
-	{
-		axis_ancestor,
-		axis_ancestor_or_self,
-		axis_attribute,
-		axis_child,
-		axis_descendant,
-		axis_descendant_or_self,
-		axis_following,
-		axis_following_sibling,
-		axis_namespace,
-		axis_parent,
-		axis_preceding,
-		axis_preceding_sibling,
-		axis_self
-	};
-	
-	enum nodetest_t
-	{
-		nodetest_none,
-		nodetest_name,
-		nodetest_type_node,
-		nodetest_type_comment,
-		nodetest_type_pi,
-		nodetest_type_text,
-		nodetest_pi,
-		nodetest_all,
-		nodetest_all_in_namespace
-	};
-
-	template <axis_t N> struct axis_to_type
-	{
-		static const axis_t axis;
-	};
-
-	template <axis_t N> const axis_t axis_to_type<N>::axis = N;
-		
-	class xpath_ast_node
-	{
-	private:
-		// node type
-		char _type;
-		char _rettype;
-
-		// for ast_step / ast_predicate
-		char _axis;
-		char _test;
-
-		// tree node structure
-		xpath_ast_node* _left;
-		xpath_ast_node* _right;
-		xpath_ast_node* _next;
-
-		union
-		{
-			// value for ast_string_constant
-			const char_t* string;
-			// value for ast_number_constant
-			double number;
-			// variable for ast_variable
-			xpath_variable* variable;
-			// node test for ast_step (node name/namespace/node type/pi target)
-			const char_t* nodetest;
-		} _data;
-
-		xpath_ast_node(const xpath_ast_node&);
-		xpath_ast_node& operator=(const xpath_ast_node&);
-
-		template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
-		{
-			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
-
-			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
-			{
-				if (lt == xpath_type_boolean || rt == xpath_type_boolean)
-					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
-				else if (lt == xpath_type_number || rt == xpath_type_number)
-					return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
-				else if (lt == xpath_type_string || rt == xpath_type_string)
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					xpath_string ls = lhs->eval_string(c, stack);
-					xpath_string rs = rhs->eval_string(c, stack);
-
-					return comp(ls, rs);
-				}
-			}
-			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
-				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
-
-				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
-					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
-					{
-						xpath_allocator_capture cri(stack.result);
-
-						if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
-							return true;
-					}
-
-				return false;
-			}
-			else
-			{
-				if (lt == xpath_type_node_set)
-				{
-					swap(lhs, rhs);
-					swap(lt, rt);
-				}
-
-				if (lt == xpath_type_boolean)
-					return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
-				else if (lt == xpath_type_number)
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					double l = lhs->eval_number(c, stack);
-					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
-
-					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
-					{
-						xpath_allocator_capture cri(stack.result);
-
-						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
-							return true;
-					}
-
-					return false;
-				}
-				else if (lt == xpath_type_string)
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					xpath_string l = lhs->eval_string(c, stack);
-					xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
-
-					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
-					{
-						xpath_allocator_capture cri(stack.result);
-
-						if (comp(l, string_value(*ri, stack.result)))
-							return true;
-					}
-
-					return false;
-				}
-			}
-
-			assert(!"Wrong types");
-			return false;
-		}
-
-		template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
-		{
-			xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
-
-			if (lt != xpath_type_node_set && rt != xpath_type_node_set)
-				return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
-			else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
-				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
-
-				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
-				{
-					xpath_allocator_capture cri(stack.result);
-
-					double l = convert_string_to_number(string_value(*li, stack.result).c_str());
-
-					for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
-					{
-						xpath_allocator_capture crii(stack.result);
-
-						if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
-							return true;
-					}
-				}
-
-				return false;
-			}
-			else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				double l = lhs->eval_number(c, stack);
-				xpath_node_set_raw rs = rhs->eval_node_set(c, stack);
-
-				for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
-				{
-					xpath_allocator_capture cri(stack.result);
-
-					if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
-						return true;
-				}
-
-				return false;
-			}
-			else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_node_set_raw ls = lhs->eval_node_set(c, stack);
-				double r = rhs->eval_number(c, stack);
-
-				for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
-				{
-					xpath_allocator_capture cri(stack.result);
-
-					if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
-						return true;
-				}
-
-				return false;
-			}
-			else
-			{
-				assert(!"Wrong types");
-				return false;
-			}
-		}
-
-		void apply_predicate(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
-		{
-			assert(ns.size() >= first);
-
-			size_t i = 1;
-			size_t size = ns.size() - first;
-				
-			xpath_node* last = ns.begin() + first;
-				
-			// remove_if... or well, sort of
-			for (xpath_node* it = last; it != ns.end(); ++it, ++i)
-			{
-				xpath_context c(*it, i, size);
-			
-				if (expr->rettype() == xpath_type_number)
-				{
-					if (expr->eval_number(c, stack) == i)
-						*last++ = *it;
-				}
-				else if (expr->eval_boolean(c, stack))
-					*last++ = *it;
-			}
-			
-			ns.truncate(last);
-		}
-
-		void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack)
-		{
-			if (ns.size() == first) return;
-			
-			for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
-			{
-				apply_predicate(ns, first, pred->_left, stack);
-			}
-		}
-
-		void step_push(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& parent, xpath_allocator* alloc)
-		{
-			if (!a) return;
-
-			const char_t* name = a.name();
-
-			// There are no attribute nodes corresponding to attributes that declare namespaces
-			// That is, "xmlns:..." or "xmlns"
-			if (starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')) return;
-			
-			switch (_test)
-			{
-			case nodetest_name:
-				if (strequal(name, _data.nodetest)) ns.push_back(xpath_node(a, parent), alloc);
-				break;
-				
-			case nodetest_type_node:
-			case nodetest_all:
-				ns.push_back(xpath_node(a, parent), alloc);
-				break;
-				
-			case nodetest_all_in_namespace:
-				if (starts_with(name, _data.nodetest))
-					ns.push_back(xpath_node(a, parent), alloc);
-				break;
-			
-			default:
-				;
-			}
-		}
-		
-		void step_push(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc)
-		{
-			if (!n) return;
-
-			switch (_test)
-			{
-			case nodetest_name:
-				if (n.type() == node_element && strequal(n.name(), _data.nodetest)) ns.push_back(n, alloc);
-				break;
-				
-			case nodetest_type_node:
-				ns.push_back(n, alloc);
-				break;
-				
-			case nodetest_type_comment:
-				if (n.type() == node_comment)
-					ns.push_back(n, alloc);
-				break;
-				
-			case nodetest_type_text:
-				if (n.type() == node_pcdata || n.type() == node_cdata)
-					ns.push_back(n, alloc);
-				break;
-				
-			case nodetest_type_pi:
-				if (n.type() == node_pi)
-					ns.push_back(n, alloc);
-				break;
-									
-			case nodetest_pi:
-				if (n.type() == node_pi && strequal(n.name(), _data.nodetest))
-					ns.push_back(n, alloc);
-				break;
-				
-			case nodetest_all:
-				if (n.type() == node_element)
-					ns.push_back(n, alloc);
-				break;
-				
-			case nodetest_all_in_namespace:
-				if (n.type() == node_element && starts_with(n.name(), _data.nodetest))
-					ns.push_back(n, alloc);
-				break;
-
-			default:
-				assert(!"Unknown axis");
-			} 
-		}
-
-		template <class T> void step_fill(xpath_node_set_raw& ns, const xml_node& n, xpath_allocator* alloc, T)
-		{
-			const axis_t axis = T::axis;
-
-			switch (axis)
-			{
-			case axis_attribute:
-			{
-				for (xml_attribute a = n.first_attribute(); a; a = a.next_attribute())
-					step_push(ns, a, n, alloc);
-				
-				break;
-			}
-			
-			case axis_child:
-			{
-				for (xml_node c = n.first_child(); c; c = c.next_sibling())
-					step_push(ns, c, alloc);
-					
-				break;
-			}
-			
-			case axis_descendant:
-			case axis_descendant_or_self:
-			{
-				if (axis == axis_descendant_or_self)
-					step_push(ns, n, alloc);
-					
-				xml_node cur = n.first_child();
-				
-				while (cur && cur != n)
-				{
-					step_push(ns, cur, alloc);
-					
-					if (cur.first_child())
-						cur = cur.first_child();
-					else if (cur.next_sibling())
-						cur = cur.next_sibling();
-					else
-					{
-						while (!cur.next_sibling() && cur != n)
-							cur = cur.parent();
-					
-						if (cur != n) cur = cur.next_sibling();
-					}
-				}
-				
-				break;
-			}
-			
-			case axis_following_sibling:
-			{
-				for (xml_node c = n.next_sibling(); c; c = c.next_sibling())
-					step_push(ns, c, alloc);
-				
-				break;
-			}
-			
-			case axis_preceding_sibling:
-			{
-				for (xml_node c = n.previous_sibling(); c; c = c.previous_sibling())
-					step_push(ns, c, alloc);
-				
-				break;
-			}
-			
-			case axis_following:
-			{
-				xml_node cur = n;
-
-				// exit from this node so that we don't include descendants
-				while (cur && !cur.next_sibling()) cur = cur.parent();
-				cur = cur.next_sibling();
-
-				for (;;)
-				{
-					step_push(ns, cur, alloc);
-
-					if (cur.first_child())
-						cur = cur.first_child();
-					else if (cur.next_sibling())
-						cur = cur.next_sibling();
-					else
-					{
-						while (cur && !cur.next_sibling()) cur = cur.parent();
-						cur = cur.next_sibling();
-
-						if (!cur) break;
-					}
-				}
-
-				break;
-			}
-
-			case axis_preceding:
-			{
-				xml_node cur = n;
-
-				while (cur && !cur.previous_sibling()) cur = cur.parent();
-				cur = cur.previous_sibling();
-
-				for (;;)
-				{
-					if (cur.last_child())
-						cur = cur.last_child();
-					else
-					{
-						// leaf node, can't be ancestor
-						step_push(ns, cur, alloc);
-
-						if (cur.previous_sibling())
-							cur = cur.previous_sibling();
-						else
-						{
-							do 
-							{
-								cur = cur.parent();
-								if (!cur) break;
-
-								if (!node_is_ancestor(cur, n)) step_push(ns, cur, alloc);
-							}
-							while (!cur.previous_sibling());
-
-							cur = cur.previous_sibling();
-
-							if (!cur) break;
-						}
-					}
-				}
-
-				break;
-			}
-			
-			case axis_ancestor:
-			case axis_ancestor_or_self:
-			{
-				if (axis == axis_ancestor_or_self)
-					step_push(ns, n, alloc);
-
-				xml_node cur = n.parent();
-				
-				while (cur)
-				{
-					step_push(ns, cur, alloc);
-					
-					cur = cur.parent();
-				}
-				
-				break;
-			}
-
-			case axis_self:
-			{
-				step_push(ns, n, alloc);
-
-				break;
-			}
-
-			case axis_parent:
-			{
-				if (n.parent()) step_push(ns, n.parent(), alloc);
-
-				break;
-			}
-				
-			default:
-				assert(!"Unimplemented axis");
-			}
-		}
-		
-		template <class T> void step_fill(xpath_node_set_raw& ns, const xml_attribute& a, const xml_node& p, xpath_allocator* alloc, T v)
-		{
-			const axis_t axis = T::axis;
-
-			switch (axis)
-			{
-			case axis_ancestor:
-			case axis_ancestor_or_self:
-			{
-				if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
-					step_push(ns, a, p, alloc);
-
-				xml_node cur = p;
-				
-				while (cur)
-				{
-					step_push(ns, cur, alloc);
-					
-					cur = cur.parent();
-				}
-				
-				break;
-			}
-
-			case axis_descendant_or_self:
-			case axis_self:
-			{
-				if (_test == nodetest_type_node) // reject attributes based on principal node type test
-					step_push(ns, a, p, alloc);
-
-				break;
-			}
-
-			case axis_following:
-			{
-				xml_node cur = p;
-				
-				for (;;)
-				{
-					if (cur.first_child())
-						cur = cur.first_child();
-					else if (cur.next_sibling())
-						cur = cur.next_sibling();
-					else
-					{
-						while (cur && !cur.next_sibling()) cur = cur.parent();
-						cur = cur.next_sibling();
-						
-						if (!cur) break;
-					}
-
-					step_push(ns, cur, alloc);
-				}
-
-				break;
-			}
-
-			case axis_parent:
-			{
-				step_push(ns, p, alloc);
-
-				break;
-			}
-
-			case axis_preceding:
-			{
-				// preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
-				step_fill(ns, p, alloc, v);
-				break;
-			}
-			
-			default:
-				assert(!"Unimplemented axis");
-			}
-		}
-		
-		template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, T v)
-		{
-			const axis_t axis = T::axis;
-			bool attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self);
-
-			xpath_node_set_raw ns;
-			ns.set_type((axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling) ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted);
-
-			if (_left)
-			{
-				xpath_node_set_raw s = _left->eval_node_set(c, stack);
-
-				// self axis preserves the original order
-				if (axis == axis_self) ns.set_type(s.type());
-
-				for (const xpath_node* it = s.begin(); it != s.end(); ++it)
-				{
-					size_t size = ns.size();
-
-					// in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
-					if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
-					
-					if (it->node())
-						step_fill(ns, it->node(), stack.result, v);
-					else if (attributes)
-						step_fill(ns, it->attribute(), it->parent(), stack.result, v);
-						
-					apply_predicates(ns, size, stack);
-				}
-			}
-			else
-			{
-				if (c.n.node())
-					step_fill(ns, c.n.node(), stack.result, v);
-				else if (attributes)
-					step_fill(ns, c.n.attribute(), c.n.parent(), stack.result, v);
-				
-				apply_predicates(ns, 0, stack);
-			}
-
-			// child, attribute and self axes always generate unique set of nodes
-			// for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
-			if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
-				ns.remove_duplicates();
-
-			return ns;
-		}
-		
-	public:
-		xpath_ast_node(ast_type_t type, xpath_value_type rettype, const char_t* value):
-			_type((char)type), _rettype((char)rettype), _axis(0), _test(0), _left(0), _right(0), _next(0)
-		{
-			assert(type == ast_string_constant);
-			_data.string = value;
-		}
-
-		xpath_ast_node(ast_type_t type, xpath_value_type rettype, double value):
-			_type((char)type), _rettype((char)rettype), _axis(0), _test(0), _left(0), _right(0), _next(0)
-		{
-			assert(type == ast_number_constant);
-			_data.number = value;
-		}
-		
-		xpath_ast_node(ast_type_t type, xpath_value_type rettype, xpath_variable* value):
-			_type((char)type), _rettype((char)rettype), _axis(0), _test(0), _left(0), _right(0), _next(0)
-		{
-			assert(type == ast_variable);
-			_data.variable = value;
-		}
-		
-		xpath_ast_node(ast_type_t type, xpath_value_type rettype, xpath_ast_node* left = 0, xpath_ast_node* right = 0):
-			_type((char)type), _rettype((char)rettype), _axis(0), _test(0), _left(left), _right(right), _next(0)
-		{
-		}
-
-		xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
-			_type((char)type), _rettype(xpath_type_node_set), _axis((char)axis), _test((char)test), _left(left), _right(0), _next(0)
-		{
-			_data.nodetest = contents;
-		}
-
-		void set_next(xpath_ast_node* value)
-		{
-			_next = value;
-		}
-
-		void set_right(xpath_ast_node* value)
-		{
-			_right = value;
-		}
-
-		bool eval_boolean(const xpath_context& c, const xpath_stack& stack)
-		{
-			switch (_type)
-			{
-			case ast_op_or:
-				return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack);
-				
-			case ast_op_and:
-				return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack);
-				
-			case ast_op_equal:
-				return compare_eq(_left, _right, c, stack, equal_to());
-
-			case ast_op_not_equal:
-				return compare_eq(_left, _right, c, stack, not_equal_to());
-	
-			case ast_op_less:
-				return compare_rel(_left, _right, c, stack, less());
-			
-			case ast_op_greater:
-				return compare_rel(_right, _left, c, stack, less());
-
-			case ast_op_less_or_equal:
-				return compare_rel(_left, _right, c, stack, less_equal());
-			
-			case ast_op_greater_or_equal:
-				return compare_rel(_right, _left, c, stack, less_equal());
-
-			case ast_func_starts_with:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_string lr = _left->eval_string(c, stack);
-				xpath_string rr = _right->eval_string(c, stack);
-
-				return starts_with(lr.c_str(), rr.c_str());
-			}
-
-			case ast_func_contains:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_string lr = _left->eval_string(c, stack);
-				xpath_string rr = _right->eval_string(c, stack);
-
-				return find_substring(lr.c_str(), rr.c_str()) != 0;
-			}
-
-			case ast_func_boolean:
-				return _left->eval_boolean(c, stack);
-				
-			case ast_func_not:
-				return !_left->eval_boolean(c, stack);
-				
-			case ast_func_true:
-				return true;
-				
-			case ast_func_false:
-				return false;
-
-			case ast_func_lang:
-			{
-				if (c.n.attribute()) return false;
-				
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_string lang = _left->eval_string(c, stack);
-				
-				for (xml_node n = c.n.node(); n; n = n.parent())
-				{
-					xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
-					
-					if (a)
-					{
-						const char_t* value = a.value();
-						
-						// strnicmp / strncasecmp is not portable
-						for (const char_t* lit = lang.c_str(); *lit; ++lit)
-						{
-							if (tolower_ascii(*lit) != tolower_ascii(*value)) return false;
-							++value;
-						}
-						
-						return *value == 0 || *value == '-';
-					}
-				}
-				
-				return false;
-			}
-
-			case ast_variable:
-			{
-				assert(_rettype == _data.variable->type());
-
-				if (_rettype == xpath_type_boolean)
-					return _data.variable->get_boolean();
-
-				// fallthrough to type conversion
-			}
-
-			default:
-			{
-				switch (_rettype)
-				{
-				case xpath_type_number:
-					return convert_number_to_boolean(eval_number(c, stack));
-					
-				case xpath_type_string:
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					return !eval_string(c, stack).empty();
-				}
-					
-				case xpath_type_node_set:				
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					return !eval_node_set(c, stack).empty();
-				}
-
-				default:
-					assert(!"Wrong expression for return type boolean");
-					return false;
-				}
-			}
-			}
-		}
-
-		double eval_number(const xpath_context& c, const xpath_stack& stack)
-		{
-			switch (_type)
-			{
-			case ast_op_add:
-				return _left->eval_number(c, stack) + _right->eval_number(c, stack);
-				
-			case ast_op_subtract:
-				return _left->eval_number(c, stack) - _right->eval_number(c, stack);
-
-			case ast_op_multiply:
-				return _left->eval_number(c, stack) * _right->eval_number(c, stack);
-
-			case ast_op_divide:
-				return _left->eval_number(c, stack) / _right->eval_number(c, stack);
-
-			case ast_op_mod:
-				return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
-
-			case ast_op_negate:
-				return -_left->eval_number(c, stack);
-
-			case ast_number_constant:
-				return _data.number;
-
-			case ast_func_last:
-				return (double)c.size;
-			
-			case ast_func_position:
-				return (double)c.position;
-
-			case ast_func_count:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				return (double)_left->eval_node_set(c, stack).size();
-			}
-			
-			case ast_func_string_length_0:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				return (double)string_value(c.n, stack.result).length();
-			}
-			
-			case ast_func_string_length_1:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				return (double)_left->eval_string(c, stack).length();
-			}
-			
-			case ast_func_number_0:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				return convert_string_to_number(string_value(c.n, stack.result).c_str());
-			}
-			
-			case ast_func_number_1:
-				return _left->eval_number(c, stack);
-
-			case ast_func_sum:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				double r = 0;
-				
-				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
-				
-				for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
-				{
-					xpath_allocator_capture cri(stack.result);
-
-					r += convert_string_to_number(string_value(*it, stack.result).c_str());
-				}
-			
-				return r;
-			}
-
-			case ast_func_floor:
-			{
-				double r = _left->eval_number(c, stack);
-				
-				return r == r ? floor(r) : r;
-			}
-
-			case ast_func_ceiling:
-			{
-				double r = _left->eval_number(c, stack);
-				
-				return r == r ? ceil(r) : r;
-			}
-
-			case ast_func_round:
-				return round_nearest_nzero(_left->eval_number(c, stack));
-			
-			case ast_variable:
-			{
-				assert(_rettype == _data.variable->type());
-
-				if (_rettype == xpath_type_number)
-					return _data.variable->get_number();
-
-				// fallthrough to type conversion
-			}
-
-			default:
-			{
-				switch (_rettype)
-				{
-				case xpath_type_boolean:
-					return eval_boolean(c, stack) ? 1 : 0;
-					
-				case xpath_type_string:
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					return convert_string_to_number(eval_string(c, stack).c_str());
-				}
-					
-				case xpath_type_node_set:
-				{
-					xpath_allocator_capture cr(stack.result);
-
-					return convert_string_to_number(eval_string(c, stack).c_str());
-				}
-					
-				default:
-					assert(!"Wrong expression for return type number");
-					return 0;
-				}
-				
-			}
-			}
-		}
-		
-		xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack)
-		{
-			assert(_type == ast_func_concat);
-
-			xpath_allocator_capture ct(stack.temp);
-
-			// count the string number
-			size_t count = 1;
-			for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
-
-			// gather all strings
-			xpath_string static_buffer[4];
-			xpath_string* buffer = static_buffer;
-
-			// allocate on-heap for large concats
-			if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
-			{
-				buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
-				assert(buffer);
-			}
-
-			// evaluate all strings to temporary stack
-			xpath_stack swapped_stack = {stack.temp, stack.result};
-
-			buffer[0] = _left->eval_string(c, swapped_stack);
-
-			size_t pos = 1;
-			for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack);
-			assert(pos == count);
-
-			// get total length
-			size_t length = 0;
-			for (size_t i = 0; i < count; ++i) length += buffer[i].length();
-
-			// create final string
-			char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
-			assert(result);
-
-			char_t* ri = result;
-
-			for (size_t j = 0; j < count; ++j)
-				for (const char_t* bi = buffer[j].c_str(); *bi; ++bi)
-					*ri++ = *bi;
-
-			*ri = 0;
-
-			return xpath_string(result, true);
-		}
-
-		xpath_string eval_string(const xpath_context& c, const xpath_stack& stack)
-		{
-			switch (_type)
-			{
-			case ast_string_constant:
-				return xpath_string_const(_data.string);
-			
-			case ast_func_local_name_0:
-			{
-				xpath_node na = c.n;
-				
-				return xpath_string_const(local_name(na));
-			}
-
-			case ast_func_local_name_1:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
-				xpath_node na = ns.first();
-				
-				return xpath_string_const(local_name(na));
-			}
-
-			case ast_func_name_0:
-			{
-				xpath_node na = c.n;
-				
-				return xpath_string_const(qualified_name(na));
-			}
-
-			case ast_func_name_1:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
-				xpath_node na = ns.first();
-				
-				return xpath_string_const(qualified_name(na));
-			}
-
-			case ast_func_namespace_uri_0:
-			{
-				xpath_node na = c.n;
-				
-				return xpath_string_const(namespace_uri(na));
-			}
-
-			case ast_func_namespace_uri_1:
-			{
-				xpath_allocator_capture cr(stack.result);
-
-				xpath_node_set_raw ns = _left->eval_node_set(c, stack);
-				xpath_node na = ns.first();
-				
-				return xpath_string_const(namespace_uri(na));
-			}
-
-			case ast_func_string_0:
-				return string_value(c.n, stack.result);
-
-			case ast_func_string_1:
-				return _left->eval_string(c, stack);
-
-			case ast_func_concat:
-				return eval_string_concat(c, stack);
-
-			case ast_func_substring_before:
-			{
-				xpath_allocator_capture cr(stack.temp);
-
-				xpath_stack swapped_stack = {stack.temp, stack.result};
-
-				xpath_string s = _left->eval_string(c, swapped_stack);
-				xpath_string p = _right->eval_string(c, swapped_stack);
-
-				const char_t* pos = find_substring(s.c_str(), p.c_str());
-				
-				return pos ? xpath_string(s.c_str(), pos, stack.result) : xpath_string();
-			}
-			
-			case ast_func_substring_after:
-			{
-				xpath_allocator_capture cr(stack.temp);
-
-				xpath_stack swapped_stack = {stack.temp, stack.result};
-
-				xpath_string s = _left->eval_string(c, swapped_stack);
-				xpath_string p = _right->eval_string(c, swapped_stack);
-				
-				const char_t* pos = find_substring(s.c_str(), p.c_str());
-				if (!pos) return xpath_string();
-
-				const char_t* result = pos + p.length();
-
-				return s.uses_heap() ? xpath_string(result, stack.result) : xpath_string_const(result);
-			}
-
-			case ast_func_substring_2:
-			{
-				xpath_allocator_capture cr(stack.temp);
-
-				xpath_stack swapped_stack = {stack.temp, stack.result};
-
-				xpath_string s = _left->eval_string(c, swapped_stack);
-				size_t s_length = s.length();
-
-				double first = round_nearest(_right->eval_number(c, stack));
-				
-				if (is_nan(first)) return xpath_string(); // NaN
-				else if (first >= s_length + 1) return xpath_string();
-				
-				size_t pos = first < 1 ? 1 : (size_t)first;
-				assert(1 <= pos && pos <= s_length + 1);
-
-				const char_t* rbegin = s.c_str() + (pos - 1);
-				
-				return s.uses_heap() ? xpath_string(rbegin, stack.result) : xpath_string_const(rbegin);
-			}
-			
-			case ast_func_substring_3:
-			{
-				xpath_allocator_capture cr(stack.temp);
-
-				xpath_stack swapped_stack = {stack.temp, stack.result};
-
-				xpath_string s = _left->eval_string(c, swapped_stack);
-				size_t s_length = s.length();
-
-				double first = round_nearest(_right->eval_number(c, stack));
-				double last = first + round_nearest(_right->_next->eval_number(c, stack));
-				
-				if (is_nan(first) || is_nan(last)) return xpath_string();
-				else if (first >= s_length + 1) return xpath_string();
-				else if (first >= last) return xpath_string();
-				else if (last < 1) return xpath_string();
-				
-				size_t pos = first < 1 ? 1 : (size_t)first;
-				size_t end = last >= s_length + 1 ? s_length + 1 : (size_t)last;
-
-				assert(1 <= pos && pos <= end && end <= s_length + 1);
-				const char_t* rbegin = s.c_str() + (pos - 1);
-				const char_t* rend = s.c_str() + (end - 1);
-
-				return (end == s_length + 1 && !s.uses_heap()) ? xpath_string_const(rbegin) : xpath_string(rbegin, rend, stack.result);
-			}
-
-			case ast_func_normalize_space_0:
-			{
-				xpath_string s = string_value(c.n, stack.result);
-
-				normalize_space(s.data(stack.result));
-
-				return s;
-			}
-
-			case ast_func_normalize_space_1:
-			{
-				xpath_string s = _left->eval_string(c, stack);
-
-				normalize_space(s.data(stack.result));
-			
-				return s;
-			}
-
-			case ast_func_translate:
-			{
-				xpath_allocator_capture cr(stack.temp);
-
-				xpath_stack swapped_stack = {stack.temp, stack.result};
-
-				xpath_string s = _left->eval_string(c, stack);
-				xpath_string from = _right->eval_string(c, swapped_stack);
-				xpath_string to = _right->_next->eval_string(c, swapped_stack);
-
-				translate(s.data(stack.result), from.c_str(), to.c_str());
-
-				return s;
-			}
-
-			case ast_variable:
-			{
-				assert(_rettype == _data.variable->type());
-
-				if (_rettype == xpath_type_string)
-					return xpath_string_const(_data.variable->get_string());
-
-				// fallthrough to type conversion
-			}
-
-			default:
-			{
-				switch (_rettype)
-				{
-				case xpath_type_boolean:
-					return xpath_string_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
-					
-				case xpath_type_number:
-					return convert_number_to_string(eval_number(c, stack), stack.result);
-					
-				case xpath_type_node_set:
-				{
-					xpath_allocator_capture cr(stack.temp);
-
-					xpath_stack swapped_stack = {stack.temp, stack.result};
-
-					xpath_node_set_raw ns = eval_node_set(c, swapped_stack);
-					return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result);
-				}
-				
-				default:
-					assert(!"Wrong expression for return type string");
-					return xpath_string();
-				}
-			}
-			}
-		}
-
-		xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack)
-		{
-			switch (_type)
-			{
-			case ast_op_union:
-			{
-				xpath_allocator_capture cr(stack.temp);
-
-				xpath_stack swapped_stack = {stack.temp, stack.result};
-
-				xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack);
-				xpath_node_set_raw rs = _right->eval_node_set(c, stack);
-				
-				// we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
-  		        rs.set_type(xpath_node_set::type_unsorted);
-
-				rs.append(ls.begin(), ls.end(), stack.result);
-				rs.remove_duplicates();
-				
-				return rs;
-			}
-
-			case ast_filter:
-			case ast_filter_posinv:
-			{
-				xpath_node_set_raw set = _left->eval_node_set(c, stack);
-
-				// either expression is a number or it contains position() call; sort by document order
-				if (_type == ast_filter) set.sort_do();
-
-				apply_predicate(set, 0, _right, stack);
-			
-				return set;
-			}
-			
-			case ast_func_id:
-				return xpath_node_set_raw();
-			
-			case ast_step:
-			{
-				switch (_axis)
-				{
-				case axis_ancestor:
-					return step_do(c, stack, axis_to_type<axis_ancestor>());
-					
-				case axis_ancestor_or_self:
-					return step_do(c, stack, axis_to_type<axis_ancestor_or_self>());
-
-				case axis_attribute:
-					return step_do(c, stack, axis_to_type<axis_attribute>());
-
-				case axis_child:
-					return step_do(c, stack, axis_to_type<axis_child>());
-				
-				case axis_descendant:
-					return step_do(c, stack, axis_to_type<axis_descendant>());
-
-				case axis_descendant_or_self:
-					return step_do(c, stack, axis_to_type<axis_descendant_or_self>());
-
-				case axis_following:
-					return step_do(c, stack, axis_to_type<axis_following>());
-				
-				case axis_following_sibling:
-					return step_do(c, stack, axis_to_type<axis_following_sibling>());
-				
-				case axis_namespace:
-					// namespaced axis is not supported
-					return xpath_node_set_raw();
-				
-				case axis_parent:
-					return step_do(c, stack, axis_to_type<axis_parent>());
-				
-				case axis_preceding:
-					return step_do(c, stack, axis_to_type<axis_preceding>());
-
-				case axis_preceding_sibling:
-					return step_do(c, stack, axis_to_type<axis_preceding_sibling>());
-				
-				case axis_self:
-					return step_do(c, stack, axis_to_type<axis_self>());
-				}
-			}
-
-			case ast_step_root:
-			{
-				assert(!_right); // root step can't have any predicates
-
-				xpath_node_set_raw ns;
-
-				ns.set_type(xpath_node_set::type_sorted);
-
-				if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
-				else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
-
-				return ns;
-			}
-
-			case ast_variable:
-			{
-				assert(_rettype == _data.variable->type());
-
-				if (_rettype == xpath_type_node_set)
-				{
-					const xpath_node_set& s = _data.variable->get_node_set();
-
-					xpath_node_set_raw ns;
-
-					ns.set_type(s.type());
-					ns.append(s.begin(), s.end(), stack.result);
-
-					return ns;
-				}
-
-				// fallthrough to type conversion
-			}
-
-			default:
-				assert(!"Wrong expression for return type node set");
-				return xpath_node_set_raw();
-			}
-		}
-		
-		bool is_posinv()
-		{
-			switch (_type)
-			{
-			case ast_func_position:
-				return false;
-
-			case ast_string_constant:
-			case ast_number_constant:
-			case ast_variable:
-				return true;
-
-			case ast_step:
-			case ast_step_root:
-				return true;
-
-			case ast_predicate:
-			case ast_filter:
-			case ast_filter_posinv:
-				return true;
-
-			default:
-				if (_left && !_left->is_posinv()) return false;
-				
-				for (xpath_ast_node* n = _right; n; n = n->_next)
-					if (!n->is_posinv()) return false;
-					
-				return true;
-			}
-		}
-
-		xpath_value_type rettype() const
-		{
-			return static_cast<xpath_value_type>(_rettype);
-		}
-	};
-
-	struct xpath_parser
-	{
-	    xpath_allocator* _alloc;
-	    xpath_lexer _lexer;
-
-		const char_t* _query;
-		xpath_variable_set* _variables;
-
-		xpath_parse_result* _result;
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		jmp_buf _error_handler;
-	#endif
-
-		void throw_error(const char* message)
-		{
-			_result->error = message;
-			_result->offset = _lexer.current_pos() - _query;
-
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			longjmp(_error_handler, 1);
-		#else
-			throw xpath_exception(*_result);
-		#endif
-		}
-
-		void throw_error_oom()
-        {
-        #ifdef PUGIXML_NO_EXCEPTIONS
-            throw_error("Out of memory");
-        #else
-            throw std::bad_alloc();
-        #endif
-        }
-
-		void* alloc_node()
-		{
-			void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
-
-			if (!result) throw_error_oom();
-
-			return result;
-		}
-
-		const char_t* alloc_string(const xpath_lexer_string& value)
-		{
-			if (value.begin)
-			{
-				size_t length = static_cast<size_t>(value.end - value.begin);
-
-				char_t* c = static_cast<char_t*>(_alloc->allocate_nothrow((length + 1) * sizeof(char_t)));
-				if (!c) throw_error_oom();
-
-				memcpy(c, value.begin, length * sizeof(char_t));
-				c[length] = 0;
-
-				return c;
-			}
-			else return 0;
-		}
-
-		xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
-		{
-			assert(argc <= 1);
-
-			if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
-
-			return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]);
-		}
-
-		xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
-		{
-			switch (name.begin[0])
-			{
-			case 'b':
-				if (name == PUGIXML_TEXT("boolean") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
-					
-				break;
-			
-			case 'c':
-				if (name == PUGIXML_TEXT("count") && argc == 1)
-				{
-					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
-					return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
-				}
-				else if (name == PUGIXML_TEXT("contains") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("concat") && argc >= 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
-					
-				break;
-			
-			case 'f':
-				if (name == PUGIXML_TEXT("false") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
-				else if (name == PUGIXML_TEXT("floor") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
-					
-				break;
-			
-			case 'i':
-				if (name == PUGIXML_TEXT("id") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
-					
-				break;
-			
-			case 'l':
-				if (name == PUGIXML_TEXT("last") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
-				else if (name == PUGIXML_TEXT("lang") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
-				else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
-					return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
-			
-				break;
-			
-			case 'n':
-				if (name == PUGIXML_TEXT("name") && argc <= 1)
-					return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
-				else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
-					return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
-				else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("not") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
-				else if (name == PUGIXML_TEXT("number") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
-			
-				break;
-			
-			case 'p':
-				if (name == PUGIXML_TEXT("position") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
-				
-				break;
-			
-			case 'r':
-				if (name == PUGIXML_TEXT("round") && argc == 1)
-					return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
-
-				break;
-			
-			case 's':
-				if (name == PUGIXML_TEXT("string") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
-				else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
-					return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_string, args[0]);
-				else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
-					return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
-					return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("sum") && argc == 1)
-				{
-					if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
-					return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
-				}
-
-				break;
-			
-			case 't':
-				if (name == PUGIXML_TEXT("translate") && argc == 3)
-					return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
-				else if (name == PUGIXML_TEXT("true") && argc == 0)
-					return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
-					
-				break;
-			}
-
-			throw_error("Unrecognized function or wrong parameter count");
-
-			return 0;
-		}
-
-		axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
-		{
-			specified = true;
-
-			switch (name.begin[0])
-			{
-			case 'a':
-				if (name == PUGIXML_TEXT("ancestor"))
-					return axis_ancestor;
-				else if (name == PUGIXML_TEXT("ancestor-or-self"))
-					return axis_ancestor_or_self;
-				else if (name == PUGIXML_TEXT("attribute"))
-					return axis_attribute;
-				
-				break;
-			
-			case 'c':
-				if (name == PUGIXML_TEXT("child"))
-					return axis_child;
-				
-				break;
-			
-			case 'd':
-				if (name == PUGIXML_TEXT("descendant"))
-					return axis_descendant;
-				else if (name == PUGIXML_TEXT("descendant-or-self"))
-					return axis_descendant_or_self;
-				
-				break;
-			
-			case 'f':
-				if (name == PUGIXML_TEXT("following"))
-					return axis_following;
-				else if (name == PUGIXML_TEXT("following-sibling"))
-					return axis_following_sibling;
-				
-				break;
-			
-			case 'n':
-				if (name == PUGIXML_TEXT("namespace"))
-					return axis_namespace;
-				
-				break;
-			
-			case 'p':
-				if (name == PUGIXML_TEXT("parent"))
-					return axis_parent;
-				else if (name == PUGIXML_TEXT("preceding"))
-					return axis_preceding;
-				else if (name == PUGIXML_TEXT("preceding-sibling"))
-					return axis_preceding_sibling;
-				
-				break;
-			
-			case 's':
-				if (name == PUGIXML_TEXT("self"))
-					return axis_self;
-				
-				break;
-			}
-
-			specified = false;
-			return axis_child;
-		}
-
-		nodetest_t parse_node_test_type(const xpath_lexer_string& name)
-		{
-			switch (name.begin[0])
-			{
-			case 'c':
-				if (name == PUGIXML_TEXT("comment"))
-					return nodetest_type_comment;
-
-				break;
-
-			case 'n':
-				if (name == PUGIXML_TEXT("node"))
-					return nodetest_type_node;
-
-				break;
-
-			case 'p':
-				if (name == PUGIXML_TEXT("processing-instruction"))
-					return nodetest_type_pi;
-
-				break;
-
-			case 't':
-				if (name == PUGIXML_TEXT("text"))
-					return nodetest_type_text;
-
-				break;
-			}
-
-			return nodetest_none;
-		}
-
-	    // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
-	    xpath_ast_node* parse_primary_expression()
-	    {
-	    	switch (_lexer.current())
-	    	{
-	    	case lex_var_ref:
-	    	{
-				xpath_lexer_string name = _lexer.contents();
-
-				if (!_variables)
-					throw_error("Unknown variable: variable set is not provided");
-
-				xpath_variable* var = get_variable(_variables, name.begin, name.end);
-
-				if (!var)
-					throw_error("Unknown variable: variable set does not contain the given name");
-
-				_lexer.next();
-
-	    		return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
-			}
-
-			case lex_open_brace:
-			{
-				_lexer.next();
-
-				xpath_ast_node* n = parse_expression();
-
-				if (_lexer.current() != lex_close_brace)
-					throw_error("Unmatched braces");
-
-				_lexer.next();
-
-				return n;
-			}
-
-			case lex_quoted_string:
-			{
-				const char_t* value = alloc_string(_lexer.contents());
-
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
-				_lexer.next();
-
-				return n;
-			}
-
-			case lex_number:
-			{
-				double value = 0;
-
-				if (!convert_string_to_number(_lexer.contents().begin, _lexer.contents().end, &value))
-					throw_error_oom();
-
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
-				_lexer.next();
-
-				return n;
-			}
-
-			case lex_string:
-			{
-				xpath_ast_node* args[2] = {0};
-				size_t argc = 0;
-				
-				xpath_lexer_string function = _lexer.contents();
-				_lexer.next();
-				
-				xpath_ast_node* last_arg = 0;
-				
-				if (_lexer.current() != lex_open_brace)
-					throw_error("Unrecognized function call");
-				_lexer.next();
-
-				if (_lexer.current() != lex_close_brace)
-					args[argc++] = parse_expression();
-
-				while (_lexer.current() != lex_close_brace)
-				{
-					if (_lexer.current() != lex_comma)
-						throw_error("No comma between function arguments");
-					_lexer.next();
-					
-					xpath_ast_node* n = parse_expression();
-					
-					if (argc < 2) args[argc] = n;
-					else last_arg->set_next(n);
-
-					argc++;
-					last_arg = n;
-				}
-				
-				_lexer.next();
-
-				return parse_function(function, argc, args);
-			}
-
-	    	default:
-	    		throw_error("Unrecognizable primary expression");
-
-	    		return 0;
-	    	}
-	    }
-	    
-	    // FilterExpr ::= PrimaryExpr | FilterExpr Predicate
-	    // Predicate ::= '[' PredicateExpr ']'
-	    // PredicateExpr ::= Expr
-	    xpath_ast_node* parse_filter_expression()
-	    {
-	    	xpath_ast_node* n = parse_primary_expression();
-
-	    	while (_lexer.current() == lex_open_square_brace)
-	    	{
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_expression();
-
-				if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set");
-
-				bool posinv = expr->rettype() != xpath_type_number && expr->is_posinv();
-
-	    		n = new (alloc_node()) xpath_ast_node(posinv ? ast_filter_posinv : ast_filter, xpath_type_node_set, n, expr);
-
-	    		if (_lexer.current() != lex_close_square_brace)
-	    			throw_error("Unmatched square brace");
-	    	
-	    		_lexer.next();
-	    	}
-	    	
-	    	return n;
-	    }
-	    
-	    // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
-	    // AxisSpecifier ::= AxisName '::' | '@'?
-	    // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
-	    // NameTest ::= '*' | NCName ':' '*' | QName
-	    // AbbreviatedStep ::= '.' | '..'
-	    xpath_ast_node* parse_step(xpath_ast_node* set)
-	    {
-			if (set && set->rettype() != xpath_type_node_set)
-				throw_error("Step has to be applied to node set");
-
-			bool axis_specified = false;
-			axis_t axis = axis_child; // implied child axis
-
-			if (_lexer.current() == lex_axis_attribute)
-			{
-				axis = axis_attribute;
-				axis_specified = true;
-				
-				_lexer.next();
-			}
-			else if (_lexer.current() == lex_dot)
-			{
-				_lexer.next();
-				
-				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
-			}
-			else if (_lexer.current() == lex_double_dot)
-			{
-				_lexer.next();
-				
-				return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
-			}
-	    
-			nodetest_t nt_type = nodetest_none;
-			xpath_lexer_string nt_name;
-			
-			if (_lexer.current() == lex_string)
-			{
-				// node name test
-				nt_name = _lexer.contents();
-				_lexer.next();
-
-				// was it an axis name?
-				if (_lexer.current() == lex_double_colon)
-				{
-					// parse axis name
-					if (axis_specified) throw_error("Two axis specifiers in one step");
-
-					axis = parse_axis_name(nt_name, axis_specified);
-
-					if (!axis_specified) throw_error("Unknown axis");
-
-					// read actual node test
-					_lexer.next();
-
-					if (_lexer.current() == lex_multiply)
-					{
-						nt_type = nodetest_all;
-						nt_name = xpath_lexer_string();
-						_lexer.next();
-					}
-					else if (_lexer.current() == lex_string)
-					{
-						nt_name = _lexer.contents();
-						_lexer.next();
-					}
-					else throw_error("Unrecognized node test");
-				}
-				
-				if (nt_type == nodetest_none)
-				{
-					// node type test or processing-instruction
-					if (_lexer.current() == lex_open_brace)
-					{
-						_lexer.next();
-						
-						if (_lexer.current() == lex_close_brace)
-						{
-							_lexer.next();
-
-							nt_type = parse_node_test_type(nt_name);
-
-							if (nt_type == nodetest_none) throw_error("Unrecognized node type");
-							
-							nt_name = xpath_lexer_string();
-						}
-						else if (nt_name == PUGIXML_TEXT("processing-instruction"))
-						{
-							if (_lexer.current() != lex_quoted_string)
-								throw_error("Only literals are allowed as arguments to processing-instruction()");
-						
-							nt_type = nodetest_pi;
-							nt_name = _lexer.contents();
-							_lexer.next();
-							
-							if (_lexer.current() != lex_close_brace)
-								throw_error("Unmatched brace near processing-instruction()");
-							_lexer.next();
-						}
-						else
-							throw_error("Unmatched brace near node type test");
-
-					}
-					// QName or NCName:*
-					else
-					{
-						if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
-						{
-							nt_name.end--; // erase *
-							
-							nt_type = nodetest_all_in_namespace;
-						}
-						else nt_type = nodetest_name;
-					}
-				}
-			}
-			else if (_lexer.current() == lex_multiply)
-			{
-				nt_type = nodetest_all;
-				_lexer.next();
-			}
-			else throw_error("Unrecognized node test");
-			
-			xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
-			
-			xpath_ast_node* last = 0;
-			
-			while (_lexer.current() == lex_open_square_brace)
-			{
-				_lexer.next();
-				
-				xpath_ast_node* expr = parse_expression();
-
-				xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, xpath_type_node_set, expr);
-				
-				if (_lexer.current() != lex_close_square_brace)
-	    			throw_error("Unmatched square brace");
-				_lexer.next();
-				
-				if (last) last->set_next(pred);
-				else n->set_right(pred);
-				
-				last = pred;
-			}
-			
-			return n;
-	    }
-	    
-	    // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
-	    xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
-	    {
-			xpath_ast_node* n = parse_step(set);
-			
-			while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
-			{
-				lexeme_t l = _lexer.current();
-				_lexer.next();
-
-				if (l == lex_double_slash)
-					n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
-				
-				n = parse_step(n);
-			}
-			
-			return n;
-	    }
-	    
-	    // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
-	    // AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
-	    xpath_ast_node* parse_location_path()
-	    {
-			if (_lexer.current() == lex_slash)
-			{
-				_lexer.next();
-				
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
-
-				// relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
-				lexeme_t l = _lexer.current();
-
-				if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply)
-					return parse_relative_location_path(n);
-				else
-					return n;
-			}
-			else if (_lexer.current() == lex_double_slash)
-			{
-				_lexer.next();
-				
-				xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
-				n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
-				
-				return parse_relative_location_path(n);
-			}
-
-			// else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1
-			return parse_relative_location_path(0);
-	    }
-	    
-	    // PathExpr ::= LocationPath
-	    //				| FilterExpr
-	    //				| FilterExpr '/' RelativeLocationPath
-	    //				| FilterExpr '//' RelativeLocationPath
-	    xpath_ast_node* parse_path_expression()
-	    {
-			// Clarification.
-			// PathExpr begins with either LocationPath or FilterExpr.
-			// FilterExpr begins with PrimaryExpr
-			// PrimaryExpr begins with '$' in case of it being a variable reference,
-			// '(' in case of it being an expression, string literal, number constant or
-			// function call.
-
-			if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || 
-				_lexer.current() == lex_quoted_string || _lexer.current() == lex_number ||
-				_lexer.current() == lex_string)
-	    	{
-	    		if (_lexer.current() == lex_string)
-	    		{
-	    			// This is either a function call, or not - if not, we shall proceed with location path
-	    			const char_t* state = _lexer.state();
-	    			
-					while (IS_CHARTYPE(*state, ct_space)) ++state;
-	    			
-	    			if (*state != '(') return parse_location_path();
-
-					// This looks like a function call; however this still can be a node-test. Check it.
-					if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path();
-	    		}
-	    		
-	    		xpath_ast_node* n = parse_filter_expression();
-
-	    		if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
-	    		{
-					lexeme_t l = _lexer.current();
-	    			_lexer.next();
-	    			
-					if (l == lex_double_slash)
-					{
-						if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set");
-
-						n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
-					}
-	
-	    			// select from location path
-	    			return parse_relative_location_path(n);
-	    		}
-
-	    		return n;
-	    	}
-	    	else return parse_location_path();
-	    }
-
-	    // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
-	    xpath_ast_node* parse_union_expression()
-	    {
-	    	xpath_ast_node* n = parse_path_expression();
-
-	    	while (_lexer.current() == lex_union)
-	    	{
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_union_expression();
-
-				if (n->rettype() != xpath_type_node_set || expr->rettype() != xpath_type_node_set)
-					throw_error("Union operator has to be applied to node sets");
-
-	    		n = new (alloc_node()) xpath_ast_node(ast_op_union, xpath_type_node_set, n, expr);
-	    	}
-
-	    	return n;
-	    }
-
-	    // UnaryExpr ::= UnionExpr | '-' UnaryExpr
-	    xpath_ast_node* parse_unary_expression()
-	    {
-	    	if (_lexer.current() == lex_minus)
-	    	{
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_unary_expression();
-
-	    		return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr);
-	    	}
-	    	else return parse_union_expression();
-	    }
-	    
-	    // MultiplicativeExpr ::= UnaryExpr
-	    //						  | MultiplicativeExpr '*' UnaryExpr
-	    //						  | MultiplicativeExpr 'div' UnaryExpr
-	    //						  | MultiplicativeExpr 'mod' UnaryExpr
-	    xpath_ast_node* parse_multiplicative_expression()
-	    {
-	    	xpath_ast_node* n = parse_unary_expression();
-
-	    	while (_lexer.current() == lex_multiply || (_lexer.current() == lex_string &&
-	    		   (_lexer.contents() == PUGIXML_TEXT("mod") || _lexer.contents() == PUGIXML_TEXT("div"))))
-	    	{
-	    		ast_type_t op = _lexer.current() == lex_multiply ? ast_op_multiply :
-	    			_lexer.contents().begin[0] == 'd' ? ast_op_divide : ast_op_mod;
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_unary_expression();
-
-	    		n = new (alloc_node()) xpath_ast_node(op, xpath_type_number, n, expr);
-	    	}
-
-	    	return n;
-	    }
-
-	    // AdditiveExpr ::= MultiplicativeExpr
-	    //					| AdditiveExpr '+' MultiplicativeExpr
-	    //					| AdditiveExpr '-' MultiplicativeExpr
-	    xpath_ast_node* parse_additive_expression()
-	    {
-	    	xpath_ast_node* n = parse_multiplicative_expression();
-
-	    	while (_lexer.current() == lex_plus || _lexer.current() == lex_minus)
-	    	{
-	    		lexeme_t l = _lexer.current();
-
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_multiplicative_expression();
-
-	    		n = new (alloc_node()) xpath_ast_node(l == lex_plus ? ast_op_add : ast_op_subtract, xpath_type_number, n, expr);
-	    	}
-
-	    	return n;
-	    }
-
-	    // RelationalExpr ::= AdditiveExpr
-	    //					  | RelationalExpr '<' AdditiveExpr
-	    //					  | RelationalExpr '>' AdditiveExpr
-	    //					  | RelationalExpr '<=' AdditiveExpr
-	    //					  | RelationalExpr '>=' AdditiveExpr
-	    xpath_ast_node* parse_relational_expression()
-	    {
-	    	xpath_ast_node* n = parse_additive_expression();
-
-	    	while (_lexer.current() == lex_less || _lexer.current() == lex_less_or_equal || 
-	    		   _lexer.current() == lex_greater || _lexer.current() == lex_greater_or_equal)
-	    	{
-	    		lexeme_t l = _lexer.current();
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_additive_expression();
-
-	    		n = new (alloc_node()) xpath_ast_node(l == lex_less ? ast_op_less : l == lex_greater ? ast_op_greater :
-	    						l == lex_less_or_equal ? ast_op_less_or_equal : ast_op_greater_or_equal, xpath_type_boolean, n, expr);
-	    	}
-
-	    	return n;
-	    }
-	    
-	    // EqualityExpr ::= RelationalExpr
-	    //					| EqualityExpr '=' RelationalExpr
-	    //					| EqualityExpr '!=' RelationalExpr
-	    xpath_ast_node* parse_equality_expression()
-	    {
-	    	xpath_ast_node* n = parse_relational_expression();
-
-	    	while (_lexer.current() == lex_equal || _lexer.current() == lex_not_equal)
-	    	{
-	    		lexeme_t l = _lexer.current();
-
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_relational_expression();
-
-	    		n = new (alloc_node()) xpath_ast_node(l == lex_equal ? ast_op_equal : ast_op_not_equal, xpath_type_boolean, n, expr);
-	    	}
-
-	    	return n;
-	    }
-	    
-	    // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
-	    xpath_ast_node* parse_and_expression()
-	    {
-	    	xpath_ast_node* n = parse_equality_expression();
-
-	    	while (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("and"))
-	    	{
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_equality_expression();
-
-	    		n = new (alloc_node()) xpath_ast_node(ast_op_and, xpath_type_boolean, n, expr);
-	    	}
-
-	    	return n;
-	    }
-
-	    // OrExpr ::= AndExpr | OrExpr 'or' AndExpr
-	    xpath_ast_node* parse_or_expression()
-	    {
-	    	xpath_ast_node* n = parse_and_expression();
-
-	    	while (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("or"))
-	    	{
-	    		_lexer.next();
-
-				xpath_ast_node* expr = parse_and_expression();
-
-	    		n = new (alloc_node()) xpath_ast_node(ast_op_or, xpath_type_boolean, n, expr);
-	    	}
-
-	    	return n;
-	    }
-		
-		// Expr ::= OrExpr
-		xpath_ast_node* parse_expression()
-		{
-			return parse_or_expression();
-		}
-
-		xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
-		{
-		}
-
-		xpath_ast_node* parse()
-		{
-			xpath_ast_node* result = parse_expression();
-			
-			if (_lexer.current() != lex_eof)
-			{
-				// there are still unparsed tokens left, error
-				throw_error("Incorrect query");
-			}
-			
-			return result;
-		}
-
-		static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
-		{
-			xpath_parser parser(query, variables, alloc, result);
-
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			int error = setjmp(parser._error_handler);
-
-			return (error == 0) ? parser.parse() : 0;
-		#else
-			return parser.parse();
-		#endif
-		}
-	};
-
-    struct xpath_query_impl
-    {
-		static xpath_query_impl* create()
-		{
-			void* memory = global_allocate(sizeof(xpath_query_impl));
-
-            return new (memory) xpath_query_impl();
-		}
-
-		static void destroy(void* ptr)
-		{
-			if (!ptr) return;
-			
-			// free all allocated pages
-			static_cast<xpath_query_impl*>(ptr)->alloc.release();
-
-			// free allocator memory (with the first page)
-			global_deallocate(ptr);
-		}
-
-        xpath_query_impl(): root(0), alloc(&block)
-        {
-            block.next = 0;
-        }
-
-        xpath_ast_node* root;
-        xpath_allocator alloc;
-        xpath_memory_block block;
-    };
-
-	xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
-	{
-		if (!impl) return xpath_string();
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return xpath_string();
-	#endif
-
-		xpath_context c(n, 1, 1);
-
-		return impl->root->eval_string(c, sd.stack);
-	}
-}
-
-namespace pugi
-{
-#ifndef PUGIXML_NO_EXCEPTIONS
-	xpath_exception::xpath_exception(const xpath_parse_result& result): _result(result)
-	{
-		assert(result.error);
-	}
-	
-	const char* xpath_exception::what() const throw()
-	{
-		return _result.error;
-	}
-
-	const xpath_parse_result& xpath_exception::result() const
-	{
-		return _result;
-	}
-#endif
-	
-	xpath_node::xpath_node()
-	{
-	}
-		
-	xpath_node::xpath_node(const xml_node& node): _node(node)
-	{
-	}
-		
-	xpath_node::xpath_node(const xml_attribute& attribute, const xml_node& parent): _node(attribute ? parent : xml_node()), _attribute(attribute)
-	{
-	}
-
-	xml_node xpath_node::node() const
-	{
-		return _attribute ? xml_node() : _node;
-	}
-		
-	xml_attribute xpath_node::attribute() const
-	{
-		return _attribute;
-	}
-	
-	xml_node xpath_node::parent() const
-	{
-		return _attribute ? _node : _node.parent();
-	}
-
-	xpath_node::operator xpath_node::unspecified_bool_type() const
-	{
-		return (_node || _attribute) ? &xpath_node::_node : 0;
-	}
-	
-	bool xpath_node::operator!() const
-	{
-		return !(_node || _attribute);
-	}
-
-	bool xpath_node::operator==(const xpath_node& n) const
-	{
-		return _node == n._node && _attribute == n._attribute;
-	}
-	
-	bool xpath_node::operator!=(const xpath_node& n) const
-	{
-		return _node != n._node || _attribute != n._attribute;
-	}
-
-#ifdef __BORLANDC__
-	bool operator&&(const xpath_node& lhs, bool rhs)
-	{
-		return (bool)lhs && rhs;
-	}
-
-	bool operator||(const xpath_node& lhs, bool rhs)
-	{
-		return (bool)lhs || rhs;
-	}
-#endif
-
-	void xpath_node_set::_assign(const_iterator begin, const_iterator end)
-	{
-		assert(begin <= end);
-
-		size_t size = static_cast<size_t>(end - begin);
-
-		if (size <= 1)
-		{
-			// deallocate old buffer
-			if (_begin != &_storage) global_deallocate(_begin);
-
-			// use internal buffer
-			if (begin != end) _storage = *begin;
-
-			_begin = &_storage;
-			_end = &_storage + size;
-		}
-		else
-		{
-			// make heap copy
-			xpath_node* storage = static_cast<xpath_node*>(global_allocate(size * sizeof(xpath_node)));
-
-			if (!storage)
-			{
-			#ifdef PUGIXML_NO_EXCEPTIONS
-				return;
-			#else
-				throw std::bad_alloc();
-			#endif
-			}
-
-			memcpy(storage, begin, size * sizeof(xpath_node));
-			
-			// deallocate old buffer
-			if (_begin != &_storage) global_deallocate(_begin);
-
-			// finalize
-			_begin = storage;
-			_end = storage + size;
-		}
-	}
-
-	xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage)
-	{
-	}
-
-	xpath_node_set::xpath_node_set(const_iterator begin, const_iterator end, type_t type): _type(type), _begin(&_storage), _end(&_storage)
-	{
-		_assign(begin, end);
-	}
-
-	xpath_node_set::~xpath_node_set()
-	{
-		if (_begin != &_storage) global_deallocate(_begin);
-	}
-		
-	xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage)
-	{
-		_assign(ns._begin, ns._end);
-	}
-	
-	xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
-	{
-		if (this == &ns) return *this;
-		
-		_type = ns._type;
-		_assign(ns._begin, ns._end);
-
-		return *this;
-	}
-
-	xpath_node_set::type_t xpath_node_set::type() const
-	{
-		return _type;
-	}
-		
-	size_t xpath_node_set::size() const
-	{
-		return _end - _begin;
-	}
-		
-	bool xpath_node_set::empty() const
-	{
-		return _begin == _end;
-	}
-		
-	const xpath_node& xpath_node_set::operator[](size_t index) const
-	{
-		assert(index < size());
-		return _begin[index];
-	}
-
-	xpath_node_set::const_iterator xpath_node_set::begin() const
-	{
-		return _begin;
-	}
-		
-	xpath_node_set::const_iterator xpath_node_set::end() const
-	{
-		return _end;
-	}
-	
-	void xpath_node_set::sort(bool reverse)
-	{
-		_type = xpath_sort(_begin, _end, _type, reverse);
-	}
-
-	xpath_node xpath_node_set::first() const
-	{
-		return xpath_first(_begin, _end, _type);
-	}
-
-    xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0)
-    {
-    }
-
-    xpath_parse_result::operator bool() const
-    {
-        return error == 0;
-    }
-	const char* xpath_parse_result::description() const
-	{
-		return error ? error : "No error";
-	}
-
-	xpath_variable::xpath_variable()
-    {
-    }
-
-	const char_t* xpath_variable::name() const
-	{
-		switch (_type)
-		{
-		case xpath_type_node_set:
-			return static_cast<const xpath_variable_node_set*>(this)->name;
-
-		case xpath_type_number:
-			return static_cast<const xpath_variable_number*>(this)->name;
-
-		case xpath_type_string:
-			return static_cast<const xpath_variable_string*>(this)->name;
-
-		case xpath_type_boolean:
-			return static_cast<const xpath_variable_boolean*>(this)->name;
-
-		default:
-			assert(!"Invalid variable type");
-			return 0;
-		}
-	}
-
-	xpath_value_type xpath_variable::type() const
-	{
-		return _type;
-	}
-
-	bool xpath_variable::get_boolean() const
-	{
-		return (_type == xpath_type_boolean) ? static_cast<const xpath_variable_boolean*>(this)->value : false;
-	}
-
-	double xpath_variable::get_number() const
-	{
-		return (_type == xpath_type_number) ? static_cast<const xpath_variable_number*>(this)->value : gen_nan();
-	}
-
-	const char_t* xpath_variable::get_string() const
-	{
-		const char_t* value = (_type == xpath_type_string) ? static_cast<const xpath_variable_string*>(this)->value : 0;
-		return value ? value : PUGIXML_TEXT("");
-	}
-
-	const xpath_node_set& xpath_variable::get_node_set() const
-	{
-		return (_type == xpath_type_node_set) ? static_cast<const xpath_variable_node_set*>(this)->value : dummy_node_set;
-	}
-
-	bool xpath_variable::set(bool value)
-	{
-		if (_type != xpath_type_boolean) return false;
-
-		static_cast<xpath_variable_boolean*>(this)->value = value;
-		return true;
-	}
-
-	bool xpath_variable::set(double value)
-	{
-		if (_type != xpath_type_number) return false;
-
-		static_cast<xpath_variable_number*>(this)->value = value;
-		return true;
-	}
-
-	bool xpath_variable::set(const char_t* value)
-	{
-		if (_type != xpath_type_string) return false;
-
-		xpath_variable_string* var = static_cast<xpath_variable_string*>(this);
-
-		// duplicate string
-		size_t size = (strlength(value) + 1) * sizeof(char_t);
-
-		char_t* copy = static_cast<char_t*>(global_allocate(size));
-		if (!copy) return false;
-
-		memcpy(copy, value, size);
-
-		// replace old string
-		if (var->value) global_deallocate(var->value);
-		var->value = copy;
-
-		return true;
-	}
-
-	bool xpath_variable::set(const xpath_node_set& value)
-	{
-		if (_type != xpath_type_node_set) return false;
-
-		static_cast<xpath_variable_node_set*>(this)->value = value;
-		return true;
-	}
-
-	xpath_variable_set::xpath_variable_set()
-	{
-		for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) _data[i] = 0;
-	}
-
-	xpath_variable_set::~xpath_variable_set()
-	{
-		for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
-		{
-			xpath_variable* var = _data[i];
-
-			while (var)
-			{
-				xpath_variable* next = var->_next;
-
-				delete_xpath_variable(var->_type, var);
-
-				var = next;
-			}
-		}
-	}
-
-	xpath_variable* xpath_variable_set::find(const char_t* name) const
-	{
-		const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
-		size_t hash = hash_string(name) % hash_size;
-
-		// look for existing variable
-		for (xpath_variable* var = _data[hash]; var; var = var->_next)
-			if (strequal(var->name(), name))
-				return var;
-
-		return 0;
-	}
-
-	xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
-	{
-		const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
-		size_t hash = hash_string(name) % hash_size;
-
-		// look for existing variable
-		for (xpath_variable* var = _data[hash]; var; var = var->_next)
-			if (strequal(var->name(), name))
-				return var->type() == type ? var : 0;
-
-		// add new variable
-		xpath_variable* result = new_xpath_variable(type, name);
-
-		if (result)
-		{
-			result->_type = type;
-			result->_next = _data[hash];
-
-			_data[hash] = result;
-		}
-
-		return result;
-	}
-
-	bool xpath_variable_set::set(const char_t* name, bool value)
-	{
-		xpath_variable* var = add(name, xpath_type_boolean);
-		return var ? var->set(value) : false;
-	}
-
-	bool xpath_variable_set::set(const char_t* name, double value)
-	{
-		xpath_variable* var = add(name, xpath_type_number);
-		return var ? var->set(value) : false;
-	}
-
-	bool xpath_variable_set::set(const char_t* name, const char_t* value)
-	{
-		xpath_variable* var = add(name, xpath_type_string);
-		return var ? var->set(value) : false;
-	}
-
-	bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
-	{
-		xpath_variable* var = add(name, xpath_type_node_set);
-		return var ? var->set(value) : false;
-	}
-
-	xpath_variable* xpath_variable_set::get(const char_t* name)
-	{
-		return find(name);
-	}
-
-	const xpath_variable* xpath_variable_set::get(const char_t* name) const
-	{
-		return find(name);
-	}
-
-	xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
-	{
-		xpath_query_impl* impl = xpath_query_impl::create();
-
-		if (!impl)
-		{
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			_result.error = "Out of memory";
-        #else
-			throw std::bad_alloc();
-		#endif
-		}
-		else
-		{
-			buffer_holder impl_holder(impl, xpath_query_impl::destroy);
-
-			impl->root = xpath_parser::parse(query, variables, &impl->alloc, &_result);
-
-			if (impl->root)
-			{
-                _impl = static_cast<xpath_query_impl*>(impl_holder.release());
-				_result.error = 0;
-			}
-		}
-	}
-
-	xpath_query::~xpath_query()
-	{
-		xpath_query_impl::destroy(_impl);
-	}
-
-	xpath_value_type xpath_query::return_type() const
-	{
-		if (!_impl) return xpath_type_none;
-
-		return static_cast<xpath_query_impl*>(_impl)->root->rettype();
-	}
-
-	bool xpath_query::evaluate_boolean(const xpath_node& n) const
-	{
-		if (!_impl) return false;
-		
-		xpath_context c(n, 1, 1);
-		xpath_stack_data sd;
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return false;
-	#endif
-		
-		return static_cast<xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
-	}
-	
-	double xpath_query::evaluate_number(const xpath_node& n) const
-	{
-		if (!_impl) return gen_nan();
-		
-		xpath_context c(n, 1, 1);
-		xpath_stack_data sd;
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return gen_nan();
-	#endif
-
-		return static_cast<xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
-	}
-
-#ifndef PUGIXML_NO_STL
-	string_t xpath_query::evaluate_string(const xpath_node& n) const
-	{
-		xpath_stack_data sd;
-
-		return evaluate_string_impl(static_cast<xpath_query_impl*>(_impl), n, sd).c_str();
-	}
-#endif
-
-	size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
-	{
-		xpath_stack_data sd;
-
-		xpath_string r = evaluate_string_impl(static_cast<xpath_query_impl*>(_impl), n, sd);
-
-		size_t full_size = r.length() + 1;
-		
-		if (capacity > 0)
-        {
-            size_t size = (full_size < capacity) ? full_size : capacity;
-            assert(size > 0);
-
-            memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t));
-            buffer[size - 1] = 0;
-        }
-		
-		return full_size;
-	}
-
-	xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
-	{
-		if (!_impl) return xpath_node_set();
-
-        xpath_ast_node* root = static_cast<xpath_query_impl*>(_impl)->root;
-
-		if (root->rettype() != xpath_type_node_set)
-		{
-		#ifdef PUGIXML_NO_EXCEPTIONS
-			return xpath_node_set();
-		#else
-			xpath_parse_result result;
-			result.error = "Expression does not evaluate to node set";
-
-			throw xpath_exception(result);
-		#endif
-		}
-		
-		xpath_context c(n, 1, 1);
-		xpath_stack_data sd;
-
-	#ifdef PUGIXML_NO_EXCEPTIONS
-		if (setjmp(sd.error_handler)) return xpath_node_set();
-	#endif
-
-		xpath_node_set_raw r = root->eval_node_set(c, sd.stack);
-
-		return xpath_node_set(r.begin(), r.end(), r.type());
-	}
-
-	const xpath_parse_result& xpath_query::result() const
-	{
-		return _result;
-	}
-
-	xpath_query::operator xpath_query::unspecified_bool_type() const
-	{
-		return _impl ? &xpath_query::_impl : 0;
-	}
-
-	bool xpath_query::operator!() const
-	{
-		return !_impl;
-	}
-
-	xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
-	{
-		xpath_query q(query, variables);
-		return select_single_node(q);
-	}
-
-	xpath_node xml_node::select_single_node(const xpath_query& query) const
-	{
-		xpath_node_set s = query.evaluate_node_set(*this);
-		return s.empty() ? xpath_node() : s.first();
-	}
-
-	xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
-	{
-		xpath_query q(query, variables);
-		return select_nodes(q);
-	}
-
-	xpath_node_set xml_node::select_nodes(const xpath_query& query) const
-	{
-		return query.evaluate_node_set(*this);
-	}
-}
-
-#endif
-
-/**
- * Copyright (c) 2006-2010 Arseny Kapoulkine
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
diff --git a/external/pugixml/src/pugixml.hpp b/external/pugixml/src/pugixml.hpp
deleted file mode 100644
index 78959ac..0000000
--- a/external/pugixml/src/pugixml.hpp
+++ /dev/null
@@ -1,1131 +0,0 @@
-/**
- * pugixml parser - version 1.0
- * --------------------------------------------------------
- * Copyright (C) 2006-2010, by Arseny Kapoulkine (arseny.kapoulkine at gmail.com)
- * Report bugs and download new versions at http://pugixml.org/
- *
- * This library is distributed under the MIT License. See notice at the end
- * of this file.
- *
- * This work is based on the pugxml parser, which is:
- * Copyright (C) 2003, by Kristen Wegner (kristen at tima.net)
- */
-
-#ifndef HEADER_PUGIXML_HPP
-#define HEADER_PUGIXML_HPP
-
-#include "pugiconfig.hpp"
-
-#ifndef PUGIXML_NO_STL
-namespace std
-{
-	struct bidirectional_iterator_tag;
-
-#ifdef __SUNPRO_CC
-	// Sun C++ compiler has a bug which forces template argument names in forward declarations to be the same as in actual definitions
-	template <class _T> class allocator;
-	template <class _charT> struct char_traits;
-	template <class _charT, class _Traits> class basic_istream;
-	template <class _charT, class _Traits> class basic_ostream;
-	template <class _charT, class _Traits, class _Allocator> class basic_string;
-#else
-	// Borland C++ compiler has a bug which forces template argument names in forward declarations to be the same as in actual definitions
-	template <class _Ty> class allocator;
-	template <class _Ty> struct char_traits;
-	template <class _Elem, class _Traits> class basic_istream;
-	template <class _Elem, class _Traits> class basic_ostream;
-	template <class _Elem, class _Traits, class _Ax> class basic_string;
-#endif
-
-	// Digital Mars compiler has a bug which requires a forward declaration for explicit instantiation (otherwise type selection is messed up later, producing link errors)
-	// Also note that we have to declare char_traits as a class here, since it's defined that way
-#ifdef __DMC__
-	template <> class char_traits<char>;
-#endif
-}
-#endif
-
-// Macro for deprecated features
-#ifndef PUGIXML_DEPRECATED
-#	if defined(__GNUC__)
-#		define PUGIXML_DEPRECATED __attribute__((deprecated))
-#	elif defined(_MSC_VER) && _MSC_VER >= 1300
-#		define PUGIXML_DEPRECATED __declspec(deprecated)
-#	else
-#		define PUGIXML_DEPRECATED
-#	endif
-#endif
-
-// Include exception header for XPath
-#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
-#	include <exception>
-#endif
-
-// If no API is defined, assume default
-#ifndef PUGIXML_API
-#   define PUGIXML_API
-#endif
-
-// If no API for classes is defined, assume default
-#ifndef PUGIXML_CLASS
-#   define PUGIXML_CLASS PUGIXML_API
-#endif
-
-// If no API for functions is defined, assume default
-#ifndef PUGIXML_FUNCTION
-#   define PUGIXML_FUNCTION PUGIXML_API
-#endif
-
-#include <stddef.h>
-
-// Character interface macros
-#ifdef PUGIXML_WCHAR_MODE
-#	define PUGIXML_TEXT(t) L ## t
-#	define PUGIXML_CHAR wchar_t
-#else
-#	define PUGIXML_TEXT(t) t
-#	define PUGIXML_CHAR char
-#endif
-
-namespace pugi
-{
-	// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
-	typedef PUGIXML_CHAR char_t;
-
-#ifndef PUGIXML_NO_STL
-	// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
-	typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
-#endif
-}
-
-// The PugiXML namespace
-namespace pugi
-{
-	// Tree node types
-	enum xml_node_type
-	{
-		node_null,          // Empty (null) node handle
-		node_document,		// A document tree's absolute root
-		node_element,		// Element tag, i.e. '<node/>'
-		node_pcdata,		// Plain character data, i.e. 'text'
-		node_cdata,			// Character data, i.e. '<![CDATA[text]]>'
-		node_comment,		// Comment tag, i.e. '<!-- text -->'
-		node_pi,			// Processing instruction, i.e. '<?name?>'
-		node_declaration,	// Document declaration, i.e. '<?xml version="1.0"?>'
-        node_doctype        // Document type declaration, i.e. '<!DOCTYPE doc>'
-	};
-
-	// Parsing options
-
-	// Minimal parsing mode (equivalent to turning all other flags off).
-    // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
-	const unsigned int parse_minimal = 0x0000;
-
-	// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
-	const unsigned int parse_pi = 0x0001;
-
-	// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
-	const unsigned int parse_comments = 0x0002;
-
-	// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
-	const unsigned int parse_cdata = 0x0004;
-
-	// This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
-    // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
-	const unsigned int parse_ws_pcdata = 0x0008;
-
-	// This flag determines if character and entity references are expanded during parsing. This flag is on by default.
-	const unsigned int parse_escapes = 0x0010;
-
-	// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
-	const unsigned int parse_eol = 0x0020;
-	
- 	// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
- 	const unsigned int parse_wconv_attribute = 0x0040;
-
- 	// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
- 	const unsigned int parse_wnorm_attribute = 0x0080;
-	
-    // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
-	const unsigned int parse_declaration = 0x0100;
-
-    // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
-	const unsigned int parse_doctype = 0x0200;
-
-	// The default parsing mode.
-    // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
-    // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
-	const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
-
-    // The full parsing mode.
-    // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
-    // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
-    const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
-
-	// These flags determine the encoding of input data for XML document
-	enum xml_encoding
-	{
-		encoding_auto,      // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
-		encoding_utf8,      // UTF8 encoding
-		encoding_utf16_le,  // Little-endian UTF16
-		encoding_utf16_be,  // Big-endian UTF16
-		encoding_utf16,     // UTF16 with native endianness
-		encoding_utf32_le,  // Little-endian UTF32
-		encoding_utf32_be,  // Big-endian UTF32
-		encoding_utf32,     // UTF32 with native endianness
-		encoding_wchar      // The same encoding wchar_t has (either UTF16 or UTF32)
-	};
-
-	// Formatting flags
-	
-	// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
-	const unsigned int format_indent = 0x01;
-	
-	// Write encoding-specific BOM to the output stream. This flag is off by default.
-	const unsigned int format_write_bom = 0x02;
-
-	// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
-	const unsigned int format_raw = 0x04;
-	
-	// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
-	const unsigned int format_no_declaration = 0x08;
-
-	// The default set of formatting flags.
-    // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
-	const unsigned int format_default = format_indent;
-		
-	// Forward declarations
-	struct xml_attribute_struct;
-	struct xml_node_struct;
-
-	class xml_node_iterator;
-	class xml_attribute_iterator;
-
-	class xml_tree_walker;
-	
-	class xml_node;
-
-	#ifndef PUGIXML_NO_XPATH
-	class xpath_node;
-	class xpath_node_set;
-	class xpath_query;
-	class xpath_variable_set;
-	#endif
-
-	// Writer interface for node printing (see xml_node::print)
-	class PUGIXML_CLASS xml_writer
-	{
-	public:
-		virtual ~xml_writer() {}
-
-		// Write memory chunk into stream/file/whatever
-		virtual void write(const void* data, size_t size) = 0;
-	};
-
-	// xml_writer implementation for FILE*
-	class PUGIXML_CLASS xml_writer_file: public xml_writer
-	{
-	public:
-        // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
-		xml_writer_file(void* file);
-
-		virtual void write(const void* data, size_t size);
-
-	private:
-		void* file;
-	};
-
-	#ifndef PUGIXML_NO_STL
-	// xml_writer implementation for streams
-	class PUGIXML_CLASS xml_writer_stream: public xml_writer
-	{
-	public:
-        // Construct writer from an output stream object
-		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
-		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
-
-		virtual void write(const void* data, size_t size);
-
-	private:
-		std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
-		std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
-	};
-	#endif
-
-	// A light-weight handle for manipulating attributes in DOM tree
-	class PUGIXML_CLASS xml_attribute
-	{
-		friend class xml_attribute_iterator;
-		friend class xml_node;
-
-	private:
-		xml_attribute_struct* _attr;
-	
-    	typedef xml_attribute_struct* xml_attribute::*unspecified_bool_type;
-
-	public:
-        // Default constructor. Constructs an empty attribute.
-		xml_attribute();
-		
-        // Constructs attribute from internal pointer
-		explicit xml_attribute(xml_attribute_struct* attr);
-
-    	// Safe bool conversion operator
-    	operator unspecified_bool_type() const;
-
-    	// Borland C++ workaround
-    	bool operator!() const;
-
-		// Comparison operators (compares wrapped attribute pointers)
-		bool operator==(const xml_attribute& r) const;
-		bool operator!=(const xml_attribute& r) const;
-		bool operator<(const xml_attribute& r) const;
-		bool operator>(const xml_attribute& r) const;
-		bool operator<=(const xml_attribute& r) const;
-		bool operator>=(const xml_attribute& r) const;
-
-		// Check if attribute is empty
-		bool empty() const;
-
-		// Get attribute name/value, or "" if attribute is empty
-		const char_t* name() const;
-		const char_t* value() const;
-
-		// Get attribute value as a number, or 0 if conversion did not succeed or attribute is empty
-		int as_int() const;
-		unsigned int as_uint() const;
-		double as_double() const;
-		float as_float() const;
-
-        // Get attribute value as bool (returns true if first character is in '1tTyY' set), or false if attribute is empty
-		bool as_bool() const;
-
-        // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
-		bool set_name(const char_t* rhs);
-		bool set_value(const char_t* rhs);
-
-        // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
-		bool set_value(int rhs);
-		bool set_value(unsigned int rhs);
-		bool set_value(double rhs);
-		bool set_value(bool rhs);
-
-		// Set attribute value (equivalent to set_value without error checking)
-		xml_attribute& operator=(const char_t* rhs);
-		xml_attribute& operator=(int rhs);
-		xml_attribute& operator=(unsigned int rhs);
-		xml_attribute& operator=(double rhs);
-		xml_attribute& operator=(bool rhs);
-
-        // Get next/previous attribute in the attribute list of the parent node
-    	xml_attribute next_attribute() const;
-    	xml_attribute previous_attribute() const;
-
-        // Get hash value (unique for handles to the same object)
-        size_t hash_value() const;
-
-		// Get internal pointer
-		xml_attribute_struct* internal_object() const;
-	};
-
-#ifdef __BORLANDC__
-	// Borland C++ workaround
-	bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
-	bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
-#endif
-
-	// A light-weight handle for manipulating nodes in DOM tree
-	class PUGIXML_CLASS xml_node
-	{
-		friend class xml_attribute_iterator;
-		friend class xml_node_iterator;
-
-	protected:
-		xml_node_struct* _root;
-
-    	typedef xml_node_struct* xml_node::*unspecified_bool_type;
-
-	public:
-		// Default constructor. Constructs an empty node.
-		xml_node();
-
-        // Constructs node from internal pointer
-		explicit xml_node(xml_node_struct* p);
-
-    	// Safe bool conversion operator
-		operator unspecified_bool_type() const;
-
-		// Borland C++ workaround
-		bool operator!() const;
-	
-		// Comparison operators (compares wrapped node pointers)
-		bool operator==(const xml_node& r) const;
-		bool operator!=(const xml_node& r) const;
-		bool operator<(const xml_node& r) const;
-		bool operator>(const xml_node& r) const;
-		bool operator<=(const xml_node& r) const;
-		bool operator>=(const xml_node& r) const;
-
-		// Check if node is empty.
-		bool empty() const;
-
-		// Get node type
-		xml_node_type type() const;
-
-		// Get node name/value, or "" if node is empty or it has no name/value
-		const char_t* name() const;
-		const char_t* value() const;
-	
-		// Get attribute list
-		xml_attribute first_attribute() const;
-        xml_attribute last_attribute() const;
-
-        // Get children list
-		xml_node first_child() const;
-        xml_node last_child() const;
-
-        // Get next/previous sibling in the children list of the parent node
-		xml_node next_sibling() const;
-		xml_node previous_sibling() const;
-		
-        // Get parent node
-		xml_node parent() const;
-
-		// Get root of DOM tree this node belongs to
-		xml_node root() const;
-
-		// Get child, attribute or next/previous sibling with the specified name
-		xml_node child(const char_t* name) const;
-		xml_attribute attribute(const char_t* name) const;
-		xml_node next_sibling(const char_t* name) const;
-		xml_node previous_sibling(const char_t* name) const;
-
-		// Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
-		const char_t* child_value() const;
-
-		// Get child value of child with specified name. Equivalent to child(name).child_value().
-		const char_t* child_value(const char_t* name) const;
-
-		// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
-		bool set_name(const char_t* rhs);
-		bool set_value(const char_t* rhs);
-		
-		// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
-		xml_attribute append_attribute(const char_t* name);
-		xml_attribute prepend_attribute(const char_t* name);
-		xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
-		xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
-
-		// Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
-		xml_attribute append_copy(const xml_attribute& proto);
-		xml_attribute prepend_copy(const xml_attribute& proto);
-		xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
-		xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
-
-		// Add child node with specified type. Returns added node, or empty node on errors.
-		xml_node append_child(xml_node_type type = node_element);
-		xml_node prepend_child(xml_node_type type = node_element);
-		xml_node insert_child_after(xml_node_type type, const xml_node& node);
-		xml_node insert_child_before(xml_node_type type, const xml_node& node);
-
-		// Add child element with specified name. Returns added node, or empty node on errors.
-		xml_node append_child(const char_t* name);
-		xml_node prepend_child(const char_t* name);
-		xml_node insert_child_after(const char_t* name, const xml_node& node);
-		xml_node insert_child_before(const char_t* name, const xml_node& node);
-
-		// Add a copy of the specified node as a child. Returns added node, or empty node on errors.
-		xml_node append_copy(const xml_node& proto);
-		xml_node prepend_copy(const xml_node& proto);
-		xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
-		xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
-
-		// Remove specified attribute
-		bool remove_attribute(const xml_attribute& a);
-		bool remove_attribute(const char_t* name);
-
-		// Remove specified child
-		bool remove_child(const xml_node& n);
-		bool remove_child(const char_t* name);
-
-		// Find attribute using predicate. Returns first attribute for which predicate returned true.
-		template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
-		{
-			if (!_root) return xml_attribute();
-			
-			for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
-				if (pred(attrib))
-					return attrib;
-		
-			return xml_attribute();
-		}
-
-		// Find child node using predicate. Returns first child for which predicate returned true.
-		template <typename Predicate> xml_node find_child(Predicate pred) const
-		{
-			if (!_root) return xml_node();
-	
-			for (xml_node node = first_child(); node; node = node.next_sibling())
-				if (pred(node))
-					return node;
-        
-	        return xml_node();
-		}
-
-		// Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
-		template <typename Predicate> xml_node find_node(Predicate pred) const
-		{
-			if (!_root) return xml_node();
-
-			xml_node cur = first_child();
-			
-			while (cur._root && cur._root != _root)
-			{
-				if (pred(cur)) return cur;
-
-				if (cur.first_child()) cur = cur.first_child();
-				else if (cur.next_sibling()) cur = cur.next_sibling();
-				else
-				{
-					while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
-
-					if (cur._root != _root) cur = cur.next_sibling();
-				}
-			}
-
-			return xml_node();
-		}
-
-		// Find child node by attribute name/value
-		xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
-		xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
-
-	#ifndef PUGIXML_NO_STL
-		// Get the absolute node path from root as a text string.
-		string_t path(char_t delimiter = '/') const;
-	#endif
-
-		// Search for a node by path consisting of node names and . or .. elements.
-		xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
-
-		// Recursively traverse subtree with xml_tree_walker
-		bool traverse(xml_tree_walker& walker);
-	
-	#ifndef PUGIXML_NO_XPATH
-		// Select single node by evaluating XPath query. Returns first node from the resulting node set.
-		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
-		xpath_node select_single_node(const xpath_query& query) const;
-
-		// Select node set by evaluating XPath query
-		xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
-		xpath_node_set select_nodes(const xpath_query& query) const;
-	#endif
-		
-		// Print subtree using a writer object
-		void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
-
-	#ifndef PUGIXML_NO_STL
-		// Print subtree to stream
-		void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
-		void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
-	#endif
-
-		// Child nodes iterators
-		typedef xml_node_iterator iterator;
-
-		iterator begin() const;
-		iterator end() const;
-
-		// Attribute iterators
-		typedef xml_attribute_iterator attribute_iterator;
-
-		attribute_iterator attributes_begin() const;
-		attribute_iterator attributes_end() const;
-
-		// Get node offset in parsed file/string (in char_t units) for debugging purposes
-		ptrdiff_t offset_debug() const;
-
-        // Get hash value (unique for handles to the same object)
-        size_t hash_value() const;
-
-		// Get internal pointer
-		xml_node_struct* internal_object() const;
-	};
-
-#ifdef __BORLANDC__
-	// Borland C++ workaround
-	bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
-	bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
-#endif
-
-	// Child node iterator (a bidirectional iterator over a collection of xml_node)
-	class PUGIXML_CLASS xml_node_iterator
-	{
-		friend class xml_node;
-
-	private:
-		xml_node _wrap;
-		xml_node _parent;
-
-		xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
-
-	public:
-		// Iterator traits
-		typedef ptrdiff_t difference_type;
-		typedef xml_node value_type;
-		typedef xml_node* pointer;
-		typedef xml_node& reference;
-
-	#ifndef PUGIXML_NO_STL
-		typedef std::bidirectional_iterator_tag iterator_category;
-	#endif
-
-        // Default constructor
-		xml_node_iterator();
-
-        // Construct an iterator which points to the specified node
-		xml_node_iterator(const xml_node& node);
-
-        // Iterator operators
-		bool operator==(const xml_node_iterator& rhs) const;
-		bool operator!=(const xml_node_iterator& rhs) const;
-
-		xml_node& operator*();
-		xml_node* operator->();
-
-		const xml_node_iterator& operator++();
-		xml_node_iterator operator++(int);
-
-		const xml_node_iterator& operator--();
-		xml_node_iterator operator--(int);
-	};
-
-	// Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
-	class PUGIXML_CLASS xml_attribute_iterator
-	{
-		friend class xml_node;
-
-	private:
-		xml_attribute _wrap;
-		xml_node _parent;
-
-		xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
-
-	public:
-		// Iterator traits
-		typedef ptrdiff_t difference_type;
-		typedef xml_attribute value_type;
-		typedef xml_attribute* pointer;
-		typedef xml_attribute& reference;
-
-	#ifndef PUGIXML_NO_STL
-		typedef std::bidirectional_iterator_tag iterator_category;
-	#endif
-
-        // Default constructor
-		xml_attribute_iterator();
-
-        // Construct an iterator which points to the specified attribute
-		xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
-
-		// Iterator operators
-		bool operator==(const xml_attribute_iterator& rhs) const;
-		bool operator!=(const xml_attribute_iterator& rhs) const;
-
-		xml_attribute& operator*();
-		xml_attribute* operator->();
-
-		const xml_attribute_iterator& operator++();
-		xml_attribute_iterator operator++(int);
-
-		const xml_attribute_iterator& operator--();
-		xml_attribute_iterator operator--(int);
-	};
-
-	// Abstract tree walker class (see xml_node::traverse)
-	class PUGIXML_CLASS xml_tree_walker
-	{
-		friend class xml_node;
-
-	private:
-		int _depth;
-	
-	protected:
-		// Get current traversal depth
-		int depth() const;
-	
-	public:
-		xml_tree_walker();
-		virtual ~xml_tree_walker();
-
-		// Callback that is called when traversal begins
-		virtual bool begin(xml_node& node);
-
-		// Callback that is called for each node traversed
-		virtual bool for_each(xml_node& node) = 0;
-
-		// Callback that is called when traversal ends
-		virtual bool end(xml_node& node);
-	};
-
-	// Parsing status, returned as part of xml_parse_result object
-	enum xml_parse_status
-	{
-		status_ok = 0,              // No error
-
-		status_file_not_found,      // File was not found during load_file()
-		status_io_error,            // Error reading from file/stream
-		status_out_of_memory,       // Could not allocate memory
-		status_internal_error,      // Internal error occurred
-
-		status_unrecognized_tag,    // Parser could not determine tag type
-
-		status_bad_pi,              // Parsing error occurred while parsing document declaration/processing instruction
-		status_bad_comment,         // Parsing error occurred while parsing comment
-		status_bad_cdata,           // Parsing error occurred while parsing CDATA section
-		status_bad_doctype,         // Parsing error occurred while parsing document type declaration
-		status_bad_pcdata,          // Parsing error occurred while parsing PCDATA section
-		status_bad_start_element,   // Parsing error occurred while parsing start element tag
-		status_bad_attribute,       // Parsing error occurred while parsing element attribute
-		status_bad_end_element,     // Parsing error occurred while parsing end element tag
-		status_end_element_mismatch // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
-	};
-
-	// Parsing result
-	struct PUGIXML_CLASS xml_parse_result
-	{
-		// Parsing status (see xml_parse_status)
-		xml_parse_status status;
-
-		// Last parsed offset (in char_t units from start of input data)
-		ptrdiff_t offset;
-
-		// Source document encoding
-		xml_encoding encoding;
-
-        // Default constructor, initializes object to failed state
-		xml_parse_result();
-
-		// Cast to bool operator
-		operator bool() const;
-
-		// Get error description
-		const char* description() const;
-	};
-
-	// Document class (DOM tree root)
-	class PUGIXML_CLASS xml_document: public xml_node
-	{
-	private:
-		char_t* _buffer;
-
-		char _memory[192];
-		
-		// Non-copyable semantics
-		xml_document(const xml_document&);
-		const xml_document& operator=(const xml_document&);
-
-		void create();
-		void destroy();
-
-		xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own);
-
-	public:
-		// Default constructor, makes empty document
-		xml_document();
-
-		// Destructor, invalidates all node/attribute handles to this document
-		~xml_document();
-
-        // Removes all nodes, leaving the empty document
-		void reset();
-
-        // Removes all nodes, then copies the entire contents of the specified document
-		void reset(const xml_document& proto);
-
-	#ifndef PUGIXML_NO_STL
-		// Load document from stream.
-		xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
-		xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
-	#endif
-
-		// Load document from zero-terminated string. No encoding conversions are applied.
-		xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
-
-		// Load document from file
-		xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
-		xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
-
-		// Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
-		xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
-
-		// Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
-        // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
-		xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
-
-		// Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
-        // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
-		xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
-
-		// Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
-		void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
-
-	#ifndef PUGIXML_NO_STL
-		// Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
-		void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
-		void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
-	#endif
-
-		// Save XML to file
-		bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
-		bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
-
-        // Get document element
-        xml_node document_element() const;
-	};
-
-#ifndef PUGIXML_NO_XPATH
-	// XPath query return type
-	enum xpath_value_type
-	{
-		xpath_type_none,      // Unknown type (query failed to compile)
-		xpath_type_node_set,  // Node set (xpath_node_set)
-		xpath_type_number,    // Number
-		xpath_type_string,    // String
-		xpath_type_boolean    // Boolean
-	};
-
-    // XPath parsing result
-	struct PUGIXML_CLASS xpath_parse_result
-	{
-		// Error message (0 if no error)
-		const char* error;
-
-		// Last parsed offset (in char_t units from string start)
-		ptrdiff_t offset;
-
-        // Default constructor, initializes object to failed state
-		xpath_parse_result();
-
-		// Cast to bool operator
-		operator bool() const;
-
-		// Get error description
-		const char* description() const;
-	};
-
-	// A single XPath variable
-	class PUGIXML_CLASS xpath_variable
-	{
-		friend class xpath_variable_set;
-
-	protected:
-		xpath_value_type _type;
-		xpath_variable* _next;
-
-		xpath_variable();
-
-		// Non-copyable semantics
-		xpath_variable(const xpath_variable&);
-		xpath_variable& operator=(const xpath_variable&);
-		
-	public:
-        // Get variable name
-		const char_t* name() const;
-
-        // Get variable type
-		xpath_value_type type() const;
-
-        // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
-		bool get_boolean() const;
-		double get_number() const;
-		const char_t* get_string() const;
-		const xpath_node_set& get_node_set() const;
-
-        // Set variable value; no type conversion is performed, false is returned on type mismatch error
-		bool set(bool value);
-		bool set(double value);
-		bool set(const char_t* value);
-		bool set(const xpath_node_set& value);
-	};
-
-	// A set of XPath variables
-	class PUGIXML_CLASS xpath_variable_set
-	{
-	private:
-		xpath_variable* _data[64];
-
-		// Non-copyable semantics
-		xpath_variable_set(const xpath_variable_set&);
-		xpath_variable_set& operator=(const xpath_variable_set&);
-
-		xpath_variable* find(const char_t* name) const;
-
-	public:
-        // Default constructor/destructor
-		xpath_variable_set();
-		~xpath_variable_set();
-
-        // Add a new variable or get the existing one, if the types match
-		xpath_variable* add(const char_t* name, xpath_value_type type);
-
-        // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
-		bool set(const char_t* name, bool value);
-		bool set(const char_t* name, double value);
-		bool set(const char_t* name, const char_t* value);
-		bool set(const char_t* name, const xpath_node_set& value);
-
-        // Get existing variable by name
-		xpath_variable* get(const char_t* name);
-		const xpath_variable* get(const char_t* name) const;
-	};
-
-	// A compiled XPath query object
-	class PUGIXML_CLASS xpath_query
-	{
-	private:
-		void* _impl;
-		xpath_parse_result _result;
-
-    	typedef void* xpath_query::*unspecified_bool_type;
-
-		// Non-copyable semantics
-		xpath_query(const xpath_query&);
-		xpath_query& operator=(const xpath_query&);
-
-	public:
-        // Construct a compiled object from XPath expression.
-        // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
-		explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
-
-		// Destructor
-		~xpath_query();
-
-		// Get query expression return type
-		xpath_value_type return_type() const;
-		
-		// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
-        // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
-		bool evaluate_boolean(const xpath_node& n) const;
-		
-		// Evaluate expression as double value in the specified context; performs type conversion if necessary.
-        // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
-		double evaluate_number(const xpath_node& n) const;
-		
-	#ifndef PUGIXML_NO_STL
-		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
-        // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
-		string_t evaluate_string(const xpath_node& n) const;
-	#endif
-		
-		// Evaluate expression as string value in the specified context; performs type conversion if necessary.
-        // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
-        // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
-        // If PUGIXML_NO_EXCEPTIONS is defined, returns empty  set instead.
-		size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
-
-		// Evaluate expression as node set in the specified context.
-        // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
-        // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
-		xpath_node_set evaluate_node_set(const xpath_node& n) const;
-
-		// Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
-		const xpath_parse_result& result() const;
-
-		// Safe bool conversion operator
-		operator unspecified_bool_type() const;
-
-    	// Borland C++ workaround
-		bool operator!() const;
-	};
-	
-	#ifndef PUGIXML_NO_EXCEPTIONS
-	// XPath exception class
-	class PUGIXML_CLASS xpath_exception: public std::exception
-	{
-	private:
-		xpath_parse_result _result;
-
-	public:
-		// Construct exception from parse result
-		explicit xpath_exception(const xpath_parse_result& result);
-
-		// Get error message
-		virtual const char* what() const throw();
-
-        // Get parse result
-		const xpath_parse_result& result() const;
-	};
-	#endif
-	
-	// XPath node class (either xml_node or xml_attribute)
-	class PUGIXML_CLASS xpath_node
-	{
-	private:
-		xml_node _node;
-		xml_attribute _attribute;
-	
-    	typedef xml_node xpath_node::*unspecified_bool_type;
-
-	public:
-		// Default constructor; constructs empty XPath node
-		xpath_node();
-		
-		// Construct XPath node from XML node/attribute
-		xpath_node(const xml_node& node);
-		xpath_node(const xml_attribute& attribute, const xml_node& parent);
-
-		// Get node/attribute, if any
-		xml_node node() const;
-		xml_attribute attribute() const;
-		
-		// Get parent of contained node/attribute
-		xml_node parent() const;
-
-    	// Safe bool conversion operator
-		operator unspecified_bool_type() const;
-		
-    	// Borland C++ workaround
-    	bool operator!() const;
-
-		// Comparison operators
-		bool operator==(const xpath_node& n) const;
-		bool operator!=(const xpath_node& n) const;
-	};
-
-#ifdef __BORLANDC__
-	// Borland C++ workaround
-	bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
-	bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
-#endif
-
-	// A fixed-size collection of XPath nodes
-	class PUGIXML_CLASS xpath_node_set
-	{
-	public:
-		// Collection type
-		enum type_t
-		{
-			type_unsorted,			// Not ordered
-			type_sorted,			// Sorted by document order (ascending)
-			type_sorted_reverse		// Sorted by document order (descending)
-		};
-		
-		// Constant iterator type
-		typedef const xpath_node* const_iterator;
-	
-		// Default constructor. Constructs empty set.
-		xpath_node_set();
-
-		// Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
-		xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
-
-		// Destructor
-		~xpath_node_set();
-		
-		// Copy constructor/assignment operator
-		xpath_node_set(const xpath_node_set& ns);
-		xpath_node_set& operator=(const xpath_node_set& ns);
-
-		// Get collection type
-		type_t type() const;
-		
-		// Get collection size
-		size_t size() const;
-
-        // Indexing operator
-		const xpath_node& operator[](size_t index) const;
-		
-		// Collection iterators
-		const_iterator begin() const;
-		const_iterator end() const;
-
-		// Sort the collection in ascending/descending order by document order
-		void sort(bool reverse = false);
-		
-		// Get first node in the collection by document order
-		xpath_node first() const;
-		
-		// Check if collection is empty
-		bool empty() const;
-    
-	private:
-		type_t _type;
-		
-		xpath_node _storage;
-		
-		xpath_node* _begin;
-		xpath_node* _end;
-
-		void _assign(const_iterator begin, const_iterator end);
-	};
-#endif
-
-#ifndef PUGIXML_NO_STL
-	// Convert wide string to UTF8
-	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
-	std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
-	
-	// Convert UTF8 to wide string
-	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
-	std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
-#endif
-
-	// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
-	typedef void* (*allocation_function)(size_t size);
-	
-	// Memory deallocation function interface
-    typedef void (*deallocation_function)(void* ptr);
-
-    // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
-    void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
-    
-    // Get current memory management functions
-    allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
-    deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
-}
-
-#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
-namespace std
-{
-	// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
-	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
-	std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
-}
-#endif
-
-#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
-namespace std
-{
-	// Workarounds for (non-standard) iterator category detection
-	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
-	std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
-}
-#endif
-
-#endif
-
-/**
- * Copyright (c) 2006-2010 Arseny Kapoulkine
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
diff --git a/external/tclap/Arg.h b/external/tclap/Arg.h
new file mode 100644
index 0000000..b28eef1
--- /dev/null
+++ b/external/tclap/Arg.h
@@ -0,0 +1,692 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/******************************************************************************
+ *
+ *  file:  Arg.h
+ *
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno .
+ *  All rights reverved.
+ *
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+
+#ifndef TCLAP_ARGUMENT_H
+#define TCLAP_ARGUMENT_H
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#else
+#define HAVE_SSTREAM
+#endif
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <iomanip>
+#include <cstdio>
+
+#if defined(HAVE_SSTREAM)
+#include <sstream>
+typedef std::istringstream istringstream;
+#elif defined(HAVE_STRSTREAM)
+#include <strstream>
+typedef std::istrstream istringstream;
+#else
+#error "Need a stringstream (sstream or strstream) to compile!"
+#endif
+
+#include <tclap/ArgException.h>
+#include <tclap/Visitor.h>
+#include <tclap/CmdLineInterface.h>
+#include <tclap/ArgTraits.h>
+#include <tclap/StandardTraits.h>
+
+namespace TCLAP {
+
+/**
+ * A virtual base class that defines the essential data for all arguments.
+ * This class, or one of its existing children, must be subclassed to do
+ * anything.
+ */
+class Arg
+{
+	private:
+		/**
+		 * Prevent accidental copying.
+		 */
+		Arg(const Arg& rhs);
+
+		/**
+		 * Prevent accidental copying.
+		 */
+		Arg& operator=(const Arg& rhs);
+
+		/**
+		 * Indicates whether the rest of the arguments should be ignored.
+		 */
+		static bool& ignoreRestRef() { static bool ign = false; return ign; }
+
+		/**
+		 * The delimiter that separates an argument flag/name from the
+		 * value.
+		 */
+		static char& delimiterRef() { static char delim = ' '; return delim; }
+
+	protected:
+
+		/**
+		 * The single char flag used to identify the argument.
+		 * This value (preceded by a dash {-}), can be used to identify
+		 * an argument on the command line.  The _flag can be blank,
+		 * in fact this is how unlabeled args work.  Unlabeled args must
+		 * override appropriate functions to get correct handling. Note
+		 * that the _flag does NOT include the dash as part of the flag.
+		 */
+		std::string _flag;
+
+		/**
+		 * A single work namd indentifying the argument.
+		 * This value (preceded by two dashed {--}) can also be used
+		 * to identify an argument on the command line.  Note that the
+		 * _name does NOT include the two dashes as part of the _name. The
+		 * _name cannot be blank.
+		 */
+		std::string _name;
+
+		/**
+		 * Description of the argument.
+		 */
+		std::string _description;
+
+		/**
+		 * Indicating whether the argument is required.
+		 */
+		bool _required;
+
+		/**
+		 * Label to be used in usage description.  Normally set to
+		 * "required", but can be changed when necessary.
+		 */
+		std::string _requireLabel;
+
+		/**
+		 * Indicates whether a value is required for the argument.
+		 * Note that the value may be required but the argument/value
+		 * combination may not be, as specified by _required.
+		 */
+		bool _valueRequired;
+
+		/**
+		 * Indicates whether the argument has been set.
+		 * Indicates that a value on the command line has matched the
+		 * name/flag of this argument and the values have been set accordingly.
+		 */
+		bool _alreadySet;
+
+		/**
+		 * A pointer to a vistitor object.
+		 * The visitor allows special handling to occur as soon as the
+		 * argument is matched.  This defaults to NULL and should not
+		 * be used unless absolutely necessary.
+		 */
+		Visitor* _visitor;
+
+		/**
+		 * Whether this argument can be ignored, if desired.
+		 */
+		bool _ignoreable;
+
+		/**
+		 * Indicates that the arg was set as part of an XOR and not on the
+		 * command line.
+		 */
+		bool _xorSet;
+
+		bool _acceptsMultipleValues;
+
+		/**
+		 * Performs the special handling described by the Vistitor.
+		 */
+		void _checkWithVisitor() const;
+
+		/**
+		 * Primary constructor. YOU (yes you) should NEVER construct an Arg
+		 * directly, this is a base class that is extended by various children
+		 * that are meant to be used.  Use SwitchArg, ValueArg, MultiArg,
+		 * UnlabeledValueArg, or UnlabeledMultiArg instead.
+		 *
+		 * \param flag - The flag identifying the argument.
+		 * \param name - The name identifying the argument.
+		 * \param desc - The description of the argument, used in the usage.
+		 * \param req - Whether the argument is required.
+		 * \param valreq - Whether the a value is required for the argument.
+		 * \param v - The visitor checked by the argument. Defaults to NULL.
+		 */
+ 		Arg( const std::string& flag,
+			 const std::string& name,
+			 const std::string& desc,
+			 bool req,
+			 bool valreq,
+			 Visitor* v = NULL );
+
+	public:
+		/**
+		 * Destructor.
+		 */
+		virtual ~Arg();
+
+		/**
+		 * Adds this to the specified list of Args.
+		 * \param argList - The list to add this to.
+		 */
+		virtual void addToList( std::list<Arg*>& argList ) const;
+
+		/**
+		 * Begin ignoring arguments since the "--" argument was specified.
+		 */
+		static void beginIgnoring() { ignoreRestRef() = true; }
+
+		/**
+		 * Whether to ignore the rest.
+		 */
+		static bool ignoreRest() { return ignoreRestRef(); }
+
+		/**
+		 * The delimiter that separates an argument flag/name from the
+		 * value.
+		 */
+		static char delimiter() { return delimiterRef(); }
+
+		/**
+		 * The char used as a place holder when SwitchArgs are combined.
+		 * Currently set to the bell char (ASCII 7).
+		 */
+		static char blankChar() { return (char)7; }
+
+		/**
+		 * The char that indicates the beginning of a flag.  Defaults to '-', but
+		 * clients can define TCLAP_FLAGSTARTCHAR to override.
+		 */
+#ifndef TCLAP_FLAGSTARTCHAR
+#define TCLAP_FLAGSTARTCHAR '-'
+#endif
+		static char flagStartChar() { return TCLAP_FLAGSTARTCHAR; }
+
+		/**
+		 * The sting that indicates the beginning of a flag.  Defaults to "-", but
+		 * clients can define TCLAP_FLAGSTARTSTRING to override. Should be the same
+		 * as TCLAP_FLAGSTARTCHAR.
+		 */
+#ifndef TCLAP_FLAGSTARTSTRING
+#define TCLAP_FLAGSTARTSTRING "-"
+#endif
+		static const std::string flagStartString() { return TCLAP_FLAGSTARTSTRING; }
+
+		/**
+		 * The sting that indicates the beginning of a name.  Defaults to "--", but
+		 *  clients can define TCLAP_NAMESTARTSTRING to override.
+		 */
+#ifndef TCLAP_NAMESTARTSTRING
+#define TCLAP_NAMESTARTSTRING "--"
+#endif
+		static const std::string nameStartString() { return TCLAP_NAMESTARTSTRING; }
+
+		/**
+		 * The name used to identify the ignore rest argument.
+		 */
+		static const std::string ignoreNameString() { return "ignore_rest"; }
+
+		/**
+		 * Sets the delimiter for all arguments.
+		 * \param c - The character that delimits flags/names from values.
+		 */
+		static void setDelimiter( char c ) { delimiterRef() = c; }
+
+		/**
+		 * Pure virtual method meant to handle the parsing and value assignment
+		 * of the string on the command line.
+		 * \param i - Pointer the the current argument in the list.
+		 * \param args - Mutable list of strings. What is
+		 * passed in from main.
+		 */
+		virtual bool processArg(int *i, std::vector<std::string>& args) = 0;
+
+		/**
+		 * Operator ==.
+		 * Equality operator. Must be virtual to handle unlabeled args.
+		 * \param a - The Arg to be compared to this.
+		 */
+		virtual bool operator==(const Arg& a) const;
+
+		/**
+		 * Returns the argument flag.
+		 */
+		const std::string& getFlag() const;
+
+		/**
+		 * Returns the argument name.
+		 */
+		const std::string& getName() const;
+
+		/**
+		 * Returns the argument description.
+		 */
+		std::string getDescription() const;
+
+		/**
+		 * Indicates whether the argument is required.
+		 */
+		virtual bool isRequired() const;
+
+		/**
+		 * Sets _required to true. This is used by the XorHandler.
+		 * You really have no reason to ever use it.
+		 */
+		void forceRequired();
+
+		/**
+		 * Sets the _alreadySet value to true.  This is used by the XorHandler.
+		 * You really have no reason to ever use it.
+		 */
+		void xorSet();
+
+		/**
+		 * Indicates whether a value must be specified for argument.
+		 */
+		bool isValueRequired() const;
+
+		/**
+		 * Indicates whether the argument has already been set.  Only true
+		 * if the arg has been matched on the command line.
+		 */
+		bool isSet() const;
+
+		/**
+		 * Indicates whether the argument can be ignored, if desired.
+		 */
+		bool isIgnoreable() const;
+
+		/**
+		 * A method that tests whether a string matches this argument.
+		 * This is generally called by the processArg() method.  This
+		 * method could be re-implemented by a child to change how
+		 * arguments are specified on the command line.
+		 * \param s - The string to be compared to the flag/name to determine
+		 * whether the arg matches.
+		 */
+		virtual bool argMatches( const std::string& s ) const;
+
+		/**
+		 * Returns a simple string representation of the argument.
+		 * Primarily for debugging.
+		 */
+		virtual std::string toString() const;
+
+		/**
+		 * Returns a short ID for the usage.
+		 * \param valueId - The value used in the id.
+		 */
+		virtual std::string shortID( const std::string& valueId = "val" ) const;
+
+		/**
+		 * Returns a long ID for the usage.
+		 * \param valueId - The value used in the id.
+		 */
+		virtual std::string longID( const std::string& valueId = "val" ) const;
+
+		/**
+		 * Trims a value off of the flag.
+		 * \param flag - The string from which the flag and value will be
+		 * trimmed. Contains the flag once the value has been trimmed.
+		 * \param value - Where the value trimmed from the string will
+		 * be stored.
+		 */
+		virtual void trimFlag( std::string& flag, std::string& value ) const;
+
+		/**
+		 * Checks whether a given string has blank chars, indicating that
+		 * it is a combined SwitchArg.  If so, return true, otherwise return
+		 * false.
+		 * \param s - string to be checked.
+		 */
+		bool _hasBlanks( const std::string& s ) const;
+
+		/**
+		 * Sets the requireLabel. Used by XorHandler.  You shouldn't ever
+		 * use this.
+		 * \param s - Set the requireLabel to this value.
+		 */
+		void setRequireLabel( const std::string& s );
+
+		/**
+		 * Used for MultiArgs and XorHandler to determine whether args
+		 * can still be set.
+		 */
+		virtual bool allowMore();
+
+		/**
+		 * Used by output classes to determine whether an Arg accepts
+		 * multiple values.
+		 */
+		virtual bool acceptsMultipleValues();
+
+		/**
+		 * Clears the Arg object and allows it to be reused by new
+		 * command lines.
+		 */
+		 virtual void reset();
+};
+
+/**
+ * Typedef of an Arg list iterator.
+ */
+typedef std::list<Arg*>::iterator ArgListIterator;
+
+/**
+ * Typedef of an Arg vector iterator.
+ */
+typedef std::vector<Arg*>::iterator ArgVectorIterator;
+
+/**
+ * Typedef of a Visitor list iterator.
+ */
+typedef std::list<Visitor*>::iterator VisitorListIterator;
+
+/*
+ * Extract a value of type T from its string representation contained
+ * in strVal. The ValueLike parameter used to select the correct
+ * specialization of ExtractValue depending on the value traits of T.
+ * ValueLike traits use operator>> to assign the value from strVal.
+ */
+template<typename T> void
+ExtractValue(T &destVal, const std::string& strVal, ValueLike vl)
+{
+    static_cast<void>(vl); // Avoid warning about unused vl
+    std::istringstream is(strVal);
+
+    int valuesRead = 0;
+    while ( is.good() ) {
+	if ( is.peek() != EOF )
+#ifdef TCLAP_SETBASE_ZERO
+	    is >> std::setbase(0) >> destVal;
+#else
+	    is >> destVal;
+#endif
+	else
+	    break;
+
+	valuesRead++;
+    }
+
+    if ( is.fail() )
+	throw( ArgParseException("Couldn't read argument value "
+				 "from string '" + strVal + "'"));
+
+
+    if ( valuesRead > 1 )
+	throw( ArgParseException("More than one valid value parsed from "
+				 "string '" + strVal + "'"));
+
+}
+
+/*
+ * Extract a value of type T from its string representation contained
+ * in strVal. The ValueLike parameter used to select the correct
+ * specialization of ExtractValue depending on the value traits of T.
+ * StringLike uses assignment (operator=) to assign from strVal.
+ */
+template<typename T> void
+ExtractValue(T &destVal, const std::string& strVal, StringLike sl)
+{
+    static_cast<void>(sl); // Avoid warning about unused sl
+    SetString(destVal, strVal);
+}
+
+//////////////////////////////////////////////////////////////////////
+//BEGIN Arg.cpp
+//////////////////////////////////////////////////////////////////////
+
+inline Arg::Arg(const std::string& flag,
+         const std::string& name,
+         const std::string& desc,
+         bool req,
+         bool valreq,
+         Visitor* v) :
+  _flag(flag),
+  _name(name),
+  _description(desc),
+  _required(req),
+  _requireLabel("required"),
+  _valueRequired(valreq),
+  _alreadySet(false),
+  _visitor( v ),
+  _ignoreable(true),
+  _xorSet(false),
+  _acceptsMultipleValues(false)
+{
+	if ( _flag.length() > 1 )
+		throw(SpecificationException(
+				"Argument flag can only be one character long", toString() ) );
+
+	if ( _name != ignoreNameString() &&
+		 ( _flag == Arg::flagStartString() ||
+		   _flag == Arg::nameStartString() ||
+		   _flag == " " ) )
+		throw(SpecificationException("Argument flag cannot be either '" +
+							Arg::flagStartString() + "' or '" +
+							Arg::nameStartString() + "' or a space.",
+							toString() ) );
+
+	if ( ( _name.substr( 0, Arg::flagStartString().length() ) == Arg::flagStartString() ) ||
+		 ( _name.substr( 0, Arg::nameStartString().length() ) == Arg::nameStartString() ) ||
+		 ( _name.find( " ", 0 ) != std::string::npos ) )
+		throw(SpecificationException("Argument name begin with either '" +
+							Arg::flagStartString() + "' or '" +
+							Arg::nameStartString() + "' or space.",
+							toString() ) );
+
+}
+
+inline Arg::~Arg() { }
+
+inline std::string Arg::shortID( const std::string& valueId ) const
+{
+	std::string id = "";
+
+	if ( _flag != "" )
+		id = Arg::flagStartString() + _flag;
+	else
+		id = Arg::nameStartString() + _name;
+
+	if ( _valueRequired )
+		id += std::string( 1, Arg::delimiter() ) + "<" + valueId  + ">";
+
+	if ( !_required )
+		id = "[" + id + "]";
+
+	return id;
+}
+
+inline std::string Arg::longID( const std::string& valueId ) const
+{
+	std::string id = "";
+
+	if ( _flag != "" )
+	{
+		id += Arg::flagStartString() + _flag;
+
+		if ( _valueRequired )
+			id += std::string( 1, Arg::delimiter() ) + "<" + valueId + ">";
+
+		id += ",  ";
+	}
+
+	id += Arg::nameStartString() + _name;
+
+	if ( _valueRequired )
+		id += std::string( 1, Arg::delimiter() ) + "<" + valueId + ">";
+
+	return id;
+
+}
+
+inline bool Arg::operator==(const Arg& a) const
+{
+	if ( ( _flag != "" && _flag == a._flag ) || _name == a._name)
+		return true;
+	else
+		return false;
+}
+
+inline std::string Arg::getDescription() const
+{
+	std::string desc = "";
+	if ( _required )
+		desc = "(" + _requireLabel + ")  ";
+
+//	if ( _valueRequired )
+//		desc += "(value required)  ";
+
+	desc += _description;
+	return desc;
+}
+
+inline const std::string& Arg::getFlag() const { return _flag; }
+
+inline const std::string& Arg::getName() const { return _name; }
+
+inline bool Arg::isRequired() const { return _required; }
+
+inline bool Arg::isValueRequired() const { return _valueRequired; }
+
+inline bool Arg::isSet() const
+{
+	if ( _alreadySet && !_xorSet )
+		return true;
+	else
+		return false;
+}
+
+inline bool Arg::isIgnoreable() const { return _ignoreable; }
+
+inline void Arg::setRequireLabel( const std::string& s)
+{
+	_requireLabel = s;
+}
+
+inline bool Arg::argMatches( const std::string& argFlag ) const
+{
+	if ( ( argFlag == Arg::flagStartString() + _flag && _flag != "" ) ||
+	       argFlag == Arg::nameStartString() + _name )
+		return true;
+	else
+		return false;
+}
+
+inline std::string Arg::toString() const
+{
+	std::string s = "";
+
+	if ( _flag != "" )
+		s += Arg::flagStartString() + _flag + " ";
+
+	s += "(" + Arg::nameStartString() + _name + ")";
+
+	return s;
+}
+
+inline void Arg::_checkWithVisitor() const
+{
+	if ( _visitor != NULL )
+		_visitor->visit();
+}
+
+/**
+ * Implementation of trimFlag.
+ */
+inline void Arg::trimFlag(std::string& flag, std::string& value) const
+{
+	int stop = 0;
+	for ( int i = 0; static_cast<unsigned int>(i) < flag.length(); i++ )
+		if ( flag[i] == Arg::delimiter() )
+		{
+			stop = i;
+			break;
+		}
+
+	if ( stop > 1 )
+	{
+		value = flag.substr(stop+1);
+		flag = flag.substr(0,stop);
+	}
+
+}
+
+/**
+ * Implementation of _hasBlanks.
+ */
+inline bool Arg::_hasBlanks( const std::string& s ) const
+{
+	for ( int i = 1; static_cast<unsigned int>(i) < s.length(); i++ )
+		if ( s[i] == Arg::blankChar() )
+			return true;
+
+	return false;
+}
+
+inline void Arg::forceRequired()
+{
+	_required = true;
+}
+
+inline void Arg::xorSet()
+{
+	_alreadySet = true;
+	_xorSet = true;
+}
+
+/**
+ * Overridden by Args that need to added to the end of the list.
+ */
+inline void Arg::addToList( std::list<Arg*>& argList ) const
+{
+	argList.push_front( const_cast<Arg*>(this) );
+}
+
+inline bool Arg::allowMore()
+{
+	return false;
+}
+
+inline bool Arg::acceptsMultipleValues()
+{
+	return _acceptsMultipleValues;
+}
+
+inline void Arg::reset()
+{
+	_xorSet = false;
+	_alreadySet = false;
+}
+
+//////////////////////////////////////////////////////////////////////
+//END Arg.cpp
+//////////////////////////////////////////////////////////////////////
+
+} //namespace TCLAP
+
+#endif
+
diff --git a/external/tclap/ArgException.h b/external/tclap/ArgException.h
new file mode 100644
index 0000000..3411aa9
--- /dev/null
+++ b/external/tclap/ArgException.h
@@ -0,0 +1,200 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/****************************************************************************** 
+ * 
+ *  file:  ArgException.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_ARG_EXCEPTION_H
+#define TCLAP_ARG_EXCEPTION_H
+
+#include <string>
+#include <exception>
+
+namespace TCLAP {
+
+/**
+ * A simple class that defines an argument exception.  Should be caught
+ * whenever a CmdLine is created and parsed.
+ */
+class ArgException : public std::exception
+{
+	public:
+	
+		/**
+		 * Constructor.
+		 * \param text - The text of the exception.
+		 * \param id - The text identifying the argument source.
+		 * \param td - Text describing the type of ArgException it is.
+		 *
+		 */
+		ArgException( const std::string& text = "undefined exception", 
+					  const std::string& id = "undefined",
+					  const std::string& td = "Generic ArgException")
+			: std::exception(), 
+			  _errorText(text), 
+			  _argId( id ), 
+			  _typeDescription(td)
+		{ } 
+		
+		/**
+		 * Destructor.
+		 */
+		virtual ~ArgException() throw() { }
+
+		/**
+		 * Returns the error text.
+		 */
+		std::string error() const { return ( _errorText ); }
+
+		/**
+		 * Returns the argument id.
+		 */
+		std::string argId() const  
+		{ 
+			if ( _argId == "undefined" )
+				return " ";
+			else
+				return ( "Argument: " + _argId ); 
+		}
+
+		/**
+		 * Returns the arg id and error text. 
+		 */
+		const char* what() const throw() 
+		{
+			static std::string ex; 
+			ex = _argId + " -- " + _errorText;
+			return ex.c_str();
+		}
+
+		/**
+		 * Returns the type of the exception.  Used to explain and distinguish
+		 * between different child exceptions.
+		 */
+		std::string typeDescription() const
+		{
+			return _typeDescription; 
+		}
+
+
+	private:
+
+		/**
+		 * The text of the exception message.
+		 */
+		std::string _errorText;
+
+		/**
+		 * The argument related to this exception.
+		 */
+		std::string _argId;
+
+		/**
+		 * Describes the type of the exception.  Used to distinguish
+		 * between different child exceptions.
+		 */
+		std::string _typeDescription;
+
+};
+
+/**
+ * Thrown from within the child Arg classes when it fails to properly
+ * parse the argument it has been passed.
+ */
+class ArgParseException : public ArgException
+{ 
+	public:
+		/**
+		 * Constructor.
+		 * \param text - The text of the exception.
+		 * \param id - The text identifying the argument source 
+		 * of the exception.
+		 */
+		ArgParseException( const std::string& text = "undefined exception", 
+					       const std::string& id = "undefined" )
+			: ArgException( text, 
+			                id, 
+							std::string( "Exception found while parsing " ) + 
+							std::string( "the value the Arg has been passed." ))
+			{ }
+};
+
+/**
+ * Thrown from CmdLine when the arguments on the command line are not
+ * properly specified, e.g. too many arguments, required argument missing, etc.
+ */
+class CmdLineParseException : public ArgException
+{
+	public:
+		/**
+		 * Constructor.
+		 * \param text - The text of the exception.
+		 * \param id - The text identifying the argument source 
+		 * of the exception.
+		 */
+		CmdLineParseException( const std::string& text = "undefined exception", 
+					           const std::string& id = "undefined" )
+			: ArgException( text, 
+			                id,
+							std::string( "Exception found when the values ") +
+							std::string( "on the command line do not meet ") +
+							std::string( "the requirements of the defined ") +
+							std::string( "Args." ))
+		{ }
+};
+
+/**
+ * Thrown from Arg and CmdLine when an Arg is improperly specified, e.g. 
+ * same flag as another Arg, same name, etc.
+ */
+class SpecificationException : public ArgException
+{
+	public:
+		/**
+		 * Constructor.
+		 * \param text - The text of the exception.
+		 * \param id - The text identifying the argument source 
+		 * of the exception.
+		 */
+		SpecificationException( const std::string& text = "undefined exception",
+					            const std::string& id = "undefined" )
+			: ArgException( text, 
+			                id,
+							std::string("Exception found when an Arg object ")+
+							std::string("is improperly defined by the ") +
+							std::string("developer." )) 
+		{ }
+
+};
+
+class ExitException {
+public:
+	ExitException(int estat) : _estat(estat) {}
+
+	int getExitStatus() const { return _estat; }
+
+private:
+	int _estat;
+};
+
+} // namespace TCLAP
+
+#endif
+
diff --git a/external/tclap/ArgTraits.h b/external/tclap/ArgTraits.h
new file mode 100644
index 0000000..0b2c18f
--- /dev/null
+++ b/external/tclap/ArgTraits.h
@@ -0,0 +1,87 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/******************************************************************************
+ *
+ *  file:  ArgTraits.h
+ *
+ *  Copyright (c) 2007, Daniel Aarno, Michael E. Smoot .
+ *  All rights reverved.
+ *
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+// This is an internal tclap file, you should probably not have to
+// include this directly
+
+#ifndef TCLAP_ARGTRAITS_H
+#define TCLAP_ARGTRAITS_H
+
+namespace TCLAP {
+
+// We use two empty structs to get compile type specialization
+// function to work
+
+/**
+ * A value like argument value type is a value that can be set using
+ * operator>>. This is the default value type.
+ */
+struct ValueLike {
+    typedef ValueLike ValueCategory;
+	virtual ~ValueLike() {}
+};
+
+/**
+ * A string like argument value type is a value that can be set using
+ * operator=(string). Useful if the value type contains spaces which
+ * will be broken up into individual tokens by operator>>.
+ */
+struct StringLike {
+	virtual ~StringLike() {}
+};
+
+/**
+ * A class can inherit from this object to make it have string like
+ * traits. This is a compile time thing and does not add any overhead
+ * to the inheriting class.
+ */
+struct StringLikeTrait {
+    typedef StringLike ValueCategory;
+	virtual ~StringLikeTrait() {}
+};
+
+/**
+ * A class can inherit from this object to make it have value like
+ * traits. This is a compile time thing and does not add any overhead
+ * to the inheriting class.
+ */
+struct ValueLikeTrait {
+    typedef ValueLike ValueCategory;
+	virtual ~ValueLikeTrait() {}
+};
+
+/**
+ * Arg traits are used to get compile type specialization when parsing
+ * argument values. Using an ArgTraits you can specify the way that
+ * values gets assigned to any particular type during parsing. The two
+ * supported types are StringLike and ValueLike.
+ */
+template<typename T>
+struct ArgTraits {
+    typedef typename T::ValueCategory ValueCategory;
+	virtual ~ArgTraits() {}
+    //typedef ValueLike ValueCategory;
+};
+
+#endif
+
+} // namespace
diff --git a/external/tclap/COPYING b/external/tclap/COPYING
new file mode 100644
index 0000000..987be0c
--- /dev/null
+++ b/external/tclap/COPYING
@@ -0,0 +1,25 @@
+
+
+Copyright (c) 2003 Michael E. Smoot 
+
+Permission is hereby granted, free of charge, to any person 
+obtaining a copy of this software and associated documentation 
+files (the "Software"), to deal in the Software without restriction, 
+including without limitation the rights to use, copy, modify, merge, 
+publish, distribute, sublicense, and/or sell copies of the Software, 
+and to permit persons to whom the Software is furnished to do so, 
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be 
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
+THE SOFTWARE.
+
+
diff --git a/external/tclap/CmdLine.h b/external/tclap/CmdLine.h
new file mode 100644
index 0000000..0fec8d8
--- /dev/null
+++ b/external/tclap/CmdLine.h
@@ -0,0 +1,633 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/******************************************************************************
+ *
+ *  file:  CmdLine.h
+ *
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reverved.
+ *
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+#ifndef TCLAP_CMDLINE_H
+#define TCLAP_CMDLINE_H
+
+#include <tclap/SwitchArg.h>
+#include <tclap/MultiSwitchArg.h>
+#include <tclap/UnlabeledValueArg.h>
+#include <tclap/UnlabeledMultiArg.h>
+
+#include <tclap/XorHandler.h>
+#include <tclap/HelpVisitor.h>
+#include <tclap/VersionVisitor.h>
+#include <tclap/IgnoreRestVisitor.h>
+
+#include <tclap/CmdLineOutput.h>
+#include <tclap/StdOutput.h>
+
+#include <tclap/Constraint.h>
+#include <tclap/ValuesConstraint.h>
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <iomanip>
+#include <algorithm>
+#include <stdlib.h> // Needed for exit(), which isn't defined in some envs.
+
+namespace TCLAP {
+
+template<typename T> void DelPtr(T ptr)
+{
+	delete ptr;
+}
+
+template<typename C> void ClearContainer(C &c)
+{
+	typedef typename C::value_type value_type;
+	std::for_each(c.begin(), c.end(), DelPtr<value_type>);
+	c.clear();
+}
+
+
+/**
+ * The base class that manages the command line definition and passes
+ * along the parsing to the appropriate Arg classes.
+ */
+class CmdLine : public CmdLineInterface
+{
+	protected:
+
+		/**
+		 * The list of arguments that will be tested against the
+		 * command line.
+		 */
+		std::list<Arg*> _argList;
+
+		/**
+		 * The name of the program.  Set to argv[0].
+		 */
+		std::string _progName;
+
+		/**
+		 * A message used to describe the program.  Used in the usage output.
+		 */
+		std::string _message;
+
+		/**
+		 * The version to be displayed with the --version switch.
+		 */
+		std::string _version;
+
+		/**
+		 * The number of arguments that are required to be present on
+		 * the command line. This is set dynamically, based on the
+		 * Args added to the CmdLine object.
+		 */
+		int _numRequired;
+
+		/**
+		 * The character that is used to separate the argument flag/name
+		 * from the value.  Defaults to ' ' (space).
+		 */
+		char _delimiter;
+
+		/**
+		 * The handler that manages xoring lists of args.
+		 */
+		XorHandler _xorHandler;
+
+		/**
+		 * A list of Args to be explicitly deleted when the destructor
+		 * is called.  At the moment, this only includes the three default
+		 * Args.
+		 */
+		std::list<Arg*> _argDeleteOnExitList;
+
+		/**
+		 * A list of Visitors to be explicitly deleted when the destructor
+		 * is called.  At the moment, these are the Visitors created for the
+		 * default Args.
+		 */
+		std::list<Visitor*> _visitorDeleteOnExitList;
+
+		/**
+		 * Object that handles all output for the CmdLine.
+		 */
+		CmdLineOutput* _output;
+
+		/**
+		 * Should CmdLine handle parsing exceptions internally?
+		 */
+		bool _handleExceptions;
+
+		/**
+		 * Throws an exception listing the missing args.
+		 */
+		void missingArgsException();
+
+		/**
+		 * Checks whether a name/flag string entirely matches
+		 * the Arg::blankChar.  Used when multiple switches are combined
+		 * into a single argument.
+		 * \param s - The string to be checked.
+		 */
+		bool _emptyCombined(const std::string& s);
+
+		/**
+		 * Perform a delete ptr; operation on ptr when this object is deleted.
+		 */
+		void deleteOnExit(Arg* ptr);
+
+		/**
+		 * Perform a delete ptr; operation on ptr when this object is deleted.
+		 */
+		void deleteOnExit(Visitor* ptr);
+
+private:
+
+		/**
+		 * Prevent accidental copying.
+		 */
+		CmdLine(const CmdLine& rhs);
+		CmdLine& operator=(const CmdLine& rhs);
+
+		/**
+		 * Encapsulates the code common to the constructors
+		 * (which is all of it).
+		 */
+		void _constructor();
+
+
+		/**
+		 * Is set to true when a user sets the output object. We use this so
+		 * that we don't delete objects that are created outside of this lib.
+		 */
+		bool _userSetOutput;
+
+		/**
+		 * Whether or not to automatically create help and version switches.
+		 */
+		bool _helpAndVersion;
+
+	public:
+
+		/**
+		 * Command line constructor. Defines how the arguments will be
+		 * parsed.
+		 * \param message - The message to be used in the usage
+		 * output.
+		 * \param delimiter - The character that is used to separate
+		 * the argument flag/name from the value.  Defaults to ' ' (space).
+		 * \param version - The version number to be used in the
+		 * --version switch.
+		 * \param helpAndVersion - Whether or not to create the Help and
+		 * Version switches. Defaults to true.
+		 */
+		CmdLine(const std::string& message,
+				const char delimiter = ' ',
+				const std::string& version = "none",
+				bool helpAndVersion = true);
+
+		/**
+		 * Deletes any resources allocated by a CmdLine object.
+		 */
+		virtual ~CmdLine();
+
+		/**
+		 * Adds an argument to the list of arguments to be parsed.
+		 * \param a - Argument to be added.
+		 */
+		void add( Arg& a );
+
+		/**
+		 * An alternative add.  Functionally identical.
+		 * \param a - Argument to be added.
+		 */
+		void add( Arg* a );
+
+		/**
+		 * Add two Args that will be xor'd.  If this method is used, add does
+		 * not need to be called.
+		 * \param a - Argument to be added and xor'd.
+		 * \param b - Argument to be added and xor'd.
+		 */
+		void xorAdd( Arg& a, Arg& b );
+
+		/**
+		 * Add a list of Args that will be xor'd.  If this method is used,
+		 * add does not need to be called.
+		 * \param xors - List of Args to be added and xor'd.
+		 */
+		void xorAdd( std::vector<Arg*>& xors );
+
+		/**
+		 * Parses the command line.
+		 * \param argc - Number of arguments.
+		 * \param argv - Array of arguments.
+		 */
+		void parse(int argc, const char * const * argv);
+
+		/**
+		 * Parses the command line.
+		 * \param args - A vector of strings representing the args.
+		 * args[0] is still the program name.
+		 */
+		void parse(std::vector<std::string>& args);
+
+		/**
+		 *
+		 */
+		CmdLineOutput* getOutput();
+
+		/**
+		 *
+		 */
+		void setOutput(CmdLineOutput* co);
+
+		/**
+		 *
+		 */
+		std::string& getVersion();
+
+		/**
+		 *
+		 */
+		std::string& getProgramName();
+
+		/**
+		 *
+		 */
+		std::list<Arg*>& getArgList();
+
+		/**
+		 *
+		 */
+		XorHandler& getXorHandler();
+
+		/**
+		 *
+		 */
+		char getDelimiter();
+
+		/**
+		 *
+		 */
+		std::string& getMessage();
+
+		/**
+		 *
+		 */
+		bool hasHelpAndVersion();
+
+		/**
+		 * Disables or enables CmdLine's internal parsing exception handling.
+		 *
+		 * @param state Should CmdLine handle parsing exceptions internally?
+		 */
+		void setExceptionHandling(const bool state);
+
+		/**
+		 * Returns the current state of the internal exception handling.
+		 *
+		 * @retval true Parsing exceptions are handled internally.
+		 * @retval false Parsing exceptions are propagated to the caller.
+		 */
+		bool getExceptionHandling() const;
+
+		/**
+		 * Allows the CmdLine object to be reused.
+		 */
+		void reset();
+
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+//Begin CmdLine.cpp
+///////////////////////////////////////////////////////////////////////////////
+
+inline CmdLine::CmdLine(const std::string& m,
+                        char delim,
+                        const std::string& v,
+                        bool help )
+    :
+  _argList(std::list<Arg*>()),
+  _progName("not_set_yet"),
+  _message(m),
+  _version(v),
+  _numRequired(0),
+  _delimiter(delim),
+  _xorHandler(XorHandler()),
+  _argDeleteOnExitList(std::list<Arg*>()),
+  _visitorDeleteOnExitList(std::list<Visitor*>()),
+  _output(0),
+  _handleExceptions(true),
+  _userSetOutput(false),
+  _helpAndVersion(help)
+{
+	_constructor();
+}
+
+inline CmdLine::~CmdLine()
+{
+	ClearContainer(_argDeleteOnExitList);
+	ClearContainer(_visitorDeleteOnExitList);
+
+	if ( !_userSetOutput ) {
+		delete _output;
+		_output = 0;
+	}
+}
+
+inline void CmdLine::_constructor()
+{
+	_output = new StdOutput;
+
+	Arg::setDelimiter( _delimiter );
+
+	Visitor* v;
+
+	if ( _helpAndVersion )
+	{
+		v = new HelpVisitor( this, &_output );
+		SwitchArg* help = new SwitchArg("h","help",
+		                      "Displays usage information and exits.",
+		                      false, v);
+		add( help );
+		deleteOnExit(help);
+		deleteOnExit(v);
+
+		v = new VersionVisitor( this, &_output );
+		SwitchArg* vers = new SwitchArg("","version",
+		                      "Displays version information and exits.",
+		                      false, v);
+		add( vers );
+		deleteOnExit(vers);
+		deleteOnExit(v);
+	}
+
+	v = new IgnoreRestVisitor();
+	SwitchArg* ignore  = new SwitchArg(Arg::flagStartString(),
+	          Arg::ignoreNameString(),
+	          "Ignores the rest of the labeled arguments following this flag.",
+	          false, v);
+	add( ignore );
+	deleteOnExit(ignore);
+	deleteOnExit(v);
+}
+
+inline void CmdLine::xorAdd( std::vector<Arg*>& ors )
+{
+	_xorHandler.add( ors );
+
+	for (ArgVectorIterator it = ors.begin(); it != ors.end(); it++)
+	{
+		(*it)->forceRequired();
+		(*it)->setRequireLabel( "OR required" );
+		add( *it );
+	}
+}
+
+inline void CmdLine::xorAdd( Arg& a, Arg& b )
+{
+	std::vector<Arg*> ors;
+	ors.push_back( &a );
+	ors.push_back( &b );
+	xorAdd( ors );
+}
+
+inline void CmdLine::add( Arg& a )
+{
+	add( &a );
+}
+
+inline void CmdLine::add( Arg* a )
+{
+	for( ArgListIterator it = _argList.begin(); it != _argList.end(); it++ )
+		if ( *a == *(*it) )
+			throw( SpecificationException(
+			        "Argument with same flag/name already exists!",
+			        a->longID() ) );
+
+	a->addToList( _argList );
+
+	if ( a->isRequired() )
+		_numRequired++;
+}
+
+
+inline void CmdLine::parse(int argc, const char * const * argv)
+{
+		// this step is necessary so that we have easy access to
+		// mutable strings.
+		std::vector<std::string> args;
+		for (int i = 0; i < argc; i++)
+			args.push_back(argv[i]);
+
+		parse(args);
+}
+
+inline void CmdLine::parse(std::vector<std::string>& args)
+{
+	bool shouldExit = false;
+	int estat = 0;
+
+	try {
+		_progName = args.front();
+		args.erase(args.begin());
+
+		int requiredCount = 0;
+
+		for (int i = 0; static_cast<unsigned int>(i) < args.size(); i++) 
+		{
+			bool matched = false;
+			for (ArgListIterator it = _argList.begin();
+			     it != _argList.end(); it++) {
+				if ( (*it)->processArg( &i, args ) )
+				{
+					requiredCount += _xorHandler.check( *it );
+					matched = true;
+					break;
+				}
+			}
+
+			// checks to see if the argument is an empty combined
+			// switch and if so, then we've actually matched it
+			if ( !matched && _emptyCombined( args[i] ) )
+				matched = true;
+
+			if ( !matched && !Arg::ignoreRest() )
+				throw(CmdLineParseException("Couldn't find match "
+				                            "for argument",
+				                            args[i]));
+		}
+
+		if ( requiredCount < _numRequired )
+			missingArgsException();
+
+		if ( requiredCount > _numRequired )
+			throw(CmdLineParseException("Too many arguments!"));
+
+	} catch ( ArgException& e ) {
+		// If we're not handling the exceptions, rethrow.
+		if ( !_handleExceptions) {
+			throw;
+		}
+
+		try {
+			_output->failure(*this,e);
+		} catch ( ExitException &ee ) {
+			estat = ee.getExitStatus();
+			shouldExit = true;
+		}
+	} catch (ExitException &ee) {
+		// If we're not handling the exceptions, rethrow.
+		if ( !_handleExceptions) {
+			throw;
+		}
+
+		estat = ee.getExitStatus();
+		shouldExit = true;
+	}
+
+	if (shouldExit)
+		exit(estat);
+}
+
+inline bool CmdLine::_emptyCombined(const std::string& s)
+{
+	if ( s.length() > 0 && s[0] != Arg::flagStartChar() )
+		return false;
+
+	for ( int i = 1; static_cast<unsigned int>(i) < s.length(); i++ )
+		if ( s[i] != Arg::blankChar() )
+			return false;
+
+	return true;
+}
+
+inline void CmdLine::missingArgsException()
+{
+		int count = 0;
+
+		std::string missingArgList;
+		for (ArgListIterator it = _argList.begin(); it != _argList.end(); it++)
+		{
+			if ( (*it)->isRequired() && !(*it)->isSet() )
+			{
+				missingArgList += (*it)->getName();
+				missingArgList += ", ";
+				count++;
+			}
+		}
+		missingArgList = missingArgList.substr(0,missingArgList.length()-2);
+
+		std::string msg;
+		if ( count > 1 )
+			msg = "Required arguments missing: ";
+		else
+			msg = "Required argument missing: ";
+
+		msg += missingArgList;
+
+		throw(CmdLineParseException(msg));
+}
+
+inline void CmdLine::deleteOnExit(Arg* ptr)
+{
+	_argDeleteOnExitList.push_back(ptr);
+}
+
+inline void CmdLine::deleteOnExit(Visitor* ptr)
+{
+	_visitorDeleteOnExitList.push_back(ptr);
+}
+
+inline CmdLineOutput* CmdLine::getOutput()
+{
+	return _output;
+}
+
+inline void CmdLine::setOutput(CmdLineOutput* co)
+{
+	if ( !_userSetOutput )
+		delete _output;
+	_userSetOutput = true;
+	_output = co;
+}
+
+inline std::string& CmdLine::getVersion()
+{
+	return _version;
+}
+
+inline std::string& CmdLine::getProgramName()
+{
+	return _progName;
+}
+
+inline std::list<Arg*>& CmdLine::getArgList()
+{
+	return _argList;
+}
+
+inline XorHandler& CmdLine::getXorHandler()
+{
+	return _xorHandler;
+}
+
+inline char CmdLine::getDelimiter()
+{
+	return _delimiter;
+}
+
+inline std::string& CmdLine::getMessage()
+{
+	return _message;
+}
+
+inline bool CmdLine::hasHelpAndVersion()
+{
+	return _helpAndVersion;
+}
+
+inline void CmdLine::setExceptionHandling(const bool state)
+{
+	_handleExceptions = state;
+}
+
+inline bool CmdLine::getExceptionHandling() const
+{
+	return _handleExceptions;
+}
+
+inline void CmdLine::reset()
+{
+	for( ArgListIterator it = _argList.begin(); it != _argList.end(); it++ )
+		(*it)->reset();
+	
+	_progName.clear();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//End CmdLine.cpp
+///////////////////////////////////////////////////////////////////////////////
+
+
+
+} //namespace TCLAP
+#endif
diff --git a/external/tclap/CmdLineInterface.h b/external/tclap/CmdLineInterface.h
new file mode 100644
index 0000000..1b25e9b
--- /dev/null
+++ b/external/tclap/CmdLineInterface.h
@@ -0,0 +1,150 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  CmdLineInterface.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reverved.
+ *
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_COMMANDLINE_INTERFACE_H
+#define TCLAP_COMMANDLINE_INTERFACE_H
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <algorithm>
+
+
+namespace TCLAP {
+     
+class Arg;
+class CmdLineOutput;
+class XorHandler;
+
+/**
+ * The base class that manages the command line definition and passes
+ * along the parsing to the appropriate Arg classes.
+ */
+class CmdLineInterface
+{
+	public:
+
+		/**
+		 * Destructor
+		 */
+		virtual ~CmdLineInterface() {}
+
+		/**
+		 * Adds an argument to the list of arguments to be parsed.
+		 * \param a - Argument to be added. 
+		 */
+		virtual void add( Arg& a )=0;
+
+		/**
+		 * An alternative add.  Functionally identical.
+		 * \param a - Argument to be added. 
+		 */
+		virtual void add( Arg* a )=0;
+
+		/**
+		 * Add two Args that will be xor'd.  
+		 * If this method is used, add does
+		 * not need to be called.
+		 * \param a - Argument to be added and xor'd. 
+		 * \param b - Argument to be added and xor'd. 
+		 */
+		virtual void xorAdd( Arg& a, Arg& b )=0;
+
+		/**
+		 * Add a list of Args that will be xor'd.  If this method is used, 
+		 * add does not need to be called.
+		 * \param xors - List of Args to be added and xor'd. 
+		 */
+		virtual void xorAdd( std::vector<Arg*>& xors )=0;
+
+		/**
+		 * Parses the command line.
+		 * \param argc - Number of arguments.
+		 * \param argv - Array of arguments.
+		 */
+		virtual void parse(int argc, const char * const * argv)=0;
+
+        /**
+         * Parses the command line.
+         * \param args - A vector of strings representing the args. 
+         * args[0] is still the program name.
+         */
+        void parse(std::vector<std::string>& args);
+
+		/**
+		 * Returns the CmdLineOutput object.
+		 */
+		virtual CmdLineOutput* getOutput()=0;
+
+		/**
+		 * \param co - CmdLineOutput object that we want to use instead. 
+		 */
+		virtual void setOutput(CmdLineOutput* co)=0;
+
+		/**
+		 * Returns the version string.
+		 */
+		virtual std::string& getVersion()=0;
+
+		/**
+		 * Returns the program name string.
+		 */
+		virtual std::string& getProgramName()=0;
+
+		/**
+		 * Returns the argList. 
+		 */
+		virtual std::list<Arg*>& getArgList()=0;
+
+		/**
+		 * Returns the XorHandler. 
+		 */
+		virtual XorHandler& getXorHandler()=0;
+
+		/**
+		 * Returns the delimiter string.
+		 */
+		virtual char getDelimiter()=0;
+
+		/**
+		 * Returns the message string.
+		 */
+		virtual std::string& getMessage()=0;
+
+		/**
+		 * Indicates whether or not the help and version switches were created
+		 * automatically.
+		 */
+		virtual bool hasHelpAndVersion()=0;
+
+		/** 
+		 * Resets the instance as if it had just been constructed so that the
+		 * instance can be reused. 
+		 */
+		virtual void reset()=0;
+};
+
+} //namespace
+
+
+#endif 
diff --git a/external/tclap/CmdLineOutput.h b/external/tclap/CmdLineOutput.h
new file mode 100644
index 0000000..71ee5a3
--- /dev/null
+++ b/external/tclap/CmdLineOutput.h
@@ -0,0 +1,74 @@
+
+
+/****************************************************************************** 
+ * 
+ *  file:  CmdLineOutput.h
+ * 
+ *  Copyright (c) 2004, Michael E. Smoot
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_CMDLINEOUTPUT_H
+#define TCLAP_CMDLINEOUTPUT_H
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <iomanip>
+#include <algorithm>
+
+namespace TCLAP {
+
+class CmdLineInterface;
+class ArgException;
+
+/**
+ * The interface that any output object must implement.
+ */
+class CmdLineOutput 
+{
+
+	public:
+
+		/**
+		 * Virtual destructor.
+		 */
+		virtual ~CmdLineOutput() {}
+
+		/**
+		 * Generates some sort of output for the USAGE. 
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void usage(CmdLineInterface& c)=0;
+
+		/**
+		 * Generates some sort of output for the version. 
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void version(CmdLineInterface& c)=0;
+
+		/**
+		 * Generates some sort of output for a failure. 
+		 * \param c - The CmdLine object the output is generated for. 
+		 * \param e - The ArgException that caused the failure. 
+		 */
+		virtual void failure( CmdLineInterface& c, 
+						      ArgException& e )=0;
+
+};
+
+} //namespace TCLAP
+#endif 
diff --git a/external/tclap/Constraint.h b/external/tclap/Constraint.h
new file mode 100644
index 0000000..a92acf9
--- /dev/null
+++ b/external/tclap/Constraint.h
@@ -0,0 +1,68 @@
+
+/******************************************************************************
+ *
+ *  file:  Constraint.h
+ *
+ *  Copyright (c) 2005, Michael E. Smoot
+ *  All rights reverved.
+ *
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+#ifndef TCLAP_CONSTRAINT_H
+#define TCLAP_CONSTRAINT_H
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <iomanip>
+#include <algorithm>
+
+namespace TCLAP {
+
+/**
+ * The interface that defines the interaction between the Arg and Constraint.
+ */
+template<class T>
+class Constraint
+{
+
+	public:
+		/**
+		 * Returns a description of the Constraint.
+		 */
+		virtual std::string description() const =0;
+
+		/**
+		 * Returns the short ID for the Constraint.
+		 */
+		virtual std::string shortID() const =0;
+
+		/**
+		 * The method used to verify that the value parsed from the command
+		 * line meets the constraint.
+		 * \param value - The value that will be checked.
+		 */
+		virtual bool check(const T& value) const =0;
+
+		/**
+		 * Destructor.
+		 * Silences warnings about Constraint being a base class with virtual
+		 * functions but without a virtual destructor.
+		 */
+		virtual ~Constraint() { ; }
+};
+
+} //namespace TCLAP
+#endif
diff --git a/external/tclap/DocBookOutput.h b/external/tclap/DocBookOutput.h
new file mode 100644
index 0000000..a42ca27
--- /dev/null
+++ b/external/tclap/DocBookOutput.h
@@ -0,0 +1,299 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/****************************************************************************** 
+ * 
+ *  file:  DocBookOutput.h
+ * 
+ *  Copyright (c) 2004, Michael E. Smoot
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_DOCBOOKOUTPUT_H
+#define TCLAP_DOCBOOKOUTPUT_H
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <algorithm>
+
+#include <tclap/CmdLineInterface.h>
+#include <tclap/CmdLineOutput.h>
+#include <tclap/XorHandler.h>
+#include <tclap/Arg.h>
+
+namespace TCLAP {
+
+/**
+ * A class that generates DocBook output for usage() method for the 
+ * given CmdLine and its Args.
+ */
+class DocBookOutput : public CmdLineOutput
+{
+
+	public:
+
+		/**
+		 * Prints the usage to stdout.  Can be overridden to 
+		 * produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void usage(CmdLineInterface& c);
+
+		/**
+		 * Prints the version to stdout. Can be overridden 
+		 * to produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void version(CmdLineInterface& c);
+
+		/**
+		 * Prints (to stderr) an error message, short usage 
+		 * Can be overridden to produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 * \param e - The ArgException that caused the failure. 
+		 */
+		virtual void failure(CmdLineInterface& c, 
+						     ArgException& e );
+
+	protected:
+
+		/**
+		 * Substitutes the char r for string x in string s.
+		 * \param s - The string to operate on. 
+		 * \param r - The char to replace. 
+		 * \param x - What to replace r with. 
+		 */
+		void substituteSpecialChars( std::string& s, char r, std::string& x );
+		void removeChar( std::string& s, char r);
+		void basename( std::string& s );
+
+		void printShortArg(Arg* it);
+		void printLongArg(Arg* it);
+
+		char theDelimiter;
+};
+
+
+inline void DocBookOutput::version(CmdLineInterface& _cmd) 
+{ 
+	std::cout << _cmd.getVersion() << std::endl;
+}
+
+inline void DocBookOutput::usage(CmdLineInterface& _cmd ) 
+{
+	std::list<Arg*> argList = _cmd.getArgList();
+	std::string progName = _cmd.getProgramName();
+	std::string xversion = _cmd.getVersion();
+	theDelimiter = _cmd.getDelimiter();
+	XorHandler xorHandler = _cmd.getXorHandler();
+	std::vector< std::vector<Arg*> > xorList = xorHandler.getXorList();
+	basename(progName);
+
+	std::cout << "<?xml version='1.0'?>" << std::endl;
+	std::cout << "<!DOCTYPE refentry PUBLIC \"-//OASIS//DTD DocBook XML V4.2//EN\"" << std::endl;
+	std::cout << "\t\"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd\">" << std::endl << std::endl;
+
+	std::cout << "<refentry>" << std::endl;
+
+	std::cout << "<refmeta>" << std::endl;
+	std::cout << "<refentrytitle>" << progName << "</refentrytitle>" << std::endl;
+	std::cout << "<manvolnum>1</manvolnum>" << std::endl;
+	std::cout << "</refmeta>" << std::endl;
+
+	std::cout << "<refnamediv>" << std::endl;
+	std::cout << "<refname>" << progName << "</refname>" << std::endl;
+	std::cout << "<refpurpose>" << _cmd.getMessage() << "</refpurpose>" << std::endl;
+	std::cout << "</refnamediv>" << std::endl;
+
+	std::cout << "<refsynopsisdiv>" << std::endl;
+	std::cout << "<cmdsynopsis>" << std::endl;
+
+	std::cout << "<command>" << progName << "</command>" << std::endl;
+
+	// xor
+	for ( int i = 0; (unsigned int)i < xorList.size(); i++ )
+	{
+		std::cout << "<group choice='req'>" << std::endl;
+		for ( ArgVectorIterator it = xorList[i].begin(); 
+						it != xorList[i].end(); it++ )
+			printShortArg((*it));
+
+		std::cout << "</group>" << std::endl;
+	}
+	
+	// rest of args
+	for (ArgListIterator it = argList.begin(); it != argList.end(); it++)
+		if ( !xorHandler.contains( (*it) ) )
+			printShortArg((*it));
+
+ 	std::cout << "</cmdsynopsis>" << std::endl;
+	std::cout << "</refsynopsisdiv>" << std::endl;
+
+	std::cout << "<refsect1>" << std::endl;
+	std::cout << "<title>Description</title>" << std::endl;
+	std::cout << "<para>" << std::endl;
+	std::cout << _cmd.getMessage() << std::endl; 
+	std::cout << "</para>" << std::endl;
+	std::cout << "</refsect1>" << std::endl;
+
+	std::cout << "<refsect1>" << std::endl;
+	std::cout << "<title>Options</title>" << std::endl;
+
+	std::cout << "<variablelist>" << std::endl;
+	
+	for (ArgListIterator it = argList.begin(); it != argList.end(); it++)
+		printLongArg((*it));
+
+	std::cout << "</variablelist>" << std::endl;
+	std::cout << "</refsect1>" << std::endl;
+
+	std::cout << "<refsect1>" << std::endl;
+	std::cout << "<title>Version</title>" << std::endl;
+	std::cout << "<para>" << std::endl;
+	std::cout << xversion << std::endl; 
+	std::cout << "</para>" << std::endl;
+	std::cout << "</refsect1>" << std::endl;
+	
+	std::cout << "</refentry>" << std::endl;
+
+}
+
+inline void DocBookOutput::failure( CmdLineInterface& _cmd,
+				    ArgException& e ) 
+{ 
+	static_cast<void>(_cmd); // unused
+	std::cout << e.what() << std::endl;
+	throw ExitException(1);
+}
+
+inline void DocBookOutput::substituteSpecialChars( std::string& s,
+				                                   char r,
+												   std::string& x )
+{
+	size_t p;
+	while ( (p = s.find_first_of(r)) != std::string::npos )
+	{
+		s.erase(p,1);
+		s.insert(p,x);
+	}
+}
+
+inline void DocBookOutput::removeChar( std::string& s, char r)
+{
+	size_t p;
+	while ( (p = s.find_first_of(r)) != std::string::npos )
+	{
+		s.erase(p,1);
+	}
+}
+
+inline void DocBookOutput::basename( std::string& s )
+{
+	size_t p = s.find_last_of('/');
+	if ( p != std::string::npos )
+	{
+		s.erase(0, p + 1);
+	}
+}
+
+inline void DocBookOutput::printShortArg(Arg* a)
+{
+	std::string lt = "<"; 
+	std::string gt = ">"; 
+
+	std::string id = a->shortID();
+	substituteSpecialChars(id,'<',lt);
+	substituteSpecialChars(id,'>',gt);
+	removeChar(id,'[');
+	removeChar(id,']');
+	
+	std::string choice = "opt";
+	if ( a->isRequired() )
+		choice = "plain";
+
+	std::cout << "<arg choice='" << choice << '\'';
+	if ( a->acceptsMultipleValues() )
+		std::cout << " rep='repeat'";
+
+
+	std::cout << '>';
+	if ( !a->getFlag().empty() )
+		std::cout << a->flagStartChar() << a->getFlag();
+	else
+		std::cout << a->nameStartString() << a->getName();
+	if ( a->isValueRequired() )
+	{
+		std::string arg = a->shortID();
+		removeChar(arg,'[');
+		removeChar(arg,']');
+		removeChar(arg,'<');
+		removeChar(arg,'>');
+		arg.erase(0, arg.find_last_of(theDelimiter) + 1);
+		std::cout << theDelimiter;
+		std::cout << "<replaceable>" << arg << "</replaceable>";
+	}
+	std::cout << "</arg>" << std::endl;
+
+}
+
+inline void DocBookOutput::printLongArg(Arg* a)
+{
+	std::string lt = "<"; 
+	std::string gt = ">"; 
+
+	std::string desc = a->getDescription();
+	substituteSpecialChars(desc,'<',lt);
+	substituteSpecialChars(desc,'>',gt);
+
+	std::cout << "<varlistentry>" << std::endl;
+
+	if ( !a->getFlag().empty() )
+	{
+		std::cout << "<term>" << std::endl;
+		std::cout << "<option>";
+		std::cout << a->flagStartChar() << a->getFlag();
+		std::cout << "</option>" << std::endl;
+		std::cout << "</term>" << std::endl;
+	}
+
+	std::cout << "<term>" << std::endl;
+	std::cout << "<option>";
+	std::cout << a->nameStartString() << a->getName();
+	if ( a->isValueRequired() )
+	{
+		std::string arg = a->shortID();
+		removeChar(arg,'[');
+		removeChar(arg,']');
+		removeChar(arg,'<');
+		removeChar(arg,'>');
+		arg.erase(0, arg.find_last_of(theDelimiter) + 1);
+		std::cout << theDelimiter;
+		std::cout << "<replaceable>" << arg << "</replaceable>";
+	}
+	std::cout << "</option>" << std::endl;
+	std::cout << "</term>" << std::endl;
+
+	std::cout << "<listitem>" << std::endl;
+	std::cout << "<para>" << std::endl;
+	std::cout << desc << std::endl;
+	std::cout << "</para>" << std::endl;
+	std::cout << "</listitem>" << std::endl;
+
+	std::cout << "</varlistentry>" << std::endl;
+}
+
+} //namespace TCLAP
+#endif 
diff --git a/external/tclap/HelpVisitor.h b/external/tclap/HelpVisitor.h
new file mode 100644
index 0000000..cc3bd07
--- /dev/null
+++ b/external/tclap/HelpVisitor.h
@@ -0,0 +1,76 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  HelpVisitor.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_HELP_VISITOR_H
+#define TCLAP_HELP_VISITOR_H
+
+#include <tclap/CmdLineInterface.h>
+#include <tclap/CmdLineOutput.h>
+#include <tclap/Visitor.h>
+
+namespace TCLAP {
+
+/**
+ * A Visitor object that calls the usage method of the given CmdLineOutput
+ * object for the specified CmdLine object.
+ */
+class HelpVisitor: public Visitor
+{
+	private:
+		/**
+		 * Prevent accidental copying.
+		 */
+		HelpVisitor(const HelpVisitor& rhs);
+		HelpVisitor& operator=(const HelpVisitor& rhs);
+
+	protected:
+
+		/**
+		 * The CmdLine the output will be generated for. 
+		 */
+		CmdLineInterface* _cmd;
+
+		/**
+		 * The output object. 
+		 */
+		CmdLineOutput** _out;
+
+	public:
+
+		/**
+		 * Constructor.
+		 * \param cmd - The CmdLine the output will be generated for.
+		 * \param out - The type of output. 
+		 */
+		HelpVisitor(CmdLineInterface* cmd, CmdLineOutput** out) 
+				: Visitor(), _cmd( cmd ), _out( out ) { }
+
+		/**
+		 * Calls the usage method of the CmdLineOutput for the 
+		 * specified CmdLine.
+		 */
+		void visit() { (*_out)->usage(*_cmd); throw ExitException(0); }
+		
+};
+
+}
+
+#endif
diff --git a/external/tclap/IgnoreRestVisitor.h b/external/tclap/IgnoreRestVisitor.h
new file mode 100644
index 0000000..e328649
--- /dev/null
+++ b/external/tclap/IgnoreRestVisitor.h
@@ -0,0 +1,52 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  IgnoreRestVisitor.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_IGNORE_REST_VISITOR_H
+#define TCLAP_IGNORE_REST_VISITOR_H
+
+#include <tclap/Visitor.h>
+#include <tclap/Arg.h>
+
+namespace TCLAP {
+
+/**
+ * A Vistor that tells the CmdLine to begin ignoring arguments after
+ * this one is parsed.
+ */
+class IgnoreRestVisitor: public Visitor
+{
+	public:
+
+		/**
+		 * Constructor.
+		 */
+		IgnoreRestVisitor() : Visitor() {}
+
+		/**
+		 * Sets Arg::_ignoreRest.
+		 */
+		void visit() { Arg::beginIgnoring();  }
+};
+
+}
+
+#endif
diff --git a/external/tclap/MultiArg.h b/external/tclap/MultiArg.h
new file mode 100644
index 0000000..34bb2d7
--- /dev/null
+++ b/external/tclap/MultiArg.h
@@ -0,0 +1,433 @@
+/****************************************************************************** 
+ * 
+ *  file:  MultiArg.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/
+
+
+#ifndef TCLAP_MULTIPLE_ARGUMENT_H
+#define TCLAP_MULTIPLE_ARGUMENT_H
+
+#include <string>
+#include <vector>
+
+#include <tclap/Arg.h>
+#include <tclap/Constraint.h>
+
+namespace TCLAP {
+/**
+ * An argument that allows multiple values of type T to be specified.  Very
+ * similar to a ValueArg, except a vector of values will be returned
+ * instead of just one.
+ */
+template<class T>
+class MultiArg : public Arg
+{
+public:
+	typedef std::vector<T> container_type;	
+	typedef typename container_type::iterator iterator;
+	typedef typename container_type::const_iterator const_iterator;
+
+protected:
+
+	/**
+	 * The list of values parsed from the CmdLine.
+	 */
+	std::vector<T> _values;
+
+	/**
+	 * The description of type T to be used in the usage.
+	 */
+	std::string _typeDesc;
+
+	/**
+	 * A list of constraint on this Arg. 
+	 */
+	Constraint<T>* _constraint;
+
+	/**
+	 * Extracts the value from the string.
+	 * Attempts to parse string as type T, if this fails an exception
+	 * is thrown.
+	 * \param val - The string to be read.
+	 */
+	void _extractValue( const std::string& val );
+
+	/**
+	 * Used by XorHandler to decide whether to keep parsing for this arg.
+	 */
+	bool _allowMore;
+
+public:
+
+	/**
+	 * Constructor.
+	 * \param flag - The one character flag that identifies this
+	 * argument on the command line.
+	 * \param name - A one word name for the argument.  Can be
+	 * used as a long flag on the command line.
+	 * \param desc - A description of what the argument is for or
+	 * does.
+	 * \param req - Whether the argument is required on the command
+	 * line.
+	 * \param typeDesc - A short, human readable description of the
+	 * type that this object expects.  This is used in the generation
+	 * of the USAGE statement.  The goal is to be helpful to the end user
+	 * of the program.
+	 * \param v - An optional visitor.  You probably should not
+	 * use this unless you have a very good reason.
+	 */
+	MultiArg( const std::string& flag,
+                  const std::string& name,
+                  const std::string& desc,
+                  bool req,
+                  const std::string& typeDesc,
+                  Visitor* v = NULL);
+
+	/**
+	 * Constructor.
+	 * \param flag - The one character flag that identifies this
+	 * argument on the command line.
+	 * \param name - A one word name for the argument.  Can be
+	 * used as a long flag on the command line.
+	 * \param desc - A description of what the argument is for or
+	 * does.
+	 * \param req - Whether the argument is required on the command
+	 * line.
+	 * \param typeDesc - A short, human readable description of the
+	 * type that this object expects.  This is used in the generation
+	 * of the USAGE statement.  The goal is to be helpful to the end user
+	 * of the program.
+	 * \param parser - A CmdLine parser object to add this Arg to
+	 * \param v - An optional visitor.  You probably should not
+	 * use this unless you have a very good reason.
+	 */
+	MultiArg( const std::string& flag, 
+                  const std::string& name,
+                  const std::string& desc,
+                  bool req,
+                  const std::string& typeDesc,
+                  CmdLineInterface& parser,
+                  Visitor* v = NULL );
+
+	/**
+	 * Constructor.
+	 * \param flag - The one character flag that identifies this
+	 * argument on the command line.
+	 * \param name - A one word name for the argument.  Can be
+	 * used as a long flag on the command line.
+	 * \param desc - A description of what the argument is for or
+	 * does.
+	 * \param req - Whether the argument is required on the command
+	 * line.
+	 * \param constraint - A pointer to a Constraint object used
+	 * to constrain this Arg.
+	 * \param v - An optional visitor.  You probably should not
+	 * use this unless you have a very good reason.
+	 */
+	MultiArg( const std::string& flag,
+                  const std::string& name,
+                  const std::string& desc,
+                  bool req,
+                  Constraint<T>* constraint,
+                  Visitor* v = NULL );
+		  
+	/**
+	 * Constructor.
+	 * \param flag - The one character flag that identifies this
+	 * argument on the command line.
+	 * \param name - A one word name for the argument.  Can be
+	 * used as a long flag on the command line.
+	 * \param desc - A description of what the argument is for or
+	 * does.
+	 * \param req - Whether the argument is required on the command
+	 * line.
+	 * \param constraint - A pointer to a Constraint object used
+	 * to constrain this Arg.
+	 * \param parser - A CmdLine parser object to add this Arg to
+	 * \param v - An optional visitor.  You probably should not
+	 * use this unless you have a very good reason.
+	 */
+	MultiArg( const std::string& flag, 
+                  const std::string& name,
+                  const std::string& desc,
+                  bool req,
+                  Constraint<T>* constraint,
+                  CmdLineInterface& parser,
+                  Visitor* v = NULL );
+		  
+	/**
+	 * Handles the processing of the argument.
+	 * This re-implements the Arg version of this method to set the
+	 * _value of the argument appropriately.  It knows the difference
+	 * between labeled and unlabeled.
+	 * \param i - Pointer the the current argument in the list.
+	 * \param args - Mutable list of strings. Passed from main().
+	 */
+	virtual bool processArg(int* i, std::vector<std::string>& args); 
+
+	/**
+	 * Returns a vector of type T containing the values parsed from
+	 * the command line.
+	 */
+	const std::vector<T>& getValue();
+
+	/**
+	 * Returns an iterator over the values parsed from the command
+	 * line.
+	 */
+	const_iterator begin() const { return _values.begin(); }
+
+	/**
+	 * Returns the end of the values parsed from the command
+	 * line.
+	 */
+	const_iterator end() const { return _values.end(); }
+
+	/**
+	 * Returns the a short id string.  Used in the usage. 
+	 * \param val - value to be used.
+	 */
+	virtual std::string shortID(const std::string& val="val") const;
+
+	/**
+	 * Returns the a long id string.  Used in the usage. 
+	 * \param val - value to be used.
+	 */
+	virtual std::string longID(const std::string& val="val") const;
+
+	/**
+	 * Once we've matched the first value, then the arg is no longer
+	 * required.
+	 */
+	virtual bool isRequired() const;
+
+	virtual bool allowMore();
+	
+	virtual void reset();
+
+private:
+	/**
+	 * Prevent accidental copying
+	 */
+	MultiArg<T>(const MultiArg<T>& rhs);
+	MultiArg<T>& operator=(const MultiArg<T>& rhs);
+
+};
+
+template<class T>
+MultiArg<T>::MultiArg(const std::string& flag, 
+                      const std::string& name,
+                      const std::string& desc,
+                      bool req,
+                      const std::string& typeDesc,
+                      Visitor* v) :
+   Arg( flag, name, desc, req, true, v ),
+  _values(std::vector<T>()),
+  _typeDesc( typeDesc ),
+  _constraint( NULL ),
+  _allowMore(false)
+{ 
+	_acceptsMultipleValues = true;
+}
+
+template<class T>
+MultiArg<T>::MultiArg(const std::string& flag, 
+                      const std::string& name,
+                      const std::string& desc,
+                      bool req,
+                      const std::string& typeDesc,
+                      CmdLineInterface& parser,
+                      Visitor* v)
+: Arg( flag, name, desc, req, true, v ),
+  _values(std::vector<T>()),
+  _typeDesc( typeDesc ),
+  _constraint( NULL ),
+  _allowMore(false)
+{ 
+	parser.add( this );
+	_acceptsMultipleValues = true;
+}
+
+/**
+ *
+ */
+template<class T>
+MultiArg<T>::MultiArg(const std::string& flag, 
+                      const std::string& name,
+                      const std::string& desc,
+                      bool req,
+                      Constraint<T>* constraint,
+                      Visitor* v)
+: Arg( flag, name, desc, req, true, v ),
+  _values(std::vector<T>()),
+  _typeDesc( constraint->shortID() ),
+  _constraint( constraint ),
+  _allowMore(false)
+{ 
+	_acceptsMultipleValues = true;
+}
+
+template<class T>
+MultiArg<T>::MultiArg(const std::string& flag, 
+                      const std::string& name,
+                      const std::string& desc,
+                      bool req,
+                      Constraint<T>* constraint,
+                      CmdLineInterface& parser,
+                      Visitor* v)
+: Arg( flag, name, desc, req, true, v ),
+  _values(std::vector<T>()),
+  _typeDesc( constraint->shortID() ),
+  _constraint( constraint ),
+  _allowMore(false)
+{ 
+	parser.add( this );
+	_acceptsMultipleValues = true;
+}
+
+template<class T>
+const std::vector<T>& MultiArg<T>::getValue() { return _values; }
+
+template<class T>
+bool MultiArg<T>::processArg(int *i, std::vector<std::string>& args) 
+{
+ 	if ( _ignoreable && Arg::ignoreRest() )
+		return false;
+
+	if ( _hasBlanks( args[*i] ) )
+		return false;
+
+	std::string flag = args[*i];
+	std::string value = "";
+
+   	trimFlag( flag, value );
+
+   	if ( argMatches( flag ) )
+   	{
+   		if ( Arg::delimiter() != ' ' && value == "" )
+			throw( ArgParseException( 
+			           "Couldn't find delimiter for this argument!",
+					   toString() ) );
+
+		// always take the first one, regardless of start string
+		if ( value == "" )
+		{
+			(*i)++;
+			if ( static_cast<unsigned int>(*i) < args.size() )
+				_extractValue( args[*i] );
+			else
+				throw( ArgParseException("Missing a value for this argument!",
+                                         toString() ) );
+		} 
+		else
+			_extractValue( value );
+
+		/*
+		// continuing taking the args until we hit one with a start string 
+		while ( (unsigned int)(*i)+1 < args.size() &&
+				args[(*i)+1].find_first_of( Arg::flagStartString() ) != 0 &&
+		        args[(*i)+1].find_first_of( Arg::nameStartString() ) != 0 ) 
+				_extractValue( args[++(*i)] );
+		*/
+
+		_alreadySet = true;
+		_checkWithVisitor();
+
+		return true;
+	}
+	else
+		return false;
+}
+
+/**
+ *
+ */
+template<class T>
+std::string MultiArg<T>::shortID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return Arg::shortID(_typeDesc) + " ... ";
+}
+
+/**
+ *
+ */
+template<class T>
+std::string MultiArg<T>::longID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return Arg::longID(_typeDesc) + "  (accepted multiple times)";
+}
+
+/**
+ * Once we've matched the first value, then the arg is no longer
+ * required.
+ */
+template<class T>
+bool MultiArg<T>::isRequired() const
+{
+	if ( _required )
+	{
+		if ( _values.size() > 1 )
+			return false;
+		else
+			return true;
+   	}
+   	else
+		return false;
+
+}
+
+template<class T>
+void MultiArg<T>::_extractValue( const std::string& val ) 
+{
+    try {
+	T tmp;
+	ExtractValue(tmp, val, typename ArgTraits<T>::ValueCategory());
+	_values.push_back(tmp);
+    } catch( ArgParseException &e) {
+	throw ArgParseException(e.error(), toString());
+    }
+
+    if ( _constraint != NULL )
+	if ( ! _constraint->check( _values.back() ) )
+	    throw( CmdLineParseException( "Value '" + val +
+					  "' does not meet constraint: " +
+					  _constraint->description(), 
+					  toString() ) );
+}
+		
+template<class T>
+bool MultiArg<T>::allowMore()
+{
+	bool am = _allowMore;
+	_allowMore = true;
+	return am;
+}
+
+template<class T>
+void MultiArg<T>::reset()
+{
+	Arg::reset();
+	_values.clear();
+}
+
+} // namespace TCLAP
+
+#endif
diff --git a/external/tclap/MultiSwitchArg.h b/external/tclap/MultiSwitchArg.h
new file mode 100644
index 0000000..8820b64
--- /dev/null
+++ b/external/tclap/MultiSwitchArg.h
@@ -0,0 +1,216 @@
+
+/****************************************************************************** 
+*
+*  file:  MultiSwitchArg.h
+*
+*  Copyright (c) 2003, Michael E. Smoot .
+*  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+*  Copyright (c) 2005, Michael E. Smoot, Daniel Aarno, Erik Zeek.
+*  All rights reverved.
+*
+*  See the file COPYING in the top directory of this distribution for
+*  more information.
+*
+*  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+*  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+*  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*  DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#ifndef TCLAP_MULTI_SWITCH_ARG_H
+#define TCLAP_MULTI_SWITCH_ARG_H
+
+#include <string>
+#include <vector>
+
+#include <tclap/SwitchArg.h>
+
+namespace TCLAP {
+
+/**
+* A multiple switch argument.  If the switch is set on the command line, then
+* the getValue method will return the number of times the switch appears.
+*/
+class MultiSwitchArg : public SwitchArg
+{
+	protected:
+
+		/**
+		 * The value of the switch.
+		 */
+		int _value;
+
+		/**
+		 * Used to support the reset() method so that ValueArg can be
+		 * reset to their constructed value.
+		 */
+		int _default;
+
+	public:
+
+		/**
+		 * MultiSwitchArg constructor.
+		 * \param flag - The one character flag that identifies this
+		 * argument on the command line.
+		 * \param name - A one word name for the argument.  Can be
+		 * used as a long flag on the command line.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param init - Optional. The initial/default value of this Arg. 
+		 * Defaults to 0.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		MultiSwitchArg(const std::string& flag, 
+				const std::string& name,
+				const std::string& desc,
+				int init = 0,
+				Visitor* v = NULL);
+
+
+		/**
+		 * MultiSwitchArg constructor.
+		 * \param flag - The one character flag that identifies this
+		 * argument on the command line.
+		 * \param name - A one word name for the argument.  Can be
+		 * used as a long flag on the command line.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param parser - A CmdLine parser object to add this Arg to
+		 * \param init - Optional. The initial/default value of this Arg. 
+		 * Defaults to 0.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		MultiSwitchArg(const std::string& flag, 
+				const std::string& name,
+				const std::string& desc,
+				CmdLineInterface& parser,
+				int init = 0,
+				Visitor* v = NULL);
+
+
+		/**
+		 * Handles the processing of the argument.
+		 * This re-implements the SwitchArg version of this method to set the
+		 * _value of the argument appropriately.
+		 * \param i - Pointer the the current argument in the list.
+		 * \param args - Mutable list of strings. Passed
+		 * in from main().
+		 */
+		virtual bool processArg(int* i, std::vector<std::string>& args); 
+
+		/**
+		 * Returns int, the number of times the switch has been set.
+		 */
+		int getValue();
+
+		/**
+		 * Returns the shortID for this Arg.
+		 */
+		std::string shortID(const std::string& val) const;
+
+		/**
+		 * Returns the longID for this Arg.
+		 */
+		std::string longID(const std::string& val) const;
+		
+		void reset();
+
+};
+
+//////////////////////////////////////////////////////////////////////
+//BEGIN MultiSwitchArg.cpp
+//////////////////////////////////////////////////////////////////////
+inline MultiSwitchArg::MultiSwitchArg(const std::string& flag,
+					const std::string& name,
+					const std::string& desc,
+					int init,
+					Visitor* v )
+: SwitchArg(flag, name, desc, false, v),
+_value( init ),
+_default( init )
+{ }
+
+inline MultiSwitchArg::MultiSwitchArg(const std::string& flag,
+					const std::string& name, 
+					const std::string& desc, 
+					CmdLineInterface& parser,
+					int init,
+					Visitor* v )
+: SwitchArg(flag, name, desc, false, v),
+_value( init ),
+_default( init )
+{ 
+	parser.add( this );
+}
+
+inline int MultiSwitchArg::getValue() { return _value; }
+
+inline bool MultiSwitchArg::processArg(int *i, std::vector<std::string>& args)
+{
+	if ( _ignoreable && Arg::ignoreRest() )
+		return false;
+
+	if ( argMatches( args[*i] ))
+	{
+		// so the isSet() method will work
+		_alreadySet = true;
+
+		// Matched argument: increment value.
+		++_value;
+
+		_checkWithVisitor();
+
+		return true;
+	}
+	else if ( combinedSwitchesMatch( args[*i] ) )
+	{
+		// so the isSet() method will work
+		_alreadySet = true;
+
+		// Matched argument: increment value.
+		++_value;
+
+		// Check for more in argument and increment value.
+		while ( combinedSwitchesMatch( args[*i] ) ) 
+			++_value;
+
+		_checkWithVisitor();
+
+		return false;
+	}
+	else
+		return false;
+}
+
+inline std::string 
+MultiSwitchArg::shortID(const std::string& val) const
+{
+	return Arg::shortID(val) + " ... ";
+}
+
+inline std::string 
+MultiSwitchArg::longID(const std::string& val) const
+{
+	return Arg::longID(val) + "  (accepted multiple times)";
+}
+
+inline void
+MultiSwitchArg::reset()
+{
+	MultiSwitchArg::_value = MultiSwitchArg::_default;
+}
+
+//////////////////////////////////////////////////////////////////////
+//END MultiSwitchArg.cpp
+//////////////////////////////////////////////////////////////////////
+
+} //namespace TCLAP
+
+#endif
diff --git a/external/tclap/OptionalUnlabeledTracker.h b/external/tclap/OptionalUnlabeledTracker.h
new file mode 100644
index 0000000..8174c5f
--- /dev/null
+++ b/external/tclap/OptionalUnlabeledTracker.h
@@ -0,0 +1,62 @@
+
+
+/****************************************************************************** 
+ * 
+ *  file:  OptionalUnlabeledTracker.h
+ * 
+ *  Copyright (c) 2005, Michael E. Smoot .
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_OPTIONAL_UNLABELED_TRACKER_H
+#define TCLAP_OPTIONAL_UNLABELED_TRACKER_H
+
+#include <string>
+
+namespace TCLAP {
+
+class OptionalUnlabeledTracker
+{
+
+	public:
+
+		static void check( bool req, const std::string& argName );
+
+		static void gotOptional() { alreadyOptionalRef() = true; }
+
+		static bool& alreadyOptional() { return alreadyOptionalRef(); } 
+
+	private:
+
+		static bool& alreadyOptionalRef() { static bool ct = false; return ct; }
+};
+
+
+inline void OptionalUnlabeledTracker::check( bool req, const std::string& argName )
+{
+    if ( OptionalUnlabeledTracker::alreadyOptional() )
+        throw( SpecificationException(
+	"You can't specify ANY Unlabeled Arg following an optional Unlabeled Arg",
+	                argName ) );
+
+    if ( !req )
+        OptionalUnlabeledTracker::gotOptional();
+}
+
+
+} // namespace TCLAP
+
+#endif
diff --git a/external/tclap/StandardTraits.h b/external/tclap/StandardTraits.h
new file mode 100644
index 0000000..46d7f6f
--- /dev/null
+++ b/external/tclap/StandardTraits.h
@@ -0,0 +1,208 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/******************************************************************************
+ *
+ *  file:  StandardTraits.h
+ *
+ *  Copyright (c) 2007, Daniel Aarno, Michael E. Smoot .
+ *  All rights reverved.
+ *
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS IN THE SOFTWARE.
+ *
+ *****************************************************************************/
+
+// This is an internal tclap file, you should probably not have to
+// include this directly
+
+#ifndef TCLAP_STANDARD_TRAITS_H
+#define TCLAP_STANDARD_TRAITS_H
+
+#ifdef HAVE_CONFIG_H
+#include <config.h> // To check for long long
+#endif
+
+// If Microsoft has already typedef'd wchar_t as an unsigned 
+// short, then compiles will break because it's as if we're
+// creating ArgTraits twice for unsigned short. Thus...
+#ifdef _MSC_VER
+#ifndef _NATIVE_WCHAR_T_DEFINED
+#define TCLAP_DONT_DECLARE_WCHAR_T_ARGTRAITS
+#endif
+#endif
+
+namespace TCLAP {
+
+// ======================================================================
+// Integer types
+// ======================================================================
+
+/**
+ * longs have value-like semantics.
+ */
+template<>
+struct ArgTraits<long> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * ints have value-like semantics.
+ */
+template<>
+struct ArgTraits<int> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * shorts have value-like semantics.
+ */
+template<>
+struct ArgTraits<short> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * chars have value-like semantics.
+ */
+template<>
+struct ArgTraits<char> {
+    typedef ValueLike ValueCategory;
+};
+
+#ifdef HAVE_LONG_LONG
+/**
+ * long longs have value-like semantics.
+ */
+template<>
+struct ArgTraits<long long> {
+    typedef ValueLike ValueCategory;
+};
+#endif
+
+// ======================================================================
+// Unsigned integer types
+// ======================================================================
+
+/**
+ * unsigned longs have value-like semantics.
+ */
+template<>
+struct ArgTraits<unsigned long> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * unsigned ints have value-like semantics.
+ */
+template<>
+struct ArgTraits<unsigned int> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * unsigned shorts have value-like semantics.
+ */
+template<>
+struct ArgTraits<unsigned short> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * unsigned chars have value-like semantics.
+ */
+template<>
+struct ArgTraits<unsigned char> {
+    typedef ValueLike ValueCategory;
+};
+
+// Microsoft implements size_t awkwardly. 
+#if defined(_MSC_VER) && defined(_M_X64)
+/**
+ * size_ts have value-like semantics.
+ */
+template<>
+struct ArgTraits<size_t> {
+    typedef ValueLike ValueCategory;
+};
+#endif
+
+
+#ifdef HAVE_LONG_LONG
+/**
+ * unsigned long longs have value-like semantics.
+ */
+template<>
+struct ArgTraits<unsigned long long> {
+    typedef ValueLike ValueCategory;
+};
+#endif
+
+// ======================================================================
+// Float types
+// ======================================================================
+
+/**
+ * floats have value-like semantics.
+ */
+template<>
+struct ArgTraits<float> {
+    typedef ValueLike ValueCategory;
+};
+
+/**
+ * doubles have value-like semantics.
+ */
+template<>
+struct ArgTraits<double> {
+    typedef ValueLike ValueCategory;
+};
+
+// ======================================================================
+// Other types
+// ======================================================================
+
+/**
+ * bools have value-like semantics.
+ */
+template<>
+struct ArgTraits<bool> {
+    typedef ValueLike ValueCategory;
+};
+
+
+/**
+ * wchar_ts have value-like semantics.
+ */
+#ifndef TCLAP_DONT_DECLARE_WCHAR_T_ARGTRAITS
+template<>
+struct ArgTraits<wchar_t> {
+    typedef ValueLike ValueCategory;
+};
+#endif
+
+/**
+ * Strings have string like argument traits.
+ */
+template<>
+struct ArgTraits<std::string> {
+    typedef StringLike ValueCategory;
+};
+
+template<typename T>
+void SetString(T &dst, const std::string &src)
+{
+    dst = src;
+}
+
+} // namespace
+
+#endif
+
diff --git a/external/tclap/StdOutput.h b/external/tclap/StdOutput.h
new file mode 100644
index 0000000..35f7b99
--- /dev/null
+++ b/external/tclap/StdOutput.h
@@ -0,0 +1,298 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/****************************************************************************** 
+ * 
+ *  file:  StdOutput.h
+ * 
+ *  Copyright (c) 2004, Michael E. Smoot
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_STDCMDLINEOUTPUT_H
+#define TCLAP_STDCMDLINEOUTPUT_H
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <algorithm>
+
+#include <tclap/CmdLineInterface.h>
+#include <tclap/CmdLineOutput.h>
+#include <tclap/XorHandler.h>
+#include <tclap/Arg.h>
+
+namespace TCLAP {
+
+/**
+ * A class that isolates any output from the CmdLine object so that it
+ * may be easily modified.
+ */
+class StdOutput : public CmdLineOutput
+{
+
+	public:
+
+		/**
+		 * Prints the usage to stdout.  Can be overridden to 
+		 * produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void usage(CmdLineInterface& c);
+
+		/**
+		 * Prints the version to stdout. Can be overridden 
+		 * to produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void version(CmdLineInterface& c);
+
+		/**
+		 * Prints (to stderr) an error message, short usage 
+		 * Can be overridden to produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 * \param e - The ArgException that caused the failure. 
+		 */
+		virtual void failure(CmdLineInterface& c, 
+				     ArgException& e );
+
+	protected:
+
+        /**
+         * Writes a brief usage message with short args.
+		 * \param c - The CmdLine object the output is generated for. 
+         * \param os - The stream to write the message to.
+         */
+        void _shortUsage( CmdLineInterface& c, std::ostream& os ) const;
+
+        /**
+		 * Writes a longer usage message with long and short args, 
+		 * provides descriptions and prints message.
+		 * \param c - The CmdLine object the output is generated for. 
+		 * \param os - The stream to write the message to.
+		 */
+		void _longUsage( CmdLineInterface& c, std::ostream& os ) const;
+
+		/**
+		 * This function inserts line breaks and indents long strings 
+		 * according the  params input. It will only break lines at spaces, 
+		 * commas and pipes.
+		 * \param os - The stream to be printed to.
+		 * \param s - The string to be printed.
+		 * \param maxWidth - The maxWidth allowed for the output line. 
+		 * \param indentSpaces - The number of spaces to indent the first line. 
+		 * \param secondLineOffset - The number of spaces to indent the second
+		 * and all subsequent lines in addition to indentSpaces.
+		 */
+		void spacePrint( std::ostream& os, 
+						 const std::string& s, 
+						 int maxWidth, 
+						 int indentSpaces, 
+						 int secondLineOffset ) const;
+
+};
+
+
+inline void StdOutput::version(CmdLineInterface& _cmd) 
+{
+	std::string progName = _cmd.getProgramName();
+	std::string xversion = _cmd.getVersion();
+
+	std::cout << std::endl << progName << "  version: " 
+			  << xversion << std::endl << std::endl;
+}
+
+inline void StdOutput::usage(CmdLineInterface& _cmd ) 
+{
+	std::cout << std::endl << "USAGE: " << std::endl << std::endl; 
+
+	_shortUsage( _cmd, std::cout );
+
+	std::cout << std::endl << std::endl << "Where: " << std::endl << std::endl;
+
+	_longUsage( _cmd, std::cout );
+
+	std::cout << std::endl; 
+
+}
+
+inline void StdOutput::failure( CmdLineInterface& _cmd,
+								ArgException& e ) 
+{
+	std::string progName = _cmd.getProgramName();
+
+	std::cerr << "PARSE ERROR: " << e.argId() << std::endl
+		      << "             " << e.error() << std::endl << std::endl;
+
+	if ( _cmd.hasHelpAndVersion() )
+		{
+			std::cerr << "Brief USAGE: " << std::endl;
+
+			_shortUsage( _cmd, std::cerr );	
+
+			std::cerr << std::endl << "For complete USAGE and HELP type: " 
+					  << std::endl << "   " << progName << " --help" 
+					  << std::endl << std::endl;
+		}
+	else
+		usage(_cmd);
+
+	throw ExitException(1);
+}
+
+inline void 
+StdOutput::_shortUsage( CmdLineInterface& _cmd, 
+						std::ostream& os ) const
+{
+	std::list<Arg*> argList = _cmd.getArgList();
+	std::string progName = _cmd.getProgramName();
+	XorHandler xorHandler = _cmd.getXorHandler();
+	std::vector< std::vector<Arg*> > xorList = xorHandler.getXorList();
+
+	std::string s = progName + " ";
+
+	// first the xor
+	for ( int i = 0; static_cast<unsigned int>(i) < xorList.size(); i++ )
+		{
+			s += " {";
+			for ( ArgVectorIterator it = xorList[i].begin(); 
+				  it != xorList[i].end(); it++ )
+				s += (*it)->shortID() + "|";
+
+			s[s.length()-1] = '}';
+		}
+
+	// then the rest
+	for (ArgListIterator it = argList.begin(); it != argList.end(); it++)
+		if ( !xorHandler.contains( (*it) ) )
+			s += " " + (*it)->shortID();
+
+	// if the program name is too long, then adjust the second line offset 
+	int secondLineOffset = static_cast<int>(progName.length()) + 2;
+	if ( secondLineOffset > 75/2 )
+		secondLineOffset = static_cast<int>(75/2);
+
+	spacePrint( os, s, 75, 3, secondLineOffset );
+}
+
+inline void 
+StdOutput::_longUsage( CmdLineInterface& _cmd, 
+					   std::ostream& os ) const
+{
+	std::list<Arg*> argList = _cmd.getArgList();
+	std::string message = _cmd.getMessage();
+	XorHandler xorHandler = _cmd.getXorHandler();
+	std::vector< std::vector<Arg*> > xorList = xorHandler.getXorList();
+
+	// first the xor 
+	for ( int i = 0; static_cast<unsigned int>(i) < xorList.size(); i++ )
+		{
+			for ( ArgVectorIterator it = xorList[i].begin(); 
+				  it != xorList[i].end(); 
+				  it++ )
+				{
+					spacePrint( os, (*it)->longID(), 75, 3, 3 );
+					spacePrint( os, (*it)->getDescription(), 75, 5, 0 );
+
+					if ( it+1 != xorList[i].end() )
+						spacePrint(os, "-- OR --", 75, 9, 0);
+				}
+			os << std::endl << std::endl;
+		}
+
+	// then the rest
+	for (ArgListIterator it = argList.begin(); it != argList.end(); it++)
+		if ( !xorHandler.contains( (*it) ) )
+			{
+				spacePrint( os, (*it)->longID(), 75, 3, 3 ); 
+				spacePrint( os, (*it)->getDescription(), 75, 5, 0 ); 
+				os << std::endl;
+			}
+
+	os << std::endl;
+
+	spacePrint( os, message, 75, 3, 0 );
+}
+
+inline void StdOutput::spacePrint( std::ostream& os, 
+						           const std::string& s, 
+						           int maxWidth, 
+						           int indentSpaces, 
+						           int secondLineOffset ) const
+{
+	int len = static_cast<int>(s.length());
+
+	if ( (len + indentSpaces > maxWidth) && maxWidth > 0 )
+		{
+			int allowedLen = maxWidth - indentSpaces;
+			int start = 0;
+			while ( start < len )
+				{
+					// find the substring length
+					// int stringLen = std::min<int>( len - start, allowedLen );
+					// doing it this way to support a VisualC++ 2005 bug 
+					using namespace std; 
+					int stringLen = min<int>( len - start, allowedLen );
+
+					// trim the length so it doesn't end in middle of a word
+					if ( stringLen == allowedLen )
+						while ( stringLen >= 0 &&
+								s[stringLen+start] != ' ' && 
+								s[stringLen+start] != ',' &&
+								s[stringLen+start] != '|' ) 
+							stringLen--;
+	
+					// ok, the word is longer than the line, so just split 
+					// wherever the line ends
+					if ( stringLen <= 0 )
+						stringLen = allowedLen;
+
+					// check for newlines
+					for ( int i = 0; i < stringLen; i++ )
+						if ( s[start+i] == '\n' )
+							stringLen = i+1;
+
+					// print the indent	
+					for ( int i = 0; i < indentSpaces; i++ )
+						os << " ";
+
+					if ( start == 0 )
+						{
+							// handle second line offsets
+							indentSpaces += secondLineOffset;
+
+							// adjust allowed len
+							allowedLen -= secondLineOffset;
+						}
+
+					os << s.substr(start,stringLen) << std::endl;
+
+					// so we don't start a line with a space
+					while ( s[stringLen+start] == ' ' && start < len )
+						start++;
+			
+					start += stringLen;
+				}
+		}
+	else
+		{
+			for ( int i = 0; i < indentSpaces; i++ )
+				os << " ";
+			os << s << std::endl;
+		}
+}
+
+} //namespace TCLAP
+#endif 
diff --git a/external/tclap/SwitchArg.h b/external/tclap/SwitchArg.h
new file mode 100644
index 0000000..3916109
--- /dev/null
+++ b/external/tclap/SwitchArg.h
@@ -0,0 +1,266 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  SwitchArg.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_SWITCH_ARG_H
+#define TCLAP_SWITCH_ARG_H
+
+#include <string>
+#include <vector>
+
+#include <tclap/Arg.h>
+
+namespace TCLAP {
+
+/**
+ * A simple switch argument.  If the switch is set on the command line, then
+ * the getValue method will return the opposite of the default value for the
+ * switch.
+ */
+class SwitchArg : public Arg
+{
+	protected:
+
+		/**
+		 * The value of the switch.
+		 */
+		bool _value;
+
+		/**
+		 * Used to support the reset() method so that ValueArg can be
+		 * reset to their constructed value.
+		 */
+        bool _default;
+
+	public:
+
+        /**
+		 * SwitchArg constructor.
+		 * \param flag - The one character flag that identifies this
+		 * argument on the command line.
+		 * \param name - A one word name for the argument.  Can be
+		 * used as a long flag on the command line.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param def - The default value for this Switch. 
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		SwitchArg(const std::string& flag, 
+			      const std::string& name, 
+			      const std::string& desc,
+			      bool def = false,
+				  Visitor* v = NULL);
+
+				  
+		/**
+		 * SwitchArg constructor.
+		 * \param flag - The one character flag that identifies this
+		 * argument on the command line.
+		 * \param name - A one word name for the argument.  Can be
+		 * used as a long flag on the command line.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param parser - A CmdLine parser object to add this Arg to
+		 * \param def - The default value for this Switch.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		SwitchArg(const std::string& flag, 
+			      const std::string& name, 
+			      const std::string& desc,
+				  CmdLineInterface& parser,
+			      bool def = false,
+				  Visitor* v = NULL);
+				  
+				  
+        /**
+		 * Handles the processing of the argument.
+		 * This re-implements the Arg version of this method to set the
+		 * _value of the argument appropriately.
+		 * \param i - Pointer the the current argument in the list.
+		 * \param args - Mutable list of strings. Passed
+		 * in from main().
+		 */
+		virtual bool processArg(int* i, std::vector<std::string>& args); 
+
+		/**
+		 * Checks a string to see if any of the chars in the string
+		 * match the flag for this Switch.
+		 */
+		bool combinedSwitchesMatch(std::string& combined);
+
+		/**
+		 * Returns bool, whether or not the switch has been set.
+		 */
+		bool getValue();
+		
+		virtual void reset();
+
+	private:
+		/**
+		 * Checks to see if we've found the last match in
+		 * a combined string.
+		 */
+		bool lastCombined(std::string& combined);
+
+		/**
+		 * Does the common processing of processArg.
+		 */
+		void commonProcessing();
+};
+
+//////////////////////////////////////////////////////////////////////
+//BEGIN SwitchArg.cpp
+//////////////////////////////////////////////////////////////////////
+inline SwitchArg::SwitchArg(const std::string& flag, 
+                            const std::string& name, 
+                            const std::string& desc, 
+                            bool default_val,
+                            Visitor* v )
+: Arg(flag, name, desc, false, false, v),
+  _value( default_val ),
+  _default( default_val )
+{ }
+
+inline SwitchArg::SwitchArg(const std::string& flag, 
+                            const std::string& name, 
+                            const std::string& desc, 
+                            CmdLineInterface& parser,
+                            bool default_val,
+                            Visitor* v )
+: Arg(flag, name, desc, false, false, v),
+  _value( default_val ),
+  _default(default_val)
+{ 
+	parser.add( this );
+}
+
+inline bool SwitchArg::getValue() { return _value; }
+
+inline bool SwitchArg::lastCombined(std::string& combinedSwitches ) 
+{
+	for ( unsigned int i = 1; i < combinedSwitches.length(); i++ )
+		if ( combinedSwitches[i] != Arg::blankChar() )
+			return false;
+	
+	return true;
+}
+
+inline bool SwitchArg::combinedSwitchesMatch(std::string& combinedSwitches )
+{
+	// make sure this is actually a combined switch
+	if ( combinedSwitches.length() > 0 &&
+	     combinedSwitches[0] != Arg::flagStartString()[0] )
+		return false;
+
+	// make sure it isn't a long name 
+	if ( combinedSwitches.substr( 0, Arg::nameStartString().length() ) == 
+	     Arg::nameStartString() )
+		return false;
+
+	// make sure the delimiter isn't in the string 
+	if ( combinedSwitches.find_first_of( Arg::delimiter() ) != std::string::npos )
+		return false;
+
+	// ok, we're not specifying a ValueArg, so we know that we have
+	// a combined switch list.  
+	for ( unsigned int i = 1; i < combinedSwitches.length(); i++ )
+		if ( _flag.length() > 0 && 
+		     combinedSwitches[i] == _flag[0] &&
+		     _flag[0] != Arg::flagStartString()[0] ) 
+		{
+			// update the combined switches so this one is no longer present
+			// this is necessary so that no unlabeled args are matched
+			// later in the processing.
+			//combinedSwitches.erase(i,1);
+			combinedSwitches[i] = Arg::blankChar(); 
+			return true;
+		}
+
+	// none of the switches passed in the list match. 
+	return false;	
+}
+
+inline void SwitchArg::commonProcessing()
+{
+	if ( _xorSet )
+		throw(CmdLineParseException(
+		      "Mutually exclusive argument already set!", toString()));
+
+	if ( _alreadySet ) 
+		throw(CmdLineParseException("Argument already set!", toString()));
+
+	_alreadySet = true;
+
+	if ( _value == true )
+		_value = false;
+	else
+		_value = true;
+
+	_checkWithVisitor();
+}
+
+inline bool SwitchArg::processArg(int *i, std::vector<std::string>& args)
+{
+	if ( _ignoreable && Arg::ignoreRest() )
+		return false;
+
+	// if the whole string matches the flag or name string
+	if ( argMatches( args[*i] ) )
+	{
+		commonProcessing();
+
+		return true;
+	}
+	// if a substring matches the flag as part of a combination
+	else if ( combinedSwitchesMatch( args[*i] ) )
+	{
+		// check again to ensure we don't misinterpret 
+		// this as a MultiSwitchArg 
+		if ( combinedSwitchesMatch( args[*i] ) )
+			throw(CmdLineParseException("Argument already set!", 
+			                            toString()));
+
+		commonProcessing();
+
+		// We only want to return true if we've found the last combined
+		// match in the string, otherwise we return true so that other 
+		// switches in the combination will have a chance to match.
+		return lastCombined( args[*i] );
+	}
+	else
+		return false;
+}
+
+inline void SwitchArg::reset()
+{
+	Arg::reset();
+	_value = _default;  
+}
+//////////////////////////////////////////////////////////////////////
+//End SwitchArg.cpp
+//////////////////////////////////////////////////////////////////////
+
+} //namespace TCLAP
+
+#endif
diff --git a/external/tclap/UnlabeledMultiArg.h b/external/tclap/UnlabeledMultiArg.h
new file mode 100644
index 0000000..d5e1781
--- /dev/null
+++ b/external/tclap/UnlabeledMultiArg.h
@@ -0,0 +1,301 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  UnlabeledMultiArg.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot.
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_MULTIPLE_UNLABELED_ARGUMENT_H
+#define TCLAP_MULTIPLE_UNLABELED_ARGUMENT_H
+
+#include <string>
+#include <vector>
+
+#include <tclap/MultiArg.h>
+#include <tclap/OptionalUnlabeledTracker.h>
+
+namespace TCLAP {
+
+/**
+ * Just like a MultiArg, except that the arguments are unlabeled.  Basically,
+ * this Arg will slurp up everything that hasn't been matched to another 
+ * Arg.
+ */
+template<class T>
+class UnlabeledMultiArg : public MultiArg<T>
+{
+
+	// If compiler has two stage name lookup (as gcc >= 3.4 does)
+	// this is required to prevent undef. symbols
+	using MultiArg<T>::_ignoreable;
+	using MultiArg<T>::_hasBlanks;
+	using MultiArg<T>::_extractValue;
+	using MultiArg<T>::_typeDesc;
+	using MultiArg<T>::_name;
+	using MultiArg<T>::_description;
+	using MultiArg<T>::_alreadySet;
+	using MultiArg<T>::toString;
+
+	public:
+		
+		/**
+		 * Constructor.  
+		 * \param name - The name of the Arg. Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 *  line.
+		 * \param typeDesc - A short, human readable description of the
+		 * type that this object expects.  This is used in the generation
+		 * of the USAGE statement.  The goal is to be helpful to the end user
+		 * of the program.
+		 * \param ignoreable - Whether or not this argument can be ignored
+		 * using the "--" flag.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		UnlabeledMultiArg( const std::string& name,
+				           const std::string& desc,
+						   bool req,
+				           const std::string& typeDesc,
+						   bool ignoreable = false,
+				           Visitor* v = NULL );
+		/**
+		 * Constructor.  
+		 * \param name - The name of the Arg. Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 *  line.
+		 * \param typeDesc - A short, human readable description of the
+		 * type that this object expects.  This is used in the generation
+		 * of the USAGE statement.  The goal is to be helpful to the end user
+		 * of the program.
+		 * \param parser - A CmdLine parser object to add this Arg to
+		 * \param ignoreable - Whether or not this argument can be ignored
+		 * using the "--" flag.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		UnlabeledMultiArg( const std::string& name,
+				           const std::string& desc,
+						   bool req,
+				           const std::string& typeDesc,
+						   CmdLineInterface& parser,
+						   bool ignoreable = false,
+				           Visitor* v = NULL );
+						 
+		/**
+		 * Constructor.  
+		 * \param name - The name of the Arg. Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 *  line.
+		 * \param constraint - A pointer to a Constraint object used
+		 * to constrain this Arg.
+		 * \param ignoreable - Whether or not this argument can be ignored
+		 * using the "--" flag.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		UnlabeledMultiArg( const std::string& name,
+						   const std::string& desc,
+						   bool req,
+						   Constraint<T>* constraint,
+						   bool ignoreable = false,
+						   Visitor* v = NULL );
+
+		/**
+		 * Constructor.  
+		 * \param name - The name of the Arg. Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 *  line.
+		 * \param constraint - A pointer to a Constraint object used
+		 * to constrain this Arg.
+		 * \param parser - A CmdLine parser object to add this Arg to
+		 * \param ignoreable - Whether or not this argument can be ignored
+		 * using the "--" flag.
+		 * \param v - An optional visitor.  You probably should not
+		 * use this unless you have a very good reason.
+		 */
+		UnlabeledMultiArg( const std::string& name, 
+						   const std::string& desc, 
+						   bool req,
+						   Constraint<T>* constraint,
+						   CmdLineInterface& parser,
+						   bool ignoreable = false,
+						   Visitor* v = NULL );
+						 
+		/**
+		 * Handles the processing of the argument.
+		 * This re-implements the Arg version of this method to set the
+		 * _value of the argument appropriately.  It knows the difference
+		 * between labeled and unlabeled.
+		 * \param i - Pointer to the current argument in the list.
+		 * \param args - Mutable list of strings. Passed from main().
+		 */
+		virtual bool processArg(int* i, std::vector<std::string>& args); 
+
+		/**
+		 * Returns a short id string.  Used in the usage.
+		 * \param val - value to be used.
+		 */
+		virtual std::string shortID(const std::string& val="val") const;
+
+		/**
+		 * Returns a long id string.  Used in the usage.
+		 * \param val - value to be used.
+		 */
+		virtual std::string longID(const std::string& val="val") const;
+
+		/**
+		 * Operator ==.
+		 * \param a - The Arg to be compared to this.
+		 */
+		virtual bool operator==(const Arg& a) const;
+
+		/**
+		 * Pushes this to back of list rather than front.
+		 * \param argList - The list this should be added to.
+		 */
+		virtual void addToList( std::list<Arg*>& argList ) const;
+};
+
+template<class T>
+UnlabeledMultiArg<T>::UnlabeledMultiArg(const std::string& name, 
+				                        const std::string& desc, 
+										bool req,
+					                    const std::string& typeDesc,
+										bool ignoreable,
+					                    Visitor* v)
+: MultiArg<T>("", name, desc,  req, typeDesc, v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(true, toString());
+}
+
+template<class T>
+UnlabeledMultiArg<T>::UnlabeledMultiArg(const std::string& name, 
+				                        const std::string& desc, 
+										bool req,
+					                    const std::string& typeDesc,
+										CmdLineInterface& parser,
+										bool ignoreable,
+					                    Visitor* v)
+: MultiArg<T>("", name, desc,  req, typeDesc, v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(true, toString());
+	parser.add( this );
+}
+
+
+template<class T>
+UnlabeledMultiArg<T>::UnlabeledMultiArg(const std::string& name, 
+				                        const std::string& desc, 
+										bool req,
+					                    Constraint<T>* constraint,
+										bool ignoreable,
+					                    Visitor* v)
+: MultiArg<T>("", name, desc,  req, constraint, v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(true, toString());
+}
+
+template<class T>
+UnlabeledMultiArg<T>::UnlabeledMultiArg(const std::string& name, 
+				                        const std::string& desc, 
+										bool req,
+					                    Constraint<T>* constraint,
+										CmdLineInterface& parser,
+										bool ignoreable,
+					                    Visitor* v)
+: MultiArg<T>("", name, desc,  req, constraint, v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(true, toString());
+	parser.add( this );
+}
+
+
+template<class T>
+bool UnlabeledMultiArg<T>::processArg(int *i, std::vector<std::string>& args) 
+{
+
+	if ( _hasBlanks( args[*i] ) )
+		return false;
+
+	// never ignore an unlabeled multi arg
+
+
+	// always take the first value, regardless of the start string 
+	_extractValue( args[(*i)] );
+
+	/*
+	// continue taking args until we hit the end or a start string 
+	while ( (unsigned int)(*i)+1 < args.size() &&
+			args[(*i)+1].find_first_of( Arg::flagStartString() ) != 0 &&
+            args[(*i)+1].find_first_of( Arg::nameStartString() ) != 0 ) 
+		_extractValue( args[++(*i)] );
+	*/
+
+	_alreadySet = true;
+
+	return true;
+}
+
+template<class T>
+std::string UnlabeledMultiArg<T>::shortID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return std::string("<") + _typeDesc + "> ...";
+}
+
+template<class T>
+std::string UnlabeledMultiArg<T>::longID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return std::string("<") + _typeDesc + ">  (accepted multiple times)";
+}
+
+template<class T>
+bool UnlabeledMultiArg<T>::operator==(const Arg& a) const
+{
+	if ( _name == a.getName() || _description == a.getDescription() )
+		return true;
+	else
+		return false;
+}
+
+template<class T>
+void UnlabeledMultiArg<T>::addToList( std::list<Arg*>& argList ) const
+{
+	argList.push_back( const_cast<Arg*>(static_cast<const Arg* const>(this)) );
+}
+
+}
+
+#endif
diff --git a/external/tclap/UnlabeledValueArg.h b/external/tclap/UnlabeledValueArg.h
new file mode 100644
index 0000000..5721d61
--- /dev/null
+++ b/external/tclap/UnlabeledValueArg.h
@@ -0,0 +1,340 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  UnlabeledValueArg.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_UNLABELED_VALUE_ARGUMENT_H
+#define TCLAP_UNLABELED_VALUE_ARGUMENT_H
+
+#include <string>
+#include <vector>
+
+#include <tclap/ValueArg.h>
+#include <tclap/OptionalUnlabeledTracker.h>
+
+
+namespace TCLAP {
+
+/**
+ * The basic unlabeled argument that parses a value.
+ * This is a template class, which means the type T defines the type
+ * that a given object will attempt to parse when an UnlabeledValueArg
+ * is reached in the list of args that the CmdLine iterates over.
+ */
+template<class T>
+class UnlabeledValueArg : public ValueArg<T>
+{
+
+	// If compiler has two stage name lookup (as gcc >= 3.4 does)
+	// this is required to prevent undef. symbols
+	using ValueArg<T>::_ignoreable;
+	using ValueArg<T>::_hasBlanks;
+	using ValueArg<T>::_extractValue;
+	using ValueArg<T>::_typeDesc;
+	using ValueArg<T>::_name;
+	using ValueArg<T>::_description;
+	using ValueArg<T>::_alreadySet;
+	using ValueArg<T>::toString;
+
+	public:
+
+		/**
+		 * UnlabeledValueArg constructor.
+		 * \param name - A one word name for the argument.  Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 * line.
+		 * \param value - The default value assigned to this argument if it
+		 * is not present on the command line.
+		 * \param typeDesc - A short, human readable description of the
+		 * type that this object expects.  This is used in the generation
+		 * of the USAGE statement.  The goal is to be helpful to the end user
+		 * of the program.
+		 * \param ignoreable - Allows you to specify that this argument can be
+		 * ignored if the '--' flag is set.  This defaults to false (cannot
+		 * be ignored) and should  generally stay that way unless you have 
+		 * some special need for certain arguments to be ignored.
+		 * \param v - Optional Visitor.  You should leave this blank unless
+		 * you have a very good reason.
+		 */
+		UnlabeledValueArg( const std::string& name, 
+			               const std::string& desc, 
+						   bool req,
+				           T value,
+				           const std::string& typeDesc,
+						   bool ignoreable = false,
+				           Visitor* v = NULL); 
+
+		/**
+		 * UnlabeledValueArg constructor.
+		 * \param name - A one word name for the argument.  Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 * line.
+		 * \param value - The default value assigned to this argument if it
+		 * is not present on the command line.
+		 * \param typeDesc - A short, human readable description of the
+		 * type that this object expects.  This is used in the generation
+		 * of the USAGE statement.  The goal is to be helpful to the end user
+		 * of the program.
+		 * \param parser - A CmdLine parser object to add this Arg to
+		 * \param ignoreable - Allows you to specify that this argument can be
+		 * ignored if the '--' flag is set.  This defaults to false (cannot
+		 * be ignored) and should  generally stay that way unless you have 
+		 * some special need for certain arguments to be ignored.
+		 * \param v - Optional Visitor.  You should leave this blank unless
+		 * you have a very good reason.
+		 */
+		UnlabeledValueArg( const std::string& name, 
+			               const std::string& desc, 
+						   bool req,
+				           T value,
+				           const std::string& typeDesc,
+						   CmdLineInterface& parser,
+						   bool ignoreable = false,
+				           Visitor* v = NULL ); 					
+						
+		/**
+		 * UnlabeledValueArg constructor.
+		 * \param name - A one word name for the argument.  Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 * line.
+		 * \param value - The default value assigned to this argument if it
+		 * is not present on the command line.
+		 * \param constraint - A pointer to a Constraint object used
+		 * to constrain this Arg.
+		 * \param ignoreable - Allows you to specify that this argument can be
+		 * ignored if the '--' flag is set.  This defaults to false (cannot
+		 * be ignored) and should  generally stay that way unless you have 
+		 * some special need for certain arguments to be ignored.
+		 * \param v - Optional Visitor.  You should leave this blank unless
+		 * you have a very good reason.
+		 */
+		UnlabeledValueArg( const std::string& name, 
+			               const std::string& desc, 
+						   bool req,
+				           T value,
+				           Constraint<T>* constraint,
+						   bool ignoreable = false,
+				           Visitor* v = NULL ); 
+
+		
+		/**
+		 * UnlabeledValueArg constructor.
+		 * \param name - A one word name for the argument.  Note that this is used for
+		 * identification, not as a long flag.
+		 * \param desc - A description of what the argument is for or
+		 * does.
+		 * \param req - Whether the argument is required on the command
+		 * line.
+		 * \param value - The default value assigned to this argument if it
+		 * is not present on the command line.
+		 * \param constraint - A pointer to a Constraint object used
+		 * to constrain this Arg.
+		 * \param parser - A CmdLine parser object to add this Arg to
+		 * \param ignoreable - Allows you to specify that this argument can be
+		 * ignored if the '--' flag is set.  This defaults to false (cannot
+		 * be ignored) and should  generally stay that way unless you have 
+		 * some special need for certain arguments to be ignored.
+		 * \param v - Optional Visitor.  You should leave this blank unless
+		 * you have a very good reason.
+		 */
+		UnlabeledValueArg( const std::string& name, 
+			               const std::string& desc, 
+						   bool req,
+				           T value,
+				           Constraint<T>* constraint,
+						   CmdLineInterface& parser,
+						   bool ignoreable = false,
+				           Visitor* v = NULL);
+						
+		/**
+		 * Handles the processing of the argument.
+		 * This re-implements the Arg version of this method to set the
+		 * _value of the argument appropriately.  Handling specific to
+		 * unlabeled arguments.
+		 * \param i - Pointer to the current argument in the list.
+		 * \param args - Mutable list of strings. 
+		 */
+		virtual bool processArg(int* i, std::vector<std::string>& args); 
+
+		/**
+		 * Overrides shortID for specific behavior.
+		 */
+		virtual std::string shortID(const std::string& val="val") const;
+
+		/**
+		 * Overrides longID for specific behavior.
+		 */
+		virtual std::string longID(const std::string& val="val") const;
+
+		/**
+		 * Overrides operator== for specific behavior.
+		 */
+		virtual bool operator==(const Arg& a ) const;
+
+		/**
+		 * Instead of pushing to the front of list, push to the back.
+		 * \param argList - The list to add this to.
+		 */
+		virtual void addToList( std::list<Arg*>& argList ) const;
+
+};
+
+/**
+ * Constructor implementation.
+ */
+template<class T>
+UnlabeledValueArg<T>::UnlabeledValueArg(const std::string& name, 
+					                    const std::string& desc, 
+										bool req,
+					                    T val,
+					                    const std::string& typeDesc,
+					                    bool ignoreable,
+					                    Visitor* v)
+: ValueArg<T>("", name, desc, req, val, typeDesc, v)
+{ 
+	_ignoreable = ignoreable;
+
+	OptionalUnlabeledTracker::check(req, toString());
+
+}
+
+template<class T>
+UnlabeledValueArg<T>::UnlabeledValueArg(const std::string& name, 
+					                    const std::string& desc, 
+										bool req,
+					                    T val,
+					                    const std::string& typeDesc,
+					                    CmdLineInterface& parser,
+					                    bool ignoreable,
+					                    Visitor* v)
+: ValueArg<T>("", name, desc, req, val, typeDesc, v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(req, toString());
+	parser.add( this );
+}
+
+/**
+ * Constructor implementation.
+ */
+template<class T>
+UnlabeledValueArg<T>::UnlabeledValueArg(const std::string& name, 
+                                        const std::string& desc, 
+										bool req,
+                                        T val,
+                                        Constraint<T>* constraint,
+                                        bool ignoreable,
+                                        Visitor* v)
+: ValueArg<T>("", name, desc, req, val, constraint, v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(req, toString());
+}
+
+template<class T>
+UnlabeledValueArg<T>::UnlabeledValueArg(const std::string& name, 
+					                    const std::string& desc, 
+										bool req,
+					                    T val,
+					                    Constraint<T>* constraint,
+					                    CmdLineInterface& parser,
+					                    bool ignoreable,
+					                    Visitor* v)
+: ValueArg<T>("", name, desc, req, val, constraint,  v)
+{ 
+	_ignoreable = ignoreable;
+	OptionalUnlabeledTracker::check(req, toString());
+	parser.add( this );
+}
+
+/**
+ * Implementation of processArg().
+ */
+template<class T>
+bool UnlabeledValueArg<T>::processArg(int *i, std::vector<std::string>& args) 
+{
+	
+	if ( _alreadySet )
+		return false;
+	
+	if ( _hasBlanks( args[*i] ) )
+		return false;
+
+	// never ignore an unlabeled arg
+	
+	_extractValue( args[*i] );
+	_alreadySet = true;
+	return true;
+}
+
+/**
+ * Overriding shortID for specific output.
+ */
+template<class T>
+std::string UnlabeledValueArg<T>::shortID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return std::string("<") + _typeDesc + ">";
+}
+
+/**
+ * Overriding longID for specific output.
+ */
+template<class T>
+std::string UnlabeledValueArg<T>::longID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+
+	// Ideally we would like to be able to use RTTI to return the name
+	// of the type required for this argument.  However, g++ at least, 
+	// doesn't appear to return terribly useful "names" of the types.  
+	return std::string("<") + _typeDesc + ">";
+}
+
+/**
+ * Overriding operator== for specific behavior.
+ */
+template<class T>
+bool UnlabeledValueArg<T>::operator==(const Arg& a ) const
+{
+	if ( _name == a.getName() || _description == a.getDescription() )
+		return true;
+	else
+		return false;
+}
+
+template<class T>
+void UnlabeledValueArg<T>::addToList( std::list<Arg*>& argList ) const
+{
+	argList.push_back( const_cast<Arg*>(static_cast<const Arg* const>(this)) );
+}
+
+}
+#endif
diff --git a/external/tclap/ValueArg.h b/external/tclap/ValueArg.h
new file mode 100644
index 0000000..7ac2952
--- /dev/null
+++ b/external/tclap/ValueArg.h
@@ -0,0 +1,425 @@
+/****************************************************************************** 
+ * 
+ *  file:  ValueArg.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reverved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_VALUE_ARGUMENT_H
+#define TCLAP_VALUE_ARGUMENT_H
+
+#include <string>
+#include <vector>
+
+#include <tclap/Arg.h>
+#include <tclap/Constraint.h>
+
+namespace TCLAP {
+
+/**
+ * The basic labeled argument that parses a value.
+ * This is a template class, which means the type T defines the type
+ * that a given object will attempt to parse when the flag/name is matched
+ * on the command line.  While there is nothing stopping you from creating
+ * an unflagged ValueArg, it is unwise and would cause significant problems.
+ * Instead use an UnlabeledValueArg.
+ */
+template<class T>
+class ValueArg : public Arg 
+{
+    protected:
+
+        /**
+         * The value parsed from the command line.
+         * Can be of any type, as long as the >> operator for the type
+         * is defined.
+         */
+        T _value;
+
+		/**
+		 * Used to support the reset() method so that ValueArg can be
+		 * reset to their constructed value.
+		 */
+        T _default;
+
+        /**
+         * A human readable description of the type to be parsed.
+         * This is a hack, plain and simple.  Ideally we would use RTTI to
+         * return the name of type T, but until there is some sort of
+         * consistent support for human readable names, we are left to our
+         * own devices.
+         */
+        std::string _typeDesc;
+
+        /**
+         * A Constraint this Arg must conform to. 
+         */
+        Constraint<T>* _constraint;
+
+        /**
+         * Extracts the value from the string.
+         * Attempts to parse string as type T, if this fails an exception
+         * is thrown.
+         * \param val - value to be parsed. 
+         */
+        void _extractValue( const std::string& val );
+
+	public:
+
+        /**
+         * Labeled ValueArg constructor.
+         * You could conceivably call this constructor with a blank flag, 
+         * but that would make you a bad person.  It would also cause
+         * an exception to be thrown.   If you want an unlabeled argument, 
+         * use the other constructor.
+         * \param flag - The one character flag that identifies this
+         * argument on the command line.
+         * \param name - A one word name for the argument.  Can be
+         * used as a long flag on the command line.
+         * \param desc - A description of what the argument is for or
+         * does.
+         * \param req - Whether the argument is required on the command
+         * line.
+         * \param value - The default value assigned to this argument if it
+         * is not present on the command line.
+         * \param typeDesc - A short, human readable description of the
+         * type that this object expects.  This is used in the generation
+         * of the USAGE statement.  The goal is to be helpful to the end user
+         * of the program.
+         * \param v - An optional visitor.  You probably should not
+         * use this unless you have a very good reason.
+         */
+        ValueArg( const std::string& flag, 
+                  const std::string& name, 
+                  const std::string& desc, 
+                  bool req, 
+                  T value,
+                  const std::string& typeDesc,
+                  Visitor* v = NULL);
+				 
+				 
+        /**
+         * Labeled ValueArg constructor.
+         * You could conceivably call this constructor with a blank flag, 
+         * but that would make you a bad person.  It would also cause
+         * an exception to be thrown.   If you want an unlabeled argument, 
+         * use the other constructor.
+         * \param flag - The one character flag that identifies this
+         * argument on the command line.
+         * \param name - A one word name for the argument.  Can be
+         * used as a long flag on the command line.
+         * \param desc - A description of what the argument is for or
+         * does.
+         * \param req - Whether the argument is required on the command
+         * line.
+         * \param value - The default value assigned to this argument if it
+         * is not present on the command line.
+         * \param typeDesc - A short, human readable description of the
+         * type that this object expects.  This is used in the generation
+         * of the USAGE statement.  The goal is to be helpful to the end user
+         * of the program.
+         * \param parser - A CmdLine parser object to add this Arg to
+         * \param v - An optional visitor.  You probably should not
+         * use this unless you have a very good reason.
+         */
+        ValueArg( const std::string& flag, 
+                  const std::string& name, 
+                  const std::string& desc, 
+                  bool req, 
+                  T value,
+                  const std::string& typeDesc,
+                  CmdLineInterface& parser,
+                  Visitor* v = NULL );
+ 
+        /**
+         * Labeled ValueArg constructor.
+         * You could conceivably call this constructor with a blank flag, 
+         * but that would make you a bad person.  It would also cause
+         * an exception to be thrown.   If you want an unlabeled argument, 
+         * use the other constructor.
+         * \param flag - The one character flag that identifies this
+         * argument on the command line.
+         * \param name - A one word name for the argument.  Can be
+         * used as a long flag on the command line.
+         * \param desc - A description of what the argument is for or
+         * does.
+         * \param req - Whether the argument is required on the command
+         * line.
+         * \param value - The default value assigned to this argument if it
+         * is not present on the command line.
+         * \param constraint - A pointer to a Constraint object used
+		 * to constrain this Arg.
+         * \param parser - A CmdLine parser object to add this Arg to.
+         * \param v - An optional visitor.  You probably should not
+         * use this unless you have a very good reason.
+         */
+        ValueArg( const std::string& flag, 
+                  const std::string& name, 
+                  const std::string& desc, 
+                  bool req, 
+                  T value,
+                  Constraint<T>* constraint,
+                  CmdLineInterface& parser,
+                  Visitor* v = NULL );
+	  
+        /**
+         * Labeled ValueArg constructor.
+         * You could conceivably call this constructor with a blank flag, 
+         * but that would make you a bad person.  It would also cause
+         * an exception to be thrown.   If you want an unlabeled argument, 
+         * use the other constructor.
+         * \param flag - The one character flag that identifies this
+         * argument on the command line.
+         * \param name - A one word name for the argument.  Can be
+         * used as a long flag on the command line.
+         * \param desc - A description of what the argument is for or
+         * does.
+         * \param req - Whether the argument is required on the command
+         * line.
+         * \param value - The default value assigned to this argument if it
+         * is not present on the command line.
+         * \param constraint - A pointer to a Constraint object used
+		 * to constrain this Arg.
+         * \param v - An optional visitor.  You probably should not
+         * use this unless you have a very good reason.
+         */
+        ValueArg( const std::string& flag, 
+                  const std::string& name, 
+                  const std::string& desc, 
+                  bool req, 
+                  T value,
+                  Constraint<T>* constraint,
+                  Visitor* v = NULL );
+
+        /**
+         * Handles the processing of the argument.
+         * This re-implements the Arg version of this method to set the
+         * _value of the argument appropriately.  It knows the difference
+         * between labeled and unlabeled.
+         * \param i - Pointer to the current argument in the list.
+         * \param args - Mutable list of strings. Passed 
+         * in from main().
+         */
+        virtual bool processArg(int* i, std::vector<std::string>& args); 
+
+        /**
+         * Returns the value of the argument.
+         */
+        T& getValue() ;
+
+        /**
+         * Specialization of shortID.
+         * \param val - value to be used.
+         */
+        virtual std::string shortID(const std::string& val = "val") const;
+
+        /**
+         * Specialization of longID.
+         * \param val - value to be used.
+         */
+        virtual std::string longID(const std::string& val = "val") const;
+        
+        virtual void reset() ;
+
+private:
+       /**
+        * Prevent accidental copying
+        */
+       ValueArg<T>(const ValueArg<T>& rhs);
+       ValueArg<T>& operator=(const ValueArg<T>& rhs);
+};
+
+
+/**
+ * Constructor implementation.
+ */
+// No-parser, no-constraint form: the caller must add() this Arg to a
+// CmdLine manually.  val seeds both the live value and the reset() default.
+template<class T>
+ValueArg<T>::ValueArg(const std::string& flag, 
+                      const std::string& name, 
+                      const std::string& desc, 
+                      bool req, 
+                      T val,
+                      const std::string& typeDesc,
+                      Visitor* v)
+: Arg(flag, name, desc, req, true, v),
+  _value( val ),
+  _default( val ),
+  _typeDesc( typeDesc ),
+  _constraint( NULL )
+{ }
+
+// Same as the four-argument form above, but registers this Arg with the
+// given parser immediately via parser.add(this).
+template<class T>
+ValueArg<T>::ValueArg(const std::string& flag, 
+                      const std::string& name, 
+                      const std::string& desc, 
+                      bool req, 
+                      T val,
+                      const std::string& typeDesc,
+                      CmdLineInterface& parser,
+                      Visitor* v)
+: Arg(flag, name, desc, req, true, v),
+  _value( val ),
+  _default( val ),
+  _typeDesc( typeDesc ),
+  _constraint( NULL )
+{ 
+    parser.add( this );
+}
+
+// Constraint form: the type description shown in usage output is taken
+// from the constraint's shortID().
+// NOTE(review): constraint is dereferenced unconditionally here — passing
+// NULL is undefined behavior; confirm callers never do.
+template<class T>
+ValueArg<T>::ValueArg(const std::string& flag, 
+                      const std::string& name, 
+                      const std::string& desc, 
+                      bool req, 
+                      T val,
+                      Constraint<T>* constraint,
+                      Visitor* v)
+: Arg(flag, name, desc, req, true, v),
+  _value( val ),
+  _default( val ),
+  _typeDesc( constraint->shortID() ),
+  _constraint( constraint )
+{ }
+
+// Constraint form that also registers with the parser.
+// NOTE(review): like the form above, constraint must not be NULL — it is
+// dereferenced in the initializer list.
+template<class T>
+ValueArg<T>::ValueArg(const std::string& flag, 
+                      const std::string& name, 
+                      const std::string& desc, 
+                      bool req, 
+                      T val,
+                      Constraint<T>* constraint,
+                      CmdLineInterface& parser,
+                      Visitor* v)
+: Arg(flag, name, desc, req, true, v),
+  _value( val ),
+  _default( val ),
+  _typeDesc( constraint->shortID() ),
+  _constraint( constraint )
+{ 
+    parser.add( this );
+}
+
+
+/**
+ * Implementation of getValue().
+ */
+// Returns a mutable reference to the current value (the default until the
+// argument is parsed); valid as long as this Arg object lives.
+template<class T>
+T& ValueArg<T>::getValue() { return _value; }
+
+/**
+ * Implementation of processArg().
+ */
+// Parses one command-line token: matches the flag, extracts the value from
+// either the delimiter-joined token or the next argv entry, and guards
+// against duplicate or mutually-exclusive re-use.  Returns true iff this
+// Arg consumed the token.
+template<class T>
+bool ValueArg<T>::processArg(int *i, std::vector<std::string>& args)
+{
+    // Respect "--" (ignore-rest) for ignoreable arguments.
+    if ( _ignoreable && Arg::ignoreRest() )
+		return false;
+
+    // Tokens containing blank placeholders are never flags.
+    if ( _hasBlanks( args[*i] ) )
+		return false;
+
+    std::string flag = args[*i];
+
+    // Split "flag<delim>value" into its halves; value may remain empty.
+    std::string value = "";
+    trimFlag( flag, value );
+
+    if ( argMatches( flag ) )
+    {
+        if ( _alreadySet )
+		{
+			if ( _xorSet )
+				throw( CmdLineParseException(
+				       "Mutually exclusive argument already set!", 
+				                             toString()) );
+			else
+				throw( CmdLineParseException("Argument already set!", 
+				                             toString()) );
+		}
+
+        // A non-space delimiter makes the inline "flag<delim>value" form mandatory.
+        if ( Arg::delimiter() != ' ' && value == "" )
+			throw( ArgParseException( 
+							"Couldn't find delimiter for this argument!",
+                             toString() ) );
+
+        if ( value == "" )
+        {
+            // Space-delimited: the value is the next argv entry.
+            (*i)++;
+            if ( static_cast<unsigned int>(*i) < args.size() ) 
+				_extractValue( args[*i] );
+            else
+				throw( ArgParseException("Missing a value for this argument!",
+                                                    toString() ) );
+        }
+        else
+			_extractValue( value );
+				
+        _alreadySet = true;
+        _checkWithVisitor();
+        return true;
+    }	
+    else
+		return false;
+}
+
+/**
+ * Implementation of shortID.
+ */
+// Substitutes the stored type description for the caller-supplied value.
+template<class T>
+std::string ValueArg<T>::shortID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return Arg::shortID( _typeDesc ); 
+}
+
+/**
+ * Implementation of longID.
+ */
+// Substitutes the stored type description for the caller-supplied value.
+template<class T>
+std::string ValueArg<T>::longID(const std::string& val) const
+{
+	static_cast<void>(val); // Ignore input, don't warn
+	return Arg::longID( _typeDesc ); 
+}
+
+/**
+ * Converts the raw string val into _value using the ArgTraits-selected
+ * extractor, re-throwing parse failures with this Arg's identifier
+ * attached, then validates the result against the optional Constraint.
+ */
+template<class T>
+void ValueArg<T>::_extractValue( const std::string& val ) 
+{
+    try {
+	ExtractValue(_value, val, typename ArgTraits<T>::ValueCategory());
+    } catch( ArgParseException &e) {
+	// Attach toString() so the user sees which argument failed to parse.
+	throw ArgParseException(e.error(), toString());
+    }
+    
+    if ( _constraint != NULL )
+	if ( ! _constraint->check( _value ) )
+	    // Fix: the original had a stray '+' before the literal below
+	    // ("... + val + + \"'...\""), which applied unary plus to the
+	    // char array (pointer decay) instead of a second concatenation.
+	    throw( CmdLineParseException( "Value '" + val + 
+					  "' does not meet constraint: " 
+					  + _constraint->description(),
+					  toString() ) );
+}
+
+// Restores the pristine state: clears the base Arg flags and reinstates
+// the default value captured at construction.
+template<class T>
+void ValueArg<T>::reset()
+{
+	Arg::reset();
+	_value = _default;
+}
+
+} // namespace TCLAP
+
+#endif
diff --git a/external/tclap/ValuesConstraint.h b/external/tclap/ValuesConstraint.h
new file mode 100644
index 0000000..cb41f64
--- /dev/null
+++ b/external/tclap/ValuesConstraint.h
@@ -0,0 +1,148 @@
+
+
+/****************************************************************************** 
+ * 
+ *  file:  ValuesConstraint.h
+ * 
+ *  Copyright (c) 2005, Michael E. Smoot
+ *  All rights reserved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_VALUESCONSTRAINT_H
+#define TCLAP_VALUESCONSTRAINT_H
+
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <tclap/Constraint.h>
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#else
+#define HAVE_SSTREAM
+#endif
+
+#if defined(HAVE_SSTREAM)
+#include <sstream>
+#elif defined(HAVE_STRSTREAM)
+#include <strstream>
+#else
+#error "Need a stringstream (sstream or strstream) to compile!"
+#endif
+
+namespace TCLAP {
+
+/**
+ * A Constraint that constrains the Arg to only those values specified
+ * in the constraint.
+ */
+template<class T>
+class ValuesConstraint : public Constraint<T>
+{
+
+	public:
+
+		/**
+		 * Constructor. 
+		 * \param allowed - vector of allowed values. 
+		 */
+		ValuesConstraint(std::vector<T>& allowed);	
+
+		/**
+		 * Virtual destructor.
+		 */
+		virtual ~ValuesConstraint() {}
+
+		/**
+		 * Returns a description of the Constraint. 
+		 */
+		virtual std::string description() const;
+
+		/**
+		 * Returns the short ID for the Constraint.
+		 */
+		virtual std::string shortID() const;
+
+		/**
+		 * The method used to verify that the value parsed from the command
+		 * line meets the constraint.
+		 * \param value - The value that will be checked. 
+		 */
+		virtual bool check(const T& value) const;
+	
+	protected:
+
+		/**
+		 * The list of valid values. 
+		 */
+		std::vector<T> _allowed;
+
+		/**
+		 * The string used to describe the allowed values of this constraint.
+		 */
+		std::string _typeDesc;
+
+};
+
+// Copies the allowed values and builds the pipe-separated "a|b|c"
+// description string by streaming each value through an output stream.
+template<class T>
+ValuesConstraint<T>::ValuesConstraint(std::vector<T>& allowed)
+: _allowed(allowed),
+  _typeDesc("")
+{ 
+    for ( unsigned int i = 0; i < _allowed.size(); i++ )
+    {
+
+#if defined(HAVE_SSTREAM)
+        std::ostringstream os;
+#elif defined(HAVE_STRSTREAM)
+        // NOTE(review): on the strstream fallback, str() freezes the buffer
+        // without a matching unfreeze — presumably leaks; confirm if the
+        // pre-sstream path is still supported.
+        std::ostrstream os;
+#else
+#error "Need a stringstream (sstream or strstream) to compile!"
+#endif
+
+        os << _allowed[i];
+
+        std::string temp( os.str() ); 
+
+        if ( i > 0 )
+			_typeDesc += "|";
+        _typeDesc += temp;
+    }
+}
+
+// True iff val compares equal to one of the allowed values
+// (linear scan via std::find, which requires <algorithm>).
+template<class T>
+bool ValuesConstraint<T>::check( const T& val ) const
+{
+	if ( std::find(_allowed.begin(),_allowed.end(),val) == _allowed.end() )
+		return false;
+	else 
+		return true;
+}
+
+// The short ID is the precomputed "a|b|c" list of allowed values.
+template<class T>
+std::string ValuesConstraint<T>::shortID() const
+{
+    return _typeDesc;	
+}
+
+// The description is identical to the short ID for this constraint type.
+template<class T>
+std::string ValuesConstraint<T>::description() const
+{
+    return _typeDesc;	
+}
+
+
+} //namespace TCLAP
+#endif 
+
diff --git a/external/tclap/VersionVisitor.h b/external/tclap/VersionVisitor.h
new file mode 100644
index 0000000..c110d4f
--- /dev/null
+++ b/external/tclap/VersionVisitor.h
@@ -0,0 +1,81 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/****************************************************************************** 
+ * 
+ *  file:  VersionVisitor.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  All rights reserved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_VERSION_VISITOR_H
+#define TCLAP_VERSION_VISITOR_H
+
+#include <tclap/CmdLineInterface.h>
+#include <tclap/CmdLineOutput.h>
+#include <tclap/Visitor.h>
+
+namespace TCLAP {
+
+/**
+ * A Vistor that will call the version method of the given CmdLineOutput
+ * for the specified CmdLine object and then exit.
+ */
+class VersionVisitor: public Visitor
+{
+	private:
+		/**
+		 * Prevent accidental copying
+		 */
+		VersionVisitor(const VersionVisitor& rhs);
+		VersionVisitor& operator=(const VersionVisitor& rhs);
+
+	protected:
+
+		/**
+		 * The CmdLine of interest.
+		 */
+		CmdLineInterface* _cmd;
+
+		/**
+		 * The output object. 
+		 */
+		CmdLineOutput** _out;
+
+	public:
+
+		/**
+		 * Constructor.
+		 * \param cmd - The CmdLine the output is generated for. 
+		 * \param out - The type of output. 
+		 */
+		VersionVisitor( CmdLineInterface* cmd, CmdLineOutput** out ) 
+				: Visitor(), _cmd( cmd ), _out( out ) { }
+
+		/**
+		 * Calls the version method of the output object using the
+		 * specified CmdLine.
+		 */
+		void visit() { 
+		    (*_out)->version(*_cmd); 
+		    throw ExitException(0); 
+		}
+
+};
+
+}
+
+#endif
diff --git a/external/tclap/Visitor.h b/external/tclap/Visitor.h
new file mode 100644
index 0000000..38ddcbd
--- /dev/null
+++ b/external/tclap/Visitor.h
@@ -0,0 +1,53 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  Visitor.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  All rights reserved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+
+#ifndef TCLAP_VISITOR_H
+#define TCLAP_VISITOR_H
+
+namespace TCLAP {
+
+/**
+ * A base class that defines the interface for visitors.
+ */
+class Visitor
+{
+	public:
+
+		/**
+		 * Constructor. Does nothing.
+		 */
+		Visitor() { }
+
+		/**
+		 * Destructor. Does nothing.
+		 */
+		virtual ~Visitor() { }
+
+		/**
+		 * Does nothing. Should be overridden by child.
+		 */
+		virtual void visit() { }
+};
+
+}
+
+#endif
diff --git a/external/tclap/XorHandler.h b/external/tclap/XorHandler.h
new file mode 100644
index 0000000..d9dfad3
--- /dev/null
+++ b/external/tclap/XorHandler.h
@@ -0,0 +1,166 @@
+
+/****************************************************************************** 
+ * 
+ *  file:  XorHandler.h
+ * 
+ *  Copyright (c) 2003, Michael E. Smoot .
+ *  Copyright (c) 2004, Michael E. Smoot, Daniel Aarno.
+ *  All rights reserved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.  
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_XORHANDLER_H
+#define TCLAP_XORHANDLER_H
+
+#include <tclap/Arg.h>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+namespace TCLAP {
+
+/**
+ * This class handles lists of Arg's that are to be XOR'd on the command
+ * line.  This is used by CmdLine and you shouldn't ever use it.
+ */
+class XorHandler
+{
+	protected:
+
+		/**
+		 * The list of of lists of Arg's to be or'd together.
+		 */
+		std::vector< std::vector<Arg*> > _orList;
+
+	public:
+
+		/**
+		 * Constructor.  Does nothing.
+		 */
+		XorHandler( ) : _orList(std::vector< std::vector<Arg*> >()) {}
+
+		/**
+		 * Add a list of Arg*'s that will be orred together.
+		 * \param ors - list of Arg* that will be xor'd.
+		 */
+		void add( std::vector<Arg*>& ors );
+			
+		/**
+		 * Checks whether the specified Arg is in one of the xor lists and
+		 * if it does match one, returns the size of the xor list that the
+		 * Arg matched.  If the Arg matches, then it also sets the rest of
+		 * the Arg's in the list. You shouldn't use this.  
+		 * \param a - The Arg to be checked.
+		 */
+		int check( const Arg* a );
+
+		/**
+		 * Returns the XOR specific short usage.
+		 */
+		std::string shortUsage();
+
+		/**
+		 * Prints the XOR specific long usage.
+		 * \param os - Stream to print to.
+		 */
+		void printLongUsage(std::ostream& os);
+
+		/**
+		 * Simply checks whether the Arg is contained in one of the arg
+		 * lists.
+		 * \param a - The Arg to be checked.
+		 */
+		bool contains( const Arg* a );
+
+		std::vector< std::vector<Arg*> >& getXorList(); 
+
+};
+
+
+//////////////////////////////////////////////////////////////////////
+//BEGIN XOR.cpp
+//////////////////////////////////////////////////////////////////////
+// Registers one XOR group (the vector is stored by value).
+inline void XorHandler::add( std::vector<Arg*>& ors )
+{ 
+	_orList.push_back( ors );
+}
+
+// Matches a against every XOR group.  On a hit it first rejects the parse
+// if another member of the group is already set, then marks every other
+// member as xor-set so it is no longer "required", and returns how many
+// required-arg slots the group satisfies.
+inline int XorHandler::check( const Arg* a ) 
+{
+	// iterate over each XOR list
+	for ( int i = 0; static_cast<unsigned int>(i) < _orList.size(); i++ )
+	{
+		// if the XOR list contains the arg..
+		ArgVectorIterator ait = std::find( _orList[i].begin(), 
+		                                   _orList[i].end(), a );
+		if ( ait != _orList[i].end() )
+		{
+			// first check to see if a mutually exclusive switch
+			// has not already been set
+			for ( ArgVectorIterator it = _orList[i].begin(); 
+				  it != _orList[i].end(); 
+				  it++ )
+				if ( a != (*it) && (*it)->isSet() )
+					throw(CmdLineParseException(
+					      "Mutually exclusive argument already set!",
+					      (*it)->toString()));
+
+			// go through and set each arg that is not a
+			for ( ArgVectorIterator it = _orList[i].begin(); 
+				  it != _orList[i].end(); 
+				  it++ )
+				if ( a != (*it) )
+					(*it)->xorSet();
+
+			// return the number of required args that have now been set
+			if ( (*ait)->allowMore() )
+				return 0;
+			else
+				return static_cast<int>(_orList[i].size());
+		}
+	}
+
+	// Not in any XOR group: a required arg satisfies one requirement,
+	// an optional arg satisfies none.
+	if ( a->isRequired() )
+		return 1;
+	else
+		return 0;
+}
+
+// Reports whether a appears (by pointer identity) in any XOR group.
+inline bool XorHandler::contains( const Arg* a )
+{
+	for ( unsigned int group = 0; group < _orList.size(); group++ )
+	{
+		for ( ArgVectorIterator member = _orList[group].begin();
+			  member != _orList[group].end();
+			  ++member )
+		{
+			if ( (*member) == a )
+				return true;
+		}
+	}
+
+	return false;
+}
+
+// Exposes the stored XOR groups for usage/completion generators.
+inline std::vector< std::vector<Arg*> >& XorHandler::getXorList() 
+{
+	return _orList;
+}
+
+
+
+//////////////////////////////////////////////////////////////////////
+//END XOR.cpp
+//////////////////////////////////////////////////////////////////////
+
+} //namespace TCLAP
+
+#endif 
diff --git a/external/tclap/ZshCompletionOutput.h b/external/tclap/ZshCompletionOutput.h
new file mode 100644
index 0000000..0b37fc7
--- /dev/null
+++ b/external/tclap/ZshCompletionOutput.h
@@ -0,0 +1,323 @@
+// -*- Mode: c++; c-basic-offset: 4; tab-width: 4; -*-
+
+/****************************************************************************** 
+ * 
+ *  file:  ZshCompletionOutput.h
+ * 
+ *  Copyright (c) 2006, Oliver Kiddle
+ *  All rights reserved.
+ * 
+ *  See the file COPYING in the top directory of this distribution for
+ *  more information.
+ *  
+ *  THE SOFTWARE IS PROVIDED _AS IS_, WITHOUT WARRANTY OF ANY KIND, EXPRESS 
+ *  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ *  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ *  DEALINGS IN THE SOFTWARE.
+ *  
+ *****************************************************************************/ 
+
+#ifndef TCLAP_ZSHCOMPLETIONOUTPUT_H
+#define TCLAP_ZSHCOMPLETIONOUTPUT_H
+
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include <map>
+
+#include <tclap/CmdLineInterface.h>
+#include <tclap/CmdLineOutput.h>
+#include <tclap/XorHandler.h>
+#include <tclap/Arg.h>
+
+namespace TCLAP {
+
+/**
+ * A class that generates a Zsh completion function as output from the usage()
+ * method for the given CmdLine and its Args.
+ */
+class ZshCompletionOutput : public CmdLineOutput
+{
+
+	public:
+
+		ZshCompletionOutput();
+
+		/**
+		 * Prints the usage to stdout.  Can be overridden to 
+		 * produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void usage(CmdLineInterface& c);
+
+		/**
+		 * Prints the version to stdout. Can be overridden 
+		 * to produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 */
+		virtual void version(CmdLineInterface& c);
+
+		/**
+		 * Prints (to stderr) an error message, short usage 
+		 * Can be overridden to produce alternative behavior.
+		 * \param c - The CmdLine object the output is generated for. 
+		 * \param e - The ArgException that caused the failure. 
+		 */
+		virtual void failure(CmdLineInterface& c,
+						     ArgException& e );
+
+	protected:
+
+		void basename( std::string& s );
+		void quoteSpecialChars( std::string& s );
+
+		std::string getMutexList( CmdLineInterface& _cmd, Arg* a );
+		void printOption( Arg* it, std::string mutex );
+		void printArg( Arg* it );
+
+		std::map<std::string, std::string> common;
+		char theDelimiter;
+};
+
+// Pre-populates the name -> zsh-completion-function map used when a value
+// argument's name suggests a well-known completion category.
+ZshCompletionOutput::ZshCompletionOutput()
+: common(std::map<std::string, std::string>()),
+  theDelimiter('=')
+{
+	common["host"] = "_hosts";
+	common["hostname"] = "_hosts";
+	common["file"] = "_files";
+	common["filename"] = "_files";
+	common["user"] = "_users";
+	common["username"] = "_users";
+	common["directory"] = "_directories";
+	common["path"] = "_directories";
+	common["url"] = "_urls";
+}
+
+// Writes the plain version string to stdout; zsh output adds no decoration.
+inline void ZshCompletionOutput::version(CmdLineInterface& _cmd)
+{
+	std::cout << _cmd.getVersion() << std::endl;
+}
+
+// Emits a complete zsh "#compdef" completion function for the command:
+// positional (unlabeled) args go through printArg(), flagged options
+// through printOption() together with their mutual-exclusion lists.
+inline void ZshCompletionOutput::usage(CmdLineInterface& _cmd )
+{
+	std::list<Arg*> argList = _cmd.getArgList();
+	std::string progName = _cmd.getProgramName();
+	std::string xversion = _cmd.getVersion();
+	theDelimiter = _cmd.getDelimiter();
+	basename(progName);
+
+	std::cout << "#compdef " << progName << std::endl << std::endl <<
+		"# " << progName << " version " << _cmd.getVersion() << std::endl << std::endl <<
+		"_arguments -s -S";
+
+	for (ArgListIterator it = argList.begin(); it != argList.end(); it++)
+	{
+		// A shortID starting with '<' marks an unlabeled (positional) arg.
+		if ( (*it)->shortID().at(0) == '<' )
+			printArg((*it));
+		else if ( (*it)->getFlag() != "-" )
+			printOption((*it), getMutexList(_cmd, *it));
+	}
+
+	std::cout << std::endl;
+}
+
+// Prints the exception message for a failed parse.
+// NOTE(review): the declaration's doc comment says stderr, but this writes
+// to std::cout — confirm which is intended before changing either.
+inline void ZshCompletionOutput::failure( CmdLineInterface& _cmd,
+				                ArgException& e )
+{
+	static_cast<void>(_cmd); // unused
+	std::cout << e.what() << std::endl;
+}
+
+// Backslash-escapes every ':' and rewrites every '\'' using the '\''
+// idiom so the string can be embedded in a single-quoted zsh spec.
+inline void ZshCompletionOutput::quoteSpecialChars( std::string& s )
+{
+	size_t idx = s.find_last_of(':');
+	while ( idx != std::string::npos )
+	{
+		s.insert(idx, 1, '\\');
+		// The insert shifted the found ':' one position right, so
+		// searching from idx again yields the next-earlier colon.
+		idx = s.find_last_of(':', idx);
+	}
+	idx = s.find_last_of('\'');
+	while ( idx != std::string::npos )
+	{
+		s.insert(idx, "'\\'");
+		if (idx == 0)
+			idx = std::string::npos;
+		else
+			idx = s.find_last_of('\'', --idx);
+	}
+}
+
+// Strips any leading directory components from s, leaving the final
+// path component (no-op when s contains no '/').
+inline void ZshCompletionOutput::basename( std::string& s )
+{
+	size_t lastSlash = s.rfind('/');
+	if ( lastSlash != std::string::npos )
+		s = s.substr(lastSlash + 1);
+}
+
+// Emits one zsh positional-argument spec.  The function-local static keeps
+// a running 1-based position counter across calls within a usage() run
+// (note: it never resets, so a second usage() in one process continues
+// counting from where the first stopped).
+inline void ZshCompletionOutput::printArg(Arg* a)
+{
+	static int count = 1;
+
+	std::cout << " \\" << std::endl << "  '";
+	if ( a->acceptsMultipleValues() )
+		std::cout << '*';
+	else
+		std::cout << count++;
+	std::cout << ':';
+	// A second ':' marks the argument optional in zsh _arguments syntax.
+	if ( !a->isRequired() )
+		std::cout << ':';
+
+	std::cout << a->getName() << ':';
+	// Use a zsh built-in completer for well-known names, otherwise guard
+	// the position against flag-like input.
+	std::map<std::string, std::string>::iterator compArg = common.find(a->getName());
+	if ( compArg != common.end() )
+	{
+		std::cout << compArg->second;
+	}
+	else
+	{
+		std::cout << "_guard \"^-*\" " << a->getName();
+	}
+	std::cout << '\'';
+}
+
+// Emits one zsh option spec: exclusion list, flag/name pair, description,
+// and — for value-taking options — the value completion (either a literal
+// "(a|b|c)" choice list or a named completer from the common map).
+inline void ZshCompletionOutput::printOption(Arg* a, std::string mutex)
+{
+	std::string flag = a->flagStartChar() + a->getFlag();
+	std::string name = a->nameStartString() + a->getName();
+	std::string desc = a->getDescription();
+
+	// remove full stop and capitalisation from description as
+	// this is the convention for zsh function
+	if (!desc.compare(0, 12, "(required)  "))
+	{
+		desc.erase(0, 12);
+	}
+	if (!desc.compare(0, 15, "(OR required)  "))
+	{
+		desc.erase(0, 15);
+	}
+	size_t len = desc.length();
+	if (len && desc.at(--len) == '.')
+	{
+		desc.erase(len);
+	}
+	if (len)
+	{
+		// NOTE(review): tolower on a plain char is UB for negative values;
+		// cast through unsigned char if non-ASCII descriptions are possible.
+		desc.replace(0, 1, 1, tolower(desc.at(0)));
+	}
+
+	std::cout << " \\" << std::endl << "  '" << mutex;
+
+	if ( a->getFlag().empty() )
+	{
+		std::cout << name;
+	}
+	else
+	{
+		std::cout << "'{" << flag << ',' << name << "}'";
+	}
+	if ( theDelimiter == '=' && a->isValueRequired() )
+		std::cout << "=-";
+	quoteSpecialChars(desc);
+	std::cout << '[' << desc << ']';
+
+	if ( a->isValueRequired() )
+	{
+		// Isolate the value placeholder from shortID, stripping up to two
+		// trailing ']' and a surrounding '<...>' pair.
+		std::string arg = a->shortID();
+		arg.erase(0, arg.find_last_of(theDelimiter) + 1);
+		if ( arg.at(arg.length()-1) == ']' )
+			arg.erase(arg.length()-1);
+		if ( arg.at(arg.length()-1) == ']' )
+		{
+			arg.erase(arg.length()-1);
+		}
+		if ( arg.at(0) == '<' )
+		{
+			arg.erase(arg.length()-1);
+			arg.erase(0, 1);
+		}
+		size_t p = arg.find('|');
+		if ( p != std::string::npos )
+		{
+			// "a|b|c" alternatives become a zsh "(a b c)" choice list.
+			do
+			{
+				arg.replace(p, 1, 1, ' ');
+			}
+			while ( (p = arg.find_first_of('|', p)) != std::string::npos );
+			quoteSpecialChars(arg);
+			std::cout << ": :(" << arg << ')';
+		}
+		else
+		{
+			std::cout << ':' << arg;
+			std::map<std::string, std::string>::iterator compArg = common.find(arg);
+			if ( compArg != common.end() )
+			{
+				std::cout << ':' << compArg->second;
+			}
+		}
+	}
+
+	std::cout << '\'';
+}
+
+// Builds the zsh exclusion prefix for option a: "(-)" for help/version
+// (excludes everything), "(...)" listing the other members of a's XOR
+// group, or a self-exclusion pair for ordinary single-use options.  A
+// leading '*' marks options that accept multiple values.
+// NOTE(review): uses std::ostringstream but this header pulls <sstream>
+// in only transitively — confirm.
+inline std::string ZshCompletionOutput::getMutexList( CmdLineInterface& _cmd, Arg* a)
+{
+	XorHandler xorHandler = _cmd.getXorHandler();
+	std::vector< std::vector<Arg*> > xorList = xorHandler.getXorList();
+	
+	if (a->getName() == "help" || a->getName() == "version")
+	{
+		return "(-)";
+	}
+
+	std::ostringstream list;
+	if ( a->acceptsMultipleValues() )
+	{
+		list << '*';
+	}
+
+	for ( int i = 0; static_cast<unsigned int>(i) < xorList.size(); i++ )
+	{
+		for ( ArgVectorIterator it = xorList[i].begin();
+			it != xorList[i].end();
+			it++)
+		if ( a == (*it) )
+		{
+			list << '(';
+			for ( ArgVectorIterator iu = xorList[i].begin();
+				iu != xorList[i].end();
+				iu++ )
+			{
+				bool notCur = (*iu) != a;
+				bool hasFlag = !(*iu)->getFlag().empty();
+				if ( iu != xorList[i].begin() && (notCur || hasFlag) )
+					list << ' ';
+				if (hasFlag)
+					list << (*iu)->flagStartChar() << (*iu)->getFlag() << ' ';
+				if ( notCur || hasFlag )
+					list << (*iu)->nameStartString() << (*iu)->getName();
+			}
+			list << ')';
+			return list.str();
+		}
+	}
+	
+	// wasn't found in xor list
+	if (!a->getFlag().empty()) {
+		list << "(" << a->flagStartChar() << a->getFlag() << ' ' <<
+			a->nameStartString() << a->getName() << ')';
+	}
+	
+	return list.str();
+}
+
+} //namespace TCLAP
+#endif
diff --git a/libviennacl/CMakeLists.txt b/libviennacl/CMakeLists.txt
new file mode 100644
index 0000000..2713829
--- /dev/null
+++ b/libviennacl/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Builds the shared libviennacl BLAS-like library.  Four source sets exist
+# depending on which compute backends were enabled at configure time; CUDA
+# builds compile the .cu variants with nvcc via cuda_add_library(), while
+# OpenCL builds additionally define VIENNACL_WITH_OPENCL and link OpenCL.
+include_directories(${PROJECT_SOURCE_DIR}/libviennacl/include/)
+
+if(ENABLE_CUDA)
+
+  if(ENABLE_OPENCL)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DVIENNACL_WITH_OPENCL") #set flags before setting executable!
+    cuda_add_library(viennacl SHARED src/backend.cu
+                                     src/blas1.cu src/blas1_host.cu src/blas1_cuda.cu src/blas1_opencl.cu
+                                     src/blas2.cu src/blas2_host.cu src/blas2_cuda.cu src/blas2_opencl.cu
+                                     src/blas3.cu src/blas3_host.cu src/blas3_cuda.cu src/blas3_opencl.cu)
+    set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -DVIENNACL_WITH_CUDA")
+    target_link_libraries(viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    # CUDA only: no OpenCL sources or link dependency.
+    cuda_add_library(viennacl SHARED src/backend.cu
+                                     src/blas1.cu src/blas1_host.cu src/blas1_cuda.cu
+                                     src/blas2.cu src/blas2_host.cu src/blas2_cuda.cu
+                                     src/blas3.cu src/blas3_host.cu src/blas3_cuda.cu)
+    set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_CUDA")
+  endif(ENABLE_OPENCL)
+else(ENABLE_CUDA)
+  if(ENABLE_OPENCL)
+    add_library(viennacl SHARED src/backend.cpp
+                                src/blas1.cpp src/blas1_host.cpp src/blas1_opencl.cpp
+                                src/blas2.cpp src/blas2_host.cpp src/blas2_opencl.cpp
+                                src/blas3.cpp src/blas3_host.cpp src/blas3_opencl.cpp)
+    set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    # Host backend only.
+    add_library(viennacl SHARED src/backend.cpp
+                                src/blas1.cpp src/blas1_host.cpp
+                                src/blas2.cpp src/blas2_host.cpp
+                                src/blas3.cpp src/blas3_host.cpp)
+  endif(ENABLE_OPENCL)
+endif(ENABLE_CUDA)
diff --git a/libviennacl/include/viennacl.hpp b/libviennacl/include/viennacl.hpp
new file mode 100644
index 0000000..67a52d5
--- /dev/null
+++ b/libviennacl/include/viennacl.hpp
@@ -0,0 +1,607 @@
+#ifndef VIENNACL_VIENNACL_HPP
+#define VIENNACL_VIENNACL_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+
+// Extra export declarations when building with Visual Studio:
+#if defined(_MSC_VER)
+  #if defined(viennacl_EXPORTS)
+    #define  VIENNACL_EXPORTED_FUNCTION __declspec(dllexport)
+  #else
+    #define  VIENNACL_EXPORTED_FUNCTION __declspec(dllimport)
+  #endif /* viennacl_EXPORTS */
+#else /* defined (_MSC_VER) */
+ #define VIENNACL_EXPORTED_FUNCTION
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Integer type used throughout the C API for sizes, strides and offsets. */
+typedef int ViennaCLInt;
+
+
+/************** Enums ***************/
+
+/* Compute backend selector. */
+typedef enum
+{
+  ViennaCLCUDA,
+  ViennaCLOpenCL,
+  ViennaCLHost
+} ViennaCLBackendTypes;
+
+/* Dense matrix storage order. */
+typedef enum
+{
+  ViennaCLRowMajor,
+  ViennaCLColumnMajor
+} ViennaCLOrder;
+
+/* Whether a matrix operand is used transposed (BLAS 'trans' flag). */
+typedef enum
+{
+  ViennaCLNoTrans,
+  ViennaCLTrans
+} ViennaCLTranspose;
+
+/* Which triangle of a matrix is referenced (BLAS 'uplo' flag). */
+typedef enum
+{
+  ViennaCLUpper,
+  ViennaCLLower
+} ViennaCLUplo;
+
+/* Whether the diagonal is assumed to be all ones (BLAS 'diag' flag). */
+typedef enum
+{
+  ViennaCLUnit,
+  ViennaCLNonUnit
+} ViennaCLDiag;
+
+/* Floating-point precision of the stored data / operation. */
+typedef enum
+{
+  ViennaCLFloat,
+  ViennaCLDouble
+} ViennaCLPrecision;
+
+// Error codes:
+typedef enum
+{
+  ViennaCLSuccess = 0,
+  ViennaCLGenericFailure
+} ViennaCLStatus;
+
+
+/************* Backend Management ******************/
+
+/** @brief Generic backend for CUDA, OpenCL, host-based stuff */
+struct ViennaCLBackend_impl;
+typedef ViennaCLBackend_impl*   ViennaCLBackend;
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend);
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl;
+typedef ViennaCLHostScalar_impl*    ViennaCLHostScalar;
+
+struct ViennaCLScalar_impl;
+typedef ViennaCLScalar_impl*        ViennaCLScalar;
+
+struct ViennaCLVector_impl;
+typedef ViennaCLVector_impl*        ViennaCLVector;
+
+struct ViennaCLMatrix_impl;
+typedef ViennaCLMatrix_impl*        ViennaCLMatrix;
+
+
+/******************** BLAS Level 1 ***********************/
+
+// IxAMAX (index of the element with the largest absolute value; result written to *alpha)
+
+// Generic variant operating on ViennaCL handle objects.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *alpha, ViennaCLVector x);
+
+// CUDA backend: x is a raw pointer (device memory implied by the CUDA prefix -- confirm);
+// offx is the start offset and incx the stride, presumably in elements.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+// OpenCL backend: x is a cl_mem buffer in the backend's current context.
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *alpha,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *alpha,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+// Host backend: x is a plain host pointer.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                             ViennaCLInt *alpha,
+                                                             double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+// xASUM (sum of absolute values; per BLAS ?asum convention, result written to *alpha)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x);
+
+// Backend-specific variants follow the same pattern as IxAMAX above:
+// CUDA (raw pointers), OpenCL (cl_mem), Host (host pointers); offx/incx = offset/stride.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+
+// xAXPY (y <- alpha * x + y, per BLAS ?axpy convention; alpha is passed by value here)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+
+// xCOPY (y <- x, per BLAS ?copy convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+#ifdef VIENNACL_WITH_OPENCL
+/* OpenCL xCOPY: x and y are cl_mem buffers; offsets/strides per the other L1 routines.
+   (Dcopy's continuation lines were mis-indented relative to every sibling declaration;
+   realigned for consistency -- no functional change.) */
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+// Host xCOPY variants (plain host pointers).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xDOT (inner product of x and y; result written to *alpha, per BLAS ?dot convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             float *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             double *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xNRM2 (Euclidean norm of x; result written to *alpha, per BLAS ?nrm2 convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xROT (applies a plane rotation with cosine c and sine s to x and y, per BLAS ?rot convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector     x,     ViennaCLVector y,
+                                                      ViennaCLHostScalar c, ViennaCLHostScalar s);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double c, double s);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double c, double s);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double c, double s);
+
+
+
+// xSCAL (x <- alpha * x, per BLAS ?scal convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xSWAP (exchanges the contents of x and y, per BLAS ?swap convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+
+
+/******************** BLAS Level 2 ***********************/
+
+// xGEMV: y <- alpha * Ax + beta * y
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y);
+
+// Backend variants take an explicit storage description for A:
+// order (row/column major), transA, submatrix offsets offA_row/offA_col,
+// per-dimension strides incA_row/incA_col, and leading dimension lda
+// (interpretation presumed from the names -- confirm against the implementation).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              float beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              double beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend backend,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xTRSV: in-place triangular solve of A*x = b, i.e. x <- A^{-1} x (BLAS ?trsv convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo);
+
+// uplo selects the upper/lower triangle of A; diag presumably selects unit vs
+// non-unit diagonal (as in standard BLAS) -- confirm in the implementation.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAStrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADtrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend backend,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xGER: rank-1 update, A <- alpha * x * y^T + A (BLAS ?ger convention)
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m,  ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             float alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+/* NOTE(review): ViennaCLOpenCLDger declares `alpha` AFTER the x/y arguments,
+   unlike ViennaCLOpenCLSger and every other *ger variant in this header
+   (which take alpha before x). This looks like a declaration bug; verify
+   against the implementation before relying on this signature. Reordering
+   here would change the exported ABI, so it is only flagged, not changed. */
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+#endif
+
+// Host xGER variants (plain host pointers).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend backend,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+
+
+
+/******************** BLAS Level 3 ***********************/
+
+// xGEMM: C <- alpha * AB + beta * C
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C);
+
+// Backend variants: each of A, B, C carries its own storage order, submatrix
+// offsets, per-dimension strides and leading dimension (names follow the
+// gemv/trsv declarations above). OpenCL/Host variants continue below.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              float alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              float beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              double alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              double beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/libviennacl/src/backend.cpp b/libviennacl/src/backend.cpp
new file mode 100644
index 0000000..c9f6bf4
--- /dev/null
+++ b/libviennacl/src/backend.cpp
@@ -0,0 +1,46 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend)
+{
+  *backend = new ViennaCLBackend_impl();
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id)
+{
+  backend->opencl_backend.context_id = context_id;
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend)
+{
+  delete *backend;
+  *backend = NULL;
+
+  return ViennaCLSuccess;
+}
+
diff --git a/libviennacl/src/backend.cu b/libviennacl/src/backend.cu
new file mode 100644
index 0000000..c9f6bf4
--- /dev/null
+++ b/libviennacl/src/backend.cu
@@ -0,0 +1,46 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend)
+{
+  *backend = new ViennaCLBackend_impl();
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id)
+{
+  backend->opencl_backend.context_id = context_id;
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend)
+{
+  delete *backend;
+  *backend = NULL;
+
+  return ViennaCLSuccess;
+}
+
diff --git a/libviennacl/src/blas1.cpp b/libviennacl/src/blas1.cpp
new file mode 100644
index 0000000..047669a
--- /dev/null
+++ b/libviennacl/src/blas1.cpp
@@ -0,0 +1,402 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
+{
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_float = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_double = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 += alpha->value_float * v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 += alpha->value_double * v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      (*alpha)->value_float = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      (*alpha)->value_double = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_float = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_double = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector     x, ViennaCLVector     y,
+                                                      ViennaCLHostScalar c, ViennaCLHostScalar s)
+{
+  if (c->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (s->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_float, s->value_float);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_double, s->value_double);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      v1 *= alpha->value_float;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      v1 *= alpha->value_double;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
diff --git a/libviennacl/src/blas1.cu b/libviennacl/src/blas1.cu
new file mode 100644
index 0000000..047669a
--- /dev/null
+++ b/libviennacl/src/blas1.cu
@@ -0,0 +1,402 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
+{
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_float = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_double = viennacl::linalg::norm_1(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 += alpha->value_float * v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 += alpha->value_double * v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      v2 = v1;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      (*alpha)->value_float = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      (*alpha)->value_double = viennacl::linalg::inner_prod(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+  if ((*alpha)->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_float = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      (*alpha)->value_double = viennacl::linalg::norm_2(v1);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector     x, ViennaCLVector     y,
+                                                      ViennaCLHostScalar c, ViennaCLHostScalar s)
+{
+  if (c->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (s->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_float, s->value_float);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::linalg::plane_rotation(v1, v2, c->value_double, s->value_double);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x)
+{
+  if (alpha->precision != x->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      v1 *= alpha->value_float;
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      v1 *= alpha->value_double;
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y)
+{
+  if (x->precision != y->precision)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      viennacl::swap(v1, v2);
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
diff --git a/libviennacl/src/blas1_cuda.cu b/libviennacl/src/blas1_cuda.cu
new file mode 100644
index 0000000..e6dddbb
--- /dev/null
+++ b/libviennacl/src/blas1_cuda.cu
@@ -0,0 +1,264 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+
+// IxAMAX
+
+// Writes the index of the element of x with the largest absolute value.
+// NOTE(review): reference BLAS IxAMAX returns a 1-based index, while
+// index_norm_inf() appears to yield a 0-based position -- confirm which
+// convention this API promises to callers.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  // x is a raw CUDA device pointer; wrap it as a strided view (no copy).
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+// Sum of absolute values (1-norm) of x, written to *alpha.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xAXPY
+
+// y <- alpha * x + y on CUDA device memory.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+// y <- x (element-wise copy through strided views).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+// *alpha <- inner product of x and y.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// xNRM2
+
+// *alpha <- Euclidean (2-)norm of x.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xROT
+
+// Applies the plane (Givens) rotation with cosine c and sine s to (x, y).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float c, float s)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double c, double s)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+// x <- alpha * x in place.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+
+// xSWAP
+
+// Exchanges the contents of x and y element-wise.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+#endif
+
+
diff --git a/libviennacl/src/blas1_host.cpp b/libviennacl/src/blas1_host.cpp
new file mode 100644
index 0000000..7de2a7d
--- /dev/null
+++ b/libviennacl/src/blas1_host.cpp
@@ -0,0 +1,257 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+// IxAMAX
+
+// Writes the index of the element of x with the largest absolute value.
+// NOTE(review): reference BLAS IxAMAX returns a 1-based index, while
+// index_norm_inf() appears to yield a 0-based position -- confirm which
+// convention this API promises to callers.
+// NOTE(review): these host routines take `int incx`, whereas the CUDA
+// counterparts take `ViennaCLInt incx` -- presumably the same underlying
+// type, but verify against the declarations in viennacl.hpp before
+// unifying; changing it here would alter the mangled symbol.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             float *x, ViennaCLInt offx, int incx)
+{
+  // x is a raw host pointer; wrap it as a strided view (no copy).
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+// Sum of absolute values (1-norm) of x, written to *alpha.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+// y <- alpha * x + y on host memory.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+// y <- x (element-wise copy through strided views).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+// *alpha <- inner product of x and y.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// xNRM2
+
+// *alpha <- Euclidean (2-)norm of x.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+// Applies the plane (Givens) rotation with cosine c and sine s to (x, y).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy,
+                                                           float c, float s)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy,
+                                                           double c, double s)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+// x <- alpha * x in place.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+// Exchanges the contents of x and y element-wise.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
diff --git a/libviennacl/src/blas1_host.cu b/libviennacl/src/blas1_host.cu
new file mode 100644
index 0000000..7de2a7d
--- /dev/null
+++ b/libviennacl/src/blas1_host.cu
@@ -0,0 +1,257 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+// IxAMAX
+
+// Writes the index of the element of x with the largest absolute value.
+// NOTE(review): this .cu file is a byte-for-byte duplicate of
+// blas1_host.cpp (same blob); keep the two in sync.
+// NOTE(review): reference BLAS IxAMAX returns a 1-based index, while
+// index_norm_inf() appears to yield a 0-based position -- confirm which
+// convention this API promises to callers.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             float *x, ViennaCLInt offx, int incx)
+{
+  // x is a raw host pointer; wrap it as a strided view (no copy).
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                             ViennaCLInt *index,
+                                                             double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+// Sum of absolute values (1-norm) of x, written to *alpha.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+// y <- alpha * x + y on host memory.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+// y <- x (element-wise copy through strided views).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+// *alpha <- inner product of x and y.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *alpha,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *alpha,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// xNRM2
+
+// *alpha <- Euclidean (2-)norm of x.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+// Applies the plane (Givens) rotation with cosine c and sine s to (x, y).
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy,
+                                                           float c, float s)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy,
+                                                           double c, double s)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+// x <- alpha * x in place.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float alpha,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double alpha,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+// Exchanges the contents of x and y element-wise.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the routine above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
diff --git a/libviennacl/src/blas1_opencl.cpp b/libviennacl/src/blas1_opencl.cpp
new file mode 100644
index 0000000..52f0897
--- /dev/null
+++ b/libviennacl/src/blas1_opencl.cpp
@@ -0,0 +1,261 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+
+// IxAMAX
+
+// Single-precision IxAMAX: stores in *index the position of the element of
+// largest absolute value within the strided OpenCL sub-vector.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  // Resolve the backend's OpenCL context once, then view the buffer as a ViennaCL vector.
+  viennacl::ocl::context & ctx = viennacl::ocl::get_context(backend->opencl_backend.context_id);
+  viennacl::vector_base<float> vec(x, n, offx, incx, ctx);
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(vec));
+  return ViennaCLSuccess;
+}
+
+// Double-precision IxAMAX: writes to *index the position of the entry with the
+// largest absolute value. NOTE(review): index_norm_inf presumably yields a
+// 0-based index relative to the sub-vector (offx/incx already folded into the
+// proxy) -- confirm against the BLAS 1-based IxAMAX convention callers expect.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+
+// xASUM
+
+// BLAS-style SASUM: *alpha = sum of |x_i| (1-norm) over the strided sub-vector.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  // Wrap the caller's OpenCL buffer as a ViennaCL vector proxy in the backend's context.
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLSasum.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+// BLAS-style SAXPY on OpenCL buffers: y := alpha * x + y over strided sub-vectors.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  // Look the context up once and reuse it for both vector proxies.
+  viennacl::ocl::context & ctx = viennacl::ocl::get_context(backend->opencl_backend.context_id);
+  viennacl::vector_base<float> vx(x, n, offx, incx, ctx);
+  viennacl::vector_base<float> vy(y, n, offy, incy, ctx);
+  vy += alpha * vx;
+  return ViennaCLSuccess;
+}
+
+// BLAS-style DAXPY: y := alpha * x + y over strided OpenCL sub-vectors.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  // v2 aliases the caller's y buffer, so the update is written back in place.
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+// BLAS-style SCOPY: y := x, elementwise over the strided sub-vectors.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  // Assignment between proxies copies the data into the buffer behind y.
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLScopy.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+// BLAS-style SDOT: *alpha = inner product <x, y> of the strided sub-vectors.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             float *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id))
;
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLSdot.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             double *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+
+// xNRM2
+
+// BLAS-style SNRM2: *alpha = Euclidean (2-)norm of the strided sub-vector.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLSnrm2.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+// BLAS-style SROT: applies the plane (Givens) rotation defined by c and s to
+// the vector pair (x, y) in place.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             float c, float s)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLSrot.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double c, double s)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+// BLAS-style SSCAL on an OpenCL buffer: scales the strided sub-vector by alpha in place.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLSscal.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+// BLAS-style SSWAP on OpenCL buffers: exchanges the contents of the two strided sub-vectors.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of ViennaCLOpenCLSswap.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+#endif
diff --git a/libviennacl/src/blas1_opencl.cu b/libviennacl/src/blas1_opencl.cu
new file mode 100644
index 0000000..52f0897
--- /dev/null
+++ b/libviennacl/src/blas1_opencl.cu
@@ -0,0 +1,261 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+
+// IxAMAX
+
+// NOTE(review): this blas1_opencl.cu translation unit appears to be a
+// byte-for-byte copy of blas1_opencl.cpp (same git blob hash in the diff
+// header), presumably so nvcc can compile the same wrappers in CUDA builds.
+// The two files must be kept in sync manually.
+// Single-precision IxAMAX: position of the entry with largest absolute value.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+// Double-precision variant of the above.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+                                                               ViennaCLInt *index,
+                                                               cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+  return ViennaCLSuccess;
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_1(v1);
+  return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 += alpha * v1;
+  return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v2 = v1;
+  return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             float *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             double *alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::inner_prod(v1, v2);
+  return ViennaCLSuccess;
+}
+
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double *alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  *alpha = viennacl::linalg::norm_2(v1);
+  return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             float c, float s)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             double c, double s)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::linalg::plane_rotation(v1, v2, c, s);
+  return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              float alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+                                                              double alpha,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  v1 *= alpha;
+  return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<float> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  viennacl::vector_base<double> v1(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+  viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+  viennacl::swap(v1, v2);
+  return ViennaCLSuccess;
+}
+#endif
diff --git a/libviennacl/src/blas2.cpp b/libviennacl/src/blas2.cpp
new file mode 100644
index 0000000..7175484
--- /dev/null
+++ b/libviennacl/src/blas2.cpp
@@ -0,0 +1,309 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMV
+
+// ---------------------------------------------------------------------------
+// Helper for GEMV: applies v2 := alpha * op(mat) * v1 + beta * v2 for an
+// already-constructed ViennaCL matrix proxy, dispatching on A->trans.
+// Shared by both precisions and both storage orders (was duplicated 4x).
+template <typename NumericT, typename MatrixT>
+static void viennacl_gemv_dispatch(NumericT alpha, MatrixT & mat, ViennaCLMatrix A,
+                                   viennacl::vector_base<NumericT> & v1,
+                                   NumericT beta,
+                                   viennacl::vector_base<NumericT> & v2)
+{
+  v2 *= beta;
+  if (A->trans == ViennaCLTrans)
+    v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+  else
+    v2 += alpha * viennacl::linalg::prod(mat, v1);
+}
+
+// GEMV: y := alpha * op(A) * x + beta * y, where op is identity or transpose
+// depending on A->trans. The numeric precision is taken from x; A, x and y
+// are expected to share precision and backend. Returns ViennaCLGenericFailure
+// for null handles, failed handle initialization, or an unknown precision.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y)
+{
+  // Reject null handles up front instead of dereferencing them below.
+  if (!alpha || !A || !x || !beta || !y)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat(A_handle,
+                                         A->size1, A->start1, A->stride1, A->internal_size1,
+                                         A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_gemv_dispatch(alpha->value_float, mat, A, v1, beta->value_float, v2);
+      }
+      else
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat(A_handle,
+                                                                 A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                 A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_gemv_dispatch(alpha->value_float, mat, A, v1, beta->value_float, v2);
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_gemv_dispatch(alpha->value_double, mat, A, v1, beta->value_double, v2);
+      }
+      else
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_gemv_dispatch(alpha->value_double, mat, A, v1, beta->value_double, v2);
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSV
+
+// ---------------------------------------------------------------------------
+// Helper for TRSV: performs the in-place triangular solve on an
+// already-constructed matrix proxy, dispatching on A->trans and uplo.
+// Shared by both precisions and both storage orders (was duplicated 4x).
+template <typename MatrixT, typename VectorT>
+static void viennacl_trsv_dispatch(MatrixT & mat, VectorT & v, ViennaCLMatrix A, ViennaCLUplo uplo)
+{
+  if (A->trans == ViennaCLTrans)
+  {
+    if (uplo == ViennaCLUpper)
+      viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+    else
+      viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+  }
+  else
+  {
+    if (uplo == ViennaCLUpper)
+      viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+    else
+      viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+  }
+}
+
+// TRSV: solves op(A) * x = b in place -- x enters holding b and leaves holding
+// the solution. uplo selects which triangle of A carries the data.
+// NOTE(review): only non-unit-diagonal solves are exposed here (upper_tag /
+// lower_tag); BLAS TRSV also has a unit-diagonal mode -- confirm intended.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
+{
+  // Reject null handles before any dereference.
+  if (!A || !x)
+    return ViennaCLGenericFailure;
+
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat(A_handle,
+                                         A->size1, A->start1, A->stride1, A->internal_size1,
+                                         A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_trsv_dispatch(mat, v1, A, uplo);
+      }
+      else
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat(A_handle,
+                                                                 A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                 A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_trsv_dispatch(mat, v1, A, uplo);
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_trsv_dispatch(mat, v1, A, uplo);
+      }
+      else
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl_trsv_dispatch(mat, v1, A, uplo);
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat(A_handle,
+                                         A->size1, A->start1, A->stride1, A->internal_size1,
+                                         A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+      }
+      else
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat(A_handle,
+                                                                 A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                 A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+      }
+      else
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+      }
+
+      return ViennaCLSuccess;
+    }
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
diff --git a/libviennacl/src/blas2.cu b/libviennacl/src/blas2.cu
new file mode 100644
index 0000000..7175484
--- /dev/null
+++ b/libviennacl/src/blas2.cu
@@ -0,0 +1,309 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat(A_handle,
+                                         A->size1, A->start1, A->stride1, A->internal_size1,
+                                         A->size2, A->start2, A->stride2, A->internal_size2);
+        v2 *= beta->value_float;
+        if (A->trans == ViennaCLTrans)
+          v2 += alpha->value_float * viennacl::linalg::prod(viennacl::trans(mat), v1);
+        else
+          v2 += alpha->value_float * viennacl::linalg::prod(mat, v1);
+      }
+      else
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat(A_handle,
+                                                                 A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                 A->size2, A->start2, A->stride2, A->internal_size2);
+        v2 *= beta->value_float;
+        if (A->trans == ViennaCLTrans)
+          v2 += alpha->value_float * viennacl::linalg::prod(viennacl::trans(mat), v1);
+        else
+          v2 += alpha->value_float * viennacl::linalg::prod(mat, v1);
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+        v2 *= beta->value_double;
+        if (A->trans == ViennaCLTrans)
+          v2 += alpha->value_double * viennacl::linalg::prod(viennacl::trans(mat), v1);
+        else
+          v2 += alpha->value_double * viennacl::linalg::prod(mat, v1);
+      }
+      else
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+        v2 *= beta->value_double;
+        if (A->trans == ViennaCLTrans)
+          v2 += alpha->value_double * viennacl::linalg::prod(viennacl::trans(mat), v1);
+        else
+          v2 += alpha->value_double * viennacl::linalg::prod(mat, v1);
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat(A_handle,
+                                         A->size1, A->start1, A->stride1, A->internal_size1,
+                                         A->size2, A->start2, A->stride2, A->internal_size2);
+        if (A->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+        }
+        else
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+        }
+      }
+      else
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat(A_handle,
+                                                                 A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                 A->size2, A->start2, A->stride2, A->internal_size2);
+        if (A->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+        }
+        else
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+        }
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+        if (A->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+        }
+        else
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+        }
+      }
+      else
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+        if (A->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+        }
+        else
+        {
+          if (uplo == ViennaCLUpper)
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+          else
+            viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+        }
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A)
+{
+  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle A_handle;
+
+  if (init_vector(v1_handle, x) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_vector(v2_handle, y) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (x->precision)
+  {
+    case ViennaCLFloat:
+    {
+      viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<float> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat(A_handle,
+                                         A->size1, A->start1, A->stride1, A->internal_size1,
+                                         A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+      }
+      else
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat(A_handle,
+                                                                 A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                 A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc);
+      viennacl::vector_base<double> v2(v2_handle, y->size, y->offset, y->inc);
+
+      if (A->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+      }
+      else
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+
+        mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+      }
+
+      return ViennaCLSuccess;
+    }
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
diff --git a/libviennacl/src/blas2_cuda.cu b/libviennacl/src/blas2_cuda.cu
new file mode 100644
index 0000000..7656135
--- /dev/null
+++ b/libviennacl/src/blas2_cuda.cu
@@ -0,0 +1,286 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::CUDA_MEMORY,
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::CUDA_MEMORY,
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAStrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+                                     n, offA_row, incA_row, n,
+                                     n, offA_col, incA_col, lda);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::vector_base<float> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::CUDA_MEMORY,
+                                                             n, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADtrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+                                      n, offA_row, incA_row, n,
+                                      n, offA_col, incA_col, lda);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::vector_base<double> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::CUDA_MEMORY,
+                                                              n, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           float *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::CUDA_MEMORY,
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m,  ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, ViennaCLInt incx,
+                                                           double *y, ViennaCLInt offy, ViennaCLInt incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::CUDA_MEMORY,
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
+#endif
diff --git a/libviennacl/src/blas2_host.cpp b/libviennacl/src/blas2_host.cpp
new file mode 100644
index 0000000..1a8e9b2
--- /dev/null
+++ b/libviennacl/src/blas2_host.cpp
@@ -0,0 +1,283 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  // BLAS xGEMV: y <- alpha * op(A) * x + beta * y with A of size m x n.
+  // By the BLAS convention the operand lengths depend on op(): for
+  // op(A) = A^T, x has length m and y has length n; otherwise x has
+  // length n and y has length m.  (The original code always used n and m,
+  // which is wrong for the transposed case unless m == n.)
+  ViennaCLInt len_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt len_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+    v2 *= beta;   // scale y first, then accumulate alpha * op(A) * x
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  // BLAS xGEMV: y <- alpha * op(A) * x + beta * y with A of size m x n.
+  // By the BLAS convention the operand lengths depend on op(): for
+  // op(A) = A^T, x has length m and y has length n; otherwise x has
+  // length n and y has length m.  (The original code always used n and m,
+  // which is wrong for the transposed case unless m == n.)
+  ViennaCLInt len_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt len_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+    v2 *= beta;   // scale y first, then accumulate alpha * op(A) * x
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  // BLAS xTRSV: solve op(A) * x = b in place, where A is an n x n
+  // triangular matrix (upper or lower according to 'uplo') and b is
+  // supplied in x.  The vector wrapper is identical for both storage
+  // orders, so it is built once up front.
+  viennacl::vector_base<float> xvec(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::matrix_base<float> matA(A, viennacl::MAIN_MEMORY,
+                                      n, offA_row, incA_row, n,
+                                      n, offA_col, incA_col, lda);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::matrix_base<float, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                              n, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  // BLAS xTRSV: solve op(A) * x = b in place, where A is an n x n
+  // triangular matrix (upper or lower according to 'uplo') and b is
+  // supplied in x.  The vector wrapper is identical for both storage
+  // orders, so it is built once up front.
+  viennacl::vector_base<double> xvec(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::matrix_base<double> matA(A, viennacl::MAIN_MEMORY,
+                                       n, offA_row, incA_row, n,
+                                       n, offA_col, incA_col, lda);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::matrix_base<double, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                               n, offA_row, incA_row, lda,
+                                                               n, offA_col, incA_col, n);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda)
+{
+  // BLAS xGER: A (m x n) += alpha * x * y^T, where by the BLAS convention
+  // x has length m and y has length n.  (The original sizes were swapped,
+  // producing an n x m outer product added to an m x n matrix - only
+  // consistent when m == n.)
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda)
+{
+  // BLAS xGER: A (m x n) += alpha * x * y^T, where by the BLAS convention
+  // x has length m and y has length n.  (The original sizes were swapped,
+  // producing an n x m outer product added to an m x n matrix - only
+  // consistent when m == n.)
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
diff --git a/libviennacl/src/blas2_host.cu b/libviennacl/src/blas2_host.cu
new file mode 100644
index 0000000..1a8e9b2
--- /dev/null
+++ b/libviennacl/src/blas2_host.cu
@@ -0,0 +1,283 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, int incx,
+                                                            float beta,
+                                                            float *y, ViennaCLInt offy, int incy)
+{
+  // BLAS xGEMV: y <- alpha * op(A) * x + beta * y with A of size m x n.
+  // By the BLAS convention the operand lengths depend on op(): for
+  // op(A) = A^T, x has length m and y has length n; otherwise x has
+  // length n and y has length m.  (The original code always used n and m,
+  // which is wrong for the transposed case unless m == n.)
+  ViennaCLInt len_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt len_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+    v2 *= beta;   // scale y first, then accumulate alpha * op(A) * x
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, int incx,
+                                                            double beta,
+                                                            double *y, ViennaCLInt offy, int incy)
+{
+  // BLAS xGEMV: y <- alpha * op(A) * x + beta * y with A of size m x n.
+  // By the BLAS convention the operand lengths depend on op(): for
+  // op(A) = A^T, x has length m and y has length n; otherwise x has
+  // length n and y has length m.  (The original code always used n and m,
+  // which is wrong for the transposed case unless m == n.)
+  ViennaCLInt len_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt len_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+    v2 *= beta;   // scale y first, then accumulate alpha * op(A) * x
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, len_x, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, len_y, offy, incy);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            float *x, ViennaCLInt offx, int incx)
+{
+  // BLAS xTRSV: solve op(A) * x = b in place, where A is an n x n
+  // triangular matrix (upper or lower according to 'uplo') and b is
+  // supplied in x.  The vector wrapper is identical for both storage
+  // orders, so it is built once up front.
+  viennacl::vector_base<float> xvec(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::matrix_base<float> matA(A, viennacl::MAIN_MEMORY,
+                                      n, offA_row, incA_row, n,
+                                      n, offA_col, incA_col, lda);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::matrix_base<float, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                              n, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend /*backend*/,
+                                                            ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                            ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda,
+                                                            double *x, ViennaCLInt offx, int incx)
+{
+  // BLAS xTRSV: solve op(A) * x = b in place, where A is an n x n
+  // triangular matrix (upper or lower according to 'uplo') and b is
+  // supplied in x.  The vector wrapper is identical for both storage
+  // orders, so it is built once up front.
+  viennacl::vector_base<double> xvec(x, viennacl::MAIN_MEMORY, n, offx, incx);
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::matrix_base<double> matA(A, viennacl::MAIN_MEMORY,
+                                       n, offA_row, incA_row, n,
+                                       n, offA_col, incA_col, lda);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::matrix_base<double, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                               n, offA_row, incA_row, lda,
+                                                               n, offA_col, incA_col, n);
+    if (uplo == ViennaCLUpper)
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::upper_tag());
+    }
+    else
+    {
+      if (transA == ViennaCLTrans)
+        viennacl::linalg::inplace_solve(viennacl::trans(matA), xvec, viennacl::linalg::lower_tag());
+      else
+        viennacl::linalg::inplace_solve(matA, xvec, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           float alpha,
+                                                           float *x, ViennaCLInt offx, int incx,
+                                                           float *y, ViennaCLInt offy, int incy,
+                                                           float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda)
+{
+  // BLAS xGER: A (m x n) += alpha * x * y^T, where by the BLAS convention
+  // x has length m and y has length n.  (The original sizes were swapped,
+  // producing an n x m outer product added to an m x n matrix - only
+  // consistent when m == n.)
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend /*backend*/,
+                                                           ViennaCLOrder order,
+                                                           ViennaCLInt m, ViennaCLInt n,
+                                                           double alpha,
+                                                           double *x, ViennaCLInt offx, int incx,
+                                                           double *y, ViennaCLInt offy, int incy,
+                                                           double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, int incA_row, int incA_col, ViennaCLInt lda)
+{
+  // BLAS xGER: A (m x n) += alpha * x * y^T, where by the BLAS convention
+  // x has length m and y has length n.  (The original sizes were swapped,
+  // producing an n x m outer product added to an m x n matrix - only
+  // consistent when m == n.)
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, m, offx, incx);
+    viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, n, offy, incy);
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::MAIN_MEMORY,
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
diff --git a/libviennacl/src/blas2_opencl.cpp b/libviennacl/src/blas2_opencl.cpp
new file mode 100644
index 0000000..8f8eafe
--- /dev/null
+++ b/libviennacl/src/blas2_opencl.cpp
@@ -0,0 +1,283 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+// BLAS xGEMV (single precision, OpenCL): y = alpha * op(A) * x + beta * y,
+// where op(A) = A or A^T according to transA and A is m-by-n.
+// Bug fixes: (1) the column-major branch created v1/v2 without the OpenCL
+// context of the caller-selected backend, unlike every other wrapper in this
+// file; (2) when transA is set, x must have m entries and y must have n
+// entries (op(A) is n-by-m), not the fixed n/m used before.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              float beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  // Vector lengths depend on whether A enters the product transposed.
+  ViennaCLInt size_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt size_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+// BLAS xGEMV (double precision, OpenCL): y = alpha * op(A) * x + beta * y,
+// where op(A) = A or A^T according to transA and A is m-by-n.
+// Bug fix: when transA is set, x must have m entries and y must have n
+// entries (op(A) is n-by-m); the previous fixed n/m only worked for square A.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              double beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  // Vector lengths depend on whether A enters the product transposed.
+  ViennaCLInt size_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt size_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+// BLAS xTRSV (single precision, OpenCL backend): solves op(A) * x = b in
+// place, where op(A) = A or A^T (selected by transA) and the n-by-n matrix A
+// is taken as upper or lower triangular (selected by uplo).  The right-hand
+// side arrives in x and is overwritten with the solution.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    // Wrap the caller's cl_mem buffers as ViennaCL proxies in the context of
+    // the selected backend; no data is copied.
+    viennacl::vector_base<float> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Row-major: lda occupies the internal-size slot of the second dimension.
+    viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                     n, offA_row, incA_row, n,
+                                     n, offA_col, incA_col, lda);
+    // Dispatch to the matching solver via ViennaCL's triangular tags.
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::vector_base<float> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Column-major: lda occupies the internal-size slot of the first dimension.
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                             n, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+// BLAS xTRSV (double precision, OpenCL backend): solves op(A) * x = b in
+// place, where op(A) = A or A^T (selected by transA) and the n-by-n matrix A
+// is taken as upper or lower triangular (selected by uplo).  The right-hand
+// side arrives in x and is overwritten with the solution.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    // Wrap the caller's cl_mem buffers as ViennaCL proxies in the context of
+    // the selected backend; no data is copied.
+    viennacl::vector_base<double> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Row-major: lda occupies the internal-size slot of the second dimension.
+    viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                      n, offA_row, incA_row, n,
+                                      n, offA_col, incA_col, lda);
+    // Dispatch to the matching solver via ViennaCL's triangular tags.
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::vector_base<double> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Column-major: lda occupies the internal-size slot of the first dimension.
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                              n, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+// BLAS xGER (single precision, OpenCL): A += alpha * x * y^T.
+// A is m-by-n, hence x must provide m entries and y must provide n entries.
+// Bug fix: the vector lengths were previously swapped (v1 had n, v2 had m
+// entries), which only matched the m-by-n result of outer_prod for square
+// matrices.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             float alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
+// BLAS xGER (double precision, OpenCL): A += alpha * x * y^T.
+// A is m-by-n, hence x must provide m entries and y must provide n entries.
+// Bug fix: the vector lengths were previously swapped (v1 had n, v2 had m
+// entries), which only matched the m-by-n result of outer_prod for square
+// matrices.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             double alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
diff --git a/libviennacl/src/blas2_opencl.cu b/libviennacl/src/blas2_opencl.cu
new file mode 100644
index 0000000..8f8eafe
--- /dev/null
+++ b/libviennacl/src/blas2_opencl.cu
@@ -0,0 +1,283 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+// BLAS xGEMV (single precision, OpenCL): y = alpha * op(A) * x + beta * y,
+// where op(A) = A or A^T according to transA and A is m-by-n.
+// Bug fixes: (1) the column-major branch created v1/v2 without the OpenCL
+// context of the caller-selected backend, unlike every other wrapper in this
+// file; (2) when transA is set, x must have m entries and y must have n
+// entries (op(A) is n-by-m), not the fixed n/m used before.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              float beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  // Vector lengths depend on whether A enters the product transposed.
+  ViennaCLInt size_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt size_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+// BLAS xGEMV (double precision, OpenCL): y = alpha * op(A) * x + beta * y,
+// where op(A) = A or A^T according to transA and A is m-by-n.
+// Bug fix: when transA is set, x must have m entries and y must have n
+// entries (op(A) is n-by-m); the previous fixed n/m only worked for square A.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+                                                              ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                              double beta,
+                                                              cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+  // Vector lengths depend on whether A enters the product transposed.
+  ViennaCLInt size_x = (transA == ViennaCLTrans) ? m : n;
+  ViennaCLInt size_y = (transA == ViennaCLTrans) ? n : m;
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, size_x, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, size_y, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    v2 *= beta;
+    if (transA == ViennaCLTrans)
+      v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+    else
+      v2 += alpha * viennacl::linalg::prod(mat, v1);
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+// BLAS xTRSV (single precision, OpenCL backend): solves op(A) * x = b in
+// place, where op(A) = A or A^T (selected by transA) and the n-by-n matrix A
+// is taken as upper or lower triangular (selected by uplo).  The right-hand
+// side arrives in x and is overwritten with the solution.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    // Wrap the caller's cl_mem buffers as ViennaCL proxies in the context of
+    // the selected backend; no data is copied.
+    viennacl::vector_base<float> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Row-major: lda occupies the internal-size slot of the second dimension.
+    viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                     n, offA_row, incA_row, n,
+                                     n, offA_col, incA_col, lda);
+    // Dispatch to the matching solver via ViennaCL's triangular tags.
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::vector_base<float> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Column-major: lda occupies the internal-size slot of the first dimension.
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                             n, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+// BLAS xTRSV (double precision, OpenCL backend): solves op(A) * x = b in
+// place, where op(A) = A or A^T (selected by transA) and the n-by-n matrix A
+// is taken as upper or lower triangular (selected by uplo).  The right-hand
+// side arrives in x and is overwritten with the solution.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+                                                              ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA,
+                                                              ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    // Wrap the caller's cl_mem buffers as ViennaCL proxies in the context of
+    // the selected backend; no data is copied.
+    viennacl::vector_base<double> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Row-major: lda occupies the internal-size slot of the second dimension.
+    viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                      n, offA_row, incA_row, n,
+                                      n, offA_col, incA_col, lda);
+    // Dispatch to the matching solver via ViennaCL's triangular tags.
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+  else
+  {
+    viennacl::vector_base<double> v(x, n, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    // Column-major: lda occupies the internal-size slot of the first dimension.
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                              n, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+    if (transA == ViennaCLTrans)
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+    }
+    else
+    {
+      if (uplo == ViennaCLUpper)
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+      else
+        viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+    }
+  }
+
+  return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+// BLAS xGER (single precision, OpenCL): A += alpha * x * y^T.
+// A is m-by-n, hence x must provide m entries and y must provide n entries.
+// Bug fix: the vector lengths were previously swapped (v1 had n, v2 had m
+// entries), which only matched the m-by-n result of outer_prod for square
+// matrices.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             float alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<float> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                     m, offA_row, incA_row, m,
+                                     n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<float> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<float> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<float, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                             m, offA_row, incA_row, lda,
+                                                             n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
+// BLAS xGER (double precision, OpenCL): A += alpha * x * y^T.
+// A is m-by-n, hence x must provide m entries and y must provide n entries.
+// Bug fix: the vector lengths were previously swapped (v1 had n, v2 had m
+// entries), which only matched the m-by-n result of outer_prod for square
+// matrices.
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+                                                             ViennaCLOrder order,
+                                                             ViennaCLInt m, ViennaCLInt n,
+                                                             double alpha,
+                                                             cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+                                                             cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+                                                             cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+  if (order == ViennaCLRowMajor)
+  {
+    viennacl::vector_base<double> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                      m, offA_row, incA_row, m,
+                                      n, offA_col, incA_col, lda);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+  else
+  {
+    viennacl::vector_base<double> v1(x, m, offx, incx, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::vector_base<double> v2(y, n, offy, incy, viennacl::ocl::get_context(backend->opencl_backend.context_id));
+    viennacl::matrix_base<double, viennacl::column_major> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                              m, offA_row, incA_row, lda,
+                                                              n, offA_col, incA_col, n);
+
+    mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+  }
+
+  return ViennaCLSuccess;
+}
+
diff --git a/libviennacl/src/blas3.cpp b/libviennacl/src/blas3.cpp
new file mode 100644
index 0000000..a038cba
--- /dev/null
+++ b/libviennacl/src/blas3.cpp
@@ -0,0 +1,970 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
+{ // C = alpha * op(A) * op(B) + beta * C, dispatched over precision, storage order and transposition
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+  viennacl::backend::mem_handle C_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(C_handle, C) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor) // 'else' is required: a plain 'if' starts a second chain whose trailing 'else' rejects every row-major-A case handled above
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor) // 'else' is required: a plain 'if' starts a second chain whose trailing 'else' rejects every row-major-A case handled above
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+
+      return ViennaCLSuccess;
+    }
+    case ViennaCLDouble:
+    {
+      if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+        {
+          if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+          else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+          else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+            viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+          else
+            return ViennaCLGenericFailure;
+        }
+      }
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return  ViennaCLGenericFailure;
+  }
+}
+
+
+
diff --git a/libviennacl/src/blas3.cu b/libviennacl/src/blas3.cu
new file mode 100644
index 0000000..a038cba
--- /dev/null
+++ b/libviennacl/src/blas3.cu
@@ -0,0 +1,970 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMM: C = alpha * op(A) * op(B) + beta * C, dispatched over precision, storage order, and transpose flags
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+  viennacl::backend::mem_handle C_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(C_handle, C) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+    {
+      if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor) // fixed: was a plain "if", so all row-major-A inputs fell through to the trailing "else" below and returned failure after the product was computed
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<float, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<float, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    case ViennaCLDouble:
+    {
+      if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double> mat_A(A_handle,
+                                           A->size1, A->start1, A->stride1, A->internal_size1,
+                                           A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLRowMajor) // fixed: was a plain "if" (same defect as the float path) — row-major-A inputs would hit the trailing "else" and falsely report failure
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double> mat_B(B_handle,
+                                           B->size1, B->start1, B->stride1, B->internal_size1,
+                                           B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLRowMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double> mat_C(C_handle,
+                                           C->size1, C->start1, C->stride1, C->internal_size1,
+                                           C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor && C->order == ViennaCLColumnMajor)
+      {
+        viennacl::matrix_base<double, viennacl::column_major> mat_A(A_handle,
+                                                                   A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                   A->size2, A->start2, A->stride2, A->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_B(B_handle,
+                                                                   B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                   B->size2, B->start2, B->stride2, B->internal_size2);
+        viennacl::matrix_base<double, viennacl::column_major> mat_C(C_handle,
+                                                                   C->size1, C->start1, C->stride1, C->internal_size1,
+                                                                   C->size2, C->start2, C->stride2, C->internal_size2);
+
+        if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+          viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+        else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+          viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+        else
+          return ViennaCLGenericFailure;
+      }
+      else
+        return ViennaCLGenericFailure;
+
+      return ViennaCLSuccess;
+    }
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+// xTRSV
+
+// --- internal helpers for ViennaCLtrsm ------------------------------------
+
+/** @brief Runs the in-place triangular solve for fully prepared operands,
+*          mapping (uplo, diag) onto the matching ViennaCL solver tag.
+*
+* @tparam AnyMatA  Either a viennacl::matrix_base or a transposed proxy of one.
+* @tparam MatB     Either a viennacl::matrix_base or a transposed proxy; taken
+*                  as an lvalue reference so both inplace_solve overload
+*                  families bind.
+* @return ViennaCLSuccess, or ViennaCLGenericFailure for an invalid uplo/diag.
+*/
+template <typename AnyMatA, typename MatB>
+static ViennaCLStatus ViennaCLtrsm_solve(AnyMatA const & mat_A, MatB & mat_B, ViennaCLUplo uplo, ViennaCLDiag diag)
+{
+  if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+    viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+  else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+    viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+  else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+    viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+  else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+    viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+/** @brief Applies the requested transposition to each operand and forwards to
+*          the solver-tag dispatcher.
+*
+* Bug fix versus the previous implementation: A is transposed only when
+* transA == ViennaCLTrans. The old code also transposed A in the
+* (ViennaCLNoTrans A, ViennaCLTrans B) branch, solving with A^T instead of A.
+*/
+template <typename MatA, typename MatB>
+static ViennaCLStatus ViennaCLtrsm_solve_trans(MatA & mat_A, ViennaCLTranspose transA,
+                                               MatB & mat_B, ViennaCLTranspose transB,
+                                               ViennaCLUplo uplo, ViennaCLDiag diag)
+{
+  // Named proxy type so a transposed B can be bound as an lvalue.
+  typedef viennacl::matrix_expression<const MatB, const MatB, viennacl::op_trans>   TransposedB;
+
+  if (transA == ViennaCLTrans && transB == ViennaCLTrans)
+  {
+    TransposedB proxy_B = viennacl::trans(mat_B);
+    return ViennaCLtrsm_solve(viennacl::trans(mat_A), proxy_B, uplo, diag);
+  }
+  else if (transA == ViennaCLTrans && transB == ViennaCLNoTrans)
+    return ViennaCLtrsm_solve(viennacl::trans(mat_A), mat_B, uplo, diag);
+  else if (transA == ViennaCLNoTrans && transB == ViennaCLTrans)
+  {
+    TransposedB proxy_B = viennacl::trans(mat_B);
+    return ViennaCLtrsm_solve(mat_A, proxy_B, uplo, diag);  // A deliberately NOT transposed here
+  }
+  else if (transA == ViennaCLNoTrans && transB == ViennaCLNoTrans)
+    return ViennaCLtrsm_solve(mat_A, mat_B, uplo, diag);
+
+  return ViennaCLGenericFailure;
+}
+
+/** @brief Wraps the raw memory handles in ViennaCL matrices of the requested
+*          storage layout and forwards to the transposition dispatcher.
+*
+* @tparam NumericT  float or double, selected from A->precision by the caller.
+*/
+template <typename NumericT>
+static ViennaCLStatus ViennaCLtrsm_impl(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B,
+                                        viennacl::backend::mem_handle & A_handle,
+                                        viennacl::backend::mem_handle & B_handle)
+{
+  if (A->order == ViennaCLRowMajor && B->order == ViennaCLRowMajor)
+  {
+    viennacl::matrix_base<NumericT> mat_A(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+    viennacl::matrix_base<NumericT> mat_B(B_handle,
+                                          B->size1, B->start1, B->stride1, B->internal_size1,
+                                          B->size2, B->start2, B->stride2, B->internal_size2);
+    return ViennaCLtrsm_solve_trans(mat_A, A->trans, mat_B, B->trans, uplo, diag);
+  }
+  else if (A->order == ViennaCLRowMajor && B->order == ViennaCLColumnMajor)
+  {
+    viennacl::matrix_base<NumericT> mat_A(A_handle,
+                                          A->size1, A->start1, A->stride1, A->internal_size1,
+                                          A->size2, A->start2, A->stride2, A->internal_size2);
+    viennacl::matrix_base<NumericT, viennacl::column_major> mat_B(B_handle,
+                                                                  B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                  B->size2, B->start2, B->stride2, B->internal_size2);
+    return ViennaCLtrsm_solve_trans(mat_A, A->trans, mat_B, B->trans, uplo, diag);
+  }
+  else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLRowMajor)
+  {
+    viennacl::matrix_base<NumericT, viennacl::column_major> mat_A(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+    viennacl::matrix_base<NumericT> mat_B(B_handle,
+                                          B->size1, B->start1, B->stride1, B->internal_size1,
+                                          B->size2, B->start2, B->stride2, B->internal_size2);
+    return ViennaCLtrsm_solve_trans(mat_A, A->trans, mat_B, B->trans, uplo, diag);
+  }
+  else if (A->order == ViennaCLColumnMajor && B->order == ViennaCLColumnMajor)
+  {
+    viennacl::matrix_base<NumericT, viennacl::column_major> mat_A(A_handle,
+                                                                  A->size1, A->start1, A->stride1, A->internal_size1,
+                                                                  A->size2, A->start2, A->stride2, A->internal_size2);
+    viennacl::matrix_base<NumericT, viennacl::column_major> mat_B(B_handle,
+                                                                  B->size1, B->start1, B->stride1, B->internal_size1,
+                                                                  B->size2, B->start2, B->stride2, B->internal_size2);
+    return ViennaCLtrsm_solve_trans(mat_A, A->trans, mat_B, B->trans, uplo, diag);
+  }
+
+  return ViennaCLGenericFailure;  // unreachable for valid ViennaCLOrder values
+}
+
+/** @brief xTRSM: solves the triangular system op(A) * X = alpha * B in place;
+*          B is overwritten with the solution X.
+*
+* @param A     Triangular system matrix (precision, layout and transposition
+*              flags are read from the descriptor).
+* @param uplo  Whether the upper or lower triangle of A is referenced.
+* @param diag  Whether A has an implicit unit diagonal.
+* @param B     Right-hand-side matrix; receives the solution.
+* @return ViennaCLSuccess, or ViennaCLGenericFailure on invalid descriptors,
+*         unsupported precision, or invalid enum combinations.
+*/
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
+{
+  viennacl::backend::mem_handle A_handle;
+  viennacl::backend::mem_handle B_handle;
+
+  if (init_matrix(A_handle, A) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  if (init_matrix(B_handle, B) != ViennaCLSuccess)
+    return ViennaCLGenericFailure;
+
+  switch (A->precision)
+  {
+    case ViennaCLFloat:
+      return ViennaCLtrsm_impl<float>(A, uplo, diag, B, A_handle, B_handle);
+
+    case ViennaCLDouble:
+      return ViennaCLtrsm_impl<double>(A, uplo, diag, B, A_handle, B_handle);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
diff --git a/libviennacl/src/blas3.hpp b/libviennacl/src/blas3.hpp
new file mode 100644
index 0000000..cfcc034
--- /dev/null
+++ b/libviennacl/src/blas3.hpp
@@ -0,0 +1,60 @@
+#ifndef VIENNACL_SRC_BLAS3_HPP
+#define VIENNACL_SRC_BLAS3_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+namespace detail
+{
+  template <typename ScalarType, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+  void gemm_dispatch(ScalarType alpha,
+                     MatrixTypeA const & A, ViennaCLTranspose transA,
+                     MatrixTypeB const & B, ViennaCLTranspose transB,
+                     ScalarType beta,
+                     MatrixTypeC & C)
+  {
+
+    if (transA == ViennaCLTrans && transB == ViennaCLTrans)
+      viennacl::linalg::prod_impl(viennacl::trans(A), viennacl::trans(B), C, alpha, beta);
+    else if (transA == ViennaCLTrans && transB == ViennaCLNoTrans)
+      viennacl::linalg::prod_impl(viennacl::trans(A), B, C, alpha, beta);
+    else if (transA == ViennaCLNoTrans && transB == ViennaCLTrans)
+      viennacl::linalg::prod_impl(A, viennacl::trans(B), C, alpha, beta);
+    else if (transA == ViennaCLNoTrans && transB == ViennaCLNoTrans)
+      viennacl::linalg::prod_impl(A, B, C, alpha, beta);
+    //else
+    //  return ViennaCLGenericFailure;
+  }
+}
+
+
+#endif
diff --git a/libviennacl/src/blas3_cuda.cu b/libviennacl/src/blas3_cuda.cu
new file mode 100644
index 0000000..d4a4ebf
--- /dev/null
+++ b/libviennacl/src/blas3_cuda.cu
@@ -0,0 +1,249 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+
+
+//
+// xGEMV
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLCUDAgemm_impl(ViennaCLBackend /*backend*/,
+                                       ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                       ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                       ViennaCLOrder orderC,
+                                       ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                       NumericT alpha,
+                                       NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                       NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                       NumericT beta,
+                                       NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+    ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+    ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+    ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+    /////// A row-major
+
+    if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::CUDA_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::CUDA_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::CUDA_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::CUDA_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::CUDA_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::CUDA_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::CUDA_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::CUDA_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::CUDA_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::CUDA_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::CUDA_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::CUDA_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    /////// A column-major
+
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::CUDA_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::CUDA_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::CUDA_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::CUDA_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::CUDA_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::CUDA_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::CUDA_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::CUDA_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::CUDA_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::CUDA_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::CUDA_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::CUDA_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLCUDAgemm_impl<float>(backend,
+                                              orderA, transA,
+                                              orderB, transB,
+                                              orderC,
+                                              m, n, k,
+                                              alpha,
+                                              A, offA_row, offA_col, incA_row, incA_col, lda,
+                                              B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                              beta,
+                                              C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLCUDAgemm_impl<double>(backend,
+                                               orderA, transA,
+                                               orderB, transB,
+                                               orderC,
+                                               m, n, k,
+                                               alpha,
+                                               A, offA_row, offA_col, incA_row, incA_col, lda,
+                                               B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                               beta,
+                                               C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
diff --git a/libviennacl/src/blas3_host.cpp b/libviennacl/src/blas3_host.cpp
new file mode 100644
index 0000000..742e61c
--- /dev/null
+++ b/libviennacl/src/blas3_host.cpp
@@ -0,0 +1,243 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+//
+// xGEMV
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLHostgemm_impl(ViennaCLBackend /*backend*/,
+                                       ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                       ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                       ViennaCLOrder orderC,
+                                       ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                       NumericT alpha,
+                                       NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                       NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                       NumericT beta,
+                                       NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+    ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+    ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+    ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+    /////// A row-major
+    if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    /////// A column-major
+
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                        m, offC_row, incC_row, m,
+                                        n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<float>(backend,
+                                              orderA, transA,
+                                              orderB, transB,
+                                              orderC,
+                                              m, n, k,
+                                              alpha,
+                                              A, offA_row, offA_col, incA_row, incA_col, lda,
+                                              B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                              beta,
+                                              C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<double>(backend,
+                                               orderA, transA,
+                                               orderB, transB,
+                                               orderC,
+                                               m, n, k,
+                                               alpha,
+                                               A, offA_row, offA_col, incA_row, incA_col, lda,
+                                               B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                               beta,
+                                               C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
diff --git a/libviennacl/src/blas3_host.cu b/libviennacl/src/blas3_host.cu
new file mode 100644
index 0000000..742e61c
--- /dev/null
+++ b/libviennacl/src/blas3_host.cu
@@ -0,0 +1,243 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLHostgemm_impl(ViennaCLBackend /*backend*/,
+                                       ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                       ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                       ViennaCLOrder orderC,
+                                       ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                       NumericT alpha,
+                                       NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                       NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                       NumericT beta,
+                                       NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+    ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+    ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+    ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+    /////// A row-major
+    if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    /////// A column-major
+
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+                                        m, offC_row, incC_row, m,
+                                        n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::MAIN_MEMORY,
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::MAIN_MEMORY,
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::MAIN_MEMORY,
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            float alpha,
+                                                            float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            float beta,
+                                                            float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<float>(backend,
+                                              orderA, transA,
+                                              orderB, transB,
+                                              orderC,
+                                              m, n, k,
+                                              alpha,
+                                              A, offA_row, offA_col, incA_row, incA_col, lda,
+                                              B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                              beta,
+                                              C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+                                                            ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                            ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                            ViennaCLOrder orderC,
+                                                            ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                            double alpha,
+                                                            double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                            double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                            double beta,
+                                                            double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLHostgemm_impl<double>(backend,
+                                               orderA, transA,
+                                               orderB, transB,
+                                               orderC,
+                                               m, n, k,
+                                               alpha,
+                                               A, offA_row, offA_col, incA_row, incA_col, lda,
+                                               B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                               beta,
+                                               C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
diff --git a/libviennacl/src/blas3_opencl.cpp b/libviennacl/src/blas3_opencl.cpp
new file mode 100644
index 0000000..d11dfed
--- /dev/null
+++ b/libviennacl/src/blas3_opencl.cpp
@@ -0,0 +1,249 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_OPENCL
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLOpenCLgemm_impl(ViennaCLBackend backend,
+                                         ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                         ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                         ViennaCLOrder orderC,
+                                         ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                         NumericT alpha,
+                                         cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                         cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                         NumericT beta,
+                                         cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+    ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+    ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+    ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+    /////// A row-major
+
+    if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    /////// A column-major
+
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              float alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              float beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<float>(backend,
+                                                orderA, transA,
+                                                orderB, transB,
+                                                orderC,
+                                                m, n, k,
+                                                alpha,
+                                                A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                beta,
+                                                C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              double alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              double beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<double>(backend,
+                                                 orderA, transA,
+                                                 orderB, transB,
+                                                 orderC,
+                                                 m, n, k,
+                                                 alpha,
+                                                 A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                 B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                 beta,
+                                                 C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
diff --git a/libviennacl/src/blas3_opencl.cu b/libviennacl/src/blas3_opencl.cu
new file mode 100644
index 0000000..d11dfed
--- /dev/null
+++ b/libviennacl/src/blas3_opencl.cu
@@ -0,0 +1,249 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_OPENCL
+
+
+
+//
+// xGEMV
+//
+
+namespace detail
+{
+  template <typename NumericT>
+  ViennaCLStatus ViennaCLOpenCLgemm_impl(ViennaCLBackend backend,
+                                         ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                         ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                         ViennaCLOrder orderC,
+                                         ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                         NumericT alpha,
+                                         cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                         cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                         NumericT beta,
+                                         cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+  {
+    ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+    ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+    ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+    ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+    /////// A row-major
+
+    if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLRowMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           A_size1, offA_row, incA_row, m,
+                                           A_size2, offA_col, incA_col, lda);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    /////// A column-major
+
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLRowMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           B_size1, offB_row, incB_row, k,
+                                           B_size2, offB_col, incB_col, ldb);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLRowMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                           m, offC_row, incC_row, m,
+                                           n, offC_col, incC_col, ldc);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+    else if (orderA == ViennaCLColumnMajor && orderB == ViennaCLColumnMajor && orderC == ViennaCLColumnMajor)
+    {
+      viennacl::matrix_base<NumericT, viennacl::column_major> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   A_size1, offA_row, incA_row, lda,
+                                                                   A_size2, offA_col, incA_col, k);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   B_size1, offB_row, incB_row, ldb,
+                                                                   B_size2, offB_col, incB_col, n);
+
+      viennacl::matrix_base<NumericT, viennacl::column_major> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+                                                                   m, offC_row, incC_row, ldc,
+                                                                   n, offC_col, incC_col, n);
+
+      detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+    }
+
+    return ViennaCLSuccess;
+  }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              float alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              float beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<float>(backend,
+                                                orderA, transA,
+                                                orderB, transB,
+                                                orderC,
+                                                m, n, k,
+                                                alpha,
+                                                A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                beta,
+                                                C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+                                                              ViennaCLOrder orderA, ViennaCLTranspose transA,
+                                                              ViennaCLOrder orderB, ViennaCLTranspose transB,
+                                                              ViennaCLOrder orderC,
+                                                              ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+                                                              double alpha,
+                                                              cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+                                                              cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+                                                              double beta,
+                                                              cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+  return detail::ViennaCLOpenCLgemm_impl<double>(backend,
+                                                 orderA, transA,
+                                                 orderB, transB,
+                                                 orderC,
+                                                 m, n, k,
+                                                 alpha,
+                                                 A, offA_row, offA_col, incA_row, incA_col, lda,
+                                                 B, offB_row, offB_col, incB_row, incB_col, ldb,
+                                                 beta,
+                                                 C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
diff --git a/libviennacl/src/init_matrix.hpp b/libviennacl/src/init_matrix.hpp
new file mode 100644
index 0000000..a3add8e
--- /dev/null
+++ b/libviennacl/src/init_matrix.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_CUDA
+  h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+  h.cuda_handle().reset(A->cuda_mem);
+  h.cuda_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(A->internal_size1 * A->internal_size2 * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(A->internal_size1 * A->internal_size2 * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)A;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_OPENCL
+  h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+  h.opencl_handle() = A->opencl_mem;
+  h.opencl_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(A->internal_size1 * A->internal_size2 * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(A->internal_size1 * A->internal_size2 * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)A;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+  h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+  h.ram_handle().reset(A->host_mem);
+  h.ram_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(A->internal_size1 * A->internal_size2 * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(A->internal_size1 * A->internal_size2 * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+  switch (A->backend->backend_type)
+  {
+    case ViennaCLCUDA:
+      return init_cuda_matrix(h, A);
+
+    case ViennaCLOpenCL:
+      return init_opencl_matrix(h, A);
+
+    case ViennaCLHost:
+      return init_host_matrix(h, A);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
diff --git a/libviennacl/src/init_vector.hpp b/libviennacl/src/init_vector.hpp
new file mode 100644
index 0000000..9fa99a2
--- /dev/null
+++ b/libviennacl/src/init_vector.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_CUDA
+  h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+  h.cuda_handle().reset(x->cuda_mem);
+  h.cuda_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(x->inc * x->size * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(x->inc * x->size * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)x;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_OPENCL
+  h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+  h.opencl_handle() = x->opencl_mem;
+  h.opencl_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(x->inc * x->size * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(x->inc * x->size * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)x;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+  h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+  h.ram_handle().reset(x->host_mem);
+  h.ram_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(x->inc * x->size * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(x->inc * x->size * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+  switch (x->backend->backend_type)
+  {
+    case ViennaCLCUDA:
+      return init_cuda_vector(h, x);
+
+    case ViennaCLOpenCL:
+      return init_opencl_vector(h, x);
+
+    case ViennaCLHost:
+      return init_host_vector(h, x);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
diff --git a/libviennacl/src/viennacl_private.hpp b/libviennacl/src/viennacl_private.hpp
new file mode 100644
index 0000000..c66c848
--- /dev/null
+++ b/libviennacl/src/viennacl_private.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_VIENNACL_PRIVATE_HPP
+#define VIENNACL_VIENNACL_PRIVATE_HPP
+
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+#include "viennacl.hpp"
+
+
+/************* Backend Management ******************/
+
+struct ViennaCLCUDABackend_impl
+{
+    //TODO: Add stream and/or device descriptors here
+};
+
+struct ViennaCLOpenCLBackend_impl
+{
+  ViennaCLInt context_id;
+};
+
+struct ViennaCLHostBackend_impl
+{
+  // Nothing to specify *at the moment*
+};
+
+
+/** @brief Generic backend for CUDA, OpenCL, host-based stuff */
+struct ViennaCLBackend_impl
+{
+  ViennaCLBackendTypes backend_type;
+
+  ViennaCLCUDABackend_impl     cuda_backend;
+  ViennaCLOpenCLBackend_impl   opencl_backend;
+  ViennaCLHostBackend_impl     host_backend;
+};
+
+
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl
+{
+  ViennaCLPrecision  precision;
+
+  union {
+    float  value_float;
+    double value_double;
+  };
+};
+
+struct ViennaCLScalar_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   offset;
+};
+
+struct ViennaCLVector_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   offset;
+  ViennaCLInt   inc;
+  ViennaCLInt   size;
+};
+
+struct ViennaCLMatrix_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+  ViennaCLOrder      order;
+  ViennaCLTranspose  trans;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   size1;
+  ViennaCLInt   start1;
+  ViennaCLInt   stride1;
+  ViennaCLInt   internal_size1;
+
+  ViennaCLInt   size2;
+  ViennaCLInt   start2;
+  ViennaCLInt   stride2;
+  ViennaCLInt   internal_size2;
+};
+
+
+#endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bf8c661..dfc29ab 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,12 +1,115 @@
-foreach(PROG blas3 fft iterators matrix matrix_range scalar sparse structured-matrices vector vector_range)
-   add_executable(${PROG}-test src/${PROG}.cpp)
-   target_link_libraries(${PROG}-test ${OPENCL_LIBRARIES})
-   add_test(${PROG} ${PROG}-test)
+
+include_directories(${Boost_INCLUDE_DIRS})
+
+# tests with CPU backend
+foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_double iterators
+             global_variables
+             matrix_vector matrix_vector_int
+             matrix_row_float matrix_row_double matrix_row_int
+             matrix_col_float matrix_col_double matrix_col_int
+             scalar scheduler_matrix scheduler_matrix_matrix scheduler_matrix_vector scheduler_sparse scheduler_vector sparse
+             vector_float vector_double vector_int vector_uint vector_multi_inner_prod
+             spmdm)
+   add_executable(${PROG}-test-cpu src/${PROG}.cpp)
+   target_link_libraries(${PROG}-test-cpu ${Boost_LIBRARIES})
+   add_test(${PROG}-cpu ${PROG}-test-cpu)
 endforeach(PROG)
 
-include_directories(${PROJECT_SOURCE_DIR}/external)
-add_executable(external_linkage 
-                src/external_1.cpp
-                src/external_2.cpp 
-                ${PROJECT_SOURCE_DIR}/external/pugixml/src/pugixml.cpp)
-target_link_libraries(external_linkage ${OPENCL_LIBRARIES})
+
+# tests with OpenCL backend
+if (ENABLE_OPENCL)
+  foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_double fft iterators
+               generator_blas1 generator_blas2 generator_blas3 #generator_segmentation
+               global_variables
+               matrix_vector matrix_vector_int
+               matrix_row_float matrix_row_double matrix_row_int
+               matrix_col_float matrix_col_double matrix_col_int
+               nmf qr_method
+               scalar sparse structured-matrices svd
+               vector_float vector_double vector_int vector_uint vector_multi_inner_prod
+               spmdm)
+     add_executable(${PROG}-test-opencl src/${PROG}.cpp)
+     target_link_libraries(${PROG}-test-opencl ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+     add_test(${PROG}-opencl ${PROG}-test-opencl)
+     set_target_properties(${PROG}-test-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+  endforeach(PROG)
+
+  include_directories(${PROJECT_SOURCE_DIR}/external)
+  add_executable(external_linkage-opencl
+                 src/external_1.cpp
+                 src/external_2.cpp)
+  target_link_libraries(external_linkage-opencl ${OPENCL_LIBRARIES} ${Boost_LIBRARIES})
+  set_target_properties(external_linkage-opencl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+endif (ENABLE_OPENCL)
+
+# tests with CUDA backend
+if (ENABLE_CUDA)
+  foreach(PROG blas3_prod_float blas3_prod_double blas3_solve_float blas3_solve_double iterators
+               global_variables
+               matrix_vector matrix_vector_int
+               matrix_row_float matrix_row_double matrix_row_int
+               matrix_col_float matrix_col_double matrix_col_int
+               scalar sparse
+               vector_float vector_double vector_int vector_uint vector_multi_inner_prod
+               spmdm)
+     cuda_add_executable(${PROG}-test-cuda src/${PROG}.cu)
+     target_link_libraries(${PROG}-test-cuda ${Boost_LIBRARIES})
+     add_test(${PROG}-cuda ${PROG}-test-cuda)
+  endforeach(PROG)
+
+  include_directories(${PROJECT_SOURCE_DIR}/external)
+  cuda_add_executable(external_linkage-cuda
+                      src/external_1.cu
+                      src/external_2.cu)
+  target_link_libraries(external_linkage-cuda ${Boost_LIBRARIES})
+endif (ENABLE_CUDA)
+
+# test shared library
+include_directories(${PROJECT_SOURCE_DIR}/libviennacl/include/)
+
+if(ENABLE_CUDA)
+  if(ENABLE_OPENCL)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DVIENNACL_WITH_OPENCL") #set flags before setting executable!
+    cuda_add_executable(libviennacl_blas1-test src/libviennacl_blas1.cu)
+    target_link_libraries(libviennacl_blas1-test viennacl ${OPENCL_LIBRARIES})
+
+    cuda_add_executable(libviennacl_blas2-test src/libviennacl_blas2.cu)
+    target_link_libraries(libviennacl_blas2-test viennacl ${OPENCL_LIBRARIES})
+
+    cuda_add_executable(libviennacl_blas3-test src/libviennacl_blas3.cu)
+    target_link_libraries(libviennacl_blas3-test viennacl ${OPENCL_LIBRARIES})
+
+  else(ENABLE_OPENCL)
+    cuda_add_executable(libviennacl_blas1-test src/libviennacl_blas1.cu)
+    target_link_libraries(libviennacl_blas1-test viennacl)
+
+    cuda_add_executable(libviennacl_blas2-test src/libviennacl_blas2.cu)
+    target_link_libraries(libviennacl_blas2-test viennacl)
+
+    cuda_add_executable(libviennacl_blas3-test src/libviennacl_blas3.cu)
+    target_link_libraries(libviennacl_blas3-test viennacl)
+  endif(ENABLE_OPENCL)
+else(ENABLE_CUDA)
+  add_executable(libviennacl_blas1-test src/libviennacl_blas1.cpp)
+  add_executable(libviennacl_blas2-test src/libviennacl_blas2.cpp)
+  add_executable(libviennacl_blas3-test src/libviennacl_blas3.cpp)
+  if(ENABLE_OPENCL)
+    set_target_properties(libviennacl_blas1-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(libviennacl_blas1-test viennacl ${OPENCL_LIBRARIES})
+
+    set_target_properties(libviennacl_blas2-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(libviennacl_blas2-test viennacl ${OPENCL_LIBRARIES})
+
+    set_target_properties(libviennacl_blas3-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(libviennacl_blas3-test viennacl ${OPENCL_LIBRARIES})
+  else(ENABLE_OPENCL)
+    target_link_libraries(libviennacl_blas1-test viennacl)
+    target_link_libraries(libviennacl_blas2-test viennacl)
+    target_link_libraries(libviennacl_blas3-test viennacl)
+  endif(ENABLE_OPENCL)
+endif(ENABLE_CUDA)
+add_test(libviennacl-blas1 libviennacl_blas1-test)
+add_test(libviennacl-blas2 libviennacl_blas2-test)
+add_test(libviennacl-blas3 libviennacl_blas3-test)
+
+
diff --git a/examples/tutorial/Random.hpp b/tests/src/Random.hpp
similarity index 81%
copy from examples/tutorial/Random.hpp
copy to tests/src/Random.hpp
index 93d37ca..5cb57c7 100644
--- a/examples/tutorial/Random.hpp
+++ b/tests/src/Random.hpp
@@ -1,52 +1,53 @@
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#ifndef _RANDOM_HPP_
-#define _RANDOM_HPP_
-
-#include <time.h>
-#include <stdlib.h>
-
-inline void init()
-{
-	static bool init = false;
-	if (!init)
-	{
-		srand( (unsigned int)time(NULL) );
-		init = true;
-	}
-}
-
-template<class TYPE>
-TYPE random();
-
-template<>
-double random<double>()
-{
-  init();
-  return static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
-}
-
-template<>
-float random<float>()
-{
-  init();
-  return static_cast<float>(random<double>());
-}
-
-#endif
-
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#ifndef _RANDOM_HPP_
+#define _RANDOM_HPP_
+
+#include <time.h>
+#include <stdlib.h>
+
+inline void init()
+{
+  static bool init = false;
+  if (!init)
+  {
+    srand( (unsigned int)time(NULL) );
+    init = true;
+  }
+}
+
+template<class TYPE>
+TYPE random();
+
+template<>
+double random<double>()
+{
+  init();
+  return static_cast<double>(rand()) / static_cast<double>(RAND_MAX);
+}
+
+template<>
+float random<float>()
+{
+  init();
+  return static_cast<float>(random<double>());
+}
+
+#endif
+
diff --git a/tests/src/blas3_prod_double.cpp b/tests/src/blas3_prod_double.cpp
new file mode 100644
index 0000000..36746fe
--- /dev/null
+++ b/tests/src/blas3_prod_double.cpp
@@ -0,0 +1,65 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "blas3_prod_float_double.hpp"
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: BLAS 3 routines" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+        typedef double NumericT;
+        NumericT epsilon = 1.0E-11;
+        std::cout << "# Testing setup:" << std::endl;
+        std::cout << "  eps:     " << epsilon << std::endl;
+        std::cout << "  numeric: double" << std::endl;
+        retval = test<NumericT>(epsilon);
+        if( retval == EXIT_SUCCESS )
+          std::cout << "# Test passed" << std::endl;
+        else
+          return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/blas3_prod_double.cu b/tests/src/blas3_prod_double.cu
new file mode 100644
index 0000000..36746fe
--- /dev/null
+++ b/tests/src/blas3_prod_double.cu
@@ -0,0 +1,65 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "blas3_prod_float_double.hpp"
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: BLAS 3 routines" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+        typedef double NumericT;
+        NumericT epsilon = 1.0E-11;
+        std::cout << "# Testing setup:" << std::endl;
+        std::cout << "  eps:     " << epsilon << std::endl;
+        std::cout << "  numeric: double" << std::endl;
+        retval = test<NumericT>(epsilon);
+        if( retval == EXIT_SUCCESS )
+          std::cout << "# Test passed" << std::endl;
+        else
+          return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/blas3_prod_float.cpp b/tests/src/blas3_prod_float.cpp
new file mode 100644
index 0000000..b10ee57
--- /dev/null
+++ b/tests/src/blas3_prod_float.cpp
@@ -0,0 +1,61 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "blas3_prod_float_double.hpp"
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: BLAS 3 routines" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/blas3_prod_float.cu b/tests/src/blas3_prod_float.cu
new file mode 100644
index 0000000..b10ee57
--- /dev/null
+++ b/tests/src/blas3_prod_float.cu
@@ -0,0 +1,61 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "blas3_prod_float_double.hpp"
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: BLAS 3 routines" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/blas3_prod_float_double.hpp b/tests/src/blas3_prod_float_double.hpp
new file mode 100644
index 0000000..f6154a0
--- /dev/null
+++ b/tests/src/blas3_prod_float_double.hpp
@@ -0,0 +1,855 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
+
+// We don't need debug mode in UBLAS:
+#define BOOST_UBLAS_NDEBUG
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "examples/tutorial/Random.hpp"
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType>
+ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (std::size_t i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+
+
+
+
+
+
+//
+// Part 1: Matrix-matrix multiplications
+//
+
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+int test_prod(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A, ReferenceMatrixTypeA const & A_trans,
+              ReferenceMatrixTypeB const & B, ReferenceMatrixTypeB const & B_trans,
+              ReferenceMatrixTypeC & C,
+
+              MatrixTypeA const & vcl_A, MatrixTypeA const & vcl_A_trans,
+              MatrixTypeB const & vcl_B, MatrixTypeB const & vcl_B_trans,
+              MatrixTypeC & vcl_C
+             )
+{
+   int retval = EXIT_SUCCESS;
+   NumericT act_diff = 0;
+
+
+   // Test: C +-= A * B --------------------------------------------------------------------------
+   C     = viennacl::linalg::prod(A, B);
+   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = A * B passed!" << std::endl;
+
+
+   C     += viennacl::linalg::prod(A, B);
+   vcl_C += viennacl::linalg::prod(vcl_A, vcl_B);
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += A * B passed!" << std::endl;
+
+   C     -= viennacl::linalg::prod(A, B);
+   vcl_C -= viennacl::linalg::prod(vcl_A, vcl_B);
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= A * B passed!" << std::endl;
+
+
+
+
+
+   // Test: C +-= A * trans(B) --------------------------------------------------------------------------
+   C     = boost::numeric::ublas::prod(A, trans(B_trans));
+   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = A * trans(B) passed!" << std::endl;
+
+
+   C     += boost::numeric::ublas::prod(A, trans(B_trans));
+   vcl_C += viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += A * trans(B) passed!" << std::endl;
+
+
+   C     -= boost::numeric::ublas::prod(A, trans(B_trans));
+   vcl_C -= viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= A * trans(B) passed!" << std::endl;
+
+
+
+   // Test: C +-= trans(A) * B --------------------------------------------------------------------------
+   C     = boost::numeric::ublas::prod(trans(A_trans), B);
+   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = trans(A) * B passed!" << std::endl;
+
+
+   C     += boost::numeric::ublas::prod(trans(A_trans), B);
+   vcl_C += viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += trans(A) * B passed!" << std::endl;
+
+
+   C     -= boost::numeric::ublas::prod(trans(A_trans), B);
+   vcl_C -= viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= trans(A) * B passed!" << std::endl;
+
+
+
+
+
+   // Test: C +-= trans(A) * trans(B) --------------------------------------------------------------------------
+   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
+   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
+
+   C     += boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
+   vcl_C += viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += trans(A) * trans(B) passed!" << std::endl;
+
+
+   C     -= boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
+   vcl_C -= viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= trans(A) * trans(B) passed!" << std::endl;
+
+
+
+
+   return retval;
+}
+
+
+
+template< typename NumericT, typename F_A, typename F_B, typename F_C, typename Epsilon >
+int test_prod(Epsilon const& epsilon)
+{
+  int ret;
+
+  long matrix_size1 = 131;  //some odd number, not too large
+  long matrix_size2 = 67;  //some odd number, not too large
+  long matrix_size3 = 73;  //some odd number, not too large
+  //long matrix_size1 = 128;  //some odd number, not too large
+  //long matrix_size2 = 64;  //some odd number, not too large
+  //long matrix_size3 = 128;  //some odd number, not too large
+  //long matrix_size1 = 256;  // for testing AMD kernels
+  //long matrix_size2 = 256;  // for testing AMD kernels
+  //long matrix_size3 = 256;  // for testing AMD kernels
+
+  // --------------------------------------------------------------------------
+
+  // ublas reference:
+  ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
+  ublas::matrix<NumericT> big_A = ublas::scalar_matrix<NumericT>(4*matrix_size1, 4*matrix_size2, NumericT(3.1415));
+
+  ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
+  ublas::matrix<NumericT> big_B = ublas::scalar_matrix<NumericT>(4*matrix_size2, 4*matrix_size3, NumericT(42.0));
+
+  ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
+
+  //fill A and B:
+  for (unsigned int i = 0; i < A.size1(); ++i)
+    for (unsigned int j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+  for (unsigned int i = 0; i < B.size1(); ++i)
+    for (unsigned int j = 0; j < B.size2(); ++j)
+        B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+
+  ublas::matrix<NumericT>     A_trans = trans(A);
+  ublas::matrix<NumericT> big_A_trans = trans(big_A);
+
+  ublas::matrix<NumericT>     B_trans = trans(B);
+  ublas::matrix<NumericT> big_B_trans = trans(big_B);
+
+  //
+  // ViennaCL objects
+  //
+
+  // A
+  viennacl::range range1_A(matrix_size1, 2*matrix_size1);
+  viennacl::range range2_A(matrix_size2, 2*matrix_size2);
+  viennacl::slice slice1_A(matrix_size1, 2, matrix_size1);
+  viennacl::slice slice2_A(matrix_size2, 3, matrix_size2);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size1, matrix_size2);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size1, 4*matrix_size2);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size1, 4*matrix_size2);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // A^T
+  viennacl::matrix<NumericT, F_A>    vcl_A_trans(matrix_size2, matrix_size1);
+  viennacl::copy(A_trans, vcl_A_trans);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A_trans(4*matrix_size2, 4*matrix_size1);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A_trans(vcl_big_range_A_trans, range2_A, range1_A);
+  viennacl::copy(A_trans, vcl_range_A_trans);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A_trans(4*matrix_size2, 4*matrix_size1);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A_trans(vcl_big_slice_A_trans, slice2_A, slice1_A);
+  viennacl::copy(A_trans, vcl_slice_A_trans);
+
+
+
+  // B
+  viennacl::range range1_B(2*matrix_size2, 3*matrix_size2);
+  viennacl::range range2_B(2*matrix_size3, 3*matrix_size3);
+  viennacl::slice slice1_B(matrix_size2, 3, matrix_size2);
+  viennacl::slice slice2_B(matrix_size3, 2, matrix_size3);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size2, matrix_size3);
+  viennacl::copy(B, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size2, 4*matrix_size3);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B, vcl_range_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size2, 4*matrix_size3);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B, vcl_slice_B);
+
+
+  // B^T
+
+  viennacl::matrix<NumericT, F_B>    vcl_B_trans(matrix_size3, matrix_size2);
+  viennacl::copy(B_trans, vcl_B_trans);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B_trans(4*matrix_size3, 4*matrix_size2);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B_trans(vcl_big_range_B_trans, range2_B, range1_B);
+  viennacl::copy(B_trans, vcl_range_B_trans);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B_trans(4*matrix_size3, 4*matrix_size2);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B_trans(vcl_big_slice_B_trans, slice2_B, slice1_B);
+  viennacl::copy(B_trans, vcl_slice_B_trans);
+
+
+  // C
+
+  viennacl::range range1_C(matrix_size1-1, 2*matrix_size1-1);
+  viennacl::range range2_C(matrix_size3-1, 2*matrix_size3-1);
+  viennacl::slice slice1_C(matrix_size1-1, 3, matrix_size1);
+  viennacl::slice slice2_C(matrix_size3-1, 3, matrix_size3);
+
+  viennacl::matrix<NumericT, F_C>    vcl_C(matrix_size1, matrix_size3);
+
+  viennacl::matrix<NumericT, F_C>    vcl_big_range_C(4*matrix_size1, 4*matrix_size3);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_C> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+
+  viennacl::matrix<NumericT, F_C>    vcl_big_slice_C(4*matrix_size1, 4*matrix_size3);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_C> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+
+
+  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
+
+  //////
+  //////  A: matrix
+  //////
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=range, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=range, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=range, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=slice, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=slice, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=slice, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //////
+  //////  A: range
+  //////
+
+  //
+  //
+  std::cout << "Now using A=range, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=matrix, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=range, B=matrix, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=range, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=range, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=range, B=range, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=slice, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=slice, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=range, B=slice, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //////
+  //////  A: slice
+  //////
+
+  //
+  //
+  std::cout << "Now using A=slice, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=matrix, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=slice, B=matrix, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=range, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=range, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=slice, B=range, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=slice, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=slice, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=slice, B=slice, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  return ret;
+
+}
+
+
+//
+// Control functions
+//
+
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  return ret;
+}
diff --git a/tests/src/blas3.cpp b/tests/src/blas3_solve_double.cpp
similarity index 55%
copy from tests/src/blas3.cpp
copy to tests/src/blas3_solve_double.cpp
index b4e3a9d..e063f79 100644
--- a/tests/src/blas3.cpp
+++ b/tests/src/blas3_solve_double.cpp
@@ -1,20 +1,22 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 //#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
 
 //
 // *** System
@@ -35,11 +37,12 @@
 //
 // *** ViennaCL
 //
-//#define VIENNACL_DEBUG_INFO_ALL
+//#define VIENNACL_DEBUG_ALL
 //#define VIENNACL_DEBUG_BUILD
-#define VIENNACL_HAVE_UBLAS 1
+#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/scalar.hpp"
 #include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/norm_2.hpp"
@@ -53,8 +56,9 @@ using namespace boost::numeric;
 // -------------------------------------------------------------
 //
 template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
 {
+   viennacl::backend::finish();
    if (s1 != s2)
       return (s1 - s2) / std::max(fabs(s1), fabs(s2));
    return 0;
@@ -64,7 +68,9 @@ template <typename ScalarType>
 ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 {
    ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();
    viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
 
    for (std::size_t i=0;i<v1.size(); ++i)
    {
@@ -77,128 +83,36 @@ ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v
    return norm_inf(v2_cpu);
 }
 
-template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-ScalarType diff(ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2)
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
 {
    ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
-   copy(mat2, mat2_cpu);
-   double ret = 0;
-   double act = 0;
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
 
-    for (std::size_t i = 0; i < mat2_cpu.size1(); ++i)
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
     {
-      for (std::size_t j = 0; j < mat2_cpu.size2(); ++j)
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
       {
-         act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
          if (act > ret)
            ret = act;
       }
     }
    //std::cout << ret << std::endl;
-   return ScalarType(ret);
+   return ret;
 }
 
+
+
 //
-// -------------------------------------------------------------
+// Triangular solvers
 //
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename Epsilon >
-int test_prod(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   long matrix_size1 = 157;  //some odd number, not too large
-   long matrix_size2 = 91;  //some odd number, not too large
-   long matrix_size3 = 73;  //some odd number, not too large
-   NumericT act_diff = 0;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
-   ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
-   ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
-
-   //fill A and B:
-   for (unsigned int i = 0; i < A.size1(); ++i)
-      for (unsigned int j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-   for (unsigned int i = 0; i < B.size1(); ++i)
-      for (unsigned int j = 0; j < B.size2(); ++j)
-         B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> B_trans = trans(B);
-   
-   MatrixTypeA vcl_A(matrix_size1, matrix_size2);
-   MatrixTypeB vcl_B(matrix_size2, matrix_size3);
-   MatrixTypeA vcl_A_trans(matrix_size2, matrix_size1);
-   MatrixTypeB vcl_B_trans(matrix_size3, matrix_size2);
-   MatrixTypeC vcl_C(matrix_size1, matrix_size3);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(A_trans, vcl_A_trans);
-   viennacl::copy(B_trans, vcl_B_trans);
-
-   // Test: C = A * B --------------------------------------------------------------------------       
-   C     = viennacl::linalg::prod(A, B);
-   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * B passed!" << std::endl;
-   
-   // Test: C = A * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(A, trans(B_trans));
-   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * trans(B) passed!" << std::endl;
-   
-   // Test: C = trans(A) * B --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), B);
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * B passed!" << std::endl;
-   
-   
-   // Test: C = trans(A) * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
-   
-   
-   
-   return retval;
-}
+
+
 
 template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
 void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
@@ -213,56 +127,38 @@ void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval,
    }
    else
      std::cout << " passed! " << act_diff << std::endl;
-   
+
 }
 
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename Epsilon >
-int test_solve(Epsilon const& epsilon)
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename MatrixTypeResult>
+int test_solve(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A,
+              ReferenceMatrixTypeB const & B_start,
+              ReferenceMatrixTypeC const & C_start,
+
+              MatrixTypeA const & vcl_A,
+              MatrixTypeB & vcl_B,
+              MatrixTypeC & vcl_C,
+              MatrixTypeResult const &
+             )
 {
    int retval = EXIT_SUCCESS;
-   long matrix_size = 83;  //some odd number, not too large
-   long rhs_num = 61;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size, matrix_size);
-   ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
-   ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
-
-   //fill A and B:
-   for (std::size_t i = 0; i < A.size1(); ++i)
-   {
-      for (std::size_t j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
-      A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
-   }
-   
-   for (std::size_t i = 0; i < B_start.size1(); ++i)
-      for (std::size_t j = 0; j < B_start.size2(); ++j)
-         B_start(i,j) = random<NumericT>();
-
-   for (std::size_t i = 0; i < C_start.size1(); ++i)
-      for (std::size_t j = 0; j < C_start.size2(); ++j)
-         C_start(i,j) = random<NumericT>();
-      
-   ublas::matrix<NumericT> B = B_start;
-   ublas::matrix<NumericT> result = B_start;
-   ublas::matrix<NumericT> C = C_start;
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> C_trans = trans(C);
-
-   
-   MatrixTypeA vcl_A(matrix_size, matrix_size);
-   MatrixTypeB vcl_B(matrix_size, rhs_num);
-   MatrixTypeB vcl_result(matrix_size, rhs_num);
-   MatrixTypeB vcl_C(rhs_num, matrix_size);
-   MatrixTypeB vcl_C_result(rhs_num, matrix_size);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(C, vcl_C);
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // --------------------------------------------------------------------------
+
+   ReferenceMatrixTypeA result;
+   ReferenceMatrixTypeC C_trans;
+
+   ReferenceMatrixTypeB B = B_start;
+   ReferenceMatrixTypeC C = C_start;
+
+   MatrixTypeResult vcl_result;
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    result = ublas::solve(A, B, ublas::upper_tag());
@@ -283,14 +179,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(A, B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B^T --------------------------------------------------------------------------       
+
+   // Test: A \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -321,14 +217,14 @@ int test_solve(Epsilon const& epsilon)
    ublas::inplace_solve(A, C_trans, ublas::unit_lower_tag());
    viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::unit_lower_tag());
    C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B^T passed!" << std::endl;
 
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(B, vcl_B);
@@ -353,14 +249,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(trans(A), B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
 
-   // Test: A^T \ B^T --------------------------------------------------------------------------       
+   // Test: A^T \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -394,133 +290,225 @@ int test_solve(Epsilon const& epsilon)
 
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B^T passed!" << std::endl;
-   
-   return retval;  
+
+   return retval;
 }
 
-template< typename NumericT, typename Epsilon >
-int test(Epsilon const& epsilon)
+
+template< typename NumericT, typename F_A, typename F_B, typename Epsilon >
+int test_solve(Epsilon const& epsilon)
 {
-  int ret;
+  int ret = EXIT_SUCCESS;
+  long matrix_size = 135;  //some odd number, not too large
+  long rhs_num = 67;
 
-  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
+
+
+  ublas::matrix<NumericT> A(matrix_size, matrix_size);
+  ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
+  ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
+
+  for (std::size_t i = 0; i < A.size1(); ++i)
+  {
+    for (std::size_t j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
+    A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
+  }
+
+  for (std::size_t i = 0; i < B_start.size1(); ++i)
+    for (std::size_t j = 0; j < B_start.size2(); ++j)
+        B_start(i,j) = random<NumericT>();
+
+  for (std::size_t i = 0; i < C_start.size1(); ++i)
+    for (std::size_t j = 0; j < C_start.size2(); ++j)
+        C_start(i,j) = random<NumericT>();
+
+
+  // A
+  viennacl::range range1_A(matrix_size, 2*matrix_size);
+  viennacl::range range2_A(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_A(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_A(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size, matrix_size);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // B
+  viennacl::range range1_B(matrix_size, 2*matrix_size);
+  viennacl::range range2_B(2*rhs_num, 3*rhs_num);
+  viennacl::slice slice1_B(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_B(0, 3, rhs_num);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size, rhs_num);
+  viennacl::copy(B_start, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B_start, vcl_range_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B_start, vcl_slice_B);
+
+
+  // C
+  viennacl::range range1_C(rhs_num, 2*rhs_num);
+  viennacl::range range2_C(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_C(rhs_num, 2, rhs_num);
+  viennacl::slice slice2_C(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_B>    vcl_C(rhs_num, matrix_size);
+  viennacl::copy(C_start, vcl_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+  viennacl::copy(C_start, vcl_range_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+  viennacl::copy(C_start, vcl_slice_C);
+
+
+  std::cout << "Now using A=matrix, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=range, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  
-  
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=range, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=range, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=slice, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
-  
-  std::cout << "Now using A=row, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=row, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  return ret;
+
+}
+
+
+
+//
+// Control functions
+//
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
-  
+
+
+
   return ret;
 }
 
-
 //
 // -------------------------------------------------------------
 //
@@ -554,7 +542,9 @@ int main()
    std::cout << std::endl;
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
         typedef double NumericT;
@@ -572,5 +562,11 @@ int main()
       std::cout << "----------------------------------------------" << std::endl;
       std::cout << std::endl;
    }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
    return retval;
 }
diff --git a/tests/src/blas3range.cpp b/tests/src/blas3_solve_double.cu
similarity index 54%
rename from tests/src/blas3range.cpp
rename to tests/src/blas3_solve_double.cu
index 6b5b613..e063f79 100644
--- a/tests/src/blas3range.cpp
+++ b/tests/src/blas3_solve_double.cu
@@ -1,20 +1,22 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 //#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
 
 //
 // *** System
@@ -35,13 +37,13 @@
 //
 // *** ViennaCL
 //
-//#define VIENNACL_DEBUG_INFO_ALL
+//#define VIENNACL_DEBUG_ALL
 //#define VIENNACL_DEBUG_BUILD
-#define VIENNACL_HAVE_UBLAS 1
+#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/scalar.hpp"
 #include "viennacl/matrix.hpp"
-#include "viennacl/vector.hpp"
 #include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/norm_2.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
@@ -54,8 +56,9 @@ using namespace boost::numeric;
 // -------------------------------------------------------------
 //
 template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
 {
+   viennacl::backend::finish();
    if (s1 != s2)
       return (s1 - s2) / std::max(fabs(s1), fabs(s2));
    return 0;
@@ -65,9 +68,11 @@ template <typename ScalarType>
 ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 {
    ublas::vector<ScalarType> v2_cpu(v2.size());
-   copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
 
-   for (unsigned int i=0;i<v1.size(); ++i)
+   for (std::size_t i=0;i<v1.size(); ++i)
    {
       if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
          v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
@@ -78,19 +83,21 @@ ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v
    return norm_inf(v2_cpu);
 }
 
+
 template <typename ScalarType, typename VCLMatrixType>
 ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
 {
    ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
    viennacl::copy(mat2, mat2_cpu);
-   double ret = 0;
-   double act = 0;
+   ScalarType ret = 0;
+   ScalarType act = 0;
 
     for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
     {
       for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
       {
-         act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
          if (act > ret)
            ret = act;
       }
@@ -99,119 +106,14 @@ ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
    return ret;
 }
 
+
+
 //
-// -------------------------------------------------------------
+// Triangular solvers
 //
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename Epsilon >
-int test_prod(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   long matrix_size1 = 157;  //some odd number, not too large
-   long matrix_size2 = 91;  //some odd number, not too large
-   long matrix_size3 = 73;  //some odd number, not too large
-   NumericT act_diff = 0;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
-   ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
-   ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
-
-   //fill A and B:
-   for (unsigned int i = 0; i < A.size1(); ++i)
-      for (unsigned int j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-   for (unsigned int i = 0; i < B.size1(); ++i)
-      for (unsigned int j = 0; j < B.size2(); ++j)
-         B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> B_trans = trans(B);
-   
-   MatrixTypeA vcl_A_full(3*matrix_size1, 3*matrix_size2); vcl_A_full.clear();
-   MatrixTypeB vcl_B_full(3*matrix_size2, 3*matrix_size3); vcl_B_full.clear();
-   MatrixTypeA vcl_A_trans_full(3*matrix_size2, 3*matrix_size1); vcl_A_trans_full.clear();
-   MatrixTypeB vcl_B_trans_full(3*matrix_size3, 3*matrix_size2); vcl_B_trans_full.clear();
-   MatrixTypeC vcl_C_full(3*matrix_size1, 3*matrix_size3); vcl_C_full.clear();
-
-   viennacl::range r1(matrix_size1, 2*matrix_size1);
-   viennacl::range r2(matrix_size2, 2*matrix_size2);
-   viennacl::range r3(matrix_size3, 2*matrix_size3);
-   viennacl::matrix_range<MatrixTypeA> vcl_A(vcl_A_full, r1, r2);
-   viennacl::matrix_range<MatrixTypeB> vcl_B(vcl_B_full, r2, r3);
-   viennacl::matrix_range<MatrixTypeA> vcl_A_trans(vcl_A_trans_full, r2, r1);
-   viennacl::matrix_range<MatrixTypeB> vcl_B_trans(vcl_B_trans_full, r3, r2);
-   viennacl::matrix_range<MatrixTypeC> vcl_C(vcl_C_full, r1, r3);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(A_trans, vcl_A_trans);
-   viennacl::copy(B_trans, vcl_B_trans);
-
-   // Test: C = A * B --------------------------------------------------------------------------       
-   C     = viennacl::linalg::prod(A, B);
-   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * B passed!" << std::endl;
-   
-   // Test: C = A * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(A, trans(B_trans));
-   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * trans(B) passed!" << std::endl;
-   
-   // Test: C = trans(A) * B --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), B);
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * B passed!" << std::endl;
-   
-   
-   // Test: C = trans(A) * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
-   
-   
-   
-   return retval;
-}
 
 
-/*
+
 template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
 void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
 {
@@ -225,56 +127,38 @@ void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval,
    }
    else
      std::cout << " passed! " << act_diff << std::endl;
-   
+
 }
 
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename Epsilon >
-int test_solve(Epsilon const& epsilon)
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename MatrixTypeResult>
+int test_solve(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A,
+              ReferenceMatrixTypeB const & B_start,
+              ReferenceMatrixTypeC const & C_start,
+
+              MatrixTypeA const & vcl_A,
+              MatrixTypeB & vcl_B,
+              MatrixTypeC & vcl_C,
+              MatrixTypeResult const &
+             )
 {
    int retval = EXIT_SUCCESS;
-   long matrix_size = 83;  //some odd number, not too large
-   long rhs_num = 61;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size, matrix_size);
-   ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
-   ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
-
-   //fill A and B:
-   for (unsigned int i = 0; i < A.size1(); ++i)
-   {
-      for (unsigned int j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
-      A(i,i) = 1.0 + 2.0 * random<NumericT>(); //some extra weight on diagonal for stability
-   }
-   
-   for (unsigned int i = 0; i < B_start.size1(); ++i)
-      for (unsigned int j = 0; j < B_start.size2(); ++j)
-         B_start(i,j) = random<NumericT>();
-
-   for (unsigned int i = 0; i < C_start.size1(); ++i)
-      for (unsigned int j = 0; j < C_start.size2(); ++j)
-         C_start(i,j) = random<NumericT>();
-      
-   ublas::matrix<NumericT> B = B_start;
-   ublas::matrix<NumericT> result = B_start;
-   ublas::matrix<NumericT> C = C_start;
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> C_trans = trans(C);
-
-   
-   MatrixTypeA vcl_A(matrix_size, matrix_size);
-   MatrixTypeB vcl_B(matrix_size, rhs_num);
-   MatrixTypeB vcl_result(matrix_size, rhs_num);
-   MatrixTypeB vcl_C(rhs_num, matrix_size);
-   MatrixTypeB vcl_C_result(rhs_num, matrix_size);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(C, vcl_C);
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // --------------------------------------------------------------------------
+
+   ReferenceMatrixTypeA result;
+   ReferenceMatrixTypeC C_trans;
+
+   ReferenceMatrixTypeB B = B_start;
+   ReferenceMatrixTypeC C = C_start;
+
+   MatrixTypeResult vcl_result;
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    result = ublas::solve(A, B, ublas::upper_tag());
@@ -295,14 +179,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(A, B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B^T --------------------------------------------------------------------------       
+
+   // Test: A \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -333,14 +217,14 @@ int test_solve(Epsilon const& epsilon)
    ublas::inplace_solve(A, C_trans, ublas::unit_lower_tag());
    viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::unit_lower_tag());
    C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B^T passed!" << std::endl;
 
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(B, vcl_B);
@@ -365,14 +249,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(trans(A), B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
 
-   // Test: A^T \ B^T --------------------------------------------------------------------------       
+   // Test: A^T \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -406,134 +290,225 @@ int test_solve(Epsilon const& epsilon)
 
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B^T passed!" << std::endl;
-   
-   return retval;  
-} */
 
-template< typename NumericT, typename Epsilon >
-int test(Epsilon const& epsilon)
+   return retval;
+}
+
+
+template< typename NumericT, typename F_A, typename F_B, typename Epsilon >
+int test_solve(Epsilon const& epsilon)
 {
-  int ret;
+  int ret = EXIT_SUCCESS;
+  long matrix_size = 135;  //some odd number, not too large
+  long rhs_num = 67;
+
+  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
+
+
+  ublas::matrix<NumericT> A(matrix_size, matrix_size);
+  ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
+  ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
+
+  for (std::size_t i = 0; i < A.size1(); ++i)
+  {
+    for (std::size_t j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
+    A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
+  }
+
+  for (std::size_t i = 0; i < B_start.size1(); ++i)
+    for (std::size_t j = 0; j < B_start.size2(); ++j)
+        B_start(i,j) = random<NumericT>();
+
+  for (std::size_t i = 0; i < C_start.size1(); ++i)
+    for (std::size_t j = 0; j < C_start.size2(); ++j)
+        C_start(i,j) = random<NumericT>();
+
+
+  // A
+  viennacl::range range1_A(matrix_size, 2*matrix_size);
+  viennacl::range range2_A(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_A(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_A(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size, matrix_size);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // B
+  viennacl::range range1_B(matrix_size, 2*matrix_size);
+  viennacl::range range2_B(2*rhs_num, 3*rhs_num);
+  viennacl::slice slice1_B(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_B(0, 3, rhs_num);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size, rhs_num);
+  viennacl::copy(B_start, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B_start, vcl_range_B);
 
-  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B_start, vcl_slice_B);
+
+
+  // C
+  viennacl::range range1_C(rhs_num, 2*rhs_num);
+  viennacl::range range2_C(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_C(rhs_num, 2, rhs_num);
+  viennacl::slice slice2_C(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_B>    vcl_C(rhs_num, matrix_size);
+  viennacl::copy(C_start, vcl_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+  viennacl::copy(C_start, vcl_range_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+  viennacl::copy(C_start, vcl_slice_C);
+
+
+  std::cout << "Now using A=matrix, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=range, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  
-  
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=range, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=range, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=slice, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  /*
-  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
-  
-  std::cout << "Now using A=row, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=row, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  return ret;
+
+}
+
+
+
+//
+// Control functions
+//
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
-    return ret; */
-  
+    return ret;
+
+
+
   return ret;
 }
 
-
 //
 // -------------------------------------------------------------
 //
@@ -554,7 +529,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-3;
+      NumericT epsilon = NumericT(1.0E-3);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
@@ -567,7 +542,9 @@ int main()
    std::cout << std::endl;
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
         typedef double NumericT;
@@ -585,5 +562,11 @@ int main()
       std::cout << "----------------------------------------------" << std::endl;
       std::cout << std::endl;
    }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
    return retval;
 }
diff --git a/tests/src/blas3.cpp b/tests/src/blas3_solve_float.cpp
similarity index 55%
copy from tests/src/blas3.cpp
copy to tests/src/blas3_solve_float.cpp
index b4e3a9d..e063f79 100644
--- a/tests/src/blas3.cpp
+++ b/tests/src/blas3_solve_float.cpp
@@ -1,20 +1,22 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 //#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
 
 //
 // *** System
@@ -35,11 +37,12 @@
 //
 // *** ViennaCL
 //
-//#define VIENNACL_DEBUG_INFO_ALL
+//#define VIENNACL_DEBUG_ALL
 //#define VIENNACL_DEBUG_BUILD
-#define VIENNACL_HAVE_UBLAS 1
+#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/scalar.hpp"
 #include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/norm_2.hpp"
@@ -53,8 +56,9 @@ using namespace boost::numeric;
 // -------------------------------------------------------------
 //
 template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
 {
+   viennacl::backend::finish();
    if (s1 != s2)
       return (s1 - s2) / std::max(fabs(s1), fabs(s2));
    return 0;
@@ -64,7 +68,9 @@ template <typename ScalarType>
 ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 {
    ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();
    viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
 
    for (std::size_t i=0;i<v1.size(); ++i)
    {
@@ -77,128 +83,36 @@ ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v
    return norm_inf(v2_cpu);
 }
 
-template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-ScalarType diff(ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2)
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
 {
    ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
-   copy(mat2, mat2_cpu);
-   double ret = 0;
-   double act = 0;
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
 
-    for (std::size_t i = 0; i < mat2_cpu.size1(); ++i)
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
     {
-      for (std::size_t j = 0; j < mat2_cpu.size2(); ++j)
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
       {
-         act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
          if (act > ret)
            ret = act;
       }
     }
    //std::cout << ret << std::endl;
-   return ScalarType(ret);
+   return ret;
 }
 
+
+
 //
-// -------------------------------------------------------------
+// Triangular solvers
 //
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename Epsilon >
-int test_prod(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   long matrix_size1 = 157;  //some odd number, not too large
-   long matrix_size2 = 91;  //some odd number, not too large
-   long matrix_size3 = 73;  //some odd number, not too large
-   NumericT act_diff = 0;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
-   ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
-   ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
-
-   //fill A and B:
-   for (unsigned int i = 0; i < A.size1(); ++i)
-      for (unsigned int j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-   for (unsigned int i = 0; i < B.size1(); ++i)
-      for (unsigned int j = 0; j < B.size2(); ++j)
-         B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> B_trans = trans(B);
-   
-   MatrixTypeA vcl_A(matrix_size1, matrix_size2);
-   MatrixTypeB vcl_B(matrix_size2, matrix_size3);
-   MatrixTypeA vcl_A_trans(matrix_size2, matrix_size1);
-   MatrixTypeB vcl_B_trans(matrix_size3, matrix_size2);
-   MatrixTypeC vcl_C(matrix_size1, matrix_size3);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(A_trans, vcl_A_trans);
-   viennacl::copy(B_trans, vcl_B_trans);
-
-   // Test: C = A * B --------------------------------------------------------------------------       
-   C     = viennacl::linalg::prod(A, B);
-   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * B passed!" << std::endl;
-   
-   // Test: C = A * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(A, trans(B_trans));
-   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * trans(B) passed!" << std::endl;
-   
-   // Test: C = trans(A) * B --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), B);
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * B passed!" << std::endl;
-   
-   
-   // Test: C = trans(A) * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
-   
-   
-   
-   return retval;
-}
+
+
 
 template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
 void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
@@ -213,56 +127,38 @@ void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval,
    }
    else
      std::cout << " passed! " << act_diff << std::endl;
-   
+
 }
 
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename Epsilon >
-int test_solve(Epsilon const& epsilon)
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename MatrixTypeResult>
+int test_solve(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A,
+              ReferenceMatrixTypeB const & B_start,
+              ReferenceMatrixTypeC const & C_start,
+
+              MatrixTypeA const & vcl_A,
+              MatrixTypeB & vcl_B,
+              MatrixTypeC & vcl_C,
+              MatrixTypeResult const &
+             )
 {
    int retval = EXIT_SUCCESS;
-   long matrix_size = 83;  //some odd number, not too large
-   long rhs_num = 61;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size, matrix_size);
-   ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
-   ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
-
-   //fill A and B:
-   for (std::size_t i = 0; i < A.size1(); ++i)
-   {
-      for (std::size_t j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
-      A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
-   }
-   
-   for (std::size_t i = 0; i < B_start.size1(); ++i)
-      for (std::size_t j = 0; j < B_start.size2(); ++j)
-         B_start(i,j) = random<NumericT>();
-
-   for (std::size_t i = 0; i < C_start.size1(); ++i)
-      for (std::size_t j = 0; j < C_start.size2(); ++j)
-         C_start(i,j) = random<NumericT>();
-      
-   ublas::matrix<NumericT> B = B_start;
-   ublas::matrix<NumericT> result = B_start;
-   ublas::matrix<NumericT> C = C_start;
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> C_trans = trans(C);
-
-   
-   MatrixTypeA vcl_A(matrix_size, matrix_size);
-   MatrixTypeB vcl_B(matrix_size, rhs_num);
-   MatrixTypeB vcl_result(matrix_size, rhs_num);
-   MatrixTypeB vcl_C(rhs_num, matrix_size);
-   MatrixTypeB vcl_C_result(rhs_num, matrix_size);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(C, vcl_C);
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // --------------------------------------------------------------------------
+
+   ReferenceMatrixTypeA result;
+   ReferenceMatrixTypeC C_trans;
+
+   ReferenceMatrixTypeB B = B_start;
+   ReferenceMatrixTypeC C = C_start;
+
+   MatrixTypeResult vcl_result;
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    result = ublas::solve(A, B, ublas::upper_tag());
@@ -283,14 +179,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(A, B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B^T --------------------------------------------------------------------------       
+
+   // Test: A \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -321,14 +217,14 @@ int test_solve(Epsilon const& epsilon)
    ublas::inplace_solve(A, C_trans, ublas::unit_lower_tag());
    viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::unit_lower_tag());
    C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B^T passed!" << std::endl;
 
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(B, vcl_B);
@@ -353,14 +249,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(trans(A), B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
 
-   // Test: A^T \ B^T --------------------------------------------------------------------------       
+   // Test: A^T \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -394,133 +290,225 @@ int test_solve(Epsilon const& epsilon)
 
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B^T passed!" << std::endl;
-   
-   return retval;  
+
+   return retval;
 }
 
-template< typename NumericT, typename Epsilon >
-int test(Epsilon const& epsilon)
+
+template< typename NumericT, typename F_A, typename F_B, typename Epsilon >
+int test_solve(Epsilon const& epsilon)
 {
-  int ret;
+  int ret = EXIT_SUCCESS;
+  long matrix_size = 135;  //some odd number, not too large
+  long rhs_num = 67;
 
-  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
+
+
+  ublas::matrix<NumericT> A(matrix_size, matrix_size);
+  ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
+  ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
+
+  for (std::size_t i = 0; i < A.size1(); ++i)
+  {
+    for (std::size_t j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
+    A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
+  }
+
+  for (std::size_t i = 0; i < B_start.size1(); ++i)
+    for (std::size_t j = 0; j < B_start.size2(); ++j)
+        B_start(i,j) = random<NumericT>();
+
+  for (std::size_t i = 0; i < C_start.size1(); ++i)
+    for (std::size_t j = 0; j < C_start.size2(); ++j)
+        C_start(i,j) = random<NumericT>();
+
+
+  // A
+  viennacl::range range1_A(matrix_size, 2*matrix_size);
+  viennacl::range range2_A(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_A(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_A(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size, matrix_size);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // B
+  viennacl::range range1_B(matrix_size, 2*matrix_size);
+  viennacl::range range2_B(2*rhs_num, 3*rhs_num);
+  viennacl::slice slice1_B(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_B(0, 3, rhs_num);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size, rhs_num);
+  viennacl::copy(B_start, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B_start, vcl_range_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B_start, vcl_slice_B);
+
+
+  // C
+  viennacl::range range1_C(rhs_num, 2*rhs_num);
+  viennacl::range range2_C(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_C(rhs_num, 2, rhs_num);
+  viennacl::slice slice2_C(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_B>    vcl_C(rhs_num, matrix_size);
+  viennacl::copy(C_start, vcl_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+  viennacl::copy(C_start, vcl_range_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+  viennacl::copy(C_start, vcl_slice_C);
+
+
+  std::cout << "Now using A=matrix, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=range, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  
-  
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=range, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=range, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=slice, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
-  
-  std::cout << "Now using A=row, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=row, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  return ret;
+
+}
+
+
+
+//
+// Control functions
+//
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
-  
+
+
+
   return ret;
 }
 
-
 //
 // -------------------------------------------------------------
 //
@@ -554,7 +542,9 @@ int main()
    std::cout << std::endl;
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
         typedef double NumericT;
@@ -572,5 +562,11 @@ int main()
       std::cout << "----------------------------------------------" << std::endl;
       std::cout << std::endl;
    }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
    return retval;
 }
diff --git a/tests/src/blas3.cpp b/tests/src/blas3_solve_float.cu
similarity index 55%
rename from tests/src/blas3.cpp
rename to tests/src/blas3_solve_float.cu
index b4e3a9d..e063f79 100644
--- a/tests/src/blas3.cpp
+++ b/tests/src/blas3_solve_float.cu
@@ -1,20 +1,22 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 //#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
 
 //
 // *** System
@@ -35,11 +37,12 @@
 //
 // *** ViennaCL
 //
-//#define VIENNACL_DEBUG_INFO_ALL
+//#define VIENNACL_DEBUG_ALL
 //#define VIENNACL_DEBUG_BUILD
-#define VIENNACL_HAVE_UBLAS 1
+#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/scalar.hpp"
 #include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/norm_2.hpp"
@@ -53,8 +56,9 @@ using namespace boost::numeric;
 // -------------------------------------------------------------
 //
 template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
 {
+   viennacl::backend::finish();
    if (s1 != s2)
       return (s1 - s2) / std::max(fabs(s1), fabs(s2));
    return 0;
@@ -64,7 +68,9 @@ template <typename ScalarType>
 ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 {
    ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();
    viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
 
    for (std::size_t i=0;i<v1.size(); ++i)
    {
@@ -77,128 +83,36 @@ ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v
    return norm_inf(v2_cpu);
 }
 
-template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-ScalarType diff(ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2)
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
 {
    ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
-   copy(mat2, mat2_cpu);
-   double ret = 0;
-   double act = 0;
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
 
-    for (std::size_t i = 0; i < mat2_cpu.size1(); ++i)
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
     {
-      for (std::size_t j = 0; j < mat2_cpu.size2(); ++j)
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
       {
-         act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
          if (act > ret)
            ret = act;
       }
     }
    //std::cout << ret << std::endl;
-   return ScalarType(ret);
+   return ret;
 }
 
+
+
 //
-// -------------------------------------------------------------
+// Triangular solvers
 //
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename Epsilon >
-int test_prod(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   long matrix_size1 = 157;  //some odd number, not too large
-   long matrix_size2 = 91;  //some odd number, not too large
-   long matrix_size3 = 73;  //some odd number, not too large
-   NumericT act_diff = 0;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
-   ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
-   ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
-
-   //fill A and B:
-   for (unsigned int i = 0; i < A.size1(); ++i)
-      for (unsigned int j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-   for (unsigned int i = 0; i < B.size1(); ++i)
-      for (unsigned int j = 0; j < B.size2(); ++j)
-         B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> B_trans = trans(B);
-   
-   MatrixTypeA vcl_A(matrix_size1, matrix_size2);
-   MatrixTypeB vcl_B(matrix_size2, matrix_size3);
-   MatrixTypeA vcl_A_trans(matrix_size2, matrix_size1);
-   MatrixTypeB vcl_B_trans(matrix_size3, matrix_size2);
-   MatrixTypeC vcl_C(matrix_size1, matrix_size3);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(A_trans, vcl_A_trans);
-   viennacl::copy(B_trans, vcl_B_trans);
-
-   // Test: C = A * B --------------------------------------------------------------------------       
-   C     = viennacl::linalg::prod(A, B);
-   vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * B passed!" << std::endl;
-   
-   // Test: C = A * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(A, trans(B_trans));
-   vcl_C = viennacl::linalg::prod(vcl_A, trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = A * trans(B) passed!" << std::endl;
-   
-   // Test: C = trans(A) * B --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), B);
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), vcl_B);
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * B passed!" << std::endl;
-   
-   
-   // Test: C = trans(A) * trans(B) --------------------------------------------------------------------------       
-   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
-   vcl_C = viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans));
-   act_diff = fabs(diff(C, vcl_C));
-   
-   if( act_diff > epsilon )
-   {
-     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
-     std::cout << "  diff: " << act_diff << std::endl;
-     retval = EXIT_FAILURE;
-   }
-   else
-     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
-   
-   
-   
-   return retval;
-}
+
+
 
 template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
 void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
@@ -213,56 +127,38 @@ void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval,
    }
    else
      std::cout << " passed! " << act_diff << std::endl;
-   
+
 }
 
-template< typename NumericT, typename MatrixTypeA, typename MatrixTypeB, typename Epsilon >
-int test_solve(Epsilon const& epsilon)
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename MatrixTypeResult>
+int test_solve(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A,
+              ReferenceMatrixTypeB const & B_start,
+              ReferenceMatrixTypeC const & C_start,
+
+              MatrixTypeA const & vcl_A,
+              MatrixTypeB & vcl_B,
+              MatrixTypeC & vcl_C,
+              MatrixTypeResult const &
+             )
 {
    int retval = EXIT_SUCCESS;
-   long matrix_size = 83;  //some odd number, not too large
-   long rhs_num = 61;
-   
-   // --------------------------------------------------------------------------            
-   ublas::matrix<NumericT> A(matrix_size, matrix_size);
-   ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
-   ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
-
-   //fill A and B:
-   for (std::size_t i = 0; i < A.size1(); ++i)
-   {
-      for (std::size_t j = 0; j < A.size2(); ++j)
-         A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
-      A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
-   }
-   
-   for (std::size_t i = 0; i < B_start.size1(); ++i)
-      for (std::size_t j = 0; j < B_start.size2(); ++j)
-         B_start(i,j) = random<NumericT>();
-
-   for (std::size_t i = 0; i < C_start.size1(); ++i)
-      for (std::size_t j = 0; j < C_start.size2(); ++j)
-         C_start(i,j) = random<NumericT>();
-      
-   ublas::matrix<NumericT> B = B_start;
-   ublas::matrix<NumericT> result = B_start;
-   ublas::matrix<NumericT> C = C_start;
-   ublas::matrix<NumericT> A_trans = trans(A);
-   ublas::matrix<NumericT> C_trans = trans(C);
-
-   
-   MatrixTypeA vcl_A(matrix_size, matrix_size);
-   MatrixTypeB vcl_B(matrix_size, rhs_num);
-   MatrixTypeB vcl_result(matrix_size, rhs_num);
-   MatrixTypeB vcl_C(rhs_num, matrix_size);
-   MatrixTypeB vcl_C_result(rhs_num, matrix_size);
-
-   
-   viennacl::copy(A, vcl_A);
-   viennacl::copy(B, vcl_B);
-   viennacl::copy(C, vcl_C);
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // --------------------------------------------------------------------------
+
+   ReferenceMatrixTypeA result;
+   ReferenceMatrixTypeC C_trans;
+
+   ReferenceMatrixTypeB B = B_start;
+   ReferenceMatrixTypeC C = C_start;
+
+   MatrixTypeResult vcl_result;
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    result = ublas::solve(A, B, ublas::upper_tag());
@@ -283,14 +179,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(A, B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B^T --------------------------------------------------------------------------       
+
+   // Test: A \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -321,14 +217,14 @@ int test_solve(Epsilon const& epsilon)
    ublas::inplace_solve(A, C_trans, ublas::unit_lower_tag());
    viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::unit_lower_tag());
    C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A \\ B^T passed!" << std::endl;
 
    B = B_start;
    C = C_start;
-   
-   // Test: A \ B with various tags --------------------------------------------------------------------------       
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(B, vcl_B);
@@ -353,14 +249,14 @@ int test_solve(Epsilon const& epsilon)
    result = ublas::solve(trans(A), B, ublas::unit_lower_tag());
    vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::unit_lower_tag());
    run_solver_check(result, vcl_result, retval, epsilon);
-   
+
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B passed!" << std::endl;
-   
+
    B = B_start;
    C = C_start;
 
-   // Test: A^T \ B^T --------------------------------------------------------------------------       
+   // Test: A^T \ B^T --------------------------------------------------------------------------
    std::cout << "Testing A^T \\ B^T: " << std::endl;
    std::cout << " * upper_tag:      ";
    viennacl::copy(C, vcl_C); C_trans = trans(C);
@@ -394,133 +290,225 @@ int test_solve(Epsilon const& epsilon)
 
    if (retval == EXIT_SUCCESS)
      std::cout << "Test A^T \\ B^T passed!" << std::endl;
-   
-   return retval;  
+
+   return retval;
 }
 
-template< typename NumericT, typename Epsilon >
-int test(Epsilon const& epsilon)
+
+template< typename NumericT, typename F_A, typename F_B, typename Epsilon >
+int test_solve(Epsilon const& epsilon)
 {
-  int ret;
+  int ret = EXIT_SUCCESS;
+  long matrix_size = 135;  //some odd number, not too large
+  long rhs_num = 67;
 
-  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
+
+
+  ublas::matrix<NumericT> A(matrix_size, matrix_size);
+  ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
+  ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
+
+  for (std::size_t i = 0; i < A.size1(); ++i)
+  {
+    for (std::size_t j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
+    A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
+  }
+
+  for (std::size_t i = 0; i < B_start.size1(); ++i)
+    for (std::size_t j = 0; j < B_start.size2(); ++j)
+        B_start(i,j) = random<NumericT>();
+
+  for (std::size_t i = 0; i < C_start.size1(); ++i)
+    for (std::size_t j = 0; j < C_start.size2(); ++j)
+        C_start(i,j) = random<NumericT>();
+
+
+  // A
+  viennacl::range range1_A(matrix_size, 2*matrix_size);
+  viennacl::range range2_A(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_A(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_A(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size, matrix_size);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // B
+  viennacl::range range1_B(matrix_size, 2*matrix_size);
+  viennacl::range range2_B(2*rhs_num, 3*rhs_num);
+  viennacl::slice slice1_B(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_B(0, 3, rhs_num);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size, rhs_num);
+  viennacl::copy(B_start, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B_start, vcl_range_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B_start, vcl_slice_B);
+
+
+  // C
+  viennacl::range range1_C(rhs_num, 2*rhs_num);
+  viennacl::range range2_C(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_C(rhs_num, 2, rhs_num);
+  viennacl::slice slice2_C(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_B>    vcl_C(rhs_num, matrix_size);
+  viennacl::copy(C_start, vcl_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+  viennacl::copy(C_start, vcl_range_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+  viennacl::copy(C_start, vcl_slice_C);
+
+
+  std::cout << "Now using A=matrix, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=matrix, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=row, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=range, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  
-  
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "Now using A=range, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=row, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "Now using A=range, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=row" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+
+
+  std::cout << "Now using A=slice, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_B, vcl_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  //
-  //
-  std::cout << "Now using A=column, B=column, C=column" << std::endl;
-  ret = test_prod<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
-  
-  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
-  
-  std::cout << "Now using A=row, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+
+  std::cout << "Now using A=slice, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=row, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::row_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+
+
+
+  return ret;
+
+}
+
+
+
+//
+// Control functions
+//
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=row" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::row_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
 
-  std::cout << "Now using A=col, B=col" << std::endl;
-  ret = test_solve<NumericT,
-             viennacl::matrix<NumericT, viennacl::column_major>,
-             viennacl::matrix<NumericT, viennacl::column_major>  >(epsilon);
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
   if (ret != EXIT_SUCCESS)
     return ret;
-  
+
+
+
   return ret;
 }
 
-
 //
 // -------------------------------------------------------------
 //
@@ -554,7 +542,9 @@ int main()
    std::cout << std::endl;
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
         typedef double NumericT;
@@ -572,5 +562,11 @@ int main()
       std::cout << "----------------------------------------------" << std::endl;
       std::cout << std::endl;
    }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
    return retval;
 }
diff --git a/tests/src/blas3_solve_float_double.hpp b/tests/src/blas3_solve_float_double.hpp
new file mode 100644
index 0000000..6a1a22a
--- /dev/null
+++ b/tests/src/blas3_solve_float_double.hpp
@@ -0,0 +1,514 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
+
+// We don't need debug mode in UBLAS:
+#define BOOST_UBLAS_NDEBUG
+
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "examples/tutorial/Random.hpp"
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType>
+ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
+
+   for (std::size_t i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+
+
+
+//
+// Triangular solvers
+//
+
+
+
+template <typename RHSTypeRef, typename RHSTypeCheck, typename Epsilon >
+void run_solver_check(RHSTypeRef & B_ref, RHSTypeCheck & B_check, int & retval, Epsilon const & epsilon)
+{
+   double act_diff = std::fabs(diff(B_ref, B_check));
+   if( act_diff > epsilon )
+   {
+     std::cout << " FAILED!" << std::endl;
+     std::cout << "# Error at operation: matrix-matrix solve" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << " passed! " << act_diff << std::endl;
+
+}
+
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC, typename MatrixTypeResult>
+int test_solve(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A,
+              ReferenceMatrixTypeB const & B_start,
+              ReferenceMatrixTypeC const & C_start,
+
+              MatrixTypeA const & vcl_A,
+              MatrixTypeB & vcl_B,
+              MatrixTypeC & vcl_C,
+              MatrixTypeResult const &
+             )
+{
+   int retval = EXIT_SUCCESS;
+
+   // --------------------------------------------------------------------------
+
+   ReferenceMatrixTypeA result;
+   ReferenceMatrixTypeC C_trans;
+
+   ReferenceMatrixTypeB B = B_start;
+   ReferenceMatrixTypeC C = C_start;
+
+   MatrixTypeResult vcl_result;
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
+   std::cout << "Testing A \\ B: " << std::endl;
+   std::cout << " * upper_tag:      ";
+   result = ublas::solve(A, B, ublas::upper_tag());
+   vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::upper_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   std::cout << " * unit_upper_tag: ";
+   result = ublas::solve(A, B, ublas::unit_upper_tag());
+   vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::unit_upper_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   std::cout << " * lower_tag:      ";
+   result = ublas::solve(A, B, ublas::lower_tag());
+   vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::lower_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   std::cout << " * unit_lower_tag: ";
+   result = ublas::solve(A, B, ublas::unit_lower_tag());
+   vcl_result = viennacl::linalg::solve(vcl_A, vcl_B, viennacl::linalg::unit_lower_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   if (retval == EXIT_SUCCESS)
+     std::cout << "Test A \\ B passed!" << std::endl;
+
+   B = B_start;
+   C = C_start;
+
+   // Test: A \ B^T --------------------------------------------------------------------------
+   std::cout << "Testing A \\ B^T: " << std::endl;
+   std::cout << " * upper_tag:      ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   //check solve():
+   result = ublas::solve(A, C_trans, ublas::upper_tag());
+   vcl_result = viennacl::linalg::solve(vcl_A, trans(vcl_C), viennacl::linalg::upper_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+   //check compute kernels:
+   std::cout << " * upper_tag:      ";
+   ublas::inplace_solve(A, C_trans, ublas::upper_tag());
+   viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::upper_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   std::cout << " * unit_upper_tag: ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   ublas::inplace_solve(A, C_trans, ublas::unit_upper_tag());
+   viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::unit_upper_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   std::cout << " * lower_tag:      ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   ublas::inplace_solve(A, C_trans, ublas::lower_tag());
+   viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::lower_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   std::cout << " * unit_lower_tag: ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   ublas::inplace_solve(A, C_trans, ublas::unit_lower_tag());
+   viennacl::linalg::inplace_solve(vcl_A, trans(vcl_C), viennacl::linalg::unit_lower_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   if (retval == EXIT_SUCCESS)
+     std::cout << "Test A \\ B^T passed!" << std::endl;
+
+   B = B_start;
+   C = C_start;
+
+   // Test: A \ B with various tags --------------------------------------------------------------------------
+   std::cout << "Testing A^T \\ B: " << std::endl;
+   std::cout << " * upper_tag:      ";
+   viennacl::copy(B, vcl_B);
+   result = ublas::solve(trans(A), B, ublas::upper_tag());
+   vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::upper_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   std::cout << " * unit_upper_tag: ";
+   viennacl::copy(B, vcl_B);
+   result = ublas::solve(trans(A), B, ublas::unit_upper_tag());
+   vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::unit_upper_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   std::cout << " * lower_tag:      ";
+   viennacl::copy(B, vcl_B);
+   result = ublas::solve(trans(A), B, ublas::lower_tag());
+   vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::lower_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   std::cout << " * unit_lower_tag: ";
+   viennacl::copy(B, vcl_B);
+   result = ublas::solve(trans(A), B, ublas::unit_lower_tag());
+   vcl_result = viennacl::linalg::solve(trans(vcl_A), vcl_B, viennacl::linalg::unit_lower_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+
+   if (retval == EXIT_SUCCESS)
+     std::cout << "Test A^T \\ B passed!" << std::endl;
+
+   B = B_start;
+   C = C_start;
+
+   // Test: A^T \ B^T --------------------------------------------------------------------------
+   std::cout << "Testing A^T \\ B^T: " << std::endl;
+   std::cout << " * upper_tag:      ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   //check solve():
+   result = ublas::solve(trans(A), C_trans, ublas::upper_tag());
+   vcl_result = viennacl::linalg::solve(trans(vcl_A), trans(vcl_C), viennacl::linalg::upper_tag());
+   run_solver_check(result, vcl_result, retval, epsilon);
+   //check kernels:
+   std::cout << " * upper_tag:      ";
+   ublas::inplace_solve(trans(A), C_trans, ublas::upper_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_A), trans(vcl_C), viennacl::linalg::upper_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   std::cout << " * unit_upper_tag: ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   ublas::inplace_solve(trans(A), C_trans, ublas::unit_upper_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_A), trans(vcl_C), viennacl::linalg::unit_upper_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   std::cout << " * lower_tag:      ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   ublas::inplace_solve(trans(A), C_trans, ublas::lower_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_A), trans(vcl_C), viennacl::linalg::lower_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   std::cout << " * unit_lower_tag: ";
+   viennacl::copy(C, vcl_C); C_trans = trans(C);
+   ublas::inplace_solve(trans(A), C_trans, ublas::unit_lower_tag());
+   viennacl::linalg::inplace_solve(trans(vcl_A), trans(vcl_C), viennacl::linalg::unit_lower_tag());
+   C = trans(C_trans); run_solver_check(C, vcl_C, retval, epsilon);
+
+   if (retval == EXIT_SUCCESS)
+     std::cout << "Test A^T \\ B^T passed!" << std::endl;
+
+   return retval;
+}
+
+
+template< typename NumericT, typename F_A, typename F_B, typename Epsilon >
+int test_solve(Epsilon const& epsilon)
+{
+  int ret = EXIT_SUCCESS;
+  long matrix_size = 135;  //some odd number, not too large
+  long rhs_num = 67;
+
+  std::cout << "--- Part 2: Testing matrix-matrix solver ---" << std::endl;
+
+
+  ublas::matrix<NumericT> A(matrix_size, matrix_size);
+  ublas::matrix<NumericT> B_start(matrix_size, rhs_num);
+  ublas::matrix<NumericT> C_start(rhs_num, matrix_size);
+
+  for (std::size_t i = 0; i < A.size1(); ++i)
+  {
+    for (std::size_t j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(-0.5) * random<NumericT>();
+    A(i,i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
+  }
+
+  for (std::size_t i = 0; i < B_start.size1(); ++i)
+    for (std::size_t j = 0; j < B_start.size2(); ++j)
+        B_start(i,j) = random<NumericT>();
+
+  for (std::size_t i = 0; i < C_start.size1(); ++i)
+    for (std::size_t j = 0; j < C_start.size2(); ++j)
+        C_start(i,j) = random<NumericT>();
+
+
+  // A
+  viennacl::range range1_A(matrix_size, 2*matrix_size);
+  viennacl::range range2_A(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_A(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_A(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size, matrix_size);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // B
+  viennacl::range range1_B(matrix_size, 2*matrix_size);
+  viennacl::range range2_B(2*rhs_num, 3*rhs_num);
+  viennacl::slice slice1_B(matrix_size, 2, matrix_size);
+  viennacl::slice slice2_B(0, 3, rhs_num);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size, rhs_num);
+  viennacl::copy(B_start, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B_start, vcl_range_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size, 4*rhs_num);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B_start, vcl_slice_B);
+
+
+  // C
+  viennacl::range range1_C(rhs_num, 2*rhs_num);
+  viennacl::range range2_C(2*matrix_size, 3*matrix_size);
+  viennacl::slice slice1_C(rhs_num, 2, rhs_num);
+  viennacl::slice slice2_C(0, 3, matrix_size);
+
+  viennacl::matrix<NumericT, F_B>    vcl_C(rhs_num, matrix_size);
+  viennacl::copy(C_start, vcl_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+  viennacl::copy(C_start, vcl_range_C);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_C(4*rhs_num, 4*matrix_size);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+  viennacl::copy(C_start, vcl_slice_C);
+
+
+  std::cout << "Now using A=matrix, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_B, vcl_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "Now using A=matrix, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "Now using A=matrix, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  std::cout << "Now using A=range, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_B, vcl_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "Now using A=range, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "Now using A=range, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_range_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+
+  std::cout << "Now using A=slice, B=matrix" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_B, vcl_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "Now using A=slice, B=range" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_range_B, vcl_range_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "Now using A=slice, B=slice" << std::endl;
+  ret = test_solve<NumericT>(epsilon,
+                             A, B_start, C_start,
+                             vcl_slice_A, vcl_slice_B, vcl_slice_C, vcl_B
+                            );
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+
+  return ret;
+
+}
+
+
+
+//
+// Control functions
+//
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col ///" << std::endl;
+  std::cout << "////////////////////////////////" << std::endl;
+  ret = test_solve<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  return ret;
+}
diff --git a/tests/src/external_1.cpp b/tests/src/external_1.cpp
index 593b80b..68504ff 100644
--- a/tests/src/external_1.cpp
+++ b/tests/src/external_1.cpp
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,15 +20,14 @@
 //
 
 
-//#define VIENNACL_HAVE_EIGEN
-#define VIENNACL_HAVE_UBLAS
+//#define VIENNACL_WITH_EIGEN
+#define VIENNACL_WITH_UBLAS
 
 //
 // *** System
 //
 #include <iostream>
 
-
 //
 // *** ViennaCL
 //
@@ -36,27 +36,37 @@
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
 #include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/circulant_matrix.hpp"
-#include "viennacl/hankel_matrix.hpp"
-#include "viennacl/toeplitz_matrix.hpp"
-#include "viennacl/vandermonde_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/circulant_matrix.hpp"
+  #include "viennacl/hankel_matrix.hpp"
+  #include "viennacl/toeplitz_matrix.hpp"
+  #include "viennacl/vandermonde_matrix.hpp"
+#endif
 
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/row_scaling.hpp"
 #include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/amg.hpp"
-#include "viennacl/linalg/spai.hpp"
 #include "viennacl/linalg/cg.hpp"
 #include "viennacl/linalg/bicgstab.hpp"
 #include "viennacl/linalg/gmres.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
 #include "viennacl/linalg/qr.hpp"
 
-#include "viennacl/fft.hpp"
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
-#include "viennacl/io/kernel_parameters.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/amg.hpp"
+  #include "viennacl/linalg/spai.hpp"
+  #include "viennacl/linalg/svd.hpp"
+  #include "viennacl/fft.hpp"
+  #include "viennacl/generator/generate.hpp"
+#endif
+
 #include "viennacl/io/matrix_market.hpp"
+#include "viennacl/scheduler/execute.hpp"
+
 
 
 //defined in external_2.cpp
@@ -68,16 +78,21 @@ void other_func();
 int main()
 {
   typedef float   NumericType;
-  
+
   //doing nothing but instantiating a few types
   viennacl::scalar<NumericType>  s;
   viennacl::vector<NumericType>  v(10);
   viennacl::matrix<NumericType>  m(10, 10);
   viennacl::compressed_matrix<NumericType>  compr(10, 10);
   viennacl::coordinate_matrix<NumericType>  coord(10, 10);
-  
+
   //this is the external linkage check:
   other_func();
-  
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/src/external_1.cpp b/tests/src/external_1.cu
similarity index 69%
copy from tests/src/external_1.cpp
copy to tests/src/external_1.cu
index 593b80b..68504ff 100644
--- a/tests/src/external_1.cpp
+++ b/tests/src/external_1.cu
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,15 +20,14 @@
 //
 
 
-//#define VIENNACL_HAVE_EIGEN
-#define VIENNACL_HAVE_UBLAS
+//#define VIENNACL_WITH_EIGEN
+#define VIENNACL_WITH_UBLAS
 
 //
 // *** System
 //
 #include <iostream>
 
-
 //
 // *** ViennaCL
 //
@@ -36,27 +36,37 @@
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
 #include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/circulant_matrix.hpp"
-#include "viennacl/hankel_matrix.hpp"
-#include "viennacl/toeplitz_matrix.hpp"
-#include "viennacl/vandermonde_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/circulant_matrix.hpp"
+  #include "viennacl/hankel_matrix.hpp"
+  #include "viennacl/toeplitz_matrix.hpp"
+  #include "viennacl/vandermonde_matrix.hpp"
+#endif
 
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/row_scaling.hpp"
 #include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/amg.hpp"
-#include "viennacl/linalg/spai.hpp"
 #include "viennacl/linalg/cg.hpp"
 #include "viennacl/linalg/bicgstab.hpp"
 #include "viennacl/linalg/gmres.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
 #include "viennacl/linalg/qr.hpp"
 
-#include "viennacl/fft.hpp"
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
-#include "viennacl/io/kernel_parameters.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/amg.hpp"
+  #include "viennacl/linalg/spai.hpp"
+  #include "viennacl/linalg/svd.hpp"
+  #include "viennacl/fft.hpp"
+  #include "viennacl/generator/generate.hpp"
+#endif
+
 #include "viennacl/io/matrix_market.hpp"
+#include "viennacl/scheduler/execute.hpp"
+
 
 
 //defined in external_2.cpp
@@ -68,16 +78,21 @@ void other_func();
 int main()
 {
   typedef float   NumericType;
-  
+
   //doing nothing but instantiating a few types
   viennacl::scalar<NumericType>  s;
   viennacl::vector<NumericType>  v(10);
   viennacl::matrix<NumericType>  m(10, 10);
   viennacl::compressed_matrix<NumericType>  compr(10, 10);
   viennacl::coordinate_matrix<NumericType>  coord(10, 10);
-  
+
   //this is the external linkage check:
   other_func();
-  
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/src/external_2.cpp b/tests/src/external_2.cpp
index 0689de5..bc3c34f 100644
--- a/tests/src/external_2.cpp
+++ b/tests/src/external_2.cpp
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,12 +20,14 @@
 //
 
 
+//#define VIENNACL_WITH_EIGEN
+#define VIENNACL_WITH_UBLAS
+
 //
 // *** System
 //
 #include <iostream>
 
-
 //
 // *** ViennaCL
 //
@@ -33,10 +36,14 @@
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
 #include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/circulant_matrix.hpp"
-#include "viennacl/hankel_matrix.hpp"
-#include "viennacl/toeplitz_matrix.hpp"
-#include "viennacl/vandermonde_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/circulant_matrix.hpp"
+  #include "viennacl/hankel_matrix.hpp"
+  #include "viennacl/toeplitz_matrix.hpp"
+  #include "viennacl/vandermonde_matrix.hpp"
+#endif
 
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/row_scaling.hpp"
@@ -45,22 +52,30 @@
 #include "viennacl/linalg/bicgstab.hpp"
 #include "viennacl/linalg/gmres.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/qr.hpp"
 
-#include "viennacl/fft.hpp"
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
-#include "viennacl/io/kernel_parameters.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/amg.hpp"
+  #include "viennacl/linalg/spai.hpp"
+  #include "viennacl/linalg/svd.hpp"
+  #include "viennacl/fft.hpp"
+  #include "viennacl/generator/generate.hpp"
+#endif
+
 #include "viennacl/io/matrix_market.hpp"
+#include "viennacl/scheduler/execute.hpp"
 
 void other_func()
 {
   typedef float   NumericType;
-  
+
   //doing nothing but instantiating a few types
   viennacl::scalar<NumericType>  s;
   viennacl::vector<NumericType>  v(10);
   viennacl::matrix<NumericType>  m(10, 10);
   viennacl::compressed_matrix<NumericType>  compr(10, 10);
   viennacl::coordinate_matrix<NumericType>  coord(10, 10);
-  
+
 }
diff --git a/tests/src/external_1.cpp b/tests/src/external_2.cu
similarity index 69%
copy from tests/src/external_1.cpp
copy to tests/src/external_2.cu
index 593b80b..bc3c34f 100644
--- a/tests/src/external_1.cpp
+++ b/tests/src/external_2.cu
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,15 +20,14 @@
 //
 
 
-//#define VIENNACL_HAVE_EIGEN
-#define VIENNACL_HAVE_UBLAS
+//#define VIENNACL_WITH_EIGEN
+#define VIENNACL_WITH_UBLAS
 
 //
 // *** System
 //
 #include <iostream>
 
-
 //
 // *** ViennaCL
 //
@@ -36,48 +36,46 @@
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
 #include "viennacl/coordinate_matrix.hpp"
-#include "viennacl/circulant_matrix.hpp"
-#include "viennacl/hankel_matrix.hpp"
-#include "viennacl/toeplitz_matrix.hpp"
-#include "viennacl/vandermonde_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/circulant_matrix.hpp"
+  #include "viennacl/hankel_matrix.hpp"
+  #include "viennacl/toeplitz_matrix.hpp"
+  #include "viennacl/vandermonde_matrix.hpp"
+#endif
 
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/row_scaling.hpp"
 #include "viennacl/linalg/jacobi_precond.hpp"
-#include "viennacl/linalg/amg.hpp"
-#include "viennacl/linalg/spai.hpp"
 #include "viennacl/linalg/cg.hpp"
 #include "viennacl/linalg/bicgstab.hpp"
 #include "viennacl/linalg/gmres.hpp"
 #include "viennacl/linalg/direct_solve.hpp"
 #include "viennacl/linalg/qr.hpp"
 
-#include "viennacl/fft.hpp"
 #include "viennacl/misc/bandwidth_reduction.hpp"
 
-#include "viennacl/io/kernel_parameters.hpp"
-#include "viennacl/io/matrix_market.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/amg.hpp"
+  #include "viennacl/linalg/spai.hpp"
+  #include "viennacl/linalg/svd.hpp"
+  #include "viennacl/fft.hpp"
+  #include "viennacl/generator/generate.hpp"
+#endif
 
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/scheduler/execute.hpp"
 
-//defined in external_2.cpp
-void other_func();
-
-//
-// -------------------------------------------------------------
-//
-int main()
+void other_func()
 {
   typedef float   NumericType;
-  
+
   //doing nothing but instantiating a few types
   viennacl::scalar<NumericType>  s;
   viennacl::vector<NumericType>  v(10);
   viennacl::matrix<NumericType>  m(10, 10);
   viennacl::compressed_matrix<NumericType>  compr(10, 10);
   viennacl::coordinate_matrix<NumericType>  coord(10, 10);
-  
-  //this is the external linkage check:
-  other_func();
-  
-  return EXIT_SUCCESS;
+
 }
diff --git a/tests/src/fft.cpp b/tests/src/fft.cpp
index 015cd26..fb009ed 100644
--- a/tests/src/fft.cpp
+++ b/tests/src/fft.cpp
@@ -1,15 +1,16 @@
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -28,7 +29,7 @@
 
 typedef float ScalarType;
 
-const ScalarType EPS = 0.06;  //use smaller values in double precision
+const ScalarType EPS = ScalarType(0.06f);  //use smaller values in double precision
 
 typedef ScalarType (*test_function_ptr)(std::vector<ScalarType>&,
                                         std::vector<ScalarType>&,
@@ -47,7 +48,7 @@ void read_vectors_pair(std::istream& str,
                       std::vector<ScalarType>& output,
                       unsigned int& rows,
                       unsigned int& cols,
-                      unsigned int& batch_size) 
+                      unsigned int& batch_size)
 {
     rows = 1;
 
@@ -55,10 +56,10 @@ void read_vectors_pair(std::istream& str,
     input.resize(2 * cols * batch_size);
     output.resize(2 * cols * batch_size);
 
-    for(unsigned int i = 0; i < input.size(); i++) 
+    for(unsigned int i = 0; i < input.size(); i++)
         str >> input[i];
 
-    for(unsigned int i = 0; i < output.size(); i++) 
+    for(unsigned int i = 0; i < output.size(); i++)
         str >> output[i];
 }
 
@@ -67,7 +68,7 @@ void read_matrices_pair(std::istream& str,
                         std::vector<ScalarType>& output,
                         unsigned int& rows,
                         unsigned int& cols,
-                        unsigned int& batch_size) 
+                        unsigned int& batch_size)
 {
     batch_size = 1;
     str >> rows >> cols;
@@ -85,12 +86,12 @@ void read_matrices_pair(std::istream& str,
 }
 
 template <typename ScalarType>
-ScalarType diff(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref) 
+ScalarType diff(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref)
 {
     ScalarType df = 0.0;
     ScalarType norm_ref = 0;
 
-    for(std::size_t i = 0; i < vec.size(); i++) 
+    for(std::size_t i = 0; i < vec.size(); i++)
     {
         df = df + pow(vec[i] - ref[i], 2);
         norm_ref += ref[i] * ref[i];
@@ -100,33 +101,33 @@ ScalarType diff(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref)
 }
 
 template <typename ScalarType>
-ScalarType diff_max(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref) 
+ScalarType diff_max(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref)
 {
   ScalarType df = 0.0;
   ScalarType mx = 0.0;
   ScalarType norm_max = 0;
-  
-  for (std::size_t i = 0; i < vec.size(); i++) 
+
+  for (std::size_t i = 0; i < vec.size(); i++)
   {
     df = std::max<ScalarType>(fabs(vec[i] - ref[i]), df);
     mx = std::max<ScalarType>(fabs(vec[i]), mx);
-    
+
     if (mx > 0)
     {
       if (norm_max < df / mx)
         norm_max = df / mx;
     }
   }
-  
+
   return norm_max;
 }
 
 void convolve_ref(std::vector<ScalarType>& in1,
                   std::vector<ScalarType>& in2,
-                  std::vector<ScalarType>& out) 
+                  std::vector<ScalarType>& out)
 {
     out.resize(in1.size());
-    unsigned int data_size = in1.size() >> 1;
+    unsigned int data_size = static_cast<unsigned int>(in1.size()) >> 1;
 
     for(unsigned int n = 0; n < data_size; n++) {
         std::complex<ScalarType> el;
@@ -146,7 +147,7 @@ void convolve_ref(std::vector<ScalarType>& in1,
 
 ScalarType opencl_fft(std::vector<ScalarType>& in,
                       std::vector<ScalarType>& out,
-                      unsigned int row, unsigned int col, unsigned int batch_size)
+                      unsigned int /*row*/, unsigned int /*col*/, unsigned int batch_size)
 {
     viennacl::vector<ScalarType> input(in.size());
     viennacl::vector<ScalarType> output(in.size());
@@ -157,6 +158,7 @@ ScalarType opencl_fft(std::vector<ScalarType>& in,
 
     viennacl::fft(input, output, batch_size);
 
+    viennacl::backend::finish();
     viennacl::fast_copy(output, res);
 
     return diff_max(res, out);
@@ -164,7 +166,7 @@ ScalarType opencl_fft(std::vector<ScalarType>& in,
 
 ScalarType opencl_2d_fft_1arg(std::vector<ScalarType>& in,
                               std::vector<ScalarType>& out,
-                              unsigned int row, unsigned int col, unsigned int batch_size)
+                              unsigned int row, unsigned int col, unsigned int /*batch_size*/)
 {
     viennacl::matrix<ScalarType> input(row, 2 * col);
 
@@ -174,6 +176,7 @@ ScalarType opencl_2d_fft_1arg(std::vector<ScalarType>& in,
     //std::cout << input << "\n";
     viennacl::inplace_fft(input);
     //std::cout << input << "\n";
+    viennacl::backend::finish();
     viennacl::fast_copy(input, &res[0]);
 
     return diff_max(res, out);
@@ -181,7 +184,7 @@ ScalarType opencl_2d_fft_1arg(std::vector<ScalarType>& in,
 
 ScalarType opencl_2d_fft_2arg(std::vector<ScalarType>& in,
                               std::vector<ScalarType>& out,
-                              unsigned int row, unsigned int col, unsigned int batch_size)
+                              unsigned int row, unsigned int col, unsigned int /*batch_size*/)
 {
     viennacl::matrix<ScalarType> input(row, 2 * col);
     viennacl::matrix<ScalarType> output(row, 2 * col);
@@ -192,6 +195,7 @@ ScalarType opencl_2d_fft_2arg(std::vector<ScalarType>& in,
     //std::cout << input << "\n";
     viennacl::fft(input, output);
     //std::cout << input << "\n";
+    viennacl::backend::finish();
     viennacl::fast_copy(output, &res[0]);
 
     return diff_max(res, out);
@@ -199,7 +203,7 @@ ScalarType opencl_2d_fft_2arg(std::vector<ScalarType>& in,
 
 ScalarType opencl_direct(std::vector<ScalarType>& in,
                          std::vector<ScalarType>& out,
-                         unsigned int row, unsigned int col, unsigned int batch_num)
+                         unsigned int /*row*/, unsigned int /*col*/, unsigned int batch_num)
 {
     viennacl::vector<ScalarType> input(in.size());
     viennacl::vector<ScalarType> output(in.size());
@@ -208,10 +212,11 @@ ScalarType opencl_direct(std::vector<ScalarType>& in,
 
     viennacl::fast_copy(in, input);
 
-    unsigned int size = (input.size() >> 1) / batch_num;
+    unsigned int size = (static_cast<unsigned int>(input.size()) >> 1) / batch_num;
 
-    viennacl::detail::fft::direct<ScalarType>(input.handle(), output.handle(), size, size, batch_num);
+    viennacl::detail::fft::direct<ScalarType>(input.handle().opencl_handle(), output.handle().opencl_handle(), size, size, batch_num);
 
+    viennacl::backend::finish();
     viennacl::fast_copy(output, res);
 
     return diff_max(res, out);
@@ -219,7 +224,7 @@ ScalarType opencl_direct(std::vector<ScalarType>& in,
 
 ScalarType opencl_bluestein(std::vector<ScalarType>& in,
                             std::vector<ScalarType>& out,
-                            unsigned int row, unsigned int col, unsigned int batch_size)
+                            unsigned int /*row*/, unsigned int /*col*/, unsigned int batch_size)
 {
     viennacl::vector<ScalarType> input(in.size());
     viennacl::vector<ScalarType> output(in.size());
@@ -230,6 +235,7 @@ ScalarType opencl_bluestein(std::vector<ScalarType>& in,
 
     viennacl::detail::fft::bluestein(input, output, batch_size);
 
+    viennacl::backend::finish();
     viennacl::fast_copy(output, res);
 
     return diff_max(res, out);
@@ -237,7 +243,7 @@ ScalarType opencl_bluestein(std::vector<ScalarType>& in,
 
 ScalarType opencl_radix2(std::vector<ScalarType>& in,
                          std::vector<ScalarType>& out,
-                         unsigned int row, unsigned int col, unsigned int batch_num)
+                         unsigned int /*row*/, unsigned int /*col*/, unsigned int batch_num)
 {
     viennacl::vector<ScalarType> input(in.size());
     viennacl::vector<ScalarType> output(in.size());
@@ -246,10 +252,11 @@ ScalarType opencl_radix2(std::vector<ScalarType>& in,
 
     viennacl::fast_copy(in, input);
 
-    unsigned int size = (input.size() >> 1) / batch_num;
+    unsigned int size = (static_cast<unsigned int>(input.size()) >> 1) / batch_num;
 
-    viennacl::detail::fft::radix2<ScalarType>(input.handle(), size, size, batch_num);
+    viennacl::detail::fft::radix2<ScalarType>(input.handle().opencl_handle(), size, size, batch_num);
 
+    viennacl::backend::finish();
     viennacl::fast_copy(input, res);
 
     return diff_max(res, out);
@@ -257,7 +264,7 @@ ScalarType opencl_radix2(std::vector<ScalarType>& in,
 
 ScalarType opencl_convolve(std::vector<ScalarType>& in1,
                            std::vector<ScalarType>& in2,
-                           unsigned int row, unsigned int col, unsigned int batch_size)
+                           unsigned int /*row*/, unsigned int /*col*/, unsigned int /*batch_size*/)
 {
     //if(in1.size() > 2048) return -1;
     viennacl::vector<ScalarType> input1(in1.size());
@@ -269,6 +276,7 @@ ScalarType opencl_convolve(std::vector<ScalarType>& in1,
 
     viennacl::linalg::convolve(input1, input2, output);
 
+    viennacl::backend::finish();
     std::vector<ScalarType> res(in1.size());
     viennacl::fast_copy(output, res);
 
@@ -295,7 +303,7 @@ int test_correctness(const std::string& log_tag,
     unsigned int test_size = 0;
 
     fstr >> test_size;
-    
+
     std::cout << "Test size: " << test_size << std::endl;
 
     for(unsigned int i = 0; i < test_size; i++) {
@@ -313,54 +321,59 @@ int test_correctness(const std::string& log_tag,
 
 
 
-int main() {
-    std::cout << "*" << std::endl;
-    std::cout << "* ViennaCL test: FFT" << std::endl;
-    std::cout << "*" << std::endl;
-
-    //1D FFT tests
-    if (test_correctness("fft::direct", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_direct) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::fft", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_fft) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::batch::direct", "../non-release/testdata/batch_radix.data", read_vectors_pair, &opencl_direct) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::radix2", "../non-release/testdata/radix2.data", read_vectors_pair, &opencl_radix2) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::batch::radix2", "../non-release/testdata/batch_radix.data", read_vectors_pair, &opencl_radix2) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::batch::fft", "../non-release/testdata/batch_radix.data", read_vectors_pair, &opencl_fft) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::convolve::1", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_convolve) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::convolve::2", "../non-release/testdata/radix2.data", read_vectors_pair, &opencl_convolve) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::bluestein::1", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_bluestein) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft::bluestein::2", "../non-release/testdata/radix2.data", read_vectors_pair, &opencl_bluestein) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-
-    //2D FFT tests
-    if (test_correctness("fft:2d::radix2::sml::1_arg", 
-                         "../non-release/testdata/fft2d_radix2.data", read_matrices_pair, &opencl_2d_fft_1arg) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft:2d::direct::sml::1_arg",
-                         "../non-release/testdata/fft2d_direct.data", read_matrices_pair, &opencl_2d_fft_1arg) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft:2d::direct::big::1_arg",
-                         "../non-release/testdata/fft2d_direct_big.data", read_matrices_pair, &opencl_2d_fft_1arg) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-
-    if (test_correctness("fft:2d::radix2::sml::2_arg", 
-                         "../non-release/testdata/fft2d_radix2.data", read_matrices_pair, &opencl_2d_fft_2arg) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft:2d::direct::sml::2_arg",
-                         "../non-release/testdata/fft2d_direct.data", read_matrices_pair, &opencl_2d_fft_2arg) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-    if (test_correctness("fft:2d::direct::bscalarig::2_arg", 
-                         "../non-release/testdata/fft2d_direct_big.data", read_matrices_pair, &opencl_2d_fft_2arg) == EXIT_FAILURE)
-      return EXIT_FAILURE;
-
-
-    return EXIT_SUCCESS;
+int main()
+{
+  std::cout << "*" << std::endl;
+  std::cout << "* ViennaCL test: FFT" << std::endl;
+  std::cout << "*" << std::endl;
+
+  //1D FFT tests
+  if (test_correctness("fft::direct", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_direct) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::fft", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_fft) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::batch::direct", "../non-release/testdata/batch_radix.data", read_vectors_pair, &opencl_direct) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::radix2", "../non-release/testdata/radix2.data", read_vectors_pair, &opencl_radix2) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::batch::radix2", "../non-release/testdata/batch_radix.data", read_vectors_pair, &opencl_radix2) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::batch::fft", "../non-release/testdata/batch_radix.data", read_vectors_pair, &opencl_fft) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::convolve::1", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_convolve) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::convolve::2", "../non-release/testdata/radix2.data", read_vectors_pair, &opencl_convolve) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::bluestein::1", "../non-release/testdata/cufft.data", read_vectors_pair, &opencl_bluestein) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft::bluestein::2", "../non-release/testdata/radix2.data", read_vectors_pair, &opencl_bluestein) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+
+  //2D FFT tests
+  if (test_correctness("fft:2d::radix2::sml::1_arg",
+                        "../non-release/testdata/fft2d_radix2.data", read_matrices_pair, &opencl_2d_fft_1arg) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft:2d::direct::sml::1_arg",
+                        "../non-release/testdata/fft2d_direct.data", read_matrices_pair, &opencl_2d_fft_1arg) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft:2d::direct::big::1_arg",
+                        "../non-release/testdata/fft2d_direct_big.data", read_matrices_pair, &opencl_2d_fft_1arg) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+
+  if (test_correctness("fft:2d::radix2::sml::2_arg",
+                        "../non-release/testdata/fft2d_radix2.data", read_matrices_pair, &opencl_2d_fft_2arg) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft:2d::direct::sml::2_arg",
+                        "../non-release/testdata/fft2d_direct.data", read_matrices_pair, &opencl_2d_fft_2arg) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+  if (test_correctness("fft:2d::direct::bscalarig::2_arg",
+                        "../non-release/testdata/fft2d_direct_big.data", read_matrices_pair, &opencl_2d_fft_2arg) == EXIT_FAILURE)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
 }
diff --git a/tests/src/generator_blas1.cpp b/tests/src/generator_blas1.cpp
new file mode 100644
index 0000000..0d578ef
--- /dev/null
+++ b/tests/src/generator_blas1.cpp
@@ -0,0 +1,524 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+
+//
+// *** ViennaCL
+//
+#define VIENNACL_WITH_UBLAS 1
+
+#define VIENNACL_DEBUG_ALL
+#define VIENNACL_DEBUG_BUILD
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+#include "viennacl/generator/generate.hpp"
+#include "viennacl/scheduler/io.hpp"
+
+#define CHECK_RESULT(cpu,gpu, op) /* compare reference vs device result; report and set retval on mismatch */ \
+    { float delta = std::fabs ( diff ( cpu, gpu) ); if ( delta > epsilon ) {\
+        std::cout << "# Error at operation: " #op << std::endl;\
+        std::cout << "  diff: " << delta << std::endl;\
+        retval = EXIT_FAILURE;\
+    } }\
+
+
+using namespace boost::numeric;
+using namespace viennacl;
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
+{
+    ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+    viennacl::backend::finish();
+    viennacl::copy(mat2, mat2_cpu);
+    double ret = 0;
+    double act = 0;
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+    //std::cout << ret << std::endl;
+    return ret;
+}
+
+template <typename ScalarType, unsigned int Alignment>
+ScalarType diff ( ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType,Alignment> & v2 ) {
+    ublas::vector<ScalarType> v2_cpu ( v2.size() );
+    viennacl::copy( v2.begin(), v2.end(), v2_cpu.begin() );
+    for ( unsigned int i=0; i<v1.size(); ++i ) {
+        if ( std::max ( std::fabs ( v2_cpu[i] ), std::fabs ( v1[i] ) ) > 0 )
+            v2_cpu[i] = std::fabs ( v2_cpu[i] - v1[i] ) / std::max ( std::fabs ( v2_cpu[i] ), std::fabs ( v1[i] ) );
+        else
+            v2_cpu[i] = 0.0;
+    }
+    return norm_inf ( v2_cpu );
+}
+
+template<typename ScalarType>
+ScalarType diff(ScalarType s, viennacl::scalar<ScalarType> & gs){ /* relative error between host scalar and device scalar */
+  ScalarType other = gs; /* host copy of the device scalar */
+  return (s == other) ? ScalarType(0) : (s - other) / std::max(std::fabs(s), std::fabs(other));
+}
+
+
+template< typename NumericT, typename Epsilon >
+int test_vector ( Epsilon const& epsilon) {
+    int retval = EXIT_SUCCESS;
+
+    unsigned int size = 1024;
+
+    ublas::vector<NumericT> cw(size);
+    ublas::vector<NumericT> cx(size);
+    ublas::vector<NumericT> cy(size);
+    ublas::vector<NumericT> cz(size);
+
+    NumericT s;
+
+
+
+    for(unsigned int i=0; i<cw.size(); ++i){
+      cw[i]=std::rand()/(NumericT)RAND_MAX;
+    }
+
+    std::cout << "Running tests for vector of size " << cw.size() << std::endl;
+
+    viennacl::vector<NumericT> w (size);
+    viennacl::vector<NumericT> x (size);
+    viennacl::vector<NumericT> y (size);
+    viennacl::vector<NumericT> z (size);
+    viennacl::scalar<NumericT> gs(0);
+
+    cx = NumericT(2.0f)*cw;
+    cy = NumericT(3.0f)*cw;
+    cz = NumericT(4.0f)*cw;
+    viennacl::copy (cw, w);
+    viennacl::copy (cx, x);
+    viennacl::copy (cy, y);
+    viennacl::copy (cz, z);
+
+    NumericT alpha = NumericT(3.14);
+    NumericT beta  = NumericT(3.51);
+
+    // --------------------------------------------------------------------------
+
+    {
+    std::cout << "w = x + y ..." << std::endl;
+    cw = cx + cy;
+    viennacl::scheduler::statement statement(w, viennacl::op_assign(), x + y);
+    generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    CHECK_RESULT(cw, w, w = x + y);
+    }
+
+    {
+    std::cout << "y = w + x ..." << std::endl;
+    cy = cw + cx;
+    viennacl::scheduler::statement statement(y, viennacl::op_assign(), w + x);
+    generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    CHECK_RESULT(cy, y, y = w + x);
+    }
+
+    {
+    std::cout << "x = y + w ..." << std::endl;
+    cx = cy + cw;
+    viennacl::scheduler::statement statement(x, viennacl::op_assign(), y + w);
+    generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    CHECK_RESULT(cx, x, x = y + w);
+    }
+
+    {
+    std::cout << "w = alpha*x + beta*y ..." << std::endl;
+    cw = alpha*cx + beta*cy;
+    viennacl::scheduler::statement statement(w, viennacl::op_assign(), alpha*x + beta*y);
+    generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    CHECK_RESULT(cw, w, w = alpha*x + beta*y);
+    }
+
+
+    {
+    std::cout << "s = inner_prod(x,y)..." << std::endl;
+    s = 0;
+    for(unsigned int i=0 ; i<size ; ++i)  s+=cx[i]*cy[i];
+    viennacl::scheduler::statement statement(gs, viennacl::op_assign(), viennacl::linalg::inner_prod(x,y));
+    generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    CHECK_RESULT(s, gs, s = inner_prod(x,y));
+    }
+//    {
+//        std::cout << "w = x > 0.42" << std::endl;
+//        for(unsigned int i=0 ; i < size ; ++i){
+//            cw(i) = cx(i) > (NumericT)0.42;
+//        }
+//        generator::custom_operation op;
+//        op.add(vec(w) = vec(x) > (NumericT)0.42);
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cw, w, w = x > 1)
+//    }
+
+//    {
+//        std::cout << "w = -w ..." << std::endl;
+//        cw = -cw;
+//        generator::custom_operation op;
+//        op.add(vec(w) = -vec(w));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cw,w, w=-w);
+//    }
+
+//    {
+//        std::cout << "w = x + shift(x,-5) + shift(x,3) ..." << std::endl;
+//        for(unsigned int i=0 ; i<size; ++i){
+//            int ind1 = std::max((int)i - 5, 0);
+//            int ind2 = std::min(i + 3, size-1);
+//            cw(i) = cx(i) + cx(ind1) + cx(ind2);
+//        }
+//        generator::custom_operation op;
+//        op.add(vec(w) = vec(x) + generator::shift(vec(x),-5) + generator::shift(vec(x),3));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cw,w, w = x + shift(x,-5) + shift(x,3) );
+//    }
+
+//    {
+//        std::cout << "s = inner_prod(x,y)..." << std::endl;
+//        s = 0;
+//        for(unsigned int i=0 ; i<size ; ++i)  s+=cx[i]*cy[i];
+//        generator::custom_operation op;
+//        op.add(scal(gs)= generator::inner_prod(vec(x), vec(y)));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(s,gs, s=inner_prod(x,y));
+//    }
+
+//    {
+//        std::cout << "s = max(x)..." << std::endl;
+//        s = *std::max_element(cx.begin(),cx.end());
+//        generator::custom_operation op;
+//        op.add(scal(gs)= generator::reduce<generator::fmax_type>(vec(x)));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(s,gs, s=max(x));
+//    }
+
+//    {
+//        std::cout << "Multiline ..." << std::endl;
+//        viennacl::generator::custom_operation op;
+//        op.add(vec(w) = vec(x) - vec(y));
+//        op.add(vec(y) = element_prod(vec(w), vec(z)));
+//        op.add(vec(z) = vec(x) + vec(z));
+//        op.execute();
+//        viennacl::backend::finish();
+//        for(unsigned int i=0 ; i < size ; ++i){
+//            cw(i) = cx(i) - cy(i);
+//            cy(i) = cw(i)*cz(i);
+//            cz(i) = cx(i) + cz(i);
+//        }
+//        CHECK_RESULT(cw, w, Multiline);
+//        CHECK_RESULT(cy, y, Multiline);
+//        CHECK_RESULT(cz, z, Multiline);
+//    }
+
+    return retval;
+}
+
+
+
+template< typename NumericT, class Layout, typename Epsilon >
+int test_matrix ( Epsilon const& epsilon) {
+    int retval = EXIT_SUCCESS;
+
+    unsigned int size1 = 1024;
+    unsigned int size2 = 1024;
+
+    unsigned int pattern_size1 = 256;
+    unsigned int pattern_size2 = 128;
+
+//    unsigned int n_rep1 = size1/pattern_size1;
+//    unsigned int n_rep2 = size2/pattern_size2;
+
+    ublas::matrix<NumericT> cA(size1,size2);
+    ublas::matrix<NumericT> cB(size1,size2);
+    ublas::matrix<NumericT> cC(size1,size2);
+
+    ublas::matrix<NumericT> cPattern(pattern_size1,pattern_size2);
+
+    ublas::vector<NumericT> cx(size1);
+
+
+    for(unsigned int i=0; i<size1; ++i)
+        for(unsigned int j=0 ; j<size2; ++j)
+            cA(i,j)=(NumericT)std::rand()/RAND_MAX;
+
+    for(unsigned int i = 0 ; i < pattern_size1 ; ++i)
+        for(unsigned int j = 0 ; j < pattern_size2 ; ++j)
+            cPattern(i,j) = (NumericT)std::rand()/RAND_MAX;
+
+
+    for(unsigned int i=0; i<size2; ++i){
+        cx(i) = (NumericT)std::rand()/RAND_MAX;
+    }
+
+//    std::cout << "Running tests for matrix of size " << cA.size1() << "," << cA.size2() << std::endl;
+
+    viennacl::matrix<NumericT,Layout> A (size1, size2);
+    viennacl::matrix<NumericT,Layout> B (size1, size2);
+    viennacl::matrix<NumericT,Layout> C (size1, size2);
+
+    viennacl::matrix<NumericT, Layout> pattern(pattern_size1, pattern_size2);
+
+    viennacl::vector<NumericT> x(size1);
+
+
+    cB = cA;
+    cC = cA;
+    viennacl::copy(cA,A);
+    viennacl::copy(cB,B);
+    viennacl::copy(cC,C);
+
+    viennacl::copy(cx,x);
+    viennacl::copy(cPattern,pattern);
+
+    {
+      std::cout << "C = A + B ..." << std::endl;
+      cC     = ( cA + cB );
+      viennacl::scheduler::statement statement(C, viennacl::op_assign(), A + B);
+      generator::generate_enqueue_statement(statement, statement.array()[0]);
+      viennacl::backend::finish();
+      CHECK_RESULT(cC, C, C=A+B)
+    }
+
+//    {
+//        std::cout << "C = diag(x) ..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i){
+//          for(unsigned int j = 0 ; j < size2 ; ++j){
+//            cC(i,j) = (i==j)?cx[i]:0;
+//          }
+//        }
+//        generator::custom_operation op;
+//        op.add(mat(C) = generator::diag(vec(x)));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cC, C, C = diag(x))
+//    }
+
+//    {
+//        std::cout << "x = diag(C) ..." << std::endl;
+//        for(unsigned int i = 0; i < size1 ; ++i){
+//            cx(i) = cA(i,i);
+//        }
+//        generator::custom_operation op;
+//        op.add(vec(x) = generator::diag(mat(A)));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cx,x, x = diag(A));
+//    }
+
+//    {
+//        std::cout << "C = repmat(P, M, N) ..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i)
+//            for(unsigned int j = 0 ; j < size2 ; ++j)
+//                cC(i,j) = cPattern(i%pattern_size1, j%pattern_size2);
+//        generator::custom_operation op;
+//        op.add(mat(C) = generator::repmat(mat(pattern),n_rep1,n_rep2));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cC, C, C = repmat(P, M, N))
+//    }
+
+//    {
+//        std::cout << "C = repmat(x, 1, N) ..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i)
+//            for(unsigned int j = 0 ; j < size2 ; ++j)
+//                cC(i,j) = cx(i);
+//        generator::custom_operation op;
+//        op.add(mat(C) = generator::repmat(vec(x),1, C.size2()));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cC, C, C = repmat(x, 1, N))
+//    }
+
+//    {
+//        std::cout << "C = trans(repmat(x, 1, N)) ..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i)
+//            for(unsigned int j = 0 ; j < size2 ; ++j)
+//                cC(i,j) = cx(j);
+//        generator::custom_operation op;
+//        op.add(mat(C) = generator::trans(generator::repmat(vec(x),1,C.size2())));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cC, C, C = repmat(x, 1, N))
+//    }
+
+
+//    {
+//        std::cout << "C = -A ..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i)
+//            for(unsigned int j = 0 ; j < size2 ; ++j)
+//                cC(i,j) = -cA(i,j);
+//        generator::custom_operation op;
+//        op.add(mat(C) = -mat(A));
+//        op.execute();
+//        viennacl::backend::finish();
+
+//        CHECK_RESULT(cC, C, C = -A)
+//    }
+
+//    {
+//        std::cout << "C = 1/(1+EXP(-A)) ..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i)
+//            for(unsigned int j = 0 ; j < size2 ; ++j)
+//                cC(i,j) = 1.0f/(1.0f+std::exp(-cA(i,j)));
+//        generator::custom_operation op;
+//        op.add(mat(C) = 1.0f/(1.0f+generator::exp(-mat(A))));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cC, C, C = 1/(1+EXP(-A)))
+//    }
+
+
+    return retval;
+}
+
+
+int main(int argc, char* argv[]){
+    std::vector<std::string> args(argv,argv+argc);
+    unsigned int requested_device;
+    if(argc!=2){
+        requested_device=0;
+    }
+    else{
+        requested_device = atoi(args[1].c_str());
+    }
+    int retval = EXIT_SUCCESS;
+
+    typedef std::vector< viennacl::ocl::platform > platforms_type;
+    typedef std::vector<viennacl::ocl::device> devices_type;
+
+    platforms_type platforms = viennacl::ocl::get_platforms();
+    size_t num_platforms = platforms.size();
+
+    unsigned int current_device = 0;
+
+    for(unsigned int k=0 ; k < num_platforms ; ++k)
+    {
+        viennacl::ocl::platform pf(k);
+        viennacl::ocl::set_context_device_type(k,CL_DEVICE_TYPE_ALL);
+        viennacl::ocl::set_context_platform_index(k,k);
+        viennacl::ocl::switch_context(k);
+        devices_type dev = viennacl::ocl::current_context().devices();
+        for(devices_type::iterator it = dev.begin() ; it != dev.end() ; ++it){
+
+            if(current_device++ == requested_device ){
+                viennacl::ocl::switch_device(*it);
+                std::cout << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << "               Device Info" << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << viennacl::ocl::current_device().info() << std::endl;
+
+                std::cout << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+                std::cout << "## Test :: Vector" << std::endl;
+                std::cout << "----------------------------------------------" << std::endl;
+
+                {
+                    double epsilon = 1.0E-4;
+
+                    std::cout << "# Testing setup:" << std::endl;
+                    std::cout << "  numeric: float" << std::endl;
+                    retval = test_vector<float> (epsilon);
+
+
+                    std::cout << std::endl;
+
+//                    std::cout << "# Testing setup:" << std::endl;
+//                    std::cout << "  numeric: double" << std::endl;
+//                    retval = test_vector<double> (epsilon);
+
+                    if ( retval == EXIT_SUCCESS )
+                        std::cout << "# Test passed" << std::endl;
+                    else
+                        return retval;
+              }
+
+
+//              std::cout << std::endl;
+//              std::cout << "----------------------------------------------" << std::endl;
+//              std::cout << "----------------------------------------------" << std::endl;
+//              std::cout << "## Test :: Matrix" << std::endl;
+//              std::cout << "----------------------------------------------" << std::endl;
+
+//              {
+//                  double epsilon = 1.0E-4;
+//                  std::cout << "# Testing setup:" << std::endl;
+
+//                  std::cout << "  numeric: float" << std::endl;
+//                  std::cout << "  --------------" << std::endl;
+//                  std::cout << "  Row-Major"      << std::endl;
+//                  std::cout << "  --------------" << std::endl;
+//                  retval = test_matrix<float, viennacl::row_major> (epsilon);
+
+//                  std::cout << "  --------------" << std::endl;
+//                  std::cout << "  Column-Major"      << std::endl;
+//                  std::cout << "  --------------" << std::endl;
+//                  retval &= test_matrix<float, viennacl::column_major> (epsilon);
+
+//                  std::cout << "  numeric: double" << std::endl;
+//                  std::cout << "  --------------" << std::endl;
+//                  std::cout << "  Row-Major"      << std::endl;
+//                  std::cout << "  --------------" << std::endl;
+//                  retval = test_matrix<double, viennacl::row_major> (epsilon);
+
+//                  std::cout << "  --------------" << std::endl;
+//                  std::cout << "  Column-Major"      << std::endl;
+//                  std::cout << "  --------------" << std::endl;
+//                  retval &= test_matrix<double, viennacl::column_major> (epsilon);
+
+//                  if ( retval == EXIT_SUCCESS )
+//                      std::cout << "# Test passed" << std::endl;
+//                  else
+//                      return retval;
+//              }
+
+            }
+        }
+    }
+}
diff --git a/tests/src/generator_blas2.cpp b/tests/src/generator_blas2.cpp
new file mode 100644
index 0000000..8ac5ecd
--- /dev/null
+++ b/tests/src/generator_blas2.cpp
@@ -0,0 +1,261 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <cmath>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+
+//
+// *** ViennaCL
+//
+#define VIENNACL_WITH_UBLAS 1
+
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/prod.hpp"
+
+#include "viennacl/generator/generate.hpp"
+
+#define CHECK_RESULT(cpu,gpu, op) /* compare reference vs device result; report and set retval on mismatch */ \
+    { double delta = std::fabs ( diff ( cpu, gpu) ); if ( delta > epsilon ) {\
+        std::cout << "# Error at operation: " #op << std::endl;\
+        std::cout << "  diff: " << delta << std::endl;\
+        retval = EXIT_FAILURE;\
+    } }\
+
+
+using namespace boost::numeric;
+using namespace viennacl;
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+
+template <typename ScalarType, unsigned int Alignment>
+ScalarType diff ( ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType,Alignment> & v2 ) {
+    ublas::vector<ScalarType> v2_cpu ( v2.size() );
+    viennacl::copy( v2.begin(), v2.end(), v2_cpu.begin() );
+    for ( unsigned int i=0; i<v1.size(); ++i ) {
+        if ( std::max ( std::fabs ( v2_cpu[i] ), std::fabs ( v1[i] ) ) > 0 )
+            v2_cpu[i] = std::fabs ( v2_cpu[i] - v1[i] ) / std::max ( std::fabs ( v2_cpu[i] ), std::fabs ( v1[i] ) );
+        else
+            v2_cpu[i] = 0.0;
+    }
+    return norm_inf ( v2_cpu );
+}
+
+
+template< typename NumericT, class Layout, typename Epsilon >
+int test( Epsilon const& epsilon) {
+    int retval = EXIT_SUCCESS;
+
+    ublas::vector<NumericT> cx;
+    ublas::vector<NumericT> cy;
+
+    ublas::matrix<NumericT> cA;
+    ublas::matrix<NumericT> cB;
+    ublas::matrix<NumericT> cC;
+    ublas::matrix<NumericT> cD;
+
+    unsigned int size1 = 841;
+    unsigned int size2 = 772;
+
+    cA.resize(size1,size2);
+    cx.resize(size2);
+    cy.resize(size1);
+
+    for(unsigned int i=0; i<size1; ++i){
+        for(unsigned int j=0 ; j<size2; ++j){
+            cA(i,j)=static_cast<NumericT>(std::rand())/RAND_MAX;
+        }
+    }
+
+    for(unsigned int i=0; i<size2; ++i){
+        cx(i) = static_cast<NumericT>(std::rand())/RAND_MAX;
+    }
+
+    for(unsigned int i=0; i<size1; ++i){
+        cy(i) = static_cast<NumericT>(std::rand())/RAND_MAX;
+    }
+
+//    std::cout << "Running tests for matrix of size " << cA.size1() << "," << cA.size2() << std::endl;
+
+    viennacl::matrix<NumericT,Layout> A (size1, size2);
+    viennacl::matrix<NumericT,Layout> B (size1, size2);
+    viennacl::matrix<NumericT,Layout> C (size1, size2);
+    viennacl::matrix<NumericT,Layout> D (size1, size2);
+
+    viennacl::vector<NumericT> x(size2);
+    viennacl::vector<NumericT> y(size1);
+
+
+    cB = cA;
+    cC = cA;
+    cD = cA;
+    viennacl::copy(cA,A);
+    viennacl::copy(cB,B);
+    viennacl::copy(cC,C);
+    viennacl::copy(cD,D);
+
+    viennacl::copy(cx,x);
+    viennacl::copy(cy,y);
+
+
+    // --------------------------------------------------------------------------
+    {
+        std::cout << "y = A*x..." << std::endl;
+        cy     =  ublas::prod(cA,cx);
+        viennacl::scheduler::statement statement(y, viennacl::op_assign(), viennacl::linalg::prod(A,x));
+        generator::generate_enqueue_statement(statement, statement.array()[0]);
+        viennacl::backend::finish();
+        CHECK_RESULT(cy,y,y=A*x)
+    }
+
+    {
+        std::cout << "x = trans(A)*y..." << std::endl;
+        cx     =  ublas::prod(trans(cA),cy);
+        viennacl::scheduler::statement statement(x, viennacl::op_assign(), viennacl::linalg::prod(trans(A),y));
+        generator::generate_enqueue_statement(statement, statement.array()[0]);
+        viennacl::backend::finish();
+        CHECK_RESULT(cx,x,x=trans(A)*y)
+    }
+
+//    {
+//        std::cout << "y = reduce_rows<max>(A)..." << std::endl;
+//        for(unsigned int i = 0 ; i < size1 ; ++i){
+//            NumericT current_max = -INFINITY;
+//            for(unsigned int j = 0 ; j < size2 ; ++j){
+//                current_max = std::max(current_max,cA(i,j));
+//            }
+//            cy(i) = current_max;
+//        }
+//        generator::custom_operation op;
+//        op.add(dv_t(y) = generator::reduce_rows<generator::fmax_type>(dm_t(A)));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cy,y,y = reduce_rows<max>(A))
+//    }
+
+
+//    {
+//        std::cout << "x = reduce_cols<max>(A)..." << std::endl;
+//        for(unsigned int j = 0 ; j < size2 ; ++j){
+//            NumericT current_max = -INFINITY;
+//            for(unsigned int i = 0 ; i < size1 ; ++i){
+//                current_max = std::max(current_max,cA(i,j));
+//            }
+//            cx(j) = current_max;
+//        }
+//        generator::custom_operation op;
+//        op.add(dv_t(x) = generator::reduce_cols<generator::fmax_type>(dm_t(A)));
+//        op.execute();
+//        viennacl::backend::finish();
+//        CHECK_RESULT(cx,x,x = reduce_cols<max>(A))
+//    }
+
+
+    return retval;
+}
+
+
+int main() {
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << "## Test :: Generated BLAS2" << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+
+    int retval = EXIT_SUCCESS;
+
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+    {
+        double epsilon = 1.0E-4;
+        std::cout << "# Testing setup:" << std::endl;
+        std::cout << "  numeric: float" << std::endl;
+        std::cout << "  --------------" << std::endl;
+        std::cout << "  Row-Major"      << std::endl;
+        std::cout << "  --------------" << std::endl;
+        retval = test<float, viennacl::row_major> (epsilon);
+        std::cout << "  --------------" << std::endl;
+        std::cout << "  Column-Major"   << std::endl;
+        std::cout << "  --------------" << std::endl;
+        retval &= test<float, viennacl::column_major> (epsilon);
+
+        if ( retval == EXIT_SUCCESS )
+            std::cout << "# Test passed" << std::endl;
+        else
+            return retval;
+    }
+
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+    {
+        double epsilon = 1.0E-4;
+        std::cout << "# Testing setup:" << std::endl;
+        std::cout << "  numeric: double" << std::endl;
+        std::cout << "  --------------" << std::endl;
+        std::cout << "  Row-Major"      << std::endl;
+        std::cout << "  --------------" << std::endl;
+        retval = test<double, viennacl::row_major> (epsilon);
+        std::cout << "  --------------" << std::endl;
+        std::cout << "  Column-Major"   << std::endl;
+        std::cout << "  --------------" << std::endl;
+        retval &= test<double, viennacl::column_major> (epsilon);
+
+        if ( retval == EXIT_SUCCESS )
+            std::cout << "# Test passed" << std::endl;
+        else
+            return retval;
+    }
+}
diff --git a/tests/src/generator_blas3.cpp b/tests/src/generator_blas3.cpp
new file mode 100644
index 0000000..04f9a49
--- /dev/null
+++ b/tests/src/generator_blas3.cpp
@@ -0,0 +1,424 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#ifndef NDEBUG
+  #define NDEBUG
+#endif
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+#define VIENNACL_HAVE_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "examples/tutorial/Random.hpp"
+#include "viennacl/generator/generate.hpp"
+#include "list"
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+static const unsigned int min_large_block_size = 32;
+static const unsigned int max_large_block_size = 128;
+static const unsigned int n_large_blocks = static_cast<unsigned int>(std::log(static_cast<double>(max_large_block_size/min_large_block_size))/std::log(2.0)+1.0);
+
+static const unsigned int min_alignment = 1;
+static const unsigned int max_alignment = 8;
+
+static const unsigned int max_small_block_size = max_alignment;
+
+//
+// -------------------------------------------------------------
+
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+
+
+
+
+
+
+//
+// Part 1: Matrix-matrix multiplications
+//
+
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+int test_prod(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A, ReferenceMatrixTypeA const & A_trans,
+              ReferenceMatrixTypeB const & B, ReferenceMatrixTypeB const & B_trans,
+              ReferenceMatrixTypeC & C,
+
+              MatrixTypeA const & vcl_A, MatrixTypeA const & vcl_A_trans,
+              MatrixTypeB const & vcl_B, MatrixTypeB const & vcl_B_trans,
+              MatrixTypeC & vcl_C
+             )
+{
+   int retval = EXIT_SUCCESS;
+   NumericT act_diff = 0;
+   NumericT alpha = NumericT(3.14);
+   NumericT beta  = NumericT(4.51);
+
+std::cout << "Testing C = alpha*prod(A,B) + beta*C ..." << std::endl;
+{
+    C     = alpha*viennacl::linalg::prod(A, B) + beta*C;
+
+    viennacl::scheduler::statement statement(vcl_C, viennacl::op_assign(), alpha*viennacl::linalg::prod(vcl_A,vcl_B)+beta*vcl_C);
+    viennacl::generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    act_diff = std::fabs(diff(C, vcl_C));
+    if( act_diff > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+      std::cout << "  diff: " << act_diff << std::endl;
+      retval = EXIT_FAILURE;
+    }
+    else
+      std::cout << "Test C = A * B passed!" << std::endl;
+}
+
+
+   std::cout << "Testing C = alpha*trans(A) * B + beta*C ..." << std::endl;
+   {
+       C     = alpha*boost::numeric::ublas::prod(trans(A_trans), B) + beta*C;
+       viennacl::scheduler::statement statement(vcl_C, viennacl::op_assign(), alpha*viennacl::linalg::prod(trans(vcl_A_trans),vcl_B) + beta*vcl_C);
+       viennacl::generator::generate_enqueue_statement(statement, statement.array()[0]);
+       viennacl::backend::finish();
+       act_diff = std::fabs(diff(C, vcl_C));
+       if( act_diff > epsilon )
+       {
+         std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+         std::cout << "  diff: " << act_diff << std::endl;
+         retval = EXIT_FAILURE;
+       }
+       else std::cout << "Test C = trans(A) * B passed!" << std::endl;
+   }
+
+std::cout << "Testing C = alpha*A * trans(B) + beta*C ..." << std::endl;
+{
+    C     = boost::numeric::ublas::prod(A,trans(B_trans)) + beta*C;
+    viennacl::scheduler::statement statement(vcl_C, viennacl::op_assign(), viennacl::linalg::prod(vcl_A,trans(vcl_B_trans)) + beta*vcl_C);
+    viennacl::generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    act_diff = std::fabs(diff(C, vcl_C));
+    if( act_diff > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+      std::cout << "  diff: " << act_diff << std::endl;
+      retval = EXIT_FAILURE;
+    }
+    else std::cout << "Test C = A * trans(B) passed!" << std::endl;
+}
+
+std::cout << "Testing C = alpha*trans(A) * trans(B) + beta*C ..." << std::endl;
+{
+    C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans)) + beta*C;
+    viennacl::scheduler::statement statement(vcl_C, viennacl::op_assign(), viennacl::linalg::prod(trans(vcl_A_trans),trans(vcl_B_trans)) + beta*vcl_C);
+    viennacl::generator::generate_enqueue_statement(statement, statement.array()[0]);
+    viennacl::backend::finish();
+    act_diff = std::fabs(diff(C, vcl_C));
+    if( act_diff > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+      std::cout << "  diff: " << act_diff << std::endl;
+      retval = EXIT_FAILURE;
+    }
+    else std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
+}
+
+
+   return retval;
+}
+
+template< typename NumericT, typename F_A, typename F_B, typename F_C, typename Epsilon>
+int test_prod(Epsilon const& epsilon)
+{
+  int ret;
+
+  long matrix_size1 = 2*max_large_block_size;
+  long matrix_size2 = 3*max_large_block_size;
+  long matrix_size3 = 4*max_large_block_size;
+
+  // --------------------------------------------------------------------------
+
+  // ublas reference:
+  ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
+  ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
+  ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
+
+  //fill A and B:
+  for (unsigned int i = 0; i < A.size1(); ++i)
+    for (unsigned int j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+  for (unsigned int i = 0; i < B.size1(); ++i)
+    for (unsigned int j = 0; j < B.size2(); ++j)
+        B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+  for (unsigned int i = 0; i < C.size1(); ++i)
+    for (unsigned int j = 0; j < C.size2(); ++j)
+        C(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+
+
+  ublas::matrix<NumericT>     A_trans = trans(A);
+  ublas::matrix<NumericT>     B_trans = trans(B);
+
+  //
+  // ViennaCL objects
+  //
+
+
+
+  // A
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size1, matrix_size2);
+  viennacl::copy(A, vcl_A);
+
+  // A^T
+  viennacl::matrix<NumericT, F_A>    vcl_A_trans(matrix_size2, matrix_size1);
+  viennacl::copy(A_trans, vcl_A_trans);
+
+  // B
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size2, matrix_size3);
+  viennacl::copy(B, vcl_B);
+
+  // B^T
+  viennacl::matrix<NumericT, F_B>    vcl_B_trans(matrix_size3, matrix_size2);
+  viennacl::copy(B_trans, vcl_B_trans);
+
+  // C
+  viennacl::matrix<NumericT, F_C>    vcl_C(matrix_size1, matrix_size3);
+  viennacl::copy(C, vcl_C);
+
+  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
+
+  //////
+  //////  A: matrix
+  //////
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  return EXIT_SUCCESS;
+}
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  return ret;
+}
+
+int main(int argc, char* argv[])
+{
+    std::vector<std::string> args(argv,argv+argc);
+    int retval = EXIT_SUCCESS;
+
+    typedef std::vector<viennacl::ocl::device> devices_type;
+
+    //platforms_type platforms = viennacl::ocl::get_platforms();
+    //size_t num_platforms = platforms.size();
+
+    devices_type dev = viennacl::ocl::current_context().devices();
+    for(devices_type::iterator it = dev.begin() ; it != dev.end() ; ++it){
+            std::cout << std::endl;
+            std::cout << "----------------------------------------------" << std::endl;
+            std::cout << "----------------------------------------------" << std::endl;
+            std::cout << "## Test :: Generated BLAS 3 routines" << std::endl;
+            std::cout << "----------------------------------------------" << std::endl;
+            std::cout << "----------------------------------------------" << std::endl;
+            std::cout << std::endl;
+
+            int retval = EXIT_SUCCESS;
+
+            //srand(time(NULL));
+
+            std::cout << std::endl;
+            std::cout << "----------------------------------------------" << std::endl;
+            std::cout << std::endl;
+            {
+               typedef float NumericT;
+               NumericT epsilon = NumericT(1.0E-3);
+               std::cout << "# Testing setup:" << std::endl;
+
+               std::cout << viennacl::ocl::current_device().info() << std::endl;
+
+               std::cout << "  eps:     " << epsilon << std::endl;
+               std::cout << "  numeric: float" << std::endl;
+               retval = test<NumericT>(epsilon);
+               if( retval == EXIT_SUCCESS )
+                 std::cout << "# Test passed" << std::endl;
+               else
+                 return retval;
+            }
+            std::cout << std::endl;
+            std::cout << "----------------------------------------------" << std::endl;
+            std::cout << std::endl;
+         #ifdef VIENNACL_WITH_OPENCL
+            if( viennacl::ocl::current_device().double_support() )
+         #endif
+            {
+               {
+                 typedef double NumericT;
+                 NumericT epsilon = 1.0E-11;
+                 std::cout << "# Testing setup:" << std::endl;
+                 std::cout << "  eps:     " << epsilon << std::endl;
+                 std::cout << "  numeric: double" << std::endl;
+                 retval = test<NumericT>(epsilon);
+                 if( retval == EXIT_SUCCESS )
+                   std::cout << "# Test passed" << std::endl;
+                 else
+                   return retval;
+               }
+               std::cout << std::endl;
+               std::cout << "----------------------------------------------" << std::endl;
+               std::cout << std::endl;
+            }
+
+            std::cout << std::endl;
+            std::cout << "------- Test completed --------" << std::endl;
+            std::cout << std::endl;
+    }
+
+
+
+
+   return retval;
+}
+
diff --git a/tests/src/global_variables.cpp b/tests/src/global_variables.cpp
new file mode 100644
index 0000000..0b7c9bf
--- /dev/null
+++ b/tests/src/global_variables.cpp
@@ -0,0 +1,85 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <algorithm>
+#include <cmath>
+
+//
+// *** ViennaCL
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/circulant_matrix.hpp"
+  #include "viennacl/hankel_matrix.hpp"
+  #include "viennacl/toeplitz_matrix.hpp"
+  #include "viennacl/vandermonde_matrix.hpp"
+#endif
+
+viennacl::scalar<float>  s1;
+viennacl::scalar<int> s2;
+
+viennacl::vector<float>  v1;
+viennacl::vector<int> v2;
+
+viennacl::matrix<float>  m1;
+//viennacl::matrix<int> m2;
+
+// TODO: Add checks for other types
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Instantiation of global variables" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  s1 = viennacl::scalar<float>(1.0f);
+  s2 = viennacl::scalar<int>(1);
+
+  v1 = viennacl::vector<float>(5);
+  v2 = viennacl::vector<int>(5);
+
+  m1 = viennacl::matrix<float>(5, 4);
+  //m2 = viennacl::matrix<int>(5, 4);
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+//
+// -------------------------------------------------------------
+//
+
diff --git a/tests/src/global_variables.cu b/tests/src/global_variables.cu
new file mode 100644
index 0000000..0b7c9bf
--- /dev/null
+++ b/tests/src/global_variables.cu
@@ -0,0 +1,85 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <algorithm>
+#include <cmath>
+
+//
+// *** ViennaCL
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/circulant_matrix.hpp"
+  #include "viennacl/hankel_matrix.hpp"
+  #include "viennacl/toeplitz_matrix.hpp"
+  #include "viennacl/vandermonde_matrix.hpp"
+#endif
+
+viennacl::scalar<float>  s1;
+viennacl::scalar<int> s2;
+
+viennacl::vector<float>  v1;
+viennacl::vector<int> v2;
+
+viennacl::matrix<float>  m1;
+//viennacl::matrix<int> m2;
+
+// TODO: Add checks for other types
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Instantiation of global variables" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  s1 = viennacl::scalar<float>(1.0f);
+  s2 = viennacl::scalar<int>(1);
+
+  v1 = viennacl::vector<float>(5);
+  v2 = viennacl::vector<int>(5);
+
+  m1 = viennacl::matrix<float>(5, 4);
+  //m2 = viennacl::matrix<int>(5, 4);
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+//
+// -------------------------------------------------------------
+//
+
diff --git a/tests/src/iterators.cpp b/tests/src/iterators.cpp
index db8913c..e48a496 100644
--- a/tests/src/iterators.cpp
+++ b/tests/src/iterators.cpp
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -24,7 +25,7 @@
 // *** ViennaCL
 //
 //#define VCL_BUILD_INFO
-//#define VIENNACL_HAVE_UBLAS 1
+//#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/matrix.hpp"
 #include "viennacl/vector.hpp"
 
@@ -43,8 +44,8 @@ int test()
    vcl_cont[1] = 2;
    vcl_cont[2] = 3;
 
-   typename VclVector::const_iterator const_iter_def_const;
-   typename VclVector::iterator       iter_def_const;
+   //typename VclVector::const_iterator const_iter_def_const;
+   //typename VclVector::iterator       iter_def_const;
 
    for(typename VclVector::const_iterator iter = vcl_cont.begin();
        iter != vcl_cont.end(); iter++)
@@ -58,7 +59,7 @@ int test()
       std::cout << *iter << std::endl;
    }
 
-   // --------------------------------------------------------------------------                        
+   // --------------------------------------------------------------------------
    return retval;
 }
 
@@ -91,7 +92,9 @@ int main()
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
 
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
          typedef double NumericT;
@@ -107,5 +110,11 @@ int main()
       std::cout << "----------------------------------------------" << std::endl;
       std::cout << std::endl;
    }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
    return retval;
 }
diff --git a/tests/src/iterators.cpp b/tests/src/iterators.cu
similarity index 87%
copy from tests/src/iterators.cpp
copy to tests/src/iterators.cu
index db8913c..e48a496 100644
--- a/tests/src/iterators.cpp
+++ b/tests/src/iterators.cu
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -24,7 +25,7 @@
 // *** ViennaCL
 //
 //#define VCL_BUILD_INFO
-//#define VIENNACL_HAVE_UBLAS 1
+//#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/matrix.hpp"
 #include "viennacl/vector.hpp"
 
@@ -43,8 +44,8 @@ int test()
    vcl_cont[1] = 2;
    vcl_cont[2] = 3;
 
-   typename VclVector::const_iterator const_iter_def_const;
-   typename VclVector::iterator       iter_def_const;
+   //typename VclVector::const_iterator const_iter_def_const;
+   //typename VclVector::iterator       iter_def_const;
 
    for(typename VclVector::const_iterator iter = vcl_cont.begin();
        iter != vcl_cont.end(); iter++)
@@ -58,7 +59,7 @@ int test()
       std::cout << *iter << std::endl;
    }
 
-   // --------------------------------------------------------------------------                        
+   // --------------------------------------------------------------------------
    return retval;
 }
 
@@ -91,7 +92,9 @@ int main()
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
 
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
          typedef double NumericT;
@@ -107,5 +110,11 @@ int main()
       std::cout << "----------------------------------------------" << std::endl;
       std::cout << std::endl;
    }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
    return retval;
 }
diff --git a/tests/src/libviennacl_blas1.cpp b/tests/src/libviennacl_blas1.cpp
new file mode 100644
index 0000000..53745af
--- /dev/null
+++ b/tests/src/libviennacl_blas1.cpp
@@ -0,0 +1,668 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*
+*   Testing the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+#include <cmath>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "viennacl/vector.hpp"
+
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(static_cast<ScalarType>(std::fabs(static_cast<double>(s1))),
+                                  static_cast<ScalarType>(std::fabs(static_cast<double>(s2))));
+   return ScalarType(0);
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(std::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   std::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   ScalarType inf_norm = 0;
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > inf_norm)
+        inf_norm = v2_cpu[i];
+   }
+
+   return inf_norm;
+}
+
+template <typename T, typename U, typename EpsilonT>
+void check(T const & t, U const & u, EpsilonT eps)
+{
+  EpsilonT rel_error = static_cast<EpsilonT>(diff(t,u));
+  if (rel_error > eps)
+  {
+    std::cerr << "Relative error: " << rel_error << std::endl;
+    std::cerr << "Aborting!" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  std::cout << "SUCCESS ";
+}
+
+int main()
+{
+  std::size_t size  = 10; // at least 7
+  float  eps_float  = 1e-5f;
+  double eps_double = 1e-12;
+
+  float  ref_float_alpha;
+  double ref_double_alpha;
+
+  std::vector<float> ref_float_x(size, 1.0f);
+  std::vector<float> ref_float_y(size, 2.0f);
+
+  std::vector<double> ref_double_x(size, 1.0);
+  std::vector<double> ref_double_y(size, 2.0);
+
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  // Host setup
+  float host_float_alpha = 0;
+  viennacl::vector<float> host_float_x = viennacl::scalar_vector<float>(size, 1.0f, viennacl::context(viennacl::MAIN_MEMORY));
+  viennacl::vector<float> host_float_y = viennacl::scalar_vector<float>(size, 2.0f, viennacl::context(viennacl::MAIN_MEMORY));
+
+  double host_double_alpha = 0;
+  viennacl::vector<double> host_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
+  viennacl::vector<double> host_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));
+
+  // CUDA setup
+#ifdef VIENNACL_WITH_CUDA
+  float cuda_float_alpha = 0;
+  viennacl::vector<float> cuda_float_x = viennacl::scalar_vector<float>(size, 1.0f, viennacl::context(viennacl::CUDA_MEMORY));
+  viennacl::vector<float> cuda_float_y = viennacl::scalar_vector<float>(size, 2.0f, viennacl::context(viennacl::CUDA_MEMORY));
+
+  double cuda_double_alpha = 0;
+  viennacl::vector<double> cuda_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
+  viennacl::vector<double> cuda_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));
+#endif
+
+  // OpenCL setup
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLInt context_id = 0;
+  float opencl_float_alpha = 0;
+  viennacl::vector<float> opencl_float_x = viennacl::scalar_vector<float>(size, 1.0f, viennacl::context(viennacl::ocl::get_context(context_id)));
+  viennacl::vector<float> opencl_float_y = viennacl::scalar_vector<float>(size, 2.0f, viennacl::context(viennacl::ocl::get_context(context_id)));
+
+  double opencl_double_alpha = 0;
+  viennacl::vector<double> *opencl_double_x = NULL;
+  viennacl::vector<double> *opencl_double_y = NULL;
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    opencl_double_x = new viennacl::vector<double>(viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id))));
+    opencl_double_y = new viennacl::vector<double>(viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id))));
+  }
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+#endif
+
+  // consistency checks:
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+#ifdef VIENNACL_WITH_CUDA
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+  // ASUM
+  std::cout << std::endl << "-- Testing xASUM...";
+  ref_float_alpha  = 0;
+  ref_double_alpha = 0;
+  for (std::size_t i=0; i<size/4; ++i)
+  {
+    ref_float_alpha  += std::fabs(ref_float_x[2 + 3*i]);
+    ref_double_alpha += std::fabs(ref_double_x[2 + 3*i]);
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSasum(my_backend, ViennaCLInt(size/4),
+                    &host_float_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 3);
+  check(ref_float_alpha, host_float_alpha, eps_float);
+  ViennaCLHostDasum(my_backend, ViennaCLInt(size/4),
+                    &host_double_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 3);
+  check(ref_double_alpha, host_double_alpha, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASasum(my_backend, ViennaCLInt(size/4),
+                    &cuda_float_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 3);
+  check(ref_float_alpha, cuda_float_alpha, eps_float);
+  ViennaCLCUDADasum(my_backend, ViennaCLInt(size/4),
+                    &cuda_double_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 3);
+  check(ref_double_alpha, cuda_double_alpha, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSasum(my_backend, ViennaCLInt(size/4),
+                      &opencl_float_alpha,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 3);
+  check(ref_float_alpha, opencl_float_alpha, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDasum(my_backend, ViennaCLInt(size/4),
+                        &opencl_double_alpha,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 3);
+    check(ref_double_alpha, opencl_double_alpha, eps_double);
+  }
+#endif
+
+
+
+  // AXPY
+  std::cout << std::endl << "-- Testing xAXPY...";
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    ref_float_y[1 + 2*i]  += 2.0f * ref_float_x[0 + 2*i];
+    ref_double_y[1 + 2*i] += 2.0  * ref_double_x[0 + 2*i];
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 0, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2);
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 0, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 0, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2);
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 0, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSaxpy(my_backend, ViennaCLInt(size/3),
+                      2.0f,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 0, 2,
+                      viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2);
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDaxpy(my_backend, ViennaCLInt(size/3),
+                        2.0,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 0, 2,
+                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2);
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+
+  // COPY
+  std::cout << std::endl << "-- Testing xCOPY...";
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    ref_float_y[0 + 2*i]  = ref_float_x[1 + 2*i];
+    ref_double_y[0 + 2*i] = ref_double_x[1 + 2*i];
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostScopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 0, 2);
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDcopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 0, 2);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDAScopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 0, 2);
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADcopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 0, 2);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLScopy(my_backend, ViennaCLInt(size/3),
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 2,
+                      viennacl::traits::opencl_handle(opencl_float_y).get(), 0, 2);
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDcopy(my_backend, ViennaCLInt(size/3),
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 2,
+                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 0, 2);
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+
+  // DOT
+  std::cout << std::endl << "-- Testing xDOT...";
+  ref_float_alpha  = 0;
+  ref_double_alpha = 0;
+  for (std::size_t i=0; i<size/2; ++i)
+  {
+    ref_float_alpha  += ref_float_y[3 + 2*i]  * ref_float_x[2 + 2*i];
+    ref_double_alpha += ref_double_y[3 + 2*i] * ref_double_x[2 + 2*i];
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSdot(my_backend, ViennaCLInt(size/2),
+                   &host_float_alpha,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 1,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 3, 1);
+  check(ref_float_alpha, host_float_alpha, eps_float);
+  ViennaCLHostDdot(my_backend, ViennaCLInt(size/2),
+                   &host_double_alpha,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 1,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 3, 1);
+  check(ref_double_alpha, host_double_alpha, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASdot(my_backend, ViennaCLInt(size/2),
+                   &cuda_float_alpha,
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 1,
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 3, 1);
+  check(ref_float_alpha, cuda_float_alpha, eps_float);
+  ViennaCLCUDADdot(my_backend, ViennaCLInt(size/2),
+                   &cuda_double_alpha,
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 1,
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 3, 1);
+  check(ref_double_alpha, cuda_double_alpha, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSdot(my_backend, ViennaCLInt(size/2),
+                     &opencl_float_alpha,
+                     viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 1,
+                     viennacl::traits::opencl_handle(opencl_float_y).get(), 3, 1);
+  check(ref_float_alpha, opencl_float_alpha, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDdot(my_backend, ViennaCLInt(size/2),
+                       &opencl_double_alpha,
+                       viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 1,
+                       viennacl::traits::opencl_handle(*opencl_double_y).get(), 3, 1);
+    check(ref_double_alpha, opencl_double_alpha, eps_double);
+  }
+#endif
+
+
+
+  // NRM2
+  std::cout << std::endl << "-- Testing xNRM2...";
+  ref_float_alpha  = 0;
+  ref_double_alpha = 0;
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    ref_float_alpha  += ref_float_x[1 + 2*i]  * ref_float_x[1 + 2*i];
+    ref_double_alpha += ref_double_x[1 + 2*i] * ref_double_x[1 + 2*i];
+  }
+  ref_float_alpha = std::sqrt(ref_float_alpha);
+  ref_double_alpha = std::sqrt(ref_double_alpha);
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSnrm2(my_backend, ViennaCLInt(size/3),
+                    &host_float_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2);
+  check(ref_float_alpha, host_float_alpha, eps_float);
+  ViennaCLHostDnrm2(my_backend, ViennaCLInt(size/3),
+                    &host_double_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2);
+  check(ref_double_alpha, host_double_alpha, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASnrm2(my_backend, ViennaCLInt(size/3),
+                    &cuda_float_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2);
+  check(ref_float_alpha, cuda_float_alpha, eps_float);
+  ViennaCLCUDADnrm2(my_backend, ViennaCLInt(size/3),
+                    &cuda_double_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2);
+  check(ref_double_alpha, cuda_double_alpha, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSnrm2(my_backend, ViennaCLInt(size/3),
+                      &opencl_float_alpha,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 2);
+  check(ref_float_alpha, opencl_float_alpha, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDnrm2(my_backend, ViennaCLInt(size/3),
+                        &opencl_double_alpha,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 2);
+    check(ref_double_alpha, opencl_double_alpha, eps_double);
+  }
+#endif
+
+
+
+
+  // ROT
+  std::cout << std::endl << "-- Testing xROT...";
+  for (std::size_t i=0; i<size/4; ++i)
+  {
+    float tmp            =  0.6f * ref_float_x[2 + 3*i] + 0.8f * ref_float_y[1 + 2*i];
+    ref_float_y[1 + 2*i] = -0.8f * ref_float_x[2 + 3*i] + 0.6f * ref_float_y[1 + 2*i];;
+    ref_float_x[2 + 3*i] = tmp;
+
+    double tmp2           =  0.6 * ref_double_x[2 + 3*i] + 0.8 * ref_double_y[1 + 2*i];
+    ref_double_y[1 + 2*i] = -0.8 * ref_double_x[2 + 3*i] + 0.6 * ref_double_y[1 + 2*i];;
+    ref_double_x[2 + 3*i] = tmp2;
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 3,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2,
+                   0.6f, 0.8f);
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 3,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2,
+                   0.6, 0.8);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 3,
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2,
+                   0.6f, 0.8f);
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 3,
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2,
+                   0.6, 0.8);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSrot(my_backend, ViennaCLInt(size/4),
+                     viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 3,
+                     viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2,
+                     0.6f, 0.8f);
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDrot(my_backend, ViennaCLInt(size/4),
+                       viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 3,
+                       viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2,
+                       0.6, 0.8);
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+
+  // SCAL
+  std::cout << std::endl << "-- Testing xSCAL...";
+  for (std::size_t i=0; i<size/4; ++i)
+  {
+    ref_float_x[1 + 3*i]  *= 2.0f;
+    ref_double_x[1 + 3*i] *= 2.0;
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSscal(my_backend, ViennaCLInt(size/4),
+                    2.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 3);
+  check(ref_float_x, host_float_x, eps_float);
+  ViennaCLHostDscal(my_backend, ViennaCLInt(size/4),
+                    2.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 3);
+  check(ref_double_x, host_double_x, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASscal(my_backend, ViennaCLInt(size/4),
+                    2.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 3);
+  check(ref_float_x, cuda_float_x, eps_float);
+  ViennaCLCUDADscal(my_backend, ViennaCLInt(size/4),
+                    2.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 3);
+  check(ref_double_x, cuda_double_x, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSscal(my_backend, ViennaCLInt(size/4),
+                      2.0f,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 3);
+  check(ref_float_x, opencl_float_x, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDscal(my_backend, ViennaCLInt(size/4),
+                        2.0,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 3);
+    check(ref_double_x, *opencl_double_x, eps_double);
+  }
+#endif
+
+
+  // SWAP
+  std::cout << std::endl << "-- Testing xSWAP...";
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    float tmp = ref_float_x[2 + 2*i];
+    ref_float_x[2 + 2*i] = ref_float_y[1 + 2*i];
+    ref_float_y[1 + 2*i] = tmp;
+
+    double tmp2 = ref_double_x[2 + 2*i];
+    ref_double_x[2 + 2*i] = ref_double_y[1 + 2*i];
+    ref_double_y[1 + 2*i] = tmp2;
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSswap(my_backend, ViennaCLInt(size/3),
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 2,
+                      viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDswap(my_backend, ViennaCLInt(size/3),
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 2,
+                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+  // IAMAX
+  std::cout << std::endl << "-- Testing IxASUM...";
+  ViennaCLInt ref_index = 0;
+  ref_float_alpha = 0;
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    if (ref_float_x[0 + 2*i] > std::fabs(ref_float_alpha))
+    {
+      ref_index = ViennaCLInt(i);
+      ref_float_alpha = std::fabs(ref_float_x[0 + 2*i]);
+    }
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLInt idx = 0;
+  ViennaCLHostiSamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 0, 2);
+  check(static_cast<float>(ref_index), static_cast<float>(idx), eps_float);
+  idx = 0;
+  ViennaCLHostiDamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 0, 2);
+  check(ref_index, idx, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  idx = 0;
+  ViennaCLCUDAiSamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 0, 2);
+  check(ref_float_x[2*ref_index], ref_float_x[2*idx], eps_float);
+  idx = 0;
+  ViennaCLCUDAiDamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 0, 2);
+  check(ref_double_x[2*ref_index], ref_double_x[2*idx], eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  idx = 0;
+  ViennaCLOpenCLiSamax(my_backend, ViennaCLInt(size/3),
+                       &idx,
+                       viennacl::traits::opencl_handle(opencl_float_x).get(), 0, 2);
+  check(ref_float_x[2*ref_index], ref_float_x[2*idx], eps_float);
+  idx = 0;
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLiDamax(my_backend, ViennaCLInt(size/3),
+                         &idx,
+                         viennacl::traits::opencl_handle(*opencl_double_x).get(), 0, 2);
+    check(ref_double_x[2*ref_index], ref_double_x[2*idx], eps_double);
+  }
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  //cleanup
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    delete opencl_double_x;
+    delete opencl_double_y;
+  }
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/libviennacl_blas1.cu b/tests/src/libviennacl_blas1.cu
new file mode 100644
index 0000000..53745af
--- /dev/null
+++ b/tests/src/libviennacl_blas1.cu
@@ -0,0 +1,668 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*
+*   Testing the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+#include <cmath>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "viennacl/vector.hpp"
+
/** @brief Relative difference between two scalars.
 *
 * Returns 0 for exactly equal inputs; otherwise (s1 - s2) divided by the
 * larger magnitude of the two, so the sign indicates which operand is larger.
 */
template <typename ScalarType>
ScalarType diff(ScalarType const & s1, ScalarType const & s2)
{
   if (s1 == s2)
      return ScalarType(0);

   // Magnitudes computed in double, then narrowed back to ScalarType,
   // matching the precision the rest of the test suite expects.
   ScalarType const mag1 = static_cast<ScalarType>(std::fabs(static_cast<double>(s1)));
   ScalarType const mag2 = static_cast<ScalarType>(std::fabs(static_cast<double>(s2)));
   return (s1 - s2) / std::max(mag1, mag2);
}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(std::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   std::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   ScalarType inf_norm = 0;
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > inf_norm)
+        inf_norm = v2_cpu[i];
+   }
+
+   return inf_norm;
+}
+
/** @brief Verifies that t and u agree within relative tolerance eps.
 *
 * On success prints "SUCCESS "; on failure reports the relative error and
 * terminates the whole test binary with EXIT_FAILURE.
 */
template <typename T, typename U, typename EpsilonT>
void check(T const & t, U const & u, EpsilonT eps)
{
  EpsilonT const rel_error = static_cast<EpsilonT>(diff(t, u));
  if (rel_error > eps)
  {
    std::cerr << "Relative error: " << rel_error << std::endl
              << "Aborting!" << std::endl;
    exit(EXIT_FAILURE);
  }
  std::cout << "SUCCESS ";
}
+
+int main()
+{
+  std::size_t size  = 10; // at least 7
+  float  eps_float  = 1e-5f;
+  double eps_double = 1e-12;
+
+  float  ref_float_alpha;
+  double ref_double_alpha;
+
+  std::vector<float> ref_float_x(size, 1.0f);
+  std::vector<float> ref_float_y(size, 2.0f);
+
+  std::vector<double> ref_double_x(size, 1.0);
+  std::vector<double> ref_double_y(size, 2.0);
+
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  // Host setup
+  float host_float_alpha = 0;
+  viennacl::vector<float> host_float_x = viennacl::scalar_vector<float>(size, 1.0f, viennacl::context(viennacl::MAIN_MEMORY));
+  viennacl::vector<float> host_float_y = viennacl::scalar_vector<float>(size, 2.0f, viennacl::context(viennacl::MAIN_MEMORY));
+
+  double host_double_alpha = 0;
+  viennacl::vector<double> host_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::MAIN_MEMORY));
+  viennacl::vector<double> host_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::MAIN_MEMORY));
+
+  // CUDA setup
+#ifdef VIENNACL_WITH_CUDA
+  float cuda_float_alpha = 0;
+  viennacl::vector<float> cuda_float_x = viennacl::scalar_vector<float>(size, 1.0f, viennacl::context(viennacl::CUDA_MEMORY));
+  viennacl::vector<float> cuda_float_y = viennacl::scalar_vector<float>(size, 2.0f, viennacl::context(viennacl::CUDA_MEMORY));
+
+  double cuda_double_alpha = 0;
+  viennacl::vector<double> cuda_double_x = viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::CUDA_MEMORY));
+  viennacl::vector<double> cuda_double_y = viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::CUDA_MEMORY));
+#endif
+
+  // OpenCL setup
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLInt context_id = 0;
+  float opencl_float_alpha = 0;
+  viennacl::vector<float> opencl_float_x = viennacl::scalar_vector<float>(size, 1.0f, viennacl::context(viennacl::ocl::get_context(context_id)));
+  viennacl::vector<float> opencl_float_y = viennacl::scalar_vector<float>(size, 2.0f, viennacl::context(viennacl::ocl::get_context(context_id)));
+
+  double opencl_double_alpha = 0;
+  viennacl::vector<double> *opencl_double_x = NULL;
+  viennacl::vector<double> *opencl_double_y = NULL;
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    opencl_double_x = new viennacl::vector<double>(viennacl::scalar_vector<double>(size, 1.0, viennacl::context(viennacl::ocl::get_context(context_id))));
+    opencl_double_y = new viennacl::vector<double>(viennacl::scalar_vector<double>(size, 2.0, viennacl::context(viennacl::ocl::get_context(context_id))));
+  }
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+#endif
+
+  // consistency checks:
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+#ifdef VIENNACL_WITH_CUDA
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+  // ASUM
+  std::cout << std::endl << "-- Testing xASUM...";
+  ref_float_alpha  = 0;
+  ref_double_alpha = 0;
+  for (std::size_t i=0; i<size/4; ++i)
+  {
+    ref_float_alpha  += std::fabs(ref_float_x[2 + 3*i]);
+    ref_double_alpha += std::fabs(ref_double_x[2 + 3*i]);
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSasum(my_backend, ViennaCLInt(size/4),
+                    &host_float_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 3);
+  check(ref_float_alpha, host_float_alpha, eps_float);
+  ViennaCLHostDasum(my_backend, ViennaCLInt(size/4),
+                    &host_double_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 3);
+  check(ref_double_alpha, host_double_alpha, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASasum(my_backend, ViennaCLInt(size/4),
+                    &cuda_float_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 3);
+  check(ref_float_alpha, cuda_float_alpha, eps_float);
+  ViennaCLCUDADasum(my_backend, ViennaCLInt(size/4),
+                    &cuda_double_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 3);
+  check(ref_double_alpha, cuda_double_alpha, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSasum(my_backend, ViennaCLInt(size/4),
+                      &opencl_float_alpha,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 3);
+  check(ref_float_alpha, opencl_float_alpha, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDasum(my_backend, ViennaCLInt(size/4),
+                        &opencl_double_alpha,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 3);
+    check(ref_double_alpha, opencl_double_alpha, eps_double);
+  }
+#endif
+
+
+
+  // AXPY
+  std::cout << std::endl << "-- Testing xAXPY...";
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    ref_float_y[1 + 2*i]  += 2.0f * ref_float_x[0 + 2*i];
+    ref_double_y[1 + 2*i] += 2.0  * ref_double_x[0 + 2*i];
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 0, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2);
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 0, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 0, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2);
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADaxpy(my_backend, ViennaCLInt(size/3),
+                    2.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 0, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSaxpy(my_backend, ViennaCLInt(size/3),
+                      2.0f,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 0, 2,
+                      viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2);
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDaxpy(my_backend, ViennaCLInt(size/3),
+                        2.0,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 0, 2,
+                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2);
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+
+  // COPY
+  std::cout << std::endl << "-- Testing xCOPY...";
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    ref_float_y[0 + 2*i]  = ref_float_x[1 + 2*i];
+    ref_double_y[0 + 2*i] = ref_double_x[1 + 2*i];
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostScopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 0, 2);
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDcopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 0, 2);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDAScopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 0, 2);
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADcopy(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 0, 2);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLScopy(my_backend, ViennaCLInt(size/3),
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 2,
+                      viennacl::traits::opencl_handle(opencl_float_y).get(), 0, 2);
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDcopy(my_backend, ViennaCLInt(size/3),
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 2,
+                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 0, 2);
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+
+  // DOT
+  std::cout << std::endl << "-- Testing xDOT...";
+  ref_float_alpha  = 0;
+  ref_double_alpha = 0;
+  for (std::size_t i=0; i<size/2; ++i)
+  {
+    ref_float_alpha  += ref_float_y[3 + 2*i]  * ref_float_x[2 + 2*i];
+    ref_double_alpha += ref_double_y[3 + 2*i] * ref_double_x[2 + 2*i];
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSdot(my_backend, ViennaCLInt(size/2),
+                   &host_float_alpha,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 1,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 3, 1);
+  check(ref_float_alpha, host_float_alpha, eps_float);
+  ViennaCLHostDdot(my_backend, ViennaCLInt(size/2),
+                   &host_double_alpha,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 1,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 3, 1);
+  check(ref_double_alpha, host_double_alpha, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASdot(my_backend, ViennaCLInt(size/2),
+                   &cuda_float_alpha,
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 1,
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 3, 1);
+  check(ref_float_alpha, cuda_float_alpha, eps_float);
+  ViennaCLCUDADdot(my_backend, ViennaCLInt(size/2),
+                   &cuda_double_alpha,
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 1,
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 3, 1);
+  check(ref_double_alpha, cuda_double_alpha, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSdot(my_backend, ViennaCLInt(size/2),
+                     &opencl_float_alpha,
+                     viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 1,
+                     viennacl::traits::opencl_handle(opencl_float_y).get(), 3, 1);
+  check(ref_float_alpha, opencl_float_alpha, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDdot(my_backend, ViennaCLInt(size/2),
+                       &opencl_double_alpha,
+                       viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 1,
+                       viennacl::traits::opencl_handle(*opencl_double_y).get(), 3, 1);
+    check(ref_double_alpha, opencl_double_alpha, eps_double);
+  }
+#endif
+
+
+
+  // NRM2
+  std::cout << std::endl << "-- Testing xNRM2...";
+  ref_float_alpha  = 0;
+  ref_double_alpha = 0;
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    ref_float_alpha  += ref_float_x[1 + 2*i]  * ref_float_x[1 + 2*i];
+    ref_double_alpha += ref_double_x[1 + 2*i] * ref_double_x[1 + 2*i];
+  }
+  ref_float_alpha = std::sqrt(ref_float_alpha);
+  ref_double_alpha = std::sqrt(ref_double_alpha);
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSnrm2(my_backend, ViennaCLInt(size/3),
+                    &host_float_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2);
+  check(ref_float_alpha, host_float_alpha, eps_float);
+  ViennaCLHostDnrm2(my_backend, ViennaCLInt(size/3),
+                    &host_double_alpha,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2);
+  check(ref_double_alpha, host_double_alpha, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASnrm2(my_backend, ViennaCLInt(size/3),
+                    &cuda_float_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2);
+  check(ref_float_alpha, cuda_float_alpha, eps_float);
+  ViennaCLCUDADnrm2(my_backend, ViennaCLInt(size/3),
+                    &cuda_double_alpha,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2);
+  check(ref_double_alpha, cuda_double_alpha, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSnrm2(my_backend, ViennaCLInt(size/3),
+                      &opencl_float_alpha,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 2);
+  check(ref_float_alpha, opencl_float_alpha, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDnrm2(my_backend, ViennaCLInt(size/3),
+                        &opencl_double_alpha,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 2);
+    check(ref_double_alpha, opencl_double_alpha, eps_double);
+  }
+#endif
+
+
+
+
+  // ROT
+  std::cout << std::endl << "-- Testing xROT...";
+  for (std::size_t i=0; i<size/4; ++i)
+  {
+    float tmp            =  0.6f * ref_float_x[2 + 3*i] + 0.8f * ref_float_y[1 + 2*i];
+    ref_float_y[1 + 2*i] = -0.8f * ref_float_x[2 + 3*i] + 0.6f * ref_float_y[1 + 2*i];
+    ref_float_x[2 + 3*i] = tmp;
+
+    double tmp2           =  0.6 * ref_double_x[2 + 3*i] + 0.8 * ref_double_y[1 + 2*i];
+    ref_double_y[1 + 2*i] = -0.8 * ref_double_x[2 + 3*i] + 0.6 * ref_double_y[1 + 2*i];
+    ref_double_x[2 + 3*i] = tmp2;
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 3,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2,
+                   0.6f, 0.8f);
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 3,
+                   viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2,
+                   0.6, 0.8);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 3,
+                   viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2,
+                   0.6f, 0.8f);
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADrot(my_backend, ViennaCLInt(size/4),
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 3,
+                   viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2,
+                   0.6, 0.8);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSrot(my_backend, ViennaCLInt(size/4),
+                     viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 3,
+                     viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2,
+                     0.6f, 0.8f);
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDrot(my_backend, ViennaCLInt(size/4),
+                       viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 3,
+                       viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2,
+                       0.6, 0.8);
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+
+  // SCAL
+  std::cout << std::endl << "-- Testing xSCAL...";
+  for (std::size_t i=0; i<size/4; ++i)
+  {
+    ref_float_x[1 + 3*i]  *= 2.0f;
+    ref_double_x[1 + 3*i] *= 2.0;
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSscal(my_backend, ViennaCLInt(size/4),
+                    2.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 3);
+  check(ref_float_x, host_float_x, eps_float);
+  ViennaCLHostDscal(my_backend, ViennaCLInt(size/4),
+                    2.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 3);
+  check(ref_double_x, host_double_x, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASscal(my_backend, ViennaCLInt(size/4),
+                    2.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 3);
+  check(ref_float_x, cuda_float_x, eps_float);
+  ViennaCLCUDADscal(my_backend, ViennaCLInt(size/4),
+                    2.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 3);
+  check(ref_double_x, cuda_double_x, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSscal(my_backend, ViennaCLInt(size/4),
+                      2.0f,
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 1, 3);
+  check(ref_float_x, opencl_float_x, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDscal(my_backend, ViennaCLInt(size/4),
+                        2.0,
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 1, 3);
+    check(ref_double_x, *opencl_double_x, eps_double);
+  }
+#endif
+
+
+  // SWAP
+  std::cout << std::endl << "-- Testing xSWAP...";
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    float tmp = ref_float_x[2 + 2*i];
+    ref_float_x[2 + 2*i] = ref_float_y[1 + 2*i];
+    ref_float_y[1 + 2*i] = tmp;
+
+    double tmp2 = ref_double_x[2 + 2*i];
+    ref_double_x[2 + 2*i] = ref_double_y[1 + 2*i];
+    ref_double_y[1 + 2*i] = tmp2;
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 2, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 2);
+  check(ref_float_y, host_float_y, eps_float);
+  ViennaCLHostDswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 2, 2,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 2);
+  check(ref_double_y, host_double_y, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 2, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 2);
+  check(ref_float_y, cuda_float_y, eps_float);
+  ViennaCLCUDADswap(my_backend, ViennaCLInt(size/3),
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 2, 2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 2);
+  check(ref_double_y, cuda_double_y, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSswap(my_backend, ViennaCLInt(size/3),
+                      viennacl::traits::opencl_handle(opencl_float_x).get(), 2, 2,
+                      viennacl::traits::opencl_handle(opencl_float_y).get(), 1, 2);
+  check(ref_float_y, opencl_float_y, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDswap(my_backend, ViennaCLInt(size/3),
+                        viennacl::traits::opencl_handle(*opencl_double_x).get(), 2, 2,
+                        viennacl::traits::opencl_handle(*opencl_double_y).get(), 1, 2);
+    check(ref_double_y, *opencl_double_y, eps_double);
+  }
+#endif
+
+
+  // IAMAX
+  std::cout << std::endl << "-- Testing IxAMAX...";
+  ViennaCLInt ref_index = 0;
+  ref_float_alpha = 0;
+  for (std::size_t i=0; i<size/3; ++i)
+  {
+    if (std::fabs(ref_float_x[0 + 2*i]) > std::fabs(ref_float_alpha))
+    {
+      ref_index = ViennaCLInt(i);
+      ref_float_alpha = std::fabs(ref_float_x[0 + 2*i]);
+    }
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLInt idx = 0;
+  ViennaCLHostiSamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 0, 2);
+  check(static_cast<float>(ref_index), static_cast<float>(idx), eps_float);
+  idx = 0;
+  ViennaCLHostiDamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 0, 2);
+  check(ref_index, idx, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  idx = 0;
+  ViennaCLCUDAiSamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 0, 2);
+  check(ref_float_x[2*ref_index], ref_float_x[2*idx], eps_float);
+  idx = 0;
+  ViennaCLCUDAiDamax(my_backend, ViennaCLInt(size/3),
+                     &idx,
+                     viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 0, 2);
+  check(ref_double_x[2*ref_index], ref_double_x[2*idx], eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  idx = 0;
+  ViennaCLOpenCLiSamax(my_backend, ViennaCLInt(size/3),
+                       &idx,
+                       viennacl::traits::opencl_handle(opencl_float_x).get(), 0, 2);
+  check(ref_float_x[2*ref_index], ref_float_x[2*idx], eps_float);
+  idx = 0;
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLiDamax(my_backend, ViennaCLInt(size/3),
+                         &idx,
+                         viennacl::traits::opencl_handle(*opencl_double_x).get(), 0, 2);
+    check(ref_double_x[2*ref_index], ref_double_x[2*idx], eps_double);
+  }
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  //cleanup
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    delete opencl_double_x;
+    delete opencl_double_y;
+  }
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/libviennacl_blas2.cpp b/tests/src/libviennacl_blas2.cpp
new file mode 100644
index 0000000..7078a3f
--- /dev/null
+++ b/tests/src/libviennacl_blas2.cpp
@@ -0,0 +1,265 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*
+*   Testing the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "viennacl/vector.hpp"
+
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(std::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   std::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   ScalarType inf_norm = 0;
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > inf_norm)
+        inf_norm = v2_cpu[i];
+   }
+
+   return inf_norm;
+}
+
+template <typename T, typename U, typename EpsilonT>
+void check(T const & t, U const & u, EpsilonT eps)
+{
+  EpsilonT rel_error = diff(t,u);
+  if (rel_error > eps)
+  {
+    std::cerr << "Relative error: " << rel_error << std::endl;
+    std::cerr << "Aborting!" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  std::cout << "SUCCESS ";
+}
+
+int main()
+{
+  std::size_t size1  = 13; // at least 7
+  std::size_t size2  = 11; // at least 7
+  float  eps_float  = 1e-5f;
+  double eps_double = 1e-12;
+
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  std::vector<float> ref_float_x(size1); for (std::size_t i=0; i<size1; ++i) ref_float_x[i] = static_cast<float>(i);
+  std::vector<float> ref_float_y(size2); for (std::size_t i=0; i<size2; ++i) ref_float_y[i] = static_cast<float>(size2 - i);
+  std::vector<float> ref_float_A(size1*size2); for (std::size_t i=0; i<size1*size2; ++i) ref_float_A[i] = static_cast<float>(3*i);
+  std::vector<float> ref_float_B(size1*size2); for (std::size_t i=0; i<size1*size2; ++i) ref_float_B[i] = static_cast<float>(2*i);
+
+  std::vector<double> ref_double_x(size1, 1.0); for (std::size_t i=0; i<size1; ++i) ref_double_x[i] = static_cast<double>(i);
+  std::vector<double> ref_double_y(size2, 2.0); for (std::size_t i=0; i<size2; ++i) ref_double_y[i] = static_cast<double>(size2 - i);
+  std::vector<double> ref_double_A(size1*size2, 3.0); for (std::size_t i=0; i<size1*size2; ++i) ref_double_A[i] = static_cast<double>(3*i);
+  std::vector<double> ref_double_B(size1*size2, 4.0); for (std::size_t i=0; i<size1*size2; ++i) ref_double_B[i] = static_cast<double>(2*i);
+
+  // Host setup
+  viennacl::vector<float> host_float_x = viennacl::scalar_vector<float>(size1, 1.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1; ++i) host_float_x[i] = float(i);
+  viennacl::vector<float> host_float_y = viennacl::scalar_vector<float>(size2, 2.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size2; ++i) host_float_y[i] = float(size2 - i);
+  viennacl::vector<float> host_float_A = viennacl::scalar_vector<float>(size1*size2, 3.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_float_A[i] = float(3*i);
+  viennacl::vector<float> host_float_B = viennacl::scalar_vector<float>(size1*size2, 4.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_float_B[i] = float(2*i);
+
+  viennacl::vector<double> host_double_x = viennacl::scalar_vector<double>(size1, 1.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1; ++i) host_double_x[i] = double(i);
+  viennacl::vector<double> host_double_y = viennacl::scalar_vector<double>(size2, 2.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size2; ++i) host_double_y[i] = double(size2 - i);
+  viennacl::vector<double> host_double_A = viennacl::scalar_vector<double>(size1*size2, 3.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_double_A[i] = double(3*i);
+  viennacl::vector<double> host_double_B = viennacl::scalar_vector<double>(size1*size2, 4.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_double_B[i] = double(2*i);
+
+  // CUDA setup
+#ifdef VIENNACL_WITH_CUDA
+  viennacl::vector<float> cuda_float_x = viennacl::scalar_vector<float>(size1, 1.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1; ++i) cuda_float_x[i] = float(i);
+  viennacl::vector<float> cuda_float_y = viennacl::scalar_vector<float>(size2, 2.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size2; ++i) cuda_float_y[i] = float(size2 - i);
+  viennacl::vector<float> cuda_float_A = viennacl::scalar_vector<float>(size1*size2, 3.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_float_A[i] = float(3*i);
+  viennacl::vector<float> cuda_float_B = viennacl::scalar_vector<float>(size1*size2, 4.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_float_B[i] = float(2*i);
+
+  viennacl::vector<double> cuda_double_x = viennacl::scalar_vector<double>(size1, 1.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1; ++i) cuda_double_x[i] = double(i);
+  viennacl::vector<double> cuda_double_y = viennacl::scalar_vector<double>(size2, 2.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size2; ++i) cuda_double_y[i] = double(size2 - i);
+  viennacl::vector<double> cuda_double_A = viennacl::scalar_vector<double>(size1*size2, 3.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_double_A[i] = double(3*i);
+  viennacl::vector<double> cuda_double_B = viennacl::scalar_vector<double>(size1*size2, 4.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_double_B[i] = double(2*i);
+#endif
+
+  // OpenCL setup
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLInt context_id = 0;
+  viennacl::vector<float> opencl_float_x = viennacl::scalar_vector<float>(size1, 1.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size1; ++i) opencl_float_x[i] = float(i);
+  viennacl::vector<float> opencl_float_y = viennacl::scalar_vector<float>(size2, 2.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size2; ++i) opencl_float_y[i] = float(size2 - i);
+  viennacl::vector<float> opencl_float_A = viennacl::scalar_vector<float>(size1*size2, 3.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size1*size2; ++i) opencl_float_A[i] = float(3*i);
+  viennacl::vector<float> opencl_float_B = viennacl::scalar_vector<float>(size1*size2, 4.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size1*size2; ++i) opencl_float_B[i] = float(2*i);
+
+  viennacl::vector<double> *opencl_double_x = NULL;
+  viennacl::vector<double> *opencl_double_y = NULL;
+  viennacl::vector<double> *opencl_double_A = NULL;
+  viennacl::vector<double> *opencl_double_B = NULL;
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    opencl_double_x = new viennacl::vector<double>(viennacl::scalar_vector<double>(size1, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size1; ++i) (*opencl_double_x)[i] = double(i);
+    opencl_double_y = new viennacl::vector<double>(viennacl::scalar_vector<double>(size2, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size2; ++i) (*opencl_double_y)[i] = double(size2 - i);
+    opencl_double_A = new viennacl::vector<double>(viennacl::scalar_vector<double>(size1*size2, 3.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size1*size2; ++i) (*opencl_double_A)[i] = double(3*i);
+    opencl_double_B = new viennacl::vector<double>(viennacl::scalar_vector<double>(size1*size2, 4.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size1*size2; ++i) (*opencl_double_B)[i] = double(2*i);
+  }
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+#endif
+
+  // consistency checks:
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  check(ref_float_A, host_float_A, eps_float);
+  check(ref_float_B, host_float_B, eps_float);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+  check(ref_double_A, host_double_A, eps_double);
+  check(ref_double_B, host_double_B, eps_double);
+#ifdef VIENNACL_WITH_CUDA
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  check(ref_float_A, cuda_float_A, eps_float);
+  check(ref_float_B, cuda_float_B, eps_float);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+  check(ref_double_A, cuda_double_A, eps_double);
+  check(ref_double_B, cuda_double_B, eps_double);
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  check(ref_float_A, opencl_float_A, eps_float);
+  check(ref_float_B, opencl_float_B, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+    check(ref_double_A, *opencl_double_A, eps_double);
+    check(ref_double_B, *opencl_double_B, eps_double);
+  }
+#endif
+
+  // GEMV
+  std::cout << std::endl << "-- Testing xGEMV...";
+  for (std::size_t i=0; i<size1/3; ++i)
+  {
+    ref_float_x[i * 2 + 1] *= 0.1234f;
+    ref_double_x[i * 2 + 1] *= 0.1234;
+    for (std::size_t j=0; j<size2/4; ++j)
+    {
+      ref_float_x[i * 2 + 1]  += 3.1415f * ref_float_A[(2*i+2) * size2 + 3 * j + 1] * ref_float_y[j * 3 + 1];
+      ref_double_x[i * 2 + 1] += 3.1415  * ref_double_A[(2*i+2) * size2 + 3 * j + 1] * ref_double_y[j * 3 + 1];
+    }
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415f, viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 3,
+                    0.1234f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2);
+  check(ref_float_x, host_float_x, eps_float);
+  ViennaCLHostDgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415, viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 3,
+                    0.1234,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2);
+  check(ref_double_x, host_double_x, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415f, viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_A), 2, 1, 2, 3, size2,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 3,
+                    0.1234f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2);
+  check(ref_float_x, cuda_float_x, eps_float);
+  ViennaCLCUDADgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415, viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_A), 2, 1, 2, 3, size2,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 3,
+                    0.1234,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2);
+  check(ref_double_x, cuda_double_x, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSgemv(my_backend,
+                      ViennaCLRowMajor, ViennaCLNoTrans,
+                      ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415f, viennacl::traits::opencl_handle(opencl_float_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                      viennacl::traits::opencl_handle(opencl_float_y), 1, 3,
+                      0.1234f,
+                      viennacl::traits::opencl_handle(opencl_float_x), 1, 2);
+  check(ref_float_x, opencl_float_x, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDgemv(my_backend,
+                        ViennaCLRowMajor, ViennaCLNoTrans,
+                        ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415, viennacl::traits::opencl_handle(*opencl_double_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                        viennacl::traits::opencl_handle(*opencl_double_y), 1, 3,
+                        0.1234,
+                        viennacl::traits::opencl_handle(*opencl_double_x), 1, 2);
+    check(ref_double_x, *opencl_double_x, eps_double);
+  }
+#endif
+
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  delete opencl_double_x;
+  delete opencl_double_y;
+  delete opencl_double_A;
+  delete opencl_double_B;
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/libviennacl_blas2.cu b/tests/src/libviennacl_blas2.cu
new file mode 100644
index 0000000..7078a3f
--- /dev/null
+++ b/tests/src/libviennacl_blas2.cu
@@ -0,0 +1,265 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*
+*   Testing the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "viennacl/vector.hpp"
+
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)  // relative difference of two scalars (0 when bitwise equal)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));  // normalized by the larger magnitude; signed result
+   return 0;
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(std::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)  // inf-norm of element-wise relative differences between host reference and device vector
+{
+   std::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();  // wait for pending device work before reading back
+   viennacl::copy(vcl_vec, v2_cpu);  // device -> host transfer
+
+   ScalarType inf_norm = 0;
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );  // relative error in place
+      else
+         v2_cpu[i] = 0.0;  // both entries zero -> no error
+
+      if (v2_cpu[i] > inf_norm)
+        inf_norm = v2_cpu[i];
+   }
+
+   return inf_norm;
+}
+
+template <typename T, typename U, typename EpsilonT>
+void check(T const & t, U const & u, EpsilonT eps)  // compare via diff(); abort the whole test on failure
+{
+  EpsilonT rel_error = diff(t,u);
+  if (rel_error > eps)
+  {
+    std::cerr << "Relative error: " << rel_error << std::endl;
+    std::cerr << "Aborting!" << std::endl;
+    exit(EXIT_FAILURE);  // hard exit: subsequent checks would be meaningless
+  }
+  std::cout << "SUCCESS ";
+}
+
+int main()
+{
+  std::size_t size1  = 13; // at least 7
+  std::size_t size2  = 11; // at least 7
+  float  eps_float  = 1e-5f;
+  double eps_double = 1e-12;
+
+  ViennaCLBackend my_backend;  // opaque handle into the BLAS-like shared library
+  ViennaCLBackendCreate(&my_backend);
+
+  std::vector<float> ref_float_x(size1); for (std::size_t i=0; i<size1; ++i) ref_float_x[i] = static_cast<float>(i);  // reference data, computed on the host
+  std::vector<float> ref_float_y(size2); for (std::size_t i=0; i<size2; ++i) ref_float_y[i] = static_cast<float>(size2 - i);
+  std::vector<float> ref_float_A(size1*size2); for (std::size_t i=0; i<size1*size2; ++i) ref_float_A[i] = static_cast<float>(3*i);
+  std::vector<float> ref_float_B(size1*size2); for (std::size_t i=0; i<size1*size2; ++i) ref_float_B[i] = static_cast<float>(2*i);
+
+  std::vector<double> ref_double_x(size1, 1.0); for (std::size_t i=0; i<size1; ++i) ref_double_x[i] = static_cast<double>(i);
+  std::vector<double> ref_double_y(size2, 2.0); for (std::size_t i=0; i<size2; ++i) ref_double_y[i] = static_cast<double>(size2 - i);
+  std::vector<double> ref_double_A(size1*size2, 3.0); for (std::size_t i=0; i<size1*size2; ++i) ref_double_A[i] = static_cast<double>(3*i);
+  std::vector<double> ref_double_B(size1*size2, 4.0); for (std::size_t i=0; i<size1*size2; ++i) ref_double_B[i] = static_cast<double>(2*i);
+
+  // Host setup
+  viennacl::vector<float> host_float_x = viennacl::scalar_vector<float>(size1, 1.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1; ++i) host_float_x[i] = float(i);
+  viennacl::vector<float> host_float_y = viennacl::scalar_vector<float>(size2, 2.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size2; ++i) host_float_y[i] = float(size2 - i);
+  viennacl::vector<float> host_float_A = viennacl::scalar_vector<float>(size1*size2, 3.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_float_A[i] = float(3*i);
+  viennacl::vector<float> host_float_B = viennacl::scalar_vector<float>(size1*size2, 4.0f, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_float_B[i] = float(2*i);
+
+  viennacl::vector<double> host_double_x = viennacl::scalar_vector<double>(size1, 1.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1; ++i) host_double_x[i] = double(i);
+  viennacl::vector<double> host_double_y = viennacl::scalar_vector<double>(size2, 2.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size2; ++i) host_double_y[i] = double(size2 - i);
+  viennacl::vector<double> host_double_A = viennacl::scalar_vector<double>(size1*size2, 3.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_double_A[i] = double(3*i);
+  viennacl::vector<double> host_double_B = viennacl::scalar_vector<double>(size1*size2, 4.0, viennacl::context(viennacl::MAIN_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) host_double_B[i] = double(2*i);
+
+  // CUDA setup
+#ifdef VIENNACL_WITH_CUDA
+  viennacl::vector<float> cuda_float_x = viennacl::scalar_vector<float>(size1, 1.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1; ++i) cuda_float_x[i] = float(i);
+  viennacl::vector<float> cuda_float_y = viennacl::scalar_vector<float>(size2, 2.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size2; ++i) cuda_float_y[i] = float(size2 - i);
+  viennacl::vector<float> cuda_float_A = viennacl::scalar_vector<float>(size1*size2, 3.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_float_A[i] = float(3*i);
+  viennacl::vector<float> cuda_float_B = viennacl::scalar_vector<float>(size1*size2, 4.0f, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_float_B[i] = float(2*i);
+
+  viennacl::vector<double> cuda_double_x = viennacl::scalar_vector<double>(size1, 1.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1; ++i) cuda_double_x[i] = double(i);
+  viennacl::vector<double> cuda_double_y = viennacl::scalar_vector<double>(size2, 2.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size2; ++i) cuda_double_y[i] = double(size2 - i);
+  viennacl::vector<double> cuda_double_A = viennacl::scalar_vector<double>(size1*size2, 3.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_double_A[i] = double(3*i);
+  viennacl::vector<double> cuda_double_B = viennacl::scalar_vector<double>(size1*size2, 4.0, viennacl::context(viennacl::CUDA_MEMORY)); for (std::size_t i=0; i<size1*size2; ++i) cuda_double_B[i] = double(2*i);
+#endif
+
+  // OpenCL setup
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLInt context_id = 0;
+  viennacl::vector<float> opencl_float_x = viennacl::scalar_vector<float>(size1, 1.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size1; ++i) opencl_float_x[i] = float(i);
+  viennacl::vector<float> opencl_float_y = viennacl::scalar_vector<float>(size2, 2.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size2; ++i) opencl_float_y[i] = float(size2 - i);
+  viennacl::vector<float> opencl_float_A = viennacl::scalar_vector<float>(size1*size2, 3.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size1*size2; ++i) opencl_float_A[i] = float(3*i);
+  viennacl::vector<float> opencl_float_B = viennacl::scalar_vector<float>(size1*size2, 4.0f, viennacl::context(viennacl::ocl::get_context(context_id))); for (std::size_t i=0; i<size1*size2; ++i) opencl_float_B[i] = float(2*i);
+
+  viennacl::vector<double> *opencl_double_x = NULL;  // heap-allocated only when the device supports double
+  viennacl::vector<double> *opencl_double_y = NULL;
+  viennacl::vector<double> *opencl_double_A = NULL;
+  viennacl::vector<double> *opencl_double_B = NULL;
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    opencl_double_x = new viennacl::vector<double>(viennacl::scalar_vector<double>(size1, 1.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size1; ++i) (*opencl_double_x)[i] = double(i);
+    opencl_double_y = new viennacl::vector<double>(viennacl::scalar_vector<double>(size2, 2.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size2; ++i) (*opencl_double_y)[i] = double(size2 - i);
+    opencl_double_A = new viennacl::vector<double>(viennacl::scalar_vector<double>(size1*size2, 3.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size1*size2; ++i) (*opencl_double_A)[i] = double(3*i);
+    opencl_double_B = new viennacl::vector<double>(viennacl::scalar_vector<double>(size1*size2, 4.0, viennacl::context(viennacl::ocl::get_context(context_id)))); for (std::size_t i=0; i<size1*size2; ++i) (*opencl_double_B)[i] = double(2*i);
+  }
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);  // route the backend's OpenCL calls to context 0
+#endif
+
+  // consistency checks:
+  check(ref_float_x, host_float_x, eps_float);
+  check(ref_float_y, host_float_y, eps_float);
+  check(ref_float_A, host_float_A, eps_float);
+  check(ref_float_B, host_float_B, eps_float);
+  check(ref_double_x, host_double_x, eps_double);
+  check(ref_double_y, host_double_y, eps_double);
+  check(ref_double_A, host_double_A, eps_double);
+  check(ref_double_B, host_double_B, eps_double);
+#ifdef VIENNACL_WITH_CUDA
+  check(ref_float_x, cuda_float_x, eps_float);
+  check(ref_float_y, cuda_float_y, eps_float);
+  check(ref_float_A, cuda_float_A, eps_float);
+  check(ref_float_B, cuda_float_B, eps_float);
+  check(ref_double_x, cuda_double_x, eps_double);
+  check(ref_double_y, cuda_double_y, eps_double);
+  check(ref_double_A, cuda_double_A, eps_double);
+  check(ref_double_B, cuda_double_B, eps_double);
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  check(ref_float_x, opencl_float_x, eps_float);
+  check(ref_float_y, opencl_float_y, eps_float);
+  check(ref_float_A, opencl_float_A, eps_float);
+  check(ref_float_B, opencl_float_B, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    check(ref_double_x, *opencl_double_x, eps_double);
+    check(ref_double_y, *opencl_double_y, eps_double);
+    check(ref_double_A, *opencl_double_A, eps_double);
+    check(ref_double_B, *opencl_double_B, eps_double);
+  }
+#endif
+
+  // GEMV
+  std::cout << std::endl << "-- Testing xGEMV...";
+  for (std::size_t i=0; i<size1/3; ++i)  // reference: x_sub = 0.1234*x_sub + 3.1415*A_sub*y_sub on strided/offset subranges
+  {
+    ref_float_x[i * 2 + 1] *= 0.1234f;
+    ref_double_x[i * 2 + 1] *= 0.1234;
+    for (std::size_t j=0; j<size2/4; ++j)
+    {
+      ref_float_x[i * 2 + 1]  += 3.1415f * ref_float_A[(2*i+2) * size2 + 3 * j + 1] * ref_float_y[j * 3 + 1];
+      ref_double_x[i * 2 + 1] += 3.1415  * ref_double_A[(2*i+2) * size2 + 3 * j + 1] * ref_double_y[j * 3 + 1];
+    }
+  }
+
+  std::cout << std::endl << "Host: ";
+  ViennaCLHostSgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415f, viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_A), 2, 1, 2, 3, ViennaCLInt(size2),  // offsets (2,1), strides (2,3), lda = size2 — matches the reference loop above
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_y), 1, 3,
+                    0.1234f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_float_x), 1, 2);
+  check(ref_float_x, host_float_x, eps_float);
+  ViennaCLHostDgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415, viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_y), 1, 3,
+                    0.1234,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_double_x), 1, 2);
+  check(ref_double_x, host_double_x, eps_double);
+
+
+#ifdef VIENNACL_WITH_CUDA
+  std::cout << std::endl << "CUDA: ";
+  ViennaCLCUDASgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415f, viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_A), 2, 1, 2, 3, size2,  // NOTE(review): 'size2' lacks the ViennaCLInt cast used in the host/OpenCL calls — implicit size_t narrowing; confirm intended
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_y), 1, 3,
+                    0.1234f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_float_x), 1, 2);
+  check(ref_float_x, cuda_float_x, eps_float);
+  ViennaCLCUDADgemv(my_backend,
+                    ViennaCLRowMajor, ViennaCLNoTrans,
+                    ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415, viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_A), 2, 1, 2, 3, size2,  // NOTE(review): same missing ViennaCLInt cast as above
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_y), 1, 3,
+                    0.1234,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_double_x), 1, 2);
+  check(ref_double_x, cuda_double_x, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  std::cout << std::endl << "OpenCL: ";
+  ViennaCLOpenCLSgemv(my_backend,
+                      ViennaCLRowMajor, ViennaCLNoTrans,
+                      ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415f, viennacl::traits::opencl_handle(opencl_float_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                      viennacl::traits::opencl_handle(opencl_float_y), 1, 3,
+                      0.1234f,
+                      viennacl::traits::opencl_handle(opencl_float_x), 1, 2);
+  check(ref_float_x, opencl_float_x, eps_float);
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    ViennaCLOpenCLDgemv(my_backend,
+                        ViennaCLRowMajor, ViennaCLNoTrans,
+                        ViennaCLInt(size1/3), ViennaCLInt(size2/4), 3.1415, viennacl::traits::opencl_handle(*opencl_double_A), 2, 1, 2, 3, ViennaCLInt(size2),
+                        viennacl::traits::opencl_handle(*opencl_double_y), 1, 3,
+                        0.1234,
+                        viennacl::traits::opencl_handle(*opencl_double_x), 1, 2);
+    check(ref_double_x, *opencl_double_x, eps_double);
+  }
+#endif
+
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  delete opencl_double_x;  // delete on NULL is a no-op, so safe without double support
+  delete opencl_double_y;
+  delete opencl_double_A;
+  delete opencl_double_B;
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/libviennacl_blas3.cpp b/tests/src/libviennacl_blas3.cpp
new file mode 100644
index 0000000..99c5618
--- /dev/null
+++ b/tests/src/libviennacl_blas3.cpp
@@ -0,0 +1,623 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*
+*   Testing the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "examples/tutorial/Random.hpp"
+
+
+#include "viennacl/vector.hpp"
+
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)  // relative difference of two scalars (0 when bitwise equal)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));  // normalized by the larger magnitude; signed result
+   return 0;
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(std::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)  // inf-norm of element-wise relative differences between host reference and device vector
+{
+   std::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();  // wait for pending device work before reading back
+   viennacl::copy(vcl_vec, v2_cpu);  // device -> host transfer
+
+   ScalarType inf_norm = 0;
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );  // relative error in place
+      else
+         v2_cpu[i] = 0.0;  // both entries zero -> no error
+
+      if (v2_cpu[i] > inf_norm)
+        inf_norm = v2_cpu[i];
+   }
+
+   return inf_norm;
+}
+
+template <typename T, typename U, typename EpsilonT>
+void check(T const & t, U const & u, EpsilonT eps)  // compare via diff(); abort the whole test on failure
+{
+  EpsilonT rel_error = diff(t,u);
+  if (rel_error > eps)
+  {
+    std::cerr << "Relative error: " << rel_error << std::endl;
+    std::cerr << "Aborting!" << std::endl;
+    exit(EXIT_FAILURE);  // hard exit: subsequent checks would be meaningless
+  }
+  std::cout << "SUCCESS ";
+}
+
+
+template <typename T>
+T get_value(std::vector<T> & array, ViennaCLInt i, ViennaCLInt j,  // fetch logical element (i,j) of a strided/offset submatrix stored in a flat array
+            ViennaCLInt start1, ViennaCLInt start2,
+            ViennaCLInt stride1, ViennaCLInt stride2,
+            ViennaCLInt rows, ViennaCLInt cols,
+            ViennaCLOrder order, ViennaCLTranspose trans)
+{
+  // row-major
+  if (order == ViennaCLRowMajor && trans == ViennaCLTrans)
+    return array[(j*stride1 + start1) * cols + (i*stride2 + start2)];  // transposed: swap i/j roles
+  else if (order == ViennaCLRowMajor && trans != ViennaCLTrans)
+    return array[(i*stride1 + start1) * cols + (j*stride2 + start2)];
+
+  // column-major
+  else if (order != ViennaCLRowMajor && trans == ViennaCLTrans)
+    return array[(j*stride1 + start1) + (i*stride2 + start2) * rows];
+  return array[(i*stride1 + start1) + (j*stride2 + start2) * rows];  // column-major, not transposed
+}
+
+
+
+void test_blas(ViennaCLBackend my_backend,  // GEMM test core: one (order_A/B/C, trans_A/B) combination, run on every enabled backend
+               float eps_float, double eps_double,
+               std::vector<float> & C_float, std::vector<double> & C_double,
+               std::vector<float> & A_float, std::vector<double> & A_double,
+               std::vector<float> & B_float, std::vector<double> & B_double,
+               ViennaCLOrder order_C, ViennaCLOrder order_A, ViennaCLOrder order_B,
+               ViennaCLTranspose trans_A, ViennaCLTranspose trans_B,
+               viennacl::vector<float> & host_C_float, viennacl::vector<double> & host_C_double,
+               viennacl::vector<float> & host_A_float, viennacl::vector<double> & host_A_double,
+               viennacl::vector<float> & host_B_float, viennacl::vector<double> & host_B_double
+#ifdef VIENNACL_WITH_CUDA
+               , viennacl::vector<float> & cuda_C_float, viennacl::vector<double> & cuda_C_double
+               , viennacl::vector<float> & cuda_A_float, viennacl::vector<double> & cuda_A_double
+               , viennacl::vector<float> & cuda_B_float, viennacl::vector<double> & cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+               , viennacl::vector<float> & opencl_C_float, viennacl::vector<double> * opencl_C_double
+               , viennacl::vector<float> & opencl_A_float, viennacl::vector<double> * opencl_A_double
+               , viennacl::vector<float> & opencl_B_float, viennacl::vector<double> * opencl_B_double
+#endif
+               )
+{
+  ViennaCLInt C_size1   = 42;  // logical submatrix sizes plus offsets/strides for C, A, B
+  ViennaCLInt C_size2   = 43;
+  ViennaCLInt C_start1  = 10;
+  ViennaCLInt C_start2  = 11;
+  ViennaCLInt C_stride1 = 2;
+  ViennaCLInt C_stride2 = 3;
+  ViennaCLInt C_rows    = C_size1 * C_stride1 + C_start1 + 5;  // physical extents: padded by 5 beyond the last touched index
+  ViennaCLInt C_columns = C_size2 * C_stride2 + C_start2 + 5;
+
+  ViennaCLInt A_size1   = trans_A ? 44 : 42;  // physical sizes swap when A is transposed
+  ViennaCLInt A_size2   = trans_A ? 42 : 44;
+  ViennaCLInt A_start1  = 12;
+  ViennaCLInt A_start2  = 13;
+  ViennaCLInt A_stride1 = 4;
+  ViennaCLInt A_stride2 = 5;
+  ViennaCLInt A_rows    = A_size1 * A_stride1 + A_start1 + 5;
+  ViennaCLInt A_columns = A_size2 * A_stride2 + A_start2 + 5;
+
+  ViennaCLInt B_size1   = trans_B ? 43 : 44;
+  ViennaCLInt B_size2   = trans_B ? 44 : 43;
+  ViennaCLInt B_start1  = 14;
+  ViennaCLInt B_start2  = 15;
+  ViennaCLInt B_stride1 = 6;
+  ViennaCLInt B_stride2 = 7;
+  ViennaCLInt B_rows    = B_size1 * B_stride1 + B_start1 + 5;
+  ViennaCLInt B_columns = B_size2 * B_stride2 + B_start2 + 5;
+
+  // Compute reference:
+  ViennaCLInt size_k = trans_A ? A_size1 : A_size2;  // inner (contraction) dimension
+  for (ViennaCLInt i=0; i<C_size1; ++i)
+    for (ViennaCLInt j=0; j<C_size2; ++j)
+    {
+      float val_float = 0;
+      double val_double = 0;
+      for (ViennaCLInt k=0; k<size_k; ++k)
+      {
+        float  val_A_float  = get_value(A_float,  i, k, A_start1, A_start2, A_stride1, A_stride2, A_rows, A_columns, order_A, trans_A);
+        double val_A_double = get_value(A_double, i, k, A_start1, A_start2, A_stride1, A_stride2, A_rows, A_columns, order_A, trans_A);
+
+        float  val_B_float  = get_value(B_float,  k, j, B_start1, B_start2, B_stride1, B_stride2, B_rows, B_columns, order_B, trans_B);
+        double val_B_double = get_value(B_double, k, j, B_start1, B_start2, B_stride1, B_stride2, B_rows, B_columns, order_B, trans_B);
+
+        val_float  += val_A_float  * val_B_float;
+        val_double += val_A_double * val_B_double;
+      }
+
+      // write result
+      if (order_C == ViennaCLRowMajor)
+      {
+        C_float [(i*C_stride1 + C_start1) * C_columns + (j*C_stride2 + C_start2)] = val_float;
+        C_double[(i*C_stride1 + C_start1) * C_columns + (j*C_stride2 + C_start2)] = val_double;
+      }
+      else
+      {
+        C_float [(i*C_stride1 + C_start1) + (j*C_stride2 + C_start2) * C_rows] = val_float;
+        C_double[(i*C_stride1 + C_start1) + (j*C_stride2 + C_start2) * C_rows] = val_double;
+      }
+    }
+
+  // Run GEMM and compare results:
+  ViennaCLHostSgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_A_float), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,  // leading dimension depends on storage order
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_B_float), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_C_float), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_float, host_C_float, eps_float);
+
+  ViennaCLHostDgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_A_double), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_B_double), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_C_double), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_double, host_C_double, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  ViennaCLCUDASgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_A_float), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_B_float), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_C_float), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_float, cuda_C_float, eps_float);
+
+  ViennaCLCUDADgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_A_double), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_B_double), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_C_double), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_double, cuda_C_double, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLOpenCLSgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0f,
+                    viennacl::traits::opencl_handle(opencl_A_float), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::traits::opencl_handle(opencl_B_float), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0f,
+                    viennacl::traits::opencl_handle(opencl_C_float), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_float, opencl_C_float, eps_float);
+
+  if (opencl_A_double != NULL && opencl_B_double != NULL && opencl_C_double != NULL)  // only when the device supports double (pointers non-NULL)
+  {
+    ViennaCLOpenCLDgemm(my_backend,
+                      order_A, trans_A, order_B, trans_B, order_C,
+                      C_size1, C_size2, size_k,
+                      1.0,
+                      viennacl::traits::opencl_handle(*opencl_A_double), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                      viennacl::traits::opencl_handle(*opencl_B_double), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                      0.0,
+                      viennacl::traits::opencl_handle(*opencl_C_double), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+    check(C_double, *opencl_C_double, eps_double);
+  }
+#endif
+
+  std::cout << std::endl;
+}
+
+
+void test_blas(ViennaCLBackend my_backend,  // dispatch: run the GEMM core for all four (trans_A, trans_B) combinations under a fixed storage-order triple
+               float eps_float, double eps_double,
+               std::vector<float> & C_float, std::vector<double> & C_double,
+               std::vector<float> & A_float, std::vector<double> & A_double,
+               std::vector<float> & B_float, std::vector<double> & B_double,
+               ViennaCLOrder order_C, ViennaCLOrder order_A, ViennaCLOrder order_B,
+               viennacl::vector<float> & host_C_float, viennacl::vector<double> & host_C_double,
+               viennacl::vector<float> & host_A_float, viennacl::vector<double> & host_A_double,
+               viennacl::vector<float> & host_B_float, viennacl::vector<double> & host_B_double
+#ifdef VIENNACL_WITH_CUDA
+               , viennacl::vector<float> & cuda_C_float, viennacl::vector<double> & cuda_C_double
+               , viennacl::vector<float> & cuda_A_float, viennacl::vector<double> & cuda_A_double
+               , viennacl::vector<float> & cuda_B_float, viennacl::vector<double> & cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+               , viennacl::vector<float> & opencl_C_float, viennacl::vector<double> * opencl_C_double
+               , viennacl::vector<float> & opencl_A_float, viennacl::vector<double> * opencl_A_double
+               , viennacl::vector<float> & opencl_B_float, viennacl::vector<double> * opencl_B_double
+#endif
+               )
+{
+  std::cout << "    -> trans-trans: ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLTrans, ViennaCLTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "    -> trans-no:    ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLTrans, ViennaCLNoTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "    -> no-trans:    ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLNoTrans, ViennaCLTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "    -> no-no:       ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLNoTrans, ViennaCLNoTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+}
+
// Middle-level dispatcher: runs the transposition tests (overload above) for
// every combination of row-/column-major storage of C, A and B — eight layout
// combinations in total. The raw std::vectors hold the host reference data;
// the viennacl::vectors hold the same data in host, CUDA and OpenCL memory.
// NOTE(review): the OpenCL double-precision vectors are passed by pointer and
// may be NULL on devices without double support — presumably NULL-checked in
// the innermost overload before use; verify against the callee.
+void test_blas(ViennaCLBackend my_backend,
+               float eps_float, double eps_double,
+               std::vector<float> & C_float, std::vector<double> & C_double,
+               std::vector<float> & A_float, std::vector<double> & A_double,
+               std::vector<float> & B_float, std::vector<double> & B_double,
+               viennacl::vector<float> & host_C_float, viennacl::vector<double> & host_C_double,
+               viennacl::vector<float> & host_A_float, viennacl::vector<double> & host_A_double,
+               viennacl::vector<float> & host_B_float, viennacl::vector<double> & host_B_double
+#ifdef VIENNACL_WITH_CUDA
+               , viennacl::vector<float> & cuda_C_float, viennacl::vector<double> & cuda_C_double
+               , viennacl::vector<float> & cuda_A_float, viennacl::vector<double> & cuda_A_double
+               , viennacl::vector<float> & cuda_B_float, viennacl::vector<double> & cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+               , viennacl::vector<float> & opencl_C_float, viennacl::vector<double> * opencl_C_double
+               , viennacl::vector<float> & opencl_A_float, viennacl::vector<double> * opencl_A_double
+               , viennacl::vector<float> & opencl_B_float, viennacl::vector<double> * opencl_B_double
+#endif
+               )
+{
// Each stanza below is identical except for the three ViennaCLOrder arguments.
+  std::cout << "  -> C: row, A: row, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLRowMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: row, A: row, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLRowMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: row, A: col, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLColumnMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: row, A: col, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLColumnMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+
+  std::cout << "  -> C: col, A: row, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLRowMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: col, A: row, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLRowMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: col, A: col, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLColumnMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: col, A: col, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLColumnMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+}
+
+
+
+
// Test driver: allocates three flat buffers of 500*500 entries per scalar
// type, mirrors them into host (and, when enabled, CUDA/OpenCL) memory,
// sanity-checks the transfers, and then runs the full GEMM test matrix.
+int main()
+{
+  ViennaCLInt size  = 500*500;
// Tolerances used by check(); float is far looser than double.
+  float  eps_float  = 1e-5f;
+  double eps_double = 1e-12;
+
+  std::vector<float> C_float(size);
+  std::vector<float> A_float(size);
+  std::vector<float> B_float(size);
+
+  std::vector<double> C_double(size);
+  std::vector<double> A_double(size);
+  std::vector<double> B_double(size);
+
+  // fill with random data:
+
// Entries are kept strictly positive and away from zero so relative-error
// comparisons in check() are well conditioned.
+  for (ViennaCLInt i = 0; i < size; ++i)
+  {
+    C_float[i] = 0.5f + 0.1f * random<float>();
+    A_float[i] = 0.5f + 0.1f * random<float>();
+    B_float[i] = 0.5f + 0.1f * random<float>();
+
+    C_double[i] = 0.5 + 0.2 * random<double>();
+    A_double[i] = 0.5 + 0.2 * random<double>();
+    B_double[i] = 0.5 + 0.2 * random<double>();
+  }
+
+
+  // Host setup
// The backend handle is created here and destroyed at the end of main().
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  viennacl::vector<float> host_C_float(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(C_float, host_C_float);
+  viennacl::vector<float> host_A_float(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(A_float, host_A_float);
+  viennacl::vector<float> host_B_float(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(B_float, host_B_float);
+
+  viennacl::vector<double> host_C_double(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(C_double, host_C_double);
+  viennacl::vector<double> host_A_double(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(A_double, host_A_double);
+  viennacl::vector<double> host_B_double(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(B_double, host_B_double);
+
+  // CUDA setup
+#ifdef VIENNACL_WITH_CUDA
+  viennacl::vector<float> cuda_C_float(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(C_float, cuda_C_float);
+  viennacl::vector<float> cuda_A_float(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(A_float, cuda_A_float);
+  viennacl::vector<float> cuda_B_float(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(B_float, cuda_B_float);
+
+  viennacl::vector<double> cuda_C_double(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(C_double, cuda_C_double);
+  viennacl::vector<double> cuda_A_double(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(A_double, cuda_A_double);
+  viennacl::vector<double> cuda_B_double(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(B_double, cuda_B_double);
+#endif
+
+  // OpenCL setup
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLInt context_id = 0;
+  viennacl::vector<float> opencl_C_float(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(C_float, opencl_C_float);
+  viennacl::vector<float> opencl_A_float(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(A_float, opencl_A_float);
+  viennacl::vector<float> opencl_B_float(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(B_float, opencl_B_float);
+
// Double-precision OpenCL buffers are heap-allocated only when the device
// supports doubles; they stay NULL otherwise and callees must check for that.
+  viennacl::vector<double> *opencl_C_double = NULL;
+  viennacl::vector<double> *opencl_A_double = NULL;
+  viennacl::vector<double> *opencl_B_double = NULL;
+
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    opencl_C_double = new viennacl::vector<double>(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(C_double, *opencl_C_double);
+    opencl_A_double = new viennacl::vector<double>(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(A_double, *opencl_A_double);
+    opencl_B_double = new viennacl::vector<double>(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(B_double, *opencl_B_double);
+  }
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+#endif
+
+  // consistency checks:
// Verifies the host<->device copies above before any GEMM is run.
+  check(C_float, host_C_float, eps_float);
+  check(A_float, host_A_float, eps_float);
+  check(B_float, host_B_float, eps_float);
+
+  check(C_double, host_C_double, eps_double);
+  check(A_double, host_A_double, eps_double);
+  check(B_double, host_B_double, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  check(C_float, cuda_C_float, eps_float);
+  check(A_float, cuda_A_float, eps_float);
+  check(B_float, cuda_B_float, eps_float);
+
+  check(C_double, cuda_C_double, eps_double);
+  check(A_double, cuda_A_double, eps_double);
+  check(B_double, cuda_B_double, eps_double);
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  check(C_float, opencl_C_float, eps_float);
+  check(A_float, opencl_A_float, eps_float);
+  check(B_float, opencl_B_float, eps_float);
+
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    check(C_double, *opencl_C_double, eps_double);
+    check(A_double, *opencl_A_double, eps_double);
+    check(B_double, *opencl_B_double, eps_double);
+  }
+#endif
+
+  std::cout << std::endl;
+
// Kick off the full test matrix (all layouts x all transpositions x all backends).
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double,
+            A_float, A_double,
+            B_float, B_double,
+            host_C_float, host_C_double,
+            host_A_float, host_A_double,
+            host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double
+            , cuda_A_float, cuda_A_double
+            , cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double
+            , opencl_A_float, opencl_A_double
+            , opencl_B_float, opencl_B_double
+#endif
+            );
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  //cleanup
// Guard mirrors the allocation condition above (delete on NULL would also be safe).
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    delete opencl_C_double;
+    delete opencl_A_double;
+    delete opencl_B_double;
+  }
+
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/libviennacl_blas3.cu b/tests/src/libviennacl_blas3.cu
new file mode 100644
index 0000000..99c5618
--- /dev/null
+++ b/tests/src/libviennacl_blas3.cu
@@ -0,0 +1,623 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/*
+*
+*   Testing the ViennaCL BLAS-like shared library
+*
+*/
+
+
+// include necessary system headers
+#include <iostream>
+#include <vector>
+
+// Some helper functions for this tutorial:
+#include "viennacl.hpp"
+
+#include "examples/tutorial/Random.hpp"
+
+
+#include "viennacl/vector.hpp"
+
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(std::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   std::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   ScalarType inf_norm = 0;
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > inf_norm)
+        inf_norm = v2_cpu[i];
+   }
+
+   return inf_norm;
+}
+
/** @brief Compares t and u via diff(); prints "SUCCESS " if the relative
 *         error stays within eps, otherwise reports the error and terminates
 *         the process with EXIT_FAILURE. */
template <typename T, typename U, typename EpsilonT>
void check(T const & t, U const & u, EpsilonT eps)
{
  EpsilonT const rel_error = diff(t, u);
  if (!(rel_error > eps))
  {
    std::cout << "SUCCESS ";
    return;
  }
  std::cerr << "Relative error: " << rel_error << std::endl;
  std::cerr << "Aborting!" << std::endl;
  exit(EXIT_FAILURE);
}
+
+
+template <typename T>
+T get_value(std::vector<T> & array, ViennaCLInt i, ViennaCLInt j,
+            ViennaCLInt start1, ViennaCLInt start2,
+            ViennaCLInt stride1, ViennaCLInt stride2,
+            ViennaCLInt rows, ViennaCLInt cols,
+            ViennaCLOrder order, ViennaCLTranspose trans)
+{
+  // row-major
+  if (order == ViennaCLRowMajor && trans == ViennaCLTrans)
+    return array[(j*stride1 + start1) * cols + (i*stride2 + start2)];
+  else if (order == ViennaCLRowMajor && trans != ViennaCLTrans)
+    return array[(i*stride1 + start1) * cols + (j*stride2 + start2)];
+
+  // column-major
+  else if (order != ViennaCLRowMajor && trans == ViennaCLTrans)
+    return array[(j*stride1 + start1) + (i*stride2 + start2) * rows];
+  return array[(i*stride1 + start1) + (j*stride2 + start2) * rows];
+}
+
+
+
// Innermost test kernel: computes a reference C = A * B on strided submatrix
// views (layout/transposition resolved through get_value), then runs the
// libviennacl GEMM entry point for every enabled backend and compares each
// result against the reference via check().
// NOTE(review): the OpenCL double vectors arrive as pointers because they are
// only allocated on devices with double support; they are NULL-checked below.
+void test_blas(ViennaCLBackend my_backend,
+               float eps_float, double eps_double,
+               std::vector<float> & C_float, std::vector<double> & C_double,
+               std::vector<float> & A_float, std::vector<double> & A_double,
+               std::vector<float> & B_float, std::vector<double> & B_double,
+               ViennaCLOrder order_C, ViennaCLOrder order_A, ViennaCLOrder order_B,
+               ViennaCLTranspose trans_A, ViennaCLTranspose trans_B,
+               viennacl::vector<float> & host_C_float, viennacl::vector<double> & host_C_double,
+               viennacl::vector<float> & host_A_float, viennacl::vector<double> & host_A_double,
+               viennacl::vector<float> & host_B_float, viennacl::vector<double> & host_B_double
+#ifdef VIENNACL_WITH_CUDA
+               , viennacl::vector<float> & cuda_C_float, viennacl::vector<double> & cuda_C_double
+               , viennacl::vector<float> & cuda_A_float, viennacl::vector<double> & cuda_A_double
+               , viennacl::vector<float> & cuda_B_float, viennacl::vector<double> & cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+               , viennacl::vector<float> & opencl_C_float, viennacl::vector<double> * opencl_C_double
+               , viennacl::vector<float> & opencl_A_float, viennacl::vector<double> * opencl_A_double
+               , viennacl::vector<float> & opencl_B_float, viennacl::vector<double> * opencl_B_double
+#endif
+               )
+{
// Each operand is a strided window inside a larger padded buffer; *_rows and
// *_columns describe the physical buffer (with 5 elements of slack), while
// *_size1/*_size2 describe the logical submatrix the GEMM operates on.
+  ViennaCLInt C_size1   = 42;
+  ViennaCLInt C_size2   = 43;
+  ViennaCLInt C_start1  = 10;
+  ViennaCLInt C_start2  = 11;
+  ViennaCLInt C_stride1 = 2;
+  ViennaCLInt C_stride2 = 3;
+  ViennaCLInt C_rows    = C_size1 * C_stride1 + C_start1 + 5;
+  ViennaCLInt C_columns = C_size2 * C_stride2 + C_start2 + 5;
+
// A's physical sizes swap when transposed so its logical shape stays 42 x 44.
+  ViennaCLInt A_size1   = trans_A ? 44 : 42;
+  ViennaCLInt A_size2   = trans_A ? 42 : 44;
+  ViennaCLInt A_start1  = 12;
+  ViennaCLInt A_start2  = 13;
+  ViennaCLInt A_stride1 = 4;
+  ViennaCLInt A_stride2 = 5;
+  ViennaCLInt A_rows    = A_size1 * A_stride1 + A_start1 + 5;
+  ViennaCLInt A_columns = A_size2 * A_stride2 + A_start2 + 5;
+
// Likewise B's logical shape stays 44 x 43 regardless of transposition.
+  ViennaCLInt B_size1   = trans_B ? 43 : 44;
+  ViennaCLInt B_size2   = trans_B ? 44 : 43;
+  ViennaCLInt B_start1  = 14;
+  ViennaCLInt B_start2  = 15;
+  ViennaCLInt B_stride1 = 6;
+  ViennaCLInt B_stride2 = 7;
+  ViennaCLInt B_rows    = B_size1 * B_stride1 + B_start1 + 5;
+  ViennaCLInt B_columns = B_size2 * B_stride2 + B_start2 + 5;
+
+  // Compute reference:
// Naive triple loop; get_value resolves layout and transposition per element.
+  ViennaCLInt size_k = trans_A ? A_size1 : A_size2;
+  for (ViennaCLInt i=0; i<C_size1; ++i)
+    for (ViennaCLInt j=0; j<C_size2; ++j)
+    {
+      float val_float = 0;
+      double val_double = 0;
+      for (ViennaCLInt k=0; k<size_k; ++k)
+      {
+        float  val_A_float  = get_value(A_float,  i, k, A_start1, A_start2, A_stride1, A_stride2, A_rows, A_columns, order_A, trans_A);
+        double val_A_double = get_value(A_double, i, k, A_start1, A_start2, A_stride1, A_stride2, A_rows, A_columns, order_A, trans_A);
+
+        float  val_B_float  = get_value(B_float,  k, j, B_start1, B_start2, B_stride1, B_stride2, B_rows, B_columns, order_B, trans_B);
+        double val_B_double = get_value(B_double, k, j, B_start1, B_start2, B_stride1, B_stride2, B_rows, B_columns, order_B, trans_B);
+
+        val_float  += val_A_float  * val_B_float;
+        val_double += val_A_double * val_B_double;
+      }
+
+      // write result
+      if (order_C == ViennaCLRowMajor)
+      {
+        C_float [(i*C_stride1 + C_start1) * C_columns + (j*C_stride2 + C_start2)] = val_float;
+        C_double[(i*C_stride1 + C_start1) * C_columns + (j*C_stride2 + C_start2)] = val_double;
+      }
+      else
+      {
+        C_float [(i*C_stride1 + C_start1) + (j*C_stride2 + C_start2) * C_rows] = val_float;
+        C_double[(i*C_stride1 + C_start1) + (j*C_stride2 + C_start2) * C_rows] = val_double;
+      }
+    }
+
+  // Run GEMM and compare results:
// alpha = 1, beta = 0, i.e. C is fully overwritten. The last argument of each
// operand triple is the leading dimension: the physical column count for
// row-major buffers, the physical row count for column-major buffers.
+  ViennaCLHostSgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_A_float), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_B_float), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0f,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<float>(host_C_float), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_float, host_C_float, eps_float);
+
+  ViennaCLHostDgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_A_double), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_B_double), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0,
+                    viennacl::linalg::host_based::detail::extract_raw_pointer<double>(host_C_double), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_double, host_C_double, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  ViennaCLCUDASgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_A_float), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_B_float), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0f,
+                    viennacl::linalg::cuda::detail::cuda_arg<float>(cuda_C_float), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_float, cuda_C_float, eps_float);
+
+  ViennaCLCUDADgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_A_double), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_B_double), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0,
+                    viennacl::linalg::cuda::detail::cuda_arg<double>(cuda_C_double), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_double, cuda_C_double, eps_double);
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLOpenCLSgemm(my_backend,
+                    order_A, trans_A, order_B, trans_B, order_C,
+                    C_size1, C_size2, size_k,
+                    1.0f,
+                    viennacl::traits::opencl_handle(opencl_A_float), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                    viennacl::traits::opencl_handle(opencl_B_float), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                    0.0f,
+                    viennacl::traits::opencl_handle(opencl_C_float), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+  check(C_float, opencl_C_float, eps_float);
+
// Double precision only runs when the buffers were actually allocated
// (i.e. the OpenCL device reported double support).
+  if (opencl_A_double != NULL && opencl_B_double != NULL && opencl_C_double != NULL)
+  {
+    ViennaCLOpenCLDgemm(my_backend,
+                      order_A, trans_A, order_B, trans_B, order_C,
+                      C_size1, C_size2, size_k,
+                      1.0,
+                      viennacl::traits::opencl_handle(*opencl_A_double), A_start1, A_start2, A_stride1, A_stride2, (order_A == ViennaCLRowMajor) ? A_columns : A_rows,
+                      viennacl::traits::opencl_handle(*opencl_B_double), B_start1, B_start2, B_stride1, B_stride2, (order_B == ViennaCLRowMajor) ? B_columns : B_rows,
+                      0.0,
+                      viennacl::traits::opencl_handle(*opencl_C_double), C_start1, C_start2, C_stride1, C_stride2, (order_C == ViennaCLRowMajor) ? C_columns : C_rows);
+    check(C_double, *opencl_C_double, eps_double);
+  }
+#endif
+
+  std::cout << std::endl;
+}
+
+
// Transposition dispatcher: for a fixed storage-order combination of C, A
// and B, runs the innermost GEMM test for each of the four (trans_A, trans_B)
// combinations. The labels printed here are completed by the per-backend
// "SUCCESS" output of the innermost overload (note: no std::endl on purpose).
+void test_blas(ViennaCLBackend my_backend,
+               float eps_float, double eps_double,
+               std::vector<float> & C_float, std::vector<double> & C_double,
+               std::vector<float> & A_float, std::vector<double> & A_double,
+               std::vector<float> & B_float, std::vector<double> & B_double,
+               ViennaCLOrder order_C, ViennaCLOrder order_A, ViennaCLOrder order_B,
+               viennacl::vector<float> & host_C_float, viennacl::vector<double> & host_C_double,
+               viennacl::vector<float> & host_A_float, viennacl::vector<double> & host_A_double,
+               viennacl::vector<float> & host_B_float, viennacl::vector<double> & host_B_double
+#ifdef VIENNACL_WITH_CUDA
+               , viennacl::vector<float> & cuda_C_float, viennacl::vector<double> & cuda_C_double
+               , viennacl::vector<float> & cuda_A_float, viennacl::vector<double> & cuda_A_double
+               , viennacl::vector<float> & cuda_B_float, viennacl::vector<double> & cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+               , viennacl::vector<float> & opencl_C_float, viennacl::vector<double> * opencl_C_double
+               , viennacl::vector<float> & opencl_A_float, viennacl::vector<double> * opencl_A_double
+               , viennacl::vector<float> & opencl_B_float, viennacl::vector<double> * opencl_B_double
+#endif
+               )
+{
// Each stanza below differs only in the (trans_A, trans_B) arguments.
+  std::cout << "    -> trans-trans: ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLTrans, ViennaCLTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "    -> trans-no:    ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLTrans, ViennaCLNoTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "    -> no-trans:    ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLNoTrans, ViennaCLTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "    -> no-no:       ";
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            order_C, order_A, order_B,
+            ViennaCLNoTrans, ViennaCLNoTrans,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+}
+
+void test_blas(ViennaCLBackend my_backend,
+               float eps_float, double eps_double,
+               std::vector<float> & C_float, std::vector<double> & C_double,
+               std::vector<float> & A_float, std::vector<double> & A_double,
+               std::vector<float> & B_float, std::vector<double> & B_double,
+               viennacl::vector<float> & host_C_float, viennacl::vector<double> & host_C_double,
+               viennacl::vector<float> & host_A_float, viennacl::vector<double> & host_A_double,
+               viennacl::vector<float> & host_B_float, viennacl::vector<double> & host_B_double
+#ifdef VIENNACL_WITH_CUDA
+               , viennacl::vector<float> & cuda_C_float, viennacl::vector<double> & cuda_C_double
+               , viennacl::vector<float> & cuda_A_float, viennacl::vector<double> & cuda_A_double
+               , viennacl::vector<float> & cuda_B_float, viennacl::vector<double> & cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+               , viennacl::vector<float> & opencl_C_float, viennacl::vector<double> * opencl_C_double
+               , viennacl::vector<float> & opencl_A_float, viennacl::vector<double> * opencl_A_double
+               , viennacl::vector<float> & opencl_B_float, viennacl::vector<double> * opencl_B_double
+#endif
+               )
+{
+  std::cout << "  -> C: row, A: row, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLRowMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: row, A: row, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLRowMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: row, A: col, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLColumnMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: row, A: col, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLRowMajor, ViennaCLColumnMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+
+  std::cout << "  -> C: col, A: row, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLRowMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: col, A: row, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLRowMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: col, A: col, B: row" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLColumnMajor, ViennaCLRowMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+  std::cout << "  -> C: col, A: col, B: col" << std::endl;
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double, A_float, A_double, B_float, B_double,
+            ViennaCLColumnMajor, ViennaCLColumnMajor, ViennaCLColumnMajor,
+            host_C_float, host_C_double, host_A_float, host_A_double, host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double, cuda_A_float, cuda_A_double, cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double, opencl_A_float, opencl_A_double, opencl_B_float, opencl_B_double
+#endif
+            );
+
+}
+
+
+
+
+int main()
+{
+  ViennaCLInt size  = 500*500;
+  float  eps_float  = 1e-5f;
+  double eps_double = 1e-12;
+
+  std::vector<float> C_float(size);
+  std::vector<float> A_float(size);
+  std::vector<float> B_float(size);
+
+  std::vector<double> C_double(size);
+  std::vector<double> A_double(size);
+  std::vector<double> B_double(size);
+
+  // fill with random data:
+
+  for (ViennaCLInt i = 0; i < size; ++i)
+  {
+    C_float[i] = 0.5f + 0.1f * random<float>();
+    A_float[i] = 0.5f + 0.1f * random<float>();
+    B_float[i] = 0.5f + 0.1f * random<float>();
+
+    C_double[i] = 0.5 + 0.2 * random<double>();
+    A_double[i] = 0.5 + 0.2 * random<double>();
+    B_double[i] = 0.5 + 0.2 * random<double>();
+  }
+
+
+  // Host setup
+  ViennaCLBackend my_backend;
+  ViennaCLBackendCreate(&my_backend);
+
+  viennacl::vector<float> host_C_float(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(C_float, host_C_float);
+  viennacl::vector<float> host_A_float(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(A_float, host_A_float);
+  viennacl::vector<float> host_B_float(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(B_float, host_B_float);
+
+  viennacl::vector<double> host_C_double(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(C_double, host_C_double);
+  viennacl::vector<double> host_A_double(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(A_double, host_A_double);
+  viennacl::vector<double> host_B_double(size, viennacl::context(viennacl::MAIN_MEMORY));  viennacl::copy(B_double, host_B_double);
+
+  // CUDA setup
+#ifdef VIENNACL_WITH_CUDA
+  viennacl::vector<float> cuda_C_float(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(C_float, cuda_C_float);
+  viennacl::vector<float> cuda_A_float(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(A_float, cuda_A_float);
+  viennacl::vector<float> cuda_B_float(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(B_float, cuda_B_float);
+
+  viennacl::vector<double> cuda_C_double(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(C_double, cuda_C_double);
+  viennacl::vector<double> cuda_A_double(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(A_double, cuda_A_double);
+  viennacl::vector<double> cuda_B_double(size, viennacl::context(viennacl::CUDA_MEMORY));  viennacl::copy(B_double, cuda_B_double);
+#endif
+
+  // OpenCL setup
+#ifdef VIENNACL_WITH_OPENCL
+  ViennaCLInt context_id = 0;
+  viennacl::vector<float> opencl_C_float(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(C_float, opencl_C_float);
+  viennacl::vector<float> opencl_A_float(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(A_float, opencl_A_float);
+  viennacl::vector<float> opencl_B_float(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(B_float, opencl_B_float);
+
+  viennacl::vector<double> *opencl_C_double = NULL;
+  viennacl::vector<double> *opencl_A_double = NULL;
+  viennacl::vector<double> *opencl_B_double = NULL;
+
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    opencl_C_double = new viennacl::vector<double>(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(C_double, *opencl_C_double);
+    opencl_A_double = new viennacl::vector<double>(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(A_double, *opencl_A_double);
+    opencl_B_double = new viennacl::vector<double>(size, viennacl::context(viennacl::ocl::get_context(context_id)));  viennacl::copy(B_double, *opencl_B_double);
+  }
+
+  ViennaCLBackendSetOpenCLContextID(my_backend, context_id);
+#endif
+
+  // consistency checks:
+  check(C_float, host_C_float, eps_float);
+  check(A_float, host_A_float, eps_float);
+  check(B_float, host_B_float, eps_float);
+
+  check(C_double, host_C_double, eps_double);
+  check(A_double, host_A_double, eps_double);
+  check(B_double, host_B_double, eps_double);
+
+#ifdef VIENNACL_WITH_CUDA
+  check(C_float, cuda_C_float, eps_float);
+  check(A_float, cuda_A_float, eps_float);
+  check(B_float, cuda_B_float, eps_float);
+
+  check(C_double, cuda_C_double, eps_double);
+  check(A_double, cuda_A_double, eps_double);
+  check(B_double, cuda_B_double, eps_double);
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  check(C_float, opencl_C_float, eps_float);
+  check(A_float, opencl_A_float, eps_float);
+  check(B_float, opencl_B_float, eps_float);
+
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    check(C_double, *opencl_C_double, eps_double);
+    check(A_double, *opencl_A_double, eps_double);
+    check(B_double, *opencl_B_double, eps_double);
+  }
+#endif
+
+  std::cout << std::endl;
+
+  test_blas(my_backend,
+            eps_float, eps_double,
+            C_float, C_double,
+            A_float, A_double,
+            B_float, B_double,
+            host_C_float, host_C_double,
+            host_A_float, host_A_double,
+            host_B_float, host_B_double
+#ifdef VIENNACL_WITH_CUDA
+            , cuda_C_float, cuda_C_double
+            , cuda_A_float, cuda_A_double
+            , cuda_B_float, cuda_B_double
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+            , opencl_C_float, opencl_C_double
+            , opencl_A_float, opencl_A_double
+            , opencl_B_float, opencl_B_double
+#endif
+            );
+
+
+#ifdef VIENNACL_WITH_OPENCL
+  //cleanup
+  if( viennacl::ocl::current_device().double_support() )
+  {
+    delete opencl_C_double;
+    delete opencl_A_double;
+    delete opencl_B_double;
+  }
+
+#endif
+
+  ViennaCLBackendDestroy(&my_backend);
+
+  //
+  //  That's it.
+  //
+  std::cout << std::endl << "!!!! TEST COMPLETED SUCCESSFULLY !!!!" << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix.cpp b/tests/src/matrix.cpp
deleted file mode 100644
index 6962959..0000000
--- a/tests/src/matrix.cpp
+++ /dev/null
@@ -1,532 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// *** System
-//
-#include <iostream>
-
-//
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/triangular.hpp>
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/matrix_proxy.hpp>
-#include <boost/numeric/ublas/lu.hpp>
-#include <boost/numeric/ublas/io.hpp>
-
-//
-// *** ViennaCL
-//
-//#define VIENNACL_DEBUG_ALL
-#define VIENNACL_HAVE_UBLAS 1
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/direct_solve.hpp"
-#include "examples/tutorial/Random.hpp"
-
-//
-// -------------------------------------------------------------
-//
-using namespace boost::numeric;
-//
-// -------------------------------------------------------------
-//
-template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
-{
-   if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
-   return 0;
-}
-
-template <typename ScalarType>
-ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   for (unsigned int i=0;i<v1.size(); ++i)
-   {
-      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
-         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
-      else
-         v2_cpu[i] = 0.0;
-   }
-
-   return norm_inf(v2_cpu);
-}
-
-template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-ScalarType diff(ublas::matrix<ScalarType> & mat1, viennacl::matrix<ScalarType, F, ALIGNMENT> & mat2)
-{
-   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
-   copy(mat2, mat2_cpu);
-   ScalarType ret = 0;
-   ScalarType act = 0;
-
-    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
-    {
-      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
-      {
-         act = fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( fabs(mat2_cpu(i, j)), fabs(mat1(i,j)) );
-         if (act > ret)
-           ret = act;
-      }
-    }
-   //std::cout << ret << std::endl;
-   return ret;
-}
-
-//
-// -------------------------------------------------------------
-//
-template< typename NumericT, typename F, typename Epsilon >
-int test(Epsilon const& epsilon)
-{
-   int retval = EXIT_SUCCESS;
-   
-   std::size_t num_rows = 121;
-   std::size_t num_cols = 103;
-   
-   // --------------------------------------------------------------------------            
-   ublas::vector<NumericT> rhs(num_rows);
-   for (unsigned int i = 0; i < rhs.size(); ++i)
-     rhs(i) = random<NumericT>();
-   ublas::vector<NumericT> rhs2 = rhs;
-   ublas::vector<NumericT> result = ublas::scalar_vector<NumericT>(num_cols, NumericT(3.1415));
-   ublas::vector<NumericT> result2 = result;
-   ublas::vector<NumericT> rhs_trans = rhs;
-   rhs_trans.resize(result.size(), true);
-   ublas::vector<NumericT> result_trans = ublas::zero_vector<NumericT>(rhs.size());
-
-  
-   ublas::matrix<NumericT> matrix(result.size(), rhs.size());
-  
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-      for (unsigned int j = 0; j < matrix.size2(); ++j)
-         matrix(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
-
-   viennacl::vector<NumericT> vcl_rhs(rhs.size());
-   viennacl::vector<NumericT> vcl_rhs_trans(rhs_trans.size());
-   viennacl::vector<NumericT> vcl_result_trans(result_trans.size());
-   viennacl::vector<NumericT> vcl_result(result.size()); 
-   viennacl::matrix<NumericT, F> vcl_matrix(result.size(), rhs.size());
-
-   viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   viennacl::copy(result, vcl_result);
-   viennacl::copy(matrix, vcl_matrix);
-   
-   std::cout << "Matrix resizing (to larger)" << std::endl;
-   matrix.resize(2*num_rows, 2*num_cols, true);
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-   {
-      for (unsigned int j = (i<result.size() ? rhs.size() : 0); j < matrix.size2(); ++j)
-         matrix(i,j) = 0;
-   }
-   vcl_matrix.resize(2*num_rows, 2*num_cols, true);
-   viennacl::copy(vcl_matrix, matrix);
-   if( fabs(diff(matrix, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix resize (to larger)" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-   
-   matrix(12, 14) = NumericT(1.9);
-   matrix(19, 16) = NumericT(1.0);
-   matrix (13, 15) =  NumericT(-9);
-   vcl_matrix(12, 14) = NumericT(1.9);
-   vcl_matrix(19, 16) = NumericT(1.0);
-   vcl_matrix (13, 15) =  NumericT(-9);
-   
-   std::cout << "Matrix resizing (to smaller)" << std::endl;
-   matrix.resize(result.size(), rhs.size(), true);
-   vcl_matrix.resize(result.size(), rhs.size(), true);
-   if( fabs(diff(matrix, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix resize (to smaller)" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-
-
-   std::cout << "Matrix addition and subtraction" << std::endl;
-   viennacl::matrix<NumericT, F> vcl_matrix2 = vcl_matrix;
-   vcl_matrix2 += vcl_matrix;
-   vcl_matrix2 -= vcl_matrix;
-   vcl_matrix2 = vcl_matrix2 + vcl_matrix;
-   vcl_matrix2 = vcl_matrix2 - vcl_matrix;
-
-   if( fabs(diff(matrix, vcl_matrix2)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix addition and subtraction" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix, vcl_matrix2)) << std::endl;
-      return EXIT_FAILURE;
-   }
-
-   // --------------------------------------------------------------------------            
-   std::cout << "Rank 1 update" << std::endl;
-   ublas::matrix<NumericT> matrix2 = matrix;
-   
-   matrix2 += ublas::outer_prod(result, rhs);
-   vcl_matrix += viennacl::linalg::outer_prod(vcl_result, vcl_rhs);
-   if( fabs(diff(matrix2, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: rank 1 update" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix2, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-   std::cout << "Scaled rank 1 update" << std::endl;
-   matrix2 += 4.2f * ublas::outer_prod(result, rhs);
-   vcl_matrix += 2.1f * viennacl::linalg::outer_prod(vcl_result, vcl_rhs);
-   vcl_matrix += viennacl::linalg::outer_prod(vcl_result, vcl_rhs) * 2.1f;  //check proper compilation
-   if( fabs(diff(matrix2, vcl_matrix)) > epsilon )
-   {
-      std::cout << "# Error at operation: scaled rank 1 update" << std::endl;
-      std::cout << "  diff: " << fabs(diff(matrix2, vcl_matrix)) << std::endl;
-      return EXIT_FAILURE;
-   }
-   
-   //reset vcl_matrix:
-   viennacl::copy(matrix, vcl_matrix);
-   
-   // --------------------------------------------------------------------------            
-   std::cout << "Matrix-Vector product" << std::endl;
-   result     = viennacl::linalg::prod(matrix, rhs);
-   vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_rhs);
-   
-   for (std::size_t i=0; i<result.size(); ++i)
-   {
-     std::cout << rhs(i) << ", " << vcl_rhs(i) << ", " << result(i) << ", " << vcl_result(i) << std::endl; 
-   }
-   
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-   std::cout << "Matrix-Vector product with scaled add" << std::endl;
-   NumericT alpha = static_cast<NumericT>(2.786);
-   NumericT beta = static_cast<NumericT>(1.432);
-   viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   viennacl::copy(result.begin(), result.end(), vcl_result.begin());
-
-   result     = alpha * viennacl::linalg::prod(matrix, rhs) + beta * result;
-   vcl_result = alpha * viennacl::linalg::prod(vcl_matrix, vcl_rhs) + beta * vcl_result;
-
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-
-   viennacl::copy(rhs_trans.begin(), rhs_trans.end(), vcl_rhs_trans.begin());
-   viennacl::copy(result_trans.begin(), result_trans.end(), vcl_result_trans.begin());
-
-   std::cout << "Transposed Matrix-Vector product" << std::endl;
-   result_trans     = alpha * viennacl::linalg::prod(trans(matrix), rhs_trans);  
-   vcl_result_trans = alpha * viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans);
-
-   if( fabs(diff(result_trans, vcl_result_trans)) > epsilon )
-   {
-      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result_trans, vcl_result_trans)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
-   result_trans     = alpha * viennacl::linalg::prod(trans(matrix), rhs_trans) + beta * result_trans;  
-   vcl_result_trans = alpha * viennacl::linalg::prod(trans(vcl_matrix), vcl_rhs_trans) + beta * vcl_result_trans;
-
-   if( fabs(diff(result_trans, vcl_result_trans)) > epsilon )
-   {
-      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result_trans, vcl_result_trans)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------            
-
-   /////////////////// test direct solvers ////////////////////////////
-   
-   rhs.resize(40);
-   matrix.resize(rhs.size(), rhs.size());
-   result.resize(rhs.size());
-
-   std::cout << "Resizing vcl_rhs..." << std::endl;
-   vcl_rhs.resize(rhs.size());
-   std::cout << "Resizing vcl_rhs done" << std::endl;
-   vcl_matrix.resize(rhs.size(), rhs.size());
-   std::cout << "Resizing vcl_result..." << std::endl;
-   vcl_result.resize(rhs.size());
-   std::cout << "Resizing vcl_result done" << std::endl;
-
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-   {
-      for (unsigned int j = 0; j < matrix.size2(); ++j)
-         matrix(i,j) = -random<NumericT>();
-      rhs(i) = random<NumericT>();
-   }
-
-   //force unit diagonal
-   for (unsigned int i = 0; i < matrix.size1(); ++i)
-      matrix(i,i) = static_cast<NumericT>(3) + random<NumericT>();
-
-   viennacl::copy(matrix, vcl_matrix);
-   viennacl::copy(rhs, vcl_rhs);
-
-   //upper triangular:
-   std::cout << "Upper triangular solver" << std::endl;
-   result = ublas::solve(matrix, rhs, ublas::upper_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //upper unit triangular:
-   std::cout << "Upper unit triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(matrix, rhs, ublas::unit_upper_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::unit_upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //lower triangular:
-   std::cout << "Lower triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(matrix, rhs, ublas::lower_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //lower unit triangular:
-   std::cout << "Lower unit triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(matrix, rhs, ublas::unit_lower_tag());
-   vcl_result = viennacl::linalg::solve(vcl_matrix, vcl_rhs, viennacl::linalg::unit_lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-
-
-
-
-   //transposed upper triangular:
-   std::cout << "Transposed upper triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::upper_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //transposed upper unit triangular:
-   std::cout << "Transposed unit upper triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::unit_upper_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::unit_upper_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //transposed lower triangular:
-   std::cout << "Transposed lower triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::lower_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   //transposed lower unit triangular:
-   std::cout << "Transposed unit lower triangular solver" << std::endl;
-   viennacl::copy(rhs, vcl_rhs);
-   result = ublas::solve(trans(matrix), rhs, ublas::unit_lower_tag());
-   vcl_result = viennacl::linalg::solve(trans(vcl_matrix), vcl_rhs, viennacl::linalg::unit_lower_tag());
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   
-   //full solver:
-   std::cout << "Full solver" << std::endl;
-   unsigned int lu_dim = 100;
-   ublas::matrix<NumericT> square_matrix(lu_dim, lu_dim);
-   ublas::vector<NumericT> lu_rhs(lu_dim);
-   viennacl::matrix<NumericT, F> vcl_square_matrix(lu_dim, lu_dim);
-   viennacl::vector<NumericT> vcl_lu_rhs(lu_dim);
-
-   for (std::size_t i=0; i<lu_dim; ++i)
-     for (std::size_t j=0; j<lu_dim; ++j)
-       square_matrix(i,j) = -static_cast<NumericT>(0.5) * random<NumericT>();
-
-   //put some more weight on diagonal elements:
-   for (std::size_t j=0; j<lu_dim; ++j)
-   {
-     square_matrix(j,j) = static_cast<NumericT>(20.0) + random<NumericT>();
-     lu_rhs(j) = random<NumericT>();
-   }
-   
-   viennacl::copy(square_matrix, vcl_square_matrix);
-   viennacl::copy(lu_rhs, vcl_lu_rhs);
-   
-   //ublas::
-   ublas::lu_factorize(square_matrix);
-   ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
-   ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
-
-   // ViennaCL:
-   viennacl::linalg::lu_factorize(vcl_square_matrix);
-   //viennacl::copy(square_matrix, vcl_square_matrix);
-   viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
-
-   if( fabs(diff(lu_rhs, vcl_lu_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: dense solver" << std::endl;
-      std::cout << "  diff: " << fabs(diff(lu_rhs, vcl_lu_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   
-
-   return retval;
-}
-//
-// -------------------------------------------------------------
-//
-int main()
-{
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "## Test :: Matrix" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-
-   int retval = EXIT_SUCCESS;
-
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = NumericT(1.0E-3);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      std::cout << "  layout: row-major" << std::endl;
-      retval = test<NumericT, viennacl::row_major>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = NumericT(1.0E-3);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      std::cout << "  layout: column-major" << std::endl;
-      retval = test<NumericT, viennacl::column_major>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   
-   
-   if( viennacl::ocl::current_device().double_support() )
-   {
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-11;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         std::cout << "  layout: row-major" << std::endl;
-         retval = test<NumericT, viennacl::row_major>(epsilon);
-            if( retval == EXIT_SUCCESS )
-               std::cout << "# Test passed" << std::endl;
-            else
-              return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-11;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         std::cout << "  layout: column-major" << std::endl;
-         retval = test<NumericT, viennacl::column_major>(epsilon);
-            if( retval == EXIT_SUCCESS )
-               std::cout << "# Test passed" << std::endl;
-            else
-              return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-   }
-   return retval;
-}
diff --git a/tests/src/matrix_col_double.cpp b/tests/src/matrix_col_double.cpp
new file mode 100644
index 0000000..b89e624
--- /dev/null
+++ b/tests/src/matrix_col_double.cpp
@@ -0,0 +1,52 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+#include "matrix_float_double.hpp"
+
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, column-major, double precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    double epsilon = 1e-12;
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: double" << std::endl;
+
+    if (run_test<viennacl::column_major, double>(epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_col_double.cu b/tests/src/matrix_col_double.cu
new file mode 100644
index 0000000..b89e624
--- /dev/null
+++ b/tests/src/matrix_col_double.cu
@@ -0,0 +1,52 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+#include "matrix_float_double.hpp"
+
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, column-major, double precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    double epsilon = 1e-12;
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: double" << std::endl;
+
+    if (run_test<viennacl::column_major, double>(epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_col_float.cpp b/tests/src/matrix_col_float.cpp
new file mode 100644
index 0000000..9fbbc57
--- /dev/null
+++ b/tests/src/matrix_col_float.cpp
@@ -0,0 +1,45 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_float_double.hpp"
+
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, column-major, single precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  double epsilon = 1e-4;
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  eps:     " << epsilon << std::endl;
+  std::cout << "  numeric: float" << std::endl;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, float>(epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_col_float.cu b/tests/src/matrix_col_float.cu
new file mode 100644
index 0000000..9fbbc57
--- /dev/null
+++ b/tests/src/matrix_col_float.cu
@@ -0,0 +1,45 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_float_double.hpp"
+
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, column-major, single precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  double epsilon = 1e-4;
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  eps:     " << epsilon << std::endl;
+  std::cout << "  numeric: float" << std::endl;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, float>(epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_col_int.cpp b/tests/src/matrix_col_int.cpp
new file mode 100644
index 0000000..b3344dd
--- /dev/null
+++ b/tests/src/matrix_col_int.cpp
@@ -0,0 +1,48 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_int.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, column-major, integers " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: int" << std::endl;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, int>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: long" << std::endl;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, long>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_col_int.cu b/tests/src/matrix_col_int.cu
new file mode 100644
index 0000000..b3344dd
--- /dev/null
+++ b/tests/src/matrix_col_int.cu
@@ -0,0 +1,48 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_int.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, column-major, integers " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: int" << std::endl;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, int>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: long" << std::endl;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, long>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_float_double.hpp b/tests/src/matrix_float_double.hpp
new file mode 100644
index 0000000..5ae7dc9
--- /dev/null
+++ b/tests/src/matrix_float_double.hpp
@@ -0,0 +1,1304 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#define VIENNACL_WITH_UBLAS
+//#define NDEBUG
+//#define VIENNACL_BUILD_INFO
+
+// We don't need debug mode in UBLAS:
+#define BOOST_UBLAS_NDEBUG
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <algorithm>
+#include <stdio.h>
+#include <time.h>
+//#include "../benchmarks/benchmark-utils.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/io.hpp"
+
+using namespace boost::numeric;
+
+template <typename MatrixType, typename VCLMatrixType>
+bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A, double epsilon)
+{
+  typedef typename MatrixType::value_type   value_type;
+
+  boost::numeric::ublas::matrix<value_type> vcl_A_cpu(vcl_A.size1(), vcl_A.size2());
+  viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+  viennacl::copy(vcl_A, vcl_A_cpu);
+
+  for (std::size_t i=0; i<ublas_A.size1(); ++i)
+  {
+    for (std::size_t j=0; j<ublas_A.size2(); ++j)
+    {
+      if (ublas_A(i,j) != vcl_A_cpu(i,j))
+      {
+        if ( (std::abs(ublas_A(i,j) - vcl_A_cpu(i,j)) / std::max(std::fabs(ublas_A(i,j)), std::fabs(vcl_A_cpu(i,j))) > epsilon) || (vcl_A_cpu(i,j) != vcl_A_cpu(i,j)) )
+        {
+          std::cout << "Error at index (" << i << ", " << j << "): " << ublas_A(i,j) << " vs " << vcl_A_cpu(i,j) << std::endl;
+          std::cout << std::endl << "TEST failed!" << std::endl;
+          return false;
+        }
+      }
+    }
+  }
+
+  std::cout << "PASSED!" << std::endl;
+  return true;
+}
+
+
+
+
+template <typename UBLASMatrixType,
+          typename ViennaCLMatrixType1, typename ViennaCLMatrixType2, typename ViennaCLMatrixType3>
+int run_test(double epsilon,
+             UBLASMatrixType & ublas_A, UBLASMatrixType & ublas_B, UBLASMatrixType & ublas_C,
+             ViennaCLMatrixType1 & vcl_A, ViennaCLMatrixType2 & vcl_B, ViennaCLMatrixType3 vcl_C)
+{
+
+  typedef typename viennacl::result_of::cpu_value_type<typename ViennaCLMatrixType1::value_type>::type  cpu_value_type;
+
+  cpu_value_type alpha = cpu_value_type(3.1415);
+  viennacl::scalar<cpu_value_type>   gpu_alpha = alpha;
+
+  cpu_value_type beta = cpu_value_type(2.7182);
+  viennacl::scalar<cpu_value_type>   gpu_beta = beta;
+
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_matrix initializer..." << std::endl;
+  ublas_A = ublas::zero_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2());
+  vcl_A = viennacl::zero_matrix<cpu_value_type>(vcl_A.size1(), vcl_A.size2());
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_matrix initializer..." << std::endl;
+  ublas_A = ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), alpha);
+  vcl_A = viennacl::scalar_matrix<cpu_value_type>(vcl_A.size1(), vcl_A.size2(), alpha);
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A =    ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), gpu_beta);
+  vcl_A   = viennacl::scalar_matrix<cpu_value_type>(  vcl_A.size1(),   vcl_A.size2(), gpu_beta);
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  /*std::cout << "Checking for identity initializer..." << std::endl;
+  ublas_A = ublas::identity_matrix<cpu_value_type>(ublas_A.size1());
+  vcl_A = viennacl::identity_matrix<cpu_value_type>(vcl_A.size1());
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;*/
+
+
+  std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test: Assignments //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing matrix assignment... ";
+  //std::cout << ublas_B(0,0) << " vs. " << vcl_B(0,0) << std::endl;
+  ublas_A = ublas_B;
+  vcl_A = vcl_B;
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  ublas_A = ublas_B;
+  viennacl::copy(ublas_B, vcl_A);
+  std::cout << "Testing upper left copy to GPU... ";
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+  ublas_C = ublas_B;
+  viennacl::copy(ublas_B, vcl_C);
+  std::cout << "Testing lower right copy to GPU... ";
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+  //std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  std::cout << "Testing upper left copy to A... ";
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing lower right copy to C... ";
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 3: Addition //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_C, vcl_C);
+
+  std::cout << "Inplace add: ";
+  ublas_C += ublas_C;
+  vcl_C   +=   vcl_C;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled inplace add: ";
+  ublas_C += beta * ublas_A;
+  vcl_C   += gpu_beta * vcl_A;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Add: ";
+  ublas_C = ublas_A + ublas_B;
+  vcl_C   =   vcl_A +   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Add with flipsign: ";
+  ublas_C = - ublas_A + ublas_B;
+  vcl_C   = -   vcl_A +   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled add (left): ";
+  ublas_C = alpha * ublas_A + ublas_B;
+  vcl_C   = alpha *   vcl_A +   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled add (left): ";
+  vcl_C = gpu_alpha * vcl_A + vcl_B;
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled add (right): ";
+  ublas_C = ublas_A + beta * ublas_B;
+  vcl_C   =   vcl_A + beta *   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled add (right): ";
+  vcl_C = vcl_A + gpu_beta * vcl_B;
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Scaled add (both): ";
+  ublas_C = alpha * ublas_A + beta * ublas_B;
+  vcl_C   = alpha *   vcl_A + beta *   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled add (both): ";
+  vcl_C = gpu_alpha * vcl_A + gpu_beta * vcl_B;
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_C, vcl_C);
+
+  std::cout << "Inplace sub: ";
+  ublas_C -= ublas_B;
+  vcl_C -= vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled Inplace sub: ";
+  ublas_C -= alpha * ublas_B;
+  vcl_C -= alpha * vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+
+
+  std::cout << "Sub: ";
+  ublas_C = ublas_A - ublas_B;
+  vcl_C = vcl_A - vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (left): ";
+  ublas_B = alpha * ublas_A - ublas_C;
+  vcl_B   = alpha *   vcl_A - vcl_C;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (left): ";
+  vcl_B = gpu_alpha * vcl_A - vcl_C;
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled sub (right): ";
+  ublas_B = ublas_A - beta * ublas_C;
+  vcl_B   =   vcl_A - vcl_C * beta;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (right): ";
+  vcl_B = vcl_A - vcl_C * gpu_beta;
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled sub (both): ";
+  ublas_B = alpha * ublas_A - beta * ublas_C;
+  vcl_B   = alpha * vcl_A - vcl_C * beta;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (both): ";
+  vcl_B = gpu_alpha * vcl_A - vcl_C * gpu_beta;
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Unary operator-: ";
+  ublas_C = - ublas_A;
+  vcl_C   = -   vcl_A;
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 5: Scaling //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_A, vcl_A);
+
+  std::cout << "Multiplication with CPU scalar: ";
+  ublas_A *= alpha;
+  vcl_A   *= alpha;
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Multiplication with GPU scalar: ";
+  ublas_A *= beta;
+  vcl_A *= gpu_beta;
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Division with CPU scalar: ";
+  ublas_A /= alpha;
+  vcl_A /= alpha;
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Division with GPU scalar: ";
+  ublas_A /= beta;
+  vcl_A /= gpu_beta;
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), cpu_value_type(1.4142));
+  ublas_A = cpu_value_type(3.1415) * ublas_B;
+  viennacl::copy(ublas_A, vcl_A);
+  viennacl::copy(ublas_B, vcl_B);
+  viennacl::copy(ublas_B, vcl_B);
+  ublas_A = ublas::element_prod(ublas_A, ublas_B);
+  vcl_A = viennacl::linalg::element_prod(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A, ublas_B);
+  vcl_A += viennacl::linalg::element_prod(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A, ublas_B);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_prod(ublas_A + ublas_B, ublas_B);
+  vcl_A = viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A + ublas_B, ublas_B);
+  vcl_A += viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A + ublas_B, ublas_B);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_prod(ublas_A, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_prod(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), cpu_value_type(1.4142));
+  ublas_A = cpu_value_type(3.1415) * ublas_B;
+  viennacl::copy(ublas_A, vcl_A);
+  viennacl::copy(ublas_B, vcl_B);
+
+  ublas_A = ublas::element_div(ublas_A, ublas_B);
+  vcl_A = viennacl::linalg::element_div(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A, ublas_B);
+  vcl_A += viennacl::linalg::element_div(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A, ublas_B);
+  vcl_A -= viennacl::linalg::element_div(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_div(ublas_A + ublas_B, ublas_B);
+  vcl_A = viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A + ublas_B, ublas_B);
+  vcl_A += viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A + ublas_B, ublas_B);
+  vcl_A -= viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_div(ublas_A, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_div(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  // element_pow
+  std::cout << "Testing unary element_pow()..." << std::endl;
+
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), cpu_value_type(1.4142));
+  ublas_A = cpu_value_type(3.1415) * ublas_B;
+  viennacl::copy(ublas_A, vcl_A);
+  viennacl::copy(ublas_B, vcl_B);
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) = std::pow(ublas_A(i,j), ublas_B(i,j));
+  vcl_C = viennacl::linalg::element_pow(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) += std::pow(ublas_A(i,j), ublas_B(i,j));
+  vcl_C += viennacl::linalg::element_pow(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) -= std::pow(ublas_A(i,j), ublas_B(i,j));
+  vcl_C -= viennacl::linalg::element_pow(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) = std::pow(ublas_A(i,j) + ublas_B(i,j), ublas_B(i,j));
+  vcl_C = viennacl::linalg::element_pow(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) += std::pow(ublas_A(i,j) + ublas_B(i,j), ublas_B(i,j));
+  vcl_C += viennacl::linalg::element_pow(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) -= std::pow(ublas_A(i,j) + ublas_B(i,j), ublas_B(i,j));
+  vcl_C -= viennacl::linalg::element_pow(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) = std::pow(ublas_A(i,j), ublas_B(i,j) + ublas_A(i,j));
+  vcl_C = viennacl::linalg::element_pow(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) += std::pow(ublas_A(i,j), ublas_B(i,j) + ublas_A(i,j));
+  vcl_C += viennacl::linalg::element_pow(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) -= std::pow(ublas_A(i,j), ublas_B(i,j) + ublas_A(i,j));
+  vcl_C -= viennacl::linalg::element_pow(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  ///////
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) = std::pow(ublas_A(i,j) + ublas_B(i,j), ublas_B(i,j) + ublas_A(i,j));
+  vcl_C = viennacl::linalg::element_pow(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) += std::pow(ublas_A(i,j) + ublas_B(i,j), ublas_B(i,j) + ublas_A(i,j));
+  vcl_C += viennacl::linalg::element_pow(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_C.size1(); i++)
+    for (std::size_t j=0; j<ublas_C.size2(); ++j)
+      ublas_C(i,j) -= std::pow(ublas_A(i,j) + ublas_B(i,j), ublas_B(i,j) + ublas_A(i,j));
+  vcl_C -= viennacl::linalg::element_pow(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing unary elementwise operations..." << std::endl;
+
+#define GENERATE_UNARY_OP_TEST(FUNCNAME) \
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), cpu_value_type(1.4142)); \
+  ublas_A = cpu_value_type(3.1415) * ublas_B; \
+  ublas_C = cpu_value_type(2.7172) * ublas_A; \
+  viennacl::copy(ublas_A, vcl_A); \
+  viennacl::copy(ublas_B, vcl_B); \
+  viennacl::copy(ublas_C, vcl_C); \
+  viennacl::copy(ublas_B, vcl_B); \
+  \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) = std::FUNCNAME(ublas_A(i,j)); \
+  vcl_C = viennacl::linalg::element_##FUNCNAME(vcl_A); \
+ \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+  { \
+    std::cout << "Failure at C = " << #FUNCNAME << "(A)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) = std::FUNCNAME(ublas_A(i,j) + ublas_B(i,j)); \
+  vcl_C = viennacl::linalg::element_##FUNCNAME(vcl_A + vcl_B); \
+ \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+  { \
+    std::cout << "Failure at C = " << #FUNCNAME << "(A + B)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) += std::FUNCNAME(ublas_A(i,j)); \
+  vcl_C += viennacl::linalg::element_##FUNCNAME(vcl_A); \
+ \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+  { \
+    std::cout << "Failure at C += " << #FUNCNAME << "(A)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) += std::FUNCNAME(ublas_A(i,j) + ublas_B(i,j)); \
+  vcl_C += viennacl::linalg::element_##FUNCNAME(vcl_A + vcl_B); \
+ \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+  { \
+    std::cout << "Failure at C += " << #FUNCNAME << "(A + B)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) -= std::FUNCNAME(ublas_A(i,j)); \
+  vcl_C -= viennacl::linalg::element_##FUNCNAME(vcl_A); \
+ \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+  { \
+    std::cout << "Failure at C -= " << #FUNCNAME << "(A)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) -= std::FUNCNAME(ublas_A(i,j) + ublas_B(i,j)); \
+  vcl_C -= viennacl::linalg::element_##FUNCNAME(vcl_A + vcl_B); \
+ \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+  { \
+    std::cout << "Failure at C -= " << #FUNCNAME << "(A + B)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+
+  GENERATE_UNARY_OP_TEST(cos);
+  GENERATE_UNARY_OP_TEST(cosh);
+  GENERATE_UNARY_OP_TEST(exp);
+  GENERATE_UNARY_OP_TEST(floor);
+  GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(log);
+  GENERATE_UNARY_OP_TEST(log10);
+  GENERATE_UNARY_OP_TEST(sin);
+  GENERATE_UNARY_OP_TEST(sinh);
+  GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(sqrt);
+  GENERATE_UNARY_OP_TEST(tan);
+  GENERATE_UNARY_OP_TEST(tanh);
+
+  std::cout << "Complicated expressions: ";
+  //std::cout << "ublas_A: " << ublas_A << std::endl;
+  //std::cout << "ublas_B: " << ublas_B << std::endl;
+  //std::cout << "ublas_C: " << ublas_C << std::endl;
+  ublas_B +=     alpha * (- ublas_A - beta * ublas_C + ublas_A);
+  vcl_B   += gpu_alpha * (-   vcl_A - vcl_C * beta   +   vcl_A);
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_B += (- ublas_A - beta * ublas_C + ublas_A * beta) / gpu_alpha;
+  vcl_B   += (-   vcl_A - vcl_C * beta + gpu_beta * vcl_A) / gpu_alpha;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+
+  ublas_B -=     alpha * (- ublas_A - beta * ublas_C - ublas_A);
+  vcl_B   -= gpu_alpha * (-   vcl_A - vcl_C * beta - vcl_A);
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  ublas_B -= (- ublas_A - beta * ublas_C - ublas_A * beta) / alpha;
+  vcl_B   -= (-   vcl_A - vcl_C * beta - gpu_beta * vcl_A) / gpu_alpha;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
+
+
+
+template <typename T, typename ScalarType>
+int run_test(double epsilon)
+{
+    //typedef float               ScalarType;
+    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
+
+    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
+
+    std::size_t dim_rows = 131;
+    std::size_t dim_cols = 33;
+    //std::size_t dim_rows = 5;
+    //std::size_t dim_cols = 3;
+
+    //setup ublas objects:
+    MatrixType ublas_A(dim_rows, dim_cols);
+    MatrixType ublas_B(dim_rows, dim_cols);
+    MatrixType ublas_C(dim_rows, dim_cols);
+
+    for (std::size_t i=0; i<ublas_A.size1(); ++i)
+      for (std::size_t j=0; j<ublas_A.size2(); ++j)
+      {
+        ublas_A(i,j) = ScalarType((i+2) + (j+1)*(i+2));
+        ublas_B(i,j) = ScalarType((j+2) + (j+1)*(j+2));
+        ublas_C(i,j) = ScalarType((i+1) + (i+1)*(i+2));
+      }
+
+    MatrixType ublas_A_large(4 * dim_rows, 4 * dim_cols);
+    for (std::size_t i=0; i<ublas_A_large.size1(); ++i)
+      for (std::size_t j=0; j<ublas_A_large.size2(); ++j)
+        ublas_A_large(i,j) = ScalarType(i * ublas_A_large.size2() + j);
+
+    //Setup ViennaCL objects
+    VCLMatrixType vcl_A_full(4 * dim_rows, 4 * dim_cols);
+    VCLMatrixType vcl_B_full(4 * dim_rows, 4 * dim_cols);
+    VCLMatrixType vcl_C_full(4 * dim_rows, 4 * dim_cols);
+
+    viennacl::copy(ublas_A_large, vcl_A_full);
+    viennacl::copy(ublas_A_large, vcl_B_full);
+    viennacl::copy(ublas_A_large, vcl_C_full);
+
+    //
+    // Create A
+    //
+    VCLMatrixType vcl_A(dim_rows, dim_cols);
+
+    viennacl::range vcl_A_r1(2 * dim_rows, 3 * dim_rows);
+    viennacl::range vcl_A_r2(dim_cols, 2 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_A(vcl_A_full, vcl_A_r1, vcl_A_r2);
+
+    viennacl::slice vcl_A_s1(2, 3, dim_rows);
+    viennacl::slice vcl_A_s2(2 * dim_cols, 2, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_A(vcl_A_full, vcl_A_s1, vcl_A_s2);
+
+
+    //
+    // Create B
+    //
+    VCLMatrixType vcl_B(dim_rows, dim_cols);
+
+    viennacl::range vcl_B_r1(dim_rows, 2 * dim_rows);
+    viennacl::range vcl_B_r2(2 * dim_cols, 3 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_B(vcl_B_full, vcl_B_r1, vcl_B_r2);
+
+    viennacl::slice vcl_B_s1(2 * dim_rows, 2, dim_rows);
+    viennacl::slice vcl_B_s2(dim_cols, 3, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_B(vcl_B_full, vcl_B_s1, vcl_B_s2);
+
+
+    //
+    // Create C
+    //
+    VCLMatrixType vcl_C(dim_rows, dim_cols);
+
+    viennacl::range vcl_C_r1(2 * dim_rows, 3 * dim_rows);
+    viennacl::range vcl_C_r2(3 * dim_cols, 4 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_C(vcl_C_full, vcl_C_r1, vcl_C_r2);
+
+    viennacl::slice vcl_C_s1(dim_rows, 2, dim_rows);
+    viennacl::slice vcl_C_s2(0, 3, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_C(vcl_C_full, vcl_C_s1, vcl_C_s2);
+
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_A, vcl_slice_A);
+
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_B, vcl_slice_B);
+
+    viennacl::copy(ublas_C, vcl_C);
+    viennacl::copy(ublas_C, vcl_range_C);
+    viennacl::copy(ublas_C, vcl_slice_C);
+
+
+    std::cout << std::endl;
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Copy CTOR //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    {
+      std::cout << "Testing matrix created from range... ";
+      VCLMatrixType vcl_temp = vcl_range_A;
+      if (check_for_equality(ublas_A, vcl_temp, epsilon))
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << "ublas_A: " << ublas_A << std::endl;
+        std::cout << "vcl_temp: " << vcl_temp << std::endl;
+        std::cout << "vcl_range_A: " << vcl_range_A << std::endl;
+        std::cout << "vcl_A: " << vcl_A << std::endl;
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      std::cout << "Testing matrix created from slice... ";
+      VCLMatrixType vcl_temp2 = vcl_range_B;
+      if (check_for_equality(ublas_B, vcl_temp2, epsilon))
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Initializer for matrix type //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    {
+      ublas::matrix<ScalarType> ublas_dummy1 = ublas::identity_matrix<ScalarType>(ublas_A.size1());
+      ublas::matrix<ScalarType> ublas_dummy2 = ublas::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+      ublas::matrix<ScalarType> ublas_dummy3 = ublas::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+
+      viennacl::matrix<ScalarType> vcl_dummy1 = viennacl::identity_matrix<ScalarType>(ublas_A.size1());
+      viennacl::matrix<ScalarType> vcl_dummy2 = viennacl::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+      viennacl::matrix<ScalarType> vcl_dummy3 = viennacl::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+
+      std::cout << "Testing initializer CTOR... ";
+      if (   check_for_equality(ublas_dummy1, vcl_dummy1, epsilon)
+          && check_for_equality(ublas_dummy2, vcl_dummy2, epsilon)
+          && check_for_equality(ublas_dummy3, vcl_dummy3, epsilon)
+         )
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      ublas_dummy1 = ublas::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+      ublas_dummy2 = ublas::identity_matrix<ScalarType>(ublas_A.size1());
+      ublas_dummy3 = ublas::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+
+      vcl_dummy1 = viennacl::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+      vcl_dummy2 = viennacl::identity_matrix<ScalarType>(ublas_A.size1());
+      vcl_dummy3 = viennacl::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+
+      std::cout << "Testing initializer assignment... ";
+      if (   check_for_equality(ublas_dummy1, vcl_dummy1, epsilon)
+          && check_for_equality(ublas_dummy2, vcl_dummy2, epsilon)
+          && check_for_equality(ublas_dummy3, vcl_dummy3, epsilon)
+         )
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Norms //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    /*ScalarType ublas_norm_1 = viennacl::linalg::norm_1(ublas_C);
+    ScalarType   vcl_norm_1 = viennacl::linalg::norm_1(vcl_C);
+    if ( std::fabs(ublas_norm_1 - vcl_norm_1) / ublas_norm_1 > epsilon)
+    {
+      std::cerr << "Failure at norm_1(): " << std::fabs(ublas_norm_1 - vcl_norm_1) / ublas_norm_1  << std::endl;
+      return EXIT_FAILURE;
+    }
+
+    ScalarType ublas_norm_inf = ublas::norm_inf(ublas_C);
+    ScalarType   vcl_norm_inf = viennacl::linalg::norm_inf(vcl_C);
+    if ( std::fabs(ublas_norm_inf - vcl_norm_inf) / ublas_norm_inf > epsilon)
+    {
+      std::cerr << "Failure at norm_inf(): " << std::fabs(ublas_norm_inf - vcl_norm_inf) / ublas_norm_inf << std::endl;
+      return EXIT_FAILURE;
+    }*/
+
+    ScalarType ublas_norm_frobenius = viennacl::linalg::norm_frobenius(ublas_C);
+    ScalarType   vcl_norm_frobenius = viennacl::linalg::norm_frobenius(vcl_C);
+    if ( std::fabs(ublas_norm_frobenius - vcl_norm_frobenius) / ublas_norm_frobenius > epsilon)
+    {
+      std::cerr << "Failure at norm_frobenius()" << std::endl;
+      return EXIT_FAILURE;
+    }
+
+    viennacl::scalar<ScalarType> device_ublas_norm_frobenius = viennacl::linalg::norm_frobenius(ublas_C);
+    viennacl::scalar<ScalarType>   device_vcl_norm_frobenius = viennacl::linalg::norm_frobenius(vcl_C);
+    if ( std::fabs(device_ublas_norm_frobenius - device_vcl_norm_frobenius) / device_ublas_norm_frobenius > epsilon)
+    {
+      std::cerr << "Failure at norm_frobenius()" << std::endl;
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "PASSED!" << std::endl;
+
+
+    //
+    // run operation tests:
+    //
+
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Operations //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+
+    /////// A=matrix:
+    std::cout << "Testing A=matrix, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    std::cout << "Testing A=matrix, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    /////// A=range:
+    std::cout << "Testing A=range, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=range, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=range, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    /////// A=slice:
+    std::cout << "Testing A=slice, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=slice, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=slice, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    return EXIT_SUCCESS;
+}
+
+
diff --git a/tests/src/matrix_int.hpp b/tests/src/matrix_int.hpp
new file mode 100644
index 0000000..9798273
--- /dev/null
+++ b/tests/src/matrix_int.hpp
@@ -0,0 +1,1107 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#define VIENNACL_WITH_UBLAS
+//#define NDEBUG
+//#define VIENNACL_BUILD_INFO
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <algorithm>
+#include <stdio.h>
+#include <time.h>
+//#include "../benchmarks/benchmark-utils.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+/*#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"*/
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/io.hpp"
+
+using namespace boost::numeric;
+
+template <typename MatrixType, typename VCLMatrixType>
+bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A)
+{
+  typedef typename MatrixType::value_type   value_type;
+
+  boost::numeric::ublas::matrix<value_type> vcl_A_cpu(vcl_A.size1(), vcl_A.size2());
+  viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+  viennacl::copy(vcl_A, vcl_A_cpu);
+
+  for (std::size_t i=0; i<ublas_A.size1(); ++i)
+  {
+    for (std::size_t j=0; j<ublas_A.size2(); ++j)
+    {
+      if (ublas_A(i,j) != vcl_A_cpu(i,j))
+      {
+        std::cout << "Error at index (" << i << ", " << j << "): " << ublas_A(i,j) << " vs " << vcl_A_cpu(i,j) << std::endl;
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return false;
+      }
+    }
+  }
+
+  std::cout << "PASSED!" << std::endl;
+  return true;
+}
+
+
+
+
+template <typename UBLASMatrixType,
+          typename ViennaCLMatrixType1, typename ViennaCLMatrixType2, typename ViennaCLMatrixType3>
+int run_test(UBLASMatrixType & ublas_A, UBLASMatrixType & ublas_B, UBLASMatrixType & ublas_C,
+             ViennaCLMatrixType1 & vcl_A, ViennaCLMatrixType2 & vcl_B, ViennaCLMatrixType3 vcl_C)
+{
+
+  typedef typename viennacl::result_of::cpu_value_type<typename ViennaCLMatrixType1::value_type>::type  cpu_value_type;
+
+  cpu_value_type alpha = 3;
+  viennacl::scalar<cpu_value_type>   gpu_alpha = alpha;
+
+  cpu_value_type beta = 2;
+  viennacl::scalar<cpu_value_type>   gpu_beta = beta;
+
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_matrix initializer..." << std::endl;
+  ublas_A = ublas::zero_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2());
+  vcl_A = viennacl::zero_matrix<cpu_value_type>(vcl_A.size1(), vcl_A.size2());
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_matrix initializer..." << std::endl;
+  ublas_A = ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), alpha);
+  vcl_A = viennacl::scalar_matrix<cpu_value_type>(vcl_A.size1(), vcl_A.size2(), alpha);
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A =    ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), gpu_beta);
+  vcl_A   = viennacl::scalar_matrix<cpu_value_type>(  vcl_A.size1(),   vcl_A.size2(), gpu_beta);
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  /*
+  std::cout << "Checking for identity initializer..." << std::endl;
+  ublas_A = ublas::identity_matrix<cpu_value_type>(ublas_A.size1());
+  vcl_A = viennacl::identity_matrix<cpu_value_type>(vcl_A.size1());
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE; */
+
+
+  std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test: Assignments //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing matrix assignment... ";
+  //std::cout << ublas_B(0,0) << " vs. " << vcl_B(0,0) << std::endl;
+  ublas_A = ublas_B;
+  vcl_A = vcl_B;
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  ublas_A = ublas_B;
+  viennacl::copy(ublas_B, vcl_A);
+  std::cout << "Testing upper left copy to GPU... ";
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+
+  ublas_C = ublas_B;
+  viennacl::copy(ublas_B, vcl_C);
+  std::cout << "Testing lower right copy to GPU... ";
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+  //std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  std::cout << "Testing upper left copy to A... ";
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing lower right copy to C... ";
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 3: Addition //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_C, vcl_C);
+
+  std::cout << "Inplace add: ";
+  ublas_C += ublas_C;
+  vcl_C   +=   vcl_C;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled inplace add: ";
+  ublas_C += beta * ublas_A;
+  vcl_C   += gpu_beta * vcl_A;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Add: ";
+  ublas_C = ublas_A + ublas_B;
+  vcl_C   =   vcl_A +   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Add with flipsign: ";
+  ublas_C = - ublas_A + ublas_B;
+  vcl_C   = -   vcl_A +   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled add (left): ";
+  ublas_C = alpha * ublas_A + ublas_B;
+  vcl_C   = alpha *   vcl_A +   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled add (left): ";
+  vcl_C = gpu_alpha * vcl_A + vcl_B;
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled add (right): ";
+  ublas_C = ublas_A + beta * ublas_B;
+  vcl_C   =   vcl_A + beta *   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled add (right): ";
+  vcl_C = vcl_A + gpu_beta * vcl_B;
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Scaled add (both): ";
+  ublas_C = alpha * ublas_A + beta * ublas_B;
+  vcl_C   = alpha *   vcl_A + beta *   vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled add (both): ";
+  vcl_C = gpu_alpha * vcl_A + gpu_beta * vcl_B;
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_C, vcl_C);
+
+  std::cout << "Inplace sub: ";
+  ublas_C -= ublas_B;
+  vcl_C -= vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled Inplace sub: ";
+  ublas_C -= alpha * ublas_B;
+  vcl_C -= alpha * vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+
+
+  std::cout << "Sub: ";
+  ublas_C = ublas_A - ublas_B;
+  vcl_C = vcl_A - vcl_B;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (left): ";
+  ublas_B = alpha * ublas_A - ublas_C;
+  vcl_B   = alpha *   vcl_A - vcl_C;
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (left): ";
+  vcl_B = gpu_alpha * vcl_A - vcl_C;
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled sub (right): ";
+  ublas_B = ublas_A - beta * ublas_C;
+  vcl_B   =   vcl_A - vcl_C * beta;
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (right): ";
+  vcl_B = vcl_A - vcl_C * gpu_beta;
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Scaled sub (both): ";
+  ublas_B = alpha * ublas_A - beta * ublas_C;
+  vcl_B   = alpha * vcl_A - vcl_C * beta;
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  std::cout << "Scaled sub (both): ";
+  vcl_B = gpu_alpha * vcl_A - vcl_C * gpu_beta;
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Unary operator-: ";
+  ublas_C = - ublas_A;
+  vcl_C   = -   vcl_A;
+
+  if (!check_for_equality(ublas_C, vcl_C))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 5: Scaling //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_A, vcl_A);
+
+  std::cout << "Multiplication with CPU scalar: ";
+  ublas_A *= alpha;
+  vcl_A   *= alpha;
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  std::cout << "Multiplication with GPU scalar: ";
+  ublas_A *= beta;
+  vcl_A *= gpu_beta;
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+
+  std::cout << "Division with CPU scalar: ";
+  ublas_A /= alpha;
+  vcl_A /= alpha;
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  std::cout << "Division with GPU scalar: ";
+  ublas_A /= beta;
+  vcl_A /= gpu_beta;
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), 2);
+  ublas_A = 3 * ublas_B;
+  viennacl::copy(ublas_A, vcl_A);
+  viennacl::copy(ublas_B, vcl_B);
+  viennacl::copy(ublas_B, vcl_B);
+  ublas_A = ublas::element_prod(ublas_A, ublas_B);
+  vcl_A = viennacl::linalg::element_prod(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A, ublas_B);
+  vcl_A += viennacl::linalg::element_prod(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A, ublas_B);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_prod(ublas_A + ublas_B, ublas_B);
+  vcl_A = viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A + ublas_B, ublas_B);
+  vcl_A += viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A + ublas_B, ublas_B);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_prod(ublas_A, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_prod(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_prod(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_prod(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), 2);
+  ublas_A = 3 * ublas_B;
+  viennacl::copy(ublas_A, vcl_A);
+  viennacl::copy(ublas_B, vcl_B);
+  viennacl::copy(ublas_B, vcl_B);
+
+  ublas_A = ublas::element_div(ublas_A, ublas_B);
+  vcl_A = viennacl::linalg::element_div(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A, ublas_B);
+  vcl_A += viennacl::linalg::element_div(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A, ublas_B);
+  vcl_A -= viennacl::linalg::element_div(vcl_A, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_div(ublas_A + ublas_B, ublas_B);
+  vcl_A = viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A + ublas_B, ublas_B);
+  vcl_A += viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A + ublas_B, ublas_B);
+  vcl_A -= viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_div(ublas_A, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_A = ublas::element_div(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A = viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A += ublas::element_div(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A += viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  ublas_A -= ublas::element_div(ublas_A + ublas_B, ublas_B + ublas_A);
+  vcl_A -= viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B + vcl_A);
+
+  if (!check_for_equality(ublas_A, vcl_A))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing unary elementwise operations..." << std::endl;
+
+#define GENERATE_UNARY_OP_TEST(FUNCNAME) /* Verifies viennacl::linalg::element_FUNCNAME against an entrywise std::FUNCNAME reference, for the operand forms f(A) and f(A+B) combined with =, += and -=. Expands inside a test function: ublas_A/B/C, vcl_A/B/C and cpu_value_type must be in scope, and any failure returns EXIT_FAILURE from the enclosing function. */ \
+  ublas_B = ublas::scalar_matrix<cpu_value_type>(ublas_B.size1(), ublas_B.size2(), 1); \
+  ublas_A = 3 * ublas_B; \
+  ublas_C = 2 * ublas_A; \
+  viennacl::copy(ublas_A, vcl_A); \
+  viennacl::copy(ublas_B, vcl_B); \
+  viennacl::copy(ublas_C, vcl_C); \
+  viennacl::copy(ublas_B, vcl_B); /* NOTE(review): duplicate of the vcl_B copy two lines above -- harmless, presumably a leftover */ \
+  \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) = std::FUNCNAME(ublas_A(i,j)); \
+  vcl_C = viennacl::linalg::element_##FUNCNAME(vcl_A); /* device: C = f(A) */ \
+ \
+  if (!check_for_equality(ublas_C, vcl_C)) \
+  { \
+    std::cout << "Failure at C = " << #FUNCNAME << "(A)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) = std::FUNCNAME(ublas_A(i,j) + ublas_B(i,j)); \
+  vcl_C = viennacl::linalg::element_##FUNCNAME(vcl_A + vcl_B); /* device: C = f(A + B), f applied to an expression operand */ \
+ \
+  if (!check_for_equality(ublas_C, vcl_C)) \
+  { \
+    std::cout << "Failure at C = " << #FUNCNAME << "(A + B)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) += std::FUNCNAME(ublas_A(i,j)); \
+  vcl_C += viennacl::linalg::element_##FUNCNAME(vcl_A); /* device: C += f(A) */ \
+ \
+  if (!check_for_equality(ublas_C, vcl_C)) \
+  { \
+    std::cout << "Failure at C += " << #FUNCNAME << "(A)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) += std::FUNCNAME(ublas_A(i,j) + ublas_B(i,j)); \
+  vcl_C += viennacl::linalg::element_##FUNCNAME(vcl_A + vcl_B); /* device: C += f(A + B) */ \
+ \
+  if (!check_for_equality(ublas_C, vcl_C)) \
+  { \
+    std::cout << "Failure at C += " << #FUNCNAME << "(A + B)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) -= std::FUNCNAME(ublas_A(i,j)); \
+  vcl_C -= viennacl::linalg::element_##FUNCNAME(vcl_A); /* device: C -= f(A) */ \
+ \
+  if (!check_for_equality(ublas_C, vcl_C)) \
+  { \
+    std::cout << "Failure at C -= " << #FUNCNAME << "(A)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) -= std::FUNCNAME(ublas_A(i,j) + ublas_B(i,j)); \
+  vcl_C -= viennacl::linalg::element_##FUNCNAME(vcl_A + vcl_B); /* device: C -= f(A + B) */ \
+ \
+  if (!check_for_equality(ublas_C, vcl_C)) \
+  { \
+    std::cout << "Failure at C -= " << #FUNCNAME << "(A + B)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+
+  GENERATE_UNARY_OP_TEST(abs);
+
+  std::cout << "Complicated expressions: ";
+  //std::cout << "ublas_A: " << ublas_A << std::endl;
+  //std::cout << "ublas_B: " << ublas_B << std::endl;
+  //std::cout << "ublas_C: " << ublas_C << std::endl;
+  ublas_B +=     alpha * (- ublas_A - beta * ublas_C + ublas_A);
+  vcl_B   += gpu_alpha * (-   vcl_A - vcl_C * beta   +   vcl_A);
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  ublas_B += (- ublas_A - beta * ublas_C + ublas_A * beta) / gpu_alpha;
+  vcl_B   += (-   vcl_A - vcl_C * beta + gpu_beta * vcl_A) / gpu_alpha;
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+
+  ublas_B -=     alpha * (- ublas_A - beta * ublas_C - ublas_A);
+  vcl_B   -= gpu_alpha * (-   vcl_A - vcl_C * beta - vcl_A);
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  ublas_B -= (- ublas_A - beta * ublas_C - ublas_A * beta) / alpha;
+  vcl_B   -= (-   vcl_A - vcl_C * beta - gpu_beta * vcl_A) / gpu_alpha;
+
+  if (!check_for_equality(ublas_B, vcl_B))
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
+
+
+
+template <typename T, typename ScalarType>
+int run_test()  // Builds reference (uBLAS) and device (ViennaCL) matrices, then runs the operation tests for every {matrix, range, slice} combination of A, B, C; returns EXIT_SUCCESS or EXIT_FAILURE.
+{
+    //typedef float               ScalarType;
+    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
+
+    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
+
+    std::size_t dim_rows = 131;
+    std::size_t dim_cols = 33;  // deliberately non-square and not a multiple of typical work-group sizes
+    //std::size_t dim_rows = 5;
+    //std::size_t dim_cols = 3;
+
+    //setup ublas objects:
+    MatrixType ublas_A(dim_rows, dim_cols);
+    MatrixType ublas_B(dim_rows, dim_cols);
+    MatrixType ublas_C(dim_rows, dim_cols);
+
+    for (std::size_t i=0; i<ublas_A.size1(); ++i)
+      for (std::size_t j=0; j<ublas_A.size2(); ++j)
+      {
+        ublas_A(i,j) = ScalarType((i+2) + (j+1)*(i+2));
+        ublas_B(i,j) = ScalarType((j+2) + (j+1)*(j+2));
+        ublas_C(i,j) = ScalarType((i+1) + (i+1)*(i+2));
+      }
+
+    MatrixType ublas_A_large(4 * dim_rows, 4 * dim_cols);  // backing data: the range/slice proxies below view sub-blocks of 4x-sized device matrices
+    for (std::size_t i=0; i<ublas_A_large.size1(); ++i)
+      for (std::size_t j=0; j<ublas_A_large.size2(); ++j)
+        ublas_A_large(i,j) = ScalarType(i * ublas_A_large.size2() + j);
+
+    //Setup ViennaCL objects
+    VCLMatrixType vcl_A_full(4 * dim_rows, 4 * dim_cols);
+    VCLMatrixType vcl_B_full(4 * dim_rows, 4 * dim_cols);
+    VCLMatrixType vcl_C_full(4 * dim_rows, 4 * dim_cols);
+
+    viennacl::copy(ublas_A_large, vcl_A_full);
+    viennacl::copy(ublas_A_large, vcl_B_full);
+    viennacl::copy(ublas_A_large, vcl_C_full);
+
+    //
+    // Create A
+    //
+    VCLMatrixType vcl_A(dim_rows, dim_cols);
+
+    viennacl::range vcl_A_r1(2 * dim_rows, 3 * dim_rows);
+    viennacl::range vcl_A_r2(dim_cols, 2 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_A(vcl_A_full, vcl_A_r1, vcl_A_r2);  // contiguous sub-block view of vcl_A_full
+
+    viennacl::slice vcl_A_s1(2, 3, dim_rows);
+    viennacl::slice vcl_A_s2(2 * dim_cols, 2, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_A(vcl_A_full, vcl_A_s1, vcl_A_s2);  // strided view: rows start 2 / stride 3, cols start 2*dim_cols / stride 2
+
+
+    //
+    // Create B
+    //
+    VCLMatrixType vcl_B(dim_rows, dim_cols);
+
+    viennacl::range vcl_B_r1(dim_rows, 2 * dim_rows);
+    viennacl::range vcl_B_r2(2 * dim_cols, 3 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_B(vcl_B_full, vcl_B_r1, vcl_B_r2);
+
+    viennacl::slice vcl_B_s1(2 * dim_rows, 2, dim_rows);
+    viennacl::slice vcl_B_s2(dim_cols, 3, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_B(vcl_B_full, vcl_B_s1, vcl_B_s2);
+
+
+    //
+    // Create C
+    //
+    VCLMatrixType vcl_C(dim_rows, dim_cols);
+
+    viennacl::range vcl_C_r1(2 * dim_rows, 3 * dim_rows);
+    viennacl::range vcl_C_r2(3 * dim_cols, 4 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_C(vcl_C_full, vcl_C_r1, vcl_C_r2);
+
+    viennacl::slice vcl_C_s1(dim_rows, 2, dim_rows);
+    viennacl::slice vcl_C_s2(0, 3, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_C(vcl_C_full, vcl_C_s1, vcl_C_s2);
+
+    viennacl::copy(ublas_A, vcl_A);  // load identical reference data into the plain matrix and both proxy views
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_A, vcl_slice_A);
+
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_B, vcl_slice_B);
+
+    viennacl::copy(ublas_C, vcl_C);
+    viennacl::copy(ublas_C, vcl_range_C);
+    viennacl::copy(ublas_C, vcl_slice_C);
+
+
+    std::cout << std::endl;
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Copy CTOR //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    {
+      std::cout << "Testing matrix created from range... ";
+      VCLMatrixType vcl_temp = vcl_range_A;  // copy-CTOR of a full matrix from a matrix_range proxy
+      if (check_for_equality(ublas_A, vcl_temp))
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << "ublas_A: " << ublas_A << std::endl;
+        std::cout << "vcl_temp: " << vcl_temp << std::endl;
+        std::cout << "vcl_range_A: " << vcl_range_A << std::endl;
+        std::cout << "vcl_A: " << vcl_A << std::endl;
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      std::cout << "Testing matrix created from slice... ";
+      VCLMatrixType vcl_temp2 = vcl_slice_B;  // FIX: was vcl_range_B, which re-tested the range ctor; the slice copy-ctor was never exercised despite the message above (vcl_slice_B also holds ublas_B, so the equality check is unchanged)
+      if (check_for_equality(ublas_B, vcl_temp2))
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Initializer for matrix type //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    {
+      ublas::matrix<ScalarType> ublas_dummy1 = ublas::identity_matrix<ScalarType>(ublas_A.size1());
+      ublas::matrix<ScalarType> ublas_dummy2 = ublas::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3);
+      ublas::matrix<ScalarType> ublas_dummy3 = ublas::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+
+      viennacl::matrix<ScalarType> vcl_dummy1 = viennacl::identity_matrix<ScalarType>(ublas_A.size1());
+      viennacl::matrix<ScalarType> vcl_dummy2 = viennacl::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3);
+      viennacl::matrix<ScalarType> vcl_dummy3 = viennacl::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+
+      std::cout << "Testing initializer CTOR... ";
+      if (   check_for_equality(ublas_dummy1, vcl_dummy1)
+          && check_for_equality(ublas_dummy2, vcl_dummy2)
+          && check_for_equality(ublas_dummy3, vcl_dummy3)
+         )
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      ublas_dummy1 = ublas::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());  // rotate the initializer kinds so assignment is tested with a different target state than the CTOR test
+      ublas_dummy2 = ublas::identity_matrix<ScalarType>(ublas_A.size1());
+      ublas_dummy3 = ublas::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3);
+
+      vcl_dummy1 = viennacl::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+      vcl_dummy2 = viennacl::identity_matrix<ScalarType>(ublas_A.size1());
+      vcl_dummy3 = viennacl::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3);
+
+      std::cout << "Testing initializer assignment... ";
+      if (   check_for_equality(ublas_dummy1, vcl_dummy1)
+          && check_for_equality(ublas_dummy2, vcl_dummy2)
+          && check_for_equality(ublas_dummy3, vcl_dummy3)
+         )
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+
+    //
+    // run operation tests: all 27 combinations of A, B, C each being a plain matrix, a matrix_range, or a matrix_slice
+    //
+
+    /////// A=matrix:
+    std::cout << "Testing A=matrix, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    std::cout << "Testing A=matrix, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    /////// A=range:
+    std::cout << "Testing A=range, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=range, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=range, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    /////// A=slice:
+    std::cout << "Testing A=slice, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=slice, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=slice, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    return EXIT_SUCCESS;
+}
+
+
diff --git a/tests/src/matrix_range.cpp b/tests/src/matrix_range.cpp
deleted file mode 100644
index c0ff983..0000000
--- a/tests/src/matrix_range.cpp
+++ /dev/null
@@ -1,489 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-//#define NDEBUG
-//#define VIENNACL_BUILD_INFO
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <time.h>
-//#include "../benchmarks/benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-/*#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"*/
-#include "viennacl/matrix_proxy.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/matrix_proxy.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-
-
-template <typename MatrixType, typename VCLMatrixType>
-bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A)
-{
-  typedef typename MatrixType::value_type   value_type;
-  
-  boost::numeric::ublas::matrix<value_type> vcl_A_cpu(vcl_A.size1(), vcl_A.size2());
-  viennacl::copy(vcl_A, vcl_A_cpu);
-  
-  for (std::size_t i=0; i<ublas_A.size1(); ++i)
-  {
-    for (std::size_t j=0; j<ublas_A.size2(); ++j)
-    {
-      if (ublas_A(i,j) != vcl_A_cpu(i,j))
-      {
-        std::cout << "Error at index (" << i << ", " << j << "): " << ublas_A(i,j) << " vs " << vcl_A_cpu(i,j) << std::endl;
-        //std::cout << ublas_A << std::endl;
-        //std::cout << vcl_A_cpu << std::endl;
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-
-           
-template <typename T, typename ScalarType>
-int run_test()
-{
-    //typedef float               ScalarType;
-    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
-    
-    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
-    
-    viennacl::scalar<ScalarType> gpu_pi = ScalarType(3.1415);
-    
-    std::size_t dim_large = 51;
-    std::size_t dim_small = 37;
-    //std::size_t dim_large = 5;
-    //std::size_t dim_small = 3;
-    
-    //setup ublas objects:
-    MatrixType ublas_A(dim_large, dim_large);
-    for (std::size_t i=0; i<ublas_A.size1(); ++i)
-      for (std::size_t j=0; j<ublas_A.size2(); ++j)
-        ublas_A(i,j) = ScalarType((i+1) + (j+1)*(i+1));
-
-    MatrixType ublas_B(dim_small, dim_small);
-    for (std::size_t i=0; i<ublas_B.size1(); ++i)
-      for (std::size_t j=0; j<ublas_B.size2(); ++j)
-        ublas_B(i,j) = ScalarType((i+1) + (j+1)*(i+1));
-
-    MatrixType ublas_C(dim_large, dim_small);
-    for (std::size_t i=0; i<ublas_C.size1(); ++i)
-      for (std::size_t j=0; j<ublas_C.size2(); ++j)
-        ublas_C(i,j) = ScalarType((j+2) + (j+1)*(i+1));
-
-    MatrixType ublas_D(dim_small, dim_large);
-    for (std::size_t i=0; i<ublas_D.size1(); ++i)
-      for (std::size_t j=0; j<ublas_D.size2(); ++j)
-        ublas_D(i,j) = ScalarType((j+2) + (j+1)*(i+1));
-      
-    boost::numeric::ublas::range ublas_r1(0, dim_small);
-    boost::numeric::ublas::range ublas_r2(dim_large - dim_small, dim_large);
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub1(ublas_A, ublas_r1, ublas_r1);
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_A_sub2(ublas_A, ublas_r2, ublas_r2);
-
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
-    boost::numeric::ublas::matrix_range<MatrixType> ublas_D_sub(ublas_D, ublas_r1, ublas_r1);
-
-    //Setup ViennaCL objects    
-    VCLMatrixType vcl_A(dim_large, dim_large);
-    viennacl::copy(ublas_A, vcl_A);
-    VCLMatrixType vcl_B(dim_small, dim_small);
-    viennacl::copy(ublas_B, vcl_B);
-    VCLMatrixType vcl_C(dim_large, dim_small);
-    viennacl::copy(ublas_C, vcl_C);
-    VCLMatrixType vcl_D(dim_small, dim_large);
-    viennacl::copy(ublas_D, vcl_D);
-    
-    viennacl::range vcl_r1(0, dim_small);
-    viennacl::range vcl_r2(dim_large - dim_small, dim_large);
-    viennacl::matrix_range<VCLMatrixType>   vcl_A_sub1(vcl_A, vcl_r1, vcl_r1);
-    viennacl::matrix_range<VCLMatrixType>   vcl_A_sub2(vcl_A, vcl_r2, vcl_r2);
-    
-    viennacl::matrix_range<VCLMatrixType>   vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
-    viennacl::matrix_range<VCLMatrixType>   vcl_D_sub(vcl_D, vcl_r1, vcl_r1);
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    ublas_A_sub1 = ublas_B;
-    viennacl::copy(ublas_B, vcl_A_sub1);
-    std::cout << "Testing upper left copy to A... ";
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    ublas_A_sub2 = ublas_B;
-    viennacl::copy(ublas_B, vcl_A_sub2);
-    std::cout << "Testing lower right copy to A... ";
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    
-    ublas_C_sub = ublas_B;
-    viennacl::copy(ublas_B, vcl_C_sub);
-    std::cout << "Testing upper copy to C... ";
-    if (check_for_equality(ublas_C, vcl_C))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    
-    ublas_D_sub = ublas_B;
-    viennacl::copy(ublas_B, vcl_D_sub);
-    std::cout << "Testing left copy to D... ";
-    if (check_for_equality(ublas_D, vcl_D))
-      std::cout << "PASSED!" << std::endl;
-    else
-      std::cout << std::endl << "TEST failed!";
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing upper left copy to A... ";
-    if (check_for_equality(ublas_A_sub1, vcl_A_sub1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing lower right copy to A... ";
-    if (check_for_equality(ublas_A_sub2, vcl_A_sub2))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing upper copy to C... ";
-    if (check_for_equality(ublas_C_sub, vcl_C_sub))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Testing left copy to D... ";
-    if (check_for_equality(ublas_D_sub, vcl_D_sub))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 3: Addition //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
-    
-    std::cout << "Inplace add to submatrix: ";
-    ublas_A_sub2 += ublas_A_sub2;
-    vcl_A_sub2 += vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add to matrix: ";
-    ublas_B += ublas_A_sub2;
-    vcl_B += vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Add to submatrix: ";
-    ublas_A_sub2 = ublas_A_sub2 + ublas_A_sub2;
-    vcl_A_sub2 = vcl_A_sub2 + vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Add to matrix: ";
-    ublas_B = ublas_A_sub2 + ublas_A_sub2;
-    vcl_B = vcl_A_sub2 + vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
-    
-    std::cout << "Inplace add to submatrix: ";
-    ublas_A_sub2 -= ublas_A_sub2;
-    vcl_A_sub2 -= vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Inplace add to matrix: ";
-    ublas_B -= ublas_A_sub2;
-    vcl_B -= vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Add to submatrix: ";
-    ublas_A_sub2 = ublas_A_sub2 - ublas_A_sub2;
-    vcl_A_sub2 = vcl_A_sub2 - vcl_A_sub2;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Add to matrix: ";
-    ublas_B = ublas_A_sub2 - ublas_A_sub2;
-    vcl_B = vcl_A_sub2 - vcl_A_sub2;
-
-    if (check_for_equality(ublas_B, vcl_B))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 5: Scaling //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_A, vcl_A);
-    
-    std::cout << "Multiplication with CPU scalar: ";
-    ublas_A_sub2 *= ScalarType(3.1415);
-    vcl_A_sub2 *= ScalarType(3.1415);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Multiplication with GPU scalar: ";
-    ublas_A_sub2 *= gpu_pi;
-    vcl_A_sub2 *= gpu_pi;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    std::cout << "Division with CPU scalar: ";
-    ublas_A_sub2 /= ScalarType(3.1415);
-    vcl_A_sub2 /= ScalarType(3.1415);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Division with GPU scalar: ";
-    ublas_A_sub2 /= gpu_pi;
-    vcl_A_sub2 /= gpu_pi;
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 6: Products //////////" << std::endl;
-    std::cout << "//" << std::endl;
-
-    std::cout << "Inplace add of prod(): ";
-    ublas_A_sub1 += prod(ublas_C_sub, ublas_D_sub);
-    vcl_A_sub1 += viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A * B: ";
-    ublas_A_sub1 = prod(ublas_C_sub, ublas_D_sub);
-    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A^T * B: ";
-    ublas_A_sub1 = prod(trans(ublas_C_sub), ublas_D_sub);
-    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), vcl_D_sub);
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A * B^T: ";
-    ublas_A_sub1 = prod(ublas_C_sub, trans(ublas_D_sub));
-    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, trans(vcl_D_sub));
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "Assigned C = A^T * B^T: ";
-    ublas_A_sub1 = prod(trans(ublas_C_sub), trans(ublas_D_sub));
-    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), trans(vcl_D_sub));
-
-    if (check_for_equality(ublas_A, vcl_A))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}    
-
-int main (int argc, const char * argv[])
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Test :: Matrix Range" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-   
-  std::cout << "# Testing setup:" << std::endl;
-  std::cout << "  eps:     " << 0 << std::endl;
-  std::cout << "  numeric: float" << std::endl;
-  if (run_test<viennacl::row_major, float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  if (run_test<viennacl::column_major, float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  
-  
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << "# Testing setup:" << std::endl;
-    std::cout << "  eps:     " << 0 << std::endl;
-    std::cout << "  numeric: double" << std::endl;
-    
-    if (run_test<viennacl::row_major, double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-    if (run_test<viennacl::column_major, double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-  }
-
-  return EXIT_SUCCESS;
-}
-
diff --git a/tests/src/matrix_row_double.cpp b/tests/src/matrix_row_double.cpp
new file mode 100644
index 0000000..1e26b6e
--- /dev/null
+++ b/tests/src/matrix_row_double.cpp
@@ -0,0 +1,51 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_float_double.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, row-major, double precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    double epsilon = 1e-12;
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: double" << std::endl;
+
+    if (run_test<viennacl::row_major, double>(epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_row_double.cu b/tests/src/matrix_row_double.cu
new file mode 100644
index 0000000..1e26b6e
--- /dev/null
+++ b/tests/src/matrix_row_double.cu
@@ -0,0 +1,51 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_float_double.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, row-major, double precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    double epsilon = 1e-12;
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: double" << std::endl;
+
+    if (run_test<viennacl::row_major, double>(epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_row_float.cpp b/tests/src/matrix_row_float.cpp
new file mode 100644
index 0000000..7284d08
--- /dev/null
+++ b/tests/src/matrix_row_float.cpp
@@ -0,0 +1,44 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_float_double.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, row-major, single precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  double epsilon = 1e-4;
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  eps:     " << epsilon << std::endl;
+  std::cout << "  numeric: float" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, float>(epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_row_float.cu b/tests/src/matrix_row_float.cu
new file mode 100644
index 0000000..7284d08
--- /dev/null
+++ b/tests/src/matrix_row_float.cu
@@ -0,0 +1,44 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_float_double.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, row-major, single precision " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  double epsilon = 1e-4;
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  eps:     " << epsilon << std::endl;
+  std::cout << "  numeric: float" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, float>(epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_row_int.cpp b/tests/src/matrix_row_int.cpp
new file mode 100644
index 0000000..758c982
--- /dev/null
+++ b/tests/src/matrix_row_int.cpp
@@ -0,0 +1,48 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_int.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, row-major, integers " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: int" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, int>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: long" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, long>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_row_int.cu b/tests/src/matrix_row_int.cu
new file mode 100644
index 0000000..758c982
--- /dev/null
+++ b/tests/src/matrix_row_int.cu
@@ -0,0 +1,48 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "matrix_int.hpp"
+
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Matrix operations, row-major, integers " << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: int" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, int>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  numeric: long" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, long>() != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/matrix_vector.cpp b/tests/src/matrix_vector.cpp
new file mode 100644
index 0000000..90e3ac0
--- /dev/null
+++ b/tests/src/matrix_vector.cpp
@@ -0,0 +1,1146 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+#include "examples/tutorial/Random.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   // Relative difference between a host scalar and a ViennaCL scalar;
+   // zero when the two values compare equal.
+   viennacl::backend::finish();
+   if (s1 == s2)
+      return 0;
+   return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+}
+
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   // Entry-wise relative difference between a uBLAS vector and a
+   // ViennaCL vector; returns its infinity norm.
+   ublas::vector<ScalarType> result(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), result.begin());
+
+   for (unsigned int idx = 0; idx < v1.size(); ++idx)
+   {
+      // Guard against dividing by zero when both entries vanish.
+      ScalarType scale = std::max(std::fabs(result[idx]), std::fabs(v1[idx]));
+      result[idx] = (scale > 0) ? std::fabs(result[idx] - v1[idx]) / scale
+                                : ScalarType(0);
+   }
+
+   return norm_inf(result);
+}
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> const & mat1, VCLMatrixType const & mat2)
+{
+   // Entry-wise relative difference between a uBLAS matrix and a
+   // ViennaCL matrix; returns the maximum over all entries.
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         // Guard against 0/0 = NaN when both entries are exactly zero
+         // (same guard as the vector overload of diff() above).
+         ScalarType denom = std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         act = (denom > 0) ? std::fabs(mat2_cpu(i,j) - mat1(i,j)) / denom
+                           : ScalarType(0);
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+//
+// -------------------------------------------------------------
+//
+
+/** Tests rank-1 updates, matrix-vector products, row/column/diagonal
+  * extraction and matrix-diagonal assignment, comparing every ViennaCL
+  * result against the Boost.uBLAS reference.  The rank-1 sub-tests
+  * return EXIT_FAILURE immediately; the remaining sub-tests accumulate
+  * failures in 'retval' and continue.  */
+template <typename NumericT, typename Epsilon,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1, typename VCLVectorType2>
+int test_prod_rank1(Epsilon const & epsilon,
+                    UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1, UblasVectorType & ublas_v2, UblasMatrixType & ublas_m2,
+                    VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1, VCLVectorType2 & vcl_v2, VCLMatrixType & vcl_m2)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data:
+   ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), NumericT(0.1234));
+   ublas_v2 = ublas::scalar_vector<NumericT>(ublas_v2.size(), NumericT(0.4321));
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Rank 1 update" << std::endl;
+
+   ublas_m1 += ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   if( std::fabs(diff(ublas_m1, vcl_m1)) > epsilon )
+   {
+      std::cout << "# Error at operation: rank 1 update" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_m1, vcl_m1)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+
+
+   // --------------------------------------------------------------------------
+   // The uBLAS side applies 4.2 * outer_prod once; the ViennaCL side applies
+   // 2.1 * outer_prod twice (both operand orders), which is equivalent and
+   // additionally checks that both expression forms compile.
+   std::cout << "Scaled rank 1 update - CPU Scalar" << std::endl;
+   ublas_m1 += NumericT(4.2) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += NumericT(2.1) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * NumericT(2.1);  //check proper compilation
+   if( std::fabs(diff(ublas_m1, vcl_m1)) > epsilon )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - CPU Scalar" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_m1, vcl_m1)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+      // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - GPU Scalar" << std::endl;
+   ublas_m1 += NumericT(4.2) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::scalar<NumericT>(NumericT(2.1)) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * viennacl::scalar<NumericT>(NumericT(2.1));  //check proper compilation
+   if( std::fabs(diff(ublas_m1, vcl_m1)) > epsilon )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - GPU Scalar" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_m1, vcl_m1)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   //reset vcl_matrix:
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product" << std::endl;
+   ublas_v1 = viennacl::linalg::prod(ublas_m1, ublas_v2);
+   vcl_v1   = viennacl::linalg::prod(vcl_m1, vcl_v2);
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled add" << std::endl;
+   NumericT alpha = static_cast<NumericT>(2.786);
+   NumericT beta = static_cast<NumericT>(1.432);
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   ublas_v1 = alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) + beta * ublas_v1;
+   vcl_v1   = alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) + beta * vcl_v1;
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Transposed Matrix-Vector product" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1);
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2;
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Row extraction from matrix" << std::endl;
+   ublas_v2 = row(ublas_m1, std::size_t(7));
+   vcl_v2   = row(vcl_m1, std::size_t(7));
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: row extraction from matrix" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Column extraction from matrix" << std::endl;
+   ublas_v1 = column(ublas_m1, std::size_t(7));
+   vcl_v1   = column(vcl_m1, std::size_t(7));
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: column extraction from matrix" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m2, vcl_m2);
+   UblasMatrixType A = ublas_m2;
+
+   // Requires ublas_m1.size1() >= ublas_m1.size2() + 3 (see caller).
+   std::cout << "Diagonal extraction from matrix" << std::endl;
+   for (std::size_t i=0; i<ublas_m1.size2(); ++i)
+     ublas_v2[i] = ublas_m1(i + 3, i);
+   vcl_v2   = diag(vcl_m1, static_cast<int>(-3));
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: diagonal extraction from matrix" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Matrix diagonal assignment from vector" << std::endl;
+   A = ublas::scalar_matrix<NumericT>(A.size1(), A.size2(), NumericT(0));
+   for (std::size_t i=0; i<ublas_m1.size2(); ++i)
+     A(i + (A.size1() - ublas_m1.size2()), i) = ublas_v2[i];
+   vcl_m2 = diag(vcl_v2, static_cast<int>(ublas_m1.size2()) - static_cast<int>(A.size1()));
+
+   if( std::fabs(diff(A, vcl_m2)) > epsilon )
+   {
+      std::cout << "# Error at operation: Matrix assignment from diagonal" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(A, vcl_m2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   return retval;
+}
+
+
+
+/** Runs the dense direct-solver tests: upper/lower, unit/non-unit,
+  * plain and transposed triangular solves, each comparing the ViennaCL
+  * result against the corresponding Boost.uBLAS reference solve.
+  *
+  * NOTE: ublas_v1 is overwritten by every solve, so each sub-test uses
+  * the previous result as its right-hand side (re-synced to vcl_v1
+  * before each solve) -- the order of the checks below is significant.
+  *
+  * Returns EXIT_SUCCESS if all relative differences stay below epsilon,
+  * EXIT_FAILURE otherwise (failures accumulate; all solvers still run). */
+template <typename NumericT, typename Epsilon,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1>
+int test_solve(Epsilon const & epsilon,
+               UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1,
+               VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data:
+   //viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v1, vcl_v1);
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   /////////////////// test direct solvers ////////////////////////////
+
+   //upper triangular:
+   std::cout << "Upper triangular solver" << std::endl;
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::upper_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //upper unit triangular:
+   std::cout << "Upper unit triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::unit_upper_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::unit_upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //lower triangular:
+   std::cout << "Lower triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::lower_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //lower unit triangular:
+   std::cout << "Lower unit triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::unit_lower_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::unit_lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+
+
+
+
+   // Same four solver variants again, but on the transposed system matrix.
+   //transposed upper triangular:
+   std::cout << "Transposed upper triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::upper_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //transposed upper unit triangular:
+   std::cout << "Transposed unit upper triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::unit_upper_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::unit_upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //transposed lower triangular:
+   std::cout << "Transposed lower triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::lower_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //transposed lower unit triangular:
+   std::cout << "Transposed unit lower triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::unit_lower_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::unit_lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename F, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+   int retval = EXIT_SUCCESS;
+
+   std::size_t num_rows = 141; //note: use num_rows > num_cols + 3 for diag() tests to work
+   std::size_t num_cols = 103;
+
+   // --------------------------------------------------------------------------
+   ublas::vector<NumericT> ublas_v1(num_rows);
+   for (std::size_t i = 0; i < ublas_v1.size(); ++i)
+     ublas_v1(i) = random<NumericT>();
+   ublas::vector<NumericT> ublas_v2 = ublas::scalar_vector<NumericT>(num_cols, NumericT(3.1415));
+
+
+   ublas::matrix<NumericT> ublas_m1(ublas_v1.size(), ublas_v2.size());
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+      for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+         ublas_m1(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+
+
+   ublas::matrix<NumericT> ublas_m2(ublas_v1.size(), ublas_v1.size());
+
+   for (std::size_t i = 0; i < ublas_m2.size1(); ++i)
+   {
+      for (std::size_t j = 0; j < ublas_m2.size2(); ++j)
+         ublas_m2(i,j) = static_cast<NumericT>(-0.1) * random<NumericT>();
+      ublas_m2(i, i) = static_cast<NumericT>(2) + random<NumericT>();
+   }
+
+
+   viennacl::vector<NumericT> vcl_v1_native(ublas_v1.size());
+   viennacl::vector<NumericT> vcl_v1_large(4 * ublas_v1.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v1_range(vcl_v1_large, viennacl::range(3, ublas_v1.size() + 3));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v1_slice(vcl_v1_large, viennacl::slice(2, 3, ublas_v1.size()));
+
+   viennacl::vector<NumericT> vcl_v2_native(ublas_v2.size());
+   viennacl::vector<NumericT> vcl_v2_large(4 * ublas_v2.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v2_range(vcl_v2_large, viennacl::range(8, ublas_v2.size() + 8));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v2_slice(vcl_v2_large, viennacl::slice(6, 2, ublas_v2.size()));
+
+   viennacl::matrix<NumericT, F> vcl_m1_native(ublas_m1.size1(), ublas_m1.size2());
+   viennacl::matrix<NumericT, F> vcl_m1_large(4 * ublas_m1.size1(), 4 * ublas_m1.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m1_range(vcl_m1_large,
+                                                                        viennacl::range(8, ublas_m1.size1() + 8),
+                                                                        viennacl::range(ublas_m1.size2(), 2 * ublas_m1.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m1_slice(vcl_m1_large,
+                                                                        viennacl::slice(6, 2, ublas_m1.size1()),
+                                                                        viennacl::slice(ublas_m1.size2(), 2, ublas_m1.size2()) );
+
+   viennacl::matrix<NumericT, F> vcl_m2_native(ublas_m2.size1(), ublas_m2.size2());
+   viennacl::matrix<NumericT, F> vcl_m2_large(4 * ublas_m2.size1(), 4 * ublas_m2.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m2_range(vcl_m2_large,
+                                                                        viennacl::range(8, ublas_m2.size1() + 8),
+                                                                        viennacl::range(ublas_m2.size2(), 2 * ublas_m2.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m2_slice(vcl_m2_large,
+                                                                        viennacl::slice(6, 2, ublas_m2.size1()),
+                                                                        viennacl::slice(ublas_m2.size2(), 2, ublas_m2.size2()) );
+
+
+/*   std::cout << "Matrix resizing (to larger)" << std::endl;
+   matrix.resize(2*num_rows, 2*num_cols, true);
+   for (unsigned int i = 0; i < matrix.size1(); ++i)
+   {
+      for (unsigned int j = (i<result.size() ? rhs.size() : 0); j < matrix.size2(); ++j)
+         matrix(i,j) = 0;
+   }
+   vcl_matrix.resize(2*num_rows, 2*num_cols, true);
+   viennacl::copy(vcl_matrix, matrix);
+   if( std::fabs(diff(matrix, vcl_matrix)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix resize (to larger)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(matrix, vcl_matrix)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   matrix(12, 14) = NumericT(1.9);
+   matrix(19, 16) = NumericT(1.0);
+   matrix (13, 15) =  NumericT(-9);
+   vcl_matrix(12, 14) = NumericT(1.9);
+   vcl_matrix(19, 16) = NumericT(1.0);
+   vcl_matrix (13, 15) =  NumericT(-9);
+
+   std::cout << "Matrix resizing (to smaller)" << std::endl;
+   matrix.resize(result.size(), rhs.size(), true);
+   vcl_matrix.resize(result.size(), rhs.size(), true);
+   if( std::fabs(diff(matrix, vcl_matrix)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix resize (to smaller)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(matrix, vcl_matrix)) << std::endl;
+      return EXIT_FAILURE;
+   }
+   */
+
+   //
+   // Run a bunch of tests for rank-1-updates, matrix-vector products
+   //
+   std::cout << "------------ Testing rank-1-updates and matrix-vector products ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_native, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_range, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_slice, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = full, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_native, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_range, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_slice, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = full, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_native, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_range, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_slice, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   ///////////////////////////// matrix_range
+
+   std::cout << "* m = range, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_native, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_range, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_slice, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = range, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_native, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_range, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_slice, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = range, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_native, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_range, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_slice, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   ///////////////////////////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_native, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_range, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_slice, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = slice, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_native, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_range, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_slice, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = slice, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_native, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_range, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_slice, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   //
+   // Testing triangular solve() routines
+   //
+
+   std::cout << "------------ Testing triangular solves ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_native, vcl_v1_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = full, v1 = range" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_native, vcl_v1_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = full, v1 = slice" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_native, vcl_v1_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   ///////// matrix_range
+
+
+   std::cout << "* m = range, v1 = full" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_range, vcl_v1_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = range, v1 = range" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_range, vcl_v1_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = range, v1 = slice" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_range, vcl_v1_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   //////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_slice, vcl_v1_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = slice, v1 = range" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_slice, vcl_v1_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = slice, v1 = slice" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_slice, vcl_v1_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+
+
+
+
+   ////////////// Final test for full LU decomposition:
+
+   //full solver:
+   std::cout << "Full solver" << std::endl;
+   unsigned int lu_dim = 100;
+   ublas::matrix<NumericT> square_matrix(lu_dim, lu_dim);
+   ublas::vector<NumericT> lu_rhs(lu_dim);
+   viennacl::matrix<NumericT, F> vcl_square_matrix(lu_dim, lu_dim);
+   viennacl::vector<NumericT> vcl_lu_rhs(lu_dim);
+
+   for (std::size_t i=0; i<lu_dim; ++i)
+     for (std::size_t j=0; j<lu_dim; ++j)
+       square_matrix(i,j) = -static_cast<NumericT>(0.5) * random<NumericT>();
+
+   //put some more weight on diagonal elements:
+   for (std::size_t j=0; j<lu_dim; ++j)
+   {
+     square_matrix(j,j) = static_cast<NumericT>(20.0) + random<NumericT>();
+     lu_rhs(j) = random<NumericT>();
+   }
+
+   viennacl::copy(square_matrix, vcl_square_matrix);
+   viennacl::copy(lu_rhs, vcl_lu_rhs);
+
+   //ublas::
+   ublas::lu_factorize(square_matrix);
+   ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
+   ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
+
+   // ViennaCL:
+   viennacl::linalg::lu_factorize(vcl_square_matrix);
+   //viennacl::copy(square_matrix, vcl_square_matrix);
+   viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
+
+   if( std::fabs(diff(lu_rhs, vcl_lu_rhs)) > epsilon )
+   {
+      std::cout << "# Error at operation: dense solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(lu_rhs, vcl_lu_rhs)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Matrix" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  layout: row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  layout: column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-11;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: row-major" << std::endl;
+         retval = test<NumericT, viennacl::row_major>(epsilon);
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-11;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: column-major" << std::endl;
+         retval = test<NumericT, viennacl::column_major>(epsilon);
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/matrix_vector.cu b/tests/src/matrix_vector.cu
new file mode 100644
index 0000000..90e3ac0
--- /dev/null
+++ b/tests/src/matrix_vector.cu
@@ -0,0 +1,1146 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+#include "examples/tutorial/Random.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> const & mat1, VCLMatrixType const & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+//
+// -------------------------------------------------------------
+//
+
+template <typename NumericT, typename Epsilon,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1, typename VCLVectorType2>
+int test_prod_rank1(Epsilon const & epsilon,
+                    UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1, UblasVectorType & ublas_v2, UblasMatrixType & ublas_m2,
+                    VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1, VCLVectorType2 & vcl_v2, VCLMatrixType & vcl_m2)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data:
+   ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), NumericT(0.1234));
+   ublas_v2 = ublas::scalar_vector<NumericT>(ublas_v2.size(), NumericT(0.4321));
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Rank 1 update" << std::endl;
+
+   ublas_m1 += ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   if( std::fabs(diff(ublas_m1, vcl_m1)) > epsilon )
+   {
+      std::cout << "# Error at operation: rank 1 update" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_m1, vcl_m1)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+
+
+   // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - CPU Scalar" << std::endl;
+   ublas_m1 += NumericT(4.2) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += NumericT(2.1) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * NumericT(2.1);  //check proper compilation
+   if( std::fabs(diff(ublas_m1, vcl_m1)) > epsilon )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - CPU Scalar" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_m1, vcl_m1)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+      // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - GPU Scalar" << std::endl;
+   ublas_m1 += NumericT(4.2) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::scalar<NumericT>(NumericT(2.1)) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * viennacl::scalar<NumericT>(NumericT(2.1));  //check proper compilation
+   if( std::fabs(diff(ublas_m1, vcl_m1)) > epsilon )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - GPU Scalar" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_m1, vcl_m1)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   //reset vcl_matrix:
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product" << std::endl;
+   ublas_v1 = viennacl::linalg::prod(ublas_m1, ublas_v2);
+   vcl_v1   = viennacl::linalg::prod(vcl_m1, vcl_v2);
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled add" << std::endl;
+   NumericT alpha = static_cast<NumericT>(2.786);
+   NumericT beta = static_cast<NumericT>(1.432);
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   ublas_v1 = alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) + beta * ublas_v1;
+   vcl_v1   = alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) + beta * vcl_v1;
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Transposed Matrix-Vector product" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1);
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2;
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Row extraction from matrix" << std::endl;
+   ublas_v2 = row(ublas_m1, std::size_t(7));
+   vcl_v2   = row(vcl_m1, std::size_t(7));
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: diagonal extraction from matrix" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Column extraction from matrix" << std::endl;
+   ublas_v1 = column(ublas_m1, std::size_t(7));
+   vcl_v1   = column(vcl_m1, std::size_t(7));
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: diagonal extraction from matrix" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m2, vcl_m2);
+   UblasMatrixType A = ublas_m2;
+
+   std::cout << "Diagonal extraction from matrix" << std::endl;
+   for (std::size_t i=0; i<ublas_m1.size2(); ++i)
+     ublas_v2[i] = ublas_m1(i + 3, i);
+   vcl_v2   = diag(vcl_m1, static_cast<int>(-3));
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: diagonal extraction from matrix" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Matrix diagonal assignment from vector" << std::endl;
+   A = ublas::scalar_matrix<NumericT>(A.size1(), A.size2(), NumericT(0));
+   for (std::size_t i=0; i<ublas_m1.size2(); ++i)
+     A(i + (A.size1() - ublas_m1.size2()), i) = ublas_v2[i];
+   vcl_m2 = diag(vcl_v2, static_cast<int>(ublas_m1.size2()) - static_cast<int>(A.size1()));
+
+   if( std::fabs(diff(A, vcl_m2)) > epsilon )
+   {
+      std::cout << "# Error at operation: Matrix assignment from diagonal" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(A, vcl_m2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   return retval;
+}
+
+
+
+template <typename NumericT, typename Epsilon,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1>
+int test_solve(Epsilon const & epsilon,
+               UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1,
+               VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data:
+   //viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v1, vcl_v1);
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   /////////////////// test direct solvers ////////////////////////////
+
+   //upper triangular:
+   std::cout << "Upper triangular solver" << std::endl;
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::upper_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //upper unit triangular:
+   std::cout << "Upper unit triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::unit_upper_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::unit_upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //lower triangular:
+   std::cout << "Lower triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::lower_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //lower unit triangular:
+   std::cout << "Lower unit triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(ublas_m1, ublas_v1, ublas::unit_lower_tag());
+   vcl_v1 = viennacl::linalg::solve(vcl_m1, vcl_v1, viennacl::linalg::unit_lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+
+
+
+
+   //transposed upper triangular:
+   std::cout << "Transposed upper triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::upper_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //transposed upper unit triangular:
+   std::cout << "Transposed unit upper triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::unit_upper_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::unit_upper_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit upper triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //transposed lower triangular:
+   std::cout << "Transposed lower triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::lower_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   //transposed lower unit triangular:
+   std::cout << "Transposed unit lower triangular solver" << std::endl;
+   viennacl::copy(ublas_v1, vcl_v1);
+   ublas_v1 = ublas::solve(trans(ublas_m1), ublas_v1, ublas::unit_lower_tag());
+   vcl_v1 = viennacl::linalg::solve(trans(vcl_m1), vcl_v1, viennacl::linalg::unit_lower_tag());
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: unit lower triangular solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename F, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+   int retval = EXIT_SUCCESS;
+
+   std::size_t num_rows = 141; //note: use num_rows > num_cols + 3 for diag() tests to work
+   std::size_t num_cols = 103;
+
+   // --------------------------------------------------------------------------
+   ublas::vector<NumericT> ublas_v1(num_rows);
+   for (std::size_t i = 0; i < ublas_v1.size(); ++i)
+     ublas_v1(i) = random<NumericT>();
+   ublas::vector<NumericT> ublas_v2 = ublas::scalar_vector<NumericT>(num_cols, NumericT(3.1415));
+
+
+   ublas::matrix<NumericT> ublas_m1(ublas_v1.size(), ublas_v2.size());
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+      for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+         ublas_m1(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+
+
+   ublas::matrix<NumericT> ublas_m2(ublas_v1.size(), ublas_v1.size());
+
+   for (std::size_t i = 0; i < ublas_m2.size1(); ++i)
+   {
+      for (std::size_t j = 0; j < ublas_m2.size2(); ++j)
+         ublas_m2(i,j) = static_cast<NumericT>(-0.1) * random<NumericT>();
+      ublas_m2(i, i) = static_cast<NumericT>(2) + random<NumericT>();
+   }
+
+
+   viennacl::vector<NumericT> vcl_v1_native(ublas_v1.size());
+   viennacl::vector<NumericT> vcl_v1_large(4 * ublas_v1.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v1_range(vcl_v1_large, viennacl::range(3, ublas_v1.size() + 3));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v1_slice(vcl_v1_large, viennacl::slice(2, 3, ublas_v1.size()));
+
+   viennacl::vector<NumericT> vcl_v2_native(ublas_v2.size());
+   viennacl::vector<NumericT> vcl_v2_large(4 * ublas_v2.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v2_range(vcl_v2_large, viennacl::range(8, ublas_v2.size() + 8));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v2_slice(vcl_v2_large, viennacl::slice(6, 2, ublas_v2.size()));
+
+   viennacl::matrix<NumericT, F> vcl_m1_native(ublas_m1.size1(), ublas_m1.size2());
+   viennacl::matrix<NumericT, F> vcl_m1_large(4 * ublas_m1.size1(), 4 * ublas_m1.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m1_range(vcl_m1_large,
+                                                                        viennacl::range(8, ublas_m1.size1() + 8),
+                                                                        viennacl::range(ublas_m1.size2(), 2 * ublas_m1.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m1_slice(vcl_m1_large,
+                                                                        viennacl::slice(6, 2, ublas_m1.size1()),
+                                                                        viennacl::slice(ublas_m1.size2(), 2, ublas_m1.size2()) );
+
+   viennacl::matrix<NumericT, F> vcl_m2_native(ublas_m2.size1(), ublas_m2.size2());
+   viennacl::matrix<NumericT, F> vcl_m2_large(4 * ublas_m2.size1(), 4 * ublas_m2.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m2_range(vcl_m2_large,
+                                                                        viennacl::range(8, ublas_m2.size1() + 8),
+                                                                        viennacl::range(ublas_m2.size2(), 2 * ublas_m2.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m2_slice(vcl_m2_large,
+                                                                        viennacl::slice(6, 2, ublas_m2.size1()),
+                                                                        viennacl::slice(ublas_m2.size2(), 2, ublas_m2.size2()) );
+
+
+/*   std::cout << "Matrix resizing (to larger)" << std::endl;
+   matrix.resize(2*num_rows, 2*num_cols, true);
+   for (unsigned int i = 0; i < matrix.size1(); ++i)
+   {
+      for (unsigned int j = (i<result.size() ? rhs.size() : 0); j < matrix.size2(); ++j)
+         matrix(i,j) = 0;
+   }
+   vcl_matrix.resize(2*num_rows, 2*num_cols, true);
+   viennacl::copy(vcl_matrix, matrix);
+   if( std::fabs(diff(matrix, vcl_matrix)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix resize (to larger)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(matrix, vcl_matrix)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   matrix(12, 14) = NumericT(1.9);
+   matrix(19, 16) = NumericT(1.0);
+   matrix (13, 15) =  NumericT(-9);
+   vcl_matrix(12, 14) = NumericT(1.9);
+   vcl_matrix(19, 16) = NumericT(1.0);
+   vcl_matrix (13, 15) =  NumericT(-9);
+
+   std::cout << "Matrix resizing (to smaller)" << std::endl;
+   matrix.resize(result.size(), rhs.size(), true);
+   vcl_matrix.resize(result.size(), rhs.size(), true);
+   if( std::fabs(diff(matrix, vcl_matrix)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix resize (to smaller)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(matrix, vcl_matrix)) << std::endl;
+      return EXIT_FAILURE;
+   }
+   */
+
+   //
+   // Run a bunch of tests for rank-1-updates, matrix-vector products
+   //
+   std::cout << "------------ Testing rank-1-updates and matrix-vector products ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_native, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_range, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_slice, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = full, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_native, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_range, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_slice, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = full, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_native, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_range, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_slice, vcl_m2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   ///////////////////////////// matrix_range
+
+   std::cout << "* m = range, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_native, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_range, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_slice, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = range, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_native, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_range, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_slice, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = range, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_native, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_range, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_slice, vcl_m2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   ///////////////////////////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_native, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_range, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_slice, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = slice, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_native, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_range, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_slice, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = slice, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_native, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_range, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2, ublas_m2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_slice, vcl_m2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   //
+   // Testing triangular solve() routines
+   //
+
+   std::cout << "------------ Testing triangular solves ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_native, vcl_v1_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = full, v1 = range" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_native, vcl_v1_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = full, v1 = slice" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_native, vcl_v1_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   ///////// matrix_range
+
+
+   std::cout << "* m = range, v1 = full" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_range, vcl_v1_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = range, v1 = range" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_range, vcl_v1_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = range, v1 = slice" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_range, vcl_v1_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   //////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_slice, vcl_v1_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = slice, v1 = range" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_slice, vcl_v1_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   std::cout << "* m = slice, v1 = slice" << std::endl;
+   retval = test_solve<NumericT>(epsilon,
+                                 ublas_m2, ublas_v1,
+                                 vcl_m2_slice, vcl_v1_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+
+
+
+
+   ////////////// Final test for full LU decomposition:
+
+   //full solver:
+   std::cout << "Full solver" << std::endl;
+   unsigned int lu_dim = 100;
+   ublas::matrix<NumericT> square_matrix(lu_dim, lu_dim);
+   ublas::vector<NumericT> lu_rhs(lu_dim);
+   viennacl::matrix<NumericT, F> vcl_square_matrix(lu_dim, lu_dim);
+   viennacl::vector<NumericT> vcl_lu_rhs(lu_dim);
+
+   for (std::size_t i=0; i<lu_dim; ++i)
+     for (std::size_t j=0; j<lu_dim; ++j)
+       square_matrix(i,j) = -static_cast<NumericT>(0.5) * random<NumericT>();
+
+   //put some more weight on diagonal elements:
+   for (std::size_t j=0; j<lu_dim; ++j)
+   {
+     square_matrix(j,j) = static_cast<NumericT>(20.0) + random<NumericT>();
+     lu_rhs(j) = random<NumericT>();
+   }
+
+   viennacl::copy(square_matrix, vcl_square_matrix);
+   viennacl::copy(lu_rhs, vcl_lu_rhs);
+
+   //ublas::
+   ublas::lu_factorize(square_matrix);
+   ublas::inplace_solve (square_matrix, lu_rhs, ublas::unit_lower_tag ());
+   ublas::inplace_solve (square_matrix, lu_rhs, ublas::upper_tag ());
+
+   // ViennaCL:
+   viennacl::linalg::lu_factorize(vcl_square_matrix);
+   //viennacl::copy(square_matrix, vcl_square_matrix);
+   viennacl::linalg::lu_substitute(vcl_square_matrix, vcl_lu_rhs);
+
+   if( std::fabs(diff(lu_rhs, vcl_lu_rhs)) > epsilon )
+   {
+      std::cout << "# Error at operation: dense solver" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(lu_rhs, vcl_lu_rhs)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+// Entry point: runs the dense-matrix test driver for float (row- and
+// column-major) and, when supported, double (row- and column-major).
+// Returns EXIT_SUCCESS only if every configuration passes; the first
+// failing configuration aborts the run with its return code.
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Matrix" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      // float, row-major: coarse tolerance (1e-3) appropriate for single precision
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  layout: row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      // float, column-major
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  layout: column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+
+   // Double-precision tests: with the OpenCL backend they run only if the
+   // current device advertises double support; without VIENNACL_WITH_OPENCL
+   // the guard compiles away and the block always executes.
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         // double, row-major: much tighter tolerance (1e-11)
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-11;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: row-major" << std::endl;
+         retval = test<NumericT, viennacl::row_major>(epsilon);
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+      {
+         // double, column-major
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-11;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: column-major" << std::endl;
+         retval = test<NumericT, viennacl::column_major>(epsilon);
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/matrix_vector_int.cpp b/tests/src/matrix_vector_int.cpp
new file mode 100644
index 0000000..67ae5fd
--- /dev/null
+++ b/tests/src/matrix_vector_int.cpp
@@ -0,0 +1,823 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+#include "examples/tutorial/Random.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+// Exact comparison of a host scalar against a ViennaCL (device) scalar.
+// Returns 1 on mismatch, 0 on equality.  This is the integer test suite,
+// so exact equality (no epsilon) is intentional.
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();  // drain the device queue before reading s2
+   if (s1 != s2)
+      return 1;
+   return 0;
+}
+
+// Element-wise exact comparison of a host uBLAS vector with a ViennaCL
+// vector (or vector_range/vector_slice).  Returns 1 on the first mismatch,
+// 0 otherwise.  Exact equality is intentional: this is the integer test.
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   // NOTE(review): the loop bound is v1.size() but indexes v2_cpu, which has
+   // v2.size() elements -- this assumes both vectors are equally sized (true
+   // for all callers in this file); otherwise it would read out of bounds.
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if (v2_cpu[i] != v1[i])
+        return 1;
+   }
+
+   return 0;
+}
+
+// Element-wise exact comparison of a host uBLAS matrix with a ViennaCL
+// matrix (or matrix_range/matrix_slice).  Returns 1 on the first mismatch,
+// 0 otherwise.  Exact equality is intentional for the integer test.
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> const & mat1, VCLMatrixType const & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+
+    // NOTE(review): iterates over mat2_cpu's dimensions while indexing mat1
+    // -- assumes both matrices have identical dimensions (true for all
+    // callers here); a larger mat2 would index mat1 out of range.
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         if (mat2_cpu(i,j) != mat1(i,j))
+           return 1;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+
+// Exercises rank-1 updates (plain, and scaled by CPU and GPU scalars),
+// matrix-vector products, and transposed matrix-vector products -- each
+// with and without a scaled add -- comparing ViennaCL results against the
+// uBLAS reference via the exact diff() helpers above.  The ViennaCL
+// operands may be full objects, ranges, or slices (hence the separate
+// VCLVectorType1/VCLVectorType2 template parameters).
+// Returns EXIT_SUCCESS, or EXIT_FAILURE on the first mismatch (rank-1
+// failures return immediately; product failures record and continue).
+template <typename NumericT,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1, typename VCLVectorType2>
+int test_prod_rank1(UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1, UblasVectorType & ublas_v2,
+                    VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1, VCLVectorType2 & vcl_v2)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data: fill the host vectors with known constants and push vectors
+   // and matrix to the device so both sides start from identical state
+   ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), NumericT(2));
+   ublas_v2 = ublas::scalar_vector<NumericT>(ublas_v2.size(), NumericT(3));
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Rank 1 update" << std::endl;
+
+   ublas_m1 += ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   if( diff(ublas_m1, vcl_m1) != 0 )
+   {
+      std::cout << "# Error at operation: rank 1 update" << std::endl;
+      std::cout << "  diff: " << diff(ublas_m1, vcl_m1) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+
+
+   // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - CPU Scalar" << std::endl;
+   // host adds 4*outer_prod in one step; the device adds it as 2*op + op*2,
+   // which checks both scalar-on-the-left and scalar-on-the-right expressions
+   ublas_m1 += NumericT(4) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += NumericT(2) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * NumericT(2);  //check proper compilation
+   if( diff(ublas_m1, vcl_m1) != 0 )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - CPU Scalar" << std::endl;
+      std::cout << "  diff: " << diff(ublas_m1, vcl_m1) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+      // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - GPU Scalar" << std::endl;
+   // same 4 == 2 + 2 split as above, but with a device-resident scalar factor
+   ublas_m1 += NumericT(4) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::scalar<NumericT>(2) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * viennacl::scalar<NumericT>(2);  //check proper compilation
+   if( diff(ublas_m1, vcl_m1) != 0 )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - GPU Scalar" << std::endl;
+      std::cout << "  diff: " << diff(ublas_m1, vcl_m1) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   //reset vcl_m1 from the host reference before the product tests:
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product" << std::endl;
+   // viennacl::linalg::prod is used on the uBLAS side too -- presumably it
+   // forwards to the uBLAS implementation under VIENNACL_WITH_UBLAS
+   ublas_v1 = viennacl::linalg::prod(ublas_m1, ublas_v2);
+   vcl_v1   = viennacl::linalg::prod(vcl_m1, vcl_v2);
+
+   if( diff(ublas_v1, vcl_v1) != 0 )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v1, vcl_v1) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled add" << std::endl;
+   NumericT alpha = static_cast<NumericT>(2);
+   NumericT beta = static_cast<NumericT>(3);
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   // y = alpha * A * x + beta * y
+   ublas_v1 = alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) + beta * ublas_v1;
+   vcl_v1   = alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) + beta * vcl_v1;
+
+   if( diff(ublas_v1, vcl_v1) != 0 )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v1, vcl_v1) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Transposed Matrix-Vector product" << std::endl;
+   // x = alpha * A^T * y
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1);
+
+   if( diff(ublas_v2, vcl_v2) != 0 )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v2, vcl_v2) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
+   // x = alpha * A^T * y + beta * x
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2;
+
+   if( diff(ublas_v2, vcl_v2) != 0 )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v2, vcl_v2) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename F>
+int test()
+{
+   int retval = EXIT_SUCCESS;
+
+   std::size_t num_rows = 141;
+   std::size_t num_cols = 103;
+
+   // --------------------------------------------------------------------------
+   ublas::vector<NumericT> ublas_v1(num_rows);
+   for (std::size_t i = 0; i < ublas_v1.size(); ++i)
+     ublas_v1(i) = NumericT(i);
+   ublas::vector<NumericT> ublas_v2 = ublas::scalar_vector<NumericT>(num_cols, NumericT(3));
+
+
+   ublas::matrix<NumericT> ublas_m1(ublas_v1.size(), ublas_v2.size());
+   ublas::matrix<NumericT> ublas_m2(ublas_v1.size(), ublas_v1.size());
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   for (std::size_t i = 0; i < ublas_m2.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m2.size2(); ++j)
+       ublas_m2(i,j) = NumericT(j - i*j + i);
+
+
+   viennacl::vector<NumericT> vcl_v1_native(ublas_v1.size());
+   viennacl::vector<NumericT> vcl_v1_large(4 * ublas_v1.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v1_range(vcl_v1_large, viennacl::range(3, ublas_v1.size() + 3));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v1_slice(vcl_v1_large, viennacl::slice(2, 3, ublas_v1.size()));
+
+   viennacl::vector<NumericT> vcl_v2_native(ublas_v2.size());
+   viennacl::vector<NumericT> vcl_v2_large(4 * ublas_v2.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v2_range(vcl_v2_large, viennacl::range(8, ublas_v2.size() + 8));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v2_slice(vcl_v2_large, viennacl::slice(6, 2, ublas_v2.size()));
+
+   viennacl::matrix<NumericT, F> vcl_m1_native(ublas_m1.size1(), ublas_m1.size2());
+   viennacl::matrix<NumericT, F> vcl_m1_large(4 * ublas_m1.size1(), 4 * ublas_m1.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m1_range(vcl_m1_large,
+                                                                        viennacl::range(8, ublas_m1.size1() + 8),
+                                                                        viennacl::range(ublas_m1.size2(), 2 * ublas_m1.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m1_slice(vcl_m1_large,
+                                                                        viennacl::slice(6, 2, ublas_m1.size1()),
+                                                                        viennacl::slice(ublas_m1.size2(), 2, ublas_m1.size2()) );
+
+   viennacl::matrix<NumericT, F> vcl_m2_native(ublas_m2.size1(), ublas_m2.size2());
+   viennacl::matrix<NumericT, F> vcl_m2_large(4 * ublas_m2.size1(), 4 * ublas_m2.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m2_range(vcl_m2_large,
+                                                                        viennacl::range(8, ublas_m2.size1() + 8),
+                                                                        viennacl::range(ublas_m2.size2(), 2 * ublas_m2.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m2_slice(vcl_m2_large,
+                                                                        viennacl::slice(6, 2, ublas_m2.size1()),
+                                                                        viennacl::slice(ublas_m2.size2(), 2, ublas_m2.size2()) );
+
+
+   //
+   // Run a bunch of tests for rank-1-updates, matrix-vector products
+   //
+   std::cout << "------------ Testing rank-1-updates and matrix-vector products ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = range
+
+
+   std::cout << "* m = full, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   // v1 = slice
+
+   std::cout << "* m = full, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   ///////////////////////////// matrix_range
+
+   std::cout << "* m = range, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = range
+
+
+   std::cout << "* m = range, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   // v1 = slice
+
+   std::cout << "* m = range, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   ///////////////////////////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = range
+
+
+   std::cout << "* m = slice, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = slice
+
+   std::cout << "* m = slice, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Matrix" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef int NumericT;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  numeric: int" << std::endl;
+      std::cout << "  layout: row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major>();
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef int NumericT;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  numeric: int" << std::endl;
+      std::cout << "  layout: column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major>();
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef long NumericT;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  numeric: long" << std::endl;
+         std::cout << "  layout: row-major" << std::endl;
+         retval = test<NumericT, viennacl::row_major>();
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+      {
+         typedef long NumericT;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  numeric: long" << std::endl;
+         std::cout << "  layout: column-major" << std::endl;
+         retval = test<NumericT, viennacl::column_major>();
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/matrix_vector_int.cu b/tests/src/matrix_vector_int.cu
new file mode 100644
index 0000000..67ae5fd
--- /dev/null
+++ b/tests/src/matrix_vector_int.cu
@@ -0,0 +1,823 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+#include "examples/tutorial/Random.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return 1;
+   return 0;
+}
+
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if (v2_cpu[i] != v1[i])
+        return 1;
+   }
+
+   return 0;
+}
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> const & mat1, VCLMatrixType const & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         if (mat2_cpu(i,j) != mat1(i,j))
+           return 1;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+
+template <typename NumericT,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1, typename VCLVectorType2>
+int test_prod_rank1(UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1, UblasVectorType & ublas_v2,
+                    VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1, VCLVectorType2 & vcl_v2)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data:
+   ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), NumericT(2));
+   ublas_v2 = ublas::scalar_vector<NumericT>(ublas_v2.size(), NumericT(3));
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Rank 1 update" << std::endl;
+
+   ublas_m1 += ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   if( diff(ublas_m1, vcl_m1) != 0 )
+   {
+      std::cout << "# Error at operation: rank 1 update" << std::endl;
+      std::cout << "  diff: " << diff(ublas_m1, vcl_m1) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+
+
+   // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - CPU Scalar" << std::endl;
+   ublas_m1 += NumericT(4) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += NumericT(2) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * NumericT(2);  //check proper compilation
+   if( diff(ublas_m1, vcl_m1) != 0 )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - CPU Scalar" << std::endl;
+      std::cout << "  diff: " << diff(ublas_m1, vcl_m1) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+      // --------------------------------------------------------------------------
+   std::cout << "Scaled rank 1 update - GPU Scalar" << std::endl;
+   ublas_m1 += NumericT(4) * ublas::outer_prod(ublas_v1, ublas_v2);
+   vcl_m1 += viennacl::scalar<NumericT>(2) * viennacl::linalg::outer_prod(vcl_v1, vcl_v2);
+   vcl_m1 += viennacl::linalg::outer_prod(vcl_v1, vcl_v2) * viennacl::scalar<NumericT>(2);  //check proper compilation
+   if( diff(ublas_m1, vcl_m1) != 0 )
+   {
+      std::cout << "# Error at operation: scaled rank 1 update - GPU Scalar" << std::endl;
+      std::cout << "  diff: " << diff(ublas_m1, vcl_m1) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   //reset vcl_matrix:
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product" << std::endl;
+   ublas_v1 = viennacl::linalg::prod(ublas_m1, ublas_v2);
+   vcl_v1   = viennacl::linalg::prod(vcl_m1, vcl_v2);
+
+   if( diff(ublas_v1, vcl_v1) != 0 )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v1, vcl_v1) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled add" << std::endl;
+   NumericT alpha = static_cast<NumericT>(2);
+   NumericT beta = static_cast<NumericT>(3);
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   ublas_v1 = alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) + beta * ublas_v1;
+   vcl_v1   = alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) + beta * vcl_v1;
+
+   if( diff(ublas_v1, vcl_v1) != 0 )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v1, vcl_v1) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Transposed Matrix-Vector product" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1);
+
+   if( diff(ublas_v2, vcl_v2) != 0 )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v2, vcl_v2) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   vcl_v2   = alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2;
+
+   if( diff(ublas_v2, vcl_v2) != 0 )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << diff(ublas_v2, vcl_v2) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   // --------------------------------------------------------------------------
+
+   return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename F>
+int test()
+{
+   int retval = EXIT_SUCCESS;
+
+   std::size_t num_rows = 141;
+   std::size_t num_cols = 103;
+
+   // --------------------------------------------------------------------------
+   ublas::vector<NumericT> ublas_v1(num_rows);
+   for (std::size_t i = 0; i < ublas_v1.size(); ++i)
+     ublas_v1(i) = NumericT(i);
+   ublas::vector<NumericT> ublas_v2 = ublas::scalar_vector<NumericT>(num_cols, NumericT(3));
+
+
+   ublas::matrix<NumericT> ublas_m1(ublas_v1.size(), ublas_v2.size());
+   ublas::matrix<NumericT> ublas_m2(ublas_v1.size(), ublas_v1.size());
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   for (std::size_t i = 0; i < ublas_m2.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m2.size2(); ++j)
+       ublas_m2(i,j) = NumericT(j - i*j + i);
+
+
+   viennacl::vector<NumericT> vcl_v1_native(ublas_v1.size());
+   viennacl::vector<NumericT> vcl_v1_large(4 * ublas_v1.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v1_range(vcl_v1_large, viennacl::range(3, ublas_v1.size() + 3));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v1_slice(vcl_v1_large, viennacl::slice(2, 3, ublas_v1.size()));
+
+   viennacl::vector<NumericT> vcl_v2_native(ublas_v2.size());
+   viennacl::vector<NumericT> vcl_v2_large(4 * ublas_v2.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v2_range(vcl_v2_large, viennacl::range(8, ublas_v2.size() + 8));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v2_slice(vcl_v2_large, viennacl::slice(6, 2, ublas_v2.size()));
+
+   viennacl::matrix<NumericT, F> vcl_m1_native(ublas_m1.size1(), ublas_m1.size2());
+   viennacl::matrix<NumericT, F> vcl_m1_large(4 * ublas_m1.size1(), 4 * ublas_m1.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m1_range(vcl_m1_large,
+                                                                        viennacl::range(8, ublas_m1.size1() + 8),
+                                                                        viennacl::range(ublas_m1.size2(), 2 * ublas_m1.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m1_slice(vcl_m1_large,
+                                                                        viennacl::slice(6, 2, ublas_m1.size1()),
+                                                                        viennacl::slice(ublas_m1.size2(), 2, ublas_m1.size2()) );
+
+   viennacl::matrix<NumericT, F> vcl_m2_native(ublas_m2.size1(), ublas_m2.size2());
+   viennacl::matrix<NumericT, F> vcl_m2_large(4 * ublas_m2.size1(), 4 * ublas_m2.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m2_range(vcl_m2_large,
+                                                                        viennacl::range(8, ublas_m2.size1() + 8),
+                                                                        viennacl::range(ublas_m2.size2(), 2 * ublas_m2.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m2_slice(vcl_m2_large,
+                                                                        viennacl::slice(6, 2, ublas_m2.size1()),
+                                                                        viennacl::slice(ublas_m2.size2(), 2, ublas_m2.size2()) );
+
+
+   //
+   // Run a bunch of tests for rank-1-updates, matrix-vector products
+   //
+   std::cout << "------------ Testing rank-1-updates and matrix-vector products ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = range
+
+
+   std::cout << "* m = full, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   // v1 = slice
+
+   std::cout << "* m = full, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = full, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   ///////////////////////////// matrix_range
+
+   std::cout << "* m = range, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = range
+
+
+   std::cout << "* m = range, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   // v1 = slice
+
+   std::cout << "* m = range, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = range, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   ///////////////////////////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = range
+
+
+   std::cout << "* m = slice, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   // v1 = slice
+
+   std::cout << "* m = slice, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+    for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+      ublas_m1(i,j) = NumericT(i+j);
+
+   std::cout << "* m = slice, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Matrix" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef int NumericT;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  numeric: int" << std::endl;
+      std::cout << "  layout: row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major>();
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef int NumericT;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  numeric: int" << std::endl;
+      std::cout << "  layout: column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major>();
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef long NumericT;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: row-major" << std::endl;
+         retval = test<NumericT, viennacl::row_major>();
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+      {
+         typedef long NumericT;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: column-major" << std::endl;
+         retval = test<NumericT, viennacl::column_major>();
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/nmf.cpp b/tests/src/nmf.cpp
new file mode 100644
index 0000000..5be2b4d
--- /dev/null
+++ b/tests/src/nmf.cpp
@@ -0,0 +1,120 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <ctime>
+#include <cmath>
+
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/nmf.hpp"
+
+typedef float ScalarType;
+
+const ScalarType EPS = ScalarType(0.1);
+
+float matrix_compare(viennacl::matrix<ScalarType>& res,
+                     viennacl::matrix<ScalarType>& ref)
+{
+    std::vector<ScalarType> res_std(res.internal_size());
+    std::vector<ScalarType> ref_std(ref.internal_size());
+
+    viennacl::fast_copy(res, &res_std[0]);
+    viennacl::fast_copy(ref, &ref_std[0]);
+
+    float diff = 0.0;
+    float mx = 0.0;
+
+    for(std::size_t i = 0; i < res_std.size(); i++) {
+        diff = std::max(diff, std::abs(res_std[i] - ref_std[i]));
+        mx = std::max(mx, res_std[i]);
+    }
+
+    return diff / mx;
+}
+
+
+void fill_random(std::vector< std::vector<ScalarType> >& v)
+{
+    for(std::size_t i = 0; i < v.size(); i++)
+    {
+      for (std::size_t j = 0; j < v[i].size(); ++j)
+        v[i][j] = static_cast<ScalarType>(rand()) / RAND_MAX;
+    }
+}
+
+
+void test_nmf(std::size_t m, std::size_t k, std::size_t n)
+{
+    std::vector< std::vector<ScalarType> > stl_w(m, std::vector<ScalarType>(k));
+    std::vector< std::vector<ScalarType> > stl_h(k, std::vector<ScalarType>(n));
+
+    viennacl::matrix<ScalarType> v_ref(m, n);
+    viennacl::matrix<ScalarType> w_ref(m, k);
+    viennacl::matrix<ScalarType> h_ref(k, n);
+
+    fill_random(stl_w);
+    fill_random(stl_h);
+
+    viennacl::copy(stl_w, w_ref);
+    viennacl::copy(stl_h, h_ref);
+
+    v_ref = viennacl::linalg::prod(w_ref, h_ref);  //reference
+
+    // Fill again with random numbers:
+    fill_random(stl_w);
+    fill_random(stl_h);
+
+    viennacl::matrix<ScalarType> w_nmf(m, k);
+    viennacl::matrix<ScalarType> h_nmf(k, n);
+
+    viennacl::copy(stl_w, w_nmf);
+    viennacl::copy(stl_h, h_nmf);
+
+    viennacl::linalg::nmf_config conf;
+    conf.print_relative_error(true);
+    conf.max_iterations(5000); //5000 iterations are enough for the test
+    viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, conf);
+
+    viennacl::matrix<ScalarType> v_nmf = viennacl::linalg::prod(w_nmf, h_nmf);
+
+    float diff  = matrix_compare(v_ref, v_nmf);
+    bool diff_ok = fabs(diff) < EPS;
+
+    long iterations = static_cast<long>(conf.iters());
+    printf("%6s [%lux%lux%lu] diff = %.6f (%ld iterations)\n", diff_ok ? "[[OK]]":"[FAIL]", m, k, n, diff, iterations);
+
+    if (!diff_ok)
+      exit(EXIT_FAILURE);
+}
+
+int main()
+{
+  //srand(time(NULL));  //let's use deterministic tests, so keep the default srand() initialization
+
+  test_nmf(3, 3, 3);
+  test_nmf(3, 2, 3);
+  test_nmf(16, 7, 12);
+  test_nmf(140, 73, 180);
+  test_nmf(427, 21, 523);
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
diff --git a/tests/src/qr_method.cpp b/tests/src/qr_method.cpp
new file mode 100644
index 0000000..b7cf072
--- /dev/null
+++ b/tests/src/qr_method.cpp
@@ -0,0 +1,277 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/*
+Solutions for testdata were generated with Scilab line:
+
+M=fscanfMat('nsm1.example');e=spec(M);e=gsort(e);rr=real(e);ii=imag(e);e=cat(1, rr, ii); s=strcat(string(e), ' ');write('tmp', s);
+*/
+
+#ifndef NDEBUG
+  #define NDEBUG
+#endif
+
+//#define VIENNACL_DEBUG_ALL
+#include <iostream>
+#include <fstream>
+#include <stdexcept>
+#include <vector>
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/qr-method.hpp"
+
+#include <examples/benchmarks/benchmark-utils.hpp>
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+
+namespace ublas = boost::numeric::ublas;
+
+typedef float ScalarType;
+
+const ScalarType EPS = 0.0001f;
+
+void read_matrix_size(std::fstream& f, std::size_t& sz)
+{
+    if(!f.is_open())
+    {
+        throw std::invalid_argument("File is not opened");
+    }
+
+    f >> sz;
+}
+
+void read_matrix_body(std::fstream& f, viennacl::matrix<ScalarType>& A)
+{
+    if(!f.is_open())
+    {
+        throw std::invalid_argument("File is not opened");
+    }
+
+    boost::numeric::ublas::matrix<ScalarType> h_A(A.size1(), A.size2());
+
+    for(std::size_t i = 0; i < h_A.size1(); i++) {
+        for(std::size_t j = 0; j < h_A.size2(); j++) {
+            ScalarType val = 0.0;
+            f >> val;
+            h_A(i, j) = val;
+        }
+    }
+
+    viennacl::copy(h_A, A);
+}
+
+void read_vector_body(std::fstream& f, ublas::vector<ScalarType>& v) {
+    if(!f.is_open())
+        throw std::invalid_argument("File is not opened");
+
+    for(std::size_t i = 0; i < v.size(); i++)
+    {
+            ScalarType val = 0.0;
+            f >> val;
+            v[i] = val;
+    }
+}
+
+bool check_tridiag(viennacl::matrix<ScalarType>& A_orig)
+{
+    ublas::matrix<ScalarType> A(A_orig.size1(), A_orig.size2());
+    viennacl::copy(A_orig, A);
+
+    for (unsigned int i = 0; i < A.size1(); i++) {
+        for (unsigned int j = 0; j < A.size2(); j++) {
+            if ((std::abs(A(i, j)) > EPS) && ((i - 1) != j) && (i != j) && ((i + 1) != j))
+            {
+                // std::cout << "Failed at " << i << " " << j << " " << A(i, j) << "\n";
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+bool check_hessenberg(viennacl::matrix<ScalarType>& A_orig)
+{
+    ublas::matrix<ScalarType> A(A_orig.size1(), A_orig.size2());
+    viennacl::copy(A_orig, A);
+
+    for (std::size_t i = 0; i < A.size1(); i++) {
+        for (std::size_t j = 0; j < A.size2(); j++) {
+            if ((std::abs(A(i, j)) > EPS) && (i > (j + 1)))
+            {
+                // std::cout << "Failed at " << i << " " << j << " " << A(i, j) << "\n";
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+ScalarType matrix_compare(ublas::matrix<ScalarType>& res,
+                            ublas::matrix<ScalarType>& ref)
+{
+    ScalarType diff = 0.0;
+    ScalarType mx = 0.0;
+
+    for(std::size_t i = 0; i < res.size1(); i++)
+    {
+        for(std::size_t j = 0; j < res.size2(); j++)
+        {
+            diff = std::max(diff, std::abs(res(i, j) - ref(i, j)));
+            mx = std::max(mx, res(i, j));
+        }
+    }
+
+    return diff / mx;
+}
+
+ScalarType vector_compare(ublas::vector<ScalarType>& res,
+                          ublas::vector<ScalarType>& ref)
+{
+    std::sort(ref.begin(), ref.end());
+    std::sort(res.begin(), res.end());
+
+    ScalarType diff = 0.0;
+    ScalarType mx = 0.0;
+    for(size_t i = 0; i < ref.size(); i++)
+    {
+        diff = std::max(diff, std::abs(res[i] - ref[i]));
+        mx = std::max(mx, res[i]);
+    }
+
+    return diff / mx;
+}
+
+void test_eigen(const std::string& fn, bool is_symm)
+{
+    std::cout << "Reading..." << "\n";
+    std::size_t sz;
+    // read file
+    std::fstream f(fn.c_str(), std::fstream::in);
+    //read size of input matrix
+    read_matrix_size(f, sz);
+    std::cout << "Testing matrix of size " << sz << "-by-" << sz << std::endl;
+
+    viennacl::matrix<ScalarType> A_input(sz, sz), A_ref(sz, sz), Q(sz, sz);
+    ublas::vector<ScalarType> eigen_ref_re = ublas::scalar_vector<ScalarType>(sz, 0);
+    ublas::vector<ScalarType> eigen_ref_im = ublas::scalar_vector<ScalarType>(sz, 0);
+    ublas::vector<ScalarType> eigen_re = ublas::scalar_vector<ScalarType>(sz, 0);
+    ublas::vector<ScalarType> eigen_im = ublas::scalar_vector<ScalarType>(sz, 0);
+
+    read_matrix_body(f, A_input);
+
+    read_vector_body(f, eigen_ref_re);
+
+    if(!is_symm)
+        read_vector_body(f, eigen_ref_im);
+
+    f.close();
+
+    A_ref = A_input;
+
+    std::cout << "Calculation..." << "\n";
+
+    Timer timer;
+    timer.start();
+
+    if(is_symm)
+        viennacl::linalg::qr_method_sym(A_input, Q, eigen_re);
+    else
+        viennacl::linalg::qr_method_nsm(A_input, Q, eigen_re, eigen_im);
+
+    // std::cout << A_input << "\n";
+    viennacl::backend::finish();
+
+    double time_spend = timer.get();
+
+    std::cout << "Verification..." << "\n";
+
+    bool is_hessenberg = check_hessenberg(A_input);
+    bool is_tridiag = check_tridiag(A_input);
+
+    ublas::matrix<ScalarType> A_ref_ublas(sz, sz), A_input_ublas(sz, sz), Q_ublas(sz, sz), result1(sz, sz), result2(sz, sz);
+    viennacl::copy(A_ref, A_ref_ublas);
+    viennacl::copy(A_input, A_input_ublas);
+    viennacl::copy(Q, Q_ublas);
+
+    // compute result1 = ublas::prod(Q_ublas, A_input_ublas);   (terribly slow when using ublas directly)
+    for (std::size_t i=0; i<result1.size1(); ++i)
+      for (std::size_t j=0; j<result1.size2(); ++j)
+      {
+        ScalarType value = 0;
+        for (std::size_t k=0; k<Q_ublas.size2(); ++k)
+          value += Q_ublas(i, k) * A_input_ublas(k, j);
+        result1(i,j) = value;
+      }
+    // compute result2 = ublas::prod(A_ref_ublas, Q_ublas);   (terribly slow when using ublas directly)
+    for (std::size_t i=0; i<result2.size1(); ++i)
+      for (std::size_t j=0; j<result2.size2(); ++j)
+      {
+        ScalarType value = 0;
+        for (std::size_t k=0; k<A_ref_ublas.size2(); ++k)
+          value += A_ref_ublas(i, k) * Q_ublas(k, j);
+        result2(i,j) = value;
+      }
+
+    ScalarType prods_diff = matrix_compare(result1, result2);
+    ScalarType eigen_diff = vector_compare(eigen_ref_re, eigen_re);
+
+    bool is_ok = is_hessenberg;
+
+    if(is_symm)
+        is_ok = is_ok && is_tridiag;
+
+    is_ok = is_ok && (eigen_diff < EPS);
+    is_ok = is_ok && (prods_diff < EPS);
+
+    // std::cout << A_ref << "\n";
+    // std::cout << A_input << "\n";
+    // std::cout << Q << "\n";
+    // std::cout << eigen_re << "\n";
+    // std::cout << eigen_im << "\n";
+    // std::cout << eigen_ref_re << "\n";
+    // std::cout << eigen_ref_im << "\n";
+
+    // std::cout << result1 << "\n";
+    // std::cout << result2 << "\n";
+    // std::cout << eigen_ref << "\n";
+    // std::cout << eigen << "\n";
+
+    printf("%6s [%dx%d] %40s time = %.4f\n", is_ok?"[[OK]]":"[FAIL]", (int)A_ref.size1(), (int)A_ref.size2(), fn.c_str(), time_spend);
+    printf("tridiagonal = %d, hessenberg = %d prod-diff = %f eigen-diff = %f\n", is_tridiag, is_hessenberg, prods_diff, eigen_diff);
+
+    if (!is_ok)
+      exit(EXIT_FAILURE);
+}
+
+int main()
+{
+  // test_eigen("../../examples/testdata/eigen/symm1.example", true);
+  // test_eigen("../../examples/testdata/eigen/symm2.example", true);
+  // test_eigen("../../examples/testdata/eigen/symm3.example", true);
+
+  test_eigen("../../examples/testdata/eigen/nsm1.example", false);
+  test_eigen("../../examples/testdata/eigen/nsm2.example", false);
+  test_eigen("../../examples/testdata/eigen/nsm3.example", false);
+  //test_eigen("../../examples/testdata/eigen/nsm4.example", false); //Note: This test suffers from round-off errors in single precision, hence disabled
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return EXIT_SUCCESS;
+}
diff --git a/tests/src/scalar.cpp b/tests/src/scalar.cpp
index 09f385b..ae3c180 100644
--- a/tests/src/scalar.cpp
+++ b/tests/src/scalar.cpp
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -30,10 +31,11 @@
 // -------------------------------------------------------------
 //
 template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
 {
+   viennacl::backend::finish();
    if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
    return 0;
 }
 //
@@ -46,14 +48,13 @@ int test(Epsilon const& epsilon)
 
    NumericT s1 = NumericT(3.1415926);
    NumericT s2 = NumericT(2.71763);
-   int s3 = 42;
+   NumericT s3 = NumericT(42);
 
    viennacl::scalar<NumericT> vcl_s1;
    viennacl::scalar<NumericT> vcl_s2;
    viennacl::scalar<NumericT> vcl_s3 = 1.0;
-      
-   vcl_s1 = s1;
 
+   vcl_s1 = s1;
    if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
       std::cout << "# Error at operation: vcl_s1 = s1;" << std::endl;
@@ -62,7 +63,7 @@ int test(Epsilon const& epsilon)
    }
 
    vcl_s2 = s2;
-   if( fabs(diff(s2, vcl_s2)) > epsilon )   
+   if( fabs(diff(s2, vcl_s2)) > epsilon )
    {
       std::cout << "# Error at operation: vcl_s2 = s2;" << std::endl;
       std::cout << "  diff: " << fabs(diff(s2, vcl_s2)) << std::endl;
@@ -70,17 +71,27 @@ int test(Epsilon const& epsilon)
    }
 
    vcl_s3 = s3;
-   if( s3 != vcl_s3 ) 
+   if( s3 != vcl_s3 )
    {
       std::cout << "# Error at operation: vcl_s3 = s3;" << std::endl;
       std::cout << "  diff: " << s3 - vcl_s3 << std::endl;
       retval = EXIT_FAILURE;
    }
 
+   NumericT tmp = s2;
+   s2 = s1;
+   s1 = tmp;
+   viennacl::linalg::swap(vcl_s1, vcl_s2);
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: swap " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
 
    s1 += s2;
    vcl_s1 += vcl_s2;
-   if( fabs(diff(s1, vcl_s1)) > epsilon ) 
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
       std::cout << "# Error at operation: += " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
@@ -90,7 +101,7 @@ int test(Epsilon const& epsilon)
    s1 *= s3;
    vcl_s1 *= vcl_s3;
 
-   if( fabs(diff(s1, vcl_s1)) > epsilon )   
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
       std::cout << "# Error at operation: *= " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
@@ -99,7 +110,7 @@ int test(Epsilon const& epsilon)
 
    s1 -= s2;
    vcl_s1 -= vcl_s2;
-   if( fabs(diff(s1, vcl_s1)) > epsilon )   
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
       std::cout << "# Error at operation: -= " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
@@ -109,7 +120,7 @@ int test(Epsilon const& epsilon)
    s1 /= s3;
    vcl_s1 /= vcl_s3;
 
-   if( fabs(diff(s1, vcl_s1)) > epsilon )  
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
       std::cout << "# Error at operation: /= " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
@@ -120,43 +131,263 @@ int test(Epsilon const& epsilon)
 
    s1 = s2 + s3;
    vcl_s1 = vcl_s2 + vcl_s3;
-   if( fabs(diff(s1, vcl_s1)) > epsilon )  
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
-      std::cout << "# Error at operation: + " << std::endl;
+      std::cout << "# Error at operation: s1 = s2 + s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 + s3;
+   vcl_s1 += vcl_s2 + vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 + s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 + s3;
+   vcl_s1 -= vcl_s2 + vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 + s3 " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
       retval = EXIT_FAILURE;
    }
 
    s1 = s2 - s3;
    vcl_s1 = vcl_s2 - vcl_s3;
-   if( fabs(diff(s1, vcl_s1)) > epsilon )  
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 - s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 - s3;
+   vcl_s1 += vcl_s2 - vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 - s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 - s3;
+   vcl_s1 -= vcl_s2 - vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
-      std::cout << "# Error at operation: - " << std::endl;
+      std::cout << "# Error at operation: s1 -= s2 - s3 " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
       retval = EXIT_FAILURE;
    }
 
    s1 = s2 * s3;
    vcl_s1 = vcl_s2 * vcl_s3;
-   if( fabs(diff(s1, vcl_s1)) > epsilon )  
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
-      std::cout << "# Error at operation: * " << std::endl;
+      std::cout << "# Error at operation: s1 = s2 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 * s3;
+   vcl_s1 += vcl_s2 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 * s3;
+   vcl_s1 -= vcl_s2 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 * s3 " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
       retval = EXIT_FAILURE;
    }
 
    s1 = s2 / s3;
    vcl_s1 = vcl_s2 / vcl_s3;
-   if( fabs(diff(s1, vcl_s1)) > epsilon )  
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 / s3;
+   vcl_s1 += vcl_s2 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
-      std::cout << "# Error at operation: / " << std::endl;
+      std::cout << "# Error at operation: s1 += s2 / s3 " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
       retval = EXIT_FAILURE;
    }
 
+   s1 -= s2 / s3;
+   vcl_s1 -= vcl_s2 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // addition with factors, =
+   vcl_s1 = s1;
+
+   s1 = s2 * s2 + s3 * s3;
+   vcl_s1 = vcl_s2 * s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 * vcl_s2 + vcl_s3 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 * s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 * s2 + s3 / s3;
+   vcl_s1 = vcl_s2 * s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 * vcl_s2 + vcl_s3 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 / s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 / s2 + s3 * s3;
+   vcl_s1 = vcl_s2 / s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 / vcl_s2 + vcl_s3 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 * s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 / s2 + s3 / s3;
+   vcl_s1 = vcl_s2 / s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 / vcl_s2 + vcl_s3 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 / s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // addition with factors, +=
+   vcl_s1 = s1;
+
+   s1 += s2 * s2 + s3 * s3;
+   vcl_s1 += vcl_s2 * s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 * s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 * s2 + s3 / s3;
+   vcl_s1 += vcl_s2 * s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 * s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 / s2 + s3 * s3;
+   vcl_s1 += vcl_s2 / s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 / s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 / s2 + s3 / s3;
+   vcl_s1 += vcl_s2 / s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 / s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // addition with factors, -=
+   vcl_s1 = s1;
+
+   s1 -= s2 * s2 + s3 * s3;
+   vcl_s1 -= vcl_s2 * s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 * s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 * s2 + s3 / s3;
+   vcl_s1 -= vcl_s2 * s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 * s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 / s2 + s3 * s3;
+   vcl_s1 -= vcl_s2 / s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 / s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 / s2 + s3 / s3;
+   vcl_s1 -= vcl_s2 / s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 / s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+
+   // lengthy expression:
+
    s1 = s2 + s3 * s2 - s3 / s1;
    vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;
-   if( fabs(diff(s1, vcl_s1)) > epsilon )  
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
    {
       std::cout << "# Error at operation: + * - / " << std::endl;
       std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
@@ -198,37 +429,9 @@ int main()
    std::cout << std::endl;
    std::cout << "----------------------------------------------" << std::endl;
    std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = NumericT(1.0E-6);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      retval = test<NumericT>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = NumericT(1.0E-7);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      retval = test<NumericT>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
    if( viennacl::ocl::current_device().double_support() )
+#endif
    {
       {
          typedef double NumericT;
@@ -243,84 +446,13 @@ int main()
            return retval;
       }
       std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-15;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-            std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-20;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-25;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-30;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-35;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
    }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
    return retval;
 }
 //
diff --git a/tests/src/scalar.cu b/tests/src/scalar.cu
new file mode 100644
index 0000000..ae3c180
--- /dev/null
+++ b/tests/src/scalar.cu
@@ -0,0 +1,461 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <algorithm>
+#include <cmath>
+
+//
+// *** ViennaCL
+//
+#include "viennacl/scalar.hpp"
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+   int retval = EXIT_SUCCESS;
+
+   NumericT s1 = NumericT(3.1415926);
+   NumericT s2 = NumericT(2.71763);
+   NumericT s3 = NumericT(42);
+
+   viennacl::scalar<NumericT> vcl_s1;
+   viennacl::scalar<NumericT> vcl_s2;
+   viennacl::scalar<NumericT> vcl_s3 = 1.0;
+
+   vcl_s1 = s1;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: vcl_s1 = s1;" << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   vcl_s2 = s2;
+   if( fabs(diff(s2, vcl_s2)) > epsilon )
+   {
+      std::cout << "# Error at operation: vcl_s2 = s2;" << std::endl;
+      std::cout << "  diff: " << fabs(diff(s2, vcl_s2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   vcl_s3 = s3;
+   if( s3 != vcl_s3 )
+   {
+      std::cout << "# Error at operation: vcl_s3 = s3;" << std::endl;
+      std::cout << "  diff: " << s3 - vcl_s3 << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   NumericT tmp = s2;
+   s2 = s1;
+   s1 = tmp;
+   viennacl::linalg::swap(vcl_s1, vcl_s2);
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: swap " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2;
+   vcl_s1 += vcl_s2;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: += " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 *= s3;
+   vcl_s1 *= vcl_s3;
+
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: *= " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2;
+   vcl_s1 -= vcl_s2;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: -= " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 /= s3;
+   vcl_s1 /= vcl_s3;
+
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: /= " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = vcl_s1;
+
+   s1 = s2 + s3;
+   vcl_s1 = vcl_s2 + vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 + s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 + s3;
+   vcl_s1 += vcl_s2 + vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 + s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 + s3;
+   vcl_s1 -= vcl_s2 + vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 + s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 - s3;
+   vcl_s1 = vcl_s2 - vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 - s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 - s3;
+   vcl_s1 += vcl_s2 - vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 - s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 - s3;
+   vcl_s1 -= vcl_s2 - vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 - s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 * s3;
+   vcl_s1 = vcl_s2 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 * s3;
+   vcl_s1 += vcl_s2 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 * s3;
+   vcl_s1 -= vcl_s2 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 / s3;
+   vcl_s1 = vcl_s2 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 / s3;
+   vcl_s1 += vcl_s2 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 / s3;
+   vcl_s1 -= vcl_s2 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // addition with factors, =
+   vcl_s1 = s1;
+
+   s1 = s2 * s2 + s3 * s3;
+   vcl_s1 = vcl_s2 * s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 * vcl_s2 + vcl_s3 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 * s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 * s2 + s3 / s3;
+   vcl_s1 = vcl_s2 * s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 * vcl_s2 + vcl_s3 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 * s2 + s3 / s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 / s2 + s3 * s3;
+   vcl_s1 = vcl_s2 / s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 / vcl_s2 + vcl_s3 * vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 * s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 = s2 / s2 + s3 / s3;
+   vcl_s1 = vcl_s2 / s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+   vcl_s1 = vcl_s2 / vcl_s2 + vcl_s3 / vcl_s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 = s2 / s2 + s3 / s3, second test " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // addition with factors, +=
+   vcl_s1 = s1;
+
+   s1 += s2 * s2 + s3 * s3;
+   vcl_s1 += vcl_s2 * s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 * s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 * s2 + s3 / s3;
+   vcl_s1 += vcl_s2 * s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 * s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 / s2 + s3 * s3;
+   vcl_s1 += vcl_s2 / s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 / s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 += s2 / s2 + s3 / s3;
+   vcl_s1 += vcl_s2 / s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 += s2 / s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // addition with factors, -=
+   vcl_s1 = s1;
+
+   s1 -= s2 * s2 + s3 * s3;
+   vcl_s1 -= vcl_s2 * s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 * s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 * s2 + s3 / s3;
+   vcl_s1 -= vcl_s2 * s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 * s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 / s2 + s3 * s3;
+   vcl_s1 -= vcl_s2 / s2 + vcl_s3 * s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 / s2 + s3 * s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   s1 -= s2 / s2 + s3 / s3;
+   vcl_s1 -= vcl_s2 / s2 + vcl_s3 / s3;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: s1 -= s2 / s2 + s3 / s3 " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+
+   // lengthy expression:
+
+   s1 = s2 + s3 * s2 - s3 / s1;
+   vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;
+   if( fabs(diff(s1, vcl_s1)) > epsilon )
+   {
+      std::cout << "# Error at operation: + * - / " << std::endl;
+      std::cout << "  diff: " << fabs(diff(s1, vcl_s1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Scalar" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-5);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-10;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         retval = test<NumericT>(epsilon);
+         if( retval == EXIT_SUCCESS )
+           std::cout << "# Test passed" << std::endl;
+         else
+           return retval;
+      }
+      std::cout << std::endl;
+   }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+
diff --git a/tests/src/scheduler_matrix.cpp b/tests/src/scheduler_matrix.cpp
new file mode 100644
index 0000000..51addfd
--- /dev/null
+++ b/tests/src/scheduler_matrix.cpp
@@ -0,0 +1,920 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#define VIENNACL_WITH_UBLAS
+//#define NDEBUG
+//#define VIENNACL_BUILD_INFO
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <algorithm>
+#include <stdio.h>
+#include <time.h>
+//#include "../benchmarks/benchmark-utils.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+/*#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"*/
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/io.hpp"
+
+#include "viennacl/scheduler/execute.hpp"
+
+using namespace boost::numeric;
+
+/** @brief Compares a device matrix entrywise against a host (uBLAS) reference.
+ *
+ * Entries that are bitwise identical are accepted immediately; otherwise the
+ * relative deviation must stay within @p epsilon and the device value must not
+ * be NaN. On the first violating entry an error message is printed and false
+ * is returned; on success "PASSED!" is printed and true is returned.
+ */
+template <typename MatrixType, typename VCLMatrixType>
+bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A, double epsilon)
+{
+  typedef typename MatrixType::value_type   value_type;
+
+  // Bring the device data back to the host for the comparison.
+  boost::numeric::ublas::matrix<value_type> host_copy(vcl_A.size1(), vcl_A.size2());
+  viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+  viennacl::copy(vcl_A, host_copy);
+
+  for (std::size_t row=0; row<ublas_A.size1(); ++row)
+  {
+    for (std::size_t col=0; col<ublas_A.size2(); ++col)
+    {
+      value_type ref_entry = ublas_A(row,col);
+      value_type dev_entry = host_copy(row,col);
+      if (ref_entry == dev_entry)
+        continue;   // exact match: no tolerance check needed
+
+      // dev_entry != dev_entry detects NaN results from the device.
+      bool entry_is_nan = (dev_entry != dev_entry);
+      if ( (std::abs(ref_entry - dev_entry) / std::max(std::fabs(ref_entry), std::fabs(dev_entry)) > epsilon) || entry_is_nan )
+      {
+        std::cout << "Error at index (" << row << ", " << col << "): " << ref_entry << " vs " << dev_entry << std::endl;
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return false;
+      }
+    }
+  }
+
+  std::cout << "PASSED!" << std::endl;
+  return true;
+}
+
+
+
+
+/** @brief Exercises viennacl::scheduler statements for dense matrix operations
+ *         and validates every result against the Boost.uBLAS reference.
+ *
+ * @param epsilon  relative tolerance for host/device comparison
+ * @param ublas_A  host reference operand A (overwritten by the tests)
+ * @param ublas_B  host reference operand B (overwritten by the tests)
+ * @param ublas_C  host reference result C (overwritten by the tests)
+ * @param vcl_A    device operand A (matrix, range or slice)
+ * @param vcl_B    device operand B (matrix, range or slice)
+ * @param vcl_C    device result C (matrix, range or slice)
+ * @return EXIT_SUCCESS if all checks pass, EXIT_FAILURE on the first mismatch
+ *
+ * NOTE(review): vcl_C is taken by value while vcl_A/vcl_B are references. For
+ * range/slice proxies the copy aliases the same device buffer, so the checks
+ * still see the scheduler's writes - confirm the asymmetry is intentional.
+ */
+template <typename UBLASMatrixType,
+          typename ViennaCLMatrixType1, typename ViennaCLMatrixType2, typename ViennaCLMatrixType3>
+int run_test(double epsilon,
+             UBLASMatrixType & ublas_A, UBLASMatrixType & ublas_B, UBLASMatrixType & ublas_C,
+             ViennaCLMatrixType1 & vcl_A, ViennaCLMatrixType2 & vcl_B, ViennaCLMatrixType3 vcl_C)
+{
+
+  typedef typename viennacl::result_of::cpu_value_type<typename ViennaCLMatrixType1::value_type>::type  cpu_value_type;
+
+  // Host- and device-side scaling factors used in the composite expressions below:
+  cpu_value_type alpha = cpu_value_type(3.1415);
+  viennacl::scalar<cpu_value_type>   gpu_alpha = alpha;
+
+  cpu_value_type beta = cpu_value_type(2.7182);
+  viennacl::scalar<cpu_value_type>   gpu_beta = beta;
+
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_matrix initializer..." << std::endl;
+  ublas_A = ublas::zero_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2());
+  vcl_A = viennacl::zero_matrix<cpu_value_type>(vcl_A.size1(), vcl_A.size2());
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_matrix initializer..." << std::endl;
+  ublas_A = ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), alpha);
+  vcl_A = viennacl::scalar_matrix<cpu_value_type>(vcl_A.size1(), vcl_A.size2(), alpha);
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  // Same initializer, but with the fill value coming from a device scalar:
+  ublas_A =    ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), gpu_beta);
+  vcl_A   = viennacl::scalar_matrix<cpu_value_type>(  vcl_A.size1(),   vcl_A.size2(), gpu_beta);
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  /*std::cout << "Checking for identity initializer..." << std::endl;
+  ublas_A = ublas::identity_matrix<cpu_value_type>(ublas_A.size1());
+  vcl_A = viennacl::identity_matrix<cpu_value_type>(vcl_A.size1());
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;*/
+
+
+  std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test: Assignments //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  if (!check_for_equality(ublas_B, vcl_B, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing matrix assignment... ";
+  //std::cout << ublas_B(0,0) << " vs. " << vcl_B(0,0) << std::endl;
+  ublas_A = ublas_B;
+  vcl_A = vcl_B;
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  ublas_A = ublas_B;
+  viennacl::copy(ublas_B, vcl_A);
+  std::cout << "Testing upper left copy to GPU... ";
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+
+  ublas_C = ublas_B;
+  viennacl::copy(ublas_B, vcl_C);
+  std::cout << "Testing lower right copy to GPU... ";
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+  //std::cout << std::endl;
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+
+  std::cout << "Testing upper left copy to A... ";
+  if (!check_for_equality(ublas_A, vcl_A, epsilon))
+    return EXIT_FAILURE;
+
+  std::cout << "Testing lower right copy to C... ";
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+
+
+
+  //std::cout << "//" << std::endl;
+  //std::cout << "////////// Test 3: Addition //////////" << std::endl;
+  //std::cout << "//" << std::endl;
+  viennacl::copy(ublas_C, vcl_C);
+
+  std::cout << "Assignment: ";
+  {
+  ublas_C = ublas_B;
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), vcl_B); // same as vcl_C = vcl_B;
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "Inplace add: ";
+  {
+  ublas_C += ublas_C;
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_inplace_add(), vcl_C); // same as vcl_C += vcl_C;
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Inplace sub: ";
+  {
+  ublas_C -= ublas_C;
+  viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_sub(), vcl_C); // same as vcl_C -= vcl_C;
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "Add: ";
+  {
+  ublas_C = ublas_A + ublas_B;
+  viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), vcl_A + vcl_B); // same as vcl_C = vcl_A + vcl_B;
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Sub: ";
+  {
+  ublas_C = ublas_A - ublas_B;
+  viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), vcl_A - vcl_B); // same as vcl_C = vcl_A - vcl_B;
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Composite assignments: ";
+  {
+  ublas_C += alpha * ublas_A - beta * ublas_B + ublas_A / beta - ublas_B / alpha;
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_inplace_add(), alpha * vcl_A - beta * vcl_B + vcl_A / beta - vcl_B / alpha); // same as vcl_C += alpha * vcl_A - beta * vcl_B + vcl_A / beta - vcl_B / alpha;
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "--- Testing elementwise operations (binary) ---" << std::endl;
+  // check_for_equality() returns bool, so the result is tested directly.
+  // (Previously these checks compared "!check..." against EXIT_SUCCESS, which
+  // only worked because EXIT_SUCCESS happens to be 0.)
+  std::cout << "x = element_prod(x, y)... ";
+  {
+  ublas_C = element_prod(ublas_A, ublas_B);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_A, vcl_B));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_prod(x + y, y)... ";
+  {
+  ublas_C = element_prod(ublas_A + ublas_B, ublas_B);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_A + vcl_B, vcl_B));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_prod(x, x + y)... ";
+  {
+  ublas_C = element_prod(ublas_A, ublas_A + ublas_B);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_A, vcl_B + vcl_A));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_prod(x - y, y + x)... ";
+  {
+  ublas_C = element_prod(ublas_A - ublas_B, ublas_B + ublas_A);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_A - vcl_B, vcl_B + vcl_A));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+
+
+  std::cout << "x = element_div(x, y)... ";
+  {
+  ublas_C = element_div(ublas_A, ublas_B);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_div(vcl_A, vcl_B));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_div(x + y, y)... ";
+  {
+  ublas_C = element_div(ublas_A + ublas_B, ublas_B);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_div(vcl_A + vcl_B, vcl_B));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_div(x, x + y)... ";
+  {
+  ublas_C = element_div(ublas_A, ublas_A + ublas_B);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_div(vcl_A, vcl_B + vcl_A));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_div(x - y, y + x)... ";
+  {
+  ublas_C = element_div(ublas_A - ublas_B, ublas_B + ublas_A);
+  viennacl::scheduler::statement   my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_div(vcl_A - vcl_B, vcl_B + vcl_A));
+  viennacl::scheduler::execute(my_statement);
+
+  if (!check_for_equality(ublas_C, vcl_C, epsilon))
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "--- Testing elementwise operations (unary) ---" << std::endl;
+  // For each unary function: apply it entrywise on the host reference and via
+  // a scheduler statement on the device, once on A directly and once on A/2.
+#define GENERATE_UNARY_OP_TEST(OPNAME) \
+  ublas_A = ublas::scalar_matrix<cpu_value_type>(ublas_A.size1(), ublas_A.size2(), cpu_value_type(0.21)); \
+  ublas_B = cpu_value_type(3.1415) * ublas_A; \
+  viennacl::copy(ublas_A, vcl_A); \
+  viennacl::copy(ublas_B, vcl_B); \
+  { \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) = OPNAME(ublas_A(i,j)); \
+  viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_##OPNAME(vcl_A)); \
+  viennacl::scheduler::execute(my_statement); \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+    return EXIT_FAILURE; \
+  } \
+  { \
+  for (std::size_t i=0; i<ublas_C.size1(); ++i) \
+    for (std::size_t j=0; j<ublas_C.size2(); ++j) \
+      ublas_C(i,j) = OPNAME(ublas_A(i,j) / cpu_value_type(2)); \
+  viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::element_##OPNAME(vcl_A / cpu_value_type(2))); \
+  viennacl::scheduler::execute(my_statement); \
+  if (!check_for_equality(ublas_C, vcl_C, epsilon)) \
+    return EXIT_FAILURE; \
+  }
+
+  GENERATE_UNARY_OP_TEST(cos);
+  GENERATE_UNARY_OP_TEST(cosh);
+  GENERATE_UNARY_OP_TEST(exp);
+  GENERATE_UNARY_OP_TEST(floor);
+  GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(log);
+  GENERATE_UNARY_OP_TEST(log10);
+  GENERATE_UNARY_OP_TEST(sin);
+  GENERATE_UNARY_OP_TEST(sinh);
+  GENERATE_UNARY_OP_TEST(fabs);  // NOTE(review): fabs is tested a second time here - confirm this duplicate is intended
+  //GENERATE_UNARY_OP_TEST(abs); //OpenCL allows abs on integers only
+  GENERATE_UNARY_OP_TEST(sqrt);
+  GENERATE_UNARY_OP_TEST(tan);
+  GENERATE_UNARY_OP_TEST(tanh);
+
+#undef GENERATE_UNARY_OP_TEST
+
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
+
+
+
+/** @brief Sets up host/device matrices (plus range and slice proxies into
+ *         larger device matrices) and runs the operation tests for every
+ *         combination of {matrix, range, slice} for operands A, B and C.
+ *
+ * @tparam T           matrix layout tag (viennacl::row_major or column_major)
+ * @tparam ScalarType  numeric type (float or double)
+ * @param epsilon      relative tolerance forwarded to the checks
+ * @return EXIT_SUCCESS if every combination passes, EXIT_FAILURE otherwise
+ */
+template <typename T, typename ScalarType>
+int run_test(double epsilon)
+{
+    //typedef float               ScalarType;
+    typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
+
+    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
+
+    // Deliberately non-round, non-equal dimensions to catch indexing bugs:
+    std::size_t dim_rows = 131;
+    std::size_t dim_cols = 33;
+    //std::size_t dim_rows = 5;
+    //std::size_t dim_cols = 3;
+
+    //setup ublas objects:
+    MatrixType ublas_A(dim_rows, dim_cols);
+    MatrixType ublas_B(dim_rows, dim_cols);
+    MatrixType ublas_C(dim_rows, dim_cols);
+
+    for (std::size_t i=0; i<ublas_A.size1(); ++i)
+      for (std::size_t j=0; j<ublas_A.size2(); ++j)
+      {
+        ublas_A(i,j) = ScalarType((i+2) + (j+1)*(i+2));
+        ublas_B(i,j) = ScalarType((j+2) + (j+1)*(j+2));
+        ublas_C(i,j) = ScalarType((i+1) + (i+1)*(i+2));
+      }
+
+    MatrixType ublas_A_large(4 * dim_rows, 4 * dim_cols);
+    for (std::size_t i=0; i<ublas_A_large.size1(); ++i)
+      for (std::size_t j=0; j<ublas_A_large.size2(); ++j)
+        ublas_A_large(i,j) = ScalarType(i * ublas_A_large.size2() + j);
+
+    //Setup ViennaCL objects: 4x-sized backing matrices for range/slice proxies
+    VCLMatrixType vcl_A_full(4 * dim_rows, 4 * dim_cols);
+    VCLMatrixType vcl_B_full(4 * dim_rows, 4 * dim_cols);
+    VCLMatrixType vcl_C_full(4 * dim_rows, 4 * dim_cols);
+
+    viennacl::copy(ublas_A_large, vcl_A_full);
+    viennacl::copy(ublas_A_large, vcl_B_full);
+    viennacl::copy(ublas_A_large, vcl_C_full);
+
+    //
+    // Create A
+    //
+    VCLMatrixType vcl_A(dim_rows, dim_cols);
+
+    viennacl::range vcl_A_r1(2 * dim_rows, 3 * dim_rows);
+    viennacl::range vcl_A_r2(dim_cols, 2 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_A(vcl_A_full, vcl_A_r1, vcl_A_r2);
+
+    viennacl::slice vcl_A_s1(2, 3, dim_rows);
+    viennacl::slice vcl_A_s2(2 * dim_cols, 2, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_A(vcl_A_full, vcl_A_s1, vcl_A_s2);
+
+
+    //
+    // Create B
+    //
+    VCLMatrixType vcl_B(dim_rows, dim_cols);
+
+    viennacl::range vcl_B_r1(dim_rows, 2 * dim_rows);
+    viennacl::range vcl_B_r2(2 * dim_cols, 3 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_B(vcl_B_full, vcl_B_r1, vcl_B_r2);
+
+    viennacl::slice vcl_B_s1(2 * dim_rows, 2, dim_rows);
+    viennacl::slice vcl_B_s2(dim_cols, 3, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_B(vcl_B_full, vcl_B_s1, vcl_B_s2);
+
+
+    //
+    // Create C
+    //
+    VCLMatrixType vcl_C(dim_rows, dim_cols);
+
+    viennacl::range vcl_C_r1(2 * dim_rows, 3 * dim_rows);
+    viennacl::range vcl_C_r2(3 * dim_cols, 4 * dim_cols);
+    viennacl::matrix_range<VCLMatrixType>   vcl_range_C(vcl_C_full, vcl_C_r1, vcl_C_r2);
+
+    viennacl::slice vcl_C_s1(dim_rows, 2, dim_rows);
+    viennacl::slice vcl_C_s2(0, 3, dim_cols);
+    viennacl::matrix_slice<VCLMatrixType>   vcl_slice_C(vcl_C_full, vcl_C_s1, vcl_C_s2);
+
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_A, vcl_slice_A);
+
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_B, vcl_slice_B);
+
+    viennacl::copy(ublas_C, vcl_C);
+    viennacl::copy(ublas_C, vcl_range_C);
+    viennacl::copy(ublas_C, vcl_slice_C);
+
+
+    std::cout << std::endl;
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Copy CTOR //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    {
+      std::cout << "Testing matrix created from range... ";
+      VCLMatrixType vcl_temp = vcl_range_A;
+      if (check_for_equality(ublas_A, vcl_temp, epsilon))
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << "ublas_A: " << ublas_A << std::endl;
+        std::cout << "vcl_temp: " << vcl_temp << std::endl;
+        std::cout << "vcl_range_A: " << vcl_range_A << std::endl;
+        std::cout << "vcl_A: " << vcl_A << std::endl;
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      std::cout << "Testing matrix created from slice... ";
+      // Bugfix: previously constructed from vcl_range_B, so the slice
+      // copy-CTOR was never exercised despite the message above.
+      VCLMatrixType vcl_temp2 = vcl_slice_B;
+      if (check_for_equality(ublas_B, vcl_temp2, epsilon))
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test: Initializer for matrix type //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
+    {
+      ublas::matrix<ScalarType> ublas_dummy1 = ublas::identity_matrix<ScalarType>(ublas_A.size1());
+      ublas::matrix<ScalarType> ublas_dummy2 = ublas::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+      ublas::matrix<ScalarType> ublas_dummy3 = ublas::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+
+      viennacl::matrix<ScalarType> vcl_dummy1 = viennacl::identity_matrix<ScalarType>(ublas_A.size1());
+      viennacl::matrix<ScalarType> vcl_dummy2 = viennacl::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+      viennacl::matrix<ScalarType> vcl_dummy3 = viennacl::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+
+      std::cout << "Testing initializer CTOR... ";
+      if (   check_for_equality(ublas_dummy1, vcl_dummy1, epsilon)
+          && check_for_equality(ublas_dummy2, vcl_dummy2, epsilon)
+          && check_for_equality(ublas_dummy3, vcl_dummy3, epsilon)
+         )
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      // Rotate the initializer kinds to also cover assignment (not just CTOR):
+      ublas_dummy1 = ublas::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+      ublas_dummy2 = ublas::identity_matrix<ScalarType>(ublas_A.size1());
+      ublas_dummy3 = ublas::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+
+      vcl_dummy1 = viennacl::zero_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1());
+      vcl_dummy2 = viennacl::identity_matrix<ScalarType>(ublas_A.size1());
+      vcl_dummy3 = viennacl::scalar_matrix<ScalarType>(ublas_A.size1(), ublas_A.size1(), 3.0);
+
+      std::cout << "Testing initializer assignment... ";
+      if (   check_for_equality(ublas_dummy1, vcl_dummy1, epsilon)
+          && check_for_equality(ublas_dummy2, vcl_dummy2, epsilon)
+          && check_for_equality(ublas_dummy3, vcl_dummy3, epsilon)
+         )
+        std::cout << "PASSED!" << std::endl;
+      else
+      {
+        std::cout << std::endl << "TEST failed!" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+
+    //
+    // run operation tests for all 3x3x3 combinations of operand kinds:
+    //
+
+    /////// A=matrix:
+    std::cout << "Testing A=matrix, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    std::cout << "Testing A=matrix, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=matrix, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    /////// A=range:
+    std::cout << "Testing A=range, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=range, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=range, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=range, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_range_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_range_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    /////// A=slice:
+    std::cout << "Testing A=slice, B=matrix, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=matrix, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=matrix, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=slice, B=range, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=range, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=range, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_range_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_range_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+
+    std::cout << "Testing A=slice, B=slice, C=matrix ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=slice, C=range ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_range_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_range_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Testing A=slice, B=slice, C=slice ..." << std::endl;
+    viennacl::copy(ublas_A, vcl_slice_A);
+    viennacl::copy(ublas_B, vcl_slice_B);
+    viennacl::copy(ublas_C, vcl_slice_C);
+    if (run_test(epsilon,
+                 ublas_A, ublas_B, ublas_C,
+                 vcl_slice_A, vcl_slice_B, vcl_slice_C) != EXIT_SUCCESS)
+    {
+      return EXIT_FAILURE;
+    }
+
+
+    return EXIT_SUCCESS;
+}
+
+/** @brief Entry point: runs the scheduler matrix tests in float for both
+ *         row- and column-major layouts, and repeats in double precision
+ *         when the OpenCL device supports it (always for other backends). */
+int main (int, const char **)
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  // Bugfix: banner previously said "Matrix Range" (copy/paste from the
+  // matrix_range test); this file tests scheduler-driven matrix operations.
+  std::cout << "## Test :: Matrix (scheduler)" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  double epsilon = 1e-4;
+  std::cout << "# Testing setup:" << std::endl;
+  std::cout << "  eps:     " << epsilon << std::endl;
+  std::cout << "  numeric: float" << std::endl;
+  std::cout << " --- row-major ---" << std::endl;
+  if (run_test<viennacl::row_major, float>(epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  std::cout << " --- column-major ---" << std::endl;
+  if (run_test<viennacl::column_major, float>(epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    // Tighter tolerance for double precision:
+    double epsilon = 1e-12;
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: double" << std::endl;
+
+    if (run_test<viennacl::row_major, double>(epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (run_test<viennacl::column_major, double>(epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+  return EXIT_SUCCESS;
+}
+
diff --git a/tests/src/scheduler_matrix_matrix.cpp b/tests/src/scheduler_matrix_matrix.cpp
new file mode 100644
index 0000000..3086e9d
--- /dev/null
+++ b/tests/src/scheduler_matrix_matrix.cpp
@@ -0,0 +1,954 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//#define NDEBUG
+//#define VIENNACL_DEBUG_BUILD
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+//#define VIENNACL_DEBUG_BUILD
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "examples/tutorial/Random.hpp"
+
+#include "viennacl/scheduler/execute.hpp"
+#include "viennacl/scheduler/io.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType>
+ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (std::size_t i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> & mat1, VCLMatrixType & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+
+
+
+
+
+
+//
+// Part 1: Matrix-matrix multiplications
+//
+
+
+template< typename NumericT, typename Epsilon,
+          typename ReferenceMatrixTypeA, typename ReferenceMatrixTypeB, typename ReferenceMatrixTypeC,
+          typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+int test_prod(Epsilon const& epsilon,
+
+              ReferenceMatrixTypeA const & A, ReferenceMatrixTypeA const & A_trans,
+              ReferenceMatrixTypeB const & B, ReferenceMatrixTypeB const & B_trans,
+              ReferenceMatrixTypeC & C,
+
+              MatrixTypeA const & vcl_A, MatrixTypeA const & vcl_A_trans,
+              MatrixTypeB const & vcl_B, MatrixTypeB const & vcl_B_trans,
+              MatrixTypeC & vcl_C
+             )
+{
+   int retval = EXIT_SUCCESS;
+   NumericT act_diff = 0;
+
+
+   // Test: C +-= A * B --------------------------------------------------------------------------
+   C = viennacl::linalg::prod(A, B);
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::prod(vcl_A, vcl_B));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = A * B passed!" << std::endl;
+
+
+   C += viennacl::linalg::prod(A, B);
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_add(), viennacl::linalg::prod(vcl_A, vcl_B));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += A * B passed!" << std::endl;
+
+   C -= viennacl::linalg::prod(A, B);
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_sub(), viennacl::linalg::prod(vcl_A, vcl_B));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= A * B passed!" << std::endl;
+
+
+
+
+
+   // Test: C +-= A * trans(B) --------------------------------------------------------------------------
+   C     = boost::numeric::ublas::prod(A, trans(B_trans));
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::prod(vcl_A, trans(vcl_B_trans)));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = A * trans(B) passed!" << std::endl;
+
+
+   C     += boost::numeric::ublas::prod(A, trans(B_trans));
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_add(), viennacl::linalg::prod(vcl_A, trans(vcl_B_trans)));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += A * trans(B) passed!" << std::endl;
+
+
+   C     -= boost::numeric::ublas::prod(A, trans(B_trans));
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_sub(), viennacl::linalg::prod(vcl_A, trans(vcl_B_trans)));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= A * trans(B) passed!" << std::endl;
+
+
+
+   // Test: C +-= trans(A) * B --------------------------------------------------------------------------
+   C     = boost::numeric::ublas::prod(trans(A_trans), B);
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::prod(trans(vcl_A_trans), vcl_B));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = trans(A) * B passed!" << std::endl;
+
+
+   C     += boost::numeric::ublas::prod(trans(A_trans), B);
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_add(), viennacl::linalg::prod(trans(vcl_A_trans), vcl_B));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += trans(A) * B passed!" << std::endl;
+
+
+   C     -= boost::numeric::ublas::prod(trans(A_trans), B);
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_sub(), viennacl::linalg::prod(trans(vcl_A_trans), vcl_B));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= trans(A) * B passed!" << std::endl;
+
+
+
+
+
+   // Test: C +-= trans(A) * trans(B) --------------------------------------------------------------------------
+   C     = boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_assign(), viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans)));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C = trans(A) * trans(B) passed!" << std::endl;
+
+   C     += boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_add(), viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans)));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C += trans(A) * trans(B) passed!" << std::endl;
+
+
+   C     -= boost::numeric::ublas::prod(trans(A_trans), trans(B_trans));
+   {
+   viennacl::scheduler::statement my_statement(vcl_C, viennacl::op_inplace_sub(), viennacl::linalg::prod(trans(vcl_A_trans), trans(vcl_B_trans)));
+   viennacl::scheduler::execute(my_statement);
+   }
+   act_diff = std::fabs(diff(C, vcl_C));
+
+   if( act_diff > epsilon )
+   {
+     std::cout << "# Error at operation: matrix-matrix product" << std::endl;
+     std::cout << "  diff: " << act_diff << std::endl;
+     retval = EXIT_FAILURE;
+   }
+   else
+     std::cout << "Test C -= trans(A) * trans(B) passed!" << std::endl;
+
+
+
+
+   return retval;
+}
+
+
+
+template< typename NumericT, typename F_A, typename F_B, typename F_C, typename Epsilon >
+int test_prod(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::size_t matrix_size1 = 29;  //some odd number, not too large
+  std::size_t matrix_size2 = 47;  //some odd number, not too large
+  std::size_t matrix_size3 = 33;  //some odd number, not too large
+  //std::size_t matrix_size1 = 128;  //some odd number, not too large
+  //std::size_t matrix_size2 = 64;  //some odd number, not too large
+  //std::size_t matrix_size3 = 128;  //some odd number, not too large
+  //std::size_t matrix_size1 = 256;  // for testing AMD kernels
+  //std::size_t matrix_size2 = 256;  // for testing AMD kernels
+  //std::size_t matrix_size3 = 256;  // for testing AMD kernels
+
+  // --------------------------------------------------------------------------
+
+  // ublas reference:
+  ublas::matrix<NumericT> A(matrix_size1, matrix_size2);
+  ublas::matrix<NumericT> big_A = ublas::scalar_matrix<NumericT>(4*matrix_size1, 4*matrix_size2, NumericT(3.1415));
+
+  ublas::matrix<NumericT> B(matrix_size2, matrix_size3);
+  ublas::matrix<NumericT> big_B = ublas::scalar_matrix<NumericT>(4*matrix_size2, 4*matrix_size3, NumericT(42.0));
+
+  ublas::matrix<NumericT> C(matrix_size1, matrix_size3);
+
+  //fill A and B:
+  for (unsigned int i = 0; i < A.size1(); ++i)
+    for (unsigned int j = 0; j < A.size2(); ++j)
+        A(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+  for (unsigned int i = 0; i < B.size1(); ++i)
+    for (unsigned int j = 0; j < B.size2(); ++j)
+        B(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+
+  ublas::matrix<NumericT>     A_trans = trans(A);
+  ublas::matrix<NumericT> big_A_trans = trans(big_A);
+
+  ublas::matrix<NumericT>     B_trans = trans(B);
+  ublas::matrix<NumericT> big_B_trans = trans(big_B);
+
+  //
+  // ViennaCL objects
+  //
+
+  // A
+  viennacl::range range1_A(matrix_size1, 2*matrix_size1);
+  viennacl::range range2_A(matrix_size2, 2*matrix_size2);
+  viennacl::slice slice1_A(matrix_size1, 2, matrix_size1);
+  viennacl::slice slice2_A(matrix_size2, 3, matrix_size2);
+
+  viennacl::matrix<NumericT, F_A>    vcl_A(matrix_size1, matrix_size2);
+  viennacl::copy(A, vcl_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A(4*matrix_size1, 4*matrix_size2);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A(vcl_big_range_A, range1_A, range2_A);
+  viennacl::copy(A, vcl_range_A);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A(4*matrix_size1, 4*matrix_size2);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A(vcl_big_slice_A, slice1_A, slice2_A);
+  viennacl::copy(A, vcl_slice_A);
+
+
+  // A^T
+  viennacl::matrix<NumericT, F_A>    vcl_A_trans(matrix_size2, matrix_size1);
+  viennacl::copy(A_trans, vcl_A_trans);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_range_A_trans(4*matrix_size2, 4*matrix_size1);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_A> > vcl_range_A_trans(vcl_big_range_A_trans, range2_A, range1_A);
+  viennacl::copy(A_trans, vcl_range_A_trans);
+
+  viennacl::matrix<NumericT, F_A>    vcl_big_slice_A_trans(4*matrix_size2, 4*matrix_size1);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_A> > vcl_slice_A_trans(vcl_big_slice_A_trans, slice2_A, slice1_A);
+  viennacl::copy(A_trans, vcl_slice_A_trans);
+
+
+
+  // B
+  viennacl::range range1_B(2*matrix_size2, 3*matrix_size2);
+  viennacl::range range2_B(2*matrix_size3, 3*matrix_size3);
+  viennacl::slice slice1_B(matrix_size2, 3, matrix_size2);
+  viennacl::slice slice2_B(matrix_size3, 2, matrix_size3);
+
+  viennacl::matrix<NumericT, F_B>    vcl_B(matrix_size2, matrix_size3);
+  viennacl::copy(B, vcl_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B(4*matrix_size2, 4*matrix_size3);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B(vcl_big_range_B, range1_B, range2_B);
+  viennacl::copy(B, vcl_range_B);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B(4*matrix_size2, 4*matrix_size3);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B(vcl_big_slice_B, slice1_B, slice2_B);
+  viennacl::copy(B, vcl_slice_B);
+
+
+  // B^T
+
+  viennacl::matrix<NumericT, F_B>    vcl_B_trans(matrix_size3, matrix_size2);
+  viennacl::copy(B_trans, vcl_B_trans);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_range_B_trans(4*matrix_size3, 4*matrix_size2);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_B> > vcl_range_B_trans(vcl_big_range_B_trans, range2_B, range1_B);
+  viennacl::copy(B_trans, vcl_range_B_trans);
+
+  viennacl::matrix<NumericT, F_B>    vcl_big_slice_B_trans(4*matrix_size3, 4*matrix_size2);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_B> > vcl_slice_B_trans(vcl_big_slice_B_trans, slice2_B, slice1_B);
+  viennacl::copy(B_trans, vcl_slice_B_trans);
+
+
+  // C
+
+  viennacl::range range1_C(matrix_size1-1, 2*matrix_size1-1);
+  viennacl::range range2_C(matrix_size3-1, 2*matrix_size3-1);
+  viennacl::slice slice1_C(matrix_size1-1, 3, matrix_size1);
+  viennacl::slice slice2_C(matrix_size3-1, 3, matrix_size3);
+
+  viennacl::matrix<NumericT, F_C>    vcl_C(matrix_size1, matrix_size3);
+
+  viennacl::matrix<NumericT, F_C>    vcl_big_range_C(4*matrix_size1, 4*matrix_size3);
+  viennacl::matrix_range<viennacl::matrix<NumericT, F_C> > vcl_range_C(vcl_big_range_C, range1_C, range2_C);
+
+  viennacl::matrix<NumericT, F_C>    vcl_big_slice_C(4*matrix_size1, 4*matrix_size3);
+  viennacl::matrix_slice<viennacl::matrix<NumericT, F_C> > vcl_slice_C(vcl_big_slice_C, slice1_C, slice2_C);
+
+
+  std::cout << "--- Part 1: Testing matrix-matrix products ---" << std::endl;
+
+  //////
+  //////  A: matrix
+  //////
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=matrix, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=range, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=range, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=range, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=slice, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=slice, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=matrix, B=slice, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_A, vcl_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //////
+  //////  A: range
+  //////
+
+  //
+  //
+  std::cout << "Now using A=range, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=matrix, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=range, B=matrix, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=range, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=range, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=range, B=range, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=slice, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=range, B=slice, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=range, B=slice, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_range_A, vcl_range_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //////
+  //////  A: slice
+  //////
+
+  //
+  //
+  std::cout << "Now using A=slice, B=matrix, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=matrix, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=slice, B=matrix, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_B, vcl_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=range, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=range, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=slice, B=range, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_range_B, vcl_range_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=slice, C=matrix" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  //
+  //
+  std::cout << "Now using A=slice, B=slice, C=range" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_range_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  //
+  //
+  std::cout << "Now using A=slice, B=slice, C=slice" << std::endl;
+  ret = test_prod<NumericT>(epsilon,
+                            A, A_trans, B, B_trans, C,
+                            vcl_slice_A, vcl_slice_A_trans,
+                            vcl_slice_B, vcl_slice_B_trans,
+                            vcl_slice_C);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+  return ret;
+
+}
+
+
+//
+// Control functions
+//
+
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=row, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=row, B=col, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::row_major, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::row_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=row, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::row_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col, C=row ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::column_major, viennacl::row_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+  std::cout << "///////////////////////////////////////" << std::endl;
+  std::cout << "/// Now testing A=col, B=col, C=col ///" << std::endl;
+  std::cout << "///////////////////////////////////////" << std::endl;
+  ret = test_prod<NumericT, viennacl::column_major, viennacl::column_major, viennacl::column_major>(epsilon);
+  if (ret != EXIT_SUCCESS)
+    return ret;
+
+
+
+  return ret;
+}
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: BLAS 3 routines" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+        typedef double NumericT;
+        NumericT epsilon = 1.0E-11;
+        std::cout << "# Testing setup:" << std::endl;
+        std::cout << "  eps:     " << epsilon << std::endl;
+        std::cout << "  numeric: double" << std::endl;
+        retval = test<NumericT>(epsilon);
+        if( retval == EXIT_SUCCESS )
+          std::cout << "# Test passed" << std::endl;
+        else
+          return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/scheduler_matrix_vector.cpp b/tests/src/scheduler_matrix_vector.cpp
new file mode 100644
index 0000000..ee8638a
--- /dev/null
+++ b/tests/src/scheduler_matrix_vector.cpp
@@ -0,0 +1,945 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+#include "examples/tutorial/Random.hpp"
+
+#include "viennacl/scheduler/execute.hpp"
+#include "viennacl/scheduler/io.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+template <typename ScalarType, typename VCLMatrixType>
+ScalarType diff(ublas::matrix<ScalarType> const & mat1, VCLMatrixType const & mat2)
+{
+   ublas::matrix<ScalarType> mat2_cpu(mat2.size1(), mat2.size2());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(mat2, mat2_cpu);
+   ScalarType ret = 0;
+   ScalarType act = 0;
+
+    for (unsigned int i = 0; i < mat2_cpu.size1(); ++i)
+    {
+      for (unsigned int j = 0; j < mat2_cpu.size2(); ++j)
+      {
+         act = std::fabs(mat2_cpu(i,j) - mat1(i,j)) / std::max( std::fabs(mat2_cpu(i, j)), std::fabs(mat1(i,j)) );
+         if (act > ret)
+           ret = act;
+      }
+    }
+   //std::cout << ret << std::endl;
+   return ret;
+}
+//
+// -------------------------------------------------------------
+//
+
+template <typename NumericT, typename Epsilon,
+          typename UblasMatrixType, typename UblasVectorType,
+          typename VCLMatrixType, typename VCLVectorType1, typename VCLVectorType2>
+int test_prod_rank1(Epsilon const & epsilon,
+                    UblasMatrixType & ublas_m1, UblasVectorType & ublas_v1, UblasVectorType & ublas_v2,
+                    VCLMatrixType & vcl_m1, VCLVectorType1 & vcl_v1, VCLVectorType2 & vcl_v2)
+{
+   int retval = EXIT_SUCCESS;
+
+   // sync data:
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   /* TODO: Add rank-1 operations here */
+
+   //reset vcl_matrix:
+   viennacl::copy(ublas_m1, vcl_m1);
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product" << std::endl;
+   ublas_v1 = viennacl::linalg::prod(ublas_m1, ublas_v2);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::prod(vcl_m1, vcl_v2));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Matrix-Vector product with inplace-add" << std::endl;
+   ublas_v1 += viennacl::linalg::prod(ublas_m1, ublas_v2);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_add(), viennacl::linalg::prod(vcl_m1, vcl_v2));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Matrix-Vector product with inplace-sub" << std::endl;
+   ublas_v1 -= viennacl::linalg::prod(ublas_m1, ublas_v2);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_sub(), viennacl::linalg::prod(vcl_m1, vcl_v2));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // --------------------------------------------------------------------------
+   /*
+   std::cout << "Matrix-Vector product with scaled matrix" << std::endl;
+   ublas_v1 = viennacl::linalg::prod(NumericT(2.0) * ublas_m1, ublas_v2);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::prod(NumericT(2.0) * vcl_m1, vcl_v2));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }*/
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled vector" << std::endl;
+   /*
+   ublas_v1 = viennacl::linalg::prod(ublas_m1, NumericT(2.0) * ublas_v2);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::prod(vcl_m1, NumericT(2.0) * vcl_v2));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }*/
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled matrix and scaled vector" << std::endl;
+   /*
+   ublas_v1 = viennacl::linalg::prod(NumericT(2.0) * ublas_m1, NumericT(2.0) * ublas_v2);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::prod(NumericT(2.0) * vcl_m1, NumericT(2.0) * vcl_v2));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }*/
+
+
+   // --------------------------------------------------------------------------
+   std::cout << "Matrix-Vector product with scaled add" << std::endl;
+   NumericT alpha = static_cast<NumericT>(2.786);
+   NumericT beta = static_cast<NumericT>(3.1415);
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   ublas_v1 = alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) - beta * ublas_v1;
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) - beta * vcl_v1);
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Matrix-Vector product with scaled add, inplace-add" << std::endl;
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   ublas_v1 += alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) - beta * ublas_v1;
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_add(), alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) - beta * vcl_v1);
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Matrix-Vector product with scaled add, inplace-sub" << std::endl;
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   ublas_v1 -= alpha * viennacl::linalg::prod(ublas_m1, ublas_v2) - beta * ublas_v1;
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_sub(), alpha * viennacl::linalg::prod(vcl_m1, vcl_v2) - beta * vcl_v1);
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v1, vcl_v1)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v1, vcl_v1)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // --------------------------------------------------------------------------
+
+   viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+   viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+   std::cout << "Transposed Matrix-Vector product" << std::endl;
+   ublas_v2 = viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v2, viennacl::op_assign(), viennacl::linalg::prod(trans(vcl_m1), vcl_v1));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product, inplace-add" << std::endl;
+   ublas_v2 += viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v2, viennacl::op_inplace_add(), viennacl::linalg::prod(trans(vcl_m1), vcl_v1));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product, inplace-sub" << std::endl;
+   ublas_v2 -= viennacl::linalg::prod(trans(ublas_m1), ublas_v1);
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v2, viennacl::op_inplace_sub(), viennacl::linalg::prod(trans(vcl_m1), vcl_v1));
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // --------------------------------------------------------------------------
+   std::cout << "Transposed Matrix-Vector product with scaled add" << std::endl;
+   ublas_v2 = alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v2, viennacl::op_assign(), alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2);
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product with scaled add, inplace-add" << std::endl;
+   ublas_v2 += alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v2, viennacl::op_inplace_add(), alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2);
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   std::cout << "Transposed Matrix-Vector product with scaled add, inplace-sub" << std::endl;
+   ublas_v2 -= alpha * viennacl::linalg::prod(trans(ublas_m1), ublas_v1) + beta * ublas_v2;
+   {
+   viennacl::scheduler::statement   my_statement(vcl_v2, viennacl::op_inplace_sub(), alpha * viennacl::linalg::prod(trans(vcl_m1), vcl_v1) + beta * vcl_v2);
+   viennacl::scheduler::execute(my_statement);
+   }
+
+   if( std::fabs(diff(ublas_v2, vcl_v2)) > epsilon )
+   {
+      std::cout << "# Error at operation: transposed matrix-vector product with scaled additions" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(ublas_v2, vcl_v2)) << std::endl;
+      retval = EXIT_FAILURE;
+   }
+
+   // --------------------------------------------------------------------------
+
+   return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename F, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+   int retval = EXIT_SUCCESS;
+
+   std::size_t num_rows = 141;
+   std::size_t num_cols = 79;
+
+   // --------------------------------------------------------------------------
+   ublas::vector<NumericT> ublas_v1(num_rows);
+   for (std::size_t i = 0; i < ublas_v1.size(); ++i)
+     ublas_v1(i) = random<NumericT>();
+   ublas::vector<NumericT> ublas_v2 = ublas::scalar_vector<NumericT>(num_cols, NumericT(3.1415));
+
+
+   ublas::matrix<NumericT> ublas_m1(ublas_v1.size(), ublas_v2.size());
+
+   for (std::size_t i = 0; i < ublas_m1.size1(); ++i)
+      for (std::size_t j = 0; j < ublas_m1.size2(); ++j)
+         ublas_m1(i,j) = static_cast<NumericT>(0.1) * random<NumericT>();
+
+
+   ublas::matrix<NumericT> ublas_m2(ublas_v1.size(), ublas_v1.size());
+
+   for (std::size_t i = 0; i < ublas_m2.size1(); ++i)
+   {
+      for (std::size_t j = 0; j < ublas_m2.size2(); ++j)
+         ublas_m2(i,j) = static_cast<NumericT>(-0.1) * random<NumericT>();
+      ublas_m2(i, i) = static_cast<NumericT>(2) + random<NumericT>();
+   }
+
+
+   viennacl::vector<NumericT> vcl_v1_native(ublas_v1.size());
+   viennacl::vector<NumericT> vcl_v1_large(4 * ublas_v1.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v1_range(vcl_v1_large, viennacl::range(3, ublas_v1.size() + 3));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v1_slice(vcl_v1_large, viennacl::slice(2, 3, ublas_v1.size()));
+
+   viennacl::vector<NumericT> vcl_v2_native(ublas_v2.size());
+   viennacl::vector<NumericT> vcl_v2_large(4 * ublas_v2.size());
+   viennacl::vector_range< viennacl::vector<NumericT> > vcl_v2_range(vcl_v2_large, viennacl::range(8, ublas_v2.size() + 8));
+   viennacl::vector_slice< viennacl::vector<NumericT> > vcl_v2_slice(vcl_v2_large, viennacl::slice(6, 2, ublas_v2.size()));
+
+   viennacl::matrix<NumericT, F> vcl_m1_native(ublas_m1.size1(), ublas_m1.size2());
+   viennacl::matrix<NumericT, F> vcl_m1_large(4 * ublas_m1.size1(), 4 * ublas_m1.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m1_range(vcl_m1_large,
+                                                                        viennacl::range(8, ublas_m1.size1() + 8),
+                                                                        viennacl::range(ublas_m1.size2(), 2 * ublas_m1.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m1_slice(vcl_m1_large,
+                                                                        viennacl::slice(6, 2, ublas_m1.size1()),
+                                                                        viennacl::slice(ublas_m1.size2(), 2, ublas_m1.size2()) );
+
+   viennacl::matrix<NumericT, F> vcl_m2_native(ublas_m2.size1(), ublas_m2.size2());
+   viennacl::matrix<NumericT, F> vcl_m2_large(4 * ublas_m2.size1(), 4 * ublas_m2.size2());
+   viennacl::matrix_range< viennacl::matrix<NumericT, F> > vcl_m2_range(vcl_m2_large,
+                                                                        viennacl::range(8, ublas_m2.size1() + 8),
+                                                                        viennacl::range(ublas_m2.size2(), 2 * ublas_m2.size2()) );
+   viennacl::matrix_slice< viennacl::matrix<NumericT, F> > vcl_m2_slice(vcl_m2_large,
+                                                                        viennacl::slice(6, 2, ublas_m2.size1()),
+                                                                        viennacl::slice(ublas_m2.size2(), 2, ublas_m2.size2()) );
+
+
+/*   std::cout << "Matrix resizing (to larger)" << std::endl;
+   matrix.resize(2*num_rows, 2*num_cols, true);
+   for (unsigned int i = 0; i < matrix.size1(); ++i)
+   {
+      for (unsigned int j = (i<result.size() ? rhs.size() : 0); j < matrix.size2(); ++j)
+         matrix(i,j) = 0;
+   }
+   vcl_matrix.resize(2*num_rows, 2*num_cols, true);
+   viennacl::copy(vcl_matrix, matrix);
+   if( std::fabs(diff(matrix, vcl_matrix)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix resize (to larger)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(matrix, vcl_matrix)) << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   matrix(12, 14) = NumericT(1.9);
+   matrix(19, 16) = NumericT(1.0);
+   matrix (13, 15) =  NumericT(-9);
+   vcl_matrix(12, 14) = NumericT(1.9);
+   vcl_matrix(19, 16) = NumericT(1.0);
+   vcl_matrix (13, 15) =  NumericT(-9);
+
+   std::cout << "Matrix resizing (to smaller)" << std::endl;
+   matrix.resize(result.size(), rhs.size(), true);
+   vcl_matrix.resize(result.size(), rhs.size(), true);
+   if( std::fabs(diff(matrix, vcl_matrix)) > epsilon )
+   {
+      std::cout << "# Error at operation: matrix resize (to smaller)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(matrix, vcl_matrix)) << std::endl;
+      return EXIT_FAILURE;
+   }
+   */
+
+   //
+   // Run a bunch of tests for rank-1-updates, matrix-vector products
+   //
+   std::cout << "------------ Testing rank-1-updates and matrix-vector products ------------------" << std::endl;
+
+   std::cout << "* m = full, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = full, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = full, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = full, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_native, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   ///////////////////////////// matrix_range
+
+   std::cout << "* m = range, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = range, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = range, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = range, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_range, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   ///////////////////////////// matrix_slice
+
+   std::cout << "* m = slice, v1 = full, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = full, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = full, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_native, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   // v1 = range
+
+
+   std::cout << "* m = slice, v1 = range, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = range, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = range, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_range, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+
+   // v1 = slice
+
+   std::cout << "* m = slice, v1 = slice, v2 = full" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_native);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = range" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_range);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+
+   std::cout << "* m = slice, v1 = slice, v2 = slice" << std::endl;
+   retval = test_prod_rank1<NumericT>(epsilon,
+                                      ublas_m1, ublas_v1, ublas_v2,
+                                      vcl_m1_slice, vcl_v1_slice, vcl_v2_slice);
+   if (retval == EXIT_FAILURE)
+   {
+     std::cout << " --- FAILED! ---" << std::endl;
+     return retval;
+   }
+   else
+     std::cout << " --- PASSED ---" << std::endl;
+
+   return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Matrix" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  layout: row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-3);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  layout: column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-11;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: row-major" << std::endl;
+         retval = test<NumericT, viennacl::row_major>(epsilon);
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-11;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         std::cout << "  layout: column-major" << std::endl;
+         retval = test<NumericT, viennacl::column_major>(epsilon);
+            if( retval == EXIT_SUCCESS )
+               std::cout << "# Test passed" << std::endl;
+            else
+              return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+   std::cout << std::endl;
+   std::cout << "------- Test completed --------" << std::endl;
+   std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/scheduler_sparse.cpp b/tests/src/scheduler_sparse.cpp
new file mode 100644
index 0000000..94de314
--- /dev/null
+++ b/tests/src/scheduler_sparse.cpp
@@ -0,0 +1,456 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "examples/tutorial/Random.hpp"
+#include "examples/tutorial/vector-io.hpp"
+
+#include "viennacl/scheduler/execute.hpp"
+#include "viennacl/scheduler/io.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType>
+ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+      {
+        //if (std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) < 1e-10 )  //absolute tolerance (avoid round-off issues)
+        //  v2_cpu[i] = 0;
+        //else
+          v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      }
+      else
+         v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > 0.0001)
+      {
+        //std::cout << "Neighbor: "      << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
+        std::cout << "Error at entry " << i   << ": " << v1[i]   << " vs. " << v2_cpu[i]   << std::endl;
+        //std::cout << "Neighbor: "      << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
+        exit(EXIT_FAILURE);
+      }
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+
+template <typename ScalarType, typename VCL_MATRIX>
+ScalarType diff(ublas::compressed_matrix<ScalarType> & cpu_matrix, VCL_MATRIX & gpu_matrix)
+{
+  typedef ublas::compressed_matrix<ScalarType>  CPU_MATRIX;
+  CPU_MATRIX from_gpu;
+
+  viennacl::backend::finish();
+  viennacl::copy(gpu_matrix, from_gpu);
+
+  ScalarType error = 0;
+
+  //step 1: compare all entries from cpu_matrix with gpu_matrix:
+  //std::cout << "Ublas matrix: " << std::endl;
+  for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+        row_it != cpu_matrix.end1();
+        ++row_it)
+  {
+    //std::cout << "Row " << row_it.index1() << ": " << std::endl;
+    for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+          col_it != row_it.end();
+          ++col_it)
+    {
+      //std::cout << "(" << col_it.index2() << ", " << *col_it << std::endl;
+      ScalarType current_error = 0;
+
+      if ( std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
+        current_error = std::fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2()))
+                          / std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   );
+      if (current_error > error)
+        error = current_error;
+    }
+  }
+
+  //step 2: compare all entries from gpu_matrix with cpu_matrix (sparsity pattern might differ):
+  //std::cout << "ViennaCL matrix: " << std::endl;
+  for (typename CPU_MATRIX::const_iterator1 row_it = from_gpu.begin1();
+        row_it != from_gpu.end1();
+        ++row_it)
+  {
+    //std::cout << "Row " << row_it.index1() << ": " << std::endl;
+    for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+          col_it != row_it.end();
+          ++col_it)
+    {
+      //std::cout << "(" << col_it.index2() << ", " << *col_it << std::endl;
+      ScalarType current_error = 0;
+
+      if ( std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
+        current_error = std::fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2()))
+                          / std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   );
+      if (current_error > error)
+        error = current_error;
+    }
+  }
+
+  return error;
+}
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int retval = EXIT_SUCCESS;
+
+  // --------------------------------------------------------------------------
+  NumericT alpha = static_cast<NumericT>(2.786);
+  NumericT beta = static_cast<NumericT>(1.432);
+
+  ublas::vector<NumericT> rhs;
+  ublas::vector<NumericT> result;
+  ublas::compressed_matrix<NumericT> ublas_matrix;
+
+  if (viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx") == EXIT_FAILURE)
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+
+  rhs.resize(ublas_matrix.size2());
+  for (std::size_t i=0; i<rhs.size(); ++i)
+  {
+    ublas_matrix(i,i) = NumericT(0.5);   // Get rid of round-off errors by making row-sums unequal to zero:
+    rhs[i] = NumericT(1) + random<NumericT>();
+  }
+
+  result = rhs;
+
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::vector<NumericT> vcl_result(result.size());
+  viennacl::vector<NumericT> vcl_result2(result.size());
+  viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
+  viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
+  viennacl::ell_matrix<NumericT> vcl_ell_matrix;
+  viennacl::hyb_matrix<NumericT> vcl_hyb_matrix;
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing products: compressed_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  {
+  viennacl::scheduler::statement my_statement(vcl_result, viennacl::op_assign(), viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs));
+  viennacl::scheduler::execute(my_statement);
+  }
+  vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: coordinate_matrix" << std::endl;
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  {
+  viennacl::scheduler::statement my_statement(vcl_result, viennacl::op_assign(), viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs));
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with coordinate_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  {
+  viennacl::scheduler::statement my_statement(vcl_result2, viennacl::op_assign(), alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result);
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  //std::cout << "Copying ell_matrix" << std::endl;
+  viennacl::copy(ublas_matrix, vcl_ell_matrix);
+  ublas_matrix.clear();
+  viennacl::copy(vcl_ell_matrix, ublas_matrix);// just to check that it works
+
+
+  std::cout << "Testing products: ell_matrix" << std::endl;
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  {
+  //viennacl::scheduler::statement my_statement(vcl_result, viennacl::op_assign(), viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs));
+  //viennacl::scheduler::execute(my_statement);
+  }
+  vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with ell_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  //std::cout << "Copying hyb_matrix" << std::endl;
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+  ublas_matrix.clear();
+  viennacl::copy(vcl_hyb_matrix, ublas_matrix);// just to check that it works
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+
+  std::cout << "Testing products: hyb_matrix" << std::endl;
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  {
+  viennacl::scheduler::statement my_statement(vcl_result, viennacl::op_assign(), viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs));
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with hyb_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  // --------------------------------------------------------------------------
+  copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  copy(result.begin(), result.end(), vcl_result.begin());
+  copy(result.begin(), result.end(), vcl_result2.begin());
+  copy(ublas_matrix, vcl_compressed_matrix);
+  copy(ublas_matrix, vcl_coordinate_matrix);
+  copy(ublas_matrix, vcl_ell_matrix);
+  copy(ublas_matrix, vcl_hyb_matrix);
+
+  std::cout << "Testing scaled additions of products and vectors: compressed_matrix" << std::endl;
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  {
+  viennacl::scheduler::statement my_statement(vcl_result2, viennacl::op_assign(), alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result);
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing scaled additions of products and vectors: coordinate_matrix" << std::endl;
+  copy(result.begin(), result.end(), vcl_result.begin());
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  {
+  viennacl::scheduler::statement my_statement(vcl_result2, viennacl::op_assign(), alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result);
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing scaled additions of products and vectors: ell_matrix" << std::endl;
+  copy(result.begin(), result.end(), vcl_result.begin());
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  {
+  viennacl::scheduler::statement my_statement(vcl_result2, viennacl::op_assign(), alpha * viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs) + beta * vcl_result);
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (ell_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing scaled additions of products and vectors: hyb_matrix" << std::endl;
+  copy(result.begin(), result.end(), vcl_result.begin());
+  rhs *= NumericT(1.1);
+  vcl_rhs *= NumericT(1.1);
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  {
+  viennacl::scheduler::statement my_statement(vcl_result2, viennacl::op_assign(), alpha * viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs) + beta * vcl_result);
+  viennacl::scheduler::execute(my_statement);
+  }
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (hyb_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Sparse Matrices" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1E-4);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    retval = test<NumericT>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-13;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+#ifdef VIENNACL_WITH_OPENCL
+  else
+    std::cout << "No double precision support, skipping test..." << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/scheduler_vector.cpp b/tests/src/scheduler_vector.cpp
new file mode 100644
index 0000000..66922c4
--- /dev/null
+++ b/tests/src/scheduler_vector.cpp
@@ -0,0 +1,697 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "viennacl/scheduler/execute.hpp"
+#include "viennacl/scheduler/io.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   ublas::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return ublas::norm_inf(v2_cpu);
+}
+
+
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2, double epsilon)
+{
+  int retval = EXIT_SUCCESS;
+
+  double temp = std::fabs(diff(t1, t2));
+  if (temp > epsilon)
+  {
+    std::cout << "# Error! Relative difference: " << temp << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  else
+    std::cout << "PASSED!" << std::endl;
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon, typename UblasVectorType, typename ViennaCLVectorType1, typename ViennaCLVectorType2 >
+int test(Epsilon const& epsilon,
+         UblasVectorType     & ublas_v1, UblasVectorType     & ublas_v2,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2)
+{
+  int retval = EXIT_SUCCESS;
+
+  NumericT                    cpu_result = 42.0;
+  viennacl::scalar<NumericT>  gpu_result = 43.0;
+  NumericT                    alpha      = NumericT(3.1415);
+  NumericT                    beta       = NumericT(2.7172);
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_vector initializer..." << std::endl;
+  ublas_v1 = ublas::zero_vector<NumericT>(ublas_v1.size());
+  vcl_v1 = viennacl::zero_vector<NumericT>(vcl_v1.size());
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_vector initializer..." << std::endl;
+  ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), cpu_result);
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), cpu_result);
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), gpu_result);
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), gpu_result);
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for unit_vector initializer..." << std::endl;
+  ublas_v1 = ublas::unit_vector<NumericT>(ublas_v1.size(), 5);
+  vcl_v1 = viennacl::unit_vector<NumericT>(vcl_v1.size(), 5);
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  // --------------------------------------------------------------------------
+
+  std::cout << "Testing simple assignments..." << std::endl;
+
+  {
+  ublas_v1 = ublas_v2;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), vcl_v2); // same as vcl_v1 = vcl_v2;
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  ublas_v1 += ublas_v2;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_add(), vcl_v2); // same as vcl_v1 += vcl_v2;
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  ublas_v1 -= ublas_v2;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_sub(), vcl_v2); // same as vcl_v1 -= vcl_v2;
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Testing composite assignments..." << std::endl;
+  {
+  ublas_v1 = ublas_v1 + ublas_v2;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), vcl_v1 + vcl_v2); // same as vcl_v1 = vcl_v1 + vcl_v2;
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+  {
+  ublas_v1 += alpha * ublas_v1 - beta * ublas_v2 + ublas_v1 / beta - ublas_v2 / alpha;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_inplace_add(), alpha * vcl_v1 - beta * vcl_v2 + vcl_v1 / beta - vcl_v2 / alpha); // same as vcl_v1 += alpha * vcl_v1 - beta * vcl_v2 + vcl_v1 / beta - vcl_v2 / alpha;
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  ublas_v1 = ublas_v1 - ublas_v2;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), vcl_v1 - vcl_v2); // same as vcl_v1 = vcl_v1 - vcl_v2;
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "--- Testing reductions ---" << std::endl;
+  std::cout << "inner_prod..." << std::endl;
+  {
+  cpu_result = inner_prod(ublas_v1, ublas_v2);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::inner_prod(vcl_v1, vcl_v2)); // same as gpu_result = inner_prod(vcl_v1, vcl_v2);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  cpu_result = inner_prod(ublas_v1 + ublas_v2, ublas_v2);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2)); // same as gpu_result = inner_prod(vcl_v1 + vcl_v2, vcl_v2);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  cpu_result = inner_prod(ublas_v1, ublas_v2 - ublas_v1);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::inner_prod(vcl_v1, vcl_v2 - vcl_v1)); // same as gpu_result = inner_prod(vcl_v1, vcl_v2 - vcl_v1);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  cpu_result = inner_prod(ublas_v1 - ublas_v2, ublas_v2 + ublas_v1);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::inner_prod(vcl_v1 - vcl_v2, vcl_v2 + vcl_v1)); // same as gpu_result = inner_prod(vcl_v1 - vcl_v2, vcl_v2 + vcl_v1);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "norm_1..." << std::endl;
+  {
+  cpu_result = norm_1(ublas_v1);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::norm_1(vcl_v1)); // same as gpu_result = norm_1(vcl_v1);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  cpu_result = norm_1(ublas_v1 + ublas_v2);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::norm_1(vcl_v1 + vcl_v2)); // same as gpu_result = norm_1(vcl_v1 + vcl_v2);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "norm_2..." << std::endl;
+  {
+  cpu_result = norm_2(ublas_v1);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::norm_2(vcl_v1)); // same as gpu_result = norm_2(vcl_v1);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  cpu_result = norm_2(ublas_v1 + ublas_v2);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::norm_2(vcl_v1 + vcl_v2)); // same as gpu_result = norm_2(vcl_v1 + vcl_v2);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "norm_inf..." << std::endl;
+  {
+  cpu_result = norm_inf(ublas_v1);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::norm_inf(vcl_v1)); // same as gpu_result = norm_inf(vcl_v1);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  {
+  cpu_result = norm_inf(ublas_v1 - ublas_v2);
+  viennacl::scheduler::statement   my_statement(gpu_result, viennacl::op_assign(), viennacl::linalg::norm_inf(vcl_v1 - vcl_v2)); // same as gpu_result = norm_inf(vcl_v1 - vcl_v2);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "--- Testing elementwise operations (binary) ---" << std::endl;
+  std::cout << "x = element_prod(x, y)... ";
+  {
+  ublas_v1 = element_prod(ublas_v1, ublas_v2);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_v1, vcl_v2));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_prod(x + y, y)... ";
+  {
+  ublas_v1 = element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_prod(x, x + y)... ";
+  {
+  ublas_v1 = element_prod(ublas_v1, ublas_v1 + ublas_v2);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_prod(x - y, y + x)... ";
+  {
+  ublas_v1 = element_prod(ublas_v1 - ublas_v2, ublas_v2 + ublas_v1);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_prod(vcl_v1 - vcl_v2, vcl_v2 + vcl_v1));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+
+
+  std::cout << "x = element_div(x, y)... ";
+  {
+  ublas_v1 = element_div(ublas_v1, ublas_v2);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_div(vcl_v1, vcl_v2));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_div(x + y, y)... ";
+  {
+  ublas_v1 = element_div(ublas_v1 + ublas_v2, ublas_v2);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_div(x, x + y)... ";
+  {
+  ublas_v1 = element_div(ublas_v1, ublas_v1 + ublas_v2);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = element_div(x - y, y + x)... ";
+  {
+  ublas_v1 = element_div(ublas_v1 - ublas_v2, ublas_v2 + ublas_v1);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_div(vcl_v1 - vcl_v2, vcl_v2 + vcl_v1));
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "--- Testing elementwise operations (unary) ---" << std::endl;
+#define GENERATE_UNARY_OP_TEST(OPNAME) \
+  ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), NumericT(0.21)); \
+  ublas_v2 = NumericT(3.1415) * ublas_v1; \
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin()); \
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin()); \
+  { \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = OPNAME(ublas_v2[i]); \
+  viennacl::scheduler::statement my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_##OPNAME(vcl_v2)); \
+  viennacl::scheduler::execute(my_statement); \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+    return EXIT_FAILURE; \
+  } \
+  { \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+  ublas_v1[i] = std::OPNAME(ublas_v2[i] / NumericT(2)); \
+  viennacl::scheduler::statement my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::element_##OPNAME(vcl_v2 / NumericT(2))); \
+  viennacl::scheduler::execute(my_statement); \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+    return EXIT_FAILURE; \
+  }
+
+  GENERATE_UNARY_OP_TEST(cos);
+  GENERATE_UNARY_OP_TEST(cosh);
+  GENERATE_UNARY_OP_TEST(exp);
+  GENERATE_UNARY_OP_TEST(floor);
+  GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(log);
+  GENERATE_UNARY_OP_TEST(log10);
+  GENERATE_UNARY_OP_TEST(sin);
+  GENERATE_UNARY_OP_TEST(sinh);
+  GENERATE_UNARY_OP_TEST(fabs);
+  //GENERATE_UNARY_OP_TEST(abs); //OpenCL allows abs on integers only
+  GENERATE_UNARY_OP_TEST(sqrt);
+  GENERATE_UNARY_OP_TEST(tan);
+  GENERATE_UNARY_OP_TEST(tanh);
+
+#undef GENERATE_UNARY_OP_TEST
+
+  std::cout << "--- Testing complicated composite operations ---" << std::endl;
+  std::cout << "x = inner_prod(x, y) * y..." << std::endl;
+  {
+  ublas_v1 = inner_prod(ublas_v1, ublas_v2) * ublas_v2;
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), viennacl::linalg::inner_prod(vcl_v1, vcl_v2) * vcl_v2);
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "x = y / norm_1(x)..." << std::endl;
+  {
+  ublas_v1 = ublas_v2 / norm_1(ublas_v1);
+  viennacl::scheduler::statement   my_statement(vcl_v1, viennacl::op_assign(), vcl_v2 / viennacl::linalg::norm_1(vcl_v1) );
+  viennacl::scheduler::execute(my_statement);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 24656;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec.size());
+
+  for (std::size_t i=0; i<ublas_full_vec.size(); ++i)
+  {
+    ublas_full_vec[i]  = NumericT(1.0) + random<NumericT>();
+    ublas_full_vec2[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  ublas::range r1(    ublas_full_vec.size() / 4, 2 * ublas_full_vec.size() / 4);
+  ublas::range r2(2 * ublas_full_vec2.size() / 4, 3 * ublas_full_vec2.size() / 4);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec(ublas_full_vec, r1);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec2(ublas_full_vec2, r2);
+
+  ublas::slice s1(    ublas_full_vec.size() / 4, 3, ublas_full_vec.size() / 4);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 4, 2, ublas_full_vec2.size() / 4);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec(ublas_full_vec, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec(ublas_full_vec.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  viennacl::fast_copy(ublas_full_vec.begin(), ublas_full_vec.end(), vcl_full_vec.begin());
+  viennacl::copy(ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  viennacl::range vcl_r1(    vcl_full_vec.size() / 4, 2 * vcl_full_vec.size() / 4);
+  viennacl::range vcl_r2(2 * vcl_full_vec2.size() / 4, 3 * vcl_full_vec2.size() / 4);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec(vcl_full_vec, vcl_r1);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec2(vcl_full_vec2, vcl_r2);
+
+  {
+    viennacl::vector<NumericT> vcl_short_vec(vcl_range_vec);
+    viennacl::vector<NumericT> vcl_short_vec2 = vcl_range_vec2;
+
+    ublas::vector<NumericT> ublas_short_vec(ublas_range_vec);
+    ublas::vector<NumericT> ublas_short_vec2(ublas_range_vec2);
+
+    std::cout << "Testing creation of vectors from range..." << std::endl;
+    if (check(ublas_short_vec, vcl_short_vec, epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (check(ublas_short_vec2, vcl_short_vec2, epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+  viennacl::slice vcl_s1(    vcl_full_vec.size() / 4, 3, vcl_full_vec.size() / 4);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 4, 2, vcl_full_vec2.size() / 4);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec(vcl_full_vec, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+
+  viennacl::vector<NumericT> vcl_short_vec(vcl_slice_vec);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec(ublas_slice_vec);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec, vcl_short_vec, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Vector" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = static_cast<NumericT>(1.0E-4);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-12;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         retval = test<NumericT>(epsilon);
+         if( retval == EXIT_SUCCESS )
+           std::cout << "# Test passed" << std::endl;
+         else
+           return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/sparse.cpp b/tests/src/sparse.cpp
index ab60b43..4d07000 100644
--- a/tests/src/sparse.cpp
+++ b/tests/src/sparse.cpp
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -33,19 +34,26 @@
 #include <boost/numeric/ublas/matrix_proxy.hpp>
 #include <boost/numeric/ublas/lu.hpp>
 #include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
 
 //
 // *** ViennaCL
 //
 //#define VIENNACL_DEBUG_ALL
-#define VIENNACL_HAVE_UBLAS 1
+#define VIENNACL_WITH_UBLAS 1
 #include "viennacl/scalar.hpp"
 #include "viennacl/compressed_matrix.hpp"
+#include "viennacl/compressed_compressed_matrix.hpp"
 #include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
 #include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/jacobi_precond.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
 #include "viennacl/io/matrix_market.hpp"
 #include "examples/tutorial/Random.hpp"
 #include "examples/tutorial/vector-io.hpp"
@@ -58,10 +66,10 @@ using namespace boost::numeric;
 // -------------------------------------------------------------
 //
 template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
 {
    if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
+      return (s1 - s2) / std::max(fabs(s1), std::fabs(s2));
    return 0;
 }
 
@@ -69,14 +77,28 @@ template <typename ScalarType>
 ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
 {
    ublas::vector<ScalarType> v2_cpu(v2.size());
-   copy(v2.begin(), v2.end(), v2_cpu.begin());
+   viennacl::backend::finish();
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
 
    for (unsigned int i=0;i<v1.size(); ++i)
    {
-      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
-         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+      {
+        //if (std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) < 1e-10 )  //absolute tolerance (avoid round-off issues)
+        //  v2_cpu[i] = 0;
+        //else
+          v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      }
       else
          v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > 0.0001)
+      {
+        //std::cout << "Neighbor: "      << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
+        std::cout << "Error at entry " << i   << ": " << v1[i]   << " vs. " << v2_cpu[i]   << std::endl;
+        //std::cout << "Neighbor: "      << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
+        exit(EXIT_FAILURE);
+      }
    }
 
    return norm_inf(v2_cpu);
@@ -87,55 +109,111 @@ template <typename ScalarType, typename VCL_MATRIX>
 ScalarType diff(ublas::compressed_matrix<ScalarType> & cpu_matrix, VCL_MATRIX & gpu_matrix)
 {
   typedef ublas::compressed_matrix<ScalarType>  CPU_MATRIX;
-   CPU_MATRIX from_gpu;
-   
-   copy(gpu_matrix, from_gpu);
-
-   ScalarType error = 0;
-   
-   //step 1: compare all entries from cpu_matrix with gpu_matrix:
-    for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
-          row_it != cpu_matrix.end1();
-          ++row_it)
+  CPU_MATRIX from_gpu(gpu_matrix.size1(), gpu_matrix.size2());
+
+  viennacl::backend::finish();
+  viennacl::copy(gpu_matrix, from_gpu);
+
+  ScalarType error = 0;
+
+  //step 1: compare all entries from cpu_matrix with gpu_matrix:
+  //std::cout << "Ublas matrix: " << std::endl;
+  for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+        row_it != cpu_matrix.end1();
+        ++row_it)
+  {
+    //std::cout << "Row " << row_it.index1() << ": " << std::endl;
+    for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+          col_it != row_it.end();
+          ++col_it)
     {
-      for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
-            col_it != row_it.end();
-            ++col_it)
-      {
-        ScalarType current_error = 0;
-        
-        if ( std::max( fabs(cpu_matrix(col_it.index1(), col_it.index2())), 
-                       fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
-          current_error = fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2())) 
-                            / std::max( fabs(cpu_matrix(col_it.index1(), col_it.index2())), 
-                                        fabs(from_gpu(col_it.index1(), col_it.index2()))   );
-        if (current_error > error)
-          error = current_error;
-      }
+      //std::cout << "(" << col_it.index2() << ", " << *col_it << std::endl;
+      ScalarType current_error = 0;
+
+      if ( std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
+        current_error = std::fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2()))
+                          / std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   );
+      if (current_error > error)
+        error = current_error;
+    }
+  }
+
+  //step 2: compare all entries from gpu_matrix with cpu_matrix (sparsity pattern might differ):
+  //std::cout << "ViennaCL matrix: " << std::endl;
+  for (typename CPU_MATRIX::const_iterator1 row_it = from_gpu.begin1();
+        row_it != from_gpu.end1();
+        ++row_it)
+  {
+    //std::cout << "Row " << row_it.index1() << ": " << std::endl;
+    for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+          col_it != row_it.end();
+          ++col_it)
+    {
+      //std::cout << "(" << col_it.index2() << ", " << *col_it << std::endl;
+      ScalarType current_error = 0;
+
+      if ( std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
+        current_error = std::fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2()))
+                          / std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   );
+      if (current_error > error)
+        error = current_error;
     }
+  }
 
-   //step 2: compare all entries from gpu_matrix with cpu_matrix (sparsity pattern might differ):
-    for (typename CPU_MATRIX::const_iterator1 row_it = from_gpu.begin1();
-          row_it != from_gpu.end1();
-          ++row_it)
+  return error;
+}
+
+
+template <typename NumericT, typename VCL_MatrixT, typename Epsilon, typename UblasVectorT, typename VCLVectorT>
+int strided_matrix_vector_product_test(Epsilon epsilon,
+                                        UblasVectorT & result, UblasVectorT const & rhs,
+                                        VCLVectorT & vcl_result, VCLVectorT & vcl_rhs)
+{
+    int retval = EXIT_SUCCESS;
+
+    ublas::compressed_matrix<NumericT> ublas_matrix2(5, 4);
+    ublas_matrix2(0, 0) = NumericT(2.0); ublas_matrix2(0, 2) = NumericT(-1.0);
+    ublas_matrix2(1, 0) = NumericT(3.0); ublas_matrix2(1, 2) = NumericT(-5.0);
+    ublas_matrix2(2, 1) = NumericT(5.0); ublas_matrix2(2, 2) = NumericT(-2.0);
+    ublas_matrix2(3, 2) = NumericT(1.0); ublas_matrix2(3, 3) = NumericT(-6.0);
+    ublas_matrix2(4, 1) = NumericT(7.0); ublas_matrix2(4, 2) = NumericT(-5.0);
+    project(result, ublas::slice(1, 3, 5))     = ublas::prod(ublas_matrix2, project(rhs, ublas::slice(3, 2, 4)));
+
+    VCL_MatrixT vcl_sparse_matrix2;
+    viennacl::copy(ublas_matrix2, vcl_sparse_matrix2);
+    viennacl::vector<NumericT> vec(4);
+    vec(0) = rhs(3);
+    vec(1) = rhs(5);
+    vec(2) = rhs(7);
+    vec(3) = rhs(9);
+    viennacl::project(vcl_result, viennacl::slice(1, 3, 5)) = viennacl::linalg::prod(vcl_sparse_matrix2, viennacl::project(vcl_rhs, viennacl::slice(3, 2, 4)));
+
+    if( std::fabs(diff(result, vcl_result)) > epsilon )
     {
-      for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
-            col_it != row_it.end();
-            ++col_it)
-      {
-        ScalarType current_error = 0;
-        
-        if ( std::max( fabs(cpu_matrix(col_it.index1(), col_it.index2())), 
-                       fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
-          current_error = fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2())) 
-                            / std::max( fabs(cpu_matrix(col_it.index1(), col_it.index2())), 
-                                        fabs(from_gpu(col_it.index1(), col_it.index2()))   );
-        if (current_error > error)
-          error = current_error;
-      }
+      std::cout << "# Error at operation: matrix-vector product with stided vectors, part 1" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+      retval = EXIT_FAILURE;
     }
+    vcl_result(1)  = NumericT(1.0);
+    vcl_result(4)  = NumericT(1.0);
+    vcl_result(7)  = NumericT(1.0);
+    vcl_result(10) = NumericT(1.0);
+    vcl_result(13) = NumericT(1.0);
 
-   return error;
+    viennacl::project(vcl_result, viennacl::slice(1, 3, 5)) = viennacl::linalg::prod(vcl_sparse_matrix2, vec);
+
+    if( std::fabs(diff(result, vcl_result)) > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-vector product with strided vectors, part 2" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+      retval = EXIT_FAILURE;
+    }
+
+    return retval;
 }
 
 
@@ -143,73 +221,162 @@ template< typename NumericT, typename VCL_MATRIX, typename Epsilon >
 int resize_test(Epsilon const& epsilon)
 {
    int retval = EXIT_SUCCESS;
-   
+
    ublas::compressed_matrix<NumericT> ublas_matrix(5,5);
-   VCL_MATRIX vcl_matrix;    
-   
-   ublas_matrix(0,0) = 10.0; ublas_matrix(0, 1) = 0.1; ublas_matrix(0, 2) = 0.2; ublas_matrix(0, 3) = 0.3; ublas_matrix(0, 4) = 0.4;
-   ublas_matrix(1,0) = 1.0; ublas_matrix(1, 1) = 1.1; ublas_matrix(1, 2) = 1.2; ublas_matrix(1, 3) = 1.3; ublas_matrix(1, 4) = 1.4;
-   ublas_matrix(2,0) = 2.0; ublas_matrix(2, 1) = 2.1; ublas_matrix(2, 2) = 2.2; ublas_matrix(2, 3) = 2.3; ublas_matrix(2, 4) = 2.4;
-   ublas_matrix(3,0) = 3.0; ublas_matrix(3, 1) = 3.1; ublas_matrix(3, 2) = 3.2; ublas_matrix(3, 3) = 3.3; ublas_matrix(3, 4) = 3.4;
-   ublas_matrix(4,0) = 4.0; ublas_matrix(4, 1) = 4.1; ublas_matrix(4, 2) = 4.2; ublas_matrix(4, 3) = 4.3; ublas_matrix(4, 4) = 4.4;
-   
-   copy(ublas_matrix, vcl_matrix); ublas_matrix.clear();
-   copy(vcl_matrix, ublas_matrix);
-   
-   std::cout << "Checking for equality after copy..." << std::endl;   
-    if( fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
+   VCL_MATRIX vcl_matrix;
+
+   ublas_matrix(0,0) = NumericT(10.0); ublas_matrix(0, 1) = NumericT(0.1); ublas_matrix(0, 2) = NumericT(0.2); ublas_matrix(0, 3) = NumericT(0.3); ublas_matrix(0, 4) = NumericT(0.4);
+   ublas_matrix(1,0) = NumericT(1.0);  ublas_matrix(1, 1) = NumericT(1.1); ublas_matrix(1, 2) = NumericT(1.2); ublas_matrix(1, 3) = NumericT(1.3); ublas_matrix(1, 4) = NumericT(1.4);
+   ublas_matrix(2,0) = NumericT(2.0);  ublas_matrix(2, 1) = NumericT(2.1); ublas_matrix(2, 2) = NumericT(2.2); ublas_matrix(2, 3) = NumericT(2.3); ublas_matrix(2, 4) = NumericT(2.4);
+   ublas_matrix(3,0) = NumericT(3.0);  ublas_matrix(3, 1) = NumericT(3.1); ublas_matrix(3, 2) = NumericT(3.2); ublas_matrix(3, 3) = NumericT(3.3); ublas_matrix(3, 4) = NumericT(3.4);
+   ublas_matrix(4,0) = NumericT(4.0);  ublas_matrix(4, 1) = NumericT(4.1); ublas_matrix(4, 2) = NumericT(4.2); ublas_matrix(4, 3) = NumericT(4.3); ublas_matrix(4, 4) = NumericT(4.4);
+
+   viennacl::copy(ublas_matrix, vcl_matrix);
+   ublas::compressed_matrix<NumericT> other_matrix(ublas_matrix.size1(), ublas_matrix.size2());
+   viennacl::copy(vcl_matrix, other_matrix);
+
+   std::cout << "Checking for equality after copy..." << std::endl;
+    if( std::fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
     {
         std::cout << "# Error at operation: equality after copy with sparse matrix" << std::endl;
-        std::cout << "  diff: " << fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
-        retval = EXIT_FAILURE;
+        std::cout << "  diff: " << std::fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
+        return EXIT_FAILURE;
     }
-   
+
    std::cout << "Testing resize to larger..." << std::endl;
    ublas_matrix.resize(10, 10, false); //ublas does not allow preserve = true here
-   ublas_matrix(0,0) = 10.0; ublas_matrix(0, 1) = 0.1; ublas_matrix(0, 2) = 0.2; ublas_matrix(0, 3) = 0.3; ublas_matrix(0, 4) = 0.4;
-   ublas_matrix(1,0) = 1.0; ublas_matrix(1, 1) = 1.1; ublas_matrix(1, 2) = 1.2; ublas_matrix(1, 3) = 1.3; ublas_matrix(1, 4) = 1.4;
-   ublas_matrix(2,0) = 2.0; ublas_matrix(2, 1) = 2.1; ublas_matrix(2, 2) = 2.2; ublas_matrix(2, 3) = 2.3; ublas_matrix(2, 4) = 2.4;
-   ublas_matrix(3,0) = 3.0; ublas_matrix(3, 1) = 3.1; ublas_matrix(3, 2) = 3.2; ublas_matrix(3, 3) = 3.3; ublas_matrix(3, 4) = 3.4;
-   ublas_matrix(4,0) = 4.0; ublas_matrix(4, 1) = 4.1; ublas_matrix(4, 2) = 4.2; ublas_matrix(4, 3) = 4.3; ublas_matrix(4, 4) = 4.4;
+   ublas_matrix(0,0) = NumericT(10.0); ublas_matrix(0, 1) = NumericT(0.1); ublas_matrix(0, 2) = NumericT(0.2); ublas_matrix(0, 3) = NumericT(0.3); ublas_matrix(0, 4) = NumericT(0.4);
+   ublas_matrix(1,0) = NumericT( 1.0); ublas_matrix(1, 1) = NumericT(1.1); ublas_matrix(1, 2) = NumericT(1.2); ublas_matrix(1, 3) = NumericT(1.3); ublas_matrix(1, 4) = NumericT(1.4);
+   ublas_matrix(2,0) = NumericT( 2.0); ublas_matrix(2, 1) = NumericT(2.1); ublas_matrix(2, 2) = NumericT(2.2); ublas_matrix(2, 3) = NumericT(2.3); ublas_matrix(2, 4) = NumericT(2.4);
+   ublas_matrix(3,0) = NumericT( 3.0); ublas_matrix(3, 1) = NumericT(3.1); ublas_matrix(3, 2) = NumericT(3.2); ublas_matrix(3, 3) = NumericT(3.3); ublas_matrix(3, 4) = NumericT(3.4);
+   ublas_matrix(4,0) = NumericT( 4.0); ublas_matrix(4, 1) = NumericT(4.1); ublas_matrix(4, 2) = NumericT(4.2); ublas_matrix(4, 3) = NumericT(4.3); ublas_matrix(4, 4) = NumericT(4.4);
    //std::cout << ublas_matrix << std::endl;
-   
+
    vcl_matrix.resize(10, 10, true);
-   
-    if( fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
+
+    if( std::fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
     {
         std::cout << "# Error at operation: resize (to larger) with sparse matrix" << std::endl;
-        std::cout << "  diff: " << fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
-        retval = EXIT_FAILURE;
+        std::cout << "  diff: " << std::fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
+        return EXIT_FAILURE;
     }
 
-   ublas_matrix(5,5) = 5.5; ublas_matrix(5, 6) = 5.6; ublas_matrix(5, 7) = 5.7; ublas_matrix(5, 8) = 5.8; ublas_matrix(5, 9) = 5.9;
-   ublas_matrix(6,5) = 6.5; ublas_matrix(6, 6) = 6.6; ublas_matrix(6, 7) = 6.7; ublas_matrix(6, 8) = 6.8; ublas_matrix(6, 9) = 6.9;
-   ublas_matrix(7,5) = 7.5; ublas_matrix(7, 6) = 7.6; ublas_matrix(7, 7) = 7.7; ublas_matrix(7, 8) = 7.8; ublas_matrix(7, 9) = 7.9;
-   ublas_matrix(8,5) = 8.5; ublas_matrix(8, 6) = 8.6; ublas_matrix(8, 7) = 8.7; ublas_matrix(8, 8) = 8.8; ublas_matrix(8, 9) = 8.9;
-   ublas_matrix(9,5) = 9.5; ublas_matrix(9, 6) = 9.6; ublas_matrix(9, 7) = 9.7; ublas_matrix(9, 8) = 9.8; ublas_matrix(9, 9) = 9.9;
-   copy(ublas_matrix, vcl_matrix);
-    
+   ublas_matrix(5,5) = NumericT(5.5); ublas_matrix(5, 6) = NumericT(5.6); ublas_matrix(5, 7) = NumericT(5.7); ublas_matrix(5, 8) = NumericT(5.8); ublas_matrix(5, 9) = NumericT(5.9);
+   ublas_matrix(6,5) = NumericT(6.5); ublas_matrix(6, 6) = NumericT(6.6); ublas_matrix(6, 7) = NumericT(6.7); ublas_matrix(6, 8) = NumericT(6.8); ublas_matrix(6, 9) = NumericT(6.9);
+   ublas_matrix(7,5) = NumericT(7.5); ublas_matrix(7, 6) = NumericT(7.6); ublas_matrix(7, 7) = NumericT(7.7); ublas_matrix(7, 8) = NumericT(7.8); ublas_matrix(7, 9) = NumericT(7.9);
+   ublas_matrix(8,5) = NumericT(8.5); ublas_matrix(8, 6) = NumericT(8.6); ublas_matrix(8, 7) = NumericT(8.7); ublas_matrix(8, 8) = NumericT(8.8); ublas_matrix(8, 9) = NumericT(8.9);
+   ublas_matrix(9,5) = NumericT(9.5); ublas_matrix(9, 6) = NumericT(9.6); ublas_matrix(9, 7) = NumericT(9.7); ublas_matrix(9, 8) = NumericT(9.8); ublas_matrix(9, 9) = NumericT(9.9);
+   viennacl::copy(ublas_matrix, vcl_matrix);
+
    std::cout << "Testing resize to smaller..." << std::endl;
    ublas_matrix.resize(7, 7, false); //ublas does not allow preserve = true here
-   ublas_matrix(0,0) = 10.0; ublas_matrix(0, 1) = 0.1; ublas_matrix(0, 2) = 0.2; ublas_matrix(0, 3) = 0.3; ublas_matrix(0, 4) = 0.4;
-   ublas_matrix(1,0) = 1.0; ublas_matrix(1, 1) = 1.1; ublas_matrix(1, 2) = 1.2; ublas_matrix(1, 3) = 1.3; ublas_matrix(1, 4) = 1.4;
-   ublas_matrix(2,0) = 2.0; ublas_matrix(2, 1) = 2.1; ublas_matrix(2, 2) = 2.2; ublas_matrix(2, 3) = 2.3; ublas_matrix(2, 4) = 2.4;
-   ublas_matrix(3,0) = 3.0; ublas_matrix(3, 1) = 3.1; ublas_matrix(3, 2) = 3.2; ublas_matrix(3, 3) = 3.3; ublas_matrix(3, 4) = 3.4;
-   ublas_matrix(4,0) = 4.0; ublas_matrix(4, 1) = 4.1; ublas_matrix(4, 2) = 4.2; ublas_matrix(4, 3) = 4.3; ublas_matrix(4, 4) = 4.4;
-   ublas_matrix(5,5) = 5.5; ublas_matrix(5, 6) = 5.6; ublas_matrix(5, 7) = 5.7; ublas_matrix(5, 8) = 5.8; ublas_matrix(5, 9) = 5.9;
-   ublas_matrix(6,5) = 6.5; ublas_matrix(6, 6) = 6.6; ublas_matrix(6, 7) = 6.7; ublas_matrix(6, 8) = 6.8; ublas_matrix(6, 9) = 6.9;
+   ublas_matrix(0,0) = NumericT(10.0); ublas_matrix(0, 1) = NumericT(0.1); ublas_matrix(0, 2) = NumericT(0.2); ublas_matrix(0, 3) = NumericT(0.3); ublas_matrix(0, 4) = NumericT(0.4);
+   ublas_matrix(1,0) = NumericT( 1.0); ublas_matrix(1, 1) = NumericT(1.1); ublas_matrix(1, 2) = NumericT(1.2); ublas_matrix(1, 3) = NumericT(1.3); ublas_matrix(1, 4) = NumericT(1.4);
+   ublas_matrix(2,0) = NumericT( 2.0); ublas_matrix(2, 1) = NumericT(2.1); ublas_matrix(2, 2) = NumericT(2.2); ublas_matrix(2, 3) = NumericT(2.3); ublas_matrix(2, 4) = NumericT(2.4);
+   ublas_matrix(3,0) = NumericT( 3.0); ublas_matrix(3, 1) = NumericT(3.1); ublas_matrix(3, 2) = NumericT(3.2); ublas_matrix(3, 3) = NumericT(3.3); ublas_matrix(3, 4) = NumericT(3.4);
+   ublas_matrix(4,0) = NumericT( 4.0); ublas_matrix(4, 1) = NumericT(4.1); ublas_matrix(4, 2) = NumericT(4.2); ublas_matrix(4, 3) = NumericT(4.3); ublas_matrix(4, 4) = NumericT(4.4);
+   ublas_matrix(5,5) = NumericT( 5.5); ublas_matrix(5, 6) = NumericT(5.6); ublas_matrix(5, 7) = NumericT(5.7); ublas_matrix(5, 8) = NumericT(5.8); ublas_matrix(5, 9) = NumericT(5.9);
+   ublas_matrix(6,5) = NumericT( 6.5); ublas_matrix(6, 6) = NumericT(6.6); ublas_matrix(6, 7) = NumericT(6.7); ublas_matrix(6, 8) = NumericT(6.8); ublas_matrix(6, 9) = NumericT(6.9);
 
    vcl_matrix.resize(7, 7);
 
    //std::cout << ublas_matrix << std::endl;
-    if( fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
+    if( std::fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
     {
         std::cout << "# Error at operation: resize (to smaller) with sparse matrix" << std::endl;
-        std::cout << "  diff: " << fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
+        std::cout << "  diff: " << std::fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
         retval = EXIT_FAILURE;
     }
-    
-   return retval;
+
+   ublas::vector<NumericT> ublas_vec = ublas::scalar_vector<NumericT>(ublas_matrix.size1(), NumericT(3.1415));
+   viennacl::vector<NumericT> vcl_vec(ublas_matrix.size1());
+
+
+  std::cout << "Testing transposed unit lower triangular solve: compressed_matrix" << std::endl;
+  viennacl::copy(ublas_vec, vcl_vec);
+  std::cout << "matrix: " << ublas_matrix << std::endl;
+  std::cout << "vector: " << ublas_vec << std::endl;
+  std::cout << "ViennaCL matrix size: " << vcl_matrix.size1() << " x " << vcl_matrix.size2() << std::endl;
+
+  std::cout << "ublas..." << std::endl;
+  boost::numeric::ublas::inplace_solve((ublas_matrix), ublas_vec, boost::numeric::ublas::unit_lower_tag());
+  std::cout << "ViennaCL..." << std::endl;
+  viennacl::linalg::inplace_solve((vcl_matrix), vcl_vec, viennacl::linalg::unit_lower_tag());
+
+  /*
+  std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+  std::list< std::size_t > multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+
+  viennacl::linalg::detail::multifrontal_setup_L(vcl_matrix,
+                                                  multifrontal_U_diagonal_, //dummy
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+  viennacl::linalg::detail::multifrontal_substitute(vcl_vec,
+                                                    multifrontal_L_row_index_arrays_,
+                                                    multifrontal_L_row_buffers_,
+                                                    multifrontal_L_col_buffers_,
+                                                    multifrontal_L_element_buffers_,
+                                                    multifrontal_L_row_elimination_num_list_);
+
+
+  std::cout << "ublas..." << std::endl;
+  boost::numeric::ublas::inplace_solve((ublas_matrix), ublas_vec, boost::numeric::ublas::upper_tag());
+  std::cout << "ViennaCL..." << std::endl;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+  std::list< std::size_t > multifrontal_U_row_elimination_num_list_;
+
+  multifrontal_U_diagonal_.resize(vcl_matrix.size1(), false);
+  viennacl::linalg::single_threaded::detail::row_info(vcl_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+  viennacl::linalg::detail::multifrontal_setup_U(vcl_matrix,
+                                                 multifrontal_U_diagonal_,
+                                                 multifrontal_U_row_index_arrays_,
+                                                 multifrontal_U_row_buffers_,
+                                                 multifrontal_U_col_buffers_,
+                                                 multifrontal_U_element_buffers_,
+                                                 multifrontal_U_row_elimination_num_list_);
+
+  vcl_vec = viennacl::linalg::element_div(vcl_vec, multifrontal_U_diagonal_);
+  viennacl::linalg::detail::multifrontal_substitute(vcl_vec,
+                                                    multifrontal_U_row_index_arrays_,
+                                                    multifrontal_U_row_buffers_,
+                                                    multifrontal_U_col_buffers_,
+                                                    multifrontal_U_element_buffers_,
+                                                    multifrontal_U_row_elimination_num_list_);
+  */
+  for (std::size_t i=0; i<ublas_vec.size(); ++i)
+  {
+    std::cout << ublas_vec[i] << " vs. " << vcl_vec[i] << std::endl;
+  }
+
+  /*std::cout << "Testing transposed unit upper triangular solve: compressed_matrix" << std::endl;
+  viennacl::copy(ublas_vec, vcl_vec);
+  std::cout << "matrix: " << ublas_matrix << std::endl;
+  std::cout << "vector: " << ublas_vec << std::endl;
+  std::cout << "ViennaCL matrix size: " << vcl_matrix.size1() << " x " << vcl_matrix.size2() << std::endl;
+
+  std::cout << "ublas..." << std::endl;
+  boost::numeric::ublas::inplace_solve((ublas_matrix), ublas_vec, boost::numeric::ublas::lower_tag());
+  std::cout << "ViennaCL..." << std::endl;
+  viennacl::linalg::inplace_solve((vcl_matrix), vcl_vec, viennacl::linalg::lower_tag());
+
+  for (std::size_t i=0; i<ublas_vec.size(); ++i)
+  {
+    std::cout << ublas_vec[i] << " vs. " << vcl_vec[i] << std::endl;
+  }*/
+
+  return retval;
 }
 
 
@@ -219,191 +386,506 @@ int resize_test(Epsilon const& epsilon)
 template< typename NumericT, typename Epsilon >
 int test(Epsilon const& epsilon)
 {
-   std::cout << "Testing resizing of compressed_matrix..." << std::endl;
-   int retval = resize_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon);
-   std::cout << "Testing resizing of coordinate_matrix..." << std::endl;
-   if (retval != EXIT_FAILURE)
-     retval = resize_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon);
-   
-   // --------------------------------------------------------------------------            
-   ublas::vector<NumericT> rhs;
-   ublas::vector<NumericT> result;
-   ublas::compressed_matrix<NumericT> ublas_matrix;
-
-    if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx"))
+  std::cout << "Testing resizing of compressed_matrix..." << std::endl;
+  int retval = resize_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+  std::cout << "Testing resizing of coordinate_matrix..." << std::endl;
+  //if (retval != EXIT_FAILURE)
+  //  retval = resize_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon);
+  //else
+  //  return retval;
+
+  // --------------------------------------------------------------------------
+  ublas::vector<NumericT> rhs;
+  ublas::vector<NumericT> result;
+  ublas::compressed_matrix<NumericT> ublas_matrix;
+
+  if (viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx") == EXIT_FAILURE)
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+
+  rhs.resize(ublas_matrix.size2());
+  for (std::size_t i=0; i<rhs.size(); ++i)
+  {
+    ublas_matrix(i,i) = NumericT(0.5);   // Get rid of round-off errors by making row-sums unequal to zero:
+    rhs[i] = NumericT(1) + random<NumericT>();
+  }
+
+  // add some random numbers to the double-compressed matrix:
+  ublas::compressed_matrix<NumericT> ublas_cc_matrix(ublas_matrix.size1(), ublas_matrix.size2());
+  ublas_cc_matrix(42,199) = NumericT(3.1415);
+  ublas_cc_matrix(31, 69) = NumericT(2.71);
+  ublas_cc_matrix(23, 32) = NumericT(6);
+  ublas_cc_matrix(177,57) = NumericT(4);
+  ublas_cc_matrix(21, 97) = NumericT(-4);
+  ublas_cc_matrix(92, 25) = NumericT(2);
+  ublas_cc_matrix(89, 62) = NumericT(11);
+  ublas_cc_matrix(1,   7) = NumericT(8);
+  ublas_cc_matrix(85, 41) = NumericT(13);
+  ublas_cc_matrix(66, 28) = NumericT(8);
+  ublas_cc_matrix(21, 74) = NumericT(-2);
+
+
+  result = rhs;
+
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::vector<NumericT> vcl_result(result.size());
+  viennacl::vector<NumericT> vcl_result2(result.size());
+  viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
+  viennacl::compressed_compressed_matrix<NumericT> vcl_compressed_compressed_matrix(rhs.size(), rhs.size());
+  viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
+  viennacl::ell_matrix<NumericT> vcl_ell_matrix;
+  viennacl::hyb_matrix<NumericT> vcl_hyb_matrix;
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+  viennacl::copy(ublas_cc_matrix, vcl_compressed_compressed_matrix);
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing products: ublas" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+
+  std::cout << "Testing products: compressed_matrix" << std::endl;
+  vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: compressed_matrix, strided vectors" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+  //
+  // Triangular solvers for A \ b:
+  //
+  ublas::compressed_matrix<NumericT> ublas_matrix_trans(ublas_matrix.size2(), ublas_matrix.size1(), ublas_matrix.nnz()); // = trans(ublas_matrix); //note: triangular solvers with uBLAS show atrocious performance, while transposed solvers are quite okay. To keep execution times short, we use a double-transpose-trick in the following.
+
+  // fast transpose:
+  for (typename ublas::compressed_matrix<NumericT>::iterator1 row_it  = ublas_matrix.begin1();
+                                                              row_it != ublas_matrix.end1();
+                                                            ++row_it)
+  {
+    for (typename ublas::compressed_matrix<NumericT>::iterator2 col_it  = row_it.begin();
+                                                                col_it != row_it.end();
+                                                              ++col_it)
     {
-      std::cout << "Error reading Matrix file" << std::endl;
-      return EXIT_FAILURE;
+      ublas_matrix_trans(col_it.index1(), col_it.index2()) = *col_it;
     }
-    //unsigned int cg_mat_size = cg_mat.size(); 
-    std::cout << "done reading matrix" << std::endl;
-
-    if (!readVectorFromFile("../../examples/testdata/rhs65025.txt", rhs))
-    {
-      std::cout << "Error reading RHS file" << std::endl;
-      return EXIT_FAILURE;
-    }
-    std::cout << "done reading rhs" << std::endl;
-
-    if (!readVectorFromFile("../../examples/testdata/result65025.txt", result))
+  }
+
+
+  std::cout << "Testing unit upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::unit_upper_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::unit_upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::upper_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing unit lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::unit_lower_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::unit_lower_tag());
+
+  /*std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+  std::list< std::size_t > multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+
+  viennacl::switch_memory_domain(multifrontal_U_diagonal_, viennacl::MAIN_MEMORY);
+  multifrontal_U_diagonal_.resize(vcl_compressed_matrix.size1(), false);
+  viennacl::linalg::single_threaded::detail::row_info(vcl_compressed_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+  viennacl::linalg::detail::multifrontal_setup_L(vcl_compressed_matrix,
+                                                  multifrontal_U_diagonal_, //dummy
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+  viennacl::linalg::detail::multifrontal_substitute(vcl_result,
+                                                    multifrontal_L_row_index_arrays_,
+                                                    multifrontal_L_row_buffers_,
+                                                    multifrontal_L_col_buffers_,
+                                                    multifrontal_L_element_buffers_,
+                                                    multifrontal_L_row_elimination_num_list_);*/
+
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::lower_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::lower_tag());
+
+  /*std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+  std::list< std::size_t > multifrontal_U_row_elimination_num_list_;
+
+  multifrontal_U_diagonal_.resize(vcl_compressed_matrix.size1(), false);
+  viennacl::linalg::single_threaded::detail::row_info(vcl_compressed_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+  viennacl::linalg::detail::multifrontal_setup_U(vcl_compressed_matrix,
+                                                 multifrontal_U_diagonal_,
+                                                 multifrontal_U_row_index_arrays_,
+                                                 multifrontal_U_row_buffers_,
+                                                 multifrontal_U_col_buffers_,
+                                                 multifrontal_U_element_buffers_,
+                                                 multifrontal_U_row_elimination_num_list_);
+
+  vcl_result = viennacl::linalg::element_div(vcl_result, multifrontal_U_diagonal_);
+  viennacl::linalg::detail::multifrontal_substitute(vcl_result,
+                                                    multifrontal_U_row_index_arrays_,
+                                                    multifrontal_U_row_buffers_,
+                                                    multifrontal_U_col_buffers_,
+                                                    multifrontal_U_element_buffers_,
+                                                    multifrontal_U_row_elimination_num_list_);*/
+
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+/*
+  std::cout << "Testing lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(ublas_matrix, result, boost::numeric::ublas::lower_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::lower_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }*/
+
+  //
+  // Triangular solvers for A^T \ b
+  //
+
+  std::cout << "Testing transposed unit upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::unit_upper_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::unit_upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing transposed upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::upper_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing transposed unit lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::unit_lower_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::unit_lower_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing transposed lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::lower_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::lower_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing products: compressed_compressed_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_cc_matrix, rhs);
+  vcl_result = viennacl::linalg::prod(vcl_compressed_compressed_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  {
+    ublas::compressed_matrix<NumericT> temp(vcl_compressed_compressed_matrix.size1(), vcl_compressed_compressed_matrix.size2());
+    viennacl::copy(vcl_compressed_compressed_matrix, temp);
+
+    // check that entries are correct by computing the product again:
+    result     = viennacl::linalg::prod(temp, rhs);
+
+    if( std::fabs(diff(result, vcl_result)) > epsilon )
     {
-      std::cout << "Error reading Result file" << std::endl;
-      return EXIT_FAILURE;
-    }
-    std::cout << "done reading result" << std::endl;
-   
-
-   viennacl::vector<NumericT> vcl_rhs(rhs.size());
-   viennacl::vector<NumericT> vcl_result(result.size()); 
-   viennacl::vector<NumericT> vcl_result2(result.size()); 
-   viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
-   viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
-
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(ublas_matrix, vcl_compressed_matrix);
-   copy(ublas_matrix, vcl_coordinate_matrix);
-
-   // --------------------------------------------------------------------------          
-   std::cout << "Testing products: ublas" << std::endl;
-   result     = viennacl::linalg::prod(ublas_matrix, rhs);
-   std::cout << "Testing products: compressed_matrix" << std::endl;
-   vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
-   
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
+      std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix (after copy back)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
       retval = EXIT_FAILURE;
-   }
-   
-/*   std::cout << "Benching products: coordinate_matrix" << std::endl;
-   vcl_result = viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs);
-
-   if( fabs(diff(result, vcl_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product with coordinate_matrix" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }*/
-   
-   // --------------------------------------------------------------------------            
-   // --------------------------------------------------------------------------            
-   NumericT alpha = static_cast<NumericT>(2.786);
-   NumericT beta = static_cast<NumericT>(1.432);
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(result.begin(), result.end(), vcl_result.begin());
-   copy(result.begin(), result.end(), vcl_result2.begin());
-
-   std::cout << "Testing scaled additions of products and vectors" << std::endl;
-   result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
-   vcl_result2 = alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result;
-
-   if( fabs(diff(result, vcl_result2)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result2)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
+    }
 
-   
-/*   vcl_result2 = alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result;
+  }
 
-   if( fabs(diff(result, vcl_result2)) > epsilon )
-   {
-      std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(result, vcl_result2)) << std::endl;
-      retval = EXIT_FAILURE;
-   }*/
-
-   
-   // --------------------------------------------------------------------------            
-   return retval;
+
+
+
+  std::cout << "Testing products: coordinate_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  vcl_result = viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with coordinate_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: coordinate_matrix, strided vectors" << std::endl;
+  //std::cout << " --> SKIPPING <--" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+
+  //std::cout << "Copying ell_matrix" << std::endl;
+  viennacl::copy(ublas_matrix, vcl_ell_matrix);
+  ublas_matrix.clear();
+  viennacl::copy(vcl_ell_matrix, ublas_matrix);// just to check that it's works
+
+
+  std::cout << "Testing products: ell_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  vcl_result.clear();
+  vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
+  //viennacl::linalg::prod_impl(vcl_ell_matrix, vcl_rhs, vcl_result);
+  //std::cout << vcl_result << "\n";
+  //std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+  //std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with ell_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: ell_matrix, strided vectors" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::ell_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+
+  //std::cout << "Copying hyb_matrix" << std::endl;
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+  ublas_matrix.clear();
+  viennacl::copy(vcl_hyb_matrix, ublas_matrix);// just to check that it's works
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+
+  std::cout << "Testing products: hyb_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  vcl_result.clear();
+  vcl_result = viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs);
+  //viennacl::linalg::prod_impl(vcl_hyb_matrix, vcl_rhs, vcl_result);
+  //std::cout << vcl_result << "\n";
+  //std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+  //std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with hyb_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: hyb_matrix, strided vectors" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::hyb_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+
+  // --------------------------------------------------------------------------
+  // --------------------------------------------------------------------------
+  NumericT alpha = static_cast<NumericT>(2.786);
+  NumericT beta = static_cast<NumericT>(1.432);
+  copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  copy(result.begin(), result.end(), vcl_result.begin());
+  copy(result.begin(), result.end(), vcl_result2.begin());
+
+  std::cout << "Testing scaled additions of products and vectors" << std::endl;
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  vcl_result2.clear();
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  vcl_result2.clear();
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (ell_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  vcl_result2.clear();
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (hyb_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  return retval;
 }
 //
 // -------------------------------------------------------------
 //
 int main()
 {
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "## Test :: Sparse Matrices" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-
-   int retval = EXIT_SUCCESS;
-
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = static_cast<NumericT>(5.0E-2);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      retval = test<NumericT>(epsilon);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   
-/*   {
-      typedef float NumericT;
-      NumericT epsilon = 1.0E-6;
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Sparse Matrices" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1E-4);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    retval = test<NumericT>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-12;
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
+      std::cout << "  numeric: double" << std::endl;
       retval = test<NumericT>(epsilon);
       if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
+        std::cout << "# Test passed" << std::endl;
       else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;*/
-   
-   if( viennacl::ocl::current_device().double_support() )
-   {
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-11;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      
-/*      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-15;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;*/
-   }
-   else
-     std::cout << "No double precision support..." << std::endl;
-   return retval;
+        return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+#ifdef VIENNACL_WITH_OPENCL
+  else
+    std::cout << "No double precision support, skipping test..." << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
 }
diff --git a/tests/src/sparse.cu b/tests/src/sparse.cu
new file mode 100644
index 0000000..4d07000
--- /dev/null
+++ b/tests/src/sparse.cu
@@ -0,0 +1,891 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+//
+// *** System
+//
+#include <iostream>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/scalar.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/compressed_compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "examples/tutorial/Random.hpp"
+#include "examples/tutorial/vector-io.hpp"
+
+//
+// -------------------------------------------------------------
+//
+using namespace boost::numeric;
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2)
+{
+   if (s1 != s2)
+      return (s1 - s2) / std::max(fabs(s1), std::fabs(s2));
+   return 0;
+}
+
+template <typename ScalarType>
+ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+      {
+        //if (std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) < 1e-10 )  //absolute tolerance (avoid round-off issues)
+        //  v2_cpu[i] = 0;
+        //else
+          v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      }
+      else
+         v2_cpu[i] = 0.0;
+
+      if (v2_cpu[i] > 0.0001)
+      {
+        //std::cout << "Neighbor: "      << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
+        std::cout << "Error at entry " << i   << ": " << v1[i]   << " vs. " << v2_cpu[i]   << std::endl;
+        //std::cout << "Neighbor: "      << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
+        exit(EXIT_FAILURE);
+      }
+   }
+
+   return norm_inf(v2_cpu);
+}
+
+
+template <typename ScalarType, typename VCL_MATRIX>
+ScalarType diff(ublas::compressed_matrix<ScalarType> & cpu_matrix, VCL_MATRIX & gpu_matrix)
+{
+  typedef ublas::compressed_matrix<ScalarType>  CPU_MATRIX;
+  CPU_MATRIX from_gpu(gpu_matrix.size1(), gpu_matrix.size2());
+
+  viennacl::backend::finish();
+  viennacl::copy(gpu_matrix, from_gpu);
+
+  ScalarType error = 0;
+
+  //step 1: compare all entries from cpu_matrix with gpu_matrix:
+  //std::cout << "Ublas matrix: " << std::endl;
+  for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+        row_it != cpu_matrix.end1();
+        ++row_it)
+  {
+    //std::cout << "Row " << row_it.index1() << ": " << std::endl;
+    for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+          col_it != row_it.end();
+          ++col_it)
+    {
+      //std::cout << "(" << col_it.index2() << ", " << *col_it << std::endl;
+      ScalarType current_error = 0;
+
+      if ( std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
+        current_error = std::fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2()))
+                          / std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   );
+      if (current_error > error)
+        error = current_error;
+    }
+  }
+
+  //step 2: compare all entries from gpu_matrix with cpu_matrix (sparsity pattern might differ):
+  //std::cout << "ViennaCL matrix: " << std::endl;
+  for (typename CPU_MATRIX::const_iterator1 row_it = from_gpu.begin1();
+        row_it != from_gpu.end1();
+        ++row_it)
+  {
+    //std::cout << "Row " << row_it.index1() << ": " << std::endl;
+    for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+          col_it != row_it.end();
+          ++col_it)
+    {
+      //std::cout << "(" << col_it.index2() << ", " << *col_it << std::endl;
+      ScalarType current_error = 0;
+
+      if ( std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   ) > 0 )
+        current_error = std::fabs(cpu_matrix(col_it.index1(), col_it.index2()) - from_gpu(col_it.index1(), col_it.index2()))
+                          / std::max( std::fabs(cpu_matrix(col_it.index1(), col_it.index2())),
+                                      std::fabs(from_gpu(col_it.index1(), col_it.index2()))   );
+      if (current_error > error)
+        error = current_error;
+    }
+  }
+
+  return error;
+}
+
+
+template <typename NumericT, typename VCL_MatrixT, typename Epsilon, typename UblasVectorT, typename VCLVectorT>
+int strided_matrix_vector_product_test(Epsilon epsilon,
+                                        UblasVectorT & result, UblasVectorT const & rhs,
+                                        VCLVectorT & vcl_result, VCLVectorT & vcl_rhs)
+{
+    int retval = EXIT_SUCCESS;
+
+    ublas::compressed_matrix<NumericT> ublas_matrix2(5, 4);
+    ublas_matrix2(0, 0) = NumericT(2.0); ublas_matrix2(0, 2) = NumericT(-1.0);
+    ublas_matrix2(1, 0) = NumericT(3.0); ublas_matrix2(1, 2) = NumericT(-5.0);
+    ublas_matrix2(2, 1) = NumericT(5.0); ublas_matrix2(2, 2) = NumericT(-2.0);
+    ublas_matrix2(3, 2) = NumericT(1.0); ublas_matrix2(3, 3) = NumericT(-6.0);
+    ublas_matrix2(4, 1) = NumericT(7.0); ublas_matrix2(4, 2) = NumericT(-5.0);
+    project(result, ublas::slice(1, 3, 5))     = ublas::prod(ublas_matrix2, project(rhs, ublas::slice(3, 2, 4)));
+
+    VCL_MatrixT vcl_sparse_matrix2;
+    viennacl::copy(ublas_matrix2, vcl_sparse_matrix2);
+    viennacl::vector<NumericT> vec(4);
+    vec(0) = rhs(3);
+    vec(1) = rhs(5);
+    vec(2) = rhs(7);
+    vec(3) = rhs(9);
+    viennacl::project(vcl_result, viennacl::slice(1, 3, 5)) = viennacl::linalg::prod(vcl_sparse_matrix2, viennacl::project(vcl_rhs, viennacl::slice(3, 2, 4)));
+
+    if( std::fabs(diff(result, vcl_result)) > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-vector product with stided vectors, part 1" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+      retval = EXIT_FAILURE;
+    }
+    vcl_result(1)  = NumericT(1.0);
+    vcl_result(4)  = NumericT(1.0);
+    vcl_result(7)  = NumericT(1.0);
+    vcl_result(10) = NumericT(1.0);
+    vcl_result(13) = NumericT(1.0);
+
+    viennacl::project(vcl_result, viennacl::slice(1, 3, 5)) = viennacl::linalg::prod(vcl_sparse_matrix2, vec);
+
+    if( std::fabs(diff(result, vcl_result)) > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-vector product with strided vectors, part 2" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+      retval = EXIT_FAILURE;
+    }
+
+    return retval;
+}
+
+
+template< typename NumericT, typename VCL_MATRIX, typename Epsilon >
+int resize_test(Epsilon const& epsilon)
+{
+   int retval = EXIT_SUCCESS;
+
+   ublas::compressed_matrix<NumericT> ublas_matrix(5,5);
+   VCL_MATRIX vcl_matrix;
+
+   ublas_matrix(0,0) = NumericT(10.0); ublas_matrix(0, 1) = NumericT(0.1); ublas_matrix(0, 2) = NumericT(0.2); ublas_matrix(0, 3) = NumericT(0.3); ublas_matrix(0, 4) = NumericT(0.4);
+   ublas_matrix(1,0) = NumericT(1.0);  ublas_matrix(1, 1) = NumericT(1.1); ublas_matrix(1, 2) = NumericT(1.2); ublas_matrix(1, 3) = NumericT(1.3); ublas_matrix(1, 4) = NumericT(1.4);
+   ublas_matrix(2,0) = NumericT(2.0);  ublas_matrix(2, 1) = NumericT(2.1); ublas_matrix(2, 2) = NumericT(2.2); ublas_matrix(2, 3) = NumericT(2.3); ublas_matrix(2, 4) = NumericT(2.4);
+   ublas_matrix(3,0) = NumericT(3.0);  ublas_matrix(3, 1) = NumericT(3.1); ublas_matrix(3, 2) = NumericT(3.2); ublas_matrix(3, 3) = NumericT(3.3); ublas_matrix(3, 4) = NumericT(3.4);
+   ublas_matrix(4,0) = NumericT(4.0);  ublas_matrix(4, 1) = NumericT(4.1); ublas_matrix(4, 2) = NumericT(4.2); ublas_matrix(4, 3) = NumericT(4.3); ublas_matrix(4, 4) = NumericT(4.4);
+
+   viennacl::copy(ublas_matrix, vcl_matrix);
+   ublas::compressed_matrix<NumericT> other_matrix(ublas_matrix.size1(), ublas_matrix.size2());
+   viennacl::copy(vcl_matrix, other_matrix);
+
+   std::cout << "Checking for equality after copy..." << std::endl;
+    if( std::fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
+    {
+        std::cout << "# Error at operation: equality after copy with sparse matrix" << std::endl;
+        std::cout << "  diff: " << std::fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
+        return EXIT_FAILURE;
+    }
+
+   std::cout << "Testing resize to larger..." << std::endl;
+   ublas_matrix.resize(10, 10, false); //ublas does not allow preserve = true here
+   ublas_matrix(0,0) = NumericT(10.0); ublas_matrix(0, 1) = NumericT(0.1); ublas_matrix(0, 2) = NumericT(0.2); ublas_matrix(0, 3) = NumericT(0.3); ublas_matrix(0, 4) = NumericT(0.4);
+   ublas_matrix(1,0) = NumericT( 1.0); ublas_matrix(1, 1) = NumericT(1.1); ublas_matrix(1, 2) = NumericT(1.2); ublas_matrix(1, 3) = NumericT(1.3); ublas_matrix(1, 4) = NumericT(1.4);
+   ublas_matrix(2,0) = NumericT( 2.0); ublas_matrix(2, 1) = NumericT(2.1); ublas_matrix(2, 2) = NumericT(2.2); ublas_matrix(2, 3) = NumericT(2.3); ublas_matrix(2, 4) = NumericT(2.4);
+   ublas_matrix(3,0) = NumericT( 3.0); ublas_matrix(3, 1) = NumericT(3.1); ublas_matrix(3, 2) = NumericT(3.2); ublas_matrix(3, 3) = NumericT(3.3); ublas_matrix(3, 4) = NumericT(3.4);
+   ublas_matrix(4,0) = NumericT( 4.0); ublas_matrix(4, 1) = NumericT(4.1); ublas_matrix(4, 2) = NumericT(4.2); ublas_matrix(4, 3) = NumericT(4.3); ublas_matrix(4, 4) = NumericT(4.4);
+   //std::cout << ublas_matrix << std::endl;
+
+   vcl_matrix.resize(10, 10, true);
+
+    if( std::fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
+    {
+        std::cout << "# Error at operation: resize (to larger) with sparse matrix" << std::endl;
+        std::cout << "  diff: " << std::fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
+        return EXIT_FAILURE;
+    }
+
+   ublas_matrix(5,5) = NumericT(5.5); ublas_matrix(5, 6) = NumericT(5.6); ublas_matrix(5, 7) = NumericT(5.7); ublas_matrix(5, 8) = NumericT(5.8); ublas_matrix(5, 9) = NumericT(5.9);
+   ublas_matrix(6,5) = NumericT(6.5); ublas_matrix(6, 6) = NumericT(6.6); ublas_matrix(6, 7) = NumericT(6.7); ublas_matrix(6, 8) = NumericT(6.8); ublas_matrix(6, 9) = NumericT(6.9);
+   ublas_matrix(7,5) = NumericT(7.5); ublas_matrix(7, 6) = NumericT(7.6); ublas_matrix(7, 7) = NumericT(7.7); ublas_matrix(7, 8) = NumericT(7.8); ublas_matrix(7, 9) = NumericT(7.9);
+   ublas_matrix(8,5) = NumericT(8.5); ublas_matrix(8, 6) = NumericT(8.6); ublas_matrix(8, 7) = NumericT(8.7); ublas_matrix(8, 8) = NumericT(8.8); ublas_matrix(8, 9) = NumericT(8.9);
+   ublas_matrix(9,5) = NumericT(9.5); ublas_matrix(9, 6) = NumericT(9.6); ublas_matrix(9, 7) = NumericT(9.7); ublas_matrix(9, 8) = NumericT(9.8); ublas_matrix(9, 9) = NumericT(9.9);
+   viennacl::copy(ublas_matrix, vcl_matrix);
+
+   std::cout << "Testing resize to smaller..." << std::endl;
+   ublas_matrix.resize(7, 7, false); //ublas does not allow preserve = true here
+   ublas_matrix(0,0) = NumericT(10.0); ublas_matrix(0, 1) = NumericT(0.1); ublas_matrix(0, 2) = NumericT(0.2); ublas_matrix(0, 3) = NumericT(0.3); ublas_matrix(0, 4) = NumericT(0.4);
+   ublas_matrix(1,0) = NumericT( 1.0); ublas_matrix(1, 1) = NumericT(1.1); ublas_matrix(1, 2) = NumericT(1.2); ublas_matrix(1, 3) = NumericT(1.3); ublas_matrix(1, 4) = NumericT(1.4);
+   ublas_matrix(2,0) = NumericT( 2.0); ublas_matrix(2, 1) = NumericT(2.1); ublas_matrix(2, 2) = NumericT(2.2); ublas_matrix(2, 3) = NumericT(2.3); ublas_matrix(2, 4) = NumericT(2.4);
+   ublas_matrix(3,0) = NumericT( 3.0); ublas_matrix(3, 1) = NumericT(3.1); ublas_matrix(3, 2) = NumericT(3.2); ublas_matrix(3, 3) = NumericT(3.3); ublas_matrix(3, 4) = NumericT(3.4);
+   ublas_matrix(4,0) = NumericT( 4.0); ublas_matrix(4, 1) = NumericT(4.1); ublas_matrix(4, 2) = NumericT(4.2); ublas_matrix(4, 3) = NumericT(4.3); ublas_matrix(4, 4) = NumericT(4.4);
+   ublas_matrix(5,5) = NumericT( 5.5); ublas_matrix(5, 6) = NumericT(5.6); ublas_matrix(5, 7) = NumericT(5.7); ublas_matrix(5, 8) = NumericT(5.8); ublas_matrix(5, 9) = NumericT(5.9);
+   ublas_matrix(6,5) = NumericT( 6.5); ublas_matrix(6, 6) = NumericT(6.6); ublas_matrix(6, 7) = NumericT(6.7); ublas_matrix(6, 8) = NumericT(6.8); ublas_matrix(6, 9) = NumericT(6.9);
+
+   vcl_matrix.resize(7, 7);
+
+   //std::cout << ublas_matrix << std::endl;
+    if( std::fabs(diff(ublas_matrix, vcl_matrix)) > epsilon )
+    {
+        std::cout << "# Error at operation: resize (to smaller) with sparse matrix" << std::endl;
+        std::cout << "  diff: " << std::fabs(diff(ublas_matrix, vcl_matrix)) << std::endl;
+        retval = EXIT_FAILURE;
+    }
+
+   ublas::vector<NumericT> ublas_vec = ublas::scalar_vector<NumericT>(ublas_matrix.size1(), NumericT(3.1415));
+   viennacl::vector<NumericT> vcl_vec(ublas_matrix.size1());
+
+
+  std::cout << "Testing transposed unit lower triangular solve: compressed_matrix" << std::endl;
+  viennacl::copy(ublas_vec, vcl_vec);
+  std::cout << "matrix: " << ublas_matrix << std::endl;
+  std::cout << "vector: " << ublas_vec << std::endl;
+  std::cout << "ViennaCL matrix size: " << vcl_matrix.size1() << " x " << vcl_matrix.size2() << std::endl;
+
+  std::cout << "ublas..." << std::endl;
+  boost::numeric::ublas::inplace_solve((ublas_matrix), ublas_vec, boost::numeric::ublas::unit_lower_tag());
+  std::cout << "ViennaCL..." << std::endl;
+  viennacl::linalg::inplace_solve((vcl_matrix), vcl_vec, viennacl::linalg::unit_lower_tag());
+
+  /*
+  std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+  std::list< std::size_t > multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+
+  viennacl::linalg::detail::multifrontal_setup_L(vcl_matrix,
+                                                  multifrontal_U_diagonal_, //dummy
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+  viennacl::linalg::detail::multifrontal_substitute(vcl_vec,
+                                                    multifrontal_L_row_index_arrays_,
+                                                    multifrontal_L_row_buffers_,
+                                                    multifrontal_L_col_buffers_,
+                                                    multifrontal_L_element_buffers_,
+                                                    multifrontal_L_row_elimination_num_list_);
+
+
+  std::cout << "ublas..." << std::endl;
+  boost::numeric::ublas::inplace_solve((ublas_matrix), ublas_vec, boost::numeric::ublas::upper_tag());
+  std::cout << "ViennaCL..." << std::endl;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+  std::list< std::size_t > multifrontal_U_row_elimination_num_list_;
+
+  multifrontal_U_diagonal_.resize(vcl_matrix.size1(), false);
+  viennacl::linalg::single_threaded::detail::row_info(vcl_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+  viennacl::linalg::detail::multifrontal_setup_U(vcl_matrix,
+                                                 multifrontal_U_diagonal_,
+                                                 multifrontal_U_row_index_arrays_,
+                                                 multifrontal_U_row_buffers_,
+                                                 multifrontal_U_col_buffers_,
+                                                 multifrontal_U_element_buffers_,
+                                                 multifrontal_U_row_elimination_num_list_);
+
+  vcl_vec = viennacl::linalg::element_div(vcl_vec, multifrontal_U_diagonal_);
+  viennacl::linalg::detail::multifrontal_substitute(vcl_vec,
+                                                    multifrontal_U_row_index_arrays_,
+                                                    multifrontal_U_row_buffers_,
+                                                    multifrontal_U_col_buffers_,
+                                                    multifrontal_U_element_buffers_,
+                                                    multifrontal_U_row_elimination_num_list_);
+  */
+  for (std::size_t i=0; i<ublas_vec.size(); ++i)
+  {
+    std::cout << ublas_vec[i] << " vs. " << vcl_vec[i] << std::endl;
+  }
+
+  /*std::cout << "Testing transposed unit upper triangular solve: compressed_matrix" << std::endl;
+  viennacl::copy(ublas_vec, vcl_vec);
+  std::cout << "matrix: " << ublas_matrix << std::endl;
+  std::cout << "vector: " << ublas_vec << std::endl;
+  std::cout << "ViennaCL matrix size: " << vcl_matrix.size1() << " x " << vcl_matrix.size2() << std::endl;
+
+  std::cout << "ublas..." << std::endl;
+  boost::numeric::ublas::inplace_solve((ublas_matrix), ublas_vec, boost::numeric::ublas::lower_tag());
+  std::cout << "ViennaCL..." << std::endl;
+  viennacl::linalg::inplace_solve((vcl_matrix), vcl_vec, viennacl::linalg::lower_tag());
+
+  for (std::size_t i=0; i<ublas_vec.size(); ++i)
+  {
+    std::cout << ublas_vec[i] << " vs. " << vcl_vec[i] << std::endl;
+  }*/
+
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  std::cout << "Testing resizing of compressed_matrix..." << std::endl;
+  int retval = resize_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+  std::cout << "Testing resizing of coordinate_matrix..." << std::endl;
+  //if (retval != EXIT_FAILURE)
+  //  retval = resize_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon);
+  //else
+  //  return retval;
+
+  // --------------------------------------------------------------------------
+  ublas::vector<NumericT> rhs;
+  ublas::vector<NumericT> result;
+  ublas::compressed_matrix<NumericT> ublas_matrix;
+
+  if (viennacl::io::read_matrix_market_file(ublas_matrix, "../../examples/testdata/mat65k.mtx") == EXIT_FAILURE)
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+  //unsigned int cg_mat_size = cg_mat.size();
+  std::cout << "done reading matrix" << std::endl;
+
+
+  rhs.resize(ublas_matrix.size2());
+  for (std::size_t i=0; i<rhs.size(); ++i)
+  {
+    ublas_matrix(i,i) = NumericT(0.5);   // Get rid of round-off errors by making row-sums unequal to zero:
+    rhs[i] = NumericT(1) + random<NumericT>();
+  }
+
+  // add some random numbers to the double-compressed matrix:
+  ublas::compressed_matrix<NumericT> ublas_cc_matrix(ublas_matrix.size1(), ublas_matrix.size2());
+  ublas_cc_matrix(42,199) = NumericT(3.1415);
+  ublas_cc_matrix(31, 69) = NumericT(2.71);
+  ublas_cc_matrix(23, 32) = NumericT(6);
+  ublas_cc_matrix(177,57) = NumericT(4);
+  ublas_cc_matrix(21, 97) = NumericT(-4);
+  ublas_cc_matrix(92, 25) = NumericT(2);
+  ublas_cc_matrix(89, 62) = NumericT(11);
+  ublas_cc_matrix(1,   7) = NumericT(8);
+  ublas_cc_matrix(85, 41) = NumericT(13);
+  ublas_cc_matrix(66, 28) = NumericT(8);
+  ublas_cc_matrix(21, 74) = NumericT(-2);
+
+
+  result = rhs;
+
+
+  viennacl::vector<NumericT> vcl_rhs(rhs.size());
+  viennacl::vector<NumericT> vcl_result(result.size());
+  viennacl::vector<NumericT> vcl_result2(result.size());
+  viennacl::compressed_matrix<NumericT> vcl_compressed_matrix(rhs.size(), rhs.size());
+  viennacl::compressed_compressed_matrix<NumericT> vcl_compressed_compressed_matrix(rhs.size(), rhs.size());
+  viennacl::coordinate_matrix<NumericT> vcl_coordinate_matrix(rhs.size(), rhs.size());
+  viennacl::ell_matrix<NumericT> vcl_ell_matrix;
+  viennacl::hyb_matrix<NumericT> vcl_hyb_matrix;
+
+  viennacl::copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+  viennacl::copy(ublas_cc_matrix, vcl_compressed_compressed_matrix);
+  viennacl::copy(ublas_matrix, vcl_coordinate_matrix);
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing products: ublas" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+
+  std::cout << "Testing products: compressed_matrix" << std::endl;
+  vcl_result = viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: compressed_matrix, strided vectors" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::compressed_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+  //
+  // Triangular solvers for A \ b:
+  //
+  ublas::compressed_matrix<NumericT> ublas_matrix_trans(ublas_matrix.size2(), ublas_matrix.size1(), ublas_matrix.nnz()); // = trans(ublas_matrix); //note: triangular solvers with uBLAS show atrocious performance, while transposed solvers are quite okay. To keep execution times short, we use a double-transpose-trick in the following.
+
+  // fast transpose:
+  for (typename ublas::compressed_matrix<NumericT>::iterator1 row_it  = ublas_matrix.begin1();
+                                                              row_it != ublas_matrix.end1();
+                                                            ++row_it)
+  {
+    for (typename ublas::compressed_matrix<NumericT>::iterator2 col_it  = row_it.begin();
+                                                                col_it != row_it.end();
+                                                              ++col_it)
+    {
+      ublas_matrix_trans(col_it.index1(), col_it.index2()) = *col_it;
+    }
+  }
+
+
+  std::cout << "Testing unit upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::unit_upper_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::unit_upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::upper_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing unit lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::unit_lower_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::unit_lower_tag());
+
+  /*std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+  std::list< std::size_t > multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+
+  viennacl::switch_memory_domain(multifrontal_U_diagonal_, viennacl::MAIN_MEMORY);
+  multifrontal_U_diagonal_.resize(vcl_compressed_matrix.size1(), false);
+  viennacl::linalg::single_threaded::detail::row_info(vcl_compressed_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+  viennacl::linalg::detail::multifrontal_setup_L(vcl_compressed_matrix,
+                                                  multifrontal_U_diagonal_, //dummy
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+  viennacl::linalg::detail::multifrontal_substitute(vcl_result,
+                                                    multifrontal_L_row_index_arrays_,
+                                                    multifrontal_L_row_buffers_,
+                                                    multifrontal_L_col_buffers_,
+                                                    multifrontal_L_element_buffers_,
+                                                    multifrontal_L_row_elimination_num_list_);*/
+
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix_trans), result, boost::numeric::ublas::lower_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::lower_tag());
+
+  /*std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+  std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+  std::list< std::size_t > multifrontal_U_row_elimination_num_list_;
+
+  multifrontal_U_diagonal_.resize(vcl_compressed_matrix.size1(), false);
+  viennacl::linalg::single_threaded::detail::row_info(vcl_compressed_matrix, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+  viennacl::linalg::detail::multifrontal_setup_U(vcl_compressed_matrix,
+                                                 multifrontal_U_diagonal_,
+                                                 multifrontal_U_row_index_arrays_,
+                                                 multifrontal_U_row_buffers_,
+                                                 multifrontal_U_col_buffers_,
+                                                 multifrontal_U_element_buffers_,
+                                                 multifrontal_U_row_elimination_num_list_);
+
+  vcl_result = viennacl::linalg::element_div(vcl_result, multifrontal_U_diagonal_);
+  viennacl::linalg::detail::multifrontal_substitute(vcl_result,
+                                                    multifrontal_U_row_index_arrays_,
+                                                    multifrontal_U_row_buffers_,
+                                                    multifrontal_U_col_buffers_,
+                                                    multifrontal_U_element_buffers_,
+                                                    multifrontal_U_row_elimination_num_list_);*/
+
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+/*
+  std::cout << "Testing lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(ublas_matrix, result, boost::numeric::ublas::lower_tag());
+  viennacl::linalg::inplace_solve(vcl_compressed_matrix, vcl_result, viennacl::linalg::lower_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }*/
+
+  //
+  // Triangular solvers for A^T \ b
+  //
+
+  std::cout << "Testing transposed unit upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::unit_upper_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::unit_upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing transposed upper triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::upper_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::upper_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: upper triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing transposed unit lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::unit_lower_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::unit_lower_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: unit lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing transposed lower triangular solve: compressed_matrix" << std::endl;
+  result = rhs;
+  viennacl::copy(result, vcl_result);
+  boost::numeric::ublas::inplace_solve(trans(ublas_matrix), result, boost::numeric::ublas::lower_tag());
+  viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix), vcl_result, viennacl::linalg::lower_tag());
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: lower triangular solve with compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing products: compressed_compressed_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_cc_matrix, rhs);
+  vcl_result = viennacl::linalg::prod(vcl_compressed_compressed_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  {
+    ublas::compressed_matrix<NumericT> temp(vcl_compressed_compressed_matrix.size1(), vcl_compressed_compressed_matrix.size2());
+    viennacl::copy(vcl_compressed_compressed_matrix, temp);
+
+    // check that entries are correct by computing the product again:
+    result     = viennacl::linalg::prod(temp, rhs);
+
+    if( std::fabs(diff(result, vcl_result)) > epsilon )
+    {
+      std::cout << "# Error at operation: matrix-vector product with compressed_compressed_matrix (after copy back)" << std::endl;
+      std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+      retval = EXIT_FAILURE;
+    }
+
+  }
+
+
+
+
+  std::cout << "Testing products: coordinate_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  vcl_result = viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs);
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with coordinate_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: coordinate_matrix, strided vectors" << std::endl;
+  //std::cout << " --> SKIPPING <--" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::coordinate_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+
+  //std::cout << "Copying ell_matrix" << std::endl;
+  viennacl::copy(ublas_matrix, vcl_ell_matrix);
+  ublas_matrix.clear();
+  viennacl::copy(vcl_ell_matrix, ublas_matrix);// just to check that it works
+
+
+  std::cout << "Testing products: ell_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  vcl_result.clear();
+  vcl_result = viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs);
+  //viennacl::linalg::prod_impl(vcl_ell_matrix, vcl_rhs, vcl_result);
+  //std::cout << vcl_result << "\n";
+  //std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+  //std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with ell_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: ell_matrix, strided vectors" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::ell_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+
+  //std::cout << "Copying hyb_matrix" << std::endl;
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+  ublas_matrix.clear();
+  viennacl::copy(vcl_hyb_matrix, ublas_matrix);// just to check that it works
+  viennacl::copy(ublas_matrix, vcl_hyb_matrix);
+
+  std::cout << "Testing products: hyb_matrix" << std::endl;
+  result     = viennacl::linalg::prod(ublas_matrix, rhs);
+  vcl_result.clear();
+  vcl_result = viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs);
+  //viennacl::linalg::prod_impl(vcl_hyb_matrix, vcl_rhs, vcl_result);
+  //std::cout << vcl_result << "\n";
+  //std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+  //std::cout << "First entry of result vector: " << vcl_result[0] << std::endl;
+
+  if( std::fabs(diff(result, vcl_result)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product with hyb_matrix" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  std::cout << "Testing products: hyb_matrix, strided vectors" << std::endl;
+  retval = strided_matrix_vector_product_test<NumericT, viennacl::hyb_matrix<NumericT> >(epsilon, result, rhs, vcl_result, vcl_rhs);
+  if (retval != EXIT_SUCCESS)
+    return retval;
+
+
+  // --------------------------------------------------------------------------
+  // --------------------------------------------------------------------------
+  NumericT alpha = static_cast<NumericT>(2.786);
+  NumericT beta = static_cast<NumericT>(1.432);
+  copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
+  copy(result.begin(), result.end(), vcl_result.begin());
+  copy(result.begin(), result.end(), vcl_result2.begin());
+
+  std::cout << "Testing scaled additions of products and vectors" << std::endl;
+  result     = alpha * viennacl::linalg::prod(ublas_matrix, rhs) + beta * result;
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_compressed_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (compressed_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  vcl_result2.clear();
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_coordinate_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (coordinate_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  vcl_result2.clear();
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_ell_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (ell_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+  vcl_result2.clear();
+  vcl_result2 = alpha * viennacl::linalg::prod(vcl_hyb_matrix, vcl_rhs) + beta * vcl_result;
+
+  if( std::fabs(diff(result, vcl_result2)) > epsilon )
+  {
+    std::cout << "# Error at operation: matrix-vector product (hyb_matrix) with scaled additions" << std::endl;
+    std::cout << "  diff: " << std::fabs(diff(result, vcl_result2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Sparse Matrices" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1E-4);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    retval = test<NumericT>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-12;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+#ifdef VIENNACL_WITH_OPENCL
+  else
+    std::cout << "No double precision support, skipping test..." << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/spmdm.cpp b/tests/src/spmdm.cpp
new file mode 100644
index 0000000..c03235b
--- /dev/null
+++ b/tests/src/spmdm.cpp
@@ -0,0 +1,339 @@
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <cmath>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+//#define VIENNACL_WITH_OPENCL 1
+//#define VIENNACL_WITH_CUDA 1
+//#define VIENNACL_DEBUG_KERNEL 1
+//#define VIENNACL_BUILD_INFO 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"       //generic matrix-vector product
+#include "viennacl/linalg/norm_2.hpp"     //generic l2-norm for vectors
+#include "viennacl/io/matrix_market.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+
+using namespace boost::numeric;
+
+template < typename ScalarType >
+int check_matrices(const ublas::matrix< ScalarType >& ref_mat, const ublas::matrix< ScalarType >& mat, ScalarType eps) {
+
+  std::size_t size1, size2;
+  size1 = ref_mat.size1(); size2 = ref_mat.size2();
+  if( (size1 != mat.size1()) || (size2 != mat.size2()) )
+    return EXIT_FAILURE;
+
+  for (unsigned int i = 0; i < size1; i++)
+    for (unsigned int j = 0; j < size2; j++)
+    {
+      ScalarType rel_error = std::abs(ref_mat(i,j) - mat(i,j)) / std::max(std::abs(ref_mat(i,j)), std::abs(mat(i,j)));
+      if ( rel_error > eps ) {
+        std::cout << "ERROR: Verification failed at (" << i <<", "<< j << "): "
+                  << " Expected: " << ref_mat(i,j) << ", got: " << mat(i,j) << " (relative error: " << rel_error << ")" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+  std::cout << "Everything went well!" << std::endl;
+  return EXIT_SUCCESS;
+}
+
+template <typename NumericT, typename ResultLayoutT, typename FactorLayoutT>
+int test(NumericT epsilon)
+{
+  int retVal = EXIT_SUCCESS;
+
+  ublas::compressed_matrix<NumericT>    ublas_lhs;
+
+  if (viennacl::io::read_matrix_market_file(ublas_lhs, "../../examples/testdata/mat65k.mtx") == EXIT_FAILURE)
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  // add some extra weight to diagonal in order to avoid issues with round-off errors
+  for (std::size_t i=0; i<ublas_lhs.size1(); ++i)
+    ublas_lhs(i,i) *= NumericT(1.5);
+
+  std::size_t cols_rhs = 1;
+
+  viennacl::compressed_matrix<NumericT> compressed_lhs;
+  viennacl::ell_matrix<NumericT>        ell_lhs;
+  viennacl::coordinate_matrix<NumericT> coo_lhs;
+  viennacl::hyb_matrix<NumericT>     hyb_lhs;
+
+  ublas::matrix<NumericT> ublas_result;
+  viennacl::matrix<NumericT, ResultLayoutT> result;
+
+  viennacl::copy( ublas_lhs, compressed_lhs);
+  viennacl::copy( ublas_lhs, ell_lhs);
+  viennacl::copy( ublas_lhs, coo_lhs);
+  viennacl::copy( ublas_lhs, hyb_lhs);
+
+  ublas::matrix<NumericT> ublas_rhs1(ublas_lhs.size2(), cols_rhs);
+  viennacl::matrix<NumericT, FactorLayoutT> rhs1(ublas_lhs.size2(), cols_rhs);
+
+  ublas::matrix<NumericT> ublas_rhs2;
+  viennacl::matrix<NumericT, FactorLayoutT> rhs2;
+
+  ublas::matrix<NumericT> temp(ublas_rhs1.size1(), cols_rhs);
+
+  for (unsigned int i = 0; i < ublas_rhs1.size1(); i++)
+    for (unsigned int j = 0; j < ublas_rhs1.size2(); j++)
+      ublas_rhs1(i,j) = NumericT(0.5) + NumericT(0.1) * random<NumericT>();
+  viennacl::copy( ublas_rhs1, rhs1);
+
+  ublas_rhs2 = ublas::trans( ublas_rhs1);
+  viennacl::copy( ublas_rhs2, rhs2);
+
+  /* gold result */
+  ublas_result = ublas::prod( ublas_lhs, ublas_rhs1);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(CSR) lhs * dense rhs" << std::endl;
+  result = viennacl::linalg::prod( compressed_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  retVal = check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(ELL) lhs * dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( ell_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  std::cout << "Testing compressed(COO) lhs * dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( coo_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  std::cout << "Testing compressed(HYB) lhs * dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( hyb_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  /* gold result */
+  ublas_result = ublas::prod( ublas_lhs, ublas::trans(ublas_rhs2));
+
+  /******************************************************************/
+  std::cout << std::endl << "Testing compressed(CSR) lhs * transposed dense rhs:" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( compressed_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  retVal = check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(ELL) lhs * transposed dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( ell_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(COO) lhs * transposed dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( coo_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  std::cout << "Testing compressed(HYB) lhs * transposed dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( hyb_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  if(retVal == EXIT_SUCCESS) {
+    std::cout << "Tests passed successfully" << std::endl;
+  }
+
+  return retVal;
+}
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Sparse-Dense Matrix Multiplication" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1E-4);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  row-major, row-major" << std::endl;
+    retval = test<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  row-major, column-major" << std::endl;
+    retval = test<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  column-major, row-major" << std::endl;
+    retval = test<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  column-major, column-major" << std::endl;
+    retval = test<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-12;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  row-major, row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  row-major, column-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  column-major, row-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  column-major, column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+#ifdef VIENNACL_WITH_OPENCL
+  else
+    std::cout << "No double precision support, skipping test..." << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
+
diff --git a/tests/src/spmdm.cu b/tests/src/spmdm.cu
new file mode 100644
index 0000000..c03235b
--- /dev/null
+++ b/tests/src/spmdm.cu
@@ -0,0 +1,339 @@
+//
+// include necessary system headers
+//
+#include <iostream>
+#include <cmath>
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_WITH_UBLAS 1
+//#define VIENNACL_WITH_OPENCL 1
+//#define VIENNACL_WITH_CUDA 1
+//#define VIENNACL_DEBUG_KERNEL 1
+//#define VIENNACL_BUILD_INFO 1
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"       //generic matrix-vector product
+#include "viennacl/linalg/norm_2.hpp"     //generic l2-norm for vectors
+#include "viennacl/io/matrix_market.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+
+
+using namespace boost::numeric;
+
+template < typename ScalarType >
+int check_matrices(const ublas::matrix< ScalarType >& ref_mat, const ublas::matrix< ScalarType >& mat, ScalarType eps) {
+
+  std::size_t size1, size2;
+  size1 = ref_mat.size1(); size2 = ref_mat.size2();
+  if( (size1 != mat.size1()) || (size2 != mat.size2()) )
+    return EXIT_FAILURE;
+
+  for (unsigned int i = 0; i < size1; i++)
+    for (unsigned int j = 0; j < size2; j++)
+    {
+      ScalarType rel_error = std::abs(ref_mat(i,j) - mat(i,j)) / std::max(std::abs(ref_mat(i,j)), std::abs(mat(i,j)));
+      if ( rel_error > eps ) {
+        std::cout << "ERROR: Verification failed at (" << i <<", "<< j << "): "
+                  << " Expected: " << ref_mat(i,j) << ", got: " << mat(i,j) << " (relative error: " << rel_error << ")" << std::endl;
+        return EXIT_FAILURE;
+      }
+    }
+
+  std::cout << "Everything went well!" << std::endl;
+  return EXIT_SUCCESS;
+}
+
+template <typename NumericT, typename ResultLayoutT, typename FactorLayoutT>
+int test(NumericT epsilon)
+{
+  int retVal = EXIT_SUCCESS;
+
+  ublas::compressed_matrix<NumericT>    ublas_lhs;
+
+  if (viennacl::io::read_matrix_market_file(ublas_lhs, "../../examples/testdata/mat65k.mtx") == EXIT_FAILURE)
+  {
+    std::cout << "Error reading Matrix file" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  // add some extra weight to diagonal in order to avoid issues with round-off errors
+  for (std::size_t i=0; i<ublas_lhs.size1(); ++i)
+    ublas_lhs(i,i) *= NumericT(1.5);
+
+  std::size_t cols_rhs = 1;
+
+  viennacl::compressed_matrix<NumericT> compressed_lhs;
+  viennacl::ell_matrix<NumericT>        ell_lhs;
+  viennacl::coordinate_matrix<NumericT> coo_lhs;
+  viennacl::hyb_matrix<NumericT>     hyb_lhs;
+
+  ublas::matrix<NumericT> ublas_result;
+  viennacl::matrix<NumericT, ResultLayoutT> result;
+
+  viennacl::copy( ublas_lhs, compressed_lhs);
+  viennacl::copy( ublas_lhs, ell_lhs);
+  viennacl::copy( ublas_lhs, coo_lhs);
+  viennacl::copy( ublas_lhs, hyb_lhs);
+
+  ublas::matrix<NumericT> ublas_rhs1(ublas_lhs.size2(), cols_rhs);
+  viennacl::matrix<NumericT, FactorLayoutT> rhs1(ublas_lhs.size2(), cols_rhs);
+
+  ublas::matrix<NumericT> ublas_rhs2;
+  viennacl::matrix<NumericT, FactorLayoutT> rhs2;
+
+  ublas::matrix<NumericT> temp(ublas_rhs1.size1(), cols_rhs);
+
+  for (unsigned int i = 0; i < ublas_rhs1.size1(); i++)
+    for (unsigned int j = 0; j < ublas_rhs1.size2(); j++)
+      ublas_rhs1(i,j) = NumericT(0.5) + NumericT(0.1) * random<NumericT>();
+  viennacl::copy( ublas_rhs1, rhs1);
+
+  ublas_rhs2 = ublas::trans( ublas_rhs1);
+  viennacl::copy( ublas_rhs2, rhs2);
+
+  /* gold result */
+  ublas_result = ublas::prod( ublas_lhs, ublas_rhs1);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(CSR) lhs * dense rhs" << std::endl;
+  result = viennacl::linalg::prod( compressed_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  retVal = check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(ELL) lhs * dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( ell_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  std::cout << "Testing compressed(COO) lhs * dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( coo_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  std::cout << "Testing compressed(HYB) lhs * dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( hyb_lhs, rhs1);
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  /* gold result */
+  ublas_result = ublas::prod( ublas_lhs, ublas::trans(ublas_rhs2));
+
+  /******************************************************************/
+  std::cout << std::endl << "Testing compressed(CSR) lhs * transposed dense rhs:" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( compressed_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  retVal = check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(ELL) lhs * transposed dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( ell_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  std::cout << "Testing compressed(COO) lhs * transposed dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( coo_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+
+  std::cout << "Testing compressed(HYB) lhs * transposed dense rhs" << std::endl;
+  result.clear();
+  result = viennacl::linalg::prod( hyb_lhs, viennacl::trans(rhs2));
+
+  temp.clear();
+  viennacl::copy( result, temp);
+  check_matrices(ublas_result, temp, epsilon);
+
+  /******************************************************************/
+  if(retVal == EXIT_SUCCESS) {
+    std::cout << "Tests passed successfully" << std::endl;
+  }
+
+  return retVal;
+}
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Sparse-Dense Matrix Multiplication" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1E-4);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  row-major, row-major" << std::endl;
+    retval = test<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  row-major, column-major" << std::endl;
+    retval = test<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  column-major, row-major" << std::endl;
+    retval = test<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    std::cout << "  layout:  column-major, column-major" << std::endl;
+    retval = test<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
+    if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+    else
+        return retval;
+
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+#ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+#endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-12;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  row-major, row-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  row-major, column-major" << std::endl;
+      retval = test<NumericT, viennacl::row_major, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  column-major, row-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major, viennacl::row_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      std::cout << "  layout:  column-major, column-major" << std::endl;
+      retval = test<NumericT, viennacl::column_major, viennacl::column_major>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+#ifdef VIENNACL_WITH_OPENCL
+  else
+    std::cout << "No double precision support, skipping test..." << std::endl;
+#endif
+
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
+
diff --git a/tests/src/structured-matrices.cpp b/tests/src/structured-matrices.cpp
index 9227903..c94c4ea 100644
--- a/tests/src/structured-matrices.cpp
+++ b/tests/src/structured-matrices.cpp
@@ -1,14 +1,15 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -22,6 +23,8 @@
 
 //#define VIENNACL_BUILD_INFO
 
+//#define VIENNACL_DEBUG_ALL
+
 #include "viennacl/toeplitz_matrix.hpp"
 #include "viennacl/circulant_matrix.hpp"
 #include "viennacl/vandermonde_matrix.hpp"
@@ -37,18 +40,20 @@ template <typename T>
 class dense_matrix
 {
   public:
+    typedef std::size_t   size_type;
+
     dense_matrix(std::size_t rows, std::size_t cols) : elements_(rows * cols), rows_(rows), cols_(cols) {}
-    
+
     T & operator()(std::size_t i, std::size_t j) { return elements_[i*cols_ + j]; }
     T const & operator()(std::size_t i, std::size_t j) const { return elements_[i*cols_ + j]; }
-    
+
     std::size_t size1() const { return rows_; }
     std::size_t size2() const { return cols_; }
 
     dense_matrix & operator+=(dense_matrix const & other)
     {
-      for(std::size_t i = 0; i < other.size1(); i++) 
-        for(std::size_t j = 0; j < other.size2(); j++) 
+      for(std::size_t i = 0; i < other.size1(); i++)
+        for(std::size_t j = 0; j < other.size2(); j++)
           elements_[i*cols_ + j] = other.elements_[i*cols_+j];
       return *this;
     }
@@ -70,20 +75,20 @@ std::ostream & operator<<(std::ostream & os, dense_matrix<T> const & mat)
       std::cout << mat(i,j) << ",";
     std::cout << ")";
   }
-  
+
   return os;
 }
 
 
 template <typename ScalarType>
-ScalarType diff(dense_matrix<ScalarType> const & m1, dense_matrix<ScalarType> const & m2) 
+ScalarType diff(dense_matrix<ScalarType> const & m1, dense_matrix<ScalarType> const & m2)
 {
     ScalarType df = 0.0;
     ScalarType d1 = 0;
     ScalarType d2 = 0;
 
-    for(std::size_t i = 0; i < m1.size1(); i++) 
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+    for(std::size_t i = 0; i < m1.size1(); i++)
+      for(std::size_t j = 0; j < m1.size2(); j++)
       {
         df += (m1(i,j) - m2(i,j)) * (m1(i,j) - m2(i,j));
         d1 += m1(i,j) * m1(i,j);
@@ -92,45 +97,45 @@ ScalarType diff(dense_matrix<ScalarType> const & m1, dense_matrix<ScalarType> co
 
     if ( (d1 == 0) && (d2 == 0) )
       return 0;
-    
-    return sqrt(df / std::max<ScalarType>(d1, d2));
+
+    return std::sqrt(df / std::max<ScalarType>(d1, d2));
 }
 
 
 template <typename ScalarType>
-ScalarType diff(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref) 
+ScalarType diff(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref)
 {
     ScalarType df = 0.0;
     ScalarType norm_ref = 0;
 
-    for(std::size_t i = 0; i < vec.size(); i++) 
+    for(std::size_t i = 0; i < vec.size(); i++)
     {
         df = df + pow(vec[i] - ref[i], 2);
         norm_ref += ref[i] * ref[i];
     }
 
-    return sqrt(df / norm_ref) ;
+    return std::sqrt(df / norm_ref) ;
 }
 
 template <typename ScalarType>
-ScalarType diff_max(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref) 
+ScalarType diff_max(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref)
 {
   ScalarType df = 0.0;
   ScalarType mx = 0.0;
   ScalarType norm_max = 0;
-  
-  for (std::size_t i = 0; i < vec.size(); i++) 
+
+  for (std::size_t i = 0; i < vec.size(); i++)
   {
     df = std::max<ScalarType>(fabs(vec[i] - ref[i]), df);
     mx = std::max<ScalarType>(fabs(vec[i]), mx);
-    
+
     if (mx > 0)
     {
       if (norm_max < df / mx)
         norm_max = df / mx;
     }
   }
-  
+
   return norm_max;
 }
 
@@ -156,7 +161,7 @@ void transpose_test()
 
 
 template <typename ScalarType>
-int toeplitz_test(ScalarType epsilon) 
+int toeplitz_test(ScalarType epsilon)
 {
     std::size_t TOEPLITZ_SIZE = 47;
     viennacl::toeplitz_matrix<ScalarType> vcl_toeplitz1(TOEPLITZ_SIZE, TOEPLITZ_SIZE);
@@ -167,15 +172,15 @@ int toeplitz_test(ScalarType epsilon)
 
     std::vector<ScalarType> input_ref(TOEPLITZ_SIZE);
     std::vector<ScalarType> result_ref(TOEPLITZ_SIZE);
-    
+
     dense_matrix<ScalarType> m1(TOEPLITZ_SIZE, TOEPLITZ_SIZE);
     dense_matrix<ScalarType> m2(TOEPLITZ_SIZE, TOEPLITZ_SIZE);
 
-    for(std::size_t i = 0; i < TOEPLITZ_SIZE; i++) 
-      for(std::size_t j = 0; j < TOEPLITZ_SIZE; j++) 
+    for(std::size_t i = 0; i < TOEPLITZ_SIZE; i++)
+      for(std::size_t j = 0; j < TOEPLITZ_SIZE; j++)
       {
         m1(i,j) = static_cast<ScalarType>(i) - static_cast<ScalarType>(j);
-        m2(i,j) = m1(i,j) * m1(i,j) + 1;
+        m2(i,j) = m1(i,j) * m1(i,j) + ScalarType(1);
       }
 
     for(std::size_t i = 0; i < TOEPLITZ_SIZE; i++)
@@ -185,21 +190,21 @@ int toeplitz_test(ScalarType epsilon)
     viennacl::copy(m1, vcl_toeplitz1);
     viennacl::copy(m2, vcl_toeplitz2);
     viennacl::copy(input_ref, vcl_input);
-    
+
     //
     // Matrix-Vector product:
     //
     vcl_result = viennacl::linalg::prod(vcl_toeplitz1, vcl_input);
-    
+
     for(std::size_t i = 0; i < m1.size1(); i++)     //reference calculation
     {
       ScalarType entry = 0;
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         entry += m1(i,j) * input_ref[j];
-      
+
       result_ref[i] = entry;
     }
-    
+
     viennacl::copy(vcl_result, input_ref);
     std::cout << "Matrix-Vector Product: " << diff_max(input_ref, result_ref);
     if (diff_max(input_ref, result_ref) < epsilon)
@@ -211,17 +216,17 @@ int toeplitz_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
-    
+
+
     //
     // Matrix addition:
     //
     vcl_toeplitz1 += vcl_toeplitz2;
 
     for(std::size_t i = 0; i < m1.size1(); i++)    //reference calculation
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         m1(i,j) += m2(i,j);
-    
+
     viennacl::copy(vcl_toeplitz1, m2);
     std::cout << "Matrix Addition: " << diff(m1, m2);
     if (diff(m1, m2) < epsilon)
@@ -231,12 +236,12 @@ int toeplitz_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     //
     // Per-Element access:
     //
     vcl_toeplitz1(2,4) = 42;
-    
+
     for (std::size_t i=0; i<m1.size1(); ++i)    //reference calculation
     {
       if (i + 2 < m1.size2())
@@ -252,12 +257,12 @@ int toeplitz_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     return EXIT_SUCCESS;
 }
 
 template <typename ScalarType>
-int circulant_test(ScalarType epsilon) 
+int circulant_test(ScalarType epsilon)
 {
     std::size_t CIRCULANT_SIZE = 53;
     viennacl::circulant_matrix<ScalarType> vcl_circulant1(CIRCULANT_SIZE, CIRCULANT_SIZE);
@@ -268,12 +273,12 @@ int circulant_test(ScalarType epsilon)
 
     std::vector<ScalarType> input_ref(CIRCULANT_SIZE);
     std::vector<ScalarType> result_ref(CIRCULANT_SIZE);
-    
+
     dense_matrix<ScalarType> m1(vcl_circulant1.size1(), vcl_circulant1.size2());
     dense_matrix<ScalarType> m2(vcl_circulant1.size1(), vcl_circulant1.size2());
 
-    for(std::size_t i = 0; i < m1.size1(); i++) 
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+    for(std::size_t i = 0; i < m1.size1(); i++)
+      for(std::size_t j = 0; j < m1.size2(); j++)
       {
         m1(i,j) = static_cast<ScalarType>((i - j + m1.size1()) % m1.size1());
         m2(i,j) = m1(i,j) * m1(i,j) + ScalarType(1);
@@ -281,26 +286,26 @@ int circulant_test(ScalarType epsilon)
 
     for(std::size_t i = 0; i < input_ref.size(); i++)
       input_ref[i] = ScalarType(i);
-    
+
     // Copy to ViennaCL
     viennacl::copy(m1, vcl_circulant1);
     viennacl::copy(m2, vcl_circulant2);
     viennacl::copy(input_ref, vcl_input);
-    
+
     //
     // Matrix-Vector product:
     //
     vcl_result = viennacl::linalg::prod(vcl_circulant1, vcl_input);
-    
+
     for(std::size_t i = 0; i < m1.size1(); i++)     //reference calculation
     {
       ScalarType entry = 0;
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         entry += m1(i,j) * input_ref[j];
-      
+
       result_ref[i] = entry;
     }
-    
+
     viennacl::copy(vcl_result, input_ref);
     std::cout << "Matrix-Vector Product: " << diff_max(input_ref, result_ref);
     if (diff_max(input_ref, result_ref) < epsilon)
@@ -312,17 +317,17 @@ int circulant_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
-    
+
+
     //
     // Matrix addition:
     //
     vcl_circulant1 += vcl_circulant2;
 
     for(std::size_t i = 0; i < m1.size1(); i++)    //reference calculation
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         m1(i,j) += m2(i,j);
-    
+
     viennacl::copy(vcl_circulant1, m2);
     std::cout << "Matrix Addition: " << diff(m1, m2);
     if (diff(m1, m2) < epsilon)
@@ -332,14 +337,14 @@ int circulant_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     //
     // Per-Element access:
     //
     vcl_circulant1(4,2) = 42;
-    
+
     for(std::size_t i = 0; i < m1.size1(); i++)    //reference calculation
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
       {
         if ((i - j + m1.size1()) % m1.size1() == 2)
           m1(i, j) = 42;
@@ -354,12 +359,12 @@ int circulant_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     return EXIT_SUCCESS;
 }
 
 template <typename ScalarType>
-int vandermonde_test(ScalarType epsilon) 
+int vandermonde_test(ScalarType epsilon)
 {
     std::size_t VANDERMONDE_SIZE = 61;
 
@@ -371,39 +376,39 @@ int vandermonde_test(ScalarType epsilon)
 
     std::vector<ScalarType> input_ref(VANDERMONDE_SIZE);
     std::vector<ScalarType> result_ref(VANDERMONDE_SIZE);
-    
+
     dense_matrix<ScalarType> m1(vcl_vandermonde1.size1(), vcl_vandermonde1.size2());
     dense_matrix<ScalarType> m2(m1.size1(), m1.size2());
 
-    for(std::size_t i = 0; i < m1.size1(); i++) 
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+    for(std::size_t i = 0; i < m1.size1(); i++)
+      for(std::size_t j = 0; j < m1.size2(); j++)
       {
-        m1(i,j) = pow(ScalarType(1.0 + i/1000.0), ScalarType(j));
-        m2(i,j) = pow(ScalarType(1.0 - i/2000.0), ScalarType(j));
+        m1(i,j) = std::pow(ScalarType(1.0 + i/1000.0), ScalarType(j));
+        m2(i,j) = std::pow(ScalarType(1.0 - i/2000.0), ScalarType(j));
       }
 
     for(std::size_t i = 0; i < input_ref.size(); i++)
       input_ref[i] = ScalarType(i);
-    
+
     // Copy to ViennaCL
     viennacl::copy(m1, vcl_vandermonde1);
     viennacl::copy(m2, vcl_vandermonde2);
     viennacl::copy(input_ref, vcl_input);
-    
+
     //
     // Matrix-Vector product:
     //
     vcl_result = viennacl::linalg::prod(vcl_vandermonde1, vcl_input);
-    
+
     for(std::size_t i = 0; i < m1.size1(); i++)     //reference calculation
     {
       ScalarType entry = 0;
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         entry += m1(i,j) * input_ref[j];
-      
+
       result_ref[i] = entry;
     }
-    
+
     viennacl::copy(vcl_result, input_ref);
     std::cout << "Matrix-Vector Product: " << diff_max(input_ref, result_ref);
     if (diff_max(input_ref, result_ref) < epsilon)
@@ -415,21 +420,21 @@ int vandermonde_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
-    
+
+
     //
     // Note: Matrix addition does not make sense for a Vandermonde matrix
     //
 
-    
+
     //
     // Per-Element access:
     //
     vcl_vandermonde1(4) = static_cast<ScalarType>(1.0001);
-    
-    for(std::size_t j = 0; j < m1.size2(); j++) 
+
+    for(std::size_t j = 0; j < m1.size2(); j++)
     {
-      m1(4, j) = pow(ScalarType(1.0001), ScalarType(j));
+      m1(4, j) = std::pow(ScalarType(1.0001), ScalarType(j));
     }
 
     viennacl::copy(vcl_vandermonde1, m2);
@@ -441,7 +446,7 @@ int vandermonde_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     return EXIT_SUCCESS;
 }
 
@@ -457,12 +462,12 @@ int hankel_test(ScalarType epsilon)
 
     std::vector<ScalarType> input_ref(HANKEL_SIZE);
     std::vector<ScalarType> result_ref(HANKEL_SIZE);
-    
+
     dense_matrix<ScalarType> m1(vcl_hankel1.size1(), vcl_hankel1.size2());
     dense_matrix<ScalarType> m2(m1.size1(), m1.size2());
 
-    for(std::size_t i = 0; i < m1.size1(); i++) 
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+    for(std::size_t i = 0; i < m1.size1(); i++)
+      for(std::size_t j = 0; j < m1.size2(); j++)
       {
         m1(i,j) = static_cast<ScalarType>((i + j) % (2 * m1.size1()));
         m2(i,j) = m1(i,j) * m1(i,j) + ScalarType(1);
@@ -470,26 +475,26 @@ int hankel_test(ScalarType epsilon)
 
     for(std::size_t i = 0; i < input_ref.size(); i++)
       input_ref[i] = ScalarType(i);
-    
+
     // Copy to ViennaCL
     viennacl::copy(m1, vcl_hankel1);
     viennacl::copy(m2, vcl_hankel2);
     viennacl::copy(input_ref, vcl_input);
-    
+
     //
     // Matrix-Vector product:
     //
     vcl_result = viennacl::linalg::prod(vcl_hankel1, vcl_input);
-    
+
     for(std::size_t i = 0; i < m1.size1(); i++)     //reference calculation
     {
       ScalarType entry = 0;
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         entry += m1(i,j) * input_ref[j];
-      
+
       result_ref[i] = entry;
     }
-    
+
     viennacl::copy(vcl_result, input_ref);
     std::cout << "Matrix-Vector Product: " << diff_max(input_ref, result_ref);
     if (diff_max(input_ref, result_ref) < epsilon)
@@ -501,17 +506,17 @@ int hankel_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
-    
+
+
     //
     // Matrix addition:
     //
     vcl_hankel1 += vcl_hankel2;
 
     for(std::size_t i = 0; i < m1.size1(); i++)    //reference calculation
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
         m1(i,j) += m2(i,j);
-    
+
     viennacl::copy(vcl_hankel1, m2);
     std::cout << "Matrix Addition: " << diff(m1, m2);
     if (diff(m1, m2) < epsilon)
@@ -521,14 +526,14 @@ int hankel_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     //
     // Per-Element access:
     //
     vcl_hankel1(4,2) = 42;
-    
+
     for(std::size_t i = 0; i < m1.size1(); i++)    //reference calculation
-      for(std::size_t j = 0; j < m1.size2(); j++) 
+      for(std::size_t j = 0; j < m1.size2(); j++)
       {
         if ((i + j) % (2*m1.size1()) == 6)
           m1(i, j) = 42;
@@ -543,11 +548,11 @@ int hankel_test(ScalarType epsilon)
       std::cout << " [FAILED]" << std::endl;
       return EXIT_FAILURE;
     }
-    
+
     return EXIT_SUCCESS;
 }
 
-int main() 
+int main()
 {
   std::cout << std::endl;
   std::cout << "----------------------------------------------" << std::endl;
@@ -558,7 +563,7 @@ int main()
   std::cout << std::endl;
 
   double eps = 1e-3;
-  
+
   std::cout << "# Testing setup:" << std::endl;
   std::cout << "  eps:     " << eps << std::endl;
   std::cout << "  numeric: float" << std::endl;
@@ -566,48 +571,53 @@ int main()
   std::cout << " -- Vandermonde matrix -- " << std::endl;
   if (vandermonde_test<float>(static_cast<float>(eps)) == EXIT_FAILURE)
     return EXIT_FAILURE;
-  
+
   std::cout << " -- Circulant matrix -- " << std::endl;
   if (circulant_test<float>(static_cast<float>(eps)) == EXIT_FAILURE)
     return EXIT_FAILURE;
-  
+
   std::cout << " -- Toeplitz matrix -- " << std::endl;
   if (toeplitz_test<float>(static_cast<float>(eps)) == EXIT_FAILURE)
     return EXIT_FAILURE;
-  
+
   std::cout << " -- Hankel matrix -- " << std::endl;
   if (hankel_test<float>(static_cast<float>(eps)) == EXIT_FAILURE)
     return EXIT_FAILURE;
-  
-  
+
+
   std::cout << std::endl;
 
   if( viennacl::ocl::current_device().double_support() )
   {
     eps = 1e-10;
-    
+
     std::cout << std::endl;
     std::cout << "# Testing setup:" << std::endl;
     std::cout << "  eps:     " << eps << std::endl;
     std::cout << "  numeric: double" << std::endl;
     std::cout << std::endl;
-    
+
     std::cout << " -- Vandermonde matrix -- " << std::endl;
     if (vandermonde_test<double>(eps) == EXIT_FAILURE)
       return EXIT_FAILURE;
-    
+
     std::cout << " -- Circulant matrix -- " << std::endl;
     if (circulant_test<double>(eps) == EXIT_FAILURE)
       return EXIT_FAILURE;
-    
+
     std::cout << " -- Toeplitz matrix -- " << std::endl;
     if (toeplitz_test<double>(eps) == EXIT_FAILURE)
       return EXIT_FAILURE;
-    
+
     std::cout << " -- Hankel matrix -- " << std::endl;
     if (hankel_test<double>(eps) == EXIT_FAILURE)
       return EXIT_FAILURE;
   }
 
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/src/svd.cpp b/tests/src/svd.cpp
new file mode 100644
index 0000000..81775e6
--- /dev/null
+++ b/tests/src/svd.cpp
@@ -0,0 +1,311 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdexcept>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <cmath>
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+#include "viennacl/linalg/svd.hpp"
+
+#include "examples/benchmarks/benchmark-utils.hpp"
+
+
+void read_matrix_size(std::fstream& f, std::size_t & sz1, std::size_t & sz2)
+{
+  if(!f.is_open())
+    throw std::invalid_argument("File is not opened");
+
+  f >> sz1 >> sz2;
+}
+
+
+template <typename ScalarType>
+void read_matrix_body(std::fstream& f, viennacl::matrix<ScalarType>& A)
+{
+  if(!f.is_open())
+    throw std::invalid_argument("File is not opened");
+
+  boost::numeric::ublas::matrix<ScalarType> h_A(A.size1(), A.size2());
+
+  for(std::size_t i = 0; i < h_A.size1(); i++)
+  {
+    for(std::size_t j = 0; j < h_A.size2(); j++)
+    {
+      ScalarType val = 0.0;
+      f >> val;
+      h_A(i, j) = val;
+    }
+  }
+
+  viennacl::copy(h_A, A);
+}
+
+
+template <typename ScalarType>
+void read_vector_body(std::fstream& f, std::vector<ScalarType>& v)
+{
+  if(!f.is_open())
+    throw std::invalid_argument("File is not opened");
+
+  for(std::size_t i = 0; i < v.size(); i++)
+  {
+    ScalarType val = 0.0;
+    f >> val;
+    v[i] = val;
+  }
+}
+
+
+template <typename ScalarType>
+void random_fill(std::vector<ScalarType>& in)
+{
+  for(std::size_t i = 0; i < in.size(); i++)
+    in[i] = static_cast<ScalarType>(rand()) / RAND_MAX;
+}
+
+
+template <typename ScalarType>
+bool check_bidiag(viennacl::matrix<ScalarType>& A)
+{
+  const ScalarType EPS = 0.0001f;
+
+  std::vector<ScalarType> aA(A.size1() * A.size2());
+  viennacl::fast_copy(A, &aA[0]);
+
+  for(std::size_t i = 0; i < A.size1(); i++)
+  {
+    for(std::size_t j = 0; j < A.size2(); j++)
+    {
+      ScalarType val = aA[i * A.size2() + j];
+      if((fabs(val) > EPS) && (i != j) && ((i + 1) != j))
+      {
+        std::cout << "Failed at " << i << " " << j << " " << val << std::endl;
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+template <typename ScalarType>
+ScalarType matrix_compare(viennacl::matrix<ScalarType>& res,
+                     viennacl::matrix<ScalarType>& ref)
+{
+  std::vector<ScalarType> res_std(res.internal_size());
+  std::vector<ScalarType> ref_std(ref.internal_size());
+
+  viennacl::fast_copy(res, &res_std[0]);
+  viennacl::fast_copy(ref, &ref_std[0]);
+
+  ScalarType diff = 0.0;
+  ScalarType mx = 0.0;
+
+  for(std::size_t i = 0; i < res_std.size(); i++)
+  {
+    diff = std::max(diff, std::abs(res_std[i] - ref_std[i]));
+    mx = std::max(mx, res_std[i]);
+  }
+
+  return diff / mx;
+}
+
+
+template <typename ScalarType>
+ScalarType sigmas_compare(viennacl::matrix<ScalarType>& res,
+                               std::vector<ScalarType>& ref)
+{
+    std::vector<ScalarType> res_std(ref.size());
+
+    for(std::size_t i = 0; i < ref.size(); i++)
+        res_std[i] = res(i, i);
+
+    std::sort(ref.begin(), ref.end());
+    std::sort(res_std.begin(), res_std.end());
+
+    ScalarType diff = 0.0;
+    ScalarType mx = 0.0;
+    for(std::size_t i = 0; i < ref.size(); i++)
+    {
+        diff = std::max(diff, std::abs(res_std[i] - ref[i]));
+        mx = std::max(mx, res_std[i]);
+    }
+
+    return diff / mx;
+}
+
+
+template <typename ScalarType>
+void test_svd(const std::string & fn, ScalarType EPS)
+{
+  std::size_t sz1, sz2;
+
+  //read matrix
+
+  // sz1 = 2048, sz2 = 2048;
+  // std::vector<ScalarType> in(sz1 * sz2);
+  // random_fill(in);
+
+  // read file
+  std::fstream f(fn.c_str(), std::fstream::in);
+  //read size of input matrix
+  read_matrix_size(f, sz1, sz2);
+
+  std::size_t to = std::min(sz1, sz2);
+
+  viennacl::matrix<ScalarType> Ai(sz1, sz2), Aref(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
+  read_matrix_body(f, Ai);
+
+  std::vector<ScalarType> sigma_ref(to);
+  read_vector_body(f, sigma_ref);
+
+  f.close();
+
+  // viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
+
+  Aref = Ai;
+
+  Timer timer;
+  timer.start();
+
+  viennacl::linalg::svd(Ai, QL, QR);
+
+  viennacl::backend::finish();
+
+  double time_spend = timer.get();
+
+  viennacl::matrix<ScalarType> result1(sz1, sz2), result2(sz1, sz2);
+  result1 = viennacl::linalg::prod(QL, Ai);
+  result2 = viennacl::linalg::prod(result1, trans(QR));
+
+  ScalarType sigma_diff = sigmas_compare(Ai, sigma_ref);
+  ScalarType prods_diff  = matrix_compare(result2, Aref);
+
+  bool sigma_ok = (fabs(sigma_diff) < EPS)
+                   && (fabs(prods_diff) < std::sqrt(EPS));  //note: computing the product is not accurate down to 10^{-16}, so we allow for a higher tolerance here
+
+  printf("%6s [%dx%d] %40s sigma_diff = %.6f; prod_diff = %.6f; time = %.6f\n", sigma_ok?"[[OK]]":"[FAIL]", (int)Aref.size1(), (int)Aref.size2(), fn.c_str(), sigma_diff, prods_diff, time_spend);
+}
+
+
+template <typename ScalarType>
+void time_svd(std::size_t sz1, std::size_t sz2)
+{
+  viennacl::matrix<ScalarType> Ai(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
+
+  std::vector<ScalarType> in(Ai.internal_size1() * Ai.internal_size2());
+  random_fill(in);
+
+  viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);
+
+
+  Timer timer;
+  timer.start();
+
+  viennacl::linalg::svd(Ai, QL, QR);
+  viennacl::backend::finish();
+  double time_spend = timer.get();
+
+  printf("[%dx%d] time = %.6f\n", static_cast<int>(sz1), static_cast<int>(sz2), time_spend);
+}
+
+
+template <typename ScalarType>
+int test(ScalarType epsilon)
+{
+
+    test_svd<ScalarType>(std::string("../../examples/testdata/svd/qr.example"), epsilon);
+    test_svd<ScalarType>(std::string("../../examples/testdata/svd/wiki.example"), epsilon);
+    test_svd<ScalarType>(std::string("../../examples/testdata/svd/wiki.qr.example"), epsilon);
+    test_svd<ScalarType>(std::string("../../examples/testdata/svd/pysvd.example"), epsilon);
+    test_svd<ScalarType>(std::string("../../examples/testdata/svd/random.example"), epsilon);
+
+    time_svd<ScalarType>(500, 500);
+    time_svd<ScalarType>(1000, 1000);
+    time_svd<ScalarType>(4096, 512);
+    time_svd<ScalarType>(2048, 2048);
+    //time_svd(4096, 4096);  //takes too long for a standard sanity test. Feel free to uncomment
+
+    return EXIT_SUCCESS;
+}
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: BLAS 3 routines" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = NumericT(1.0E-4);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+        std::cout << "# Test passed" << std::endl;
+      else
+        return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   if( viennacl::ocl::current_device().double_support() )
+   {
+      {
+        typedef double NumericT;
+        NumericT epsilon = 1.0E-6;  //Note: higher accuracy not possible, because data only available with floating point precision
+        std::cout << "# Testing setup:" << std::endl;
+        std::cout << "  eps:     " << epsilon << std::endl;
+        std::cout << "  numeric: double" << std::endl;
+        retval = test<NumericT>(epsilon);
+        if( retval == EXIT_SUCCESS )
+          std::cout << "# Test passed" << std::endl;
+        else
+          return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+   return retval;
+}
+
+
diff --git a/tests/src/vector.cpp b/tests/src/vector.cpp
deleted file mode 100644
index 5411028..0000000
--- a/tests/src/vector.cpp
+++ /dev/null
@@ -1,705 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-//
-// *** System
-//
-#include <iostream>
-
-//
-// *** Boost
-//
-#include <boost/numeric/ublas/io.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-
-//
-// *** ViennaCL
-//
-//#define VIENNACL_DEBUG_ALL
-#define VIENNACL_HAVE_UBLAS 1
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/norm_1.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/norm_inf.hpp"
-
-
-using namespace boost::numeric;
-
-//
-// -------------------------------------------------------------
-//
-template <class TYPE>
-bool readVectorFromFile(const std::string & filename, boost::numeric::ublas::vector<TYPE> & vec)
-{
-	std::ifstream file(filename.c_str());
-
-	if (!file) return false;
-
-	unsigned int size;
-	file >> size;
-  
-  if (size > 30000)  //keep execution times short
-    size = 30000;
-	vec.resize(size);
-
-	for (unsigned int i = 0; i < size; ++i)
-	{
-		TYPE element;
-		file >> element;
-		vec[i] = element;
-	}
-
-	return true;
-}
-
-//
-// -------------------------------------------------------------
-//
-template <typename ScalarType>
-ScalarType diff(ScalarType & s1, ScalarType & s2) 
-{
-   if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
-   return 0;
-}
-//
-// -------------------------------------------------------------
-//
-template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::scalar<ScalarType> & s2) 
-{
-   if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
-   return 0;
-}
-//
-// -------------------------------------------------------------
-//
-template <typename ScalarType>
-ScalarType diff(ScalarType & s1, viennacl::entry_proxy<ScalarType> const& s2) 
-{
-   if (s1 != s2)
-      return (s1 - s2) / std::max(fabs(s1), fabs(s2));
-   return 0;
-}
-//
-// -------------------------------------------------------------
-//
-template <typename ScalarType>
-ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
-{
-   ublas::vector<ScalarType> v2_cpu(v2.size());
-   fast_copy(v2.begin(), v2.end(), v2_cpu.begin());
-
-   for (unsigned int i=0;i<v1.size(); ++i)
-   {
-      if ( std::max( fabs(v2_cpu[i]), fabs(v1[i]) ) > 0 )
-         v2_cpu[i] = fabs(v2_cpu[i] - v1[i]) / std::max( fabs(v2_cpu[i]), fabs(v1[i]) );
-      else
-         v2_cpu[i] = 0.0;
-   }
-
-   return norm_inf(v2_cpu);
-}
-//
-// -------------------------------------------------------------
-//
-template< typename NumericT, typename Epsilon >
-int test(Epsilon const& epsilon, std::string rhsfile, std::string resultfile)
-{
-   int retval = EXIT_SUCCESS;
-
-   ublas::vector<NumericT> rhs;
-   ublas::vector<NumericT> rhs2;
-
-   if (!readVectorFromFile<NumericT>(rhsfile, rhs)) 
-   {
-      std::cout << "Error reading RHS file" << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   std::cout << "Running tests for vector of size " << rhs.size() << std::endl;
-
-//    ublas::vector<NumericT> result;
-//    if (!readVectorFromFile<NumericT>(resultfile, result))  
-//    {
-//       std::cout << "Error reading Result file" << std::endl;
-//       retval = EXIT_FAILURE;
-//    }
-
-   viennacl::vector<NumericT> vcl_rhs(rhs.size());
-   fast_copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   viennacl::vector<NumericT> vcl_rhs2(rhs.size()); 
-   copy(rhs.begin(), rhs.end(), vcl_rhs2.begin());
-   
-   NumericT                            cpu_result;
-   viennacl::scalar<NumericT>  gpu_result;
-   // --------------------------------------------------------------------------
-   std::cout << "Testing inner_prod..." << std::endl;
-   cpu_result = viennacl::linalg::inner_prod(rhs, rhs);
-   gpu_result = viennacl::linalg::inner_prod(vcl_rhs, vcl_rhs);
-
-   if( fabs(diff(cpu_result, gpu_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: inner product" << std::endl;
-      std::cout << "  diff: " << fabs(diff(cpu_result, gpu_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   std::cout << "Testing norm_1..." << std::endl;
-   cpu_result = norm_1(rhs);
-   gpu_result = viennacl::linalg::norm_1(vcl_rhs);
-
-   if( fabs(diff(cpu_result, gpu_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: norm-1" << std::endl;
-      std::cout << "  diff: " << fabs(diff(cpu_result, gpu_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   std::cout << "Testing norm_2..." << std::endl;
-   cpu_result = norm_2(rhs);
-   gpu_result = viennacl::linalg::norm_2(vcl_rhs);
-
-   if( fabs(diff(cpu_result, gpu_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: norm-2" << std::endl;
-      std::cout << "  diff: " << fabs(diff(cpu_result, gpu_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   std::cout << "Testing norm_inf..." << std::endl;
-   cpu_result = norm_inf(rhs);
-   gpu_result = viennacl::linalg::norm_inf(vcl_rhs);
-
-   if( fabs(diff(cpu_result, gpu_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: norm-inf" << std::endl;
-      std::cout << "  diff: " << fabs(diff(cpu_result, gpu_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   std::cout << "Testing index_norm_inf..." << std::endl;
-   size_t cpu_index = index_norm_inf(rhs);
-   size_t gpu_index = viennacl::linalg::index_norm_inf(vcl_rhs);
-
-   if( cpu_index != gpu_index )
-   {
-      std::cout << "# Error at operation: index norm-inf" << std::endl;
-      std::cout << "  cpu-index: " << cpu_index << " vs. gpu-index: " << gpu_index << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   cpu_result = rhs[index_norm_inf(rhs)];
-   gpu_result = vcl_rhs[viennacl::linalg::index_norm_inf(vcl_rhs)];
-
-   if( fabs(diff(cpu_result, gpu_result)) > epsilon )
-   {
-      std::cout << "# Error at operation: value norm-inf" << std::endl;
-      std::cout << "  diff: " << fabs(diff(cpu_result, gpu_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   ublas::vector<NumericT> x = rhs;
-   ublas::vector<NumericT> y = rhs;
-   ublas::vector<NumericT> t = rhs;
-   t.assign (NumericT(1.1) * x + NumericT(2.3) * y),
-   y.assign (- NumericT(2.3) * x + NumericT(1.1) * y),
-   x.assign (t);
-//   cpu_result = norm_inf(x); 
-
-   copy(rhs, vcl_rhs);
-   copy(rhs, vcl_rhs2);
-   std::cout << "Testing plane_rotation..." << std::endl;
-   viennacl::linalg::plane_rotation(vcl_rhs, vcl_rhs2, NumericT(1.1), NumericT(2.3));
-   //gpu_result = viennacl::linalg::norm_inf(vcl_rhs);
-
-   if( fabs(diff(x, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: plane rotation" << std::endl;
-      std::cout << "  diff: " << fabs(diff(x, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   // --------------------------------------------------------------------------
-   viennacl::copy(rhs, vcl_rhs);
-   
-   std::cout << "Testing cpu_assignments..." << std::endl;
-   NumericT val = static_cast<NumericT>(1e-3);
-   for (size_t i=0; i < rhs.size(); ++i)
-     rhs(i) = val;
-
-   if( fabs(diff(val, rhs(0))) > epsilon )
-   {
-      std::cout << "# Error at operation: cpu assignment" << std::endl;
-      std::cout << "  diff: " << fabs(diff(val, rhs(0))) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-
-   std::cout << "Testing gpu_assignments..." << std::endl;
-   for (size_t i=0; i < vcl_rhs.size(); ++i)
-     vcl_rhs(i) = val;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: gpu assignment" << std::endl;
-      std::cout << "  diff: " << fabs(diff(val, vcl_rhs(0))) << std::endl;
-      retval = EXIT_FAILURE;
-   }
-   
-   
-   //
-   // multiplication and division of vectors by scalars
-   //
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   rhs2 = rhs;
-
-   std::cout << "Testing scaling with CPU scalar..." << std::endl;
-   NumericT alpha = static_cast<NumericT>(3.1415);
-   viennacl::scalar<NumericT> gpu_alpha = alpha;
-
-   rhs     *= alpha;
-   vcl_rhs *= alpha;
-  
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: stretching with CPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(cpu_result, gpu_result)) << std::endl;
-      retval = EXIT_FAILURE;
-   }  
-
-   std::cout << "Testing scaling with GPU scalar..." << std::endl;
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs.begin());
-   vcl_rhs *= gpu_alpha;
-  
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: stretching with GPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }  
-
-   NumericT beta  = static_cast<NumericT>(1.4153);
-   viennacl::scalar<NumericT> gpu_beta = beta;
-   rhs2 = rhs;
-  
-   std::cout << "Testing shrinking with CPU scalar..." << std::endl;
-   rhs     /= beta;
-   vcl_rhs /= beta;  
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: shrinking with CPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }    
-   
-   std::cout << "Testing shrinking with GPU scalar..." << std::endl;
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs.begin());
-   vcl_rhs /= gpu_beta;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: shrinking with GPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }    
-   
-
-
-   //
-   // add and inplace_add of vectors
-   //
-   std::cout << "Testing add on vector..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     = rhs + rhs2;
-   vcl_rhs = vcl_rhs + vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: add on vector" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-   std::cout << "Testing inplace-add on vector..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     += rhs2;
-   vcl_rhs += vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: inplace-add on vector" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-   //
-   // subtract and inplace_subtract of vectors
-   //
-   std::cout << "Testing sub on vector..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     = rhs - rhs2;
-   vcl_rhs = vcl_rhs - vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: sub on vector" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-   std::cout << "Testing inplace-sub on vector..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     += rhs2;
-   vcl_rhs += vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: inplace-sub on vector" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-
-   
-   //
-   // multiply-add and multiply-subtract
-   //
-   std::cout << "Testing multiply-add on vector with CPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     = rhs + alpha * rhs2;
-   vcl_rhs = vcl_rhs + alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: multiply add with CPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-
-   std::cout << "Testing inplace multiply-add on vector with CPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     += alpha * rhs2;
-   vcl_rhs += alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: inplace multiply add with CPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-   std::cout << "Testing multiply-add on vector with GPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-   
-   rhs = rhs + alpha * rhs2;
-   vcl_rhs = vcl_rhs + gpu_alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: multiply add with GPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-   
-   std::cout << "Testing inplace multiply-add on vector with GPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-   
-   rhs += alpha * rhs2;
-   vcl_rhs += gpu_alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: inplace multiply add with GPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-   
-
-
-   //
-   // multiply-subtract
-   //
-   std::cout << "Testing multiply-subtract on vector with CPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     = rhs - alpha * rhs2;
-   vcl_rhs = vcl_rhs - alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: multiply-subtract with CPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-
-   std::cout << "Testing inplace multiply-subtract on vector with CPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   rhs     -= alpha * rhs2;
-   vcl_rhs -= alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: inplace multiply subtract with CPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-
-   std::cout << "Testing multiply-subtract on vector with GPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-   
-   rhs     = rhs - alpha * rhs2;
-   vcl_rhs = vcl_rhs - gpu_alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: multiply subtract with GPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-   
-   std::cout << "Testing inplace multiply-subtract on vector with GPU scalar..." << std::endl;
-   rhs2 = 42.0 * rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-   
-   rhs -= alpha * rhs2;
-   vcl_rhs -= gpu_alpha * vcl_rhs2;
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: inplace multiply subtract with GPU scalar" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }       
-   
-   
-   
-   //
-   // Misc stuff
-   //
-   rhs2 = rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   std::cout << "Testing several vector additions..." << std::endl;
-   rhs     = rhs2 + rhs + rhs2;
-   vcl_rhs = vcl_rhs2 + vcl_rhs + vcl_rhs2;
-   
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: several additions" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }          
-   
-   
-   
-   //
-   // Complicated expressions (for ensuring the operator overloads work correctly)
-   //
-   copy(vcl_rhs.begin(), vcl_rhs.end(), rhs2.begin());
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-   rhs2 = rhs;
-
-   std::cout << "Testing complicated vector expression with CPU scalar..." << std::endl;
-   rhs     = beta * (rhs - alpha*rhs2);
-   vcl_rhs = beta * (vcl_rhs - alpha*vcl_rhs2);
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: advanced mul diff with CPU scalars" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }          
-   
-   std::cout << "Testing complicated vector expression with GPU scalar..." << std::endl;
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs.begin());
-   vcl_rhs = gpu_beta * (vcl_rhs - gpu_alpha*vcl_rhs2);
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: advanced mul diff with GPU scalars" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }          
-   
-   // --------------------------------------------------------------------------      
-   copy(vcl_rhs.begin(), vcl_rhs.end(), rhs2.begin());
-   rhs2 = rhs;
-   rhs2 *= 3.0;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   std::cout << "Testing swap..." << std::endl;
-   swap(rhs, rhs2);
-   swap(vcl_rhs, vcl_rhs2);
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: swap" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }          
-   // --------------------------------------------------------------------------         
-   rhs2 = rhs;
-   copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs2.begin());
-
-   std::cout << "Testing another complicated vector expression with GPU scalar..." << std::endl;
-   rhs     = rhs2 / alpha + beta * (rhs - alpha*rhs2);
-   vcl_rhs = vcl_rhs2 / alpha + beta * (vcl_rhs - alpha*vcl_rhs2);
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: complex vector operations with CPU scalars" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }             
-   
-   std::cout << "Testing another complicated vector expression with GPU scalar..." << std::endl;
-   copy(rhs2.begin(), rhs2.end(), vcl_rhs.begin());
-   vcl_rhs = vcl_rhs2 / gpu_alpha + gpu_beta * (vcl_rhs - gpu_alpha*vcl_rhs2);
-
-   if( fabs(diff(rhs, vcl_rhs)) > epsilon )
-   {
-      std::cout << "# Error at operation: complex vector operations with GPU scalars" << std::endl;
-      std::cout << "  diff: " << fabs(diff(rhs, vcl_rhs)) << std::endl;
-      retval = EXIT_FAILURE;
-   }             
-   
-   // --------------------------------------------------------------------------            
-   return retval;
-}
-//
-// -------------------------------------------------------------
-//
-int main()
-{
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "## Test :: Vector" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-
-   int retval = EXIT_SUCCESS;
-
-   std::string rhsfile("../../examples/testdata/rhs65025.txt");
-   std::string resultfile("../../examples/testdata/result65025.txt");
-
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   {
-      typedef float NumericT;
-      NumericT epsilon = static_cast<NumericT>(1.0E-4);
-      std::cout << "# Testing setup:" << std::endl;
-      std::cout << "  eps:     " << epsilon << std::endl;
-      std::cout << "  numeric: float" << std::endl;
-      retval = test<NumericT>(epsilon, rhsfile, resultfile);
-      if( retval == EXIT_SUCCESS )
-         std::cout << "# Test passed" << std::endl;
-      else
-         return retval;
-   }
-   std::cout << std::endl;
-   std::cout << "----------------------------------------------" << std::endl;
-   std::cout << std::endl;
-   if( viennacl::ocl::current_device().double_support() )
-   {
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-10;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon, rhsfile, resultfile);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-11;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon, rhsfile, resultfile);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-      {
-         typedef double NumericT;
-         NumericT epsilon = 1.0E-12;
-         std::cout << "# Testing setup:" << std::endl;
-         std::cout << "  eps:     " << epsilon << std::endl;
-         std::cout << "  numeric: double" << std::endl;
-         retval = test<NumericT>(epsilon, rhsfile, resultfile);
-         if( retval == EXIT_SUCCESS )
-           std::cout << "# Test passed" << std::endl;
-         else
-           return retval;
-      }
-      std::cout << std::endl;
-      std::cout << "----------------------------------------------" << std::endl;
-      std::cout << std::endl;
-   }
-   return retval;
-}
diff --git a/tests/src/vector_double.cpp b/tests/src/vector_double.cpp
new file mode 100644
index 0000000..4f005f2
--- /dev/null
+++ b/tests/src/vector_double.cpp
@@ -0,0 +1,66 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "vector_float_double.hpp"
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  #ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+  #endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-10;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+       std::cout << "# Test passed" << std::endl;
+      else
+       return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+  return retval;
+}
diff --git a/tests/src/vector_double.cu b/tests/src/vector_double.cu
new file mode 100644
index 0000000..4f005f2
--- /dev/null
+++ b/tests/src/vector_double.cu
@@ -0,0 +1,66 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "vector_float_double.hpp"
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  #ifdef VIENNACL_WITH_OPENCL
+  if( viennacl::ocl::current_device().double_support() )
+  #endif
+  {
+    {
+      typedef double NumericT;
+      NumericT epsilon = 1.0E-10;
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: double" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+       std::cout << "# Test passed" << std::endl;
+      else
+       return retval;
+    }
+    std::cout << std::endl;
+    std::cout << "----------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+  return retval;
+}
diff --git a/tests/src/vector_float.cpp b/tests/src/vector_float.cpp
new file mode 100644
index 0000000..3087867
--- /dev/null
+++ b/tests/src/vector_float.cpp
@@ -0,0 +1,62 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+#include "vector_float_double.hpp"
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1.0E-2);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    retval = test<NumericT>(epsilon);
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/vector_float.cu b/tests/src/vector_float.cu
new file mode 100644
index 0000000..3087867
--- /dev/null
+++ b/tests/src/vector_float.cu
@@ -0,0 +1,62 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+#include "vector_float_double.hpp"
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    typedef float NumericT;
+    NumericT epsilon = static_cast<NumericT>(1.0E-2);
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  eps:     " << epsilon << std::endl;
+    std::cout << "  numeric: float" << std::endl;
+    retval = test<NumericT>(epsilon);
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/vector_float_double.hpp b/tests/src/vector_float_double.hpp
new file mode 100644
index 0000000..f1d004c
--- /dev/null
+++ b/tests/src/vector_float_double.hpp
@@ -0,0 +1,1717 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+#include <cmath>
+
+// We don't need debug mode in UBLAS:
+#define BOOST_UBLAS_NDEBUG
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   ublas::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return ublas::norm_inf(v2_cpu);
+}
+
+
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2, double epsilon)
+{
+  int retval = EXIT_SUCCESS;
+
+  double temp = std::fabs(diff(t1, t2));
+  if (temp > epsilon)
+  {
+    std::cout << "# Error! Relative difference: " << temp << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon, typename UblasVectorType, typename ViennaCLVectorType1, typename ViennaCLVectorType2 >
+int test(Epsilon const& epsilon,
+         UblasVectorType     & ublas_v1, UblasVectorType     & ublas_v2,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2)
+{
+  int retval = EXIT_SUCCESS;
+
+  NumericT                    cpu_result = 42.0;
+  viennacl::scalar<NumericT>  gpu_result = 43.0;
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_vector initializer..." << std::endl;
+  ublas_v1 = ublas::zero_vector<NumericT>(ublas_v1.size());
+  vcl_v1 = viennacl::zero_vector<NumericT>(vcl_v1.size());
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_vector initializer..." << std::endl;
+  ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), cpu_result);
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), cpu_result);
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), gpu_result);
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), gpu_result);
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for unit_vector initializer..." << std::endl;
+  ublas_v1 = ublas::unit_vector<NumericT>(ublas_v1.size(), 5);
+  vcl_v1 = viennacl::unit_vector<NumericT>(vcl_v1.size(), 5);
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // Part 1: Norms and inner product
+  //
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing inner_prod..." << std::endl;
+  cpu_result = viennacl::linalg::inner_prod(ublas_v1, ublas_v2);
+  NumericT cpu_result2 = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+
+  if (check(cpu_result, cpu_result2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = inner_prod(ublas_v1 + ublas_v2, ublas_v2 - ublas_v1);
+  NumericT cpu_result3 = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2 - vcl_v1);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2 - vcl_v1);
+
+  if (check(cpu_result, cpu_result3, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_1..." << std::endl;
+  cpu_result = ublas::norm_1(ublas_v1);
+  gpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_1(ublas_v1);
+  cpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_1(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_1(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_2..." << std::endl;
+  cpu_result = ublas::norm_2(ublas_v1);
+  gpu_result = viennacl::linalg::norm_2(vcl_v1);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_2(ublas_v1);
+  cpu_result = viennacl::linalg::norm_2(vcl_v1);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_2(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_2(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_inf..." << std::endl;
+  cpu_result = ublas::norm_inf(ublas_v1);
+  gpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_inf(ublas_v1);
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_inf(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing index_norm_inf..." << std::endl;
+  std::size_t cpu_index = ublas::index_norm_inf(ublas_v1);
+  std::size_t gpu_index = viennacl::linalg::index_norm_inf(vcl_v1);
+
+  if (check(static_cast<NumericT>(cpu_index), static_cast<NumericT>(gpu_index), epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  // --------------------------------------------------------------------------
+  cpu_result = ublas_v1[index_norm_inf(ublas_v1)];
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1)];
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = ublas_v1[index_norm_inf(ublas_v1 + ublas_v2)];
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1 + vcl_v2)];
+
+  if (check(cpu_result, gpu_result, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Plane rotation and assignments
+  //
+
+  // --------------------------------------------------------------------------
+
+  ublas::vector<NumericT> x = ublas_v1;
+  ublas::vector<NumericT> y = ublas_v2;
+  ublas::vector<NumericT> t = ublas_v1;
+  t.assign (NumericT(1.1) * x + NumericT(2.3) * y),
+  y.assign (- NumericT(2.3) * x + NumericT(1.1) * y),
+  x.assign (t);
+
+  viennacl::linalg::plane_rotation(vcl_v1, vcl_v2, NumericT(1.1), NumericT(2.3));
+
+  if (check(x, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(y, vcl_v2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+
+  std::cout << "Testing assignments..." << std::endl;
+  NumericT val = static_cast<NumericT>(1e-1);
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = val;
+
+  for (size_t i=0; i < vcl_v1.size(); ++i)
+    vcl_v1(i) = val;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiplication and division of vectors by scalars
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing scaling with CPU scalar..." << std::endl;
+  NumericT alpha = static_cast<NumericT>(1.7182);
+  viennacl::scalar<NumericT> gpu_alpha = alpha;
+
+  ublas_v1  *= alpha;
+  vcl_v1    *= alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing scaling with GPU scalar..." << std::endl;
+  ublas_v1  *= alpha;
+  vcl_v1    *= gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing scaling with scalar expression..." << std::endl;
+  ublas_v1  *= inner_prod(ublas_v1, ublas_v2);
+  vcl_v1    *= viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  NumericT beta  = static_cast<NumericT>(1.4153);
+  viennacl::scalar<NumericT> gpu_beta = beta;
+
+  std::cout << "Testing shrinking with CPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing shrinking with GPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // add and inplace_add of vectors
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing add on vector..." << std::endl;
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1     = ublas_v1 + ublas_v2;
+  vcl_v1       =   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing add on vector with flipsign..." << std::endl;
+  ublas_v1     = - ublas_v1 + ublas_v2;
+  vcl_v1       = -   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-add on vector..." << std::endl;
+  ublas_v1 += ublas_v2;
+  vcl_v1   +=   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing assignment to vector with vector multiplied by scalar expression..." << std::endl;
+  ublas_v1  = inner_prod(ublas_v1, ublas_v2) * ublas_v2;
+  vcl_v1    = viennacl::linalg::inner_prod(vcl_v1, vcl_v2) * vcl_v2;
+
+  //
+  // subtract and inplace_subtract of vectors
+  //
+  std::cout << "Testing sub on vector..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1     = ublas_v1 - ublas_v2;
+  vcl_v1       =   vcl_v1 -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-sub on vector..." << std::endl;
+  ublas_v1 -= ublas_v2;
+  vcl_v1   -= vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // multiply-add
+  //
+  std::cout << "Testing multiply-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + beta * ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 + beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += alpha * ublas_v2;
+  vcl_v1   += alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // division-add
+  //
+  std::cout << "Testing division-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   + vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 * beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   += vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-add on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-add on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   +=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // multiply-subtract
+  //
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 - ublas_v2;
+  vcl_v1   = alpha * vcl_v1   -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 - beta * ublas_v2;
+  vcl_v1   = alpha * vcl_v1   - beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-subtract on vector with CPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= alpha * ublas_v2;
+  vcl_v1   -= alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 -     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 - ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   = gpu_alpha * vcl_v1   - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   -= gpu_alpha * vcl_v1   + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   -= gpu_alpha * vcl_v1   - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v2;
+  vcl_v1   -= gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // division-subtract
+  //
+  std::cout << "Testing division-subtract on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-subtract on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha - ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / alpha -   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-subtract on vector with CPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-subtract on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 * alpha     + ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 * gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-subtract on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-subtract on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-subtract on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     - ublas_v2 * beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha -   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // More complicated expressions (for ensuring the operator overloads work correctly)
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(1.0) + random<NumericT>();
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing three vector additions..." << std::endl;
+  ublas_v1 = ublas_v2 + ublas_v1 + ublas_v2;
+  vcl_v1   =   vcl_v2 +   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing complicated vector expression with CPU scalar..." << std::endl;
+  ublas_v1 = beta * (ublas_v1 - alpha * ublas_v2);
+  vcl_v1   = beta * (vcl_v1   - alpha * vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing complicated vector expression with GPU scalar..." << std::endl;
+  ublas_v1 =     beta * (ublas_v1 -     alpha * ublas_v2);
+  vcl_v1   = gpu_beta * (vcl_v1   - gpu_alpha * vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing swap..." << std::endl;
+  swap(ublas_v1, ublas_v2);
+  swap(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(5.0) + random<NumericT>();
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing unary operator-..." << std::endl;
+  ublas_v1 = - ublas_v2;
+  vcl_v1   = -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  std::cout << " v1 = element_prod(v1, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1, v2);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise division..." << std::endl;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(5.0) + random<NumericT>();
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise power function..." << std::endl;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(5.0) + random<NumericT>();
+  }
+  UblasVectorType ublas_v3 = ublas_v1;
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] = std::pow(ublas_v1[i], ublas_v2[i]);
+  vcl_v1 = viennacl::linalg::element_pow(vcl_v1, vcl_v2);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 = pow(v1, v2);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] += std::pow(ublas_v1[i], ublas_v2[i]);
+  vcl_v1 += viennacl::linalg::element_pow(vcl_v1, vcl_v2);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 += pow(v1, v2);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] -= std::pow(ublas_v1[i], ublas_v2[i]);
+  vcl_v1 -= viennacl::linalg::element_pow(vcl_v1, vcl_v2);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 -= pow(v1, v2);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ///////
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] = std::pow(ublas_v1[i] + ublas_v2[i], ublas_v2[i]);
+  vcl_v1 = viennacl::linalg::element_pow(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 = pow(v1 + v2, v2);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] += std::pow(ublas_v1[i] + ublas_v2[i], ublas_v2[i]);
+  vcl_v1 += viennacl::linalg::element_pow(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 += pow(v1 + v2, v2);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] -= std::pow(ublas_v1[i] + ublas_v2[i], ublas_v2[i]);
+  vcl_v1 -= viennacl::linalg::element_pow(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 -= pow(v1 + v2, v2);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ///////
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] = std::pow(ublas_v1[i], ublas_v2[i] + ublas_v1[i]);
+  vcl_v1 = viennacl::linalg::element_pow(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 = pow(v1, v2 + v1);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] += std::pow(ublas_v1[i], ublas_v2[i] + ublas_v1[i]);
+  vcl_v1 += viennacl::linalg::element_pow(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 += pow(v1, v2 + v1);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] -= std::pow(ublas_v1[i], ublas_v2[i] + ublas_v1[i]);
+  vcl_v1 -= viennacl::linalg::element_pow(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 -= pow(v1, v2 + v1);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ///////
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] = std::pow(ublas_v1[i] + ublas_v2[i], ublas_v2[i] + ublas_v1[i]);
+  vcl_v1 = viennacl::linalg::element_pow(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 = pow(v1 + v2, v2 + v1);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] += std::pow(ublas_v1[i] + ublas_v2[i], ublas_v2[i] + ublas_v1[i]);
+  vcl_v1 += viennacl::linalg::element_pow(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 += pow(v1 + v2, v2 + v1);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  ublas_v3 = ublas_v1;
+  for (std::size_t i=0; i<ublas_v3.size(); ++i)
+    ublas_v3[i] -= std::pow(ublas_v1[i] + ublas_v2[i], ublas_v2[i] + ublas_v1[i]);
+  vcl_v1 -= viennacl::linalg::element_pow(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v3, vcl_v1, epsilon) != EXIT_SUCCESS)
+  {
+    std::cerr << "** Failure in v1 -= pow(v1 + v2, v2 + v1);" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Testing unary elementwise operations..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = random<NumericT>() / NumericT(4);
+
+#define GENERATE_UNARY_OP_TEST(FUNCNAME) \
+  ublas_v2 = NumericT(3.1415) * ublas_v1; \
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin()); \
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin()); \
+  \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = std::FUNCNAME(ublas_v2[i]); \
+  vcl_v1 = viennacl::linalg::element_##FUNCNAME(vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 = " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 = viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 = " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] += std::FUNCNAME(ublas_v1[i]); \
+  vcl_v1 += viennacl::linalg::element_##FUNCNAME(vcl_v1); \
+ \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 += " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] += std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 += viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 += " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] -= std::FUNCNAME(ublas_v2[i]); \
+  vcl_v1 -= viennacl::linalg::element_##FUNCNAME(vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 -= " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] -= std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 -= viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 -= " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+
+  GENERATE_UNARY_OP_TEST(cos);
+  GENERATE_UNARY_OP_TEST(cosh);
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = random<NumericT>() / NumericT(4);
+  GENERATE_UNARY_OP_TEST(exp);
+  GENERATE_UNARY_OP_TEST(floor);
+  GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(log);
+  GENERATE_UNARY_OP_TEST(log10);
+  GENERATE_UNARY_OP_TEST(sin);
+  GENERATE_UNARY_OP_TEST(sinh);
+  GENERATE_UNARY_OP_TEST(fabs);
+  //GENERATE_UNARY_OP_TEST(abs); //OpenCL allows abs on integers only
+  GENERATE_UNARY_OP_TEST(sqrt);
+  GENERATE_UNARY_OP_TEST(tan);
+  GENERATE_UNARY_OP_TEST(tanh);
+
+  // --------------------------------------------------------------------------
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing another complicated vector expression with CPU scalars..." << std::endl;
+  ublas_v1 = ublas_v2 / alpha + beta * (ublas_v1 - alpha*ublas_v2);
+  vcl_v1   = vcl_v2 / alpha   + beta * (vcl_v1   - alpha*vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing another complicated vector expression with GPU scalars..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v2 / alpha   +     beta * (ublas_v1 - alpha*ublas_v2);
+  vcl_v1   = vcl_v2 / gpu_alpha + gpu_beta * (vcl_v1   - gpu_alpha*vcl_v2);
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing lenghty sum of scaled vectors..." << std::endl;
+  ublas_v2 = NumericT(3.1415) * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v2 / alpha   +     beta * ublas_v1 - alpha * ublas_v2 + beta * ublas_v1 - alpha * ublas_v1;
+  vcl_v1   = vcl_v2 / gpu_alpha + gpu_beta *   vcl_v1 - alpha *   vcl_v2 + beta *   vcl_v1 - alpha *   vcl_v1;
+
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+// Driver for one NumericT: builds two full-length host (uBLAS) vectors with
+// random entries, mirrors them on the ViennaCL side, carves range- and
+// slice-views out of both, checks that plain vectors can be constructed from
+// those views, and finally invokes the elementwise test() overload for every
+// (vector | range | slice) combination of the two operands.
+// Returns EXIT_SUCCESS if all sub-tests pass, EXIT_FAILURE otherwise.
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int retval = EXIT_SUCCESS;
+  // Deliberately an "odd" length (not a power of two), so kernels also get
+  // exercised on sizes that are not a multiple of common work group sizes.
+  std::size_t size = 24656;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec.size());
+
+  for (std::size_t i=0; i<ublas_full_vec.size(); ++i)
+  {
+    // Entries in (1, 2): strictly positive, keeping later divisions safe.
+    ublas_full_vec[i]  = NumericT(1.0) + random<NumericT>();
+    ublas_full_vec2[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  // Contiguous views: second quarter (r1) resp. third quarter (r2) of the
+  // full vectors; both have length size/4.
+  ublas::range r1(    ublas_full_vec.size() / 4, 2 * ublas_full_vec.size() / 4);
+  ublas::range r2(2 * ublas_full_vec2.size() / 4, 3 * ublas_full_vec2.size() / 4);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec(ublas_full_vec, r1);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec2(ublas_full_vec2, r2);
+
+  // Strided views: size/4 elements with stride 3 (s1) resp. stride 2 (s2).
+  ublas::slice s1(    ublas_full_vec.size() / 4, 3, ublas_full_vec.size() / 4);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 4, 2, ublas_full_vec2.size() / 4);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec(ublas_full_vec, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec(ublas_full_vec.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  // Use both transfer routines so each gets coverage: fast_copy requires a
+  // contiguous source buffer, copy is the generic element-wise variant.
+  viennacl::fast_copy(ublas_full_vec.begin(), ublas_full_vec.end(), vcl_full_vec.begin());
+  viennacl::copy(ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  // Device-side views mirroring r1/r2 above; they alias vcl_full_vec(2).
+  viennacl::range vcl_r1(    vcl_full_vec.size() / 4, 2 * vcl_full_vec.size() / 4);
+  viennacl::range vcl_r2(2 * vcl_full_vec2.size() / 4, 3 * vcl_full_vec2.size() / 4);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec(vcl_full_vec, vcl_r1);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec2(vcl_full_vec2, vcl_r2);
+
+  {
+    // Constructing a plain vector from a range view must deep-copy the
+    // viewed elements; verified against the equivalent uBLAS construction.
+    viennacl::vector<NumericT> vcl_short_vec(vcl_range_vec);
+    viennacl::vector<NumericT> vcl_short_vec2 = vcl_range_vec2;
+
+    ublas::vector<NumericT> ublas_short_vec(ublas_range_vec);
+    ublas::vector<NumericT> ublas_short_vec2(ublas_range_vec2);
+
+    std::cout << "Testing creation of vectors from range..." << std::endl;
+    if (check(ublas_short_vec, vcl_short_vec, epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (check(ublas_short_vec2, vcl_short_vec2, epsilon) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+  // Device-side views mirroring s1/s2 above.
+  viennacl::slice vcl_s1(    vcl_full_vec.size() / 4, 3, vcl_full_vec.size() / 4);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 4, 2, vcl_full_vec2.size() / 4);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec(vcl_full_vec, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+
+  // These plain vectors (built from the slice views) are the "vector"
+  // operands reused by all combination tests below. Only the lengths of the
+  // operands need to agree (all are size/4) — the inner test() overload
+  // presumably re-initializes its arguments' contents; TODO confirm.
+  viennacl::vector<NumericT> vcl_short_vec(vcl_slice_vec);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec(ublas_slice_vec);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec, vcl_short_vec, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  // 3x3 matrix of combinations: each ViennaCL operand is in turn a plain
+  // vector, a range view, or a slice view; the host reference is always the
+  // plain uBLAS pair. Any failing combination aborts the whole run.
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // All failures above return early, so reaching this point means success.
+  return EXIT_SUCCESS;
+}
+
+
diff --git a/tests/src/vector_int.cpp b/tests/src/vector_int.cpp
new file mode 100644
index 0000000..13e4a49
--- /dev/null
+++ b/tests/src/vector_int.cpp
@@ -0,0 +1,1523 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+// Signed difference of two host scalars; used by check() below.
+// backend::finish() blocks until all queued device operations have
+// completed, so values produced by preceding asynchronous GPU work are
+// final before the comparison. The exact (non-relative) difference is
+// intentional: this file tests integer NumericT, where results must
+// match exactly.
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+// Host scalar vs. device scalar: evaluating s1 - s2 converts the
+// viennacl::scalar to its host value (presumably a device-to-host
+// transfer — the transfer itself happens inside viennacl::scalar).
+// finish() first drains pending device work so the value read is final.
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+// Host scalar vs. a proxy referencing a single element of a ViennaCL
+// vector (entry_proxy is what vcl_v[i] yields). Evaluating s1 - s2
+// converts the proxy to a host value; finish() drains device work first.
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+// Element-wise comparison of a host vector against a ViennaCL vector
+// (or range/slice view thereof). Copies the device data back to the
+// host, then compares entry by entry with exact equality (integer
+// semantics). Returns 0 if all entries match, 1 on the first mismatch —
+// i.e. the result is a pass/fail flag, not a distance.
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if (v2_cpu[i] != v1[i])
+        return 1;
+   }
+
+   return 0;
+}
+
+// Pass/fail wrapper around diff(): returns EXIT_SUCCESS when the two
+// operands agree, EXIT_FAILURE (after printing a diagnostic) otherwise.
+// Dispatches to whichever diff() overload matches T1/T2 above.
+// NOTE(review): diff() is evaluated a second time just to print the
+// message, repeating the finish()/copy work — harmless in a test, but
+// worth knowing when reading timings.
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2)
+{
+  int retval = EXIT_SUCCESS;
+
+  if (diff(t1, t2) != 0)
+  {
+    std::cout << "# Error! Difference: " << std::abs(diff(t1, t2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename UblasVectorType, typename ViennaCLVectorType1, typename ViennaCLVectorType2 >
+int test(UblasVectorType     & ublas_v1, UblasVectorType     & ublas_v2,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2)
+{
+  int retval = EXIT_SUCCESS;
+
+  NumericT                    cpu_result = 42;
+  viennacl::scalar<NumericT>  gpu_result = 43;
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::zero_vector<NumericT>(ublas_v1.size());
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = 0;
+  vcl_v1 = viennacl::zero_vector<NumericT>(vcl_v1.size());
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), cpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), cpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), gpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result + 1;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), gpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for unit_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::unit_vector<NumericT>(ublas_v1.size(), 5);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = (i == 5) ? 1 : 0;
+  vcl_v1 = viennacl::unit_vector<NumericT>(vcl_v1.size(), 5);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(i);
+    ublas_v2[i] = NumericT(i+42);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // Part 1: Norms and inner product
+  //
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing inner_prod..." << std::endl;
+  cpu_result = viennacl::linalg::inner_prod(ublas_v1, ublas_v2);
+  NumericT cpu_result2 = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = inner_prod(ublas_v1 + ublas_v2, ublas_v2 - ublas_v1);
+  NumericT cpu_result3 = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2 - vcl_v1);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2 - vcl_v1);
+
+  if (check(cpu_result, cpu_result3) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_1..." << std::endl;
+  cpu_result = ublas::norm_1(ublas_v1);
+  gpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_1(ublas_v1);
+  cpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_1(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_1(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_inf..." << std::endl;
+  cpu_result = ublas::norm_inf(ublas_v1);
+  gpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_inf(ublas_v1);
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_inf(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing index_norm_inf..." << std::endl;
+  std::size_t cpu_index = ublas::index_norm_inf(ublas_v1);
+  std::size_t gpu_index = viennacl::linalg::index_norm_inf(vcl_v1);
+
+  if (check(static_cast<NumericT>(cpu_index), static_cast<NumericT>(gpu_index)) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  // --------------------------------------------------------------------------
+  cpu_result = ublas_v1[index_norm_inf(ublas_v1)];
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = ublas_v1[index_norm_inf(ublas_v1 + ublas_v2)];
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1 + vcl_v2)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Plane rotation and assignments
+  //
+
+  // --------------------------------------------------------------------------
+
+  ublas::vector<NumericT> x = ublas_v1;
+  ublas::vector<NumericT> y = ublas_v2;
+  ublas::vector<NumericT> t = ublas_v1;
+  t.assign (  NumericT(1) * x + NumericT(2) * y),
+  y.assign (- NumericT(2) * x + NumericT(1) * y),
+  x.assign (t);
+
+  viennacl::linalg::plane_rotation(vcl_v1, vcl_v2, NumericT(1), NumericT(2));
+
+  if (check(x, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(y, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+
+  std::cout << "Testing assignments..." << std::endl;
+  NumericT val = static_cast<NumericT>(1);
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = val;
+
+  for (size_t i=0; i < vcl_v1.size(); ++i)
+    vcl_v1(i) = val;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiplication and division of vectors by scalars
+  //
+  std::cout << "Testing scaling with CPU scalar..." << std::endl;
+  NumericT alpha = static_cast<NumericT>(3);
+  viennacl::scalar<NumericT> gpu_alpha = alpha;
+
+  ublas_v1  *= alpha;
+  vcl_v1    *= alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing scaling with GPU scalar..." << std::endl;
+  ublas_v1  *= alpha;
+  vcl_v1    *= gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  NumericT beta  = static_cast<NumericT>(2);
+  viennacl::scalar<NumericT> gpu_beta = beta;
+
+  std::cout << "Testing shrinking with CPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing shrinking with GPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // add and inplace_add of vectors
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing add on vector..." << std::endl;
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1     = ublas_v1 + ublas_v2;
+  vcl_v1       =   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing add on vector with flipsign..." << std::endl;
+  ublas_v1     = - ublas_v1 + ublas_v2;
+  vcl_v1       = -   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-add on vector..." << std::endl;
+  ublas_v1 += ublas_v2;
+  vcl_v1   +=   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // subtract and inplace_subtract of vectors
+  //
+  std::cout << "Testing sub on vector..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1     = ublas_v1 - ublas_v2;
+  vcl_v1       =   vcl_v1 -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-sub on vector..." << std::endl;
+  ublas_v1 -= ublas_v2;
+  vcl_v1   -= vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // multiply-add
+  //
+  std::cout << "Testing multiply-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + beta * ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 + beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += alpha * ublas_v2;
+  vcl_v1   += alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // division-add
+  //
+  std::cout << "Testing division-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   + vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 * beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   += vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // NOTE(review): this "(left)" stanza is byte-identical to the "(right)"
+  // stanza directly above -- the GPU scalar divisor is still applied to the
+  // right operand (vcl_v2 / gpu_alpha), so the left form is never exercised.
+  // Presumably it was meant to read vcl_v1 / gpu_alpha + vcl_v2 (mirroring
+  // the CPU "(left)" case earlier); confirm against upstream before changing.
+  std::cout << "Testing division-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-add on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-add on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  // NOTE(review): the message says "division-add" but the operation below is
+  // a multiplication (ublas_v2 * alpha / vcl_v2 * gpu_alpha). Either the
+  // label or the operator is wrong; as written this merely duplicates the
+  // inplace multiply-add coverage and v2 / gpu_alpha goes untested here.
+  std::cout << "Testing inplace division-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 * alpha;
+  vcl_v1   +=   vcl_v2 * gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // multiply-subtract
+  //
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 - ublas_v2;
+  vcl_v1   = alpha * vcl_v1   -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 - beta * ublas_v2;
+  vcl_v1   = alpha * vcl_v1   - beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-subtract on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= alpha * ublas_v2;
+  vcl_v1   -= alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 -     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // NOTE(review): identical to the "(right)" stanza above -- the GPU scalar
+  // still multiplies the right operand (gpu_alpha * vcl_v2). The left form
+  // would presumably be gpu_alpha * vcl_v1 - vcl_v2 (as in the CPU "(left)"
+  // case); confirm against upstream -- as written, left-scaling is untested.
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 -     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   = gpu_alpha * vcl_v1   - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   -= gpu_alpha * vcl_v1   + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   -= gpu_alpha * vcl_v1   - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v2;
+  vcl_v1   -= gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // division-subtract
+  //
+  std::cout << "Testing division-subtract on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-subtract on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha - ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha - ublas_v2 / alpha;
+  vcl_v1   =   vcl_v1 / alpha -   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-subtract on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-subtract on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 * alpha     + ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 * gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-subtract on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-subtract on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-subtract on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     - ublas_v2 * beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha -   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  // NOTE(review): the message says "division-subtract" but the operation is
+  // a multiplication (alpha * ublas_v2 / gpu_alpha * vcl_v2), duplicating
+  // the "inplace multiply-subtract on vector with GPU scalar" stanza above.
+  // Either the label or the operator is wrong; v2 / gpu_alpha is untested here.
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v2;
+  vcl_v1   -= gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // More complicated expressions (for ensuring the operator overloads work correctly)
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing three vector additions..." << std::endl;
+  ublas_v1 = ublas_v2 + ublas_v1 + ublas_v2;
+  vcl_v1   =   vcl_v2 +   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing complicated vector expression with CPU scalar..." << std::endl;
+  ublas_v1 = beta * (ublas_v1 - alpha * ublas_v2);
+  vcl_v1   = beta * (vcl_v1   - alpha * vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing complicated vector expression with GPU scalar..." << std::endl;
+  ublas_v1 =     beta * (ublas_v1 -     alpha * ublas_v2);
+  vcl_v1   = gpu_beta * (vcl_v1   - gpu_alpha * vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing swap..." << std::endl;
+  swap(ublas_v1, ublas_v2);
+  swap(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + NumericT(i);
+    ublas_v2[i] = NumericT(5.0) + NumericT(i);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing unary operator-..." << std::endl;
+  ublas_v1 = - ublas_v2;
+  vcl_v1   = -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  std::cout << " v1 = element_prod(v1, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1, v2);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise division..." << std::endl;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1 + i);
+    ublas_v2[i] = NumericT(5 + i);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing unary elementwise operations..." << std::endl;
+
+// Expands to a battery of checks for one unary elementwise function FUNCNAME:
+//   v1  = f(v2);        v1  = f(v1 + v2);
+//   v1 += f(v1);        v1 += f(v1 + v2);
+//   v1 -= f(v2);        v1 -= f(v1 + v2);
+// Each variant applies std::FUNCNAME per element on the uBLAS side and
+// viennacl::linalg::element_FUNCNAME on the device side, then compares via
+// check(), returning EXIT_FAILURE from the enclosing function on mismatch.
+// Must be invoked where ublas_v1/ublas_v2/vcl_v1/vcl_v2 are in scope.
+// (No comments inside the macro body: backslash-newline splicing would pull
+// the continuation into a // comment and break the definition.)
+#define GENERATE_UNARY_OP_TEST(FUNCNAME) \
+  ublas_v2 = 3 * ublas_v1; \
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin()); \
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin()); \
+  \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = std::FUNCNAME(ublas_v2[i]); \
+  vcl_v1 = viennacl::linalg::element_##FUNCNAME(vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 = " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 = viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 = " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] += std::FUNCNAME(ublas_v1[i]); \
+  vcl_v1 += viennacl::linalg::element_##FUNCNAME(vcl_v1); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 += " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] += std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 += viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 += " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] -= std::FUNCNAME(ublas_v2[i]); \
+  vcl_v1 -= viennacl::linalg::element_##FUNCNAME(vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 -= " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] -= std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 -= viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 -= " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+
+  //GENERATE_UNARY_OP_TEST(cos);
+  //GENERATE_UNARY_OP_TEST(cosh);
+  //GENERATE_UNARY_OP_TEST(exp);
+  //GENERATE_UNARY_OP_TEST(floor);
+  //GENERATE_UNARY_OP_TEST(fabs);
+  //GENERATE_UNARY_OP_TEST(log);
+  //GENERATE_UNARY_OP_TEST(log10);
+  //GENERATE_UNARY_OP_TEST(sin);
+  //GENERATE_UNARY_OP_TEST(sinh);
+  //GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(abs);
+  //GENERATE_UNARY_OP_TEST(sqrt);
+  //GENERATE_UNARY_OP_TEST(tan);
+  //GENERATE_UNARY_OP_TEST(tanh);
+
+  // NOTE(review): typo in the runtime message -- "lenghty" should be
+  // "lengthy" (string is program output; not corrected in this doc-only pass).
+  // Also: the vcl expression mixes GPU scalars (gpu_alpha, gpu_beta) in the
+  // first two terms with CPU scalars (alpha, beta) in the rest while the
+  // uBLAS reference uses CPU scalars throughout -- presumably deliberate
+  // coverage of mixed scalar-type operator overloads; confirm with upstream.
+  std::cout << "Testing lenghty sum of scaled vectors..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v2 / alpha   +     beta * ublas_v1 - alpha * ublas_v2 + beta * ublas_v1 - alpha * ublas_v1;
+  vcl_v1   = vcl_v2 / gpu_alpha + gpu_beta *   vcl_v1 - alpha *   vcl_v2 + beta *   vcl_v1 - alpha *   vcl_v1;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+template< typename NumericT >
+int test()
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 12345;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec.size());
+
+  for (std::size_t i=0; i<ublas_full_vec.size(); ++i)
+  {
+    ublas_full_vec[i]  = NumericT(1.0 + i);
+    ublas_full_vec2[i] = NumericT(2.0 + i / 2);
+  }
+
+  ublas::range r1(    ublas_full_vec.size() / 4, 2 * ublas_full_vec.size() / 4);
+  ublas::range r2(2 * ublas_full_vec2.size() / 4, 3 * ublas_full_vec2.size() / 4);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec(ublas_full_vec, r1);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec2(ublas_full_vec2, r2);
+
+  ublas::slice s1(    ublas_full_vec.size() / 4, 3, ublas_full_vec.size() / 4);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 4, 2, ublas_full_vec2.size() / 4);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec(ublas_full_vec, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec(ublas_full_vec.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  viennacl::fast_copy(ublas_full_vec.begin(), ublas_full_vec.end(), vcl_full_vec.begin());
+  viennacl::copy(ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  viennacl::range vcl_r1(    vcl_full_vec.size() / 4, 2 * vcl_full_vec.size() / 4);
+  viennacl::range vcl_r2(2 * vcl_full_vec2.size() / 4, 3 * vcl_full_vec2.size() / 4);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec(vcl_full_vec, vcl_r1);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec2(vcl_full_vec2, vcl_r2);
+
+  {
+    viennacl::vector<NumericT> vcl_short_vec(vcl_range_vec);
+    viennacl::vector<NumericT> vcl_short_vec2 = vcl_range_vec2;
+
+    ublas::vector<NumericT> ublas_short_vec(ublas_range_vec);
+    ublas::vector<NumericT> ublas_short_vec2(ublas_range_vec2);
+
+    std::cout << "Testing creation of vectors from range..." << std::endl;
+    if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+  viennacl::slice vcl_s1(    vcl_full_vec.size() / 4, 3, vcl_full_vec.size() / 4);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 4, 2, vcl_full_vec2.size() / 4);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec(vcl_full_vec, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+
+  viennacl::vector<NumericT> vcl_short_vec(vcl_slice_vec);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec(ublas_slice_vec);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector with Integer types" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: int" << std::endl;
+    retval = test<int>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: long" << std::endl;
+    retval = test<long>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/vector_int.cu b/tests/src/vector_int.cu
new file mode 100644
index 0000000..13e4a49
--- /dev/null
+++ b/tests/src/vector_int.cu
@@ -0,0 +1,1523 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if (v2_cpu[i] != v1[i])
+        return 1;
+   }
+
+   return 0;
+}
+
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2)
+{
+  int retval = EXIT_SUCCESS;
+
+  if (diff(t1, t2) != 0)
+  {
+    std::cout << "# Error! Difference: " << std::abs(diff(t1, t2)) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename UblasVectorType, typename ViennaCLVectorType1, typename ViennaCLVectorType2 >
+int test(UblasVectorType     & ublas_v1, UblasVectorType     & ublas_v2,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2)
+{
+  int retval = EXIT_SUCCESS;
+
+  NumericT                    cpu_result = 42;
+  viennacl::scalar<NumericT>  gpu_result = 43;
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::zero_vector<NumericT>(ublas_v1.size());
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = 0;
+  vcl_v1 = viennacl::zero_vector<NumericT>(vcl_v1.size());
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), cpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), cpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), gpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result + 1;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), gpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for unit_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::unit_vector<NumericT>(ublas_v1.size(), 5);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = (i == 5) ? 1 : 0;
+  vcl_v1 = viennacl::unit_vector<NumericT>(vcl_v1.size(), 5);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(i);
+    ublas_v2[i] = NumericT(i+42);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // Part 1: Norms and inner product
+  //
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing inner_prod..." << std::endl;
+  cpu_result = viennacl::linalg::inner_prod(ublas_v1, ublas_v2);
+  NumericT cpu_result2 = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = inner_prod(ublas_v1 + ublas_v2, ublas_v2 - ublas_v1);
+  NumericT cpu_result3 = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2 - vcl_v1);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, vcl_v2 - vcl_v1);
+
+  if (check(cpu_result, cpu_result3) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_1..." << std::endl;
+  cpu_result = ublas::norm_1(ublas_v1);
+  gpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_1(ublas_v1);
+  cpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_1(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_1(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_inf..." << std::endl;
+  cpu_result = ublas::norm_inf(ublas_v1);
+  gpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = 2 * cpu_result; //reset
+  gpu_result = ublas::norm_inf(ublas_v1);
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  gpu_result = ublas::norm_inf(ublas_v1 + ublas_v2);
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing index_norm_inf..." << std::endl;
+  std::size_t cpu_index = ublas::index_norm_inf(ublas_v1);
+  std::size_t gpu_index = viennacl::linalg::index_norm_inf(vcl_v1);
+
+  if (check(static_cast<NumericT>(cpu_index), static_cast<NumericT>(gpu_index)) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  // --------------------------------------------------------------------------
+  cpu_result = ublas_v1[index_norm_inf(ublas_v1)];
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = ublas_v1[index_norm_inf(ublas_v1 + ublas_v2)];
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1 + vcl_v2)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Plane rotation and assignments
+  //
+
+  // --------------------------------------------------------------------------
+
+  ublas::vector<NumericT> x = ublas_v1;
+  ublas::vector<NumericT> y = ublas_v2;
+  ublas::vector<NumericT> t = ublas_v1;
+  t.assign (  NumericT(1) * x + NumericT(2) * y),
+  y.assign (- NumericT(2) * x + NumericT(1) * y),
+  x.assign (t);
+
+  viennacl::linalg::plane_rotation(vcl_v1, vcl_v2, NumericT(1), NumericT(2));
+
+  if (check(x, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(y, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+
+  std::cout << "Testing assignments..." << std::endl;
+  NumericT val = static_cast<NumericT>(1);
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = val;
+
+  for (size_t i=0; i < vcl_v1.size(); ++i)
+    vcl_v1(i) = val;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiplication and division of vectors by scalars
+  //
+  std::cout << "Testing scaling with CPU scalar..." << std::endl;
+  NumericT alpha = static_cast<NumericT>(3);
+  viennacl::scalar<NumericT> gpu_alpha = alpha;
+
+  ublas_v1  *= alpha;
+  vcl_v1    *= alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing scaling with GPU scalar..." << std::endl;
+  ublas_v1  *= alpha;
+  vcl_v1    *= gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  NumericT beta  = static_cast<NumericT>(2);
+  viennacl::scalar<NumericT> gpu_beta = beta;
+
+  std::cout << "Testing shrinking with CPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing shrinking with GPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // add and inplace_add of vectors
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing add on vector..." << std::endl;
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1     = ublas_v1 + ublas_v2;
+  vcl_v1       =   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing add on vector with flipsign..." << std::endl;
+  ublas_v1     = - ublas_v1 + ublas_v2;
+  vcl_v1       = -   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-add on vector..." << std::endl;
+  ublas_v1 += ublas_v2;
+  vcl_v1   +=   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // subtract and inplace_subtract of vectors
+  //
+  std::cout << "Testing sub on vector..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1     = ublas_v1 - ublas_v2;
+  vcl_v1       =   vcl_v1 -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-sub on vector..." << std::endl;
+  ublas_v1 -= ublas_v2;
+  vcl_v1   -= vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // multiply-add
+  //
+  std::cout << "Testing multiply-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + beta * ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 + beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += alpha * ublas_v2;
+  vcl_v1   += alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // division-add
+  //
+  std::cout << "Testing division-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   + vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 * beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   += vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-add on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-add on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   +=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // multiply-subtract
+  //
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 - ublas_v2;
+  vcl_v1   = alpha * vcl_v1   -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 - beta * ublas_v2;
+  vcl_v1   = alpha * vcl_v1   - beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-subtract on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= alpha * ublas_v2;
+  vcl_v1   -= alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 -     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 -     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   - gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-subtract on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   = gpu_alpha * vcl_v1   - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   -= gpu_alpha * vcl_v1   + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v1 -     beta * ublas_v2;
+  vcl_v1   -= gpu_alpha * vcl_v1   - gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v2;
+  vcl_v1   -= gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // division-subtract
+  //
+  std::cout << "Testing division-subtract on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-subtract on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha - ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha - ublas_v2 / alpha;
+  vcl_v1   =   vcl_v1 / alpha -   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-subtract on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v2 / alpha;
+  vcl_v1   -=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 - ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   -   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-subtract on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar (both, subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     - ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     - ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha -   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-subtract on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 * alpha     + ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 * gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-subtract on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace multiply-division-subtract on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 * alpha     - ublas_v2 / beta;
+  vcl_v1   -=   vcl_v1 * gpu_alpha -   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-subtract on vector with GPU scalar (subtracting)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -= ublas_v1 / alpha     - ublas_v2 * beta;
+  vcl_v1   -=   vcl_v1 / gpu_alpha -   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-subtract on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 -=     alpha * ublas_v2;
+  vcl_v1   -= gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  //
+  // More complicated expressions (for ensuring the operator overloads work correctly)
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing three vector additions..." << std::endl;
+  ublas_v1 = ublas_v2 + ublas_v1 + ublas_v2;
+  vcl_v1   =   vcl_v2 +   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing complicated vector expression with CPU scalar..." << std::endl;
+  ublas_v1 = beta * (ublas_v1 - alpha * ublas_v2);
+  vcl_v1   = beta * (vcl_v1   - alpha * vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing complicated vector expression with GPU scalar..." << std::endl;
+  ublas_v1 =     beta * (ublas_v1 -     alpha * ublas_v2);
+  vcl_v1   = gpu_beta * (vcl_v1   - gpu_alpha * vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing swap..." << std::endl;
+  swap(ublas_v1, ublas_v2);
+  swap(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + NumericT(i);
+    ublas_v2[i] = NumericT(5.0) + NumericT(i);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing unary operator-..." << std::endl;
+  ublas_v1 = - ublas_v2;
+  vcl_v1   = -   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  std::cout << " v1 = element_prod(v1, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1, v2);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 -= element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 -= ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise division..." << std::endl;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1 + i);
+    ublas_v2[i] = NumericT(5 + i);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 -= ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 -= viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing unary elementwise operations..." << std::endl;
+
+#define GENERATE_UNARY_OP_TEST(FUNCNAME) \
+  ublas_v2 = 3 * ublas_v1; \
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin()); \
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin()); \
+  \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = std::FUNCNAME(ublas_v2[i]); \
+  vcl_v1 = viennacl::linalg::element_##FUNCNAME(vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 = " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] = std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 = viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 = " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] += std::FUNCNAME(ublas_v1[i]); \
+  vcl_v1 += viennacl::linalg::element_##FUNCNAME(vcl_v1); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 += " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] += std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 += viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 += " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] -= std::FUNCNAME(ublas_v2[i]); \
+  vcl_v1 -= viennacl::linalg::element_##FUNCNAME(vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 -= " << #FUNCNAME << "(v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+ \
+  for (std::size_t i=0; i<ublas_v1.size(); ++i) \
+    ublas_v1[i] -= std::FUNCNAME(ublas_v1[i] + ublas_v2[i]); \
+  vcl_v1 -= viennacl::linalg::element_##FUNCNAME(vcl_v1 + vcl_v2); \
+ \
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS) \
+  { \
+    std::cout << "Failure at v1 -= " << #FUNCNAME << "(v1 + v2)" << std::endl; \
+    return EXIT_FAILURE; \
+  } \
+
+  //GENERATE_UNARY_OP_TEST(cos);
+  //GENERATE_UNARY_OP_TEST(cosh);
+  //GENERATE_UNARY_OP_TEST(exp);
+  //GENERATE_UNARY_OP_TEST(floor);
+  //GENERATE_UNARY_OP_TEST(fabs);
+  //GENERATE_UNARY_OP_TEST(log);
+  //GENERATE_UNARY_OP_TEST(log10);
+  //GENERATE_UNARY_OP_TEST(sin);
+  //GENERATE_UNARY_OP_TEST(sinh);
+  //GENERATE_UNARY_OP_TEST(fabs);
+  GENERATE_UNARY_OP_TEST(abs);
+  //GENERATE_UNARY_OP_TEST(sqrt);
+  //GENERATE_UNARY_OP_TEST(tan);
+  //GENERATE_UNARY_OP_TEST(tanh);
+
+  std::cout << "Testing lenghty sum of scaled vectors..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v2 / alpha   +     beta * ublas_v1 - alpha * ublas_v2 + beta * ublas_v1 - alpha * ublas_v1;
+  vcl_v1   = vcl_v2 / gpu_alpha + gpu_beta *   vcl_v1 - alpha *   vcl_v2 + beta *   vcl_v1 - alpha *   vcl_v1;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+// Driver for one numeric type: builds matched uBLAS reference vectors and
+// ViennaCL device vectors, derives range and slice proxies of each, verifies
+// that plain vectors constructed from those proxies copy correctly, and then
+// runs the full operation test suite for every {vector, range, slice} pairing
+// of the two ViennaCL operands. Returns EXIT_SUCCESS only if all pass.
+template< typename NumericT >
+int test()
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 12345;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec.size());
+
+  for (std::size_t i=0; i<ublas_full_vec.size(); ++i)
+  {
+    ublas_full_vec[i]  = NumericT(1.0 + i);
+    ublas_full_vec2[i] = NumericT(2.0 + i / 2);
+  }
+
+  // r1 covers the second quarter of vec1; r2 the third quarter of vec2.
+  ublas::range r1(    ublas_full_vec.size() / 4, 2 * ublas_full_vec.size() / 4);
+  ublas::range r2(2 * ublas_full_vec2.size() / 4, 3 * ublas_full_vec2.size() / 4);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec(ublas_full_vec, r1);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec2(ublas_full_vec2, r2);
+
+  // Slices with strides 3 and 2 respectively, each holding size/4 elements.
+  ublas::slice s1(    ublas_full_vec.size() / 4, 3, ublas_full_vec.size() / 4);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 4, 2, ublas_full_vec2.size() / 4);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec(ublas_full_vec, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec(ublas_full_vec.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  // Exercise both host-to-device transfer paths (fast_copy and copy).
+  viennacl::fast_copy(ublas_full_vec.begin(), ublas_full_vec.end(), vcl_full_vec.begin());
+  viennacl::copy(ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  // ViennaCL proxies mirror the uBLAS ranges above.
+  viennacl::range vcl_r1(    vcl_full_vec.size() / 4, 2 * vcl_full_vec.size() / 4);
+  viennacl::range vcl_r2(2 * vcl_full_vec2.size() / 4, 3 * vcl_full_vec2.size() / 4);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec(vcl_full_vec, vcl_r1);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec2(vcl_full_vec2, vcl_r2);
+
+  {
+    // Scoped check: constructing/assigning a plain vector from a range proxy
+    // must yield the same data as the uBLAS reference construction.
+    viennacl::vector<NumericT> vcl_short_vec(vcl_range_vec);
+    viennacl::vector<NumericT> vcl_short_vec2 = vcl_range_vec2;
+
+    ublas::vector<NumericT> ublas_short_vec(ublas_range_vec);
+    ublas::vector<NumericT> ublas_short_vec2(ublas_range_vec2);
+
+    std::cout << "Testing creation of vectors from range..." << std::endl;
+    if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+  viennacl::slice vcl_s1(    vcl_full_vec.size() / 4, 3, vcl_full_vec.size() / 4);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 4, 2, vcl_full_vec2.size() / 4);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec(vcl_full_vec, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+
+  // These short vectors (built from the slice proxies) act as the plain
+  // "vector" operands in the combination tests below.
+  viennacl::vector<NumericT> vcl_short_vec(vcl_slice_vec);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec(ublas_slice_vec);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  // (all nine {vector, range, slice} pairings of the two ViennaCL operands;
+  //  the uBLAS reference pair stays fixed)
+  //
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+// Runs the integer-type vector test suite: first for int, then for long.
+// Stops and propagates the failing return code as soon as one type fails.
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector with Integer types" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    // First pass: 32-bit (typically) signed integers.
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: int" << std::endl;
+    retval = test<int>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    // Second pass: long — only reached if the int pass succeeded.
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: long" << std::endl;
+    retval = test<long>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/vector_multi_inner_prod.cpp b/tests/src/vector_multi_inner_prod.cpp
new file mode 100644
index 0000000..d172b6e
--- /dev/null
+++ b/tests/src/vector_multi_inner_prod.cpp
@@ -0,0 +1,584 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   ublas::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return ublas::norm_inf(v2_cpu);
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(ublas::vector_slice<ublas::vector<ScalarType> > const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   ublas::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return ublas::norm_inf(v2_cpu);
+}
+
+
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2, double epsilon)
+{
+  int retval = EXIT_SUCCESS;
+
+  double temp = std::fabs(diff(t1, t2));
+  if (temp > epsilon)
+  {
+    std::cout << "# Error! Relative difference: " << temp << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon,
+          typename UblasVectorType1,    typename UblasVectorType2,    typename UblasVectorType3,    typename UblasVectorType4,
+          typename ViennaCLVectorType1, typename ViennaCLVectorType2, typename ViennaCLVectorType3, typename ViennaCLVectorType4 >
+int test(Epsilon const& epsilon,
+         UblasVectorType1    & ublas_v1, UblasVectorType2    & ublas_v2, UblasVectorType3    & ublas_v3, UblasVectorType4    & ublas_v4,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2, ViennaCLVectorType3 &   vcl_v3, ViennaCLVectorType4 &   vcl_v4)
+{
+  int retval = EXIT_SUCCESS;
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v3[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v4[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+  viennacl::copy(ublas_v3.begin(), ublas_v3.end(), vcl_v3.begin());
+  viennacl::copy(ublas_v4.begin(), ublas_v4.end(), vcl_v4.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v3, vcl_v3, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v4, vcl_v4, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas::vector<NumericT> ref_result = ublas::scalar_vector<NumericT>(40, 0.0);
+  viennacl::vector<NumericT> result = viennacl::scalar_vector<NumericT>(40, 0.0);
+
+  std::cout << "Testing inner_prod with two vectors..." << std::endl;
+  ref_result(2) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v2);
+  viennacl::project(result, viennacl::slice(2, 3, 2)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(3) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(7) = ublas::inner_prod(ublas_v1, ublas_v4);
+  viennacl::project(result, viennacl::slice(3, 4, 2)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v4));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing inner_prod with three vectors..." << std::endl;
+  ref_result(1) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(3) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v3);
+  viennacl::project(result, viennacl::slice(1, 2, 3)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2, vcl_v3));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(2)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(6)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(10) = ublas::inner_prod(ublas_v1, ublas_v4);
+  viennacl::project(result, viennacl::slice(2, 4, 3)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v2, vcl_v4));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Testing inner_prod with four vectors..." << std::endl;
+  ref_result(4) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(6) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(7) = ublas::inner_prod(ublas_v1, ublas_v4);
+  viennacl::project(result, viennacl::slice(4, 1, 4)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2, vcl_v3, vcl_v4));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(3)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(6)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(9)  = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(12) = ublas::inner_prod(ublas_v1, ublas_v1);
+  viennacl::project(result, viennacl::slice(3, 3, 4)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v2, vcl_v4, vcl_v1));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Testing inner_prod with five vectors..." << std::endl;
+  ref_result(1) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(3) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(7) = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(9) = ublas::inner_prod(ublas_v1, ublas_v2);
+  viennacl::project(result, viennacl::slice(1, 2, 5)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2, vcl_v3, vcl_v4, vcl_v2));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(2)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(4)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(6)  = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(8)  = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(10) = ublas::inner_prod(ublas_v1, ublas_v2);
+  viennacl::project(result, viennacl::slice(2, 2, 5)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v2, vcl_v4, vcl_v1, vcl_v2));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing inner_prod with eight vectors..." << std::endl;
+  ref_result(1)  = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(5)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(9)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(13) = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(17) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(21) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(25) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(29) = ublas::inner_prod(ublas_v1, ublas_v2);
+  std::vector<viennacl::vector_base<NumericT> const *> vecs1(8);
+  vecs1[0] = &vcl_v1;
+  vecs1[1] = &vcl_v2;
+  vecs1[2] = &vcl_v3;
+  vecs1[3] = &vcl_v4;
+  vecs1[4] = &vcl_v3;
+  vecs1[5] = &vcl_v2;
+  vecs1[6] = &vcl_v1;
+  vecs1[7] = &vcl_v2;
+  viennacl::vector_tuple<NumericT> tuple1(vecs1);
+  viennacl::project(result, viennacl::slice(1, 4, 8)) = viennacl::linalg::inner_prod(vcl_v1, tuple1);
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(3)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(5)  = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(7)  = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(9)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(11) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(13) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(15) = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(17) = ublas::inner_prod(ublas_v1, ublas_v2);
+  std::vector<viennacl::vector_base<NumericT> const *> vecs2(8);
+  vecs2[0] = &vcl_v2;
+  vecs2[1] = &vcl_v4;
+  vecs2[2] = &vcl_v1;
+  vecs2[3] = &vcl_v2;
+  vecs2[4] = &vcl_v2;
+  vecs2[5] = &vcl_v1;
+  vecs2[6] = &vcl_v4;
+  vecs2[7] = &vcl_v2;
+  viennacl::vector_tuple<NumericT> tuple2(vecs2);
+  viennacl::project(result, viennacl::slice(3, 2, 8)) = viennacl::linalg::inner_prod(vcl_v1, tuple2);
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 8 * 1337;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec1(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec1.size());
+
+  for (std::size_t i=0; i<ublas_full_vec1.size(); ++i)
+  {
+    ublas_full_vec1[i]  = NumericT(1.0) + random<NumericT>();
+    ublas_full_vec2[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  ublas::slice s1(    ublas_full_vec1.size() / 8, 3, ublas_full_vec1.size() / 8);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 8, 1, ublas_full_vec2.size() / 8);
+  ublas::slice s3(4 * ublas_full_vec1.size() / 8, 2, ublas_full_vec1.size() / 8);
+  ublas::slice s4(3 * ublas_full_vec2.size() / 8, 4, ublas_full_vec2.size() / 8);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec1(ublas_full_vec1, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec3(ublas_full_vec1, s3);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec4(ublas_full_vec2, s4);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec1(ublas_full_vec1.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  viennacl::fast_copy(ublas_full_vec1.begin(), ublas_full_vec1.end(), vcl_full_vec1.begin());
+  viennacl::copy     (ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  viennacl::slice vcl_s1(    vcl_full_vec1.size() / 8, 3, vcl_full_vec1.size() / 8);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 8, 1, vcl_full_vec2.size() / 8);
+  viennacl::slice vcl_s3(4 * vcl_full_vec1.size() / 8, 2, vcl_full_vec1.size() / 8);
+  viennacl::slice vcl_s4(3 * vcl_full_vec2.size() / 8, 4, vcl_full_vec2.size() / 8);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec1(vcl_full_vec1, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec3(vcl_full_vec1, vcl_s3);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec4(vcl_full_vec2, vcl_s4);
+
+  viennacl::vector<NumericT> vcl_short_vec1(vcl_slice_vec1);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+  viennacl::vector<NumericT> vcl_short_vec3 = vcl_slice_vec2 + vcl_slice_vec1;
+  viennacl::vector<NumericT> vcl_short_vec4 = vcl_short_vec1 + vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec1(ublas_slice_vec1);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+  ublas::vector<NumericT> ublas_short_vec3 = ublas_slice_vec2 + ublas_slice_vec1;
+  ublas::vector<NumericT> ublas_short_vec4 = ublas_short_vec1 + ublas_slice_vec2;
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec1, vcl_short_vec1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec3, vcl_short_vec3, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec4, vcl_short_vec4, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  std::cout << " ** [vector|vector|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|vector|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|vector|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|vector|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //////////////////
+
+
+  std::cout << " ** [slice|vector|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|vector|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|vector|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|vector|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Vector multiple inner products" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = static_cast<NumericT>(1.0E-4);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-12;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         retval = test<NumericT>(epsilon);
+         if( retval == EXIT_SUCCESS )
+           std::cout << "# Test passed" << std::endl;
+         else
+           return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/vector_multi_inner_prod.cu b/tests/src/vector_multi_inner_prod.cu
new file mode 100644
index 0000000..d172b6e
--- /dev/null
+++ b/tests/src/vector_multi_inner_prod.cu
@@ -0,0 +1,584 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+   viennacl::backend::finish();
+   if (s1 != s2)
+      return (s1 - s2) / std::max(std::fabs(s1), std::fabs(s2));
+   return 0;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   ublas::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return ublas::norm_inf(v2_cpu);
+}
+
+template <typename ScalarType, typename ViennaCLVectorType>
+ScalarType diff(ublas::vector_slice<ublas::vector<ScalarType> > const & v1, ViennaCLVectorType const & vcl_vec)
+{
+   ublas::vector<ScalarType> v2_cpu(vcl_vec.size());
+   viennacl::backend::finish();
+   viennacl::copy(vcl_vec, v2_cpu);
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
+         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
+      else
+         v2_cpu[i] = 0.0;
+   }
+
+   return ublas::norm_inf(v2_cpu);
+}
+
+
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2, double epsilon)
+{
+  int retval = EXIT_SUCCESS;
+
+  double temp = std::fabs(diff(t1, t2));
+  if (temp > epsilon)
+  {
+    std::cout << "# Error! Relative difference: " << temp << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename Epsilon,
+          typename UblasVectorType1,    typename UblasVectorType2,    typename UblasVectorType3,    typename UblasVectorType4,
+          typename ViennaCLVectorType1, typename ViennaCLVectorType2, typename ViennaCLVectorType3, typename ViennaCLVectorType4 >
+int test(Epsilon const& epsilon,
+         UblasVectorType1    & ublas_v1, UblasVectorType2    & ublas_v2, UblasVectorType3    & ublas_v3, UblasVectorType4    & ublas_v4,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2, ViennaCLVectorType3 &   vcl_v3, ViennaCLVectorType4 &   vcl_v4)
+{
+  int retval = EXIT_SUCCESS;
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v2[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v3[i] = NumericT(1.0) + random<NumericT>();
+    ublas_v4[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+  viennacl::copy(ublas_v3.begin(), ublas_v3.end(), vcl_v3.begin());
+  viennacl::copy(ublas_v4.begin(), ublas_v4.end(), vcl_v4.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v3, vcl_v3, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v4, vcl_v4, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas::vector<NumericT> ref_result = ublas::scalar_vector<NumericT>(40, 0.0);
+  viennacl::vector<NumericT> result = viennacl::scalar_vector<NumericT>(40, 0.0);
+
+  std::cout << "Testing inner_prod with two vectors..." << std::endl;
+  ref_result(2) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v2);
+  viennacl::project(result, viennacl::slice(2, 3, 2)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(3) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(7) = ublas::inner_prod(ublas_v1, ublas_v4);
+  viennacl::project(result, viennacl::slice(3, 4, 2)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v4));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing inner_prod with three vectors..." << std::endl;
+  ref_result(1) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(3) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v3);
+  viennacl::project(result, viennacl::slice(1, 2, 3)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2, vcl_v3));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(2)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(6)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(10) = ublas::inner_prod(ublas_v1, ublas_v4);
+  viennacl::project(result, viennacl::slice(2, 4, 3)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v2, vcl_v4));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Testing inner_prod with four vectors..." << std::endl;
+  ref_result(4) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(6) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(7) = ublas::inner_prod(ublas_v1, ublas_v4);
+  viennacl::project(result, viennacl::slice(4, 1, 4)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2, vcl_v3, vcl_v4));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(3)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(6)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(9)  = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(12) = ublas::inner_prod(ublas_v1, ublas_v1);
+  viennacl::project(result, viennacl::slice(3, 3, 4)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v2, vcl_v4, vcl_v1));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "Testing inner_prod with five vectors..." << std::endl;
+  ref_result(1) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(3) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(5) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(7) = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(9) = ublas::inner_prod(ublas_v1, ublas_v2);
+  viennacl::project(result, viennacl::slice(1, 2, 5)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v1, vcl_v2, vcl_v3, vcl_v4, vcl_v2));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(2)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(4)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(6)  = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(8)  = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(10) = ublas::inner_prod(ublas_v1, ublas_v2);
+  viennacl::project(result, viennacl::slice(2, 2, 5)) = viennacl::linalg::inner_prod(vcl_v1, viennacl::tie(vcl_v3, vcl_v2, vcl_v4, vcl_v1, vcl_v2));
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+
+  std::cout << "Testing inner_prod with eight vectors..." << std::endl;
+  ref_result(1)  = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(5)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(9)  = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(13) = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(17) = ublas::inner_prod(ublas_v1, ublas_v3);
+  ref_result(21) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(25) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(29) = ublas::inner_prod(ublas_v1, ublas_v2);
+  std::vector<viennacl::vector_base<NumericT> const *> vecs1(8);
+  vecs1[0] = &vcl_v1;
+  vecs1[1] = &vcl_v2;
+  vecs1[2] = &vcl_v3;
+  vecs1[3] = &vcl_v4;
+  vecs1[4] = &vcl_v3;
+  vecs1[5] = &vcl_v2;
+  vecs1[6] = &vcl_v1;
+  vecs1[7] = &vcl_v2;
+  viennacl::vector_tuple<NumericT> tuple1(vecs1);
+  viennacl::project(result, viennacl::slice(1, 4, 8)) = viennacl::linalg::inner_prod(vcl_v1, tuple1);
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  ref_result(3)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(5)  = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(7)  = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(9)  = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(11) = ublas::inner_prod(ublas_v1, ublas_v2);
+  ref_result(13) = ublas::inner_prod(ublas_v1, ublas_v1);
+  ref_result(15) = ublas::inner_prod(ublas_v1, ublas_v4);
+  ref_result(17) = ublas::inner_prod(ublas_v1, ublas_v2);
+  std::vector<viennacl::vector_base<NumericT> const *> vecs2(8);
+  vecs2[0] = &vcl_v2;
+  vecs2[1] = &vcl_v4;
+  vecs2[2] = &vcl_v1;
+  vecs2[3] = &vcl_v2;
+  vecs2[4] = &vcl_v2;
+  vecs2[5] = &vcl_v1;
+  vecs2[6] = &vcl_v4;
+  vecs2[7] = &vcl_v2;
+  viennacl::vector_tuple<NumericT> tuple2(vecs2);
+  viennacl::project(result, viennacl::slice(3, 2, 8)) = viennacl::linalg::inner_prod(vcl_v1, tuple2);
+  if (check(ref_result, result, epsilon) != EXIT_SUCCESS)
+  {
+    std::cout << ref_result << std::endl;
+    std::cout << result << std::endl;
+    return EXIT_FAILURE;
+  }
+
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+template< typename NumericT, typename Epsilon >
+int test(Epsilon const& epsilon)
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 8 * 1337;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec1(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec1.size());
+
+  for (std::size_t i=0; i<ublas_full_vec1.size(); ++i)
+  {
+    ublas_full_vec1[i]  = NumericT(1.0) + random<NumericT>();
+    ublas_full_vec2[i] = NumericT(1.0) + random<NumericT>();
+  }
+
+  ublas::slice s1(    ublas_full_vec1.size() / 8, 3, ublas_full_vec1.size() / 8);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 8, 1, ublas_full_vec2.size() / 8);
+  ublas::slice s3(4 * ublas_full_vec1.size() / 8, 2, ublas_full_vec1.size() / 8);
+  ublas::slice s4(3 * ublas_full_vec2.size() / 8, 4, ublas_full_vec2.size() / 8);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec1(ublas_full_vec1, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec3(ublas_full_vec1, s3);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec4(ublas_full_vec2, s4);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec1(ublas_full_vec1.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  viennacl::fast_copy(ublas_full_vec1.begin(), ublas_full_vec1.end(), vcl_full_vec1.begin());
+  viennacl::copy     (ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  viennacl::slice vcl_s1(    vcl_full_vec1.size() / 8, 3, vcl_full_vec1.size() / 8);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 8, 1, vcl_full_vec2.size() / 8);
+  viennacl::slice vcl_s3(4 * vcl_full_vec1.size() / 8, 2, vcl_full_vec1.size() / 8);
+  viennacl::slice vcl_s4(3 * vcl_full_vec2.size() / 8, 4, vcl_full_vec2.size() / 8);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec1(vcl_full_vec1, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec3(vcl_full_vec1, vcl_s3);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec4(vcl_full_vec2, vcl_s4);
+
+  viennacl::vector<NumericT> vcl_short_vec1(vcl_slice_vec1);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+  viennacl::vector<NumericT> vcl_short_vec3 = vcl_slice_vec2 + vcl_slice_vec1;
+  viennacl::vector<NumericT> vcl_short_vec4 = vcl_short_vec1 + vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec1(ublas_slice_vec1);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+  ublas::vector<NumericT> ublas_short_vec3 = ublas_slice_vec2 + ublas_slice_vec1;
+  ublas::vector<NumericT> ublas_short_vec4 = ublas_short_vec1 + ublas_slice_vec2;
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec1, vcl_short_vec1, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec3, vcl_short_vec3, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec4, vcl_short_vec4, epsilon) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  std::cout << " ** [vector|vector|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|vector|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|vector|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|vector|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [vector|slice|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_short_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_short_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //////////////////
+
+
+  std::cout << " ** [slice|vector|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|vector|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|vector|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|vector|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_short_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_short_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|vector|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|vector|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_short_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_short_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|slice|vector] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_short_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_short_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** [slice|slice|slice|slice] **" << std::endl;
+  retval = test<NumericT>(epsilon,
+                          ublas_slice_vec1, ublas_slice_vec2, ublas_slice_vec2, ublas_slice_vec2,
+                            vcl_slice_vec1,   vcl_slice_vec2,   vcl_slice_vec3,   vcl_slice_vec4);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "## Test :: Vector multiple inner products" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+
+   int retval = EXIT_SUCCESS;
+
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+   {
+      typedef float NumericT;
+      NumericT epsilon = static_cast<NumericT>(1.0E-4);
+      std::cout << "# Testing setup:" << std::endl;
+      std::cout << "  eps:     " << epsilon << std::endl;
+      std::cout << "  numeric: float" << std::endl;
+      retval = test<NumericT>(epsilon);
+      if( retval == EXIT_SUCCESS )
+         std::cout << "# Test passed" << std::endl;
+      else
+         return retval;
+   }
+   std::cout << std::endl;
+   std::cout << "----------------------------------------------" << std::endl;
+   std::cout << std::endl;
+#ifdef VIENNACL_WITH_OPENCL
+   if( viennacl::ocl::current_device().double_support() )
+#endif
+   {
+      {
+         typedef double NumericT;
+         NumericT epsilon = 1.0E-12;
+         std::cout << "# Testing setup:" << std::endl;
+         std::cout << "  eps:     " << epsilon << std::endl;
+         std::cout << "  numeric: double" << std::endl;
+         retval = test<NumericT>(epsilon);
+         if( retval == EXIT_SUCCESS )
+           std::cout << "# Test passed" << std::endl;
+         else
+           return retval;
+      }
+      std::cout << std::endl;
+      std::cout << "----------------------------------------------" << std::endl;
+      std::cout << std::endl;
+   }
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+
+   return retval;
+}
diff --git a/tests/src/vector_range.cpp b/tests/src/vector_range.cpp
deleted file mode 100644
index 3a6ab05..0000000
--- a/tests/src/vector_range.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-#define VIENNACL_HAVE_UBLAS
-//#define NDEBUG
-//#define VIENNACL_BUILD_INFO
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <cmath>
-#include <algorithm>
-#include <stdio.h>
-#include <time.h>
-//#include "../benchmarks/benchmark-utils.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/linalg/prod.hpp"
-/*#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/cg.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/io/matrix_market.hpp"*/
-#include "viennacl/vector_proxy.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/vector_proxy.hpp"
-#include "boost/numeric/ublas/io.hpp"
-
-
-template <typename VectorType, typename VCLVectorType>
-bool check_for_equality(VectorType const & ublas_v, VCLVectorType const & vcl_v)
-{
-  typedef typename VectorType::value_type   value_type;
-  
-  std::vector<value_type> vcl_v_cpu(vcl_v.size());
-  viennacl::copy(vcl_v, vcl_v_cpu);
-  
-  for (size_t i=0; i<ublas_v.size(); ++i)
-  {
-    if (ublas_v[i] != vcl_v_cpu[i])
-    {
-      std::cout << "Error at index (" << i << "): " << ublas_v[i] << " vs " << vcl_v_cpu[i] << std::endl;
-    }
-  }
-  return true;
-}
-
-
-           
-template <typename T>
-int run_test()
-{
-    typedef float               ScalarType;
-    typedef boost::numeric::ublas::vector<ScalarType>       VectorType;
-    
-    typedef viennacl::vector<ScalarType>                    VCLVectorType;
-    
-    std::size_t dim_large = 70;
-    std::size_t dim_small = 27;
-    
-    //setup ublas objects:
-    VectorType ublas_v1(dim_large);
-    for (std::size_t i=0; i<ublas_v1.size(); ++i)
-      ublas_v1(i) = static_cast<ScalarType>(i+1);
-
-    VectorType ublas_v2(dim_small);
-    for (std::size_t i=0; i<ublas_v2.size(); ++i)
-      ublas_v2(i) = static_cast<ScalarType>(dim_large + i);
-      
-    boost::numeric::ublas::range ublas_r1(0, dim_small);
-    boost::numeric::ublas::range ublas_r2(dim_small - 1, 2*dim_small - 1);
-    boost::numeric::ublas::range ublas_r3(dim_large - dim_small, dim_large);
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub1(ublas_v1, ublas_r1);
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub2(ublas_v1, ublas_r2);
-    boost::numeric::ublas::vector_range<VectorType> ublas_v1_sub3(ublas_v1, ublas_r3);
-
-    //Setup ViennaCL objects    
-    VCLVectorType vcl_v1(dim_large);
-    viennacl::copy(ublas_v1, vcl_v1);
-    VCLVectorType vcl_v2(dim_small);
-    viennacl::copy(ublas_v2, vcl_v2);
-    
-    viennacl::range vcl_r1(0, dim_small);
-    viennacl::range vcl_r2(dim_small - 1, 2*dim_small - 1);
-    viennacl::range vcl_r3(dim_large - dim_small, dim_large);
-    viennacl::vector_range<VCLVectorType>   vcl_v1_sub1(vcl_v1, vcl_r1);
-    viennacl::vector_range<VCLVectorType>   vcl_v1_sub2(vcl_v1, vcl_r2);
-    viennacl::vector_range<VCLVectorType>   vcl_v1_sub3(vcl_v1, vcl_r3);
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 1: Copy to GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    ublas_v1_sub1 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub1);
-    std::cout << "Testing copy to begin of v1... ";
-    if (check_for_equality(ublas_v1, vcl_v1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    ublas_v1_sub2 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub2);
-    std::cout << "Testing copy to middle of v1... ";
-    if (check_for_equality(ublas_v1, vcl_v1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    
-    
-    ublas_v1_sub3 = ublas_v2;
-    viennacl::copy(ublas_v2, vcl_v1_sub3);
-    std::cout << "Testing copy to bottom of v1... ";
-    if (check_for_equality(ublas_v1, vcl_v1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    
-    std::cout << std::endl;
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 2: Copy from GPU //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    
-    std::cout << "Testing beginning of v1... ";
-    if (check_for_equality(ublas_v1_sub1, vcl_v1_sub1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing middle of v1... ";
-    if (check_for_equality(ublas_v1_sub2, vcl_v1_sub2))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-    std::cout << "Testing bottom of v1... ";
-    if (check_for_equality(ublas_v1_sub3, vcl_v1_sub3))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    std::cout << "//" << std::endl;
-    std::cout << "////////// Test 3: Inplace add //////////" << std::endl;
-    std::cout << "//" << std::endl;
-    viennacl::copy(ublas_v1_sub1, vcl_v1_sub1);
-    
-    std::cout << "Testing inplace add at beginning of v1: ";
-    ublas_v1_sub1 += ublas_v1_sub1;
-    vcl_v1_sub1 += vcl_v1_sub1;
-
-    if (check_for_equality(ublas_v1, vcl_v1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-    
-
-    std::cout << "Testing inplace add at middle of v1: ";
-    ublas_v1_sub2 += ublas_v1_sub2;
-    vcl_v1_sub2 += vcl_v1_sub2;
-
-    if (check_for_equality(ublas_v1, vcl_v1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-
-    std::cout << "Testing inplace add at end of v1: ";
-    ublas_v1_sub3 += ublas_v1_sub3;
-    vcl_v1_sub3 += vcl_v1_sub3;
-
-    if (check_for_equality(ublas_v1, vcl_v1))
-      std::cout << "PASSED!" << std::endl;
-    else
-    {
-      std::cout << std::endl << "TEST failed!";
-      return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}    
-
-int main (int argc, const char * argv[])
-{
-  std::cout << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "## Test :: Vector Range" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << "----------------------------------------------" << std::endl;
-  std::cout << std::endl;
-   
-  std::cout << "# Testing setup:" << std::endl;
-  std::cout << "  eps:     " << 0 << std::endl;
-  std::cout << "  numeric: float" << std::endl;
-  if (run_test<float>() != EXIT_SUCCESS)
-    return EXIT_FAILURE;
-  
-  if( viennacl::ocl::current_device().double_support() )
-  {
-    std::cout << "# Testing setup:" << std::endl;
-    std::cout << "  eps:     " << 0 << std::endl;
-    std::cout << "  numeric: double" << std::endl;
-    
-    if (run_test<double>() != EXIT_SUCCESS)
-      return EXIT_FAILURE;
-  }
-  
-  return EXIT_SUCCESS;
-}
-
diff --git a/tests/src/vector_uint.cpp b/tests/src/vector_uint.cpp
new file mode 100644
index 0000000..a7dd2c2
--- /dev/null
+++ b/tests/src/vector_uint.cpp
@@ -0,0 +1,966 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if (v2_cpu[i] != v1[i])
+        return 1;
+   }
+
+   return 0;
+}
+
+
+template <typename T1, typename T2>
+int check(T1 const & t1, T2 const & t2)
+{
+  int retval = EXIT_SUCCESS;
+
+  if (diff(t1, t2) != 0)
+  {
+    std::cout << "# Error! Difference: " << diff(t1, t2) << std::endl;
+    retval = EXIT_FAILURE;
+  }
+  return retval;
+}
+
+
+//
+// -------------------------------------------------------------
+//
+template< typename NumericT, typename UblasVectorType, typename ViennaCLVectorType1, typename ViennaCLVectorType2 >
+int test(UblasVectorType     & ublas_v1, UblasVectorType     & ublas_v2,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2)
+{
+  int retval = EXIT_SUCCESS;
+
+  NumericT                    cpu_result = 42;
+  viennacl::scalar<NumericT>  gpu_result = 43;
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::zero_vector<NumericT>(ublas_v1.size());
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = 0;
+  vcl_v1 = viennacl::zero_vector<NumericT>(vcl_v1.size());
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), cpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), cpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), gpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result + 1;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), gpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for unit_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::unit_vector<NumericT>(ublas_v1.size(), 5);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = (i == 5) ? 1 : 0;
+  vcl_v1 = viennacl::unit_vector<NumericT>(vcl_v1.size(), 5);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(i);
+    ublas_v2[i] = NumericT(i+42);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // Part 1: Norms and inner product
+  //
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing inner_prod..." << std::endl;
+  cpu_result = viennacl::linalg::inner_prod(ublas_v1, ublas_v2);
+  NumericT cpu_result2 = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = inner_prod(ublas_v1 + ublas_v2, 2*ublas_v2);
+  NumericT cpu_result3 = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, 2*vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, 2*vcl_v2);
+
+  if (check(cpu_result, cpu_result3) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_1..." << std::endl;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)   //note: norm_1 broken for unsigned ints on MacOS
+    cpu_result += ublas_v1[i];
+  gpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0; //reset
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)   //note: norm_1 broken for unsigned ints on MacOS
+    cpu_result2 += ublas_v1[i];
+  cpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)   //note: norm_1 broken for unsigned ints on MacOS
+    cpu_result2 += ublas_v1[i] + ublas_v2[i];
+  cpu_result = viennacl::linalg::norm_1(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_inf..." << std::endl;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] > cpu_result)
+      cpu_result = ublas_v1[i];
+  gpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] > cpu_result2)
+      cpu_result2 = ublas_v1[i];
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] + ublas_v2[i] > cpu_result2)
+      cpu_result2 = ublas_v1[i] + ublas_v2[i];
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing index_norm_inf..." << std::endl;
+
+  std::size_t cpu_index = 0;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] > cpu_result)
+    {
+      cpu_result = ublas_v1[i];
+      cpu_index = i;
+    }
+  std::size_t gpu_index = viennacl::linalg::index_norm_inf(vcl_v1);
+
+  if (check(static_cast<NumericT>(cpu_index), static_cast<NumericT>(gpu_index)) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  // --------------------------------------------------------------------------
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_index = 0;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] + ublas_v2[i] > cpu_result)
+    {
+      cpu_result = ublas_v1[i];
+      cpu_index = i;
+    }
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1 + vcl_v2)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  // --------------------------------------------------------------------------
+
+  std::cout << "Testing assignments..." << std::endl;
+  NumericT val = static_cast<NumericT>(1);
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = val;
+
+  for (size_t i=0; i < vcl_v1.size(); ++i)
+    vcl_v1(i) = val;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiplication and division of vectors by scalars
+  //
+  std::cout << "Testing scaling with CPU scalar..." << std::endl;
+  NumericT alpha = static_cast<NumericT>(3);
+  viennacl::scalar<NumericT> gpu_alpha = alpha;
+
+  ublas_v1  *= alpha;
+  vcl_v1    *= alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing scaling with GPU scalar..." << std::endl;
+  ublas_v1  *= alpha;
+  vcl_v1    *= gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  NumericT beta  = static_cast<NumericT>(2);
+  viennacl::scalar<NumericT> gpu_beta = beta;
+
+  std::cout << "Testing shrinking with CPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing shrinking with GPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // add and inplace_add of vectors
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing add on vector..." << std::endl;
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1     = ublas_v1 + ublas_v2;
+  vcl_v1       =   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-add on vector..." << std::endl;
+  ublas_v1 += ublas_v2;
+  vcl_v1   +=   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiply-add
+  //
+  std::cout << "Testing multiply-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + beta * ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 + beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += alpha * ublas_v2;
+  vcl_v1   += alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // division-add
+  //
+  std::cout << "Testing division-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   + vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 * beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   += vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-add on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 * alpha;
+  vcl_v1   +=   vcl_v2 * gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // More complicated expressions (for ensuring the operator overloads work correctly)
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing three vector additions..." << std::endl;
+  ublas_v1 = ublas_v2 + ublas_v1 + ublas_v2;
+  vcl_v1   =   vcl_v2 +   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing swap..." << std::endl;
+  swap(ublas_v1, ublas_v2);
+  swap(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  std::cout << " v1 = element_prod(v1, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise division..." << std::endl;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1 + i);
+    ublas_v2[i] = NumericT(5 + i);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+template< typename NumericT >
+int test()
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 12345;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec.size());
+
+  for (std::size_t i=0; i<ublas_full_vec.size(); ++i)
+  {
+    ublas_full_vec[i]  = NumericT(1.0 + i);
+    ublas_full_vec2[i] = NumericT(2.0 + i / 2);
+  }
+
+  ublas::range r1(    ublas_full_vec.size() / 4, 2 * ublas_full_vec.size() / 4);
+  ublas::range r2(2 * ublas_full_vec2.size() / 4, 3 * ublas_full_vec2.size() / 4);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec(ublas_full_vec, r1);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec2(ublas_full_vec2, r2);
+
+  ublas::slice s1(    ublas_full_vec.size() / 4, 3, ublas_full_vec.size() / 4);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 4, 2, ublas_full_vec2.size() / 4);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec(ublas_full_vec, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec(ublas_full_vec.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  viennacl::fast_copy(ublas_full_vec.begin(), ublas_full_vec.end(), vcl_full_vec.begin());
+  viennacl::copy(ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  viennacl::range vcl_r1(    vcl_full_vec.size() / 4, 2 * vcl_full_vec.size() / 4);
+  viennacl::range vcl_r2(2 * vcl_full_vec2.size() / 4, 3 * vcl_full_vec2.size() / 4);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec(vcl_full_vec, vcl_r1);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec2(vcl_full_vec2, vcl_r2);
+
+  {
+    viennacl::vector<NumericT> vcl_short_vec(vcl_range_vec);
+    viennacl::vector<NumericT> vcl_short_vec2 = vcl_range_vec2;
+
+    ublas::vector<NumericT> ublas_short_vec(ublas_range_vec);
+    ublas::vector<NumericT> ublas_short_vec2(ublas_range_vec2);
+
+    std::cout << "Testing creation of vectors from range..." << std::endl;
+    if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+  viennacl::slice vcl_s1(    vcl_full_vec.size() / 4, 3, vcl_full_vec.size() / 4);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 4, 2, vcl_full_vec2.size() / 4);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec(vcl_full_vec, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+
+  viennacl::vector<NumericT> vcl_short_vec(vcl_slice_vec);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec(ublas_slice_vec);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector with Integer types" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: unsigned int" << std::endl;
+    retval = test<unsigned int>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: long" << std::endl;
+    retval = test<unsigned long>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/tests/src/vector_uint.cu b/tests/src/vector_uint.cu
new file mode 100644
index 0000000..a7dd2c2
--- /dev/null
+++ b/tests/src/vector_uint.cu
@@ -0,0 +1,966 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// *** System
+//
+#include <iostream>
+#include <iomanip>
+
+//
+// *** Boost
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/vector_proxy.hpp>
+
+//
+// *** ViennaCL
+//
+//#define VIENNACL_DEBUG_ALL
+#define VIENNACL_WITH_UBLAS 1
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#include "Random.hpp"
+
+using namespace boost::numeric;
+
+
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, ScalarType const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::scalar<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType>
+ScalarType diff(ScalarType const & s1, viennacl::entry_proxy<ScalarType> const & s2)
+{
+  viennacl::backend::finish();
+  return s1 - s2;
+}
+//
+// -------------------------------------------------------------
+//
+template <typename ScalarType, typename VCLVectorType>
+ScalarType diff(ublas::vector<ScalarType> const & v1, VCLVectorType const & v2)
+{
+   ublas::vector<ScalarType> v2_cpu(v2.size());
+   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
+   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
+
+   for (unsigned int i=0;i<v1.size(); ++i)
+   {
+      if (v2_cpu[i] != v1[i])
+        return 1;
+   }
+
+   return 0;
+}
+
+
// Convenience wrapper: returns EXIT_SUCCESS when diff(t1, t2) reports no
// difference, otherwise prints the difference and returns EXIT_FAILURE.
template <typename T1, typename T2>
int check(T1 const & t1, T2 const & t2)
{
  if (diff(t1, t2) == 0)
    return EXIT_SUCCESS;

  std::cout << "# Error! Difference: " << diff(t1, t2) << std::endl;
  return EXIT_FAILURE;
}
+
+
+//
+// -------------------------------------------------------------
+//
+/** @brief Runs a battery of BLAS level-1 checks, comparing every ViennaCL result
+ *         against a Boost.uBLAS reference computed on the host.
+ *
+ * The two uBLAS vectors act as the reference; the two ViennaCL vectors (which may
+ * be plain vectors, ranges, or slices) are verified after each operation via check().
+ *
+ * @return EXIT_SUCCESS if all checks pass, EXIT_FAILURE on the first mismatch.
+ */
+template< typename NumericT, typename UblasVectorType, typename ViennaCLVectorType1, typename ViennaCLVectorType2 >
+int test(UblasVectorType     & ublas_v1, UblasVectorType     & ublas_v2,
+         ViennaCLVectorType1 &   vcl_v1, ViennaCLVectorType2 &   vcl_v2)
+{
+  int retval = EXIT_SUCCESS;
+
+  NumericT                    cpu_result = 42;
+  viennacl::scalar<NumericT>  gpu_result = 43;
+
+  //
+  // Initializer:
+  //
+  std::cout << "Checking for zero_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::zero_vector<NumericT>(ublas_v1.size());
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = 0;
+  vcl_v1 = viennacl::zero_vector<NumericT>(vcl_v1.size());
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for scalar_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), cpu_result);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), cpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //ublas_v1 = ublas::scalar_vector<NumericT>(ublas_v1.size(), gpu_result);
+  // gpu_result holds cpu_result + 1 (42 vs. 43), hence the +1 on the host side:
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = cpu_result + 1;
+  vcl_v1 = viennacl::scalar_vector<NumericT>(vcl_v1.size(), gpu_result);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Checking for unit_vector initializer..." << std::endl;
+  //ublas_v1 = ublas::unit_vector<NumericT>(ublas_v1.size(), 5);
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    ublas_v1[i] = (i == 5) ? 1 : 0;
+  vcl_v1 = viennacl::unit_vector<NumericT>(vcl_v1.size(), 5);
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(i);
+    ublas_v2[i] = NumericT(i+42);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // Part 1: Norms and inner product
+  //
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing inner_prod..." << std::endl;
+  cpu_result = viennacl::linalg::inner_prod(ublas_v1, ublas_v2);
+  NumericT cpu_result2 = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1, vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result = inner_prod(ublas_v1 + ublas_v2, 2*ublas_v2);
+  NumericT cpu_result3 = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, 2*vcl_v2);
+  gpu_result = viennacl::linalg::inner_prod(vcl_v1 + vcl_v2, 2*vcl_v2);
+
+  if (check(cpu_result, cpu_result3) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_1..." << std::endl;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)   //note: norm_1 broken for unsigned ints on MacOS
+    cpu_result += ublas_v1[i];
+  gpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0; //reset
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)   //note: norm_1 broken for unsigned ints on MacOS
+    cpu_result2 += ublas_v1[i];
+  cpu_result = viennacl::linalg::norm_1(vcl_v1);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)   //note: norm_1 broken for unsigned ints on MacOS
+    cpu_result2 += ublas_v1[i] + ublas_v2[i];
+  cpu_result = viennacl::linalg::norm_1(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing norm_inf..." << std::endl;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] > cpu_result)
+      cpu_result = ublas_v1[i];
+  gpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] > cpu_result2)
+      cpu_result2 = ublas_v1[i];
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_result2 = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] + ublas_v2[i] > cpu_result2)
+      cpu_result2 = ublas_v1[i] + ublas_v2[i];
+  cpu_result = viennacl::linalg::norm_inf(vcl_v1 + vcl_v2);
+
+  if (check(cpu_result, cpu_result2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  std::cout << "Testing index_norm_inf..." << std::endl;
+
+  std::size_t cpu_index = 0;
+  cpu_result = 0;
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] > cpu_result)
+    {
+      cpu_result = ublas_v1[i];
+      cpu_index = i;
+    }
+  std::size_t gpu_index = viennacl::linalg::index_norm_inf(vcl_v1);
+
+  if (check(static_cast<NumericT>(cpu_index), static_cast<NumericT>(gpu_index)) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  // --------------------------------------------------------------------------
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  cpu_index = 0;
+  cpu_result = 0;
+  NumericT cpu_sum_max = 0;   // running maximum of v1[i] + v2[i]
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+    if (ublas_v1[i] + ublas_v2[i] > cpu_sum_max)   // fixed: previously compared against cpu_result (holding v1 values)
+    {
+      cpu_sum_max = ublas_v1[i] + ublas_v2[i];
+      cpu_result = ublas_v1[i];   // expected value of v1 at the argmax of v1 + v2
+      cpu_index = i;
+    }
+  gpu_result = vcl_v1[viennacl::linalg::index_norm_inf(vcl_v1 + vcl_v2)];
+
+  if (check(cpu_result, gpu_result) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  // --------------------------------------------------------------------------
+
+  std::cout << "Testing assignments..." << std::endl;
+  NumericT val = static_cast<NumericT>(1);
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = val;
+
+  for (size_t i=0; i < vcl_v1.size(); ++i)
+    vcl_v1(i) = val;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiplication and division of vectors by scalars
+  //
+  std::cout << "Testing scaling with CPU scalar..." << std::endl;
+  NumericT alpha = static_cast<NumericT>(3);
+  viennacl::scalar<NumericT> gpu_alpha = alpha;
+
+  ublas_v1  *= alpha;
+  vcl_v1    *= alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing scaling with GPU scalar..." << std::endl;
+  ublas_v1  *= alpha;
+  vcl_v1    *= gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  NumericT beta  = static_cast<NumericT>(2);
+  viennacl::scalar<NumericT> gpu_beta = beta;
+
+  std::cout << "Testing shrinking with CPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing shrinking with GPU scalar..." << std::endl;
+  ublas_v1 /= beta;
+  vcl_v1   /= gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // add and inplace_add of vectors
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());  //resync
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing add on vector..." << std::endl;
+
+  std::cout << "Checking for successful copy..." << std::endl;
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_v2, vcl_v2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1     = ublas_v1 + ublas_v2;
+  vcl_v1       =   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace-add on vector..." << std::endl;
+  ublas_v1 += ublas_v2;
+  vcl_v1   +=   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // multiply-add
+  //
+  std::cout << "Testing multiply-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = alpha * ublas_v1 + beta * ublas_v2;
+  vcl_v1   = alpha *   vcl_v1 + beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += alpha * ublas_v2;
+  vcl_v1   += alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 +     alpha * ublas_v2;
+  vcl_v1   = vcl_v1   + gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  // fixed: previously repeated the (right) variant and never tested the GPU scalar on the left operand
+  ublas_v1 =     alpha * ublas_v1 + ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing multiply-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 =     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   = gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v1 +     beta * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v1 + gpu_beta *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace multiply-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 +=     alpha * ublas_v2;
+  vcl_v1   += gpu_alpha *   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // division-add
+  //
+  std::cout << "Testing division-add on vector with CPU scalar (right)..." << std::endl;
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   + vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with CPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with CPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-multiply-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha + ublas_v2 * beta;
+  vcl_v1   =   vcl_v1 / alpha +   vcl_v2 * beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing multiply-division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 * alpha + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 * alpha +   vcl_v2 / beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+
+  std::cout << "Testing inplace division-add on vector with CPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   += vcl_v2 / alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing division-add on vector with GPU scalar (right)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 + ublas_v2 / alpha;
+  vcl_v1   = vcl_v1   +   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (left)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  // fixed: previously repeated the (right) variant and never tested the GPU scalar division on the left operand
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing division-add on vector with GPU scalar (both)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   =   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar (both, adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 / beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 / gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing inplace division-multiply-add on vector with GPU scalar (adding)..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 += ublas_v1 / alpha     + ublas_v2 * beta;
+  vcl_v1   +=   vcl_v1 / gpu_alpha +   vcl_v2 * gpu_beta;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing inplace division-add on vector with GPU scalar..." << std::endl;
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  // fixed: previously used multiplication, contradicting the announced test and duplicating the
+  // inplace multiply-add test above
+  ublas_v1 += ublas_v2 / alpha;
+  vcl_v1   +=   vcl_v2 / gpu_alpha;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  //
+  // More complicated expressions (for ensuring the operator overloads work correctly)
+  //
+  for (size_t i=0; i < ublas_v1.size(); ++i)
+    ublas_v1(i) = NumericT(i);
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing three vector additions..." << std::endl;
+  ublas_v1 = ublas_v2 + ublas_v1 + ublas_v2;
+  vcl_v1   =   vcl_v2 +   vcl_v1 +   vcl_v2;
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  ublas_v2 = 3 * ublas_v1;
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  std::cout << "Testing swap..." << std::endl;
+  swap(ublas_v1, ublas_v2);
+  swap(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << "Testing elementwise multiplication..." << std::endl;
+  std::cout << " v1 = element_prod(v1, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  std::cout << " v1 = element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 = ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " v1 += element_prod(v1 + v2, v2 + v1);" << std::endl;
+  ublas_v1 += ublas::element_prod(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_prod(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  std::cout << "Testing elementwise division..." << std::endl;
+  // start from strictly positive values so the integer divisions below stay well-defined:
+  for (std::size_t i=0; i<ublas_v1.size(); ++i)
+  {
+    ublas_v1[i] = NumericT(1 + i);
+    ublas_v2[i] = NumericT(5 + i);
+  }
+
+  viennacl::copy(ublas_v1.begin(), ublas_v1.end(), vcl_v1.begin());
+  viennacl::copy(ublas_v2.begin(), ublas_v2.end(), vcl_v2.begin());
+
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+  ublas_v1 = ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 = viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ublas_v1 += ublas::element_div(ublas_v1 + ublas_v2, ublas_v2 + ublas_v1);
+  vcl_v1 += viennacl::linalg::element_div(vcl_v1 + vcl_v2, vcl_v2 + vcl_v1);
+
+  if (check(ublas_v1, vcl_v1) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  // --------------------------------------------------------------------------
+  return retval;
+}
+
+
+/** @brief Builds full vectors plus range and slice proxies thereof, then runs the
+ *         operation test battery on every combination of ViennaCL proxy types
+ *         (vector/range/slice for both operands).
+ *
+ * @return EXIT_SUCCESS if all combinations pass, EXIT_FAILURE otherwise.
+ */
+template< typename NumericT >
+int test()
+{
+  int retval = EXIT_SUCCESS;
+  std::size_t size = 12345;
+
+  std::cout << "Running tests for vector of size " << size << std::endl;
+
+  //
+  // Set up UBLAS objects
+  //
+  ublas::vector<NumericT> ublas_full_vec(size);
+  ublas::vector<NumericT> ublas_full_vec2(ublas_full_vec.size());
+
+  for (std::size_t i=0; i<ublas_full_vec.size(); ++i)
+  {
+    ublas_full_vec[i]  = NumericT(1.0 + i);
+    ublas_full_vec2[i] = NumericT(2.0 + i / 2);
+  }
+
+  // second quarter of vec1, third quarter of vec2:
+  ublas::range r1(    ublas_full_vec.size() / 4, 2 * ublas_full_vec.size() / 4);
+  ublas::range r2(2 * ublas_full_vec2.size() / 4, 3 * ublas_full_vec2.size() / 4);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec(ublas_full_vec, r1);
+  ublas::vector_range< ublas::vector<NumericT> > ublas_range_vec2(ublas_full_vec2, r2);
+
+  // strided views: (start, stride, count)
+  ublas::slice s1(    ublas_full_vec.size() / 4, 3, ublas_full_vec.size() / 4);
+  ublas::slice s2(2 * ublas_full_vec2.size() / 4, 2, ublas_full_vec2.size() / 4);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec(ublas_full_vec, s1);
+  ublas::vector_slice< ublas::vector<NumericT> > ublas_slice_vec2(ublas_full_vec2, s2);
+
+  //
+  // Set up ViennaCL objects
+  //
+  viennacl::vector<NumericT> vcl_full_vec(ublas_full_vec.size());
+  viennacl::vector<NumericT> vcl_full_vec2(ublas_full_vec2.size());
+
+  // NOTE(review): vec1 uses fast_copy while vec2 uses copy — presumably deliberate
+  // so that both host-to-device transfer paths get exercised; confirm with authors.
+  viennacl::fast_copy(ublas_full_vec.begin(), ublas_full_vec.end(), vcl_full_vec.begin());
+  viennacl::copy(ublas_full_vec2.begin(), ublas_full_vec2.end(), vcl_full_vec2.begin());
+
+  // device-side proxies mirroring the uBLAS ranges above:
+  viennacl::range vcl_r1(    vcl_full_vec.size() / 4, 2 * vcl_full_vec.size() / 4);
+  viennacl::range vcl_r2(2 * vcl_full_vec2.size() / 4, 3 * vcl_full_vec2.size() / 4);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec(vcl_full_vec, vcl_r1);
+  viennacl::vector_range< viennacl::vector<NumericT> > vcl_range_vec2(vcl_full_vec2, vcl_r2);
+
+  // scope: checks construction of plain vectors from ranges; temporaries are freed afterwards
+  {
+    viennacl::vector<NumericT> vcl_short_vec(vcl_range_vec);
+    viennacl::vector<NumericT> vcl_short_vec2 = vcl_range_vec2;
+
+    ublas::vector<NumericT> ublas_short_vec(ublas_range_vec);
+    ublas::vector<NumericT> ublas_short_vec2(ublas_range_vec2);
+
+    std::cout << "Testing creation of vectors from range..." << std::endl;
+    if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+    if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+      return EXIT_FAILURE;
+  }
+
+  // device-side slices mirroring the uBLAS slices above:
+  viennacl::slice vcl_s1(    vcl_full_vec.size() / 4, 3, vcl_full_vec.size() / 4);
+  viennacl::slice vcl_s2(2 * vcl_full_vec2.size() / 4, 2, vcl_full_vec2.size() / 4);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec(vcl_full_vec, vcl_s1);
+  viennacl::vector_slice< viennacl::vector<NumericT> > vcl_slice_vec2(vcl_full_vec2, vcl_s2);
+
+  // these short vectors (constructed from slices) are the operands reused below:
+  viennacl::vector<NumericT> vcl_short_vec(vcl_slice_vec);
+  viennacl::vector<NumericT> vcl_short_vec2 = vcl_slice_vec2;
+
+  ublas::vector<NumericT> ublas_short_vec(ublas_slice_vec);
+  ublas::vector<NumericT> ublas_short_vec2(ublas_slice_vec2);
+
+  std::cout << "Testing creation of vectors from slice..." << std::endl;
+  if (check(ublas_short_vec, vcl_short_vec) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+  if (check(ublas_short_vec2, vcl_short_vec2) != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+
+  //
+  // Now start running tests for vectors, ranges and slices:
+  //
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = vector, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_short_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = range, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_range_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  ///////
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = vector **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_short_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = range **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_range_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  std::cout << " ** vcl_v1 = slice, vcl_v2 = slice **" << std::endl;
+  retval = test<NumericT>(ublas_short_vec, ublas_short_vec2,
+                          vcl_slice_vec, vcl_slice_vec2);
+  if (retval != EXIT_SUCCESS)
+    return EXIT_FAILURE;
+
+  return EXIT_SUCCESS;
+}
+
+
+
+//
+// -------------------------------------------------------------
+//
+/** @brief Entry point: runs the integer-vector test battery for unsigned int and unsigned long.
+ *
+ * @return EXIT_SUCCESS if all tests pass, otherwise the failing run's return code.
+ */
+int main()
+{
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "## Test :: Vector with Integer types" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  int retval = EXIT_SUCCESS;
+
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    std::cout << "# Testing setup:" << std::endl;
+    std::cout << "  numeric: unsigned int" << std::endl;
+    retval = test<unsigned int>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+  {
+    std::cout << "# Testing setup:" << std::endl;
+    // fixed: message previously said "long" although test<unsigned long>() is run
+    std::cout << "  numeric: unsigned long" << std::endl;
+    retval = test<unsigned long>();
+    if( retval == EXIT_SUCCESS )
+      std::cout << "# Test passed" << std::endl;
+    else
+      return retval;
+  }
+  std::cout << std::endl;
+  std::cout << "----------------------------------------------" << std::endl;
+  std::cout << std::endl;
+
+  std::cout << std::endl;
+  std::cout << "------- Test completed --------" << std::endl;
+  std::cout << std::endl;
+
+  return retval;
+}
diff --git a/viennacl/backend/cpu_ram.hpp b/viennacl/backend/cpu_ram.hpp
new file mode 100644
index 0000000..51fcf8c
--- /dev/null
+++ b/viennacl/backend/cpu_ram.hpp
@@ -0,0 +1,143 @@
+#ifndef VIENNACL_BACKEND_CPU_RAM_HPP_
+#define VIENNACL_BACKEND_CPU_RAM_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cpu_ram.hpp
+    @brief Implementations for the plain CPU-RAM (host memory) backend functionality
+*/
+
+#include <cassert>
+#include <vector>
+#include "viennacl/tools/shared_ptr.hpp"
+
+namespace viennacl
+{
+  namespace backend
+  {
+    namespace cpu_ram
+    {
+      typedef viennacl::tools::shared_ptr<char>  handle_type;
+      // Requirements for backend:
+
+      // * memory_create(size, host_ptr)
+      // * memory_copy(src, dest, offset_src, offset_dest, size)
+      // * memory_write_from_main_memory(src, offset, size,
+      //                                 dest, offset, size)
+      // * memory_read_to_main_memory(src, offset, size
+      //                              dest, offset, size)
+      // *
+      //
+
+      namespace detail
+      {
+        /** @brief Functor that releases a heap-allocated array via delete[]; used as the custom deleter for shared_ptr-managed char buffers. */
+        template<class U>
+        struct array_deleter
+        {
+          void operator()(U* p) const { delete[] p; }
+        };
+
+      }
+
+      /** @brief Creates an array of the specified size in main RAM. If the second argument is provided, the buffer is initialized with data from that pointer.
+       *
+       * @param size_in_bytes   Number of bytes to allocate
+       * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+       *
+       * @return Smart pointer owning the new buffer; delete[] is invoked automatically on destruction.
+       */
+      inline handle_type  memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+      {
+        // Allocate exactly once (previously duplicated in both branches); the custom
+        // deleter guarantees the array form of delete is used.
+        handle_type new_handle(new char[size_in_bytes], detail::array_deleter<char>());
+
+        if (host_ptr)
+        {
+          // copy initial data byte-by-byte from the user-provided buffer:
+          char * raw_ptr = new_handle.get();
+          const char * data_ptr = static_cast<const char *>(host_ptr);
+          for (vcl_size_t i=0; i<size_in_bytes; ++i)
+            raw_ptr[i] = data_ptr[i];
+        }
+
+        return new_handle;
+      }
+
+      /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+       *
+       *  @param src_buffer     A smart pointer to the beginning of an allocated source buffer
+       *  @param dst_buffer     A smart pointer to the beginning of an allocated destination buffer
+       *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+       *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+       *  @param bytes_to_copy  Number of bytes to be copied
+       *
+       *  Note: no bounds checking is performed; the caller must guarantee both buffers are large enough.
+       */
+      inline void memory_copy(handle_type const & src_buffer,
+                              handle_type & dst_buffer,
+                              vcl_size_t src_offset,
+                              vcl_size_t dst_offset,
+                              vcl_size_t bytes_to_copy)
+      {
+        assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+        assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+        for (vcl_size_t i=0; i<bytes_to_copy; ++i)
+          dst_buffer.get()[i+dst_offset] = src_buffer.get()[i + src_offset];
+      }
+
+      /** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+       *
+       * @param dst_buffer    A smart pointer to the beginning of an allocated buffer
+       * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+       * @param bytes_to_copy Number of bytes to be copied
+       * @param ptr           Pointer to the first byte to be written
+       */
+      inline void memory_write(handle_type & dst_buffer,
+                               vcl_size_t dst_offset,
+                               vcl_size_t bytes_to_copy,
+                               const void * ptr,
+                               bool /*async*/)
+      {
+        assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+        // copy byte-by-byte via raw pointers hoisted out of the loop:
+        char       * dst = dst_buffer.get() + dst_offset;
+        const char * src = static_cast<const char *>(ptr);
+        for (vcl_size_t i=0; i<bytes_to_copy; ++i)
+          dst[i] = src[i];
+      }
+
+      /** @brief Reads data from a buffer back to main RAM.
+       *
+       * @param src_buffer         A smart pointer to the beginning of an allocated source buffer
+       * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+       * @param bytes_to_copy      Number of bytes to be read
+       * @param ptr                Location in main RAM where the read data should be written to
+       */
+      inline void memory_read(handle_type const & src_buffer,
+                              vcl_size_t src_offset,
+                              vcl_size_t bytes_to_copy,
+                              void * ptr,
+                              bool /*async*/)
+      {
+        assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+        // copy byte-by-byte via raw pointers hoisted out of the loop:
+        const char * src = src_buffer.get() + src_offset;
+        char       * dst = static_cast<char *>(ptr);
+        for (vcl_size_t i=0; i<bytes_to_copy; ++i)
+          dst[i] = src[i];
+      }
+
+
+    }
+  } //backend
+} //viennacl
+#endif
diff --git a/viennacl/backend/cuda.hpp b/viennacl/backend/cuda.hpp
new file mode 100644
index 0000000..f02d403
--- /dev/null
+++ b/viennacl/backend/cuda.hpp
@@ -0,0 +1,190 @@
+#ifndef VIENNACL_BACKEND_CUDA_HPP_
+#define VIENNACL_BACKEND_CUDA_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cuda.hpp
+    @brief Implementations for the CUDA backend functionality
+*/
+
+
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include "viennacl/tools/shared_ptr.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+#define VIENNACL_CUDA_ERROR_CHECK(err)  detail::cuda_error_check (err, __FILE__, __LINE__)
+
+namespace viennacl
+{
+  namespace backend
+  {
+    namespace cuda
+    {
+      typedef viennacl::tools::shared_ptr<char>  handle_type;
+      // Requirements for backend:
+
+      // * memory_create(size, host_ptr)
+      // * memory_copy(src, dest, offset_src, offset_dest, size)
+      // * memory_write_from_main_memory(src, offset, size,
+      //                                 dest, offset, size)
+      // * memory_read_to_main_memory(src, offset, size
+      //                              dest, offset, size)
+      // *
+      //
+
+      namespace detail
+      {
+
+
+        /** @brief Checks the return code of a CUDA runtime call; prints a diagnostic and throws on failure.
+         *
+         * @param error_code   Return value of the CUDA runtime call to be checked
+         * @param file         Source file of the call site (pass __FILE__, done by VIENNACL_CUDA_ERROR_CHECK)
+         * @param line         Source line of the call site (pass __LINE__, done by VIENNACL_CUDA_ERROR_CHECK)
+         */
+        inline void cuda_error_check(cudaError error_code, const char *file, const int line )
+        {
+          if (cudaSuccess != error_code)
+          {
+            // Report location, numeric code, and the human-readable CUDA error string.
+            // (Fixed: previous message emitted a duplicated colon, "(42): : CUDA ...".)
+            std::cerr << file << "(" << line << "): CUDA Runtime API error " << error_code << ": " << cudaGetErrorString( error_code ) << std::endl;
+            throw "CUDA error";
+          }
+        }
+
+
+        /** @brief Functor for deleting a CUDA handle. Used within the smart pointer class. */
+        template <typename U>
+        struct cuda_deleter
+        {
+          // Invoked by tools::shared_ptr when the last reference to the buffer goes away.
+          void operator()(U * p) const
+          {
+            //std::cout << "Freeing handle " << reinterpret_cast<void *>(p) << std::endl;
+            cudaFree(p);  // NOTE(review): return value ignored by design; nothing sensible to do in a deleter
+          }
+        };
+
+      }
+
+      /** @brief Creates an array of the specified size on the CUDA device. If the second argument is provided, the buffer is initialized with data from that pointer.
+       *
+       * @param size_in_bytes   Number of bytes to allocate
+       * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+       *
+       */
+      inline handle_type  memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+      {
+        void * dev_ptr = NULL;
+        VIENNACL_CUDA_ERROR_CHECK( cudaMalloc(&dev_ptr, size_in_bytes) );
+        //std::cout << "Allocated new dev_ptr " << dev_ptr << " of size " <<  size_in_bytes << std::endl;
+
+        if (!host_ptr)
+          return handle_type(reinterpret_cast<char *>(dev_ptr), detail::cuda_deleter<char>());
+
+        // Wrap the raw device pointer immediately so it is released even if the copy below throws.
+        handle_type new_handle(reinterpret_cast<char*>(dev_ptr), detail::cuda_deleter<char>());
+
+        // copy data (error-checked: the previous unchecked call let copy failures pass silently):
+        //std::cout << "Filling new handle from host_ptr " << host_ptr << std::endl;
+        VIENNACL_CUDA_ERROR_CHECK( cudaMemcpy(new_handle.get(), host_ptr, size_in_bytes, cudaMemcpyHostToDevice) );
+
+        return new_handle;
+      }
+
+
+      /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' on the CUDA device to memory starting at address 'dst_buffer + dst_offset' on the same CUDA device.
+       *
+       *  @param src_buffer     A smart pointer to the beginning of an allocated CUDA buffer
+       *  @param dst_buffer     A smart pointer to the beginning of an allocated CUDA buffer
+       *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+       *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+       *  @param bytes_to_copy  Number of bytes to be copied
+       */
+      inline void memory_copy(handle_type const & src_buffer,
+                              handle_type & dst_buffer,
+                              vcl_size_t src_offset,
+                              vcl_size_t dst_offset,
+                              vcl_size_t bytes_to_copy)
+      {
+        assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+        assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+        // Device-to-device copy; error-checked so that copy failures do not pass silently.
+        VIENNACL_CUDA_ERROR_CHECK( cudaMemcpy(reinterpret_cast<void *>(dst_buffer.get() + dst_offset),
+                                              reinterpret_cast<void *>(src_buffer.get() + src_offset),
+                                              bytes_to_copy,
+                                              cudaMemcpyDeviceToDevice) );
+      }
+
+
+      /** @brief Writes data from main RAM identified by 'ptr' to the CUDA buffer identified by 'dst_buffer'
+       *
+       * @param dst_buffer    A smart pointer to the beginning of an allocated CUDA buffer
+       * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+       * @param bytes_to_copy Number of bytes to be copied
+       * @param ptr           Pointer to the first byte to be written
+       * @param async         Whether the operation should be asynchronous
+       */
+      inline void memory_write(handle_type & dst_buffer,
+                               vcl_size_t dst_offset,
+                               vcl_size_t bytes_to_copy,
+                               const void * ptr,
+                               bool async = false)
+      {
+        assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+        // Both paths are error-checked (previously unchecked, so failures passed silently).
+        // NOTE(review): an async copy from pageable host memory may still run synchronously; confirm callers pin 'ptr' if overlap is required.
+        if (async)
+          VIENNACL_CUDA_ERROR_CHECK( cudaMemcpyAsync(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+                                                     reinterpret_cast<const char *>(ptr),
+                                                     bytes_to_copy,
+                                                     cudaMemcpyHostToDevice) );
+        else
+          VIENNACL_CUDA_ERROR_CHECK( cudaMemcpy(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+                                                reinterpret_cast<const char *>(ptr),
+                                                bytes_to_copy,
+                                                cudaMemcpyHostToDevice) );
+      }
+
+
+      /** @brief Reads data from a CUDA buffer back to main RAM.
+       *
+       * @param src_buffer         A smart pointer to the beginning of an allocated CUDA source buffer
+       * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+       * @param bytes_to_copy      Number of bytes to be read
+       * @param ptr                Location in main RAM where the read data should be written to
+       * @param async              Whether the operation should be asynchronous
+       */
+      inline void memory_read(handle_type const & src_buffer,
+                              vcl_size_t src_offset,
+                              vcl_size_t bytes_to_copy,
+                              void * ptr,
+                              bool async = false)
+      {
+        assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+        // Both paths are error-checked (previously unchecked, so failures passed silently).
+        // NOTE(review): an async copy into pageable host memory may still run synchronously; confirm callers pin 'ptr' if overlap is required.
+        if (async)
+          VIENNACL_CUDA_ERROR_CHECK( cudaMemcpyAsync(reinterpret_cast<char *>(ptr),
+                                                     reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+                                                     bytes_to_copy,
+                                                     cudaMemcpyDeviceToHost) );
+        else
+          VIENNACL_CUDA_ERROR_CHECK( cudaMemcpy(reinterpret_cast<char *>(ptr),
+                                                reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+                                                bytes_to_copy,
+                                                cudaMemcpyDeviceToHost) );
+      }
+
+    } //cuda
+  } //backend
+} //viennacl
+#endif
diff --git a/viennacl/backend/mem_handle.hpp b/viennacl/backend/mem_handle.hpp
new file mode 100644
index 0000000..1105e5c
--- /dev/null
+++ b/viennacl/backend/mem_handle.hpp
@@ -0,0 +1,225 @@
+#ifndef VIENNACL_BACKEND_MEM_HANDLE_HPP
+#define VIENNACL_BACKEND_MEM_HANDLE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/mem_handle.hpp
+    @brief Implements the multi-memory-domain handle
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/backend/opencl.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+  namespace backend
+  {
+
+
+// if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+/** @brief Returns the memory domain used for new buffers by default: CUDA if compiled in, otherwise OpenCL if compiled in, otherwise main (host) memory. */
+#ifdef VIENNACL_WITH_CUDA
+    inline memory_types default_memory_type() { return CUDA_MEMORY; }
+#elif defined(VIENNACL_WITH_OPENCL)
+    inline memory_types default_memory_type() { return OPENCL_MEMORY; }
+#else
+    inline memory_types default_memory_type() { return MAIN_MEMORY; }
+#endif
+
+
+    /** @brief Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
+     *
+     * The idea is to wrap all possible handle types inside this class so that higher-level code does not need to be cluttered with preprocessor switches.
+     * Instead, this class collects all the necessary conditional compilations.
+     *
+     */
+    class mem_handle
+    {
+      public:
+        typedef viennacl::tools::shared_ptr<char>      ram_handle_type;
+        typedef viennacl::tools::shared_ptr<char>      cuda_handle_type;
+
+        /** @brief Default CTOR. No memory is allocated */
+        mem_handle() : active_handle_(MEMORY_NOT_INITIALIZED), size_in_bytes_(0) {}
+
+        /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+        ram_handle_type       & ram_handle()       { return ram_handle_; }
+        /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+        ram_handle_type const & ram_handle() const { return ram_handle_; }
+
+#ifdef VIENNACL_WITH_OPENCL
+        /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+        viennacl::ocl::handle<cl_mem>       & opencl_handle()       { return opencl_handle_; }
+        /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+        viennacl::ocl::handle<cl_mem> const & opencl_handle() const { return opencl_handle_; }
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+        /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+        cuda_handle_type       & cuda_handle()       { return cuda_handle_; }
+        /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+        cuda_handle_type const & cuda_handle() const { return cuda_handle_; }
+#endif
+
+        /** @brief Returns an ID for the currently active memory buffer. Other memory buffers might contain old or no data. */
+        memory_types get_active_handle_id() const { return active_handle_; }
+
+        /** @brief Switches the currently active handle. If no support for that backend is provided, an exception is thrown. */
+        void switch_active_handle_id(memory_types new_id)
+        {
+          if (new_id != active_handle_)
+          {
+            if (active_handle_ == MEMORY_NOT_INITIALIZED)
+              active_handle_ = new_id;
+            else if (active_handle_ == MAIN_MEMORY)
+            {
+              active_handle_ = new_id;
+            }
+            else if (active_handle_ == OPENCL_MEMORY)
+            {
+#ifdef VIENNACL_WITH_OPENCL
+              active_handle_ = new_id;
+#else
+              throw "compiled without OpenCL support!";
+#endif
+            }
+            else if (active_handle_ == CUDA_MEMORY)
+            {
+#ifdef VIENNACL_WITH_CUDA
+              active_handle_ = new_id;
+#else
+              throw "compiled without CUDA support!";
+#endif
+            }
+            else
+              throw "invalid new memory region!";
+          }
+        }
+
+        /** @brief Compares the two handles and returns true if the active memory handles in the two mem_handles point to the same buffer. */
+        bool operator==(mem_handle const & other) const
+        {
+          if (active_handle_ != other.active_handle_)
+            return false;
+
+          switch (active_handle_)
+          {
+            case MAIN_MEMORY:
+              return ram_handle_.get() == other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+            case OPENCL_MEMORY:
+              return opencl_handle_.get() == other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+            case CUDA_MEMORY:
+              return cuda_handle_.get() == other.cuda_handle_.get();
+#endif
+            default: break;
+          }
+
+          return false;
+        }
+
+        /** @brief Compares the two handles and returns true if the active memory handle in *this points to a buffer with inferior address.
+         * Useful to store handles into a map, since they naturally have strong ordering.
+         */
+        bool operator<(mem_handle const & other) const
+        {
+          // Handles living in different memory domains are ordered by their domain ID.
+          // (Returning 'false' here, as before, made distinct handles compare as
+          // equivalent in ordered containers, violating strict weak ordering.)
+          if (active_handle_ != other.active_handle_)
+            return active_handle_ < other.active_handle_;
+
+          switch (active_handle_)
+          {
+            case MAIN_MEMORY:
+              return ram_handle_.get() < other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+            case OPENCL_MEMORY:
+              return opencl_handle_.get() < other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+            case CUDA_MEMORY:
+              return cuda_handle_.get() < other.cuda_handle_.get();
+#endif
+            default: break;
+          }
+
+          return false;
+        }
+
+        /** @brief Compares the two handles and returns true if they do not point to the same active buffer. */
+        bool operator!=(mem_handle const & other) const { return !(*this == other); }
+
+        /** @brief Implements a fast swapping method. No data is copied, only the handles are exchanged. */
+        void swap(mem_handle & other)
+        {
+          // swap handle type:
+          memory_types active_handle_tmp = other.active_handle_;
+          other.active_handle_ = active_handle_;
+          active_handle_ = active_handle_tmp;
+
+          // swap ram handle:
+          ram_handle_type ram_handle_tmp = other.ram_handle_;
+          other.ram_handle_ = ram_handle_;
+          ram_handle_ = ram_handle_tmp;
+
+          // swap OpenCL handle:
+#ifdef VIENNACL_WITH_OPENCL
+          opencl_handle_.swap(other.opencl_handle_);
+#endif
+          // swap CUDA handle:
+#ifdef VIENNACL_WITH_CUDA
+          cuda_handle_type cuda_handle_tmp = other.cuda_handle_;
+          other.cuda_handle_ = cuda_handle_;
+          cuda_handle_ = cuda_handle_tmp;
+#endif
+
+          // swap buffer size (previously forgotten, leaving raw_size() stale after a swap):
+          vcl_size_t size_tmp = other.size_in_bytes_;
+          other.size_in_bytes_ = size_in_bytes_;
+          size_in_bytes_ = size_tmp;
+        }
+
+        /** @brief Returns the number of bytes of the currently active buffer */
+        vcl_size_t raw_size() const               { return size_in_bytes_; }
+
+        /** @brief Sets the size of the currently active buffer. Use with care! */
+        void        raw_size(vcl_size_t new_size) { size_in_bytes_ = new_size; }
+
+      private:
+        memory_types active_handle_;
+        ram_handle_type ram_handle_;
+#ifdef VIENNACL_WITH_OPENCL
+        viennacl::ocl::handle<cl_mem> opencl_handle_;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        cuda_handle_type        cuda_handle_;
+#endif
+        vcl_size_t size_in_bytes_;
+    };
+
+
+  } //backend
+
+
+} //viennacl
+#endif
diff --git a/viennacl/backend/memory.hpp b/viennacl/backend/memory.hpp
new file mode 100644
index 0000000..057eae3
--- /dev/null
+++ b/viennacl/backend/memory.hpp
@@ -0,0 +1,630 @@
+#ifndef VIENNACL_BACKEND_MEMORY_HPP
+#define VIENNACL_BACKEND_MEMORY_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/memory.hpp
+    @brief Main interface routines for memory management
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+#include "viennacl/context.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/backend/util.hpp"
+
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/backend/opencl.hpp"
+  #include "viennacl/ocl/backend.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+  namespace backend
+  {
+
+
+// Generic memory-management routines layered on top of the individual backends.
+    /** @brief Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed. */
+    inline void finish()
+    {
+#ifdef VIENNACL_WITH_CUDA
+      cudaDeviceSynchronize();   // blocks until the CUDA device has completed all preceding work
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+      viennacl::ocl::get_queue().finish();   // blocks until all commands in the active OpenCL queue are done
+#endif
+    }
+
+
+
+
+    // Requirements for backend:
+
+    // ---- Memory ----
+    //
+    // * memory_create(size, host_ptr)
+    // * memory_copy(src, dest, offset_src, offset_dest, size)
+    // * memory_write(src, offset, size, ptr)
+    // * memory_read(src, offset, size, ptr)
+    //
+
+    /** @brief Creates an array of the specified size. If the second argument is provided, the buffer is initialized with data from that pointer.
+    *
+    * This is the generic version for CPU RAM, CUDA, and OpenCL. Creates the memory in the currently active memory domain.
+    *
+    * @param handle          The generic wrapper handle for multiple memory domains which will hold the new buffer.
+    * @param size_in_bytes   Number of bytes to allocate
+    * @param ctx             Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+    * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+    *
+    */
+    inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
+    {
+      // Requesting zero bytes is a no-op: the handle is left untouched.
+      if (size_in_bytes > 0)
+      {
+        // A fresh handle adopts the memory domain of the supplied context.
+        if (handle.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+          handle.switch_active_handle_id(ctx.memory_type());
+
+        // Dispatch to the backend matching the (now fixed) active domain:
+        switch(handle.get_active_handle_id())
+        {
+          case MAIN_MEMORY:
+            handle.ram_handle() = cpu_ram::memory_create(size_in_bytes, host_ptr);
+            handle.raw_size(size_in_bytes);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case OPENCL_MEMORY:
+            handle.opencl_handle().context(ctx.opencl_context());
+            handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), size_in_bytes, host_ptr);
+            handle.raw_size(size_in_bytes);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case CUDA_MEMORY:
+            handle.cuda_handle() = cuda::memory_create(size_in_bytes, host_ptr);
+            handle.raw_size(size_in_bytes);
+            break;
+#endif
+          case MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("unknown memory handle!");
+        }
+      }
+    }
+
+    /*
+    inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+    {
+      viennacl::context  ctx(default_memory_type());
+      memory_create(handle, size_in_bytes, ctx, host_ptr);
+    }*/
+
+
+    /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+    *
+    * This is the generic version for CPU RAM, CUDA, and OpenCL. Copies the memory in the currently active memory domain.
+    *
+    *
+    *  @param src_buffer     A smart pointer to the beginning of an allocated buffer
+    *  @param dst_buffer     A smart pointer to the beginning of an allocated buffer
+    *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+    *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+    *  @param bytes_to_copy  Number of bytes to be copied
+    */
+    inline void memory_copy(mem_handle const & src_buffer,
+                            mem_handle & dst_buffer,
+                            vcl_size_t src_offset,
+                            vcl_size_t dst_offset,
+                            vcl_size_t bytes_to_copy)
+    {
+      // Cross-domain copies (e.g. host buffer -> OpenCL buffer) are not handled here.
+      assert( src_buffer.get_active_handle_id() == dst_buffer.get_active_handle_id() && bool("memory_copy() must be called on buffers from the same domain") );
+
+      if (bytes_to_copy > 0)
+      {
+        switch(src_buffer.get_active_handle_id())
+        {
+          case MAIN_MEMORY:
+            cpu_ram::memory_copy(src_buffer.ram_handle(), dst_buffer.ram_handle(), src_offset, dst_offset, bytes_to_copy);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case OPENCL_MEMORY:
+            opencl::memory_copy(src_buffer.opencl_handle(), dst_buffer.opencl_handle(), src_offset, dst_offset, bytes_to_copy);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case CUDA_MEMORY:
+            cuda::memory_copy(src_buffer.cuda_handle(), dst_buffer.cuda_handle(), src_offset, dst_offset, bytes_to_copy);
+            break;
+#endif
+          case MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("unknown memory handle!");
+        }
+      }
+    }
+
+    // TODO: Refine this concept. Maybe move to constructor?
+    /** @brief A 'shallow' copy operation from an initialized buffer to an uninitialized buffer.
+     * The uninitialized buffer just copies the raw handle.
+     *
+     * @param src_buffer   Initialized source handle; its active buffer is shared (reference-counted), not duplicated
+     * @param dst_buffer   Destination handle; must be uninitialized (asserted below)
+     */
+    inline void memory_shallow_copy(mem_handle const & src_buffer,
+                                    mem_handle & dst_buffer)
+    {
+      assert( (dst_buffer.get_active_handle_id() == MEMORY_NOT_INITIALIZED) && bool("Shallow copy on already initialized memory not supported!"));
+
+      switch(src_buffer.get_active_handle_id())
+      {
+        case MAIN_MEMORY:
+          dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+          dst_buffer.ram_handle() = src_buffer.ram_handle();
+          dst_buffer.raw_size(src_buffer.raw_size());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case OPENCL_MEMORY:
+          dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+          dst_buffer.opencl_handle() = src_buffer.opencl_handle();
+          dst_buffer.raw_size(src_buffer.raw_size());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case CUDA_MEMORY:
+          dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+          dst_buffer.cuda_handle() = src_buffer.cuda_handle();
+          dst_buffer.raw_size(src_buffer.raw_size());
+          break;
+#endif
+        case MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("unknown memory handle!");
+      }
+    }
+
+    /** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+    *
+    * This is the generic version for CPU RAM, CUDA, and OpenCL. Writes the memory in the currently active memory domain.
+    *
+    * @param dst_buffer     A smart pointer to the beginning of an allocated buffer
+    * @param dst_offset     Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+    * @param bytes_to_write Number of bytes to be written
+    * @param ptr            Pointer to the first byte to be written
+    * @param async          Whether the operation should be asynchronous
+    */
+    inline void memory_write(mem_handle & dst_buffer,
+                             vcl_size_t dst_offset,
+                             vcl_size_t bytes_to_write,
+                             const void * ptr,
+                             bool async = false)
+    {
+      // Writing zero bytes is a no-op.
+      if (bytes_to_write > 0)
+      {
+        switch(dst_buffer.get_active_handle_id())
+        {
+          case MAIN_MEMORY:
+            cpu_ram::memory_write(dst_buffer.ram_handle(), dst_offset, bytes_to_write, ptr, async);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case OPENCL_MEMORY:
+            opencl::memory_write(dst_buffer.opencl_handle(), dst_offset, bytes_to_write, ptr, async);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case CUDA_MEMORY:
+            cuda::memory_write(dst_buffer.cuda_handle(), dst_offset, bytes_to_write, ptr, async);
+            break;
+#endif
+          case MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("unknown memory handle!");
+        }
+      }
+    }
+
+    /** @brief Reads data from a buffer back to main RAM.
+    *
+    * This is the generic version for CPU RAM, CUDA, and OpenCL. Reads the memory from the currently active memory domain.
+    *
+    * @param src_buffer         A smart pointer to the beginning of an allocated source buffer
+    * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+    * @param bytes_to_read      Number of bytes to be read
+    * @param ptr                Location in main RAM where the read data should be written to
+    * @param async              Whether the operation should be asynchronous
+    */
+    inline void memory_read(mem_handle const & src_buffer,
+                            vcl_size_t src_offset,
+                            vcl_size_t bytes_to_read,
+                            void * ptr,
+                            bool async = false)
+    {
+      //finish(); //Fixes some issues with AMD APP SDK. However, might sacrifice a few percents of performance in some cases.
+
+      // Reading zero bytes is a no-op.
+      if (bytes_to_read > 0)
+      {
+        switch(src_buffer.get_active_handle_id())
+        {
+          case MAIN_MEMORY:
+            cpu_ram::memory_read(src_buffer.ram_handle(), src_offset, bytes_to_read, ptr, async);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case OPENCL_MEMORY:
+            opencl::memory_read(src_buffer.opencl_handle(), src_offset, bytes_to_read, ptr, async);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case CUDA_MEMORY:
+            cuda::memory_read(src_buffer.cuda_handle(), src_offset, bytes_to_read, ptr, async);
+            break;
+#endif
+          case MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("unknown memory handle!");
+        }
+      }
+    }
+
+
+
+    namespace detail
+    {
+      /** @brief Returns the size (in bytes) of one element of type T when stored in the given memory domain.
+       *
+       * Generic case: the host size, sizeof(T). The specializations below account for
+       * integer types whose size on an OpenCL device (cl_ulong, cl_long, cl_uint, cl_int)
+       * may differ from the corresponding host type.
+       */
+      template <typename T>
+      vcl_size_t element_size(memory_types /* mem_type */)
+      {
+        return sizeof(T);
+      }
+
+
+      // OpenCL devices use cl_ulong, whose size may differ from the host's 'unsigned long'.
+      template <>
+      inline vcl_size_t element_size<unsigned long>(memory_types
+#ifdef VIENNACL_WITH_OPENCL
+                                                      mem_type  //in order to compile cleanly at -Wextra in GCC
+#endif
+                                                    )
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        if (mem_type == OPENCL_MEMORY)
+          return sizeof(cl_ulong);
+#endif
+        return sizeof(unsigned long);
+      }
+
+      // OpenCL devices use cl_long, whose size may differ from the host's 'long'.
+      template <>
+      inline vcl_size_t element_size<long>(memory_types
+#ifdef VIENNACL_WITH_OPENCL
+                                                      mem_type  //in order to compile cleanly at -Wextra in GCC
+#endif
+                                           )
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        if (mem_type == OPENCL_MEMORY)
+          return sizeof(cl_long);
+#endif
+        return sizeof(long);
+      }
+
+
+      // OpenCL devices use cl_uint, whose size may differ from the host's 'unsigned int'.
+      template <>
+      inline vcl_size_t element_size<unsigned int>(memory_types
+#ifdef VIENNACL_WITH_OPENCL
+                                                      mem_type  //in order to compile cleanly at -Wextra in GCC
+#endif
+                                                   )
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        if (mem_type == OPENCL_MEMORY)
+          return sizeof(cl_uint);
+#endif
+        return sizeof(unsigned int);
+      }
+
+      // OpenCL devices use cl_int, whose size may differ from the host's 'int'.
+      template <>
+      inline vcl_size_t element_size<int>(memory_types
+#ifdef VIENNACL_WITH_OPENCL
+                                           mem_type  //in order to compile cleanly at -Wextra in GCC
+#endif
+                                          )
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        if (mem_type == OPENCL_MEMORY)
+          return sizeof(cl_int);
+#endif
+        return sizeof(int);
+      }
+
+
+    }
+
+
+    /** @brief Switches the active memory domain within a memory handle. Data is copied if the new active domain differs from the old one. Memory in the source handle is not free'd. */
+    template <typename DataType>
+    void switch_memory_context(mem_handle & handle, viennacl::context new_ctx)
+    {
+      if (handle.get_active_handle_id() == new_ctx.memory_type())
+        return;
+
+      if (handle.get_active_handle_id() == viennacl::MEMORY_NOT_INITIALIZED || handle.raw_size() == 0)
+      {
+        handle.switch_active_handle_id(new_ctx.memory_type());
+#ifdef VIENNACL_WITH_OPENCL
+        if (new_ctx.memory_type() == OPENCL_MEMORY)
+          handle.opencl_handle().context(new_ctx.opencl_context());
+#endif
+        return;
+      }
+
+      vcl_size_t size_dst = detail::element_size<DataType>(handle.get_active_handle_id());
+      vcl_size_t size_src = detail::element_size<DataType>(new_ctx.memory_type());
+
+      if (size_dst != size_src)  // OpenCL data element size not the same as host data element size
+      {
+        throw "Heterogeneous data element sizes not yet supported!";
+      }
+      else //no data conversion required
+      {
+        if (handle.get_active_handle_id() == MAIN_MEMORY) //we can access the existing data directly
+        {
+          switch (new_ctx.memory_type())
+          {
+#ifdef VIENNACL_WITH_OPENCL
+            case OPENCL_MEMORY:
+              handle.opencl_handle().context(new_ctx.opencl_context());
+              handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), handle.raw_size(), handle.ram_handle().get());
+              break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+            case CUDA_MEMORY:
+              handle.cuda_handle() = cuda::memory_create(handle.raw_size(), handle.ram_handle().get());
+              break;
+#endif
+            case MAIN_MEMORY:
+            default:
+              throw "Invalid destination domain";
+          }
+        }
+#ifdef VIENNACL_WITH_OPENCL
+        else if (handle.get_active_handle_id() == OPENCL_MEMORY) // data can be dumped into destination directly
+        {
+          std::vector<DataType> buffer;
+
+          switch (new_ctx.memory_type())
+          {
+            case MAIN_MEMORY:
+              handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+              opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+              break;
+  #ifdef VIENNACL_WITH_CUDA
+            case CUDA_MEMORY:
+              buffer.resize(handle.raw_size() / sizeof(DataType));
+              opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), &(buffer[0]));
+              cuda::memory_create(handle.cuda_handle(), handle.raw_size(), &(buffer[0]));
+              break;
+  #endif
+            default:
+              throw "Invalid destination domain";
+          }
+        }
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        else //CUDA_MEMORY
+        {
+          std::vector<DataType> buffer;
+
+          // write
+          switch (new_ctx.memory_type())
+          {
+            case MAIN_MEMORY:
+              handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+              cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+              break;
+  #ifdef VIENNACL_WITH_OPENCL
+            case OPENCL_MEMORY:
+              buffer.resize(handle.raw_size() / sizeof(DataType));
+              cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), &(buffer[0]));
+              handle.opencl_handle() = opencl::memory_create(handle.raw_size(), &(buffer[0]));
+              break;
+  #endif
+            default:
+              throw "Unsupported source memory domain";
+          }
+        }
+#endif
+
+        // everything succeeded so far, now switch to new domain:
+        handle.switch_active_handle_id(new_ctx.memory_type());
+
+      } // no data conversion
+    }
+
+
+
+    /** @brief Copies data of the provided 'DataType' from 'handle_src' to 'handle_dst' and converts the data if the binary representation of 'DataType' among the memory domains differs. */
+    template <typename DataType>
+    void typesafe_memory_copy(mem_handle const & handle_src, mem_handle & handle_dst)
+    {
+      if (handle_dst.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+        handle_dst.switch_active_handle_id(default_memory_type());
+
+      vcl_size_t element_size_src = detail::element_size<DataType>(handle_src.get_active_handle_id());
+      vcl_size_t element_size_dst = detail::element_size<DataType>(handle_dst.get_active_handle_id());
+
+      if (element_size_src != element_size_dst)
+      {
+        // Data needs to be converted.
+
+        typesafe_host_array<DataType> buffer_src(handle_src);
+        typesafe_host_array<DataType> buffer_dst(handle_dst, handle_src.raw_size() / element_size_src);
+
+        //
+        // Step 1: Fill buffer_dst depending on where the data resides:
+        //
+        DataType const * src_data;
+        switch (handle_src.get_active_handle_id())
+        {
+          case MAIN_MEMORY:
+            src_data = reinterpret_cast<DataType const *>(handle_src.ram_handle().get());
+            for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+              buffer_dst.set(i, src_data[i]);
+            break;
+
+#ifdef VIENNACL_WITH_OPENCL
+          case OPENCL_MEMORY:
+            buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+            opencl::memory_read(handle_src.opencl_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+            for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+              buffer_dst.set(i, buffer_src[i]);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case CUDA_MEMORY:
+            buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+            cuda::memory_read(handle_src.cuda_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+            for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+              buffer_dst.set(i, buffer_src[i]);
+            break;
+#endif
+
+          default:
+            throw "unsupported memory domain";
+        }
+
+        //
+        // Step 2: Write to destination
+        //
+        if (handle_dst.raw_size() == buffer_dst.raw_size())
+          viennacl::backend::memory_write(handle_dst, 0, buffer_dst.raw_size(), buffer_dst.get());
+        else
+          viennacl::backend::memory_create(handle_dst, buffer_dst.raw_size(), viennacl::traits::context(handle_dst), buffer_dst.get());
+
+      }
+      else
+      {
+        // No data conversion required.
+        typesafe_host_array<DataType> buffer(handle_src);
+
+        switch (handle_src.get_active_handle_id())
+        {
+          case MAIN_MEMORY:
+            switch (handle_dst.get_active_handle_id())
+            {
+              case MAIN_MEMORY:
+              case OPENCL_MEMORY:
+              case CUDA_MEMORY:
+                if (handle_dst.raw_size() == handle_src.raw_size())
+                  viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), handle_src.ram_handle().get());
+                else
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst), handle_src.ram_handle().get());
+                break;
+
+              default:
+                throw "unsupported destination memory domain";
+            }
+            break;
+
+          case OPENCL_MEMORY:
+            switch (handle_dst.get_active_handle_id())
+            {
+              case MAIN_MEMORY:
+                if (handle_dst.raw_size() != handle_src.raw_size())
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+                viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+                break;
+
+              case OPENCL_MEMORY:
+                if (handle_dst.raw_size() != handle_src.raw_size())
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+                viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+                break;
+
+              case CUDA_MEMORY:
+                if (handle_dst.raw_size() != handle_src.raw_size())
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+                buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+                viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+                viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+                break;
+
+              default:
+                throw "unsupported destination memory domain";
+            }
+            break;
+
+          case CUDA_MEMORY:
+            switch (handle_dst.get_active_handle_id())
+            {
+              case MAIN_MEMORY:
+                if (handle_dst.raw_size() != handle_src.raw_size())
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+                viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+                break;
+
+              case OPENCL_MEMORY:
+                if (handle_dst.raw_size() != handle_src.raw_size())
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+                buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+                viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+                viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+                break;
+
+              case CUDA_MEMORY:
+                if (handle_dst.raw_size() != handle_src.raw_size())
+                  viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+                viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+                break;
+
+              default:
+                throw "unsupported destination memory domain";
+            }
+            break;
+
+          default:
+            throw "unsupported source memory domain";
+        }
+
+      }
+    }
+
+
+  } //backend
+
+
+  //
+  // Convenience layer:
+  //
+
+  /** @brief Generic convenience routine for migrating data of an object to a new memory domain */
+  template <typename T>
+  void switch_memory_context(T & obj, viennacl::context new_ctx)
+  {
+    obj.switch_memory_context(new_ctx);
+  }
+
+} //viennacl
+#endif
diff --git a/viennacl/backend/opencl.hpp b/viennacl/backend/opencl.hpp
new file mode 100644
index 0000000..849f1ae
--- /dev/null
+++ b/viennacl/backend/opencl.hpp
@@ -0,0 +1,146 @@
+#ifndef VIENNACL_BACKEND_OPENCL_HPP_
+#define VIENNACL_BACKEND_OPENCL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/opencl.hpp
+    @brief Implementations for the OpenCL backend functionality
+*/
+
+
+#include <vector>
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+namespace viennacl
+{
+  namespace backend
+  {
+    namespace opencl
+    {
+
+      // Requirements for backend:
+
+      // * memory_create(size, host_ptr)
+      // * memory_copy(src, dest, offset_src, offset_dest, size)
+      // * memory_write_from_main_memory(src, offset, size,
+      //                                 dest, offset, size)
+      // * memory_read_to_main_memory(src, offset, size
+      //                              dest, offset, size)
+      // *
+      //
+
+      /** @brief Creates an array of the specified size in the current OpenCL context. If a host pointer is provided, the buffer is initialized with data from that pointer.
+       *
+       * @param size_in_bytes   Number of bytes to allocate
+       * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+       * @param ctx             OpenCL context in which the buffer is created
+       *
+       */
+      inline cl_mem memory_create(viennacl::ocl::context const & ctx, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+      {
+        //std::cout << "Creating buffer (" << size_in_bytes << " bytes) host buffer " << host_ptr << " in context " << &ctx << std::endl;
+        return ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, static_cast<unsigned int>(size_in_bytes), const_cast<void *>(host_ptr));
+      }
+
+      /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' in the OpenCL context to memory starting at address 'dst_buffer + dst_offset' in the same OpenCL context.
+       *
+       *  @param src_buffer     A smart pointer to the begin of an allocated OpenCL buffer
+       *  @param dst_buffer     A smart pointer to the beginning of an allocated OpenCL buffer
+       *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+       *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+       *  @param bytes_to_copy  Number of bytes to be copied
+       */
+      inline void memory_copy(viennacl::ocl::handle<cl_mem> const & src_buffer,
+                       viennacl::ocl::handle<cl_mem> & dst_buffer,
+                       vcl_size_t src_offset,
+                       vcl_size_t dst_offset,
+                       vcl_size_t bytes_to_copy)
+      {
+        assert( &src_buffer.context() == &dst_buffer.context() && bool("Transfer between memory buffers in different contexts not supported yet!"));
+
+        viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+        cl_int err = clEnqueueCopyBuffer(memory_context.get_queue().handle().get(),
+                                         src_buffer.get(),
+                                         dst_buffer.get(),
+                                         src_offset,
+                                         dst_offset,
+                                         bytes_to_copy,
+                                         0, NULL, NULL);  //events
+        VIENNACL_ERR_CHECK(err);
+      }
+
+
+      /** @brief Writes data from main RAM identified by 'ptr' to the OpenCL buffer identified by 'dst_buffer'
+       *
+       * @param dst_buffer    A smart pointer to the beginning of an allocated OpenCL buffer
+       * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+       * @param bytes_to_copy Number of bytes to be copied
+       * @param ptr           Pointer to the first byte to be written
+       * @param async         Whether the operation should be asynchronous
+       */
+      inline void memory_write(viennacl::ocl::handle<cl_mem> & dst_buffer,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy,
+                        const void * ptr,
+                        bool async = false)
+      {
+        //std::cout << "Writing data (" << bytes_to_copy << " bytes, offset " << dst_offset << ") to OpenCL buffer" << std::endl;
+        viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(dst_buffer.context());
+        cl_int err = clEnqueueWriteBuffer(memory_context.get_queue().handle().get(),
+                                          dst_buffer.get(),
+                                          async ? CL_FALSE : CL_TRUE,             //blocking
+                                          dst_offset,
+                                          bytes_to_copy,
+                                          ptr,
+                                          0, NULL, NULL);      //events
+        VIENNACL_ERR_CHECK(err);
+      }
+
+
+      /** @brief Reads data from an OpenCL buffer back to main RAM.
+       *
+       * @param src_buffer         A smart pointer to the beginning of an allocated OpenCL source buffer
+       * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+       * @param bytes_to_copy      Number of bytes to be read
+       * @param ptr                Location in main RAM where the read data should be written to
+       * @param async         Whether the operation should be asynchronous
+       */
+      inline void memory_read(viennacl::ocl::handle<cl_mem> const & src_buffer,
+                       vcl_size_t src_offset,
+                       vcl_size_t bytes_to_copy,
+                       void * ptr,
+                       bool async = false)
+      {
+        //std::cout << "Reading data (" << bytes_to_copy << " bytes, offset " << src_offset << ") from OpenCL buffer " << src_buffer.get() << " to " << ptr << std::endl;
+        viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+        cl_int err =  clEnqueueReadBuffer(memory_context.get_queue().handle().get(),
+                                          src_buffer.get(),
+                                          async ? CL_FALSE : CL_TRUE,             //blocking
+                                          src_offset,
+                                          bytes_to_copy,
+                                          ptr,
+                                          0, NULL, NULL);      //events
+        VIENNACL_ERR_CHECK(err);
+      }
+
+
+    }
+  } //backend
+} //viennacl
+#endif
diff --git a/viennacl/backend/util.hpp b/viennacl/backend/util.hpp
new file mode 100644
index 0000000..8146e57
--- /dev/null
+++ b/viennacl/backend/util.hpp
@@ -0,0 +1,280 @@
+#ifndef VIENNACL_BACKEND_UTIL_HPP
+#define VIENNACL_BACKEND_UTIL_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/util.hpp
+    @brief Helper functionality for working with different memory domains
+*/
+
+#include <vector>
+#include <cassert>
+
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/backend/opencl.hpp"
+#endif
+
+
+namespace viennacl
+{
+  namespace backend
+  {
+
+
+
+    namespace detail
+    {
+
+      /** @brief Helper struct for converting a type to its OpenCL pendant. */
+      template <typename T>
+      struct convert_to_opencl
+      {
+        typedef T    type;
+        enum { special = 0 };
+      };
+
+#ifdef VIENNACL_WITH_OPENCL
+      template <>
+      struct convert_to_opencl<unsigned int>
+      {
+        typedef cl_uint    type;
+        //enum { special = (sizeof(unsigned int) != sizeof(cl_uint)) };
+        enum { special = 1 };
+      };
+
+      template <>
+      struct convert_to_opencl<int>
+      {
+        typedef cl_int    type;
+        //enum { special = (sizeof(int) != sizeof(cl_int)) };
+        enum { special = 1 };
+      };
+
+
+      template <>
+      struct convert_to_opencl<unsigned long>
+      {
+        typedef cl_ulong    type;
+        //enum { special = (sizeof(unsigned long) != sizeof(cl_ulong)) };
+        enum { special = 1 };
+      };
+
+      template <>
+      struct convert_to_opencl<long>
+      {
+        typedef cl_long    type;
+        //enum { special = (sizeof(long) != sizeof(cl_long)) };
+        enum { special = 1 };
+      };
+#endif
+
+
+    } //namespace detail
+
+
+    /** @brief Helper class implementing an array on the host. Default case: No conversion necessary */
+    template <typename T, bool special = detail::convert_to_opencl<T>::special>
+    class typesafe_host_array
+    {
+        typedef T                                              cpu_type;
+        typedef typename detail::convert_to_opencl<T>::type    target_type;
+
+      public:
+        explicit typesafe_host_array() : bytes_buffer_(NULL), buffer_size_(0) {}
+
+        explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+        {
+          resize(handle, num);
+        }
+
+        ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+        //
+        // Resize functionality
+        //
+
+        /** @brief Resize without initializing the new memory */
+        void raw_resize(mem_handle const & /*handle*/, vcl_size_t num)
+        {
+          buffer_size_ = sizeof(cpu_type) * num;
+
+          if (num > 0)
+          {
+            if (bytes_buffer_)
+              delete[] bytes_buffer_;
+
+            bytes_buffer_ = new char[buffer_size_];
+          }
+        }
+
+        /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+        void resize(mem_handle const & handle, vcl_size_t num)
+        {
+          raw_resize(handle, num);
+
+          if (num > 0)
+          {
+            for (vcl_size_t i=0; i<buffer_size_; ++i)
+              bytes_buffer_[i] = 0;
+          }
+        }
+
+        //
+        // Setter and Getter
+        //
+
+        template <typename U>
+        void set(vcl_size_t index, U value)
+        {
+          reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+        }
+
+        void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+        cpu_type operator[](vcl_size_t index) const
+        {
+          assert(index < size() && bool("index out of bounds"));
+
+          return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+        }
+
+        vcl_size_t raw_size() const { return buffer_size_; }
+        vcl_size_t element_size() const
+        {
+          return sizeof(cpu_type);
+        }
+        vcl_size_t size() const { return buffer_size_ / element_size(); }
+
+      private:
+        char * bytes_buffer_;
+        vcl_size_t buffer_size_;
+    };
+
+
+
+
+    /** @brief Special host array type for conversion between OpenCL types and pure CPU types */
+    template <typename T>
+    class typesafe_host_array<T, true>
+    {
+        typedef T                                              cpu_type;
+        typedef typename detail::convert_to_opencl<T>::type    target_type;
+
+      public:
+        explicit typesafe_host_array() : convert_to_opencl_( (default_memory_type() == OPENCL_MEMORY) ? true : false), bytes_buffer_(NULL), buffer_size_(0) {}
+
+        explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : convert_to_opencl_(false), bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+        {
+          resize(handle, num);
+        }
+
+        ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+        //
+        // Resize functionality
+        //
+
+        /** @brief Resize without initializing the new memory */
+        void raw_resize(mem_handle const & handle, vcl_size_t num)
+        {
+          buffer_size_ = sizeof(cpu_type) * num;
+          (void)handle; //silence unused variable warning if compiled without OpenCL support
+
+#ifdef VIENNACL_WITH_OPENCL
+          memory_types mem_type = handle.get_active_handle_id();
+          if (mem_type == MEMORY_NOT_INITIALIZED)
+            mem_type = default_memory_type();
+
+          if (mem_type == OPENCL_MEMORY)
+          {
+            convert_to_opencl_ = true;
+            buffer_size_ = sizeof(target_type) * num;
+          }
+#endif
+
+          if (num > 0)
+          {
+            if (bytes_buffer_)
+              delete[] bytes_buffer_;
+
+            bytes_buffer_ = new char[buffer_size_];
+          }
+        }
+
+        /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+        void resize(mem_handle const & handle, vcl_size_t num)
+        {
+          raw_resize(handle, num);
+
+          if (num > 0)
+          {
+            for (vcl_size_t i=0; i<buffer_size_; ++i)
+              bytes_buffer_[i] = 0;
+          }
+        }
+
+        //
+        // Setter and Getter
+        //
+
+        template <typename U>
+        void set(vcl_size_t index, U value)
+        {
+#ifdef VIENNACL_WITH_OPENCL
+          if (convert_to_opencl_)
+            reinterpret_cast<target_type *>(bytes_buffer_)[index] = static_cast<target_type>(value);
+          else
+#endif
+            reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+        }
+
+        void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+        cpu_type operator[](vcl_size_t index) const
+        {
+          assert(index < size() && bool("index out of bounds"));
+#ifdef VIENNACL_WITH_OPENCL
+          if (convert_to_opencl_)
+            return static_cast<cpu_type>(reinterpret_cast<target_type *>(bytes_buffer_)[index]);
+#endif
+          return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+        }
+
+        vcl_size_t raw_size() const { return buffer_size_; }
+        vcl_size_t element_size() const
+        {
+#ifdef VIENNACL_WITH_OPENCL
+          if (convert_to_opencl_)
+            return sizeof(target_type);
+#endif
+          return sizeof(cpu_type);
+        }
+        vcl_size_t size() const { return buffer_size_ / element_size(); }
+
+      private:
+        bool convert_to_opencl_;
+        char * bytes_buffer_;
+        vcl_size_t buffer_size_;
+    };
+
+  } //backend
+
+
+} //viennacl
+#endif
diff --git a/viennacl/circulant_matrix.hpp b/viennacl/circulant_matrix.hpp
index c029ede..30fe0a1 100644
--- a/viennacl/circulant_matrix.hpp
+++ b/viennacl/circulant_matrix.hpp
@@ -1,35 +1,36 @@
-#ifndef _VIENNACL_CIRCULANT_MATRIX_HPP
-#define _VIENNACL_CIRCULANT_MATRIX_HPP
+#ifndef VIENNACL_CIRCULANT_MATRIX_HPP
+#define VIENNACL_CIRCULANT_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file circulant_matrix.hpp
-    @brief Implementation of the circulant_matrix class for efficient manipulation of circulant matrices.  Experimental in 1.2.x.
+    @brief Implementation of the circulant_matrix class for efficient manipulation of circulant matrices.  Experimental.
 */
 
 #include "viennacl/forwards.h"
 #include "viennacl/vector.hpp"
-#include "viennacl/ocl/context.hpp"
+#include "viennacl/ocl/backend.hpp"
 
 #include "viennacl/linalg/circulant_matrix_operations.hpp"
 
 #include "viennacl/fft.hpp"
 
-namespace viennacl 
+namespace viennacl
 {
     /** @brief A Circulant matrix class
     *
@@ -39,15 +40,15 @@ namespace viennacl
     template<class SCALARTYPE, unsigned int ALIGNMENT>
     class circulant_matrix
     {
-      public:     
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+
         /**
          * @brief The default constructor. Does not allocate any memory.
          *
          */
-        explicit circulant_matrix()
-        {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-        }
+        explicit circulant_matrix() {}
 
         /**
          * @brief         Creates the matrix with the given size
@@ -55,10 +56,10 @@ namespace viennacl
          * @param rows      Number of rows of the matrix
          * @param cols      Number of columns of the matrix
          */
-        explicit circulant_matrix(std::size_t rows, std::size_t cols) : elements_(rows)
+        explicit circulant_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows)
         {
-          assert(rows == cols && "Circulant matrix must be square!");
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
+          assert(rows == cols && bool("Circulant matrix must be square!"));
+          (void)cols;  // avoid 'unused parameter' warning in optimized builds
         }
 
         /** @brief Resizes the matrix.
@@ -67,7 +68,7 @@ namespace viennacl
         * @param sz         New size of matrix
         * @param preserve   If true, existing values are preserved.
         */
-        void resize(size_t sz, bool preserve = true)
+        void resize(vcl_size_t sz, bool preserve = true)
         {
             elements_.resize(sz, preserve);
         }
@@ -76,7 +77,7 @@ namespace viennacl
         *
         *   @return OpenCL handle
         */
-        viennacl::ocl::handle<cl_mem> handle() const { return elements_.handle(); }
+        handle_type const & handle() const { return elements_.handle(); }
 
         /**
          * @brief Returns an internal viennacl::vector, which represents a circulant matrix elements
@@ -88,19 +89,19 @@ namespace viennacl
         /**
          * @brief Returns the number of rows of the matrix
          */
-        std::size_t size1() const { return elements_.size(); }
-        
+        vcl_size_t size1() const { return elements_.size(); }
+
         /**
          * @brief Returns the number of columns of the matrix
          */
-        std::size_t size2() const { return elements_.size(); }
+        vcl_size_t size2() const { return elements_.size(); }
 
         /** @brief Returns the internal size of matrix representtion.
         *   Usually required for launching OpenCL kernels only
         *
         *   @return Internal size of matrix representation
         */
-        std::size_t internal_size() const { return elements_.internal_size(); }
+        vcl_size_t internal_size() const { return elements_.internal_size(); }
 
         /**
          * @brief Read-write access to a single element of the matrix
@@ -109,14 +110,14 @@ namespace viennacl
          * @param col_index  Column index of accessed element
          * @return Proxy for matrix entry
          */
-        entry_proxy<SCALARTYPE> operator()(std::size_t row_index, std::size_t col_index)
+        entry_proxy<SCALARTYPE> operator()(vcl_size_t row_index, vcl_size_t col_index)
         {
-            int index = static_cast<int>(row_index) - static_cast<int>(col_index);
+            long index = static_cast<long>(row_index) - static_cast<long>(col_index);
+
+            assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
 
-            assert(row_index < size1() && col_index < size2() && "Invalid access");
-            
             while (index < 0)
-              index += size1();
+              index += static_cast<long>(size1());
             return elements_[index];
         }
 
@@ -133,9 +134,9 @@ namespace viennacl
         }
 
     private:
-        circulant_matrix(circulant_matrix const & t) {}
-        circulant_matrix & operator=(circulant_matrix const & t) {}
-      
+        circulant_matrix(circulant_matrix const &) {}
+        circulant_matrix & operator=(circulant_matrix const & t);
+
         viennacl::vector<SCALARTYPE, ALIGNMENT> elements_;
     };
 
@@ -148,7 +149,7 @@ namespace viennacl
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(std::vector<SCALARTYPE>& cpu_vec, circulant_matrix<SCALARTYPE, ALIGNMENT>& gpu_mat)
     {
-        assert(cpu_vec.size() == gpu_mat.size1() && "Size mismatch");
+        assert( (gpu_mat.size1() == 0 || cpu_vec.size() == gpu_mat.size1()) && bool("Size mismatch"));
         copy(cpu_vec, gpu_mat.elements());
     }
 
@@ -161,7 +162,7 @@ namespace viennacl
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(circulant_matrix<SCALARTYPE, ALIGNMENT>& gpu_mat, std::vector<SCALARTYPE>& cpu_vec)
     {
-        assert(cpu_vec.size() == gpu_mat.size1() && "Size mismatch");
+        assert(cpu_vec.size() == gpu_mat.size1() && bool("Size mismatch"));
         copy(gpu_mat.elements(), cpu_vec);
     }
 
@@ -173,17 +174,17 @@ namespace viennacl
     */
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
     void copy(circulant_matrix<SCALARTYPE, ALIGNMENT>& circ_src, MATRIXTYPE& com_dst) {
-        std::size_t size = circ_src.size1();
-        assert(size == com_dst.size1() && "Size mismatch");
-        assert(size == com_dst.size2() && "Size mismatch");
+        vcl_size_t size = circ_src.size1();
+        assert(size == viennacl::traits::size1(com_dst) && bool("Size mismatch"));
+        assert(size == viennacl::traits::size2(com_dst) && bool("Size mismatch"));
         std::vector<SCALARTYPE> tmp(size);
         copy(circ_src, tmp);
 
-        for (std::size_t i = 0; i < size; i++) {
-            for (std::size_t j = 0; j < size; j++) {
-                int index = static_cast<int>(i) - static_cast<int>(j);
+        for (vcl_size_t i = 0; i < size; i++) {
+            for (vcl_size_t j = 0; j < size; j++) {
+                long index = static_cast<long>(i) - static_cast<long>(j);
                 if (index < 0)
-                  index = size + index;
+                  index = static_cast<long>(size + index);
                 com_dst(i, j) = tmp[index];
             }
         }
@@ -196,14 +197,16 @@ namespace viennacl
     * @param circ_dst   A circulant_matrix from ViennaCL
     */
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
-    void copy(MATRIXTYPE& com_src, circulant_matrix<SCALARTYPE, ALIGNMENT>& circ_dst) {
-        std::size_t size = circ_dst.size1();
-        assert(size == com_src.size1() && "Size mismatch");
-        assert(size == com_src.size2() && "Size mismatch");
+    void copy(MATRIXTYPE& com_src, circulant_matrix<SCALARTYPE, ALIGNMENT>& circ_dst)
+    {
+        assert( (circ_dst.size1() == 0 || circ_dst.size1() == viennacl::traits::size1(com_src)) && bool("Size mismatch"));
+        assert( (circ_dst.size2() == 0 || circ_dst.size2() == viennacl::traits::size2(com_src)) && bool("Size mismatch"));
+
+        vcl_size_t size = viennacl::traits::size1(com_src);
 
         std::vector<SCALARTYPE> tmp(size);
 
-        for(std::size_t i = 0; i < size; i++) tmp[i] = com_src(i, 0);
+        for(vcl_size_t i = 0; i < size; i++) tmp[i] = com_src(i, 0);
 
         copy(tmp, circ_dst);
     }
@@ -234,16 +237,16 @@ namespace viennacl
     template<class SCALARTYPE, unsigned int ALIGNMENT>
     std::ostream & operator<<(std::ostream& s, circulant_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix)
     {
-        std::size_t size = gpu_matrix.size1();
+        vcl_size_t size = gpu_matrix.size1();
         std::vector<SCALARTYPE> tmp(size);
         copy(gpu_matrix, tmp);
         s << "[" << size << "," << size << "](";
 
-        for(std::size_t i = 0; i < size; i++) {
+        for(vcl_size_t i = 0; i < size; i++) {
             s << "(";
-            for(std::size_t j = 0; j < size; j++) {
-                int index = (int)i - (int)j;
-                if(index < 0) index = size + index;
+            for(vcl_size_t j = 0; j < size; j++) {
+                long index = static_cast<long>(i) - static_cast<long>(j);
+                if(index < 0) index = static_cast<long>(size) + index;
                 s << tmp[index];
                 //s << index;
                 if(j < (size - 1)) s << ",";
@@ -253,6 +256,99 @@ namespace viennacl
         s << ")";
         return s;
     }
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+      } // namespace detail
+    } // namespace linalg
+
+    /** \endcond */
 }
 
-#endif // _VIENNACL_CIRCULANT_MATRIX_HPP
+#endif // VIENNACL_CIRCULANT_MATRIX_HPP
diff --git a/viennacl/compressed_compressed_matrix.hpp b/viennacl/compressed_compressed_matrix.hpp
new file mode 100644
index 0000000..a420d79
--- /dev/null
+++ b/viennacl/compressed_compressed_matrix.hpp
@@ -0,0 +1,588 @@
+#ifndef VIENNACL_COMPRESSED_compressed_compressed_matrix_HPP_
+#define VIENNACL_COMPRESSED_compressed_compressed_matrix_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/compressed_compressed_matrix.hpp
+    @brief Implementation of the compressed_compressed_matrix class (CSR format with a relatively small number of nonzero rows)
+*/
+
+#include <vector>
+#include <list>
+#include <map>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+    namespace detail
+    {
+      template <typename CPU_MATRIX, typename SCALARTYPE>
+      void copy_impl(const CPU_MATRIX & cpu_matrix,
+                     compressed_compressed_matrix<SCALARTYPE> & gpu_matrix,
+                     vcl_size_t nonzero_rows,
+                     vcl_size_t nonzeros)
+      {
+        assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+        assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), nonzero_rows + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> row_indices(gpu_matrix.handle3(), nonzero_rows);
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), nonzeros);
+        std::vector<SCALARTYPE> elements(nonzeros);
+
+        vcl_size_t row_index  = 0;
+        vcl_size_t data_index = 0;
+
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+              row_it != cpu_matrix.end1();
+              ++row_it)
+        {
+          bool row_empty = true;
+
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+                col_it != row_it.end();
+                ++col_it)
+          {
+            SCALARTYPE entry = *col_it;
+            if (entry != SCALARTYPE(0))
+            {
+              if (row_empty)
+              {
+                assert(row_index < nonzero_rows && bool("Provided count of nonzero rows exceeded!"));
+
+                row_empty = false;
+                row_buffer.set(row_index, data_index);
+                row_indices.set(row_index, col_it.index1());
+                ++row_index;
+              }
+
+              col_buffer.set(data_index, col_it.index2());
+              elements[data_index] = entry;
+              ++data_index;
+            }
+          }
+        }
+        row_buffer.set(row_index, data_index);
+
+        gpu_matrix.set(row_buffer.get(),
+                       row_indices.get(),
+                       col_buffer.get(),
+                       &elements[0],
+                       cpu_matrix.size1(),
+                       cpu_matrix.size2(),
+                       nonzero_rows,
+                       nonzeros);
+      }
+    }
+
+    //provide copy-operation:
+    /** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+    *
+    * There are some type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
+    * - .size1() returns the number of rows
+    * - .size2() returns the number of columns
+    * - const_iterator1    is a type definition for an iterator along increasing row indices
+    * - const_iterator2    is a type definition for an iterator along increasing columns indices
+    * - The const_iterator1 type provides an iterator of type const_iterator2 via members .begin() and .end() that iterates along column indices in the current row.
+    * - The types const_iterator1 and const_iterator2 provide members functions .index1() and .index2() that return the current row and column indices respectively.
+    * - Dereferenciation of an object of type const_iterator2 returns the entry.
+    *
+    * @param cpu_matrix   A sparse matrix on the host.
+    * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE>
+    void copy(const CPU_MATRIX & cpu_matrix,
+              compressed_compressed_matrix<SCALARTYPE> & gpu_matrix )
+    {
+      //std::cout << "copy for (" << cpu_matrix.size1() << ", " << cpu_matrix.size2() << ", " << cpu_matrix.nnz() << ")" << std::endl;
+
+      if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+      {
+        //determine nonzero rows and total nonzeros:
+        vcl_size_t num_entries = 0;
+        vcl_size_t nonzero_rows = 0;
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+              row_it != cpu_matrix.end1();
+              ++row_it)
+        {
+          bool row_empty = true;
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+                col_it != row_it.end();
+                ++col_it)
+          {
+            if (*col_it != SCALARTYPE(0))
+            {
+              ++num_entries;
+
+              if (row_empty)
+              {
+                row_empty = false;
+                ++nonzero_rows;
+              }
+            }
+          }
+        }
+
+        if (num_entries == 0) //we copy an empty matrix
+          num_entries = 1;
+
+        //set up matrix entries:
+        detail::copy_impl(cpu_matrix, gpu_matrix, nonzero_rows, num_entries);
+      }
+    }
+
+
+    //adapted for std::vector< std::map < > > argument:
+    /** @brief Copies a sparse square matrix in the std::vector< std::map < > > format to an OpenCL device. Use viennacl::tools::sparse_matrix_adapter for non-square matrices.
+    *
+    * @param cpu_matrix   A sparse square matrix on the host using STL types
+    * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+    */
+    template <typename SizeType, typename SCALARTYPE>
+    void copy(const std::vector< std::map<SizeType, SCALARTYPE> > & cpu_matrix,
+              compressed_compressed_matrix<SCALARTYPE> & gpu_matrix )
+    {
+      vcl_size_t nonzero_rows = 0;
+      vcl_size_t nonzeros = 0;
+      vcl_size_t max_col = 0;
+      for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+      {
+        if (cpu_matrix[i].size() > 0)
+          ++nonzero_rows;
+        nonzeros += cpu_matrix[i].size();
+        if (cpu_matrix[i].size() > 0)
+          max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+      }
+
+      viennacl::detail::copy_impl(tools::const_sparse_matrix_adapter<SCALARTYPE, SizeType>(cpu_matrix, cpu_matrix.size(), max_col + 1),
+                                  gpu_matrix,
+                                  nonzero_rows,
+                                  nonzeros);
+    }
+
+
+    //
+    // gpu to cpu:
+    //
+    /** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+    *
+    * There are two type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
+    * - resize(rows, cols)  A resize function to bring the matrix into the correct size
+    * - operator(i,j)       Write new entries via the parenthesis operator
+    *
+    * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+    * @param cpu_matrix   A sparse matrix on the host.
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE>
+    void copy(const compressed_compressed_matrix<SCALARTYPE> & gpu_matrix,
+              CPU_MATRIX & cpu_matrix )
+    {
+      assert( (cpu_matrix.size1() == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (cpu_matrix.size2() == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+      {
+        //get raw data from memory:
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.nnz1() + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> row_indices(gpu_matrix.handle1(), gpu_matrix.nnz1());
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
+
+        //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+        viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle3(), 0, row_indices.raw_size(), row_indices.get());
+        viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(SCALARTYPE)* gpu_matrix.nnz(), &(elements[0]));
+
+        //fill the cpu_matrix:
+        vcl_size_t data_index = 0;
+        for (vcl_size_t i = 1; i < row_buffer.size(); ++i)
+        {
+          while (data_index < row_buffer[i])
+          {
+            if (col_buffer[data_index] >= gpu_matrix.size2())
+            {
+              std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
+              return;
+            }
+
+            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
+              cpu_matrix(row_indices[i-1], col_buffer[data_index]) = elements[data_index];
+            ++data_index;
+          }
+        }
+      }
+    }
+
+
+    /** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
+    *
+    * @param gpu_matrix   A compressed_compressed_matrix from ViennaCL
+    * @param cpu_matrix   A sparse matrix on the host.
+    */
+    template <typename SCALARTYPE>
+    void copy(const compressed_compressed_matrix<SCALARTYPE> & gpu_matrix,
+              std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
+    {
+      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, cpu_matrix.size(), cpu_matrix.size());
+      copy(gpu_matrix, temp);
+    }
+
+
+    //////////////////////// compressed_compressed_matrix //////////////////////////
+    /** @brief A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
+    *
+    * The difference to the 'standard' CSR format is that there is an additional array 'row_indices' so that the i-th set of indices in the CSR-layout refers to row_indices[i].
+    *
+    * @tparam SCALARTYPE    The floating point type (either float or double, checked at compile time)
+    * @tparam ALIGNMENT     The internal memory size for the entries in each row is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values or usually 4, 8 or 16, higher values are usually a waste of memory.
+    */
+    template<class SCALARTYPE>
+    class compressed_compressed_matrix
+    {
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+        typedef vcl_size_t                                                                                 size_type;
+
+        /** @brief Default construction of a compressed matrix. No memory is allocated */
+        compressed_compressed_matrix() : rows_(0), cols_(0), nonzero_rows_(0), nonzeros_(0) {}
+
+        /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+        *
+        * @param rows         Number of rows
+        * @param cols         Number of columns
+        * @param nonzero_rows Optional number of nonzero rows for memory preallocation
+        * @param nonzeros     Optional number of nonzeros for memory preallocation
+        * @param ctx          Context in which to create the matrix. Uses the default context if omitted
+        */
+        explicit compressed_compressed_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzero_rows = 0, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context())
+          : rows_(rows), cols_(cols), nonzero_rows_(nonzero_rows), nonzeros_(nonzeros)
+        {
+          row_buffer_.switch_active_handle_id(ctx.memory_type());
+          row_indices_.switch_active_handle_id(ctx.memory_type());
+          col_buffer_.switch_active_handle_id(ctx.memory_type());
+            elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            row_buffer_.opencl_handle().context(ctx.opencl_context());
+            row_indices_.opencl_handle().context(ctx.opencl_context());
+            col_buffer_.opencl_handle().context(ctx.opencl_context());
+              elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+          if (rows > 0)
+          {
+            viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+          }
+          if (nonzeros > 0)
+          {
+            viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * nonzeros, ctx);
+            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE) * nonzeros, ctx);
+          }
+        }
+
+        /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+        *
+        * @param rows     Number of rows
+        * @param cols     Number of columns
+        * @param ctx      Context in which to create the matrix
+        */
+        explicit compressed_compressed_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+          : rows_(rows), cols_(cols), nonzeros_(0)
+        {
+          row_buffer_.switch_active_handle_id(ctx.memory_type());
+          col_buffer_.switch_active_handle_id(ctx.memory_type());
+            elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            row_buffer_.opencl_handle().context(ctx.opencl_context());
+            col_buffer_.opencl_handle().context(ctx.opencl_context());
+              elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+          if (rows > 0)
+          {
+            viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+          }
+        }
+
+        explicit compressed_compressed_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzero_rows_(0), nonzeros_(0)
+        {
+          row_buffer_.switch_active_handle_id(ctx.memory_type());
+          row_indices_.switch_active_handle_id(ctx.memory_type());
+          col_buffer_.switch_active_handle_id(ctx.memory_type());
+            elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            row_buffer_.opencl_handle().context(ctx.opencl_context());
+            row_indices_.opencl_handle().context(ctx.opencl_context());
+            col_buffer_.opencl_handle().context(ctx.opencl_context());
+              elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+        }
+
+
+#ifdef VIENNACL_WITH_OPENCL
+        explicit compressed_compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_row_indices, cl_mem mem_col_buffer, cl_mem mem_elements,
+                                              vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzero_rows, vcl_size_t nonzeros) :
+          rows_(rows), cols_(cols), nonzero_rows_(nonzero_rows), nonzeros_(nonzeros)
+        {
+            row_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            row_buffer_.opencl_handle() = mem_row_buffer;
+            row_buffer_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            row_buffer_.raw_size(sizeof(cl_uint) * (nonzero_rows + 1));
+
+            row_indices_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            row_indices_.opencl_handle() = mem_row_indices;
+            row_indices_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            row_indices_.raw_size(sizeof(cl_uint) * nonzero_rows);
+
+            col_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            col_buffer_.opencl_handle() = mem_col_buffer;
+            col_buffer_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            col_buffer_.raw_size(sizeof(cl_uint) * nonzeros);
+
+            elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            elements_.opencl_handle() = mem_elements;
+            elements_.opencl_handle().inc();               //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            elements_.raw_size(sizeof(SCALARTYPE) * nonzeros);
+        }
+#endif
+
+
+        /** @brief Assignment a compressed matrix from possibly another memory domain. */
+        compressed_compressed_matrix & operator=(compressed_compressed_matrix const & other)
+        {
+          assert( (rows_ == 0 || rows_ == other.size1()) && bool("Size mismatch") );
+          assert( (cols_ == 0 || cols_ == other.size2()) && bool("Size mismatch") );
+
+          rows_ = other.size1();
+          cols_ = other.size2();
+          nonzero_rows_ = other.nnz1();
+          nonzeros_ = other.nnz();
+
+          viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_buffer_,  row_buffer_);
+          viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_indices_, row_indices_);
+          viennacl::backend::typesafe_memory_copy<unsigned int>(other.col_buffer_,  col_buffer_);
+          viennacl::backend::typesafe_memory_copy<SCALARTYPE>(other.elements_, elements_);
+
+          return *this;
+        }
+
+
+        /** @brief Sets the row, column and value arrays of the compressed matrix
+        *
+        * @param row_jumper     Pointer to an array holding the indices of the first element of each row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th row. The array length is 'cols + 1'
+        * @param row_indices    Array holding the indices of the nonzero rows
+        * @param col_buffer     Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
+        * @param elements       Pointer to an array holding the entries of the sparse matrix. The array length is 'elements'
+        * @param rows           Number of rows of the sparse matrix
+        * @param cols           Number of columns of the sparse matrix
+        * @param nonzero_rows   Number of nonzero rows
+        * @param nonzeros       Total number of nonzero entries
+        */
+        void set(const void * row_jumper,
+                 const void * row_indices,
+                 const void * col_buffer,
+                 const SCALARTYPE * elements,
+                 vcl_size_t rows,
+                 vcl_size_t cols,
+                 vcl_size_t nonzero_rows,
+                 vcl_size_t nonzeros)
+        {
+          assert( (rows > 0)         && bool("Error in compressed_compressed_matrix::set(): Number of rows must be larger than zero!"));
+          assert( (cols > 0)         && bool("Error in compressed_compressed_matrix::set(): Number of columns must be larger than zero!"));
+          assert( (nonzero_rows > 0) && bool("Error in compressed_compressed_matrix::set(): Number of nonzero rows must be larger than zero!"));
+          assert( (nonzeros > 0)     && bool("Error in compressed_compressed_matrix::set(): Number of nonzeros must be larger than zero!"));
+          //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
+
+          viennacl::backend::memory_create(row_buffer_,  viennacl::backend::typesafe_host_array<unsigned int>(row_buffer_).element_size() * (rows + 1),  viennacl::traits::context(row_buffer_),  row_jumper);
+          viennacl::backend::memory_create(row_indices_, viennacl::backend::typesafe_host_array<unsigned int>(row_indices_).element_size() * (rows + 1), viennacl::traits::context(row_indices_), row_indices);
+          viennacl::backend::memory_create(col_buffer_,  viennacl::backend::typesafe_host_array<unsigned int>(col_buffer_).element_size() * nonzeros,    viennacl::traits::context(col_buffer_),  col_buffer);
+          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE) * nonzeros, viennacl::traits::context(elements_), elements);
+
+          nonzeros_ = nonzeros;
+          nonzero_rows_ = nonzero_rows;
+          rows_ = rows;
+          cols_ = cols;
+        }
+
+        /** @brief  Returns the number of rows */
+        const vcl_size_t & size1() const { return rows_; }
+        /** @brief  Returns the number of columns */
+        const vcl_size_t & size2() const { return cols_; }
+        /** @brief  Returns the number of nonzero entries */
+        const vcl_size_t & nnz1() const { return nonzero_rows_; }
+        /** @brief  Returns the number of nonzero entries */
+        const vcl_size_t & nnz() const { return nonzeros_; }
+
+        /** @brief  Returns the OpenCL handle to the row index array */
+        const handle_type & handle1() const { return row_buffer_; }
+        /** @brief  Returns the OpenCL handle to the column index array */
+        const handle_type & handle2() const { return col_buffer_; }
+        /** @brief  Returns the OpenCL handle to the row index array */
+        const handle_type & handle3() const { return row_indices_; }
+        /** @brief  Returns the OpenCL handle to the matrix entry array */
+        const handle_type & handle() const { return elements_; }
+
+        /** @brief  Returns the OpenCL handle to the row index array */
+        handle_type & handle1() { return row_buffer_; }
+        /** @brief  Returns the OpenCL handle to the column index array */
+        handle_type & handle2() { return col_buffer_; }
+        /** @brief  Returns the OpenCL handle to the row index array */
+        handle_type & handle3() { return row_indices_; }
+        /** @brief  Returns the OpenCL handle to the matrix entry array */
+        handle_type & handle() { return elements_; }
+
+        void switch_memory_context(viennacl::context new_ctx)
+        {
+          viennacl::backend::switch_memory_context<unsigned int>(row_buffer_, new_ctx);
+          viennacl::backend::switch_memory_context<unsigned int>(row_indices_, new_ctx);
+          viennacl::backend::switch_memory_context<unsigned int>(col_buffer_, new_ctx);
+          viennacl::backend::switch_memory_context<SCALARTYPE>(elements_, new_ctx);
+        }
+
+        viennacl::memory_types memory_context() const
+        {
+          return row_buffer_.get_active_handle_id();
+        }
+
+      private:
+
+        vcl_size_t rows_;
+        vcl_size_t cols_;
+        vcl_size_t nonzero_rows_;
+        vcl_size_t nonzeros_;
+        handle_type row_buffer_;
+        handle_type row_indices_;
+        handle_type col_buffer_;
+        handle_type elements_;
+    };
+
+
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_compressed_matrix<T>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+      } // namespace detail
+    } // namespace linalg
+
+    /** \endcond */
+}
+
+#endif
diff --git a/viennacl/compressed_matrix.hpp b/viennacl/compressed_matrix.hpp
index d2b0cf8..bdd591f 100644
--- a/viennacl/compressed_matrix.hpp
+++ b/viennacl/compressed_matrix.hpp
@@ -1,664 +1,858 @@
-#ifndef VIENNACL_COMPRESSED_MATRIX_HPP_
-#define VIENNACL_COMPRESSED_MATRIX_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file compressed_matrix.hpp
-    @brief Implementation of the compressed_matrix class
-*/
-
-#include <vector>
-#include <list>
-#include <map>
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/vector.hpp"
-
-#include "viennacl/linalg/compressed_matrix_operations.hpp"
-
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/tools/entry_proxy.hpp"
-
-namespace viennacl
-{
-    
-
-    //provide copy-operation:
-    /** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
-    *
-    * There are some type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
-    * - .size1() returns the number of rows
-    * - .size2() returns the number of columns
-    * - const_iterator1    is a type definition for an iterator along increasing row indices
-    * - const_iterator2    is a type definition for an iterator along increasing columns indices
-    * - The const_iterator1 type provides an iterator of type const_iterator2 via members .begin() and .end() that iterates along column indices in the current row.
-    * - The types const_iterator1 and const_iterator2 provide members functions .index1() and .index2() that return the current row and column indices respectively.
-    * - Dereferenciation of an object of type const_iterator2 returns the entry.
-    *
-    * @param cpu_matrix   A sparse matrix on the host.
-    * @param gpu_matrix   A compressed_matrix from ViennaCL
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const CPU_MATRIX & cpu_matrix,
-                     compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
-    {
-      //std::cout << "copy for (" << cpu_matrix.size1() << ", " << cpu_matrix.size2() << ", " << cpu_matrix.nnz() << ")" << std::endl;
-      
-      if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
-      {
-        //determine nonzeros:
-        long num_entries = 0;
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
-              row_it != cpu_matrix.end1();
-              ++row_it)
-        {
-          std::size_t entries_per_row = 0;
-          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
-                col_it != row_it.end();
-                ++col_it)
-          {
-            ++entries_per_row;
-          }
-          num_entries += viennacl::tools::roundUpToNextMultiple<std::size_t>(entries_per_row, ALIGNMENT);
-        }
-        
-        if (num_entries == 0) //we copy an empty matrix
-        {
-          num_entries = 1;
-        }
-        
-        //set up matrix entries:
-        std::vector<cl_uint> row_buffer(cpu_matrix.size1() + 1);
-        std::vector<cl_uint> col_buffer(num_entries);
-        std::vector<SCALARTYPE> elements(num_entries);
-        
-        std::size_t row_index = 0;
-        std::size_t data_index = 0;
-        
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
-              row_it != cpu_matrix.end1();
-              ++row_it)
-        {
-          row_buffer[row_index] = data_index;
-          ++row_index;
-          
-          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
-                col_it != row_it.end();
-                ++col_it)
-          {
-            col_buffer[data_index] = static_cast<std::size_t>(col_it.index2());
-            elements[data_index] = *col_it;
-            ++data_index;
-          }
-          data_index = viennacl::tools::roundUpToNextMultiple<std::size_t>(data_index, ALIGNMENT); //take care of alignment
-        }
-        row_buffer[row_index] = data_index;
-        
-        gpu_matrix.set(&row_buffer[0],
-                       &col_buffer[0],
-                       &elements[0], 
-                       cpu_matrix.size1(),
-                       cpu_matrix.size2(),
-                       num_entries);
-      }
-    }
-    
-    
-    //adapted for std::vector< std::map < > > argument:
-    /** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
-    *
-    * @param cpu_matrix   A sparse square matrix on the host using STL types
-    * @param gpu_matrix   A compressed_matrix from ViennaCL
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix,
-                     compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
-    {
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix, cpu_matrix.size(), cpu_matrix.size()), gpu_matrix);
-    }
-    
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <typename SCALARTYPE, int flags, unsigned int ALIGNMENT>
-    void copy(const Eigen::SparseMatrix<SCALARTYPE, flags> & eigen_matrix,
-              compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix)
-    {
-      std::vector< std::map<unsigned int, SCALARTYPE> >  stl_matrix(eigen_matrix.rows());
-      
-      for (int k=0; k < eigen_matrix.outerSize(); ++k)
-        for (typename Eigen::SparseMatrix<SCALARTYPE, flags>::InnerIterator it(eigen_matrix, k); it; ++it)
-          stl_matrix[it.row()][it.col()] = it.value();
-        
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix, eigen_matrix.rows(), eigen_matrix.cols()), gpu_matrix);
-    }
-    #endif
-    
-    
-    #ifdef VIENNACL_HAVE_MTL4
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const mtl::compressed2D<SCALARTYPE> & cpu_matrix,
-              compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix)
-    {
-      typedef mtl::compressed2D<SCALARTYPE>  MatrixType;
-      
-      std::vector< std::map<unsigned int, SCALARTYPE> >  stl_matrix(cpu_matrix.num_rows());
-      
-      using mtl::traits::range_generator;
-      using mtl::traits::range::min;
-
-      // Choose between row and column traversal
-      typedef typename min<range_generator<mtl::tag::row, MatrixType>,
-                           range_generator<mtl::tag::col, MatrixType> >::type   range_type;
-      range_type                                                      my_range;
-
-      // Type of outer cursor
-      typedef typename range_type::type                               c_type;
-      // Type of inner cursor
-      typedef typename mtl::traits::range_generator<mtl::tag::nz, c_type>::type ic_type;
-
-      // Define the property maps
-      typename mtl::traits::row<MatrixType>::type                              row(cpu_matrix); 
-      typename mtl::traits::col<MatrixType>::type                              col(cpu_matrix);
-      typename mtl::traits::const_value<MatrixType>::type                      value(cpu_matrix); 
-
-      // Now iterate over the matrix    
-      for (c_type cursor(my_range.begin(cpu_matrix)), cend(my_range.end(cpu_matrix)); cursor != cend; ++cursor)
-        for (ic_type icursor(mtl::begin<mtl::tag::nz>(cursor)), icend(mtl::end<mtl::tag::nz>(cursor)); icursor != icend; ++icursor)
-          stl_matrix[row(*icursor)][col(*icursor)] = value(*icursor);
-      
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix, cpu_matrix.num_rows(), cpu_matrix.num_cols()), gpu_matrix);
-    }
-    #endif
-    
-    
-    
-    
-    
-    
-    
-    //
-    // gpu to cpu:
-    //
-    /** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
-    *
-    * There are two type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
-    * - resize(rows, cols)  A resize function to bring the matrix into the correct size
-    * - operator(i,j)       Write new entries via the parenthesis operator
-    *
-    * @param gpu_matrix   A compressed_matrix from ViennaCL
-    * @param cpu_matrix   A sparse matrix on the host.
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
-                     CPU_MATRIX & cpu_matrix )
-    {
-      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
-      {
-        cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2(), false);
-        
-        //get raw data from memory:
-        std::vector<cl_uint> row_buffer(gpu_matrix.size1() + 1);
-        std::vector<cl_uint> col_buffer(gpu_matrix.nnz());
-        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
-        
-        //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
-        
-        cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle1().get(), CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle2().get(), CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-        
-        //fill the cpu_matrix:
-        std::size_t data_index = 0;
-        for (std::size_t row = 1; row <= gpu_matrix.size1(); ++row)
-        {
-          while (data_index < row_buffer[row])
-          {
-            if (col_buffer[data_index] >= gpu_matrix.size2())
-            {
-              std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
-              return;
-            }
-            
-            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
-              cpu_matrix(row-1, col_buffer[data_index]) = elements[data_index];
-            ++data_index;
-          }
-        }
-      }
-    }
-    
-    
-    /** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
-    *
-    * @param gpu_matrix   A compressed_matrix from ViennaCL
-    * @param cpu_matrix   A sparse matrix on the host.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
-              std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
-    {
-      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, cpu_matrix.size(), cpu_matrix.size());
-      copy(gpu_matrix, temp);
-    }
-    
-    
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <typename SCALARTYPE, int flags, unsigned int ALIGNMENT>
-    void copy(compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
-              Eigen::SparseMatrix<SCALARTYPE, flags> & eigen_matrix)
-    {
-      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
-      {
-        assert(static_cast<unsigned int>(eigen_matrix.rows()) >= gpu_matrix.size1()
-               && static_cast<unsigned int>(eigen_matrix.cols()) >= gpu_matrix.size2()
-               && "Provided Eigen compressed matrix is too small!");
-        
-        //get raw data from memory:
-        std::vector<cl_uint> row_buffer(gpu_matrix.size1() + 1);
-        std::vector<cl_uint> col_buffer(gpu_matrix.nnz());
-        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
-        
-        cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle1().get(),
-                                  CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle2().get(),
-                                  CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(),
-                                  CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-        
-        eigen_matrix.setZero();
-        eigen_matrix.startFill();
-        std::size_t data_index = 0;
-        for (std::size_t row = 1; row <= gpu_matrix.size1(); ++row)
-        {
-          while (data_index < row_buffer[row])
-          {
-            assert(col_buffer[data_index] < gpu_matrix.size2() && "ViennaCL encountered invalid data at col_buffer");
-            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
-              eigen_matrix.fill(row-1, col_buffer[data_index]) = elements[data_index];
-            ++data_index;
-          }
-        }
-        eigen_matrix.endFill();
-      }
-    }
-    #endif
-    
-    
-    
-    #ifdef VIENNACL_HAVE_MTL4
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
-              mtl::compressed2D<SCALARTYPE> & mtl4_matrix)
-    {
-      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
-      {
-        assert(mtl4_matrix.num_rows() >= gpu_matrix.size1()
-               && mtl4_matrix.num_cols() >= gpu_matrix.size2()
-               && "Provided MTL4 compressed matrix is too small!");
-        
-        //get raw data from memory:
-        std::vector<unsigned int> row_buffer(gpu_matrix.size1() + 1);
-        std::vector<unsigned int> col_buffer(gpu_matrix.nnz());
-        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
-        
-        cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle1().get(),
-                                  CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle2().get(),
-                                  CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(),
-                                  CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-        
-        //set_to_zero(mtl4_matrix);  
-        //mtl4_matrix.change_dim(gpu_matrix.size1(), gpu_matrix.size2());
-        
-        mtl::matrix::inserter< mtl::compressed2D<SCALARTYPE> >  ins(mtl4_matrix);
-        std::size_t data_index = 0;
-        for (std::size_t row = 1; row <= gpu_matrix.size1(); ++row)
-        {
-          while (data_index < row_buffer[row])
-          {
-            assert(col_buffer[data_index] < gpu_matrix.size2() && "ViennaCL encountered invalid data at col_buffer");
-            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
-              ins(row-1, col_buffer[data_index]) << typename mtl::Collection< mtl::compressed2D<SCALARTYPE> >::value_type(elements[data_index]);
-            ++data_index;
-          }
-        }
-      }
-    }
-    #endif
-    
-    
-    
-    
-    
-    //////////////////////// compressed_matrix //////////////////////////
-    /** @brief A sparse square matrix in compressed sparse rows format.
-    *
-    * @tparam SCALARTYPE    The floating point type (either float or double, checked at compile time)
-    * @tparam ALIGNMENT     The internal memory size for the entries in each row is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values or usually 4, 8 or 16, higher values are usually a waste of memory.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT /* see VCLForwards.h */>
-    class compressed_matrix
-    {
-    public:
-      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
-      
-      /** @brief Default construction of a compressed matrix. No memory is allocated */
-      compressed_matrix() : _rows(0), _cols(0), _nonzeros(0) { viennacl::linalg::kernels::compressed_matrix<SCALARTYPE, ALIGNMENT>::init(); }
-      
-      /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
-      *
-      * @param rows     Number of rows
-      * @param cols     Number of columns
-      * @param nonzeros Optional number of nonzeros for memory preallocation
-      */
-      explicit compressed_matrix(std::size_t rows, std::size_t cols, std::size_t nonzeros = 0) : 
-        _rows(rows), _cols(cols), _nonzeros(nonzeros)
-      {
-        viennacl::linalg::kernels::compressed_matrix<SCALARTYPE, ALIGNMENT>::init();
-        
-        if (rows > 0)
-          _row_buffer = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * rows);
-        if (nonzeros > 0)
-        {
-          _col_buffer = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * nonzeros);
-          _elements = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * nonzeros);
-        }
-      }
-      
-      explicit compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_col_buffer, cl_mem mem_elements, 
-                                 std::size_t rows, std::size_t cols, std::size_t nonzeros) : 
-        _rows(rows), _cols(cols), _nonzeros(nonzeros)
-      {
-          _row_buffer = mem_row_buffer;
-          _row_buffer.inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
-          _col_buffer = mem_col_buffer;
-          _col_buffer.inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
-          _elements = mem_elements;
-          _elements.inc();               //prevents that the user-provided memory is deleted once the matrix object is destroyed.
-      }
-      
-      
-      /** @brief Sets the row, column and value arrays of the compressed matrix
-      *
-      * @param row_jumper     Pointer to an array holding the indices of the first element of each row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th row. The array length is 'cols + 1'
-      * @param col_buffer     Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
-      * @param elements       Pointer to an array holding the entries of the sparse matrix. The array length is 'elements'
-      * @param rows           Number of rows of the sparse matrix
-      * @param cols           Number of columns of the sparse matrix
-      * @param nonzeros       Number of nonzeros
-      */
-      void set(cl_uint * row_jumper, 
-               cl_uint * col_buffer,
-               SCALARTYPE * elements, 
-               std::size_t rows,
-               std::size_t cols,
-               std::size_t nonzeros)
-      {
-        assert(cols > 0);
-        assert(nonzeros > 0);
-        //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
-        _row_buffer = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * (rows + 1), row_jumper);
-        _col_buffer = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * nonzeros, col_buffer);
-        _elements = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * nonzeros, elements);
-        _nonzeros = nonzeros;
-        _rows = rows;
-        _cols = cols;
-      }
-        
-      /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
-      void reserve(std::size_t new_nonzeros)
-      {
-        if (new_nonzeros > _nonzeros)
-        {
-          viennacl::ocl::handle<cl_mem> _col_buffer_old = _col_buffer;
-          viennacl::ocl::handle<cl_mem> _elements_old = _elements;
-          _col_buffer = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * new_nonzeros);
-          _elements = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * new_nonzeros);
-          
-          cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), _col_buffer_old.get(), _col_buffer.get(), 0, 0, sizeof(cl_uint)*_nonzeros, 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), _elements_old.get(), _elements.get(), 0, 0, sizeof(SCALARTYPE)*_nonzeros, 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-
-          _nonzeros = new_nonzeros;
-        }
-      }
-
-      /** @brief Resize the matrix.
-      *
-      * @param new_size1    New number of rows
-      * @param new_size2    New number of columns
-      * @param preserve     If true, the old values are preserved. At present, old values are always discarded.
-      */
-      void resize(std::size_t new_size1, std::size_t new_size2, bool preserve = true)
-      {
-        assert(new_size1 > 0 && new_size2 > 0);
-        //std::cout << "Resizing from (" << _rows << ", " << _cols << ") to (" << new_size1 << ", " << new_size2 << ")" << std::endl;
-        
-        if (new_size1 != _rows || new_size2 != _cols)
-        {
-          std::vector<std::map<unsigned int, SCALARTYPE> > stl_sparse_matrix;
-          if (_rows > 0)
-            stl_sparse_matrix.resize(_rows);
-          
-          if (preserve && _rows > 0)
-            viennacl::copy(*this, stl_sparse_matrix);
-            
-          stl_sparse_matrix.resize(new_size1);
-          
-          //discard entries with column index larger than new_size2
-          if (new_size2 < _cols && _rows > 0)
-          {
-            for (size_t i=0; i<stl_sparse_matrix.size(); ++i)
-            {
-              std::list<unsigned int> to_delete;
-              for (typename std::map<unsigned int, SCALARTYPE>::iterator it = stl_sparse_matrix[i].begin();
-                   it != stl_sparse_matrix[i].end();
-                  ++it)
-              {
-                if (it->first >= new_size2)
-                  to_delete.push_back(it->first);
-              }
-              
-              for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
-                stl_sparse_matrix[i].erase(*it);
-            }
-          }
-          
-          copy(stl_sparse_matrix, *this);
-          
-          _rows = new_size1;
-          _cols = new_size2;
-        }
-      }
-      
-      /** @brief Returns a reference to the (i,j)-th entry of the sparse matrix. If (i,j) does not exist (zero), it is inserted (slow!) */
-      entry_proxy<SCALARTYPE> operator()(std::size_t i, std::size_t j)
-      {
-        assert( (i < _rows) && (j < _cols) && "compressed_matrix access out of bounds!");
-        
-        std::size_t index = element_index(i, j);
-        
-        // check for element in sparsity pattern
-        if (index < _nonzeros)
-          return entry_proxy<SCALARTYPE>(index, _elements);
-
-        // Element not found. Copying required. Very slow, but direct entry manipulation is painful anyway...
-        std::vector< std::map<unsigned int, SCALARTYPE> > cpu_backup(_rows);
-        viennacl::copy(*this, cpu_backup);
-        cpu_backup[i][j] = 0.0;
-        viennacl::copy(cpu_backup, *this);
-        
-        index = element_index(i, j);
-        
-        assert(index < _nonzeros);
-        
-        return entry_proxy<SCALARTYPE>(index, _elements);        
-      }
-      /*void operator()(std::size_t i, std::size_t j, SCALARTYPE new_entry)
-      {
-        //read row indices
-        std::vector<cl_uint> row_indices(2);
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          _row_buffer.get(), //row handle
-                                          CL_TRUE, //blocking
-                                          sizeof(cl_uint)*i, //offset
-                                          sizeof(cl_uint)*2, //size
-                                          &(row_indices[0]), //destination
-                                          0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        //get column indices for row i:
-        std::vector<cl_uint> col_indices(row_indices[1] - row_indices[0]);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                  _col_buffer.get(), //col handle
-                                  CL_TRUE, //blocking
-                                  sizeof(cl_uint)*row_indices[0], //offset
-                                  sizeof(cl_uint)*col_indices.size(), //size
-                                  &(col_indices[0]), //destination
-                                  0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        //get entries for row i:
-        std::vector<SCALARTYPE> row_entries(row_indices[1] - row_indices[0]);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                  _elements.get(), //entry handle
-                                  CL_TRUE, //blocking
-                                  sizeof(SCALARTYPE)*row_indices[0], //offset
-                                  sizeof(SCALARTYPE)*row_entries.size(), //size
-                                  &(row_entries[0]), //destination
-                                  0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        
-        // update entries:
-        for (std::size_t k=0; k<col_indices.size(); ++k)
-        {
-          if (col_indices[k] == j)
-            row_entries[k] = new_entry;
-        }
-        
-        // write back:
-        err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                   _elements.get(),
-                                   CL_TRUE,
-                                   sizeof(SCALARTYPE)*row_indices[0], //offset
-                                   sizeof(SCALARTYPE)*row_entries.size(), //size
-                                   &(row_entries[0]), //data ptr
-                                   0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }*/
-      
-
-      /** @brief  Returns the number of rows */
-      const std::size_t & size1() const { return _rows; }
-      /** @brief  Returns the number of columns */
-      const std::size_t & size2() const { return _cols; }
-      /** @brief  Returns the number of nonzero entries */
-      const std::size_t & nnz() const { return _nonzeros; }
-      
-      /** @brief  Returns the OpenCL handle to the row index array */
-      const viennacl::ocl::handle<cl_mem> & handle1() const { return _row_buffer; }
-      /** @brief  Returns the OpenCL handle to the column index array */
-      const viennacl::ocl::handle<cl_mem> & handle2() const { return _col_buffer; }
-      /** @brief  Returns the OpenCL handle to the matrix entry array */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return _elements; }
-      
-    private:
-      
-      std::size_t element_index(std::size_t i, std::size_t j)
-      {
-        //read row indices
-        std::vector<cl_uint> row_indices(2);
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          _row_buffer.get(), //row handle
-                                          CL_TRUE, //blocking
-                                          sizeof(cl_uint)*i, //offset
-                                          sizeof(cl_uint)*2, //size
-                                          &(row_indices[0]), //destination
-                                          0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        //get column indices for row i:
-        std::vector<cl_uint> col_indices(row_indices[1] - row_indices[0]);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                  _col_buffer.get(), //col handle
-                                  CL_TRUE, //blocking
-                                  sizeof(cl_uint)*row_indices[0], //offset
-                                  sizeof(cl_uint)*col_indices.size(), //size
-                                  &(col_indices[0]), //destination
-                                  0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        //get entries for row i:
-        std::vector<SCALARTYPE> row_entries(row_indices[1] - row_indices[0]);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                  _elements.get(), //entry handle
-                                  CL_TRUE, //blocking
-                                  sizeof(SCALARTYPE)*row_indices[0], //offset
-                                  sizeof(SCALARTYPE)*row_entries.size(), //size
-                                  &(row_entries[0]), //destination
-                                  0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-
-        for (std::size_t k=0; k<col_indices.size(); ++k)
-        {
-          if (col_indices[k] == j)
-            return row_indices[0] + k;
-        }
-        
-        // if not found, return index past the end of the matrix (cf. matrix.end() in the spirit of the STL)
-        return _nonzeros;
-      }
-      
-      // /** @brief Copy constructor is by now not available. */
-      //compressed_matrix(compressed_matrix const &);
-      
-      /** @brief Assignment is by now not available. */
-      compressed_matrix & operator=(compressed_matrix const &);
-      
-      
-      std::size_t _rows;
-      std::size_t _cols;
-      std::size_t _nonzeros;
-      viennacl::ocl::handle<cl_mem> _row_buffer;
-      viennacl::ocl::handle<cl_mem> _col_buffer;
-      viennacl::ocl::handle<cl_mem> _elements;
-    };
-
-    
-    
-
-}
-
-#endif
+#ifndef VIENNACL_COMPRESSED_MATRIX_HPP_
+#define VIENNACL_COMPRESSED_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/compressed_matrix.hpp
+    @brief Implementation of the compressed_matrix class
+*/
+
+#include <vector>
+#include <list>
+#include <map>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+    namespace detail
+    {
+      template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+      void copy_impl(const CPU_MATRIX & cpu_matrix,
+                     compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+                     vcl_size_t nonzeros)
+      {
+        assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+        assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), cpu_matrix.size1() + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), nonzeros);
+        std::vector<SCALARTYPE> elements(nonzeros);
+
+        vcl_size_t row_index  = 0;
+        vcl_size_t data_index = 0;
+
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+              row_it != cpu_matrix.end1();
+              ++row_it)
+        {
+          row_buffer.set(row_index, data_index);
+          ++row_index;
+
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+                col_it != row_it.end();
+                ++col_it)
+          {
+            col_buffer.set(data_index, col_it.index2());
+            elements[data_index] = *col_it;
+            ++data_index;
+          }
+          data_index = viennacl::tools::align_to_multiple<vcl_size_t>(data_index, ALIGNMENT); //take care of alignment
+        }
+        row_buffer.set(row_index, data_index);
+
+        gpu_matrix.set(row_buffer.get(),
+                       col_buffer.get(),
+                       &elements[0],
+                       cpu_matrix.size1(),
+                       cpu_matrix.size2(),
+                       nonzeros);
+      }
+    }
+
+    //provide copy-operation:
+    /** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+    *
+    * There are some type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
+    * - .size1() returns the number of rows
+    * - .size2() returns the number of columns
+    * - const_iterator1    is a type definition for an iterator along increasing row indices
+    * - const_iterator2    is a type definition for an iterator along increasing column indices
+    * - The const_iterator1 type provides an iterator of type const_iterator2 via members .begin() and .end() that iterates along column indices in the current row.
+    * - The types const_iterator1 and const_iterator2 provide member functions .index1() and .index2() that return the current row and column indices respectively.
+    * - Dereferencing an object of type const_iterator2 returns the entry.
+    *
+    * @param cpu_matrix   A sparse matrix on the host.
+    * @param gpu_matrix   A compressed_matrix from ViennaCL
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const CPU_MATRIX & cpu_matrix,
+              compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
+    {
+      if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+      {
+        //determine nonzeros:
+        vcl_size_t num_entries = 0;
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+              row_it != cpu_matrix.end1();
+              ++row_it)
+        {
+          vcl_size_t entries_per_row = 0;
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+                col_it != row_it.end();
+                ++col_it)
+          {
+            ++entries_per_row;
+          }
+          num_entries += viennacl::tools::align_to_multiple<vcl_size_t>(entries_per_row, ALIGNMENT);
+        }
+
+        if (num_entries == 0) //we copy an empty matrix
+          num_entries = 1;
+
+        //set up matrix entries:
+        viennacl::detail::copy_impl(cpu_matrix, gpu_matrix, num_entries);
+      }
+    }
+
+
+    //adapted for std::vector< std::map < > > argument:
+    /** @brief Copies a sparse square matrix in the std::vector< std::map < > > format to an OpenCL device. Use viennacl::tools::sparse_matrix_adapter for non-square matrices.
+    *
+    * @param cpu_matrix   A sparse square matrix on the host using STL types
+    * @param gpu_matrix   A compressed_matrix from ViennaCL
+    */
+    template <typename SizeType, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const std::vector< std::map<SizeType, SCALARTYPE> > & cpu_matrix,
+              compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
+    {
+      vcl_size_t nonzeros = 0;
+      vcl_size_t max_col = 0;
+      for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+      {
+        if (cpu_matrix[i].size() > 0)
+        nonzeros += ((cpu_matrix[i].size() - 1) / ALIGNMENT + 1) * ALIGNMENT;
+        if (cpu_matrix[i].size() > 0)
+          max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+      }
+
+      viennacl::detail::copy_impl(tools::const_sparse_matrix_adapter<SCALARTYPE, SizeType>(cpu_matrix, cpu_matrix.size(), max_col + 1),
+                                  gpu_matrix,
+                                  nonzeros);
+    }
+
+#ifdef VIENNACL_WITH_UBLAS
+    template <typename ScalarType, typename F, vcl_size_t IB, typename IA, typename TA>
+    void copy(const boost::numeric::ublas::compressed_matrix<ScalarType, F, IB, IA, TA> & ublas_matrix,
+              viennacl::compressed_matrix<ScalarType, 1> & gpu_matrix)
+    {
+      assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(ublas_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(ublas_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      //we just need to copy the CSR arrays:
+      viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), ublas_matrix.size1() + 1);
+      for (vcl_size_t i=0; i<=ublas_matrix.size1(); ++i)
+        row_buffer.set(i, ublas_matrix.index1_data()[i]);
+
+      viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), ublas_matrix.nnz());
+      for (vcl_size_t i=0; i<ublas_matrix.nnz(); ++i)
+        col_buffer.set(i, ublas_matrix.index2_data()[i]);
+
+      gpu_matrix.set(row_buffer.get(),
+                     col_buffer.get(),
+                     &(ublas_matrix.value_data()[0]),
+                     ublas_matrix.size1(),
+                     ublas_matrix.size2(),
+                     ublas_matrix.nnz());
+
+    }
+#endif
+
+    #ifdef VIENNACL_WITH_EIGEN
+    template <typename SCALARTYPE, int flags, unsigned int ALIGNMENT>
+    void copy(const Eigen::SparseMatrix<SCALARTYPE, flags> & eigen_matrix,
+              compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix)
+    {
+      assert( (gpu_matrix.size1() == 0 || static_cast<vcl_size_t>(eigen_matrix.rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (gpu_matrix.size2() == 0 || static_cast<vcl_size_t>(eigen_matrix.cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      std::vector< std::map<unsigned int, SCALARTYPE> >  stl_matrix(eigen_matrix.rows());
+
+      for (int k=0; k < eigen_matrix.outerSize(); ++k)
+        for (typename Eigen::SparseMatrix<SCALARTYPE, flags>::InnerIterator it(eigen_matrix, k); it; ++it)
+          stl_matrix[it.row()][it.col()] = it.value();
+
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix, eigen_matrix.rows(), eigen_matrix.cols()), gpu_matrix);
+    }
+#endif
+
+
+#ifdef VIENNACL_WITH_MTL4
+    template <typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const mtl::compressed2D<SCALARTYPE> & cpu_matrix,
+              compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix)
+    {
+      assert( (gpu_matrix.size1() == 0 || static_cast<vcl_size_t>(cpu_matrix.num_rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (gpu_matrix.size2() == 0 || static_cast<vcl_size_t>(cpu_matrix.num_cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      typedef mtl::compressed2D<SCALARTYPE>  MatrixType;
+
+      std::vector< std::map<unsigned int, SCALARTYPE> >  stl_matrix(cpu_matrix.num_rows());
+
+      using mtl::traits::range_generator;
+      using mtl::traits::range::min;
+
+      // Choose between row and column traversal
+      typedef typename min<range_generator<mtl::tag::row, MatrixType>,
+                           range_generator<mtl::tag::col, MatrixType> >::type   range_type;
+      range_type                                                      my_range;
+
+      // Type of outer cursor
+      typedef typename range_type::type                               c_type;
+      // Type of inner cursor
+      typedef typename mtl::traits::range_generator<mtl::tag::nz, c_type>::type ic_type;
+
+      // Define the property maps
+      typename mtl::traits::row<MatrixType>::type                              row(cpu_matrix);
+      typename mtl::traits::col<MatrixType>::type                              col(cpu_matrix);
+      typename mtl::traits::const_value<MatrixType>::type                      value(cpu_matrix);
+
+      // Now iterate over the matrix
+      for (c_type cursor(my_range.begin(cpu_matrix)), cend(my_range.end(cpu_matrix)); cursor != cend; ++cursor)
+        for (ic_type icursor(mtl::begin<mtl::tag::nz>(cursor)), icend(mtl::end<mtl::tag::nz>(cursor)); icursor != icend; ++icursor)
+          stl_matrix[row(*icursor)][col(*icursor)] = value(*icursor);
+
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix, cpu_matrix.num_rows(), cpu_matrix.num_cols()), gpu_matrix);
+    }
+#endif
+
+
+
+
+
+
+
+    //
+    // gpu to cpu:
+    //
+    /** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+    *
+    * There are two type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
+    * - resize(rows, cols)  A resize function to bring the matrix into the correct size
+    * - operator(i,j)       Write new entries via the parenthesis operator
+    *
+    * @param gpu_matrix   A compressed_matrix from ViennaCL
+    * @param cpu_matrix   A sparse matrix on the host.
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+              CPU_MATRIX & cpu_matrix )
+    {
+      assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+      {
+        //get raw data from memory:
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), cpu_matrix.size1() + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
+
+        //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+        viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(SCALARTYPE)* gpu_matrix.nnz(), &(elements[0]));
+
+        //fill the cpu_matrix:
+        vcl_size_t data_index = 0;
+        for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+        {
+          while (data_index < row_buffer[row])
+          {
+            if (col_buffer[data_index] >= gpu_matrix.size2())
+            {
+              std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
+              return;
+            }
+
+            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
+              cpu_matrix(row-1, static_cast<vcl_size_t>(col_buffer[data_index])) = elements[data_index];
+            ++data_index;
+          }
+        }
+      }
+    }
+
+
+    /** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
+    *
+    * @param gpu_matrix   A compressed_matrix from ViennaCL
+    * @param cpu_matrix   A sparse matrix on the host.
+    */
+    template <typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+              std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
+    {
+      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, cpu_matrix.size(), cpu_matrix.size());
+      copy(gpu_matrix, temp);
+    }
+
+#ifdef VIENNACL_WITH_UBLAS
+    template <typename ScalarType, unsigned int ALIGNMENT, typename F, vcl_size_t IB, typename IA, typename TA>
+    void copy(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & gpu_matrix,
+              boost::numeric::ublas::compressed_matrix<ScalarType> & ublas_matrix)
+    {
+      assert( (viennacl::traits::size1(ublas_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (viennacl::traits::size2(ublas_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+      viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+
+      viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+      viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+
+      ublas_matrix.clear();
+      ublas_matrix.reserve(gpu_matrix.nnz());
+
+      ublas_matrix.set_filled(gpu_matrix.size1() + 1, gpu_matrix.nnz());
+
+      for (vcl_size_t i=0; i<ublas_matrix.size1() + 1; ++i)
+        ublas_matrix.index1_data()[i] = row_buffer[i];
+
+      for (vcl_size_t i=0; i<ublas_matrix.nnz(); ++i)
+        ublas_matrix.index2_data()[i] = col_buffer[i];
+
+      viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(ScalarType) * gpu_matrix.nnz(), &(ublas_matrix.value_data()[0]));
+
+    }
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+    template <typename SCALARTYPE, int flags, unsigned int ALIGNMENT>
+    void copy(compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+              Eigen::SparseMatrix<SCALARTYPE, flags> & eigen_matrix)
+    {
+      assert( (static_cast<vcl_size_t>(eigen_matrix.rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (static_cast<vcl_size_t>(eigen_matrix.cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+      {
+        //get raw data from memory:
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
+
+        viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(SCALARTYPE)* gpu_matrix.nnz(),        &(elements[0]));
+
+        eigen_matrix.setZero();
+        vcl_size_t data_index = 0;
+        for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+        {
+          while (data_index < row_buffer[row])
+          {
+            assert(col_buffer[data_index] < gpu_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
+              eigen_matrix.insert(row-1, col_buffer[data_index]) = elements[data_index];
+            ++data_index;
+          }
+        }
+      }
+    }
+#endif
+
+
+
+#ifdef VIENNACL_WITH_MTL4
+    template <typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+              mtl::compressed2D<SCALARTYPE> & mtl4_matrix)
+    {
+      assert( (static_cast<vcl_size_t>(mtl4_matrix.num_rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (static_cast<vcl_size_t>(mtl4_matrix.num_cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+      {
+
+        //get raw data from memory:
+        viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
+
+        viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle(),  0, sizeof(SCALARTYPE)* gpu_matrix.nnz(), &(elements[0]));
+
+        //set_to_zero(mtl4_matrix);
+        //mtl4_matrix.change_dim(gpu_matrix.size1(), gpu_matrix.size2());
+
+        mtl::matrix::inserter< mtl::compressed2D<SCALARTYPE> >  ins(mtl4_matrix);
+        vcl_size_t data_index = 0;
+        for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+        {
+          while (data_index < row_buffer[row])
+          {
+            assert(col_buffer[data_index] < gpu_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+            if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
+              ins(row-1, col_buffer[data_index]) << typename mtl::Collection< mtl::compressed2D<SCALARTYPE> >::value_type(elements[data_index]);
+            ++data_index;
+          }
+        }
+      }
+    }
+#endif
+
+
+
+
+
+    //////////////////////// compressed_matrix //////////////////////////
+    /** @brief A sparse square matrix in compressed sparse rows format.
+    *
+    * @tparam SCALARTYPE    The floating point type (either float or double, checked at compile time)
+    * @tparam ALIGNMENT     The internal memory size for the entries in each row is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+    */
+    template<class SCALARTYPE, unsigned int ALIGNMENT /* see VCLForwards.h */>
+    class compressed_matrix
+    {
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+        typedef vcl_size_t                                                                                 size_type;
+
+        /** @brief Default construction of a compressed matrix. No memory is allocated */
+        compressed_matrix() : rows_(0), cols_(0), nonzeros_(0) {}
+
+        /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+        *
+        * @param rows     Number of rows
+        * @param cols     Number of columns
+        * @param nonzeros Optional number of nonzeros for memory preallocation
+        * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+        */
+        explicit compressed_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context())
+          : rows_(rows), cols_(cols), nonzeros_(nonzeros)
+        {
+          row_buffer_.switch_active_handle_id(ctx.memory_type());
+          col_buffer_.switch_active_handle_id(ctx.memory_type());
+            elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            row_buffer_.opencl_handle().context(ctx.opencl_context());
+            col_buffer_.opencl_handle().context(ctx.opencl_context());
+              elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+          if (rows > 0)
+          {
+            viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+          }
+          if (nonzeros > 0)
+          {
+            viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * nonzeros, ctx);
+            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE) * nonzeros, ctx);
+          }
+        }
+
+        /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+        *
+        * @param rows     Number of rows
+        * @param cols     Number of columns
+        * @param ctx      Context in which to create the matrix
+        */
+        explicit compressed_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+          : rows_(rows), cols_(cols), nonzeros_(0)
+        {
+          row_buffer_.switch_active_handle_id(ctx.memory_type());
+          col_buffer_.switch_active_handle_id(ctx.memory_type());
+            elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            row_buffer_.opencl_handle().context(ctx.opencl_context());
+            col_buffer_.opencl_handle().context(ctx.opencl_context());
+              elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+          if (rows > 0)
+          {
+            viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+          }
+        }
+
+        explicit compressed_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0)
+        {
+          row_buffer_.switch_active_handle_id(ctx.memory_type());
+          col_buffer_.switch_active_handle_id(ctx.memory_type());
+            elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            row_buffer_.opencl_handle().context(ctx.opencl_context());
+            col_buffer_.opencl_handle().context(ctx.opencl_context());
+              elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+        }
+
+
+#ifdef VIENNACL_WITH_OPENCL
+        explicit compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_col_buffer, cl_mem mem_elements,
+                                  vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros) :
+          rows_(rows), cols_(cols), nonzeros_(nonzeros)
+        {
+            row_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            row_buffer_.opencl_handle() = mem_row_buffer;
+            row_buffer_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            row_buffer_.raw_size(sizeof(cl_uint) * (rows + 1));
+
+            col_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            col_buffer_.opencl_handle() = mem_col_buffer;
+            col_buffer_.opencl_handle().inc();             //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            col_buffer_.raw_size(sizeof(cl_uint) * nonzeros);
+
+            elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+            elements_.opencl_handle() = mem_elements;
+            elements_.opencl_handle().inc();               //prevents that the user-provided memory is deleted once the matrix object is destroyed.
+            elements_.raw_size(sizeof(SCALARTYPE) * nonzeros);
+        }
+#endif
+
+
+        /** @brief Assignment a compressed matrix from possibly another memory domain. */
+        compressed_matrix & operator=(compressed_matrix const & other)
+        {
+          assert( (rows_ == 0 || rows_ == other.size1()) && bool("Size mismatch") );
+          assert( (cols_ == 0 || cols_ == other.size2()) && bool("Size mismatch") );
+
+          rows_ = other.size1();
+          cols_ = other.size2();
+          nonzeros_ = other.nnz();
+
+          viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_buffer_, row_buffer_);
+          viennacl::backend::typesafe_memory_copy<unsigned int>(other.col_buffer_, col_buffer_);
+          viennacl::backend::typesafe_memory_copy<SCALARTYPE>(other.elements_, elements_);
+
+          return *this;
+        }
+
+
+        /** @brief Sets the row, column and value arrays of the compressed matrix
+        *
+        * @param row_jumper     Pointer to an array holding the indices of the first element of each row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th row. The array length is 'rows + 1'
+        * @param col_buffer     Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
+        * @param elements       Pointer to an array holding the entries of the sparse matrix. The array length is 'nonzeros'
+        * @param rows           Number of rows of the sparse matrix
+        * @param cols           Number of columns of the sparse matrix
+        * @param nonzeros       Number of nonzeros
+        */
+        void set(const void * row_jumper,
+                 const void * col_buffer,
+                 const SCALARTYPE * elements,
+                 vcl_size_t rows,
+                 vcl_size_t cols,
+                 vcl_size_t nonzeros)
+        {
+          assert( (rows > 0)     && bool("Error in compressed_matrix::set(): Number of rows must be larger than zero!"));
+          assert( (cols > 0)     && bool("Error in compressed_matrix::set(): Number of columns must be larger than zero!"));
+          assert( (nonzeros > 0) && bool("Error in compressed_matrix::set(): Number of nonzeros must be larger than zero!"));
+          //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
+
+          //row_buffer_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+          viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(row_buffer_).element_size() * (rows + 1), viennacl::traits::context(row_buffer_), row_jumper);
+
+          //col_buffer_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+          viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(col_buffer_).element_size() * nonzeros, viennacl::traits::context(col_buffer_), col_buffer);
+
+          //elements_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE) * nonzeros, viennacl::traits::context(elements_), elements);
+
+          nonzeros_ = nonzeros;
+          rows_ = rows;
+          cols_ = cols;
+        }
+
+        /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
+        void reserve(vcl_size_t new_nonzeros)
+        {
+          if (new_nonzeros > nonzeros_)
+          {
+            handle_type col_buffer_old;
+            handle_type elements_old;
+            viennacl::backend::memory_shallow_copy(col_buffer_, col_buffer_old);
+            viennacl::backend::memory_shallow_copy(elements_,   elements_old);
+
+            viennacl::backend::typesafe_host_array<unsigned int> size_deducer(col_buffer_);
+            viennacl::backend::memory_create(col_buffer_, size_deducer.element_size() * new_nonzeros, viennacl::traits::context(col_buffer_));
+            viennacl::backend::memory_create(elements_,   sizeof(SCALARTYPE) * new_nonzeros,          viennacl::traits::context(elements_));
+
+            viennacl::backend::memory_copy(col_buffer_old, col_buffer_, 0, 0, size_deducer.element_size() * nonzeros_);
+            viennacl::backend::memory_copy(elements_old,   elements_,   0, 0, sizeof(SCALARTYPE)* nonzeros_);
+
+            nonzeros_ = new_nonzeros;
+          }
+        }
+
+        /** @brief Resize the matrix.
+        *
+        * @param new_size1    New number of rows
+        * @param new_size2    New number of columns
+        * @param preserve     If true, the old values are preserved; otherwise, they are discarded.
+        */
+        void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+        {
+          assert(new_size1 > 0 && new_size2 > 0 && bool("Cannot resize to zero size!"));
+
+          if (new_size1 != rows_ || new_size2 != cols_)
+          {
+            std::vector<std::map<unsigned int, SCALARTYPE> > stl_sparse_matrix;
+            if (rows_ > 0)
+            {
+              if (preserve)
+              {
+                stl_sparse_matrix.resize(rows_);
+                viennacl::copy(*this, stl_sparse_matrix);
+              } else
+                stl_sparse_matrix[0][0] = 0;
+            } else {
+              stl_sparse_matrix.resize(new_size1);
+              stl_sparse_matrix[0][0] = 0;      //enforces nonzero array sizes if matrix was initially empty
+            }
+
+            stl_sparse_matrix.resize(new_size1);
+
+            //discard entries with column index larger than new_size2
+            if (new_size2 < cols_ && rows_ > 0)
+            {
+              for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+              {
+                std::list<unsigned int> to_delete;
+                for (typename std::map<unsigned int, SCALARTYPE>::iterator it = stl_sparse_matrix[i].begin();
+                    it != stl_sparse_matrix[i].end();
+                    ++it)
+                {
+                  if (it->first >= new_size2)
+                    to_delete.push_back(it->first);
+                }
+
+                for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+                  stl_sparse_matrix[i].erase(*it);
+              }
+            }
+
+            viennacl::copy(stl_sparse_matrix, *this);
+
+            rows_ = new_size1;
+            cols_ = new_size2;
+          }
+        }
+
+        /** @brief Returns a reference to the (i,j)-th entry of the sparse matrix. If (i,j) does not exist (zero), it is inserted (slow!) */
+        entry_proxy<SCALARTYPE> operator()(vcl_size_t i, vcl_size_t j)
+        {
+          assert( (i < rows_) && (j < cols_) && bool("compressed_matrix access out of bounds!"));
+
+          vcl_size_t index = element_index(i, j);
+
+          // check for element in sparsity pattern
+          if (index < nonzeros_)
+            return entry_proxy<SCALARTYPE>(index, elements_);
+
+          // Element not found. Copying required. Very slow, but direct entry manipulation is painful anyway...
+          std::vector< std::map<unsigned int, SCALARTYPE> > cpu_backup(rows_);
+          tools::sparse_matrix_adapter<SCALARTYPE> adapted_cpu_backup(cpu_backup, rows_, cols_);
+          viennacl::copy(*this, adapted_cpu_backup);
+          cpu_backup[i][static_cast<unsigned int>(j)] = 0.0;
+          viennacl::copy(adapted_cpu_backup, *this);
+
+          index = element_index(i, j);
+
+          assert(index < nonzeros_);
+
+          return entry_proxy<SCALARTYPE>(index, elements_);
+        }
+
+        /** @brief  Returns the number of rows */
+        const vcl_size_t & size1() const { return rows_; }
+        /** @brief  Returns the number of columns */
+        const vcl_size_t & size2() const { return cols_; }
+        /** @brief  Returns the number of nonzero entries */
+        const vcl_size_t & nnz() const { return nonzeros_; }
+
+        /** @brief  Returns the OpenCL handle to the row index array */
+        const handle_type & handle1() const { return row_buffer_; }
+        /** @brief  Returns the OpenCL handle to the column index array */
+        const handle_type & handle2() const { return col_buffer_; }
+        /** @brief  Returns the OpenCL handle to the matrix entry array */
+        const handle_type & handle() const { return elements_; }
+
+        /** @brief  Returns the OpenCL handle to the row index array */
+        handle_type & handle1() { return row_buffer_; }
+        /** @brief  Returns the OpenCL handle to the column index array */
+        handle_type & handle2() { return col_buffer_; }
+        /** @brief  Returns the OpenCL handle to the matrix entry array */
+        handle_type & handle() { return elements_; }
+
+        void switch_memory_context(viennacl::context new_ctx)
+        {
+          viennacl::backend::switch_memory_context<unsigned int>(row_buffer_, new_ctx);
+          viennacl::backend::switch_memory_context<unsigned int>(col_buffer_, new_ctx);
+          viennacl::backend::switch_memory_context<SCALARTYPE>(elements_, new_ctx);
+        }
+
+        viennacl::memory_types memory_context() const
+        {
+          return row_buffer_.get_active_handle_id();
+        }
+
+      private:
+
+        vcl_size_t element_index(vcl_size_t i, vcl_size_t j)
+        {
+          //read row indices
+          viennacl::backend::typesafe_host_array<unsigned int> row_indices(row_buffer_, 2);
+          viennacl::backend::memory_read(row_buffer_, row_indices.element_size()*i, row_indices.element_size()*2, row_indices.get());
+
+          //get column indices for row i:
+          viennacl::backend::typesafe_host_array<unsigned int> col_indices(col_buffer_, row_indices[1] - row_indices[0]);
+          viennacl::backend::memory_read(col_buffer_, col_indices.element_size()*row_indices[0], row_indices.element_size()*col_indices.size(), col_indices.get());
+
+          //get entries for row i:
+          viennacl::backend::typesafe_host_array<SCALARTYPE> row_entries(elements_, row_indices[1] - row_indices[0]);
+          viennacl::backend::memory_read(elements_, sizeof(SCALARTYPE)*row_indices[0], sizeof(SCALARTYPE)*row_entries.size(), row_entries.get());
+
+          for (vcl_size_t k=0; k<col_indices.size(); ++k)
+          {
+            if (col_indices[k] == j)
+              return row_indices[0] + k;
+          }
+
+          // if not found, return index past the end of the matrix (cf. matrix.end() in the spirit of the STL)
+          return nonzeros_;
+        }
+
+        // /** @brief Copy constructor is by now not available. */
+        //compressed_matrix(compressed_matrix const &);
+
+
+        vcl_size_t rows_;
+        vcl_size_t cols_;
+        vcl_size_t nonzeros_;
+        handle_type row_buffer_;
+        handle_type col_buffer_;
+        handle_type elements_;
+    };
+
+
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+     } // namespace detail
+   } // namespace linalg
+
+   /** \endcond */
+}
+
+#endif
diff --git a/viennacl/context.hpp b/viennacl/context.hpp
new file mode 100644
index 0000000..146d043
--- /dev/null
+++ b/viennacl/context.hpp
@@ -0,0 +1,88 @@
+#ifndef VIENNACL_CONTEXT_HPP_
+#define VIENNACL_CONTEXT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/context.hpp
+    @brief Implementation of a OpenCL-like context, which serves as a unification of {OpenMP, CUDA, OpenCL} at the user API.
+*/
+
+#include <vector>
+#include <stddef.h>
+#include <assert.h>
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+namespace viennacl
+{
+  /** @brief Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also suitable for CUDA and OpenMP
+    *
+    * Context objects are used to distinguish between different memory domains. One context may refer to an OpenCL device, another context may refer to a CUDA device, and a third context to main RAM.
+    * Thus, operations are only defined on objects residing on the same context.
+    */
+  class context
+  {
+    public:
+      context() : mem_type_(viennacl::backend::default_memory_type())
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        if (mem_type_ == OPENCL_MEMORY)
+          ocl_context_ptr_ = &viennacl::ocl::current_context();
+        else
+          ocl_context_ptr_ = NULL;
+#endif
+      }
+
+      explicit context(viennacl::memory_types mtype) : mem_type_(mtype)
+      {
+        if (mem_type_ == MEMORY_NOT_INITIALIZED)
+          mem_type_ = viennacl::backend::default_memory_type();
+#ifdef VIENNACL_WITH_OPENCL
+        if (mem_type_ == OPENCL_MEMORY)
+          ocl_context_ptr_ = &viennacl::ocl::current_context();
+        else
+          ocl_context_ptr_ = NULL;
+#endif
+      }
+
+#ifdef VIENNACL_WITH_OPENCL
+      context(viennacl::ocl::context const & ctx) : mem_type_(OPENCL_MEMORY), ocl_context_ptr_(&ctx) {}
+
+      viennacl::ocl::context const & opencl_context() const
+      {
+        assert(mem_type_ == OPENCL_MEMORY && bool("Context type is not OpenCL"));
+        return *ocl_context_ptr_;
+      }
+#endif
+
+      // TODO: Add CUDA and OpenMP contexts
+
+      viennacl::memory_types  memory_type() const { return mem_type_; }
+
+    private:
+      viennacl::memory_types   mem_type_;
+#ifdef VIENNACL_WITH_OPENCL
+      viennacl::ocl::context const * ocl_context_ptr_;
+#endif
+  };
+
+
+}
+
+#endif
diff --git a/viennacl/coordinate_matrix.hpp b/viennacl/coordinate_matrix.hpp
index 7878e9b..3ebdee4 100644
--- a/viennacl/coordinate_matrix.hpp
+++ b/viennacl/coordinate_matrix.hpp
@@ -1,332 +1,480 @@
-#ifndef VIENNACL_COORDINATE_MATRIX_HPP_
-#define VIENNACL_COORDINATE_MATRIX_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file coordinate_matrix.hpp
-    @brief Implementation of the coordinate_matrix class
-*/
-
-#include <map>
-#include <vector>
-#include <list>
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/vector.hpp"
-
-#include "viennacl/linalg/coordinate_matrix_operations.hpp"
-
-namespace viennacl
-{
-  
-    
-    //provide copy-operation:
-    /** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
-    *
-    * For the requirements on the CPU_MATRIX type, see the documentation of the function copy(CPU_MATRIX, compressed_matrix<>)
-    *
-    * @param cpu_matrix   A sparse matrix on the host.
-    * @param gpu_matrix   A compressed_matrix from ViennaCL
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const CPU_MATRIX & cpu_matrix,
-                     coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
-    {
-      size_t group_num = 64;
-      
-      // Step 1: Determine nonzeros:
-      if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
-      {
-        std::size_t num_entries = 0;
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
-              row_it != cpu_matrix.end1();
-              ++row_it)
-        {
-          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
-                col_it != row_it.end();
-                ++col_it)
-          {
-            ++num_entries;
-          }
-        }
-        
-        // Step 2: Set up matrix data:
-        std::cout << "Number of entries: " << num_entries << std::endl;
-        gpu_matrix.nonzeros_ = num_entries;
-        gpu_matrix.rows_ = cpu_matrix.size1();
-        gpu_matrix.cols_ = cpu_matrix.size2();
-
-        std::vector<cl_uint> coord_buffer(2*gpu_matrix.internal_nnz());
-        std::vector<cl_uint> group_boundaries(group_num + 1);
-        std::vector<SCALARTYPE> elements(gpu_matrix.internal_nnz());
-        
-        std::size_t data_index = 0;
-        std::size_t current_fraction = 0;
-        
-        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
-              row_it != cpu_matrix.end1();
-              ++row_it)
-        {
-          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
-                col_it != row_it.end();
-                ++col_it)
-          {
-            coord_buffer[2*data_index] = static_cast<cl_uint>(col_it.index1());
-            coord_buffer[2*data_index + 1] = static_cast<cl_uint>(col_it.index2());
-            elements[data_index] = *col_it;
-            ++data_index;
-          }
-          
-          if (data_index > (current_fraction + 1) / static_cast<double>(group_num) * num_entries)    //split data equally over 64 groups
-            group_boundaries[++current_fraction] = data_index;
-        }
-        
-        //write end of last group:
-        group_boundaries[group_num] = data_index;
-        //group_boundaries[1] = data_index; //for one compute unit
-        
-        /*std::cout << "Group boundaries: " << std::endl;
-        for (size_t i=0; i<group_boundaries.size(); ++i)
-          std::cout << group_boundaries[i] << std::endl;*/
-        
-        gpu_matrix.coord_buffer_     = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, coord_buffer);
-        gpu_matrix.elements_         = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, elements);
-        gpu_matrix.group_boundaries_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, group_boundaries);
-      }
-    }
-
-    /** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
-    *
-    * @param cpu_matrix   A sparse square matrix on the host.
-    * @param gpu_matrix   A coordinate_matrix from ViennaCL
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix,
-                     coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
-    {
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix, cpu_matrix.size(), cpu_matrix.size()), gpu_matrix);
-    }
-    
-    //gpu to cpu:
-    /** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
-    *
-    * There are two type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
-    * - resize(rows, cols)  A resize function to bring the matrix into the correct size
-    * - operator(i,j)       Write new entries via the parenthesis operator
-    *
-    * @param gpu_matrix   A coordinate_matrix from ViennaCL
-    * @param cpu_matrix   A sparse matrix on the host.
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
-                     CPU_MATRIX & cpu_matrix )
-    {
-      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
-      {
-        cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2(), false);
-        
-        //get raw data from memory:
-        std::vector<cl_uint> coord_buffer(2*gpu_matrix.nnz());
-        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
-        
-        //std::cout << "GPU nonzeros: " << gpu_matrix.nnz() << std::endl;
-        
-        cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle12().get(), CL_TRUE, 0, sizeof(cl_uint)* 2 *gpu_matrix.nnz(), &(coord_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-        
-        //fill the cpu_matrix:
-        for (std::size_t index = 0; index < gpu_matrix.nnz(); ++index)
-        {
-          cpu_matrix(coord_buffer[2*index], coord_buffer[2*index+1]) = elements[index];
-        }
-      }
-    }
-
-    /** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
-    *
-    * @param gpu_matrix   A coordinate_matrix from ViennaCL
-    * @param cpu_matrix   A sparse matrix on the host.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    void copy(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
-              std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
-    {
-      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
-      copy(gpu_matrix, temp);
-    }
-
-
-    //////////////////////// coordinate_matrix //////////////////////////
-    /** @brief A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row and column indices and val denotes the entry.
-    *
-    * The present implementation of coordinate_matrix suffers from poor runtime efficiency. Users are adviced to use compressed_matrix in the meanwhile.
-    *
-    * @tparam SCALARTYPE    The floating point type (either float or double, checked at compile time)
-    * @tparam ALIGNMENT     The internal memory size for the arrays, given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT /* see VCLForwards.h */ >
-    class coordinate_matrix
-    {
-    public:
-      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
-      
-      /** @brief Default construction of a coordinate matrix. No memory is allocated */
-      coordinate_matrix() : rows_(0), cols_(0), nonzeros_(0) { viennacl::linalg::kernels::coordinate_matrix<SCALARTYPE, ALIGNMENT>::init(); }
-      
-      /** @brief Construction of a coordinate matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
-      *
-      * @param rows     Number of rows
-      * @param cols     Number of columns
-      * @param nonzeros Optional number of nonzeros for memory preallocation
-      */
-      coordinate_matrix(std::size_t rows, std::size_t cols, std::size_t nonzeros = 0) : 
-        rows_(rows), cols_(cols), nonzeros_(nonzeros)
-      {
-        viennacl::linalg::kernels::coordinate_matrix<SCALARTYPE, ALIGNMENT>::init();
-        if (nonzeros > 0)
-        {
-          coord_buffer_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * 2 * internal_nnz());
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * internal_nnz());
-          group_boundaries_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * (rows + 1));
-        }
-      }
-        
-      /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
-      void reserve(std::size_t new_nonzeros)
-      {
-        if (new_nonzeros > nonzeros_)
-        {
-          viennacl::ocl::handle<cl_mem> coord_buffer_old = coord_buffer_;
-          viennacl::ocl::handle<cl_mem> elements_old = elements_;
-          coord_buffer_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint) * 2 * internal_nnz());
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * internal_nnz());
-          
-          cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), coord_buffer_old.get(), coord_buffer_.get(), 0, 0, sizeof(cl_uint) * 2 * nonzeros_, 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), elements_old.get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*nonzeros_, 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-
-          //new memory must be padded with zeros:
-          std::vector<long> temp(internal_nnz() - nonzeros_);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), coord_buffer_old.get(), coord_buffer_.get(), 0, nonzeros_, sizeof(cl_uint) * 2 * temp.size(), 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), elements_old.get(), elements_.get(), 0, nonzeros_, sizeof(SCALARTYPE)*temp.size(), 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-        }
-      }
-
-      /** @brief Resize the matrix.
-      *
-      * @param new_size1    New number of rows
-      * @param new_size2    New number of columns
-      * @param preserve     If true, the old values are preserved. At present, old values are always discarded.
-      */
-      void resize(std::size_t new_size1, std::size_t new_size2, bool preserve = true)
-      {
-        assert (new_size1 > 0 && new_size2 > 0);
-                
-        if (new_size1 < rows_ || new_size2 < cols_) //enlarge buffer
-        {
-          std::vector<std::map<unsigned int, SCALARTYPE> > stl_sparse_matrix;
-          if (rows_ > 0)
-            stl_sparse_matrix.resize(rows_);
-          
-          if (preserve && rows_ > 0)
-            viennacl::copy(*this, stl_sparse_matrix);
-            
-          stl_sparse_matrix.resize(new_size1);
-          
-          std::cout << "Cropping STL matrix of size " << stl_sparse_matrix.size() << std::endl;
-          if (new_size2 < cols_ && rows_ > 0)
-          {
-            for (std::size_t i=0; i<stl_sparse_matrix.size(); ++i)
-            {
-              std::list<unsigned int> to_delete;
-              for (typename std::map<unsigned int, SCALARTYPE>::iterator it = stl_sparse_matrix[i].begin();
-                   it != stl_sparse_matrix[i].end();
-                  ++it)
-              {
-                if (it->first >= new_size2)
-                  to_delete.push_back(it->first);
-              }
-              
-              for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
-                stl_sparse_matrix[i].erase(*it);
-            }
-            std::cout << "Cropping done..." << std::endl;
-          }
-          
-          rows_ = new_size1;
-          cols_ = new_size2;
-          viennacl::copy(stl_sparse_matrix, *this);
-        }
-          
-        rows_ = new_size1;
-        cols_ = new_size2;
-      }
-
-
-      /** @brief  Returns the number of rows */
-      std::size_t size1() const { return rows_; }
-      /** @brief  Returns the number of columns */
-      std::size_t size2() const { return cols_; }
-      /** @brief  Returns the number of nonzero entries */
-      std::size_t nnz() const { return nonzeros_; }
-      /** @brief  Returns the number of internal nonzero entries */
-      std::size_t internal_nnz() const { return viennacl::tools::roundUpToNextMultiple<std::size_t>(nonzeros_, ALIGNMENT);; }
-      
-      /** @brief  Returns the OpenCL handle to the (row, column) index array */
-      const viennacl::ocl::handle<cl_mem> & handle12() const { return coord_buffer_; }
-      /** @brief  Returns the OpenCL handle to the matrix entry array */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
-      /** @brief  Returns the OpenCL handle to the group start index array */
-      const viennacl::ocl::handle<cl_mem> & handle3() const { return group_boundaries_; }
-      
-      #if defined(_MSC_VER) && _MSC_VER < 1500      //Visual Studio 2005 needs special treatment
-      template <typename CPU_MATRIX>
-      friend void copy(const CPU_MATRIX & cpu_matrix, coordinate_matrix & gpu_matrix );
-      #else
-      template <typename CPU_MATRIX, typename SCALARTYPE2, unsigned int ALIGNMENT2>
-      friend void copy(const CPU_MATRIX & cpu_matrix, coordinate_matrix<SCALARTYPE2, ALIGNMENT2> & gpu_matrix );
-      #endif
-
-    private:
-      /** @brief Copy constructor is by now not available. */
-      coordinate_matrix(coordinate_matrix const &);
-      
-      /** @brief Assignment is by now not available. */
-      coordinate_matrix & operator=(coordinate_matrix const &);
-      
-      
-      std::size_t rows_;
-      std::size_t cols_;
-      std::size_t nonzeros_;
-      viennacl::ocl::handle<cl_mem> coord_buffer_;
-      viennacl::ocl::handle<cl_mem> elements_;
-      viennacl::ocl::handle<cl_mem> group_boundaries_;
-    };
-
-
-}
-
-#endif
+#ifndef VIENNACL_COORDINATE_MATRIX_HPP_
+#define VIENNACL_COORDINATE_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/coordinate_matrix.hpp
+    @brief Implementation of the coordinate_matrix class
+*/
+
+#include <map>
+#include <vector>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+
+
+    //provide copy-operation:
+    /** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+    *
+    * For the requirements on the CPU_MATRIX type, see the documentation of the function copy(CPU_MATRIX, compressed_matrix<>)
+    *
+    * @param cpu_matrix   A sparse matrix on the host.
+    * @param gpu_matrix   A compressed_matrix from ViennaCL
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const CPU_MATRIX & cpu_matrix,
+                     coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
+    {
+      assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      vcl_size_t group_num = 64;
+
+      // Step 1: Determine nonzeros:
+      if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+      {
+        vcl_size_t num_entries = 0;
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+              row_it != cpu_matrix.end1();
+              ++row_it)
+        {
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+                col_it != row_it.end();
+                ++col_it)
+          {
+            ++num_entries;
+          }
+        }
+
+        // Step 2: Set up matrix data:
+        gpu_matrix.nonzeros_ = num_entries;
+        gpu_matrix.rows_ = cpu_matrix.size1();
+        gpu_matrix.cols_ = cpu_matrix.size2();
+
+        viennacl::backend::typesafe_host_array<unsigned int> group_boundaries(gpu_matrix.handle3(), group_num + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.internal_nnz());
+        std::vector<SCALARTYPE> elements(gpu_matrix.internal_nnz());
+
+        vcl_size_t data_index = 0;
+        vcl_size_t current_fraction = 0;
+
+        group_boundaries.set(0, 0);
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1();
+              row_it != cpu_matrix.end1();
+              ++row_it)
+        {
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin();
+                col_it != row_it.end();
+                ++col_it)
+          {
+            coord_buffer.set(2*data_index, col_it.index1());
+            coord_buffer.set(2*data_index + 1, col_it.index2());
+            elements[data_index] = *col_it;
+            ++data_index;
+          }
+
+          while (data_index > (current_fraction + 1) / static_cast<double>(group_num) * num_entries)    //split data equally over 64 groups
+            group_boundaries.set(++current_fraction, data_index);
+        }
+
+        //write end of last group:
+        group_boundaries.set(group_num, data_index);
+        //group_boundaries[1] = data_index; //for one compute unit
+
+        //std::cout << "Group boundaries: " << std::endl;
+        //for (vcl_size_t i=0; i<group_boundaries.size(); ++i)
+        //  std::cout << group_boundaries[i] << std::endl;
+
+        viennacl::backend::memory_create(gpu_matrix.group_boundaries_, group_boundaries.raw_size(), traits::context(gpu_matrix.group_boundaries_), group_boundaries.get());
+        viennacl::backend::memory_create(gpu_matrix.coord_buffer_,         coord_buffer.raw_size(), traits::context(gpu_matrix.coord_buffer_),     coord_buffer.get());
+        viennacl::backend::memory_create(gpu_matrix.elements_,  sizeof(SCALARTYPE)*elements.size(), traits::context(gpu_matrix.elements_),         &(elements[0]));
+      }
+    }
+
+    /** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
+    *
+    * @param cpu_matrix   A sparse square matrix on the host.
+    * @param gpu_matrix   A coordinate_matrix from ViennaCL
+    */
+    template <typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix,
+                     coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
+    {
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix, cpu_matrix.size(), cpu_matrix.size()), gpu_matrix);
+    }
+
+    //gpu to cpu:
+    /** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+    *
+    * There are two type requirements on the CPU_MATRIX type (fulfilled by e.g. boost::numeric::ublas):
+    * - resize(rows, cols)  A resize function to bring the matrix into the correct size
+    * - operator(i,j)       Write new entries via the parenthesis operator
+    *
+    * @param gpu_matrix   A coordinate_matrix from ViennaCL
+    * @param cpu_matrix   A sparse matrix on the host.
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+                     CPU_MATRIX & cpu_matrix )
+    {
+      assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+      {
+        //get raw data from memory:
+        viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.nnz());
+        std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
+
+        //std::cout << "GPU nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+        viennacl::backend::memory_read(gpu_matrix.handle12(), 0, coord_buffer.raw_size(), coord_buffer.get());
+        viennacl::backend::memory_read(gpu_matrix.handle(),   0, sizeof(SCALARTYPE) * elements.size(), &(elements[0]));
+
+        //fill the cpu_matrix:
+        for (vcl_size_t index = 0; index < gpu_matrix.nnz(); ++index)
+          cpu_matrix(coord_buffer[2*index], coord_buffer[2*index+1]) = elements[index];
+
+      }
+    }
+
+    /** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
+    *
+    * @param gpu_matrix   A coordinate_matrix from ViennaCL
+    * @param cpu_matrix   A sparse matrix on the host.
+    */
+    template <typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
+              std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
+    {
+      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+      copy(gpu_matrix, temp);
+    }
+
+
+    //////////////////////// coordinate_matrix //////////////////////////
+    /** @brief A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row and column indices and val denotes the entry.
+    *
+    * The present implementation of coordinate_matrix suffers from poor runtime efficiency. Users are adviced to use compressed_matrix in the meanwhile.
+    *
+    * @tparam SCALARTYPE    The floating point type (either float or double, checked at compile time)
+    * @tparam ALIGNMENT     The internal memory size for the arrays, given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two.
+    */
+    template<class SCALARTYPE, unsigned int ALIGNMENT /* see forwards.h */ >
+    class coordinate_matrix
+    {
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+        typedef vcl_size_t                                                                                 size_type;
+
+        /** @brief Default construction of a coordinate matrix. No memory is allocated */
+        coordinate_matrix() : rows_(0), cols_(0), nonzeros_(0), group_num_(64) {}
+
+        explicit coordinate_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0), group_num_(64)
+        {
+          group_boundaries_.switch_active_handle_id(ctx.memory_type());
+              coord_buffer_.switch_active_handle_id(ctx.memory_type());
+                  elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            group_boundaries_.opencl_handle().context(ctx.opencl_context());
+                coord_buffer_.opencl_handle().context(ctx.opencl_context());
+                    elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+        }
+
+        /** @brief Construction of a coordinate matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+        *
+        * @param rows     Number of rows
+        * @param cols     Number of columns
+        * @param nonzeros Optional number of nonzeros for memory preallocation
+        * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+        */
+        coordinate_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context()) :
+          rows_(rows), cols_(cols), nonzeros_(nonzeros)
+        {
+          if (nonzeros > 0)
+          {
+            viennacl::backend::memory_create(group_boundaries_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (group_num_ + 1), ctx);
+            viennacl::backend::memory_create(coord_buffer_,     viennacl::backend::typesafe_host_array<unsigned int>().element_size() * 2 * internal_nnz(), ctx);
+            viennacl::backend::memory_create(elements_,         sizeof(SCALARTYPE) * internal_nnz(), ctx);
+          }
+          else
+          {
+            group_boundaries_.switch_active_handle_id(ctx.memory_type());
+                coord_buffer_.switch_active_handle_id(ctx.memory_type());
+                    elements_.switch_active_handle_id(ctx.memory_type());
+
+  #ifdef VIENNACL_WITH_OPENCL
+            if (ctx.memory_type() == OPENCL_MEMORY)
+            {
+              group_boundaries_.opencl_handle().context(ctx.opencl_context());
+                  coord_buffer_.opencl_handle().context(ctx.opencl_context());
+                      elements_.opencl_handle().context(ctx.opencl_context());
+            }
+  #endif
+          }
+        }
+
+        /** @brief Construction of a coordinate matrix with the supplied number of rows and columns in the supplied context. Does not yet allocate memory.
+        *
+        * @param rows     Number of rows
+        * @param cols     Number of columns
+        * @param ctx      Context in which to create the matrix
+        */
+        explicit coordinate_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+          : rows_(rows), cols_(cols), nonzeros_(0)
+        {
+          group_boundaries_.switch_active_handle_id(ctx.memory_type());
+              coord_buffer_.switch_active_handle_id(ctx.memory_type());
+                  elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            group_boundaries_.opencl_handle().context(ctx.opencl_context());
+                coord_buffer_.opencl_handle().context(ctx.opencl_context());
+                    elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+        }
+
+
+        /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
+        void reserve(vcl_size_t new_nonzeros)
+        {
+          if (new_nonzeros > nonzeros_)  //TODO: Do we need to initialize new memory with zero?
+          {
+            handle_type coord_buffer_old;
+            handle_type elements_old;
+            viennacl::backend::memory_shallow_copy(coord_buffer_, coord_buffer_old);
+            viennacl::backend::memory_shallow_copy(elements_, elements_old);
+
+            vcl_size_t internal_new_nnz = viennacl::tools::align_to_multiple<vcl_size_t>(new_nonzeros, ALIGNMENT);
+            viennacl::backend::typesafe_host_array<unsigned int> size_deducer(coord_buffer_);
+            viennacl::backend::memory_create(coord_buffer_, size_deducer.element_size() * 2 * internal_new_nnz, viennacl::traits::context(coord_buffer_));
+            viennacl::backend::memory_create(elements_,     sizeof(SCALARTYPE)  * internal_new_nnz,             viennacl::traits::context(elements_));
+
+            viennacl::backend::memory_copy(coord_buffer_old, coord_buffer_, 0, 0, size_deducer.element_size() * 2 * nonzeros_);
+            viennacl::backend::memory_copy(elements_old,     elements_,     0, 0, sizeof(SCALARTYPE)  * nonzeros_);
+
+            nonzeros_ = new_nonzeros;
+          }
+        }
+
+        /** @brief Resize the matrix.
+        *
+        * @param new_size1    New number of rows
+        * @param new_size2    New number of columns
+        * @param preserve     If true, the old values are preserved. At present, old values are always discarded.
+        */
+        void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+        {
+          assert (new_size1 > 0 && new_size2 > 0);
+
+          if (new_size1 < rows_ || new_size2 < cols_) //enlarge buffer
+          {
+            std::vector<std::map<unsigned int, SCALARTYPE> > stl_sparse_matrix;
+            if (rows_ > 0)
+              stl_sparse_matrix.resize(rows_);
+
+            if (preserve && rows_ > 0)
+              viennacl::copy(*this, stl_sparse_matrix);
+
+            stl_sparse_matrix.resize(new_size1);
+
+            //std::cout << "Cropping STL matrix of size " << stl_sparse_matrix.size() << std::endl;
+            if (new_size2 < cols_ && rows_ > 0)
+            {
+              for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+              {
+                std::list<unsigned int> to_delete;
+                for (typename std::map<unsigned int, SCALARTYPE>::iterator it = stl_sparse_matrix[i].begin();
+                    it != stl_sparse_matrix[i].end();
+                    ++it)
+                {
+                  if (it->first >= new_size2)
+                    to_delete.push_back(it->first);
+                }
+
+                for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+                  stl_sparse_matrix[i].erase(*it);
+              }
+              //std::cout << "Cropping done..." << std::endl;
+            }
+
+            rows_ = new_size1;
+            cols_ = new_size2;
+            viennacl::copy(stl_sparse_matrix, *this);
+          }
+
+          rows_ = new_size1;
+          cols_ = new_size2;
+        }
+
+
+        /** @brief  Returns the number of rows */
+        vcl_size_t size1() const { return rows_; }
+        /** @brief  Returns the number of columns */
+        vcl_size_t size2() const { return cols_; }
+        /** @brief  Returns the number of nonzero entries */
+        vcl_size_t nnz() const { return nonzeros_; }
+        /** @brief  Returns the number of internal nonzero entries */
+        vcl_size_t internal_nnz() const { return viennacl::tools::align_to_multiple<vcl_size_t>(nonzeros_, ALIGNMENT); }
+
+        /** @brief  Returns the OpenCL handle to the (row, column) index array */
+        const handle_type & handle12() const { return coord_buffer_; }
+        /** @brief  Returns the OpenCL handle to the matrix entry array */
+        const handle_type & handle() const { return elements_; }
+        /** @brief  Returns the OpenCL handle to the group start index array */
+        const handle_type & handle3() const { return group_boundaries_; }
+
+        vcl_size_t groups() const { return group_num_; }
+
+        #if defined(_MSC_VER) && _MSC_VER < 1500      //Visual Studio 2005 needs special treatment
+        template <typename CPU_MATRIX>
+        friend void copy(const CPU_MATRIX & cpu_matrix, coordinate_matrix & gpu_matrix );
+        #else
+        template <typename CPU_MATRIX, typename SCALARTYPE2, unsigned int ALIGNMENT2>
+        friend void copy(const CPU_MATRIX & cpu_matrix, coordinate_matrix<SCALARTYPE2, ALIGNMENT2> & gpu_matrix );
+        #endif
+
+      private:
+        /** @brief Copy constructor is by now not available. */
+        coordinate_matrix(coordinate_matrix const &);
+
+        /** @brief Assignment is by now not available. */
+        coordinate_matrix & operator=(coordinate_matrix const &);
+
+
+        vcl_size_t rows_;
+        vcl_size_t cols_;
+        vcl_size_t nonzeros_;
+        vcl_size_t group_num_;
+        handle_type coord_buffer_;
+        handle_type elements_;
+        handle_type group_boundaries_;
+    };
+
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x += A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x -= A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+      } // namespace detail
+    } // namespace linalg
+
+    /** \endcond */
+}
+
+#endif
diff --git a/viennacl/ell_matrix.hpp b/viennacl/ell_matrix.hpp
new file mode 100644
index 0000000..6e8af98
--- /dev/null
+++ b/viennacl/ell_matrix.hpp
@@ -0,0 +1,296 @@
+#ifndef VIENNACL_ELL_MATRIX_HPP_
+#define VIENNACL_ELL_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ell_matrix.hpp
+    @brief Implementation of the ell_matrix class
+
+    Contributed by Volodymyr Kysenko.
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+    /** @brief Sparse matrix class using the ELLPACK format for storing the nonzeros.
+      *
+      * This format works best for matrices where the number of nonzeros per row is mostly the same.
+      * Finite element and finite difference methods on nicely shaped domains often result in such a nonzero pattern.
+      * For a matrix
+      *
+      *   (1 2 0 0 0)
+      *   (2 3 4 0 0)
+      *   (0 5 6 0 7)
+      *   (0 0 8 9 0)
+      *
+      * the entries are layed out in chunks of size 3 as
+      *   (1 2 5 8; 2 3 6 9; 0 4 7 0)
+      * Note that this is a 'transposed' representation in order to maximize coalesced memory access.
+      */
+    template<typename SCALARTYPE, unsigned int ALIGNMENT /* see forwards.h for default argument */>
+    class ell_matrix
+    {
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+        typedef vcl_size_t                                                                                 size_type;
+
+        ell_matrix() : rows_(0), cols_(0), maxnnz_(0) {}
+
+        ell_matrix(viennacl::context ctx) : rows_(0), cols_(0), maxnnz_(0)
+        {
+            coords_.switch_active_handle_id(ctx.memory_type());
+          elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+              coords_.opencl_handle().context(ctx.opencl_context());
+            elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+        }
+
+      public:
+        vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, ALIGNMENT); }
+        vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, ALIGNMENT); }
+
+        vcl_size_t size1() const { return rows_; }
+        vcl_size_t size2() const { return cols_; }
+
+        vcl_size_t internal_maxnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(maxnnz_, ALIGNMENT); }
+        vcl_size_t maxnnz() const { return maxnnz_; }
+
+        vcl_size_t nnz() const { return rows_ * maxnnz_; }
+        vcl_size_t internal_nnz() const { return internal_size1() * internal_maxnnz(); }
+
+              handle_type & handle()       { return elements_; }
+        const handle_type & handle() const { return elements_; }
+
+              handle_type & handle2()       { return coords_; }
+        const handle_type & handle2() const { return coords_; }
+
+      #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
+        template <typename CPU_MATRIX>
+        friend void copy(const CPU_MATRIX & cpu_matrix, ell_matrix & gpu_matrix );
+      #else
+        template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
+        friend void copy(const CPU_MATRIX & cpu_matrix, ell_matrix<T, ALIGN> & gpu_matrix );
+      #endif
+
+      private:
+        vcl_size_t rows_;
+        vcl_size_t cols_;
+        vcl_size_t maxnnz_;
+
+        handle_type coords_;
+        handle_type elements_;
+    };
+
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const CPU_MATRIX& cpu_matrix, ell_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix )
+    {
+      assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+      {
+        //determine max capacity for row
+        vcl_size_t max_entries_per_row = 0;
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+        {
+          vcl_size_t num_entries = 0;
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+          {
+              ++num_entries;
+          }
+
+          max_entries_per_row = std::max(max_entries_per_row, num_entries);
+        }
+
+        //setup GPU matrix
+        gpu_matrix.maxnnz_ = max_entries_per_row;
+        gpu_matrix.rows_ = cpu_matrix.size1();
+        gpu_matrix.cols_ = cpu_matrix.size2();
+
+        vcl_size_t nnz = gpu_matrix.internal_nnz();
+
+        viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), nnz);
+        std::vector<SCALARTYPE> elements(nnz, 0);
+
+        // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " "
+        //             << gpu_matrix.internal_maxnnz() << "\n";
+
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+        {
+          vcl_size_t data_index = 0;
+
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+          {
+            coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+            elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+            //std::cout << *col_it << "\n";
+              data_index++;
+          }
+        }
+
+        viennacl::backend::memory_create(gpu_matrix.handle2(), coords.raw_size(),                   traits::context(gpu_matrix.handle2()), coords.get());
+        viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * elements.size(), traits::context(gpu_matrix.handle()), &(elements[0]));
+      }
+    }
+
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const ell_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix, CPU_MATRIX& cpu_matrix)
+    {
+      assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+      {
+        std::vector<SCALARTYPE> elements(gpu_matrix.internal_nnz());
+        viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), gpu_matrix.internal_nnz());
+
+        viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE) * elements.size(), &(elements[0]));
+        viennacl::backend::memory_read(gpu_matrix.handle2(), 0, coords.raw_size(), coords.get());
+
+        for(vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+        {
+          for(vcl_size_t ind = 0; ind < gpu_matrix.internal_maxnnz(); ind++)
+          {
+            vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+
+            if(elements[offset] == static_cast<SCALARTYPE>(0.0))
+                continue;
+
+            if(coords[offset] >= gpu_matrix.size2())
+            {
+                std::cerr << "ViennaCL encountered invalid data " << offset << " " << ind << " " << row << " " << coords[offset] << " " << gpu_matrix.size2() << std::endl;
+                return;
+            }
+
+            cpu_matrix(row, coords[offset]) = elements[offset];
+          }
+        }
+      }
+    }
+
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+     } // namespace detail
+   } // namespace linalg
+
+    /** \endcond */
+}
+
+#endif
+
+
diff --git a/viennacl/fft.hpp b/viennacl/fft.hpp
index d5efbff..101c781 100644
--- a/viennacl/fft.hpp
+++ b/viennacl/fft.hpp
@@ -2,41 +2,42 @@
 #define VIENNACL_FFT_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file fft.hpp
-    @brief All routines related to the Fast Fourier Transform. Experimental in 1.2.x.
+/** @file viennacl/fft.hpp
+    @brief All routines related to the Fast Fourier Transform. Experimental.
 */
 
 #include <viennacl/vector.hpp>
 #include <viennacl/matrix.hpp>
 
-#include "viennacl/linalg/kernels/fft_kernels.h"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
 
 #include <cmath>
 
 #include <stdexcept>
 
-namespace viennacl 
+namespace viennacl
 {
   namespace detail
   {
     namespace fft
     {
-        const std::size_t MAX_LOCAL_POINTS_NUM = 512;
+        const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
 
         namespace FFT_DATA_ORDER {
             enum DATA_ORDER {
@@ -49,21 +50,24 @@ namespace viennacl
 }
 
 /// @cond
-namespace viennacl {
-  namespace detail {
-    namespace fft {
+namespace viennacl
+{
+  namespace detail
+  {
+    namespace fft
+    {
 
-        inline bool is_radix2(std::size_t data_size) {
+        inline bool is_radix2(vcl_size_t data_size) {
             return !((data_size > 2) && (data_size & (data_size - 1)));
 
         }
 
-        inline std::size_t next_power_2(std::size_t n) {
+        inline vcl_size_t next_power_2(vcl_size_t n) {
             n = n - 1;
 
-            std::size_t power = 1;
+            vcl_size_t power = 1;
 
-            while(power < sizeof(std::size_t) * 8) {
+            while(power < sizeof(vcl_size_t) * 8) {
                 n = n | (n >> power);
                 power *= 2;
             }
@@ -71,10 +75,10 @@ namespace viennacl {
             return n + 1;
         }
 
-        inline std::size_t num_bits(std::size_t size)
+        inline vcl_size_t num_bits(vcl_size_t size)
         {
-            std::size_t bits_datasize = 0;
-            std::size_t ds = 1;
+            vcl_size_t bits_datasize = 0;
+            vcl_size_t ds = 1;
 
             while(ds < size)
             {
@@ -95,21 +99,25 @@ namespace viennacl {
         template<class SCALARTYPE>
         void direct(const viennacl::ocl::handle<cl_mem>& in,
                     const viennacl::ocl::handle<cl_mem>& out,
-                    std::size_t size,
-                    std::size_t stride,
-                    std::size_t batch_num,
+                    vcl_size_t size,
+                    vcl_size_t stride,
+                    vcl_size_t batch_num,
                     SCALARTYPE sign = -1.0f,
                     FFT_DATA_ORDER::DATA_ORDER data_order = FFT_DATA_ORDER::ROW_MAJOR
                     )
         {
-          viennacl::linalg::kernels::matrix_row<SCALARTYPE, 1>::init();
-          std::string program_string = viennacl::linalg::kernels::matrix_row<SCALARTYPE, 1>::program_name();
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+
+          std::string program_string = viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, row_major>::program_name();
           if (data_order == FFT_DATA_ORDER::COL_MAJOR)
           {
-            viennacl::linalg::kernels::matrix_col<SCALARTYPE, 1>::init();
-            program_string = viennacl::linalg::kernels::matrix_col<SCALARTYPE, 1>::program_name();
+            viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, column_major>::init(ctx);
+            program_string = viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, column_major>::program_name();
           }
-          viennacl::ocl::kernel& kernel = viennacl::ocl::current_context().get_program(program_string).get_kernel("fft_direct");
+          else
+            viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, row_major>::init(ctx);
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(program_string, "fft_direct");
           viennacl::ocl::enqueue(kernel(in, out, static_cast<cl_uint>(size), static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num), sign));
         }
 
@@ -119,25 +127,27 @@ namespace viennacl {
         */
         template <typename SCALARTYPE>
         void reorder(const viennacl::ocl::handle<cl_mem>& in,
-                     std::size_t size,
-                     std::size_t stride,
-                     std::size_t bits_datasize,
-                     std::size_t batch_num,
+                     vcl_size_t size,
+                     vcl_size_t stride,
+                     vcl_size_t bits_datasize,
+                     vcl_size_t batch_num,
                      FFT_DATA_ORDER::DATA_ORDER data_order = FFT_DATA_ORDER::ROW_MAJOR
                      )
         {
-          viennacl::linalg::kernels::matrix_row<SCALARTYPE, 1>::init();
-          std::string program_string = viennacl::linalg::kernels::matrix_row<SCALARTYPE, 1>::program_name();
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+
+          std::string program_string = viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, row_major>::program_name();
           if (data_order == FFT_DATA_ORDER::COL_MAJOR)
           {
-            viennacl::linalg::kernels::matrix_col<SCALARTYPE, 1>::init();
-            program_string = viennacl::linalg::kernels::matrix_col<SCALARTYPE, 1>::program_name();
+            viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, column_major>::init(ctx);
+            program_string = viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, column_major>::program_name();
           }
-          
-          viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                              .get_program(program_string)
-                                              .get_kernel("fft_reorder");
-          viennacl::ocl::enqueue(kernel(in, 
+          else
+            viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, row_major>::init(ctx);
+
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(program_string, "fft_reorder");
+          viennacl::ocl::enqueue(kernel(in,
                                         static_cast<cl_uint>(bits_datasize),
                                         static_cast<cl_uint>(size),
                                         static_cast<cl_uint>(stride),
@@ -155,33 +165,33 @@ namespace viennacl {
         */
         template<class SCALARTYPE>
         void radix2(const viennacl::ocl::handle<cl_mem>& in,
-                    std::size_t size,
-                    std::size_t stride,
-                    std::size_t batch_num,
+                    vcl_size_t size,
+                    vcl_size_t stride,
+                    vcl_size_t batch_num,
                     SCALARTYPE sign = -1.0f,
                     FFT_DATA_ORDER::DATA_ORDER data_order = FFT_DATA_ORDER::ROW_MAJOR
                     )
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
 
             assert(batch_num != 0);
             assert(is_radix2(size));
 
-            viennacl::linalg::kernels::matrix_row<SCALARTYPE, 1>::init();
-            std::string program_string = viennacl::linalg::kernels::matrix_row<SCALARTYPE, 1>::program_name();
+            std::string program_string = viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, row_major>::program_name();
             if (data_order == FFT_DATA_ORDER::COL_MAJOR)
             {
-              viennacl::linalg::kernels::matrix_col<SCALARTYPE, 1>::init();
-              program_string = viennacl::linalg::kernels::matrix_col<SCALARTYPE, 1>::program_name();
+              viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, column_major>::init(ctx);
+              program_string = viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, column_major>::program_name();
             }
+            else
+              viennacl::linalg::opencl::kernels::matrix<SCALARTYPE, row_major>::init(ctx);
 
-            std::size_t bits_datasize = num_bits(size);
+            vcl_size_t bits_datasize = num_bits(size);
 
-            if(size <= MAX_LOCAL_POINTS_NUM) 
+            if(size <= MAX_LOCAL_POINTS_NUM)
             {
-                viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                                 .get_program(program_string)
-                                                 .get_kernel("fft_radix2_local");
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(program_string, "fft_radix2_local");
                 viennacl::ocl::enqueue(kernel(in,
                                               viennacl::ocl::local_mem((size * 4) * sizeof(SCALARTYPE)),
                                               static_cast<cl_uint>(bits_datasize),
@@ -189,16 +199,14 @@ namespace viennacl {
                                               static_cast<cl_uint>(stride),
                                               static_cast<cl_uint>(batch_num),
                                               sign));
-            } 
+            }
             else
             {
                 reorder<SCALARTYPE>(in, size, stride, bits_datasize, batch_num);
 
-                for(std::size_t step = 0; step < bits_datasize; step++) 
+                for(vcl_size_t step = 0; step < bits_datasize; step++)
                 {
-                    viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                                     .get_program(program_string)
-                                                     .get_kernel("fft_radix2");
+                    viennacl::ocl::kernel& kernel = ctx.get_kernel(program_string, "fft_radix2");
                     viennacl::ocl::enqueue(kernel(in,
                                                   static_cast<cl_uint>(step),
                                                   static_cast<cl_uint>(bits_datasize),
@@ -221,24 +229,21 @@ namespace viennacl {
         template<class SCALARTYPE, unsigned int ALIGNMENT>
         void bluestein(viennacl::vector<SCALARTYPE, ALIGNMENT>& in,
                        viennacl::vector<SCALARTYPE, ALIGNMENT>& out,
-                       std::size_t batch_num,
-                       SCALARTYPE sign = -1.0
-                       )
+                       vcl_size_t /*batch_num*/)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
 
-            std::size_t size = in.size() >> 1;
-            std::size_t ext_size = next_power_2(2 * size - 1);
+          vcl_size_t size = in.size() >> 1;
+          vcl_size_t ext_size = next_power_2(2 * size - 1);
 
-            viennacl::vector<SCALARTYPE, ALIGNMENT> A(ext_size << 1);
-            viennacl::vector<SCALARTYPE, ALIGNMENT> B(ext_size << 1);
+          viennacl::vector<SCALARTYPE, ALIGNMENT> A(ext_size << 1);
+          viennacl::vector<SCALARTYPE, ALIGNMENT> B(ext_size << 1);
 
-            viennacl::vector<SCALARTYPE, ALIGNMENT> Z(ext_size << 1);
+          viennacl::vector<SCALARTYPE, ALIGNMENT> Z(ext_size << 1);
 
             {
-                viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("zero2");
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "zero2");
                 viennacl::ocl::enqueue(kernel(
                                             A,
                                             B,
@@ -247,9 +252,7 @@ namespace viennacl {
 
             }
             {
-                viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("bluestein_pre");
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "bluestein_pre");
                 viennacl::ocl::enqueue(kernel(
                                            in,
                                            A,
@@ -262,9 +265,7 @@ namespace viennacl {
             viennacl::linalg::convolve_i(A, B, Z);
 
             {
-                viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                                 .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                                 .get_kernel("bluestein_post");
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "bluestein_post");
                 viennacl::ocl::enqueue(kernel(
                                             Z,
                                             out,
@@ -276,95 +277,90 @@ namespace viennacl {
         template<class SCALARTYPE, unsigned int ALIGNMENT>
         void multiply(viennacl::vector<SCALARTYPE, ALIGNMENT> const & input1,
                       viennacl::vector<SCALARTYPE, ALIGNMENT> const & input2,
-                      viennacl::vector<SCALARTYPE, ALIGNMENT> & output) 
+                      viennacl::vector<SCALARTYPE, ALIGNMENT> & output)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-            std::size_t size = input1.size() >> 1;
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("fft_mult_vec");
-            viennacl::ocl::enqueue(kernel(input1, input2, output, static_cast<cl_uint>(size)));
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input1).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+          vcl_size_t size = input1.size() >> 1;
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "fft_mult_vec");
+          viennacl::ocl::enqueue(kernel(input1, input2, output, static_cast<cl_uint>(size)));
         }
 
         template<class SCALARTYPE, unsigned int ALIGNMENT>
-        void normalize(viennacl::vector<SCALARTYPE, ALIGNMENT> & input) 
+        void normalize(viennacl::vector<SCALARTYPE, ALIGNMENT> & input)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("fft_div_vec_scalar");
-            std::size_t size = input.size() >> 1;
-            SCALARTYPE norm_factor = static_cast<SCALARTYPE>(size);
-            viennacl::ocl::enqueue(kernel(input, static_cast<cl_uint>(size), norm_factor));
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "fft_div_vec_scalar");
+          vcl_size_t size = input.size() >> 1;
+          SCALARTYPE norm_factor = static_cast<SCALARTYPE>(size);
+          viennacl::ocl::enqueue(kernel(input, static_cast<cl_uint>(size), norm_factor));
         }
 
         template<class SCALARTYPE, unsigned int ALIGNMENT>
-        void transpose(viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> & input) 
+        void transpose(viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> & input)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("transpose_inplace");
-            viennacl::ocl::enqueue(kernel(input,
-                                          static_cast<cl_uint>(input.internal_size1()),
-                                          static_cast<cl_uint>(input.internal_size2()) >> 1));
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "transpose_inplace");
+          viennacl::ocl::enqueue(kernel(input,
+                                        static_cast<cl_uint>(input.internal_size1()),
+                                        static_cast<cl_uint>(input.internal_size2()) >> 1));
         }
 
         template<class SCALARTYPE, unsigned int ALIGNMENT>
         void transpose(viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> const & input,
                        viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> & output)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-          
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("transpose");
-            viennacl::ocl::enqueue(kernel(input,
-                                          output,
-                                          static_cast<cl_uint>(input.internal_size1()),
-                                          static_cast<cl_uint>(input.internal_size2() >> 1))
-                                  );
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "transpose");
+          viennacl::ocl::enqueue(kernel(input,
+                                        output,
+                                        static_cast<cl_uint>(input.internal_size1()),
+                                        static_cast<cl_uint>(input.internal_size2() >> 1))
+                                );
         }
-        
-        template<class SCALARTYPE, unsigned int ALIGNMENT>
-        void real_to_complex(viennacl::vector<SCALARTYPE, ALIGNMENT> const & in,
-                             viennacl::vector<SCALARTYPE, ALIGNMENT> & out,
-                             std::size_t size) 
+
+        template<class SCALARTYPE>
+        void real_to_complex(viennacl::vector_base<SCALARTYPE> const & in,
+                             viennacl::vector_base<SCALARTYPE> & out,
+                             vcl_size_t size)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("real_to_complex");
-            viennacl::ocl::enqueue(kernel(in, out, static_cast<cl_uint>(size)));
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+          viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "real_to_complex");
+          viennacl::ocl::enqueue(kernel(in, out, static_cast<cl_uint>(size)));
         }
 
-        template<class SCALARTYPE, unsigned int ALIGNMENT>
-        void complex_to_real(viennacl::vector<SCALARTYPE, ALIGNMENT> const & in,
-                             viennacl::vector<SCALARTYPE, ALIGNMENT>& out,
-                             std::size_t size)
+        template<class SCALARTYPE>
+        void complex_to_real(viennacl::vector_base<SCALARTYPE> const & in,
+                             viennacl::vector_base<SCALARTYPE>& out,
+                             vcl_size_t size)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("complex_to_real");
-            viennacl::ocl::enqueue(kernel(in, out, static_cast<cl_uint>(size)));
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "complex_to_real");
+          viennacl::ocl::enqueue(kernel(in, out, static_cast<cl_uint>(size)));
         }
 
-        template<class SCALARTYPE, unsigned int ALIGNMENT>
-        void reverse(viennacl::vector<SCALARTYPE, ALIGNMENT>& in)
+        template<class SCALARTYPE>
+        void reverse(viennacl::vector_base<SCALARTYPE>& in)
         {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-            std::size_t size = in.size();
-            viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                             .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                             .get_kernel("reverse_inplace");
-            viennacl::ocl::enqueue(kernel(in, static_cast<cl_uint>(size)));
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+          viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+          vcl_size_t size = in.size();
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "reverse_inplace");
+          viennacl::ocl::enqueue(kernel(in, static_cast<cl_uint>(size)));
         }
 
-        
+
     } //namespace fft
   } //namespace detail
-  
+
   /**
     * @brief Generic inplace version of 1-D Fourier transformation.
     *
@@ -374,24 +370,24 @@ namespace viennacl {
     */
   template<class SCALARTYPE, unsigned int ALIGNMENT>
   void inplace_fft(viennacl::vector<SCALARTYPE, ALIGNMENT>& input,
-            std::size_t batch_num = 1,
+            vcl_size_t batch_num = 1,
             SCALARTYPE sign = -1.0)
   {
-      std::size_t size = (input.size() >> 1) / batch_num;
+      vcl_size_t size = (input.size() >> 1) / batch_num;
 
-      if(!detail::fft::is_radix2(size)) 
+      if(!viennacl::detail::fft::is_radix2(size))
       {
           viennacl::vector<SCALARTYPE, ALIGNMENT> output(input.size());
-          detail::fft::direct(input.handle(),
-                              output.handle(),
-                              size,
-                              size,
-                              batch_num,
-                              sign);
+          viennacl::detail::fft::direct(viennacl::traits::opencl_handle(input),
+                                        viennacl::traits::opencl_handle(output),
+                                        size,
+                                        size,
+                                        batch_num,
+                                        sign);
 
           viennacl::copy(output, input);
       } else {
-          detail::fft::radix2(input.handle(), size, size, batch_num, sign);
+          viennacl::detail::fft::radix2(viennacl::traits::opencl_handle(input), size, size, batch_num, sign);
       }
   }
 
@@ -406,23 +402,23 @@ namespace viennacl {
   template<class SCALARTYPE, unsigned int ALIGNMENT>
   void fft(viennacl::vector<SCALARTYPE, ALIGNMENT>& input,
             viennacl::vector<SCALARTYPE, ALIGNMENT>& output,
-            std::size_t batch_num = 1,
+            vcl_size_t batch_num = 1,
             SCALARTYPE sign = -1.0
             )
   {
-      std::size_t size = (input.size() >> 1) / batch_num;
+      vcl_size_t size = (input.size() >> 1) / batch_num;
 
-      if(detail::fft::is_radix2(size))
+      if(viennacl::detail::fft::is_radix2(size))
       {
           viennacl::copy(input, output);
-          detail::fft::radix2(output.handle(), size, size, batch_num, sign);
+          viennacl::detail::fft::radix2(viennacl::traits::opencl_handle(output), size, size, batch_num, sign);
       } else {
-          detail::fft::direct(input.handle(),
-                              output.handle(),
-                              size,
-                              size,
-                              batch_num,
-                              sign);
+          viennacl::detail::fft::direct(viennacl::traits::opencl_handle(input),
+                                        viennacl::traits::opencl_handle(output),
+                                        size,
+                                        size,
+                                        batch_num,
+                                        sign);
       }
   }
 
@@ -436,45 +432,45 @@ namespace viennacl {
   void inplace_fft(viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>& input,
             SCALARTYPE sign = -1.0)
   {
-      std::size_t rows_num = input.size1();
-      std::size_t cols_num = input.size2() >> 1;
+      vcl_size_t rows_num = input.size1();
+      vcl_size_t cols_num = input.size2() >> 1;
 
-      std::size_t cols_int = input.internal_size2() >> 1;
+      vcl_size_t cols_int = input.internal_size2() >> 1;
 
       // batch with rows
-      if(detail::fft::is_radix2(cols_num)) 
+      if(viennacl::detail::fft::is_radix2(cols_num))
       {
-          detail::fft::radix2(input.handle(), cols_num, cols_int, rows_num, sign, detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
-      } 
+          viennacl::detail::fft::radix2(viennacl::traits::opencl_handle(input), cols_num, cols_int, rows_num, sign, viennacl::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+      }
       else
       {
           viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> output(input.size1(), input.size2());
 
-          detail::fft::direct(input.handle(),
-                              output.handle(),
-                              cols_num,
-                              cols_int,
-                              rows_num,
-                              sign,
-                              detail::fft::FFT_DATA_ORDER::ROW_MAJOR
-                              );
+          viennacl::detail::fft::direct(viennacl::traits::opencl_handle(input),
+                                        viennacl::traits::opencl_handle(output),
+                                        cols_num,
+                                        cols_int,
+                                        rows_num,
+                                        sign,
+                                        viennacl::detail::fft::FFT_DATA_ORDER::ROW_MAJOR
+                                        );
 
           input = output;
       }
 
       // batch with cols
-      if (detail::fft::is_radix2(rows_num)) {
-          detail::fft::radix2(input.handle(), rows_num, cols_int, cols_num, sign, detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+      if (viennacl::detail::fft::is_radix2(rows_num)) {
+          viennacl::detail::fft::radix2(viennacl::traits::opencl_handle(input), rows_num, cols_int, cols_num, sign, viennacl::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
       } else {
           viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> output(input.size1(), input.size2());
 
-          detail::fft::direct(input.handle(),
-                              output.handle(),
-                              rows_num,
-                              cols_int,
-                              cols_num,
-                              sign,
-                              detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+          viennacl::detail::fft::direct(viennacl::traits::opencl_handle(input),
+                                        viennacl::traits::opencl_handle(output),
+                                        rows_num,
+                                        cols_int,
+                                        cols_num,
+                                        sign,
+                                        viennacl::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
 
           input = output;
       }
@@ -493,46 +489,46 @@ namespace viennacl {
             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>& output,
             SCALARTYPE sign = -1.0)
   {
-      std::size_t rows_num = input.size1();
-      std::size_t cols_num = input.size2() >> 1;
+      vcl_size_t rows_num = input.size1();
+      vcl_size_t cols_num = input.size2() >> 1;
 
-      std::size_t cols_int = input.internal_size2() >> 1;
+      vcl_size_t cols_int = input.internal_size2() >> 1;
 
       // batch with rows
-      if(detail::fft::is_radix2(cols_num))
+      if(viennacl::detail::fft::is_radix2(cols_num))
       {
           output = input;
-          detail::fft::radix2(output.handle(), cols_num, cols_int, rows_num, sign, detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
-      } 
+          viennacl::detail::fft::radix2(viennacl::traits::opencl_handle(output), cols_num, cols_int, rows_num, sign, viennacl::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+      }
       else
       {
-          detail::fft::direct(input.handle(),
-                              output.handle(),
-                              cols_num,
-                              cols_int,
-                              rows_num,
-                              sign,
-                              detail::fft::FFT_DATA_ORDER::ROW_MAJOR
-                              );
+          viennacl::detail::fft::direct(viennacl::traits::opencl_handle(input),
+                                        viennacl::traits::opencl_handle(output),
+                                        cols_num,
+                                        cols_int,
+                                        rows_num,
+                                        sign,
+                                        viennacl::detail::fft::FFT_DATA_ORDER::ROW_MAJOR
+                                        );
       }
 
       // batch with cols
-      if(detail::fft::is_radix2(rows_num))
+      if(viennacl::detail::fft::is_radix2(rows_num))
       {
-          detail::fft::radix2(output.handle(), rows_num, cols_int, cols_num, sign, detail::fft::FFT_DATA_ORDER::COL_MAJOR);
-      } 
+          viennacl::detail::fft::radix2(viennacl::traits::opencl_handle(output), rows_num, cols_int, cols_num, sign, viennacl::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+      }
       else
       {
           viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> tmp(output.size1(), output.size2());
           tmp = output;
 
-          detail::fft::direct(tmp.handle(),
-                              output.handle(),
+          viennacl::detail::fft::direct(viennacl::traits::opencl_handle(tmp),
+                              viennacl::traits::opencl_handle(output),
                               rows_num,
                               cols_int,
                               cols_num,
                               sign,
-                              detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+                              viennacl::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
       }
   }
 
@@ -547,10 +543,10 @@ namespace viennacl {
     */
   template<class SCALARTYPE, unsigned int ALIGNMENT>
   void inplace_ifft(viennacl::vector<SCALARTYPE, ALIGNMENT>& input,
-            std::size_t batch_num = 1)
+            vcl_size_t batch_num = 1)
   {
       viennacl::inplace_fft(input, batch_num, SCALARTYPE(1.0));
-      detail::fft::normalize(input);
+      viennacl::detail::fft::normalize(input);
   }
 
   /**
@@ -566,11 +562,11 @@ namespace viennacl {
   template<class SCALARTYPE, unsigned int ALIGNMENT>
   void ifft(viennacl::vector<SCALARTYPE, ALIGNMENT>& input,
             viennacl::vector<SCALARTYPE, ALIGNMENT>& output,
-            std::size_t batch_num = 1
+            vcl_size_t batch_num = 1
             )
   {
       viennacl::fft(input, output, batch_num, SCALARTYPE(1.0));
-      detail::fft::normalize(output);
+      viennacl::detail::fft::normalize(output);
   }
 
   namespace linalg
diff --git a/viennacl/forwards.h b/viennacl/forwards.h
index ea57b1a..d2ba91f 100644
--- a/viennacl/forwards.h
+++ b/viennacl/forwards.h
@@ -2,16 +2,17 @@
 #define VIENNACL_FORWARDS_H
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -23,49 +24,148 @@
 */
 
 /**
- @mainpage Source Code Documentation for ViennaCL 1.2.1
+ @mainpage Source Code Documentation for ViennaCL 1.5.1
 
  This is the source code documentation of ViennaCL. Detailed information about the functions in ViennaCL can be found here.
- 
+
  For a general overview over the types and functionality provided by ViennaCL, please refer to the file doc/viennacl.pdf
 
 */
 
+
+//compatibility defines:
+#ifdef VIENNACL_HAVE_UBLAS
+  #define VIENNACL_WITH_UBLAS
+#endif
+
+#ifdef VIENNACL_HAVE_EIGEN
+  #define VIENNACL_WITH_EIGEN
+#endif
+
+#ifdef VIENNACL_HAVE_MTL4
+  #define VIENNACL_WITH_MTL4
+#endif
+
 #include <cstddef>
-#include "viennacl/ocl/forwards.h"
+#include <cassert>
+#include <string>
+
 #include "viennacl/meta/enable_if.hpp"
 
+/** @brief Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them. */
 namespace viennacl
 {
   typedef std::size_t                                       vcl_size_t;
   typedef std::ptrdiff_t                                    vcl_ptrdiff_t;
- 
-  
+
+
+  /** @brief A tag class representing assignment */
+  struct op_assign {};
+  /** @brief A tag class representing inplace addition */
+  struct op_inplace_add {};
+  /** @brief A tag class representing inplace subtraction */
+  struct op_inplace_sub {};
+
   /** @brief A tag class representing addition */
-  struct op_add;
+  struct op_add {};
   /** @brief A tag class representing subtraction */
-  struct op_sub;
+  struct op_sub {};
+  /** @brief A tag class representing multiplication by a scalar */
+  struct op_mult {};
+  /** @brief A tag class representing matrix-vector products and element-wise multiplications*/
+  struct op_prod {};
+  /** @brief A tag class representing matrix-matrix products */
+  struct op_mat_mat_prod {};
   /** @brief A tag class representing division */
-  struct op_div;
-  
+  struct op_div {};
+  /** @brief A tag class representing the power function */
+  struct op_pow {};
+
+  /** @brief A tag class representing element-wise binary operations (like multiplication) on vectors or matrices */
+  template <typename OP>
+  struct op_element_binary {};
+
+  /** @brief A tag class representing element-wise unary operations (like sin()) on vectors or matrices */
+  template <typename OP>
+  struct op_element_unary {};
+
+  /** @brief A tag class representing the modulus function for integers */
+  struct op_abs {};
+  /** @brief A tag class representing the acos() function */
+  struct op_acos {};
+  /** @brief A tag class representing the asin() function */
+  struct op_asin {};
+  /** @brief A tag class representing the atan() function */
+  struct op_atan {};
+  /** @brief A tag class representing the atan2() function */
+  struct op_atan2 {};
+  /** @brief A tag class representing the ceil() function */
+  struct op_ceil {};
+  /** @brief A tag class representing the cos() function */
+  struct op_cos {};
+  /** @brief A tag class representing the cosh() function */
+  struct op_cosh {};
+  /** @brief A tag class representing the exp() function */
+  struct op_exp {};
+  /** @brief A tag class representing the fabs() function */
+  struct op_fabs {};
+  /** @brief A tag class representing the fdim() function */
+  struct op_fdim {};
+  /** @brief A tag class representing the floor() function */
+  struct op_floor {};
+  /** @brief A tag class representing the fmax() function */
+  struct op_fmax {};
+  /** @brief A tag class representing the fmin() function */
+  struct op_fmin {};
+  /** @brief A tag class representing the fmod() function */
+  struct op_fmod {};
+  /** @brief A tag class representing the log() function */
+  struct op_log {};
+  /** @brief A tag class representing the log10() function */
+  struct op_log10 {};
+  /** @brief A tag class representing the sin() function */
+  struct op_sin {};
+  /** @brief A tag class representing the sinh() function */
+  struct op_sinh {};
+  /** @brief A tag class representing the sqrt() function */
+  struct op_sqrt {};
+  /** @brief A tag class representing the tan() function */
+  struct op_tan {};
+  /** @brief A tag class representing the tanh() function */
+  struct op_tanh {};
+
+  /** @brief A tag class representing the (off-)diagonal of a matrix */
+  struct op_matrix_diag {};
+
+  /** @brief A tag class representing a matrix given by a vector placed on a certain (off-)diagonal */
+  struct op_vector_diag {};
+
+  /** @brief A tag class representing the extraction of a matrix row to a vector */
+  struct op_row {};
+
+  /** @brief A tag class representing the extraction of a matrix column to a vector */
+  struct op_column {};
+
   /** @brief A tag class representing inner products of two vectors */
-  struct op_inner_prod;
+  struct op_inner_prod {};
 
   /** @brief A tag class representing the 1-norm of a vector */
-  struct op_norm_1;
+  struct op_norm_1 {};
 
   /** @brief A tag class representing the 2-norm of a vector */
-  struct op_norm_2;
+  struct op_norm_2 {};
 
-  /** @brief A tag class representing the 2-norm of a vector */
-  struct op_norm_inf;
+  /** @brief A tag class representing the inf-norm of a vector */
+  struct op_norm_inf {};
+
+  /** @brief A tag class representing the Frobenius-norm of a matrix */
+  struct op_norm_frobenius {};
 
-  /** @brief A tag class representing a matrix-vector product */
-  struct op_prod;
-  
   /** @brief A tag class representing transposed matrices */
-  struct op_trans;
-  
+  struct op_trans {};
+
+  /** @brief A tag class representing sign flips (for scalars only. Vectors and matrices use the standard multiplication by the scalar -1.0) */
+  struct op_flip_sign {};
 
   //forward declaration of basic types:
   template<class TYPE>
@@ -76,7 +176,7 @@ namespace viennacl
 
   template <typename SCALARTYPE>
   class entry_proxy;
-  
+
   template <typename LHS, typename RHS, typename OP>
   class vector_expression;
 
@@ -85,10 +185,31 @@ namespace viennacl
 
   template<class SCALARTYPE, unsigned int ALIGNMENT>
   class const_vector_iterator;
-  
+
+  template<typename SCALARTYPE>
+  class implicit_vector_base;
+
+  template <typename SCALARTYPE>
+  class zero_vector;
+
+  template <typename SCALARTYPE>
+  class unit_vector;
+
+  template <typename SCALARTYPE>
+  class one_vector;
+
+  template <typename SCALARTYPE>
+  class scalar_vector;
+
+  template<class SCALARTYPE, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+  class vector_base;
+
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class vector;
-  
+
+  template <typename ScalarT>
+  class vector_tuple;
+
   //the following forwards are needed for GMRES
   template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
   void copy(CPU_ITERATOR const & cpu_begin,
@@ -99,16 +220,62 @@ namespace viennacl
   void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
             const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
             vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin);
-  
+
   template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
   void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
             const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
             const_vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin);
-  
-  
-  struct row_major;    
-  struct column_major;    
-  
+
+  template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+                 const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+                 CPU_ITERATOR cpu_begin );
+
+  template <typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+  void fast_copy(CPU_ITERATOR const & cpu_begin,
+                  CPU_ITERATOR const & cpu_end,
+                  vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin);
+
+
+  /** @brief Tag class for indicating row-major layout of a matrix. Not passed to the matrix directly, see row_major type. */
+  struct row_major_tag {};
+  /** @brief Tag class for indicating column-major layout of a matrix. Not passed to the matrix directly, see row_major type. */
+  struct column_major_tag {};
+
+  /** @brief A tag for row-major storage of a dense matrix. */
+  struct row_major
+  {
+    typedef row_major_tag         orientation_category;
+
+    /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
+    *
+    * @param i   row index
+    * @param j   column index
+    * @param num_cols  number of entries per column (including alignment)
+    */
+    static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t /* num_rows */, vcl_size_t num_cols)
+    {
+      return i * num_cols + j;
+    }
+  };
+
+  /** @brief A tag for column-major storage of a dense matrix. */
+  struct column_major
+  {
+    typedef column_major_tag         orientation_category;
+
+    /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
+    *
+    * @param i   row index
+    * @param j   column index
+    * @param num_rows  number of entries per row (including alignment)
+    */
+    static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t /* num_cols */)
+    {
+      return i + j * num_rows;
+    }
+  };
+
   struct row_iteration;
   struct col_iteration;
 
@@ -117,175 +284,601 @@ namespace viennacl
 
   //
   // Matrix types:
-  //  
+  //
+
+  template<class SCALARTYPE, typename F = row_major, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+  class matrix_base;
+
   template <class SCALARTYPE, typename F = row_major, unsigned int ALIGNMENT = 1>
   class matrix;
-  
+
+  template<typename SCALARTYPE>
+  class implicit_matrix_base;
+
+  template <class SCALARTYPE>
+  class identity_matrix;
+
+  template <class SCALARTYPE>
+  class zero_matrix;
+
+  template <class SCALARTYPE>
+  class scalar_matrix;
+
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class compressed_matrix;
-  
+
+  template<class SCALARTYPE>
+  class compressed_compressed_matrix;
+
+
   template<class SCALARTYPE, unsigned int ALIGNMENT = 128>
-  class coordinate_matrix;    
+  class coordinate_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class ell_matrix;
+
+  template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+  class hyb_matrix;
 
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class circulant_matrix;
-    
+
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class hankel_matrix;
-  
+
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class toeplitz_matrix;
-  
+
   template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
   class vandermonde_matrix;
-  
+
   //
   // Proxies:
   //
-  template <typename SizeType = std::size_t, typename DistanceType = std::ptrdiff_t>
+  template <typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
   class basic_range;
-  
+
   typedef basic_range<>  range;
-  
-  template <typename MatrixType>
-  class matrix_range;
+
+  template <typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
+  class basic_slice;
+
+  typedef basic_slice<>  slice;
 
   template <typename VectorType>
   class vector_range;
-  
+
+  template <typename VectorType>
+  class vector_slice;
+
+  template <typename MatrixType>
+  class matrix_range;
+
+  template <typename MatrixType>
+  class matrix_slice;
+
+
+  /** @brief Helper struct for checking whether a type is a host scalar type (e.g. float, double) */
   template <typename T>
-  struct is_scalar;
+  struct is_cpu_scalar
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper struct for checking whether a type is a viennacl::scalar<> */
+  template <typename T>
+  struct is_scalar
+  {
+    enum { value = false };
+  };
 
+  /** @brief Helper struct for checking whether a type represents a sign flip on a viennacl::scalar<> */
   template <typename T>
-  struct is_vector;
+  struct is_flip_sign_scalar
+  {
+    enum { value = false };
+  };
 
+  /** @brief Helper struct for checking whether the provided type represents a scalar (either host, from ViennaCL, or a flip-sign proxy) */
   template <typename T>
-  struct is_matrix;
-  
+  struct is_any_scalar
+  {
+    enum { value = (is_scalar<T>::value || is_cpu_scalar<T>::value || is_flip_sign_scalar<T>::value )};
+  };
+
+  /** @brief Checks for a type being either vector_base or implicit_vector_base */
+  template<typename T>
+  struct is_any_vector { enum { value = 0 }; };
+
+  /** @brief Checks for either matrix_base or implicit_matrix_base */
+  template<typename T>
+  struct is_any_dense_matrix { enum { value = 0 }; };
+
+  /** @brief Helper class for checking whether a matrix has a row-major layout. */
+  template <typename T>
+  struct is_row_major
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a compressed_matrix (CSR format) */
+  template <typename T>
+  struct is_compressed_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a coordinate_matrix (COO format) */
+  template <typename T>
+  struct is_coordinate_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is an ell_matrix (ELL format) */
+  template <typename T>
+  struct is_ell_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a hyb_matrix (hybrid format: ELL plus CSR) */
+  template <typename T>
+  struct is_hyb_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether the provided type is one of the sparse matrix types (compressed_matrix, coordinate_matrix, etc.) */
+  template <typename T>
+  struct is_any_sparse_matrix
+  {
+    enum { value = false };
+  };
+
+
+  /** @brief Helper class for checking whether a matrix is a circulant matrix */
+  template <typename T>
+  struct is_circulant_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a Hankel matrix */
+  template <typename T>
+  struct is_hankel_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a Toeplitz matrix */
+  template <typename T>
+  struct is_toeplitz_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether a matrix is a Vandermonde matrix */
+  template <typename T>
+  struct is_vandermonde_matrix
+  {
+    enum { value = false };
+  };
+
+  /** @brief Helper class for checking whether the provided type is any of the dense structured matrix types (circulant, Hankel, etc.) */
+  template <typename T>
+  struct is_any_dense_structured_matrix
+  {
+    enum { value = viennacl::is_circulant_matrix<T>::value || viennacl::is_hankel_matrix<T>::value || viennacl::is_toeplitz_matrix<T>::value || viennacl::is_vandermonde_matrix<T>::value };
+  };
+
+
+  enum memory_types
+  {
+    MEMORY_NOT_INITIALIZED
+    , MAIN_MEMORY
+    , OPENCL_MEMORY
+    , CUDA_MEMORY
+  };
+
+  /** @brief Exception class in case of memory errors */
+  class memory_exception : public std::exception
+  {
+  public:
+    memory_exception() : message_() {}
+    memory_exception(std::string message) : message_("ViennaCL: Internal memory error: " + message) {}
+
+    virtual const char* what() const throw() { return message_.c_str(); }
+
+    virtual ~memory_exception() throw() {}
+  private:
+    std::string message_;
+  };
+
+  class cuda_not_available_exception : public std::exception
+  {
+  public:
+    cuda_not_available_exception() : message_("ViennaCL was compiled without CUDA support, but CUDA functionality required for this operation.") {}
+
+    virtual const char* what() const throw() { return message_.c_str(); }
+
+    virtual ~cuda_not_available_exception() throw() {}
+  private:
+    std::string message_;
+  };
+
+
+  class context;
+
   namespace tools
   {
-    //helper for matrix row/col iterators 
+    //helper for matrix row/col iterators
     //must be specialized for every viennacl matrix type
+    /** @brief Helper class for incrementing an iterator in a dense matrix. */
     template <typename ROWCOL, typename MATRIXTYPE>
     struct MATRIX_ITERATOR_INCREMENTER
     {
-      static void apply(const MATRIXTYPE & mat, unsigned int & row, unsigned int & col)
-      {
-          typedef typename MATRIXTYPE::ERROR_SPECIALIZATION_FOR_THIS_MATRIX_TYPE_MISSING          ErrorIndicator;
-      }
+      typedef typename MATRIXTYPE::ERROR_SPECIALIZATION_FOR_THIS_MATRIX_TYPE_MISSING          ErrorIndicator;
+
+      static void apply(const MATRIXTYPE & /*mat*/, unsigned int & /*row*/, unsigned int & /*col*/) {}
     };
   }
-    
+
   namespace linalg
   {
+#if !defined(_MSC_VER) || defined(__CUDACC__)
+
     template<class SCALARTYPE, unsigned int ALIGNMENT>
     void convolve_i(viennacl::vector<SCALARTYPE, ALIGNMENT>& input1,
                     viennacl::vector<SCALARTYPE, ALIGNMENT>& input2,
                     viennacl::vector<SCALARTYPE, ALIGNMENT>& output);
-    
-#ifndef _MSC_VER
+
+    template <typename T>
+    viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_prod> >
+    element_prod(vector_base<T> const & v1, vector_base<T> const & v2);
+
+    template <typename T>
+    viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_div> >
+    element_div(vector_base<T> const & v1, vector_base<T> const & v2);
+
+
+
+    template <typename T>
+    void inner_prod_impl(vector_base<T> const & vec1,
+                         vector_base<T> const & vec2,
+                         scalar<T> & result);
+
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void inner_prod_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                         vector_base<T> const & vec2,
+                         scalar<T> & result);
+
+    template <typename T, typename LHS, typename RHS, typename OP>
+    void inner_prod_impl(vector_base<T> const & vec1,
+                         viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                         scalar<T> & result);
+
+    template <typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2, typename T>
+    void inner_prod_impl(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                         viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                         scalar<T> & result);
+
+    ///////////////////////////
+
+    template <typename T>
+    void inner_prod_cpu(vector_base<T> const & vec1,
+                        vector_base<T> const & vec2,
+                        T & result);
+
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void inner_prod_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                        vector_base<T> const & vec2,
+                        T & result);
+
+    template <typename T, typename LHS, typename RHS, typename OP>
+    void inner_prod_cpu(vector_base<T> const & vec1,
+                        viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                        T & result);
+
+    template <typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2, typename S3>
+    void inner_prod_cpu(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                        viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                        S3 & result);
+
+
+
     //forward definition of norm_1_impl function
-    template <typename V1, typename S2>
-    void norm_1_impl(V1 const & vec,
-                     S2 & result,
-                      typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                    && viennacl::is_scalar<S2>::value
-                                                  >::type * dummy = 0);
+    template <typename T>
+    void norm_1_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    //template <typename T, typename F>
+    //void norm_1_impl(matrix_base<T, F> const & A, scalar<T> & result);
+
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     scalar<T> & result);
+
+
+    template <typename T>
+    void norm_1_cpu(vector_base<T> const & vec,
+                    T & result);
+
+    //template <typename T, typename F>
+    //void norm_1_cpu(matrix_base<T, F> const & vec,
+    //                T & result);
+
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                    S2 & result);
 
     //forward definition of norm_2_impl function
-    template <typename V1, typename S2>
-    void norm_2_impl(V1 const & vec,
-                     S2 & result,
-                     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                   && viennacl::is_scalar<S2>::value
-                                                 >::type * dummy = 0);
+    template <typename T>
+    void norm_2_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void norm_2_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     scalar<T> & result);
+
+    template <typename T>
+    void norm_2_cpu(vector_base<T> const & vec, T & result);
+
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_2_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                    S2 & result);
+
 
     //forward definition of norm_inf_impl function
-    template <typename V1, typename S2>
-    void norm_inf_impl(V1 const & vec,
-                       S2 & result,
-                       typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                     && viennacl::is_scalar<S2>::value
-                                                   >::type * dummy = 0);
-#endif    
-    
+    template <typename T>
+    void norm_inf_impl(vector_base<T> const & vec, scalar<T> & result);
+
+    //template <typename T, typename F>
+    //void norm_inf_impl(matrix_base<T, F> const & vec, scalar<T> & result);
+
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void norm_inf_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      scalar<T> & result);
+
+
+    template <typename T>
+    void norm_inf_cpu(vector_base<T> const & vec, T & result);
+
+    //template <typename T, typename F>
+    //void norm_inf_cpu(matrix_base<T, F> const & vec, T & result);
+
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_inf_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      S2 & result);
+
+    template <typename T, typename F>
+    void norm_frobenius_impl(matrix_base<T, F> const & vec, scalar<T> & result);
+
+    template <typename T, typename F>
+    void norm_frobenius_cpu(matrix_base<T, F> const & vec, T & result);
+
+
+    template <typename T>
+    vcl_size_t index_norm_inf(vector_base<T> const & vec);
+
+    template <typename LHS, typename RHS, typename OP>
+    vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec);
+
     //forward definition of prod_impl functions
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> &, 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::compressed_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::compressed_matrix<SCALARTYPE, ALIGNMENT> & , 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & , 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> &);
-
-                                                    
-    //forward definition of inner_prod_impl function
-    /*template <typename V1, typename V2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value,
-                                  viennacl::scalar_expression< const V1, 
-                                                               const V2,
-                                                               viennacl::op_inner_prod >
-                                >::type
-    inner_prod_impl(V1 const & vec1,
-                    V2 const & vec2);*/
-    
-#ifndef _MSC_VER
-    template <typename V1, typename V2, typename S3>
-    void inner_prod_impl(V1 const & vec1,
-                         V2 const & vec2,
-                         S3 & result,
-                         typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                       && viennacl::is_vector<V2>::value
-                                                       && viennacl::is_scalar<S3>::value
-                                                     >::type * dummy = 0);
-#endif                                                   
-                    
-      
+
+    template <typename NumericT, typename F>
+    void prod_impl(const matrix_base<NumericT, F> & mat,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result);
+
+    template <typename NumericT, typename F>
+    void prod_impl(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result);
+
+    template<typename SparseMatrixType, class SCALARTYPE, unsigned int ALIGNMENT>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  vector_expression<const SparseMatrixType,
+                                                    const vector<SCALARTYPE, ALIGNMENT>,
+                                                    op_prod >
+                                 >::type
+    prod_impl(const SparseMatrixType & mat,
+              const vector<SCALARTYPE, ALIGNMENT> & vec);
+#endif
+
+    namespace detail
+    {
+      enum row_info_types
+      {
+        SPARSE_ROW_NORM_INF = 0,
+        SPARSE_ROW_NORM_1,
+        SPARSE_ROW_NORM_2,
+        SPARSE_ROW_DIAGONAL
+      };
+
+    }
+
+
     /** @brief A tag class representing a lower triangular matrix */
-    struct lower_tag 
+    struct lower_tag
     {
-      static const char * const name() { return "lower"; }
+      static const char * name() { return "lower"; }
     };      //lower triangular matrix
     /** @brief A tag class representing an upper triangular matrix */
-    struct upper_tag 
+    struct upper_tag
     {
-      static const char * const name() { return "upper"; }
+      static const char * name() { return "upper"; }
     };      //upper triangular matrix
     /** @brief A tag class representing a lower triangular matrix with unit diagonal*/
     struct unit_lower_tag
     {
-      static const char * const name() { return "unit_lower"; }
+      static const char * name() { return "unit_lower"; }
     }; //unit lower triangular matrix
     /** @brief A tag class representing an upper triangular matrix with unit diagonal*/
     struct unit_upper_tag
     {
-      static const char * const name() { return "unit_upper"; }
+      static const char * name() { return "unit_upper"; }
     }; //unit upper triangular matrix
-    
+
     //preconditioner tags
     class ilut_tag;
-    
+
     /** @brief A tag class representing the use of no preconditioner */
     class no_precond
     {
       public:
         template <typename VectorType>
-        void apply(VectorType & vec) const {}
+        void apply(VectorType &) const {}
     };
-    
-    
+
+
   } //namespace linalg
+
+  //
+  // More namespace comments to follow:
+  //
+
+  /** @brief Namespace providing routines for handling the different memory domains. */
+  namespace backend
+  {
+    /** @brief Provides implementations for handling memory buffers in CPU RAM. */
+    namespace cpu_ram
+    {
+      /** @brief Holds implementation details for handling memory buffers in CPU RAM. Not intended for direct use by library users. */
+      namespace detail {}
+    }
+
+    /** @brief Provides implementations for handling CUDA memory buffers. */
+    namespace cuda
+    {
+      /** @brief Holds implementation details for handling CUDA memory buffers. Not intended for direct use by library users. */
+      namespace detail {}
+    }
+
+    /** @brief Implementation details for the generic memory backend interface. */
+    namespace detail {}
+
+    /** @brief Provides implementations for handling OpenCL memory buffers. */
+    namespace opencl
+    {
+      /** @brief Holds implementation details for handling OpenCL memory buffers. Not intended for direct use by library users. */
+      namespace detail {}
+    }
+  }
+
+
+  /** @brief Holds implementation details for functionality in the main viennacl-namespace. Not intended for direct use by library users. */
+  namespace detail
+  {
+    /** @brief Helper namespace for fast Fourier transforms. Not to be used directly by library users. */
+    namespace fft
+    {
+      /** @brief Helper namespace for fast-Fourier transformation. Deprecated. */
+      namespace FFT_DATA_ORDER {}
+    }
+  }
+
+
+  /** @brief Provides an OpenCL kernel generator. */
+  namespace generator
+  {
+    /** @brief Provides the implementation for tuning the kernels for a particular device. */
+    namespace autotune {}
+
+    /** @brief Contains implementation details of the kernel generator. */
+    namespace detail {}
+
+    /** @brief Namespace holding the various device-specific parameters for generating the best kernels. */
+    namespace profiles {}
+
+    /** @brief Contains various helper routines for kernel generation. */
+    namespace utils {}
+  }
+
+  /** @brief Provides basic input-output functionality. */
+  namespace io
+  {
+    /** @brief Implementation details for IO functionality. Usually not of interest for a library user. */
+    namespace detail {}
+
+    /** @brief Namespace holding the various XML tag definitions for the kernel parameter tuning facility. */
+    namespace tag {}
+
+    /** @brief Namespace holding the various XML strings for the kernel parameter tuning facility. */
+    namespace val {}
+  }
+
+  /** @brief Provides all linear algebra operations which are not covered by operator overloads. */
+  namespace linalg
+  {
+    /** @brief Holds all CUDA compute kernels used by ViennaCL. */
+    namespace cuda
+    {
+      /** @brief Helper functions for the CUDA linear algebra backend. */
+      namespace detail {}
+    }
+
+    /** @brief Namespace holding implementation details for linear algebra routines. Usually not of interest for a library user. */
+    namespace detail
+    {
+      /** @brief Implementation namespace for algebraic multigrid preconditioner. */
+      namespace amg {}
+
+      /** @brief Implementation namespace for sparse approximate inverse preconditioner. */
+      namespace spai {}
+    }
+
+    /** @brief Holds all compute kernels with conventional host-based execution (buffers in CPU RAM). */
+    namespace host_based
+    {
+      /** @brief Helper functions for the host-based linear algebra backend. */
+      namespace detail {}
+    }
+
+    /** @brief Namespace containing the OpenCL kernels. Deprecated, will be moved to viennacl::linalg::opencl in future releases. */
+    namespace kernels {}
+
+    /** @brief Holds all routines providing OpenCL linear algebra operations. */
+    namespace opencl
+    {
+      /** @brief Helper functions for OpenCL-accelerated linear algebra operations. */
+      namespace detail {}
+
+      /** @brief Contains the OpenCL kernel generation functions for a predefined set of functionality. */
+      namespace kernels
+      {
+        /** @brief Implementation details for the predefined OpenCL kernels. */
+        namespace detail {}
+      }
+    }
+  }
+
+  /** @brief OpenCL backend. Manages platforms, contexts, buffers, kernels, etc. */
+  namespace ocl {}
+
+  /** @brief Namespace containing many meta-functions. */
+  namespace result_of {}
+
+  /** @brief Namespace for various tools used within ViennaCL. */
+  namespace tools
+  {
+    /** @brief Contains implementation details for the tools. Usually not of interest for the library user. */
+    namespace detail {}
+  }
+
+  /** @brief Namespace providing traits-information as well as generic wrappers to common routines for vectors and matrices such as size() or clear() */
+  namespace traits {}
+
+  /** @brief Contains the scheduling functionality which allows for dynamic kernel generation as well as the fusion of multiple statements into a single kernel. */
+  namespace scheduler
+  {
+    /** @brief Implementation details for the scheduler */
+    namespace detail {}
+
+    /** @brief Helper metafunctions used for the scheduler */
+    namespace result_of {}
+  }
+
 } //namespace viennacl
 
 #endif
diff --git a/viennacl/generator/autotune.hpp b/viennacl/generator/autotune.hpp
new file mode 100644
index 0000000..7598534
--- /dev/null
+++ b/viennacl/generator/autotune.hpp
@@ -0,0 +1,208 @@
+#ifndef VIENNACL_GENERATOR_AUTOTUNE_HPP
+#define VIENNACL_GENERATOR_AUTOTUNE_HPP
+
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/autotune.hpp
+ *
+ * @brief User interface for the autotuning procedure
+*/
+
+#include <ctime>
+#include <iomanip>
+#include <cmath>
+#include <iterator>
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/infos.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/generator/generate.hpp"
+
+#include "viennacl/tools/timer.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace autotune{
+
+      /** @brief class for a tuning parameter */
+      class tuning_param{
+        public:
+
+          /** @brief The constructor
+           *
+           *  @param values    The set of values which this particular tuning parameter can take
+           */
+          tuning_param(std::vector<int> const & values) : values_(values){ reset();  }
+
+          /** @brief Returns true if the parameter has reached its maximum value */
+          bool is_max() const { return current_ ==  (values_.size()-1); }
+
+          /** @brief Increments the parameter */
+          bool inc(){
+            ++current_ ;
+            if(current_ < values_.size() )
+              return false;
+            reset();
+            return true;
+          }
+
+          /** @brief Returns the current value of the parameter */
+          int current() const{ return values_[current_]; }
+
+          /** @brief Resets the parameter to its minimum value */
+          void reset() { current_ = 0; }
+
+        private:
+          std::vector<int> values_;
+          unsigned int current_;
+      };
+
+      /** @brief Tuning configuration
+       *
+       *  ConfigType must have a profile_type typedef
+       *  ConfigType must implement is_invalid that returns whether or not a given parameter is invalid
+       *  ConfigType must implement create_profile that creates a profile_type given a set of parameters
+       *
+       *  Parameters are stored in a std::map<std::string, viennacl::generator::autotune::tuning_param>
+       */
+      template<class ConfigType>
+      class tuning_config{
+        private:
+          /** @brief Storage type of the parameters */
+          typedef std::map<std::string, viennacl::generator::autotune::tuning_param> params_t;
+
+        public:
+          typedef ConfigType config_type;
+
+          /** @brief Accessor for profile_type */
+          typedef typename config_type::profile_type profile_type;
+
+          /** @brief Add a tuning parameter to the config */
+          void add_tuning_param(std::string const & name, std::vector<int> const & values){
+              params_.insert(std::make_pair(name,values));
+          }
+
+          /** @brief Returns true if the tuning config has still not explored all its possibilities */
+          bool has_next() const{
+              bool res = false;
+              for(typename params_t::const_iterator it = params_.begin() ; it != params_.end() ; ++it)
+                  res = res || !it->second.is_max();
+              return res;
+          }
+
+          /** @brief Update the parameters of the config */
+          void update(){
+              for(typename params_t::iterator it = params_.begin() ; it != params_.end() ; ++it)
+                  if(it->second.inc()==false)
+                      break;
+          }
+
+          /** @brief Returns true if the compilation/execution of the underlying profile has an undefined behavior */
+          bool is_invalid(viennacl::ocl::device const & dev) const{
+              return config_type::is_invalid(dev,params_);
+          }
+
+          /** @brief Returns the current profile */
+          typename config_type::profile_type get_current(){
+              return config_type::create_profile(params_);
+          }
+
+          /** @brief Reset the config */
+          void reset(){
+              for(params_t::iterator it = params_.begin() ; it != params_.end() ; ++it){
+                  it->second.reset();
+              }
+          }
+
+        private:
+          params_t params_;
+      };
+
+      /** @brief Add the timing value for a given profile and an statement */
+      template<class ProfileT>
+      double benchmark_impl(viennacl::scheduler::statement const & statement, code_generator::forced_profile_key_type key, ProfileT const & prof, unsigned int n_runs){
+
+        tools::timer t;
+        std::list<viennacl::ocl::kernel *> kernels;
+        viennacl::generator::code_generator gen;
+        gen.force_profile(key, prof);
+        gen.add(statement, statement.array()[0]);
+        viennacl::generator::get_configured_program(gen, kernels, true);
+        viennacl::generator::enqueue(gen);
+        viennacl::backend::finish();
+        t.start();
+
+        for(unsigned int i = 0 ; i < n_runs ; ++i)
+            viennacl::generator::enqueue(gen);
+        viennacl::backend::finish();
+        return (double)t.get()/n_runs;
+      }
+
+
+      /** @brief Fills a timing map for a given statement and a benchmark configuration
+       *
+       * @tparam OpT         type of the statement
+       * @tparam ConfigType  type of the benchmark configuration
+       * @param timings      the timings to fill
+       * @param op           the given statement
+       * @param key          a key for forcing a particular kernel profile (i.e. to pick profile A for a device which would usually use profile B)
+       * @param config       the given configuration
+       * @param n_runs       Number of runs for the benchmark
+       * @param out          Pointer to output file stream for writing to file (if not NULL)
+       */
+      template<class ConfigType>
+      void benchmark(std::map<double, typename ConfigType::profile_type> * timings, scheduler::statement const & op, code_generator::forced_profile_key_type const & key, tuning_config<ConfigType> & config, unsigned int n_runs, std::ofstream * out){
+        viennacl::ocl::device const & dev = viennacl::ocl::current_device();
+        unsigned int n_conf = 0;
+        while(config.has_next()){
+          config.update();
+          typename ConfigType::profile_type const & profile = config.get_current();
+          if(config.is_invalid(dev) || profile.is_slow(dev))
+              continue;
+          ++n_conf;
+        }
+        config.reset();
+
+        unsigned int n = 0;
+        while(config.has_next()){
+          config.update();
+          typename ConfigType::profile_type const & profile = config.get_current();
+          if(config.is_invalid(dev) || profile.is_slow(dev))
+              continue;
+          double percent = (double)n++*100/n_conf;
+          double exec_time = benchmark_impl(op,key,profile,n_runs);
+          timings->insert(std::make_pair(exec_time, profile));
+          std::cout << '\r' << "Autotuning..." << "[" << std::setprecision(2) << std::setfill (' ') << std::setw(6) << std::fixed  << percent << "%" << "]"
+                    << " | Best : " << timings->begin()->second << " => " << std::scientific << std::right << std::setprecision(2) << timings->begin()->first << std::flush;
+          if(out)
+            *out << std::setprecision(3) << std::scientific << exec_time << "," << profile.csv_representation() << std::endl ;
+        }
+        std::cout << '\r' << "Autotuning..." << "[100.00%]" << std::endl;
+      }
+
+    }
+
+  }
+
+}
+#endif // AUTOTUNE_HPP
diff --git a/viennacl/generator/forwards.h b/viennacl/generator/forwards.h
new file mode 100644
index 0000000..fcf5edb
--- /dev/null
+++ b/viennacl/generator/forwards.h
@@ -0,0 +1,142 @@
+#ifndef VIENNACL_GENERATOR_FORWARDS_H
+#define VIENNACL_GENERATOR_FORWARDS_H
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/forwards.h
+    @brief Forwards declaration
+*/
+
+#include <map>
+#include <set>
+#include <list>
+#include <stdexcept>
+
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+namespace viennacl{
+
+  namespace generator{
+
+    inline void generate_enqueue_statement(viennacl::scheduler::statement const & s, scheduler::statement_node const & root_node);
+    inline void generate_enqueue_statement(viennacl::scheduler::statement const & s);
+
+    enum expression_type_family{
+      SCALAR_SAXPY_FAMILY,
+      VECTOR_SAXPY_FAMILY,
+      MATRIX_SAXPY_FAMILY,
+      SCALAR_REDUCE_FAMILY,
+      VECTOR_REDUCE_FAMILY,
+      MATRIX_PRODUCT_FAMILY,
+      INVALID_EXPRESSION_FAMILY
+    };
+
+    enum expression_type{
+      SCALAR_SAXPY_TYPE,
+      VECTOR_SAXPY_TYPE,
+      MATRIX_SAXPY_TYPE,
+      SCALAR_REDUCE_TYPE,
+      VECTOR_REDUCE_Nx_TYPE,
+      VECTOR_REDUCE_Tx_TYPE,
+      MATRIX_PRODUCT_NN_TYPE,
+      MATRIX_PRODUCT_TN_TYPE,
+      MATRIX_PRODUCT_NT_TYPE,
+      MATRIX_PRODUCT_TT_TYPE,
+      INVALID_EXPRESSION_TYPE
+    };
+
+    inline const char * expression_type_to_string(expression_type type){
+      switch(type){
+        case SCALAR_SAXPY_TYPE : return "Scalar SAXPY";
+        case VECTOR_SAXPY_TYPE : return "Vector SAXPY";
+        case MATRIX_SAXPY_TYPE : return "Matrix SAXPY";
+        case SCALAR_REDUCE_TYPE : return "Inner Product";
+        case VECTOR_REDUCE_Nx_TYPE : return "Matrix-Vector Product : Ax";
+        case VECTOR_REDUCE_Tx_TYPE : return "Matrix-Vector Product : Tx";
+        case MATRIX_PRODUCT_NN_TYPE : return "Matrix-Matrix Product : AA";
+        case MATRIX_PRODUCT_TN_TYPE : return "Matrix-Matrix Product : TA";
+        case MATRIX_PRODUCT_NT_TYPE : return "Matrix-Matrix Product : AT";
+        case MATRIX_PRODUCT_TT_TYPE : return "Matrix-Matrix Product : TT";
+        default : return "INVALID EXPRESSION";
+      }
+    }
+
+    typedef std::pair<expression_type, vcl_size_t> expression_key_type;
+
+    /** @brief A class for holding meta information such as the type or the underlying scalar type of an expression (such as x = inner_prod(y, z)). */
+    struct expression_descriptor{
+        expression_key_type make_key() const { return expression_key_type(type,scalartype_size); }
+        bool operator==(expression_descriptor const & other) const
+        {
+          return type_family == other.type_family && type == other.type && scalartype_size==other.scalartype_size;
+        }
+        expression_type_family type_family;
+        expression_type type;
+        vcl_size_t scalartype_size;
+    };
+
+    /** @brief Emulation of C++11's .at() member for std::map<> */
+    template <typename KeyT, typename ValueT>
+    ValueT const & at(std::map<KeyT, ValueT> const & map, KeyT const & key)
+    {
+      typename std::map<KeyT, ValueT>::const_iterator it = map.find(key);
+      if (it != map.end())
+        return it->second;
+
+      throw std::out_of_range("Generator: Key not found in map");
+    }
+
+    namespace utils{
+      class kernel_generation_stream;
+    }
+
+    namespace detail{
+
+      enum node_type{
+        LHS_NODE_TYPE,
+        PARENT_NODE_TYPE,
+        RHS_NODE_TYPE
+      };
+
+      class mapped_object;
+
+      typedef std::pair<viennacl::scheduler::statement_node const *, node_type> key_type;
+      typedef tools::shared_ptr<detail::mapped_object> container_ptr_type;
+      typedef std::map<key_type, container_ptr_type> mapping_type;
+
+      template<class Fun>
+      static void traverse(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, Fun const & fun, bool recurse_binary_leaf = true);
+      inline std::string generate(std::pair<std::string, std::string> const & index, int vector_element, mapped_object const & s);
+      static std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int vector_size, mapped_object const & s);
+      static void fetch(std::pair<std::string, std::string> const & index, unsigned int vectorization, std::set<std::string> & fetched, utils::kernel_generation_stream & stream, mapped_object & s);
+      inline const char * generate(viennacl::scheduler::operation_node_type type);
+      static void generate_all_rhs(viennacl::scheduler::statement const & statement
+                                , viennacl::scheduler::statement_node const & root_node
+                                , std::pair<std::string, std::string> const & index
+                                , int vector_element
+                                , std::string & str
+                                , detail::mapping_type const & mapping);
+
+    }
+
+  }
+
+}
+#endif
diff --git a/viennacl/generator/generate.hpp b/viennacl/generator/generate.hpp
new file mode 100644
index 0000000..8981d69
--- /dev/null
+++ b/viennacl/generator/generate.hpp
@@ -0,0 +1,408 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_HPP
+#define VIENNACL_GENERATOR_GENERATE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/generate.hpp
+    @brief the user interface for the code generator
+*/
+
+#include <cstring>
+#include <vector>
+#include <typeinfo>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/generator/forwards.h"
+
+#include "viennacl/generator/profiles.hpp"
+#include "viennacl/generator/statement_representation_functor.hpp"
+#include "viennacl/generator/set_arguments_functor.hpp"
+#include "viennacl/generator/map_functor.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    /** @brief Class for handling code generation
+     *
+     *  It is meant to be only used along with the scheduler.*/
+    class code_generator{
+      public:
+        /** @brief typedef of the key used in the forced profiles. Contains the expression type and the size of the scalartype */
+        typedef std::pair<expression_type, vcl_size_t> forced_profile_key_type;
+      private:
+        typedef std::pair<expression_descriptor, generator::profile_base::statements_type> representation_node_type;
+        typedef std::vector<representation_node_type> statements_type;
+        typedef std::map<forced_profile_key_type, tools::shared_ptr<profile_base> > forced_profiles_type;
+
+        /** @brief Check for the data access flow of a node.
+        *
+        * Row-major + Trans and Col-Major + NoTrans are equal in this regard. This prevents too much code duplication in the kernel templates.
+        */
+        static bool is_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
+          viennacl::scheduler::statement::container_type const & expr = statement.array();
+          if(root_node.op.type==viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE)
+            return root_node.lhs.subtype==viennacl::scheduler::DENSE_ROW_MATRIX_TYPE;
+          else{
+            bool res = root_node.lhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE || root_node.rhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE;
+            if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+              res = res || is_lhs_flow_transposed(statement, expr[root_node.lhs.node_index]);
+            if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+              res = res || is_lhs_flow_transposed(statement, expr[root_node.rhs.node_index]);
+            return res;
+          }
+        }
+
+        /** @brief Checks for the data access flow of the LHS of a node */
+        static bool is_lhs_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
+          scheduler::statement::container_type const & expr = statement.array();
+          if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            return is_flow_transposed(statement, expr[root_node.lhs.node_index]);
+          else
+            return root_node.lhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE;
+        }
+
+        /** @brief Checks for the data access flow of the RHS of a node */
+        static bool is_rhs_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
+          viennacl::scheduler::statement::container_type const & expr = statement.array();
+          if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            return is_flow_transposed(statement, expr[root_node.rhs.node_index]);
+          else
+            return root_node.rhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE;
+        }
+
+        /** @brief Fills the expression descriptor for an operation of the type scalar = RHS */
+        static void fill_expression_descriptor_scalar(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
+          viennacl::scheduler::statement::container_type const & expr = statement.array();
+          bool is_invalid = (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
+                          || (descriptor.type_family==SCALAR_REDUCE_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE);
+          if(is_invalid){
+            descriptor.type_family = INVALID_EXPRESSION_FAMILY;
+            descriptor.type = INVALID_EXPRESSION_TYPE;
+          }
+          else if(root_node.op.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE){
+            descriptor.type_family = SCALAR_REDUCE_FAMILY;
+            descriptor.type = SCALAR_REDUCE_TYPE;
+          }
+          if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            fill_expression_descriptor_scalar(statement, expr[root_node.lhs.node_index],descriptor);
+          if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            fill_expression_descriptor_scalar(statement, expr[root_node.rhs.node_index],descriptor);
+        }
+
+        /** @brief Fills the expression descriptor for an operation of the type vector = RHS */
+        static void fill_expression_descriptor_vector(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
+          viennacl::scheduler::statement::container_type const & expr = statement.array();
+          bool is_invalid =  (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
+                          || (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
+                          || (descriptor.type_family==VECTOR_REDUCE_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE);
+          if(is_invalid){
+            descriptor.type_family=INVALID_EXPRESSION_FAMILY;
+            descriptor.type=INVALID_EXPRESSION_TYPE;
+          }
+          else if(root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE){
+            descriptor.type_family=VECTOR_REDUCE_FAMILY;
+            if(is_lhs_flow_transposed(statement,root_node))
+              descriptor.type=VECTOR_REDUCE_Tx_TYPE;
+            else
+              descriptor.type=VECTOR_REDUCE_Nx_TYPE;
+          }
+          if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            fill_expression_descriptor_vector(statement, expr[root_node.lhs.node_index],descriptor);
+          if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            fill_expression_descriptor_vector(statement, expr[root_node.rhs.node_index],descriptor);
+        }
+
+        /** @brief Fills the expression descriptor for an operation of the type matrix = RHS */
+        static void fill_expression_descriptor_matrix(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
+          viennacl::scheduler::statement::container_type const & expr = statement.array();
+          bool is_invalid =  (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
+                          || (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
+                          || (descriptor.type_family==MATRIX_PRODUCT_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE);
+          if(is_invalid){
+            descriptor.type_family=INVALID_EXPRESSION_FAMILY;
+            descriptor.type=INVALID_EXPRESSION_TYPE;
+          }
+          else if(root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){
+            descriptor.type_family=MATRIX_PRODUCT_FAMILY;
+            bool lhs_trans = is_lhs_flow_transposed(statement,root_node);
+            bool rhs_trans = is_rhs_flow_transposed(statement,root_node);
+            if(!lhs_trans && !rhs_trans)
+              descriptor.type=MATRIX_PRODUCT_NN_TYPE;
+            else if(lhs_trans && !rhs_trans)
+              descriptor.type=MATRIX_PRODUCT_TN_TYPE;
+            else if(!lhs_trans && rhs_trans)
+              descriptor.type=MATRIX_PRODUCT_NT_TYPE;
+            else if(lhs_trans && rhs_trans)
+              descriptor.type=MATRIX_PRODUCT_TT_TYPE;
+
+          }
+          if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            fill_expression_descriptor_matrix(statement, expr[root_node.lhs.node_index],descriptor);
+          if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+            fill_expression_descriptor_matrix(statement, expr[root_node.rhs.node_index],descriptor);
+        }
+
+        /** @brief Fills the expression descriptor for a statement */
+        void fill_descriptor(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
+          viennacl::scheduler::statement_node_type_family lhs_family = root_node.lhs.type_family;
+          descriptor.scalartype_size = utils::call_on_element(root_node.lhs, utils::scalartype_size_fun());
+          if(lhs_family==viennacl::scheduler::VECTOR_TYPE_FAMILY){
+            descriptor.type_family = VECTOR_SAXPY_FAMILY;
+            descriptor.type = VECTOR_SAXPY_TYPE;
+            fill_expression_descriptor_vector(statement,root_node,descriptor);
+          }
+          else if(lhs_family==viennacl::scheduler::MATRIX_TYPE_FAMILY){
+            descriptor.type_family = MATRIX_SAXPY_FAMILY;
+            descriptor.type = MATRIX_SAXPY_TYPE;
+            fill_expression_descriptor_matrix(statement,root_node,descriptor);
+          }
+          else if(lhs_family==viennacl::scheduler::SCALAR_TYPE_FAMILY){
+            descriptor.type_family = SCALAR_SAXPY_FAMILY;
+            descriptor.type = SCALAR_SAXPY_TYPE;
+            fill_expression_descriptor_scalar(statement,root_node,descriptor);
+          }
+        }
+
+        /** @brief Sets the kernel arguments and enqueue the kernels associated with a list of statements.
+        *
+        *   The kernels are named 'kernel_'index of device in context'_'index of kernel in program'
+        */
+        template<class StatementsType>
+        void set_expression_arguments(profile_base const & profile, unsigned int device_offset, StatementsType const & statements, unsigned int & kernel_id, viennacl::ocl::program & p, std::list<viennacl::ocl::kernel *> & kernels) const {
+          for(vcl_size_t i = 0 ; i < profile.num_kernels() ; ++i){
+            //add kernel name
+            char str[32];
+            std::sprintf(str,"kernel_%d_%d",device_offset,kernel_id);
+            viennacl::ocl::kernel & kernel = p.get_kernel(str);
+            kernels.push_back(&kernel);
+            unsigned int current_arg = 0;
+            //Configure ND Range and enqueue arguments
+            profile.configure_range_enqueue_arguments(i, statements, kernel, current_arg);
+            std::set<void *> memory;
+            for(typename StatementsType::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+              detail::traverse(it->first, it->second, detail::set_arguments_functor(memory,current_arg,kernel));
+            }
+            ++kernel_id;
+          }
+        }
+
+        /** @brief Gets the profile associated with a device and an expression descriptor
+        *
+        *   A profile registered through force_profile() for this (operation type, scalartype size)
+        *   pair takes precedence over the built-in device database (profiles::get).
+        */
+        profile_base const & get_profile(viennacl::ocl::device const & device, expression_descriptor const & descriptor) const {
+          forced_profiles_type::const_iterator it = forced_profiles_.find(std::make_pair(descriptor.type, descriptor.scalartype_size));
+          if(it != forced_profiles_.end())
+            return *it->second;
+          return *profiles::get(device,descriptor);
+        }
+
+      public:
+
+        /** @brief The constructor
+        *
+        *   @param ctx  OpenCL context the generated programs/kernels will target (defaults to the current context);
+        *               only a reference is stored, so ctx must outlive this generator
+        */
+        code_generator(viennacl::ocl::context const & ctx = viennacl::ocl::current_context()) : ctx_(ctx){
+          statements_.reserve(16);  // avoid reallocations for typical expression counts
+        }
+
+        /** @brief Force the generator to use a specific profile for an operation
+        *
+        *   @param key  (expression type, scalartype size) pair identifying the operation class
+        *   @param t    profile instance; copied into an owned shared_ptr, so a temporary is fine
+        */
+        template<class T>
+        void force_profile(forced_profile_key_type key, T const & t){
+          forced_profiles_.insert(std::pair<forced_profile_key_type, tools::shared_ptr<profile_base> >(key, tools::shared_ptr<profile_base>(new T(t))));
+        }
+
+        /** @brief Add a statement and the root node to the expression list
+        *
+        *   Consecutive statements whose descriptors compare equal are batched into the same
+        *   group (and hence, eventually, the same generated kernels); a differing descriptor
+        *   starts a new group.
+        *
+        *   @return Whether or not the operation could be handled by the generator
+        */
+        bool add(scheduler::statement const & statement, scheduler::statement_node const & root_node) {
+          expression_descriptor descriptor;
+          fill_descriptor(statement, root_node, descriptor);
+          if(descriptor.type_family==INVALID_EXPRESSION_FAMILY)
+            return false;  // unsupported expression: caller must fall back to another execution path
+          if(statements_.empty())
+            statements_.push_back(std::make_pair(descriptor,profile_base::statements_type(1,std::make_pair(statement, root_node))));
+          else
+            if(statements_.back().first == descriptor)
+              statements_.back().second.push_back(std::make_pair(statement, root_node));
+            else
+              statements_.push_back(std::make_pair(descriptor,profile_base::statements_type(1,std::make_pair(statement, root_node))));
+          return true;
+        }
+
+        /** @brief Set the arguments for a program previously generated by the generator and fills the kernels
+        *
+        *   The current device's position in the context's device list selects the matching
+        *   'kernel_<device>_<id>' name prefix (see make_opencl_program_string, which emits one
+        *   kernel set per device).
+        */
+        void configure_program(viennacl::ocl::program & p, std::list<viennacl::ocl::kernel *> & kernels) const {
+          unsigned int kernel_id = 0;
+          std::vector<viennacl::ocl::device>::const_iterator found = std::find(ctx_.devices().begin(),ctx_.devices().end(),ctx_.current_device());
+          for(statements_type::const_iterator it = statements_.begin() ; it != statements_.end() ; ++it)
+            set_expression_arguments(get_profile(ctx_.current_device(), it->first), static_cast<unsigned int>(std::distance(ctx_.devices().begin(), found)), it->second, kernel_id, p, kernels);
+        }
+
+        /** @brief Creates an identifier string for the set of expressions in the object
+        *
+        *   @param program_name  caller-owned character buffer written in place and NUL-terminated;
+        *                        callers in this file allocate 256 bytes -- NOTE(review): no bound is
+        *                        enforced here, so very large expression sets could overflow; confirm
+        *                        statement_representation_functor's per-node output is bounded
+        */
+        void make_program_name(char * program_name) const {
+          unsigned int current_arg = 0;
+          void* memory[64] = {NULL};  // handle table so repeated buffers map to the same argument index
+          for(statements_type::const_iterator it = statements_.begin() ; it != statements_.end() ; ++it){
+            for(profile_base::statements_type::const_iterator iit = it->second.begin() ; iit != it->second.end() ; ++iit){
+              detail::traverse(iit->first, iit->second, detail::statement_representation_functor(memory, current_arg, program_name));
+            }
+          }
+          *program_name='\0';
+        }
+
+        /** @brief Creates the OpenCL program string from the set of expressions in the object
+        *
+        *   Emits a double-precision extension header, then one kernel set per (device, statement
+        *   group) pair; device_offset makes the kernel names unique across devices.
+        */
+        std::string make_opencl_program_string() const {
+          utils::kernel_generation_stream stream;
+
+          //Headers generation
+          stream << "#if defined(cl_khr_fp64)\n";
+          stream <<  "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
+          stream <<  "#elif defined(cl_amd_fp64)\n";
+          stream <<  "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n";
+          stream <<  "#endif\n";
+          stream << std::endl;
+
+          vcl_size_t device_offset =0;
+          for(std::vector<viennacl::ocl::device>::const_iterator it = ctx_.devices().begin() ; it != ctx_.devices().end() ; ++it)
+            for(statements_type::const_iterator iit = statements_.begin() ; iit != statements_.end() ; ++iit)
+              get_profile(*it,iit->first)(stream,device_offset++,iit->second);
+
+          return stream.str();
+        }
+
+        /** @brief Creates the CUDA device code from the set of expressions in the object
+        *
+        *   Performs just a direct translation: the OpenCL source is generated first, then OpenCL
+        *   qualifiers, work-item index queries and barriers are rewritten to their CUDA
+        *   counterparts via plain text substitution.
+        */
+        std::string make_cuda_program_string() const {
+          //Creates OpenCL string with #ifdef and attributes
+          utils::kernel_generation_stream stream;
+          vcl_size_t device_offset =0;
+          for(std::vector<viennacl::ocl::device>::const_iterator it = ctx_.devices().begin() ; it != ctx_.devices().end() ; ++it)
+            for(statements_type::const_iterator iit = statements_.begin() ; iit != statements_.end() ; ++iit)
+              get_profile(*it,iit->first)(stream,device_offset++,iit->second);
+          std::string res = stream.str();
+
+          // OpenCL __attribute__ annotations have no CUDA equivalent: comment them out
+          viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
+
+          //Pointer
+          // pointer address-space qualifiers must be stripped BEFORE the bare-qualifier
+          // replacements below, otherwise "__global float*" would become "__device__ float*"
+          viennacl::tools::find_and_replace(res, "__global float*", "float*");
+          viennacl::tools::find_and_replace(res, "__local float*", "float*");
+
+          viennacl::tools::find_and_replace(res, "__global double*", "double*");
+          viennacl::tools::find_and_replace(res, "__local double*", "double*");
+
+          //Qualifiers
+          viennacl::tools::find_and_replace(res,"__global","__device__");
+          viennacl::tools::find_and_replace(res,"__kernel","__global__");
+          viennacl::tools::find_and_replace(res,"__constant","__constant__");
+          viennacl::tools::find_and_replace(res,"__local","__shared__");
+
+          //Indexing
+          viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
+          viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
+
+          viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
+          viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
+
+          viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
+          viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
+
+          viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
+          viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
+
+          viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
+          viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
+
+          //Synchronization
+          // CUDA has no separate global-memory barrier within a block; both map to __syncthreads()
+          viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
+          viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
+
+
+          return res;
+        }
+
+      private:
+        statements_type statements_;
+        viennacl::ocl::context const & ctx_;
+        forced_profiles_type forced_profiles_;
+    };
+
+    /** @brief Creates the program associated with a generator object and fills the kernels. Checks the context for the program and possibly (re)compile it.
+    *
+    *   @param generator the generator to work on
+    *   @param kernels this list will be filled with the kernels associated with the generator
+    *   @param force_recompilation if true, the program will be recompiled
+    *   @return reference to the (possibly freshly compiled) program in the current context
+    */
+    inline viennacl::ocl::program & get_configured_program(viennacl::generator::code_generator const & generator, std::list<viennacl::ocl::kernel*> & kernels, bool force_recompilation = false){
+      // NOTE(review): raw new[] leaks if add_program/get_program throws before delete[];
+      // a stack buffer or std::vector<char> would be exception-safe
+      char* program_name = new char[256];
+      generator.make_program_name(program_name);
+      if(force_recompilation)
+        viennacl::ocl::current_context().delete_program(program_name);
+      if(!viennacl::ocl::current_context().has_program(program_name)){
+        std::string source_code = generator.make_opencl_program_string();
+    #ifdef VIENNACL_DEBUG_BUILD
+        std::cout << "Building " << program_name << "..." << std::endl;
+        std::cout << source_code << std::endl;
+    #endif
+        viennacl::ocl::current_context().add_program(source_code, program_name);
+      }
+      viennacl::ocl::program & p = viennacl::ocl::current_context().get_program(program_name);
+      generator.configure_program(p, kernels);
+      delete[] program_name;
+
+      return p;
+    }
+
+    /** @brief Set the arguments and enqueue a generator object
+    *
+    *   Builds (or reuses) the program for the generator's expressions, then enqueues every
+    *   configured kernel on its own context's queue.
+    */
+    inline void enqueue(viennacl::generator::code_generator const & generator, bool force_recompilation = false){
+      std::list<viennacl::ocl::kernel*> kernels;
+      get_configured_program(generator, kernels, force_recompilation);
+      for(std::list<viennacl::ocl::kernel*>::iterator it = kernels.begin() ; it != kernels.end() ; ++it){
+        viennacl::ocl::enqueue(**it, (*it)->context().get_queue());
+      }
+    }
+
+    /** @brief Convenience function to get the OpenCL program string for a single statement
+    *
+    *   Uses the statement's first node as the expression root; unsupported statements are
+    *   silently skipped (add() returning false is not checked here).
+    */
+    inline std::string get_opencl_program_string(viennacl::scheduler::statement const & s){
+      generator::code_generator gen;
+      gen.add(s,s.array()[0]);
+      return gen.make_opencl_program_string();
+    }
+
+    /** @brief Convenience function to get the CUDA device code for a single statement
+    *
+    *   Uses the statement's first node as the expression root; unsupported statements are
+    *   silently skipped (add() returning false is not checked here).
+    */
+    inline std::string get_cuda_device_code(viennacl::scheduler::statement const & s){
+      generator::code_generator gen;
+      gen.add(s, s.array()[0]);
+      return gen.make_cuda_program_string();
+    }
+
+    /** @brief Generate and enqueue a statement plus root_node into the current queue
+    *
+    *   One-shot helper: builds a fresh generator for this single expression and enqueues it.
+    */
+    inline void generate_enqueue_statement(viennacl::scheduler::statement const & s, scheduler::statement_node const & root_node){
+      generator::code_generator gen;
+      gen.add(s,root_node);
+      viennacl::generator::enqueue(gen);
+    }
+
+    /** @brief Generate and enqueue a statement into the current queue, assumes the root_node is the first node of the statement */
+    inline void generate_enqueue_statement(viennacl::scheduler::statement const & s){
+      generate_enqueue_statement(s, s.array()[0]);
+    }
+
+  }
+}
+#endif
diff --git a/viennacl/generator/helpers.hpp b/viennacl/generator/helpers.hpp
new file mode 100644
index 0000000..a809237
--- /dev/null
+++ b/viennacl/generator/helpers.hpp
@@ -0,0 +1,286 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_UTILS_HPP
+#define VIENNACL_GENERATOR_GENERATE_UTILS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/helpers.hpp
+    @brief several code generation helpers
+*/
+
+#include <set>
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include "CL/cl.h"
+#endif
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/utils.hpp"
+#include "viennacl/generator/forwards.h"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace detail{
+
+    /** @brief generate the string for a pass-by-value kernel argument, e.g. "float alpha," */
+      static std::string generate_value_kernel_argument(std::string const & scalartype, std::string const & name){
+        return scalartype + ' ' + name + ",";
+      }
+
+      /** @brief generate the string for a pointer kernel argument, e.g. "__global float* x," */
+      static std::string generate_pointer_kernel_argument(std::string const & address_space, std::string const & scalartype, std::string const & name){
+        return address_space +  " " + scalartype + "* " + name + ",";
+      }
+
+      /** @brief generate a string from an operation_node_type
+      *
+      *   Arithmetic operators map to their C token; reduction/product leaves map to short
+      *   mnemonics used for naming only. Throws a const char* for unhandled operators.
+      */
+      inline const char * generate(viennacl::scheduler::operation_node_type type){
+        // unary expression
+        switch(type){
+          case viennacl::scheduler::OPERATION_UNARY_ABS_TYPE : return "abs";
+          case viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE : return "trans";
+          case viennacl::scheduler::OPERATION_BINARY_ASSIGN_TYPE : return "=";
+          case viennacl::scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE : return "+=";
+          case viennacl::scheduler::OPERATION_BINARY_INPLACE_SUB_TYPE : return "-=";
+          case viennacl::scheduler::OPERATION_BINARY_ADD_TYPE : return "+";
+          case viennacl::scheduler::OPERATION_BINARY_SUB_TYPE : return "-";
+          case viennacl::scheduler::OPERATION_BINARY_MULT_TYPE : return "*";
+          case viennacl::scheduler::OPERATION_BINARY_DIV_TYPE : return "/";
+          case viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE : return "iprod";
+          case viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE : return "mmprod";
+          case viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE : return "mvprod";
+          case viennacl::scheduler::OPERATION_BINARY_ACCESS_TYPE : return "[]";
+          default : throw "not implemented";
+        }
+      }
+
+      /** @brief checks whether an operator is both a binary node and a leaf
+      *
+      *   Inner products and matrix/vector products are treated as opaque leaves by the
+      *   traversal (their operands are not expanded unless recurse_binary_leaf is set).
+      */
+      inline bool is_binary_leaf_operator(viennacl::scheduler::operation_node_type const & op_type) {
+        return op_type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE;
+      }
+
+      /** @brief checks whether an operator is arithmetic or not
+      *
+      *   Note that plain and in-place assignment operators also count as arithmetic here,
+      *   since they are emitted inline in the generated expression string.
+      */
+      inline bool is_arithmetic_operator(viennacl::scheduler::operation_node_type const & op_type) {
+        return op_type == viennacl::scheduler::OPERATION_BINARY_ASSIGN_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_ADD_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_DIV_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_ELEMENT_DIV_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_ELEMENT_PROD_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_INPLACE_ADD_TYPE
+             ||op_type == viennacl::scheduler::OPERATION_BINARY_INPLACE_SUB_TYPE
+//                 ||op_type == viennacl::scheduler::OPERATION_BINARY_INPLACE_DIV_TYPE
+//                ||op_type == viennacl::scheduler::OPERATION_BINARY_INPLACE_MULT_TYPE
+            ||op_type == viennacl::scheduler::OPERATION_BINARY_MULT_TYPE
+            ||op_type == viennacl::scheduler::OPERATION_BINARY_SUB_TYPE;
+
+      }
+
+      /** @brief Recursively execute a functor on a statement
+      *
+      *   Unary nodes are visited operator-first, then the (expanded) LHS; binary nodes are
+      *   visited in-order: LHS, operator, RHS, each wrapped by the functor's
+      *   call_before/after_expansion hooks. Binary "leaf" operators (products/reductions) are
+      *   not expanded unless recurse_binary_leaf is true.
+      */
+      template<class Fun>
+      static void traverse(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, Fun const & fun, bool recurse_binary_leaf /* see forwards.h for default argument */){
+
+        if(root_node.op.type_family==viennacl::scheduler::OPERATION_UNARY_TYPE_FAMILY)
+        {
+          //Self:
+          fun(&statement, &root_node, PARENT_NODE_TYPE);
+
+          //Lhs:
+          fun.call_before_expansion();
+          // composite LHS: recurse into the sub-tree stored in the statement's node array
+          if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+              traverse(statement, statement.array()[root_node.lhs.node_index], fun, recurse_binary_leaf);
+          fun(&statement, &root_node, LHS_NODE_TYPE);
+          fun.call_after_expansion();
+        }
+        else if(root_node.op.type_family==viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY)
+        {
+          bool deep_recursion = recurse_binary_leaf || !is_binary_leaf_operator(root_node.op.type);
+
+          fun.call_before_expansion();
+
+          //Lhs:
+          if(deep_recursion){
+            if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+              traverse(statement, statement.array()[root_node.lhs.node_index], fun, recurse_binary_leaf);
+            fun(&statement, &root_node, LHS_NODE_TYPE);
+          }
+
+          //Self:
+          fun(&statement, &root_node, PARENT_NODE_TYPE);
+
+          //Rhs:
+          if(deep_recursion){
+            if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+              traverse(statement, statement.array()[root_node.rhs.node_index], fun, recurse_binary_leaf);
+            fun(&statement, &root_node, RHS_NODE_TYPE);
+          }
+
+          fun.call_after_expansion();
+
+        }
+      }
+
+      /** @brief base functor class for traversing a statement
+      *
+      *   Provides no-op expansion hooks so derived functors only override what they need.
+      */
+      class traversal_functor{
+        public:
+          void call_before_expansion() const { }
+          void call_after_expansion() const { }
+      };
+
+      /** @brief functor for generating the prototype of a statement
+      *
+      *   Appends a kernel argument declaration to str_ for every non-composite leaf; the
+      *   already_generated_ set prevents emitting the same argument twice.
+      */
+      class prototype_generation_traversal : public traversal_functor{
+        private:
+          std::set<std::string> & already_generated_;
+          std::string & str_;              // prototype string being built
+          unsigned int vector_size_;       // vector width appended to scalartypes where applicable
+          mapping_type const & mapping_;   // (node, node-type) -> mapped object lookup
+        public:
+          prototype_generation_traversal(std::set<std::string> & already_generated, std::string & str, unsigned int vector_size, mapping_type const & mapping) : already_generated_(already_generated), str_(str), vector_size_(vector_size), mapping_(mapping){ }
+
+          void operator()(viennacl::scheduler::statement const *, viennacl::scheduler::statement_node const * root_node, detail::node_type node_type) const {
+              if( (node_type==detail::LHS_NODE_TYPE && root_node->lhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+                ||(node_type==detail::RHS_NODE_TYPE && root_node->rhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY) )
+                  append_kernel_arguments(already_generated_, str_, vector_size_, *at(mapping_, std::make_pair(root_node,node_type)));
+          }
+      };
+
+      /** @brief functor for fetching the elements of a statement
+      *
+      *   Emits fetch code to stream_ for every non-composite leaf; fetched_ records which
+      *   names were already loaded so each element is fetched at most once.
+      */
+      class fetch_traversal : public traversal_functor{
+        private:
+          std::set<std::string> & fetched_;
+          std::pair<std::string, std::string> index_string_;  // (row, column) index expressions
+          unsigned int vectorization_;
+          utils::kernel_generation_stream & stream_;
+          mapping_type const & mapping_;
+        public:
+          fetch_traversal(std::set<std::string> & fetched, std::pair<std::string, std::string> const & index, unsigned int vectorization, utils::kernel_generation_stream & stream, mapping_type const & mapping) : fetched_(fetched), index_string_(index), vectorization_(vectorization), stream_(stream), mapping_(mapping){ }
+
+          void operator()(viennacl::scheduler::statement const *, viennacl::scheduler::statement_node const * root_node, detail::node_type node_type) const {
+            if( (node_type==detail::LHS_NODE_TYPE && root_node->lhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+              ||(node_type==detail::RHS_NODE_TYPE && root_node->rhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY) )
+              fetch(index_string_, vectorization_, fetched_, stream_, *at(mapping_, std::make_pair(root_node, node_type)));
+          }
+      };
+
+      /** @brief Fetches the LHS of a statement's node
+      *
+      *   Forwards to the fetch_traversal functor if the LHS is not a leaf; otherwise fetches
+      *   the leaf's mapped object directly.
+      */
+      static void fetch_all_lhs(std::set<std::string> & fetched
+                                , viennacl::scheduler::statement const & statement
+                                , viennacl::scheduler::statement_node const & root_node
+                                , std::pair<std::string, std::string> const & index
+                                , vcl_size_t const & vectorization
+                                , utils::kernel_generation_stream & stream
+                                , detail::mapping_type const & mapping){
+        if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+          detail::traverse(statement, statement.array()[root_node.lhs.node_index], detail::fetch_traversal(fetched, index, static_cast<unsigned int>(vectorization), stream, mapping));
+        else
+          detail::fetch(index, static_cast<unsigned int>(vectorization),fetched, stream, *at(mapping, std::make_pair(&root_node,detail::LHS_NODE_TYPE)));
+
+      }
+
+      /** @brief Fetches the RHS of a statement's node
+      *
+      *   Forwards to the fetch_traversal functor if the RHS is not a leaf; otherwise fetches
+      *   the leaf's mapped object directly. Mirror of fetch_all_lhs.
+      */
+      static void fetch_all_rhs(std::set<std::string> & fetched
+                                , viennacl::scheduler::statement const & statement
+                                , viennacl::scheduler::statement_node const & root_node
+                                , std::pair<std::string, std::string> const & index
+                                , vcl_size_t const & vectorization
+                                , utils::kernel_generation_stream & stream
+                                , detail::mapping_type const & mapping){
+        if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+          detail::traverse(statement, statement.array()[root_node.rhs.node_index], detail::fetch_traversal(fetched, index, static_cast<unsigned int>(vectorization), stream, mapping));
+        else
+          detail::fetch(index, static_cast<unsigned int>(vectorization),fetched, stream, *at(mapping, std::make_pair(&root_node,detail::RHS_NODE_TYPE)));
+
+      }
+
+
+      /** @brief functor for generating the expression string from a statement */
+      class expression_generation_traversal : public traversal_functor{
+        private:
+          std::pair<std::string, std::string> index_string_;
+          int vector_element_;
+          std::string & str_;
+          mapping_type const & mapping_;
+
+        public:
+          expression_generation_traversal(std::pair<std::string, std::string> const & index, int vector_element, std::string & str, mapping_type const & mapping) : index_string_(index), vector_element_(vector_element), str_(str), mapping_(mapping){ }
+
+          void call_before_expansion() const { str_+="("; }
+          void call_after_expansion() const { str_+=")"; }
+
+          void operator()(viennacl::scheduler::statement const *, viennacl::scheduler::statement_node const * root_node, detail::node_type node_type) const {
+            if(node_type==PARENT_NODE_TYPE)
+            {
+              if(is_binary_leaf_operator(root_node->op.type))
+                str_ += generate(index_string_, vector_element_, *at(mapping_, std::make_pair(root_node, node_type)));
+              else if(is_arithmetic_operator(root_node->op.type))
+                str_ += generate(root_node->op.type);
+            }
+            else{
+              if(node_type==LHS_NODE_TYPE){
+                if(root_node->lhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+                  str_ += detail::generate(index_string_,vector_element_, *at(mapping_, std::make_pair(root_node,node_type)));
+              }
+              else if(node_type==RHS_NODE_TYPE){
+                if(root_node->rhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+                  str_ += detail::generate(index_string_,vector_element_, *at(mapping_, std::make_pair(root_node,node_type)));
+              }
+            }
+          }
+      };
+
+      static void generate_all_lhs(viennacl::scheduler::statement const & statement
+                                , viennacl::scheduler::statement_node const & root_node
+                                , std::pair<std::string, std::string> const & index
+                                , int vector_element
+                                , std::string & str
+                                , detail::mapping_type const & mapping){
+        if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+          detail::traverse(statement, statement.array()[root_node.lhs.node_index], detail::expression_generation_traversal(index, vector_element, str, mapping));
+        else
+          str += detail::generate(index, vector_element,*at(mapping, std::make_pair(&root_node,detail::LHS_NODE_TYPE)));
+      }
+
+
+      static void generate_all_rhs(viennacl::scheduler::statement const & statement
+                                , viennacl::scheduler::statement_node const & root_node
+                                , std::pair<std::string, std::string> const & index
+                                , int vector_element
+                                , std::string & str
+                                , detail::mapping_type const & mapping){
+        if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+          detail::traverse(statement, statement.array()[root_node.rhs.node_index], detail::expression_generation_traversal(index, vector_element, str, mapping));
+        else
+          str += detail::generate(index, vector_element,*at(mapping, std::make_pair(&root_node,detail::RHS_NODE_TYPE)));
+      }
+
+    }
+  }
+}
+#endif
diff --git a/viennacl/generator/map_functor.hpp b/viennacl/generator/map_functor.hpp
new file mode 100644
index 0000000..bdbe404
--- /dev/null
+++ b/viennacl/generator/map_functor.hpp
@@ -0,0 +1,170 @@
+#ifndef VIENNACL_GENERATOR_MAP_GENERATE_PROTOTYPE_HPP
+#define VIENNACL_GENERATOR_MAP_GENERATE_PROTOTYPE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/map_functor.hpp
+    @brief Functor to map the statements to the types defined in mapped_objects.hpp
+*/
+
+#include <set>
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/generator/forwards.h"
+
+#include "viennacl/tools/shared_ptr.hpp"
+
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/utils.hpp"
+#include "viennacl/generator/mapped_objects.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace detail{
+
+      /** @brief Functor to map the statements to the types defined in mapped_objects.hpp
+      *
+      *   Visits every node of a statement (via detail::traverse) and, for each non-composite
+      *   leaf or binary-leaf operator, inserts a heap-allocated mapped object into mapping_.
+      *   Argument names "argN" are deduplicated by buffer handle through memory_, so the same
+      *   device buffer appearing twice maps to the same kernel argument.
+      */
+      class map_functor : public traversal_functor{
+          // Returns "argN" for a handle; a NULL handle always gets a fresh index.
+          // NOTE(review): the 'current_arg' parameter aliases the member current_arg_ at every
+          // call site in this class, so reading one and incrementing the other is consistent
+          // here, but confusing -- confirm before calling with a different reference.
+          std::string create_name(unsigned int & current_arg, std::map<void *, vcl_size_t> & memory, void * handle) const{
+            if(handle==NULL)
+              return "arg" + utils::to_string(current_arg_++);
+            if(memory.insert(std::make_pair(handle, current_arg)).second)
+              return "arg" + utils::to_string(current_arg_++);
+            else
+              return "arg" + utils::to_string(memory[handle]);
+          }
+
+        public:
+          typedef container_ptr_type result_type;
+
+          map_functor(std::map<void *, vcl_size_t> & memory, unsigned int & current_arg, mapping_type & mapping) : memory_(memory), current_arg_(current_arg), mapping_(mapping){ }
+
+          /** @brief Binary leaf
+          *
+          *   Creates a mapped reduction/product object of type T and stores the statement,
+          *   root node and mapping so the object can later expand its own operands.
+          *   NOTE(review): the scalartype is hard-coded to "float" here -- TODO confirm whether
+          *   double-precision reductions are handled elsewhere.
+          */
+          template<class T>
+          result_type binary_leaf(viennacl::scheduler::statement const * statement, viennacl::scheduler::statement_node const * root_node, mapping_type const * mapping) const {
+            T * p = new T("float");
+
+            p->info_.statement = statement;
+            p->info_.root_node = root_node;
+            p->info_.mapping = mapping;
+
+            return container_ptr_type(p);
+          }
+
+          /** @brief Host scalar mapping: always a fresh pass-by-value argument (no handle) */
+          template<class ScalarType>
+          result_type operator()(ScalarType const & /*scal*/) const {
+            mapped_host_scalar * p = new mapped_host_scalar(utils::type_to_string<ScalarType>::value());
+            p->name_ = create_name(current_arg_, memory_, NULL);
+            return container_ptr_type(p);
+          }
+
+          /** @brief Scalar mapping */
+          template<class ScalarType>
+          result_type operator()(scalar<ScalarType> const & scal) const {
+            mapped_scalar * p = new mapped_scalar(utils::type_to_string<ScalarType>::value());
+            p->name_ = create_name(current_arg_, memory_, (void*)&scal);
+            return container_ptr_type(p);
+          }
+
+          /** @brief Vector mapping; start/stride names are only set when non-default so the
+          *   generated kernel omits the corresponding extra arguments otherwise */
+          template<class ScalarType>
+          result_type operator()(vector_base<ScalarType> const & vec) const {
+            mapped_vector * p = new mapped_vector(utils::type_to_string<ScalarType>::value());
+            p->name_ = create_name(current_arg_, memory_, (void*)&vec);
+            if(vec.start() > 0)
+              p->start_name_ = p->name_ +"_start";
+            if(vec.stride() > 1)
+              p->stride_name_ = p->name_ + "_stride";
+            return container_ptr_type(p);
+          }
+
+          /** @brief Implicit vector mapping */
+          template<class ScalarType>
+          result_type operator()(implicit_vector_base<ScalarType> const & vec) const {
+            mapped_implicit_vector * p = new mapped_implicit_vector(utils::type_to_string<ScalarType>::value());
+
+            if(vec.is_value_static()==false)
+              p->value_name_ = create_name(current_arg_, memory_, NULL);
+            if(vec.has_index())
+              // NOTE(review): value_name_ is overwritten here; the has_index() branch
+              // presumably should set an index name instead -- verify against mapped_implicit_vector
+              p->value_name_ = create_name(current_arg_, memory_, NULL);
+            return container_ptr_type(p);
+          }
+
+          /** @brief Matrix mapping; layout is recorded and start/stride names are only set
+          *   when non-default, mirroring the vector mapping */
+          template<class ScalarType, class Layout>
+          result_type operator()(matrix_base<ScalarType, Layout> const & mat) const {
+            mapped_matrix * p = new mapped_matrix(utils::type_to_string<ScalarType>::value());
+            p->name_ = create_name(current_arg_, memory_, (void*)&mat);
+            p->is_row_major_ = static_cast<bool>(utils::is_same_type<Layout, viennacl::row_major>::value);
+            if(mat.start1() > 0)
+              p->start1_name_ = p->name_ +"_start1";
+            if(mat.stride1() > 1)
+              p->stride1_name_ = p->name_ + "_stride1";
+            if(mat.start2() > 0)
+              p->start2_name_ = p->name_ +"_start2";
+            if(mat.stride2() > 1)
+              p->stride2_name_ = p->name_ + "_stride2";
+            return container_ptr_type(p);
+          }
+
+          /** @brief Implicit matrix mapping */
+          template<class ScalarType>
+          result_type operator()(implicit_matrix_base<ScalarType> const & mat) const {
+            mapped_implicit_matrix * p = new mapped_implicit_matrix(utils::type_to_string<ScalarType>::value());
+
+            if(mat.is_value_static()==false)
+              p->value_name_ = create_name(current_arg_, memory_, NULL);
+
+            return container_ptr_type(p);
+          }
+
+          /** @brief Traversal functor: dispatches each visited node to the element overloads
+          *   above (for leaves) or to binary_leaf (for products/reductions) and records the
+          *   result in mapping_ under the (node, node-type) key */
+          void operator()(viennacl::scheduler::statement const * statement, viennacl::scheduler::statement_node const * root_node, detail::node_type node_type) const {
+            const key_type key(root_node, node_type);
+            if(node_type == LHS_NODE_TYPE && root_node->lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+                 mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node->lhs, *this)));
+            else if(node_type == RHS_NODE_TYPE && root_node->rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
+                 mapping_.insert(mapping_type::value_type(key,  utils::call_on_element(root_node->rhs, *this)));
+            else if( node_type== PARENT_NODE_TYPE){
+                  viennacl::scheduler::operation_node_type op_type = root_node->op.type;
+                if(op_type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
+                  mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(statement, root_node, &mapping_)));
+                else if(op_type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
+                  mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_reduction>(statement, root_node, &mapping_)));
+                else if(op_type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
+                  mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(statement, root_node, &mapping_)));
+            }
+          }
+
+        private:
+          std::map<void *, vcl_size_t> & memory_;   // buffer handle -> argument index (dedup)
+          unsigned int & current_arg_;              // shared running argument counter
+          mapping_type & mapping_;                  // output: (node, node-type) -> mapped object
+      };
+
+    }
+
+  }
+
+}
+#endif
diff --git a/viennacl/generator/mapped_objects.hpp b/viennacl/generator/mapped_objects.hpp
new file mode 100644
index 0000000..639474e
--- /dev/null
+++ b/viennacl/generator/mapped_objects.hpp
@@ -0,0 +1,343 @@
+#ifndef VIENNACL_GENERATOR_MAPPED_TYPE_HPP
+#define VIENNACL_GENERATOR_MAPPED_TYPE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/mapped_objects.hpp
+    @brief Map ViennaCL objects to generator wrappers
+*/
+
+#include <string>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/generator/forwards.h"
+#include "viennacl/generator/utils.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace detail{
+
+
+      /** @brief Base class for mapping viennacl datastructure to generator-friendly structures
+       *
+       * Holds the OpenCL scalartype string of the wrapped object and an optional
+       * "access name": once set (e.g. after the object has been fetched into a
+       * private variable), generate() returns that name instead of the default
+       * access expression produced by generate_default().
+       */
+      class mapped_object{
+        protected:
+          /** \cond */
+          // Back-references into the statement tree for objects that need them
+          // (filled in by map_functor for expression-backed mappings).
+          struct node_info{
+              node_info() : mapping(NULL), statement(NULL), root_node(NULL) { }
+              mapping_type const * mapping;
+              scheduler::statement const * statement;
+              scheduler::statement_node const * root_node;
+          };
+          /** \endcond */
+          // Default access expression for the given (row, column) index pair.
+          virtual std::string generate_default(std::pair<std::string, std::string> const & index) const = 0;
+          // Hook for buffer types to append the vector width to the scalartype;
+          // the base implementation returns the scalartype unchanged.
+          virtual std::string append_vector_size(std::string const & scalartype, unsigned int) const { return scalartype; }
+
+        public:
+          mapped_object(std::string const & scalartype) : scalartype_(scalartype){          }
+          // Appends this object's kernel arguments to str; no-op by default.
+          virtual std::string & append_kernel_arguments(std::set<std::string> &, std::string & str, unsigned int) const{ return str; }
+          std::string const & scalartype() const { return scalartype_; }
+          // Overrides the generated access expression with a fixed name.
+          void access_name(std::string const & str) { access_name_ = str; }
+          std::string const & access_name() const { return access_name_; }
+          // Returns the access string for the given index: the override name when
+          // one is set, otherwise the type-specific default expression.
+          virtual std::string generate(std::pair<std::string, std::string> const & index, int) const{
+            if(!access_name_.empty())
+              return access_name_;
+            else
+              return generate_default(index);
+          }
+          virtual ~mapped_object(){ }
+        protected:
+          std::string access_name_;
+          std::string scalartype_;
+      };
+
+      /** @brief Base class for mapping binary leaves (inner product-based, matrix vector product-base, matrix-matrix product based...)
+       *
+       * Keeps back-references (statement, root node, mapping) to the operation
+       * node it wraps; info_ is filled in by map_functor.
+       */
+      class mapped_binary_leaf : public mapped_object{
+        public:
+          mapped_binary_leaf(std::string const & scalartype) : mapped_object(scalartype){ }
+          mapping_type const & mapping() const { return *info_.mapping; }
+          scheduler::statement const & statement() const { return *info_.statement; }
+          scheduler::statement_node const & root_node() const { return *info_.root_node; }
+          // A binary leaf has no default access expression of its own.
+          std::string generate_default(std::pair<std::string, std::string> const &) const { return "";}
+        protected:
+          node_info info_;
+      };
+
+      /** @brief Mapping of a matrix product
+       *
+       * Concrete binary-leaf type; constructed and populated by map_functor (friend).
+       */
+      class mapped_matrix_product : public mapped_binary_leaf{
+          friend class map_functor;
+        public:
+          mapped_matrix_product(std::string const & scalartype) : mapped_binary_leaf(scalartype){ }
+      };
+
+      /** @brief Base class for mapping a reduction
+       *
+       * Stores the scheduler operation type of the reduction.
+       * NOTE(review): nothing in this header assigns reduction_type_ — the
+       * map_functor friendship declared on the derived classes does not grant
+       * access to this private base member. The member is therefore
+       * value-initialized here so that reduction_type() never returns an
+       * indeterminate value.
+       */
+      class mapped_reduction : public mapped_binary_leaf{
+        public:
+          mapped_reduction(std::string const & scalartype) : mapped_binary_leaf(scalartype), reduction_type_(viennacl::scheduler::operation_node_type()){ }
+          /** @brief The scheduler operation type of the wrapped reduction */
+          viennacl::scheduler::operation_node_type reduction_type() const { return reduction_type_; }
+        private:
+          viennacl::scheduler::operation_node_type reduction_type_;
+      };
+
+      /** @brief Mapping of a scalar reduction (based on inner product)
+       *
+       * Concrete reduction type; constructed and populated by map_functor (friend).
+       */
+      class mapped_scalar_reduction : public mapped_reduction{
+          friend class map_functor;
+        public:
+          mapped_scalar_reduction(std::string const & scalartype) : mapped_reduction(scalartype){ }
+      };
+
+      /** @brief Mapping of a vector reduction (based on matrix-vector product)
+       *
+       * Concrete reduction type; constructed and populated by map_functor (friend).
+       */
+      class mapped_vector_reduction : public mapped_reduction{
+          friend class map_functor;
+        public:
+          mapped_vector_reduction(std::string const & scalartype) : mapped_reduction(scalartype){ }
+      };
+
+      /** @brief Mapping of a host scalar to a generator class
+       *
+       * The scalar is passed to the kernel by value under the name name_
+       * (assigned by map_functor); its access expression is simply that name.
+       */
+      class mapped_host_scalar : public mapped_object{
+          friend class map_functor;
+          std::string generate_default(std::pair<std::string, std::string> const &) const{ return name_;  }
+        public:
+          mapped_host_scalar(std::string const & scalartype) : mapped_object(scalartype){ }
+          std::string const & name() { return name_; }
+          // Appends a by-value kernel argument; the already_generated set
+          // deduplicates arguments shared between several mapped objects.
+          std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int) const{
+            if(already_generated.insert(name_).second)
+              str += detail::generate_value_kernel_argument(scalartype_, name_);
+            return str;
+          }
+
+        private:
+          std::string name_;
+      };
+
+      /** @brief Base class for datastructures passed by pointer
+       *
+       * The default access expression is "name[offset(index)]"; derived classes
+       * supply the offset computation. fetch()/write_back() implement an optional
+       * register-caching scheme: the value is loaded once into "<name>_private",
+       * subsequent generate() calls emit that private name, and write_back()
+       * stores it back and restores the default access expression.
+       */
+      class mapped_handle : public mapped_object{
+          // Linearized access offset for the given (row, column) index pair.
+          virtual std::string offset(std::pair<std::string, std::string> const & index) const = 0;
+          // Hook for derived classes to append start/stride/... kernel arguments.
+          virtual void append_optional_arguments(std::string &) const{ }
+          std::string generate_default(std::pair<std::string, std::string> const & index) const{ return name_  + '[' + offset(index) + ']'; }
+        public:
+          mapped_handle(std::string const & scalartype) : mapped_object(scalartype){ }
+
+          std::string const & name() const { return name_; }
+
+          // Emits "<scalartype><vec> <name>_private = <default access>;" the first
+          // time a given buffer is fetched (tracked in 'fetched'), then redirects
+          // generate() to the private copy via access_name_.
+          void fetch(std::pair<std::string, std::string> const & index, unsigned int vectorization, std::set<std::string> & fetched, utils::kernel_generation_stream & stream) {
+            std::string new_access_name = name_ + "_private";
+            if(fetched.find(name_)==fetched.end()){
+              stream << scalartype_;
+              if(vectorization > 1) stream << vectorization;
+              stream << " " << new_access_name << " = " << generate_default(index) << ';' << std::endl;
+              fetched.insert(name_);
+            }
+            access_name_ = new_access_name;
+          }
+
+          // Stores the private copy back to the buffer (if it was fetched) and
+          // clears the access-name override so generate() returns to the default.
+          void write_back(std::pair<std::string, std::string> const & index, std::set<std::string> & fetched, utils::kernel_generation_stream & stream) {
+            std::string old_access_name = access_name_ ;
+            access_name_ = "";
+            if(fetched.find(name_)!=fetched.end()){
+              stream << generate_default(index) << " = " << old_access_name << ';' << std::endl;
+              fetched.erase(name_);
+            }
+          }
+
+          // Appends the "__global <type><vec> * <name>" pointer argument (once per
+          // name) followed by any optional arguments of the derived class.
+          std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int vector_size) const{
+            if(already_generated.insert(name_).second){
+              std::string vector_scalartype = append_vector_size(scalartype_, vector_size);
+              str += detail::generate_pointer_kernel_argument("__global", vector_scalartype, name_);
+              append_optional_arguments(str);
+            }
+            return str;
+          }
+
+        protected:
+          std::string name_;
+      };
+
+      /** @brief Mapping of a scalar to a generator class
+       *
+       * A device scalar is a one-element buffer, hence a constant offset of "0".
+       */
+      class mapped_scalar : public mapped_handle{
+          friend class map_functor;
+        private:
+          std::string offset(std::pair<std::string, std::string> const &)  const { return "0"; }
+        public:
+          mapped_scalar(std::string const & scalartype) : mapped_handle(scalartype){ }
+      };
+
+
+      /** @brief Base class for mapping buffer-based objects to a generator class
+       *
+       * Adds OpenCL vector-type support: the scalartype gets the vector width
+       * appended (e.g. "float4") and individual components can be selected with
+       * the ".sN" suffix when a specific vector element is requested.
+       */
+      class mapped_buffer : public mapped_handle{
+        protected:
+          // "float" + 4 -> "float4"; widths <= 1 leave the scalartype unchanged.
+          std::string append_vector_size(std::string const & scalartype, unsigned int vector_size) const {
+            if(vector_size>1)
+              return scalartype + utils::to_string(vector_size);
+            else
+              return scalartype;
+          }
+        public:
+          mapped_buffer(std::string const & scalartype) : mapped_handle(scalartype){ }
+          // vector_element >= 0 selects a single component via OpenCL ".sN" syntax;
+          // -1 returns the whole (possibly vector-typed) access expression.
+          virtual std::string generate(std::pair<std::string, std::string> const & index, int vector_element) const{
+            if(vector_element>-1)
+              return mapped_object::generate(index, vector_element)+".s"+utils::to_string(vector_element);
+            return mapped_object::generate(index, vector_element);
+          }
+
+      };
+
+      /** @brief Mapping of a vector to a generator class
+       *
+       * When the vector is reached through an expression node (info_.statement is
+       * set, e.g. for ranges/strided access), the offset is generated from that
+       * sub-expression; otherwise the plain row index is used. Optional start /
+       * stride / shift kernel arguments are appended when their names are set
+       * (names are assigned by map_functor).
+       */
+      class mapped_vector : public mapped_buffer{
+          friend class map_functor;
+          std::string offset(std::pair<std::string, std::string> const & index) const {
+            if(info_.statement){
+              std::string str;
+              // Generate the index expression from the RHS of the access node.
+              detail::generate_all_rhs(*info_.statement, *info_.root_node, index, -1, str, *info_.mapping);
+              return str;
+            }
+            else
+              return index.first;
+          }
+
+          void append_optional_arguments(std::string & str) const{
+            if(!start_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", start_name_);
+            if(!stride_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", stride_name_);
+            if(!shift_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", shift_name_);
+          }
+        public:
+          mapped_vector(std::string const & scalartype) : mapped_buffer(scalartype){ }
+        private:
+          node_info info_;
+
+          std::string start_name_;
+          std::string stride_name_;
+          std::string shift_name_;
+      };
+
+      /** @brief Mapping of a matrix to a generator class
+       *
+       * The symbolic size strings (size1_/size2_) are bound lazily by the kernel
+       * templates through bind_sizes() — hence the mutable members — and offset()
+       * linearizes a (row, column) index pair for either storage order.
+       * NOTE(review): is_row_major_ is assigned by map_functor (friend); it is
+       * initialized to false here so that is_row_major() never reads an
+       * indeterminate value.
+       */
+      class mapped_matrix : public mapped_buffer{
+          friend class map_functor;
+          void append_optional_arguments(std::string & str) const{
+            if(!start1_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", start1_name_);
+            if(!stride1_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", stride1_name_);
+            if(!start2_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", start2_name_);
+            if(!stride2_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", stride2_name_);
+          }
+        public:
+          mapped_matrix(std::string const & scalartype) : mapped_buffer(scalartype), is_row_major_(false){ }
+
+          bool is_row_major() const { return is_row_major_; }
+
+          std::string const & size1() const { return size1_; }
+
+          std::string const & size2() const { return size2_; }
+
+          /** @brief Binds the symbolic size strings used by offset(); const so that
+           *         kernel templates can bind sizes on const mappings (the members
+           *         are mutable for this purpose). */
+          void bind_sizes(std::string const & size1, std::string const & size2) const{
+            size1_ = size1;
+            size2_ = size2;
+          }
+
+          /** @brief Linearized access offset: "(i)*size2 + (j)" for row-major,
+           *         "(i) + (j)*size1" for column-major; the term whose index is
+           *         the literal "0" is omitted. */
+          std::string offset(std::pair<std::string, std::string> const & index) const {
+            std::string i = index.first;
+            std::string j = index.second;
+            if(is_row_major_)
+              if(j=="0")
+                return '(' + i + ')' + '*' + size2_;
+              else
+                return '(' + i + ')' + '*' + size2_ + "+ (" + j + ')';
+            else
+              if(i=="0")
+                return  "(" + j + ')' + '*' + size1_;
+              else
+                return  '(' + i + ')' + "+ (" + j + ')' + '*' + size1_;
+          }
+
+        private:
+          mutable std::string size1_;
+          mutable std::string size2_;
+
+          std::string start1_name_;
+          std::string stride1_name_;
+          std::string shift1_name_;
+          std::string start2_name_;
+          std::string stride2_name_;
+          std::string shift2_name_;
+          bool is_row_major_;
+      };
+
+      /** @brief Mapping of a implicit vector to a generator class
+       *
+       * An implicit vector carries a single value (and optionally an index), both
+       * passed as by-value kernel arguments; the access expression is the value's
+       * argument name regardless of the index. Names are assigned by map_functor.
+       */
+      class mapped_implicit_vector : public mapped_object{
+          friend class map_functor;
+          std::string value_name_;
+          std::string index_name_;
+        public:
+          mapped_implicit_vector(std::string const & scalartype) : mapped_object(scalartype){ }
+          std::string generate_default(std::pair<std::string, std::string> const & /*index*/) const{
+            return value_name_;
+          }
+          std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int /*vector_size*/) const{
+            if(!value_name_.empty())
+              str += detail::generate_value_kernel_argument(scalartype_, value_name_);
+            if(!index_name_.empty())
+              str += detail::generate_value_kernel_argument("unsigned int", index_name_);
+            return str;
+          }
+      };
+
+      /** @brief Mapping of a implicit matrix to a generator class
+       *
+       * An implicit matrix carries a single value passed as a by-value kernel
+       * argument; the access expression is that argument's name for every index.
+       * The name is assigned by map_functor.
+       */
+      class mapped_implicit_matrix : public mapped_object{
+          friend class map_functor;
+          std::string value_name_;
+        public:
+          mapped_implicit_matrix(std::string const & scalartype) : mapped_object(scalartype){ }
+          std::string generate_default(std::pair<std::string, std::string> const & /* index */) const{
+            return value_name_;
+          }
+          std::string & append_kernel_arguments(std::set<std::string> & /*already generated*/, std::string & str, unsigned int /*vector size*/) const{
+            if(!value_name_.empty())
+              str += detail::generate_value_kernel_argument(scalartype_, value_name_);
+            return str;
+          }
+      };
+
+      /** @brief Free-function convenience wrapper forwarding to mapped_object::generate(). */
+      inline std::string generate(std::pair<std::string, std::string> const & index, int vector_element, mapped_object const & s){
+        return s.generate(index, vector_element);
+      }
+
+      /** @brief Fetches the object into a private variable if it is handle-based; no-op for other mapped objects.
+       *
+       * Declared 'inline' rather than 'static': 'static' in a header creates a
+       * separate internal-linkage copy in every translation unit (and triggers
+       * unused-function warnings); 'inline' matches generate() above.
+       */
+      inline void fetch(std::pair<std::string, std::string> const & index, unsigned int vectorization, std::set<std::string> & fetched, utils::kernel_generation_stream & stream, mapped_object & s){
+        if(mapped_handle * p = dynamic_cast<mapped_handle  *>(&s))
+          p->fetch(index, vectorization, fetched, stream);
+      }
+
+      /** @brief Free-function convenience wrapper forwarding to mapped_object::append_kernel_arguments().
+       *
+       * 'inline' rather than 'static' for the same reason as generate() above:
+       * avoids a per-translation-unit internal-linkage copy in this header.
+       */
+      inline std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int vector_size, mapped_object const & s){
+        return s.append_kernel_arguments(already_generated, str, vector_size);
+      }
+
+    }
+
+  }
+
+}
+#endif
diff --git a/viennacl/generator/matrix_product.hpp b/viennacl/generator/matrix_product.hpp
new file mode 100644
index 0000000..1d6855c
--- /dev/null
+++ b/viennacl/generator/matrix_product.hpp
@@ -0,0 +1,716 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP
+#define VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/matrix_product.hpp
+ *
+ * @brief Kernel template for the matrix product operation
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/profile_base.hpp"
+#include "viennacl/generator/mapped_objects.hpp"
+#include "viennacl/generator/utils.hpp"
+
+#include "viennacl/forwards.h"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    /** @brief Kernel generation class for matrix-matrix products. */
+    class matrix_product : public profile_base{
+
+        /** @brief How a matrix operand is traversed relative to the vectorized
+         *  dimension: REGULAR divides the second (column) block dimension by the
+         *  vector width, STRIDED divides the first — see transform_block() and
+         *  fetch_element_to_local_mem(). */
+        enum access_flow{
+          REGULAR,
+          STRIDED
+        };
+
+        // This profile is never flagged as "slow" for any device.
+        bool is_slow_impl(viennacl::ocl::device const &) const { return false; }
+
+        /** @brief Local-memory bytes required by this profile: one padded tile
+         *  per operand staged in shared memory (the +1 per dimension pads each
+         *  row — presumably to avoid local-memory bank conflicts; not verifiable
+         *  from this file). */
+        vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
+          vcl_size_t lmem_used = 0;
+          if(use_lhs_shared_)
+            lmem_used += (ml_ + 1) * (cache_width_ + 1) * scalartype_size;
+          if(use_rhs_shared_)
+            lmem_used += (cache_width_ + 1) * (nl_ + 1) * scalartype_size;
+          return lmem_used;
+        }
+
+        /** @brief Writes a human-readable description of the profile parameters to s. */
+        virtual void print(std::ostream & s) const{
+          s << "{vector_type, local_size1, cache_width, local_size2, ms, ks, ns, use_lhs_shared, use_rhs_shared} = {"
+            << vector_size_ << ","
+            << local_size1_ << ", "
+            << cache_width_ << ", "
+            << local_size2_ << ", "
+            << ms_ << ", "
+            << ks_ << ", "
+            << ns_ << ", "
+            << use_lhs_shared_ << ", " << use_rhs_shared_ << "}" ;
+        }
+
+
+        /** @brief Rejects invalid profile configurations: any work-group block
+         *  dimension above the fixed alignment (128), a block smaller than its
+         *  per-work-item sub-block, or sub-block sizes that are not multiples of
+         *  the vector width. */
+        bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const{
+          static const unsigned int alignment = 128;
+          return ml_ > alignment
+              || cache_width_ > alignment
+              || nl_ > alignment
+              || ml_ < ms_
+              || cache_width_ < ks_
+              || nl_ < ns_
+              || (ms_ % vector_size_) > 0
+              || (ks_ % vector_size_) > 0
+              || (ns_ % vector_size_) > 0;
+        }
+
+      public:
+        /** @brief The user constructor
+         *
+         *  @param vectorization   vector width used for loads/stores
+         *  @param local_size1     work-group size along dimension 0
+         *  @param cache_width     depth of the K-blocking (shared-memory tile width)
+         *  @param local_size2     work-group size along dimension 1
+         *  @param ms,ks,ns        per-work-item sub-block sizes
+         *  @param use_lhs_shared  stage the LHS operand in local memory
+         *  @param use_rhs_shared  stage the RHS operand in local memory
+         *
+         *  The work-group block sizes are derived as ml = ms*local_size1 and
+         *  nl = ns*local_size2.
+         */
+        matrix_product(unsigned int vectorization
+                , vcl_size_t local_size1, vcl_size_t cache_width, vcl_size_t local_size2
+                , unsigned int ms, unsigned int ks, unsigned int ns
+                , bool use_lhs_shared, bool use_rhs_shared) : profile_base(vectorization,local_size1, local_size2,1){
+          local_size1_ = local_size1;
+          local_size2_ = local_size2;
+          cache_width_=cache_width;
+          ml_= ms*local_size1;
+          nl_=ns*local_size2;
+          ms_ = ms;
+          ks_=ks;
+          ns_=ns;
+          use_lhs_shared_ = use_lhs_shared;
+          use_rhs_shared_ = use_rhs_shared;
+        }
+
+        /** @brief CSV column headers matching csv_representation().
+         *
+         *  Fixed to list the same nine columns that csv_representation() emits;
+         *  the previous header ended in "NumGroups" and omitted the two
+         *  shared-memory flags, so header and rows did not line up.
+         */
+        static std::string csv_format() {
+          return "Vec,LSize1,CacheWidth,LSize2,mS,kS,nS,lhsShared,rhsShared";
+        }
+
+        /** @brief One CSV row describing this profile: vector width, the two
+         *  local sizes and cache width, the three sub-block sizes, and the two
+         *  shared-memory flags (nine comma-separated values). */
+        std::string csv_representation() const{
+          std::ostringstream oss;
+          oss << vector_size_
+              << "," << local_size1_
+              << "," << cache_width_
+              << "," << local_size2_
+              << "," << ms_
+              << "," << ks_
+              << "," << ns_
+              << "," << use_lhs_shared_
+              << "," << use_rhs_shared_;
+          return oss.str();
+        }
+
+        /** @brief Configures the ND-range and enqueues the size arguments M, N, K.
+         *
+         *  M and N come from the internal sizes of the assigned (LHS) matrix of
+         *  the first statement; K is taken from the first matrix-matrix product
+         *  node found in a statement array (for a transposed LHS operand its
+         *  internal size1 is used instead of size2). Returns after the first
+         *  product node is handled.
+         */
+        void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const {
+          //set M, N
+          scheduler::statement_node const & first_node = statements.front().second;
+          vcl_size_t M = utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun());
+          vcl_size_t N = utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun());
+
+          //set ND range
+          configure_local_sizes(k, kernel_id);
+          k.global_work_size(0, M/ms_);
+          k.global_work_size(1, N/ns_);
+
+          //set arguments
+          //M,N
+          k.arg(n_arg++, cl_uint(M));
+          k.arg(n_arg++, cl_uint(N));
+
+          //K
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            //const reference: the statement array is only read here, no need to deep-copy it per iteration
+            scheduler::statement::container_type const & exprs = it->first.array();
+            for(scheduler::statement::container_type::const_iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
+              if(iit->op.type==scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){
+                scheduler::statement_node const * current_node = &(*iit);
+                //The LHS of the prod is a matrix
+                if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
+                {
+                  k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
+                }
+                else{
+                  //The LHS of the prod is a matrix expression
+                  current_node = &exprs[current_node->lhs.node_index];
+                  if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
+                  {
+                    if(current_node->op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
+                      k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
+                    else
+                      k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
+                  }
+                  else{
+                    assert(false && bool("unexpected expression tree"));
+                  }
+                }
+                return;
+              }
+            }
+          }
+
+        }
+
+        // Symbolic size names used in the generated kernel source:
+        // M = result rows, K = inner (reduction) dimension, N = result columns.
+        static std::string size1() { return "M";  }
+        static std::string size2() { return "K"; }
+        static std::string size3() { return "N"; }
+
+        /** @brief Appends the three size value-arguments (M, N, K) to the kernel signature. */
+        void kernel_arguments(statements_type  const & /*statements*/, std::string & arguments_string) const{
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "K");
+        }
+
+      private:
+
+        /** @brief Rescales block dimensions to units of vector elements: for
+         *  REGULAR access the second (column) dimension is divided by the vector
+         *  width, for STRIDED the first; the small per-work-item block keeps its
+         *  scalar size when the operand is staged in shared memory. */
+        void transform_block(detail::mapped_matrix const & /*mat_infos*/, bool store_shared
+                             , unsigned int & large_block_1, unsigned int & large_block_2
+                             , unsigned int & small_block_1, unsigned int & small_block_2
+                             , access_flow flow) const {
+          if(flow==REGULAR){
+            large_block_2/=vector_size_;
+            if(!store_shared)
+              small_block_2/=vector_size_;
+          }
+          else{
+            large_block_1/=vector_size_;
+            if(!store_shared)
+              small_block_1/=vector_size_;
+          }
+        }
+
+
+        /** @brief Returns the expression string itself, or — when
+         *  store_in_register is set — emits "type name = expr;" into the stream
+         *  and returns the variable name instead. */
+        std::string helper_variable(utils::kernel_generation_stream & stream
+                                    , bool store_in_register
+                                    , std::string const & type
+                                    , std::string const & name
+                                    , std::string const & expr) const {
+          if(!store_in_register)
+            return expr;
+          stream << type << " " << name << " = " << expr << ";" << std::endl;
+          return name;
+        }
+
+        /** @brief Emits code copying one (possibly vector-typed) element from
+         *  global memory into the local-memory tile.
+         *
+         *  REGULAR: the element is read at row-contiguous position
+         *  (i*size2 + j) and its vector components are scattered along the row of
+         *  the tile. STRIDED: the element is read at column-contiguous position
+         *  (j*size1 + i) and components are scattered across consecutive tile
+         *  rows. Relies on a "val" variable declared by the caller.
+         */
+        void fetch_element_to_local_mem(utils::kernel_generation_stream & stream,
+                                std::string const & lmem_name,
+                                vcl_size_t lmem_size2,
+                                std::string const & global_ptr,
+                                detail::mapped_matrix const & mat,
+                                access_flow flow,
+                                std::string const & i,
+                                std::string const & j) const {
+
+            if(flow==REGULAR){
+                stream << "val = *(" << global_ptr << " + " << j << " + " << mat.size2()  << "*" << i << ");" << std::endl;
+              for(unsigned int a = 0 ; a < vector_size_ ; ++a)
+                  if(vector_size_>1)
+                      stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_<<" + " << a << "] = val.s" << a << ";" <<std::endl;
+                  else
+                      stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_ << "] = val" << ";" <<std::endl;
+            }
+            else{
+              stream << "val = *(" << global_ptr << "+ " << j << "*" << mat.size1() << " + " << i << ");" << std::endl;
+              for(unsigned int a = 0 ; a < vector_size_ ; ++a)
+                  if(vector_size_>1)
+                      stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << " + " << a*lmem_size2 << "] = val.s" << a << ";" <<std::endl;
+                  else
+                      stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << "] = val" << ";" <<std::endl;
+            }
+        }
+        /** @brief Emits the code that stages a bound1 x bound2 tile of a matrix
+         *  into local memory, barrier-fenced on both sides.
+         *
+         *  When both bounds divide evenly by the corresponding local sizes the
+         *  copy loops are fully unrolled at generation time; otherwise runtime
+         *  loops strided by the work-group dimensions are emitted.
+         */
+        void fetch_to_local_mem(utils::kernel_generation_stream & stream,
+                                std::string const & lmem_name,
+                                vcl_size_t lmem_size2,
+                                std::string const & global_ptr,
+                                unsigned int bound1,
+                                unsigned int bound2,
+                                detail::mapped_matrix const & mat,
+                                access_flow flow) const {
+          std::string aligned_scalartype = mat.scalartype();
+          if(vector_size_ > 1)
+            aligned_scalartype+=utils::to_string(vector_size_);
+          stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
+          stream << "{" << std::endl;
+          // "val" is the shared temporary used by fetch_element_to_local_mem().
+          stream << aligned_scalartype << " val;" << std::endl;
+          //Can unroll
+          if(bound2%local_size2_==0 && bound1%local_size1_==0){
+              for(unsigned int j = 0 ; j < bound2 ; j+=static_cast<unsigned int>(local_size2_)){
+                  for(unsigned int i = 0 ; i < bound1 ; i+=static_cast<unsigned int>(local_size1_)){
+                      std::string indi = "(get_local_id(0) + " + utils::to_string(i)+")";
+                      std::string indj = "(get_local_id(1) + " + utils::to_string(j)+")";
+                      fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,indi,indj);
+                  }
+              }
+          }
+          else{
+              stream << "for(unsigned int j = get_local_id(1)" << " ; j < " << bound2 << "; j+= " << local_size2_ << "){" << std::endl;
+              stream.inc_tab();
+              stream << "for(unsigned int i = get_local_id(0)" << " ; i < " << bound1 << "; i+= " << local_size1_ << "){" << std::endl;
+              stream.inc_tab();
+              fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,"i","j");
+              stream.dec_tab();
+              stream << "}" << std::endl;
+              stream.dec_tab();
+              stream << "}" << std::endl;
+
+          }
+          stream << "}" << std::endl;
+          stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
+
+        }
+
+        void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
+
+          //////////////////
+          /// INIT
+          /// //////////////
+
+          detail::mapped_matrix const * assigned = static_cast<detail::mapped_matrix const *>(at(mapping.at(0), std::make_pair(&statements.front().second,detail::LHS_NODE_TYPE)).get());
+          detail::mapped_matrix_product* prod = NULL;
+          detail::mapped_matrix const * lhs = NULL;
+          detail::mapped_matrix const * rhs = NULL;
+
+          bool is_lhs_transposed = false;
+          bool is_rhs_transposed = false;
+
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            scheduler::statement::container_type const & exprs = it->first.array();
+            vcl_size_t i = std::distance(statements.begin(), it);
+            for(scheduler::statement::container_type::const_iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
+              if(iit->op.type==scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){
+                prod = (detail::mapped_matrix_product *)at(mapping.at(i), std::make_pair(&(*iit), detail::PARENT_NODE_TYPE)).get();
+                if(iit->lhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){
+                  is_lhs_transposed = true;
+                  lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->lhs.node_index],detail::LHS_NODE_TYPE)).get();
+                }
+                else{
+                  is_lhs_transposed = false;
+                  lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit), detail::LHS_NODE_TYPE)).get();
+                }
+
+                if(iit->rhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){
+                  is_rhs_transposed = true;
+                  rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->rhs.node_index], detail::LHS_NODE_TYPE)).get();
+                }
+                else{
+                  is_rhs_transposed = false;
+                  rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit),detail::RHS_NODE_TYPE)).get();
+                }
+
+              }
+            }
+          }
+
+          if(vector_size_>1){
+            std::string StrV = "/"+utils::to_string(vector_size_) ;
+
+            for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){
+              if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){
+                if(p->is_row_major())
+                  p->bind_sizes("M", "N"+StrV);
+                else
+                  p->bind_sizes("M"+StrV, "N");
+              }
+            }
+
+            if(lhs->is_row_major())
+              if(is_lhs_transposed)
+                lhs->bind_sizes("M"+StrV, "K");
+              else
+                lhs->bind_sizes("M", "K"+StrV);
+            else
+              if(is_lhs_transposed)
+                lhs->bind_sizes("M", "K"+StrV);
+              else
+                lhs->bind_sizes("M"+StrV, "K");
+
+
+            if(rhs->is_row_major())
+              if(is_rhs_transposed)
+                rhs->bind_sizes("K"+StrV, "N");
+              else
+                rhs->bind_sizes("K", "N"+StrV);
+            else
+              if(is_rhs_transposed)
+                rhs->bind_sizes("K", "N"+StrV);
+              else
+                rhs->bind_sizes("K"+StrV, "N");
+
+
+          }
+          else{
+            for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){
+              if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){
+                p->bind_sizes("M", "N");
+              }
+            }
+
+            lhs->bind_sizes("M", "K");
+            rhs->bind_sizes("K", "N");
+          }
+
+
+
+          std::string aligned_scalartype = assigned->scalartype();
+          if(vector_size_ > 1)
+            aligned_scalartype+=utils::to_string(vector_size_);
+
+
+          access_flow result_access_flow;
+          if(assigned->is_row_major())
+            result_access_flow = REGULAR;
+          else
+            result_access_flow = STRIDED;
+
+          access_flow lhs_access_flow;
+          if((lhs->is_row_major() && !is_lhs_transposed)
+             ||(!lhs->is_row_major() && is_lhs_transposed))
+            lhs_access_flow = REGULAR;
+          else
+            lhs_access_flow = STRIDED;
+
+          access_flow rhs_access_flow;
+          if((rhs->is_row_major() && !is_rhs_transposed)
+             ||(!rhs->is_row_major() && is_rhs_transposed))
+            rhs_access_flow = REGULAR;
+          else
+            rhs_access_flow = STRIDED;
+
+
+          std::string lhs_value_scalartype;
+          if(use_lhs_shared_)
+            lhs_value_scalartype = lhs->scalartype();
+          else
+            lhs_value_scalartype = aligned_scalartype;
+
+          std::string rhs_value_scalartype;
+          if(use_rhs_shared_)
+            rhs_value_scalartype = rhs->scalartype();
+          else
+            rhs_value_scalartype = aligned_scalartype;
+
+
+          unsigned int ml_res = static_cast<unsigned int>(ml_), nl_res = static_cast<unsigned int>(nl_), ms_res = static_cast<unsigned int>(ms_), ns_res = static_cast<unsigned int>(ns_);
+          unsigned int ml_lhs = static_cast<unsigned int>(ml_), cache_width_lhs = static_cast<unsigned int>(cache_width_), ms_lhs = static_cast<unsigned int>(ms_), ks_lhs = static_cast<unsigned int>(ks_);
+          unsigned int cache_width_rhs = static_cast<unsigned int>(cache_width_), nl_rhs = static_cast<unsigned int>(nl_), ks_rhs = static_cast<unsigned int>(ks_), ns_rhs = static_cast<unsigned int>(ns_);
+
+          transform_block(*assigned,false,ml_res,nl_res,ms_res,ns_res,result_access_flow);
+          transform_block(*lhs,use_lhs_shared_,ml_lhs,cache_width_lhs,ms_lhs,ks_lhs,lhs_access_flow);
+          transform_block(*rhs,use_rhs_shared_,cache_width_rhs,nl_rhs,ks_rhs,ns_rhs,rhs_access_flow);
+
+          //////////////////
+          /// DECLARATIONS
+          /// //////////////
+
+
+          vcl_size_t local_lhs_size1 = ml_ ;
+          vcl_size_t local_lhs_size2 = cache_width_ + 1;
+
+          vcl_size_t local_rhs_size1 = cache_width_;
+          vcl_size_t local_rhs_size2 = nl_ + 1;
+
+          ///Result Values
+          for(unsigned int m=0; m< ms_res; ++m)
+            for(unsigned int n=0; n < ns_res ; ++n)
+              stream << aligned_scalartype << " " << "res" << m << "_" << n << " = (" << aligned_scalartype << ")(0) ;" << std::endl;
+
+          ///Local memory
+          if(use_lhs_shared_)
+            stream << "__local " << lhs->scalartype() << " lhs_buf[" << local_lhs_size1*local_lhs_size2 << "]" << ";" << std::endl;
+          if(use_rhs_shared_)
+            stream << "__local " << rhs->scalartype() << " rhs_buf[" << local_rhs_size1*local_rhs_size2 << "]" << ";" << std::endl;
+
+          ///Pointer to result
+          //stream << "__global " << aligned_scalartype << "* res_ptr = " <<  assigned->name() << " + " << assigned->offset(std::make_pair("get_global_id(0)*" + utils::to_string(ms_res), "get_global_id(1)*" + utils::to_string(ns_res))) << ";" << std::endl;
+
+
+          ///LHS - Local Memory Offset
+          if(use_lhs_shared_){
+            std::string i = "get_group_id(0)*" + utils::to_string(ml_lhs);
+            stream << "__global " << aligned_scalartype << "* global_lhs_ptr = " << lhs->name() << " + ";
+            if(lhs_access_flow==REGULAR)
+              stream << "(" << i << ")" << "*" << lhs->size2();
+            else
+              stream << i;
+            stream << ";" << std::endl;
+          }
+
+          ///LHS - Global Memory pointer
+          else{
+            if(lhs_access_flow==REGULAR)
+              for(unsigned int m=0; m<ms_lhs; ++m)
+                stream << "__global " << aligned_scalartype << "* " << "lhs_ptr_" << m << " = " << lhs->name() << " + "
+                       << lhs->size2() << "* ("
+                       << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << "+" << m
+                       << " );" << std::endl;
+            else
+              for(unsigned int k=0; k<ks_lhs; ++k)
+                stream << "__global " << aligned_scalartype<< "* " << "lhs_ptr_" << k << " = " << lhs->name() << " + "
+                       << "(" << lhs->size1() << ")*" << k
+                       << "+ " << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << ";" << std::endl;
+          }
+
+          ///RHS - Local Memory Offset
+          if(use_rhs_shared_){
+            std::string j = "get_group_id(1)*" + utils::to_string(nl_rhs);
+            stream << "__global " << aligned_scalartype << "* global_rhs_ptr = " << rhs->name() << " + ";
+            if(rhs_access_flow==REGULAR)
+              stream << j;
+            else
+              stream << "(" << j << ")" << "*" << rhs->size1();
+            stream << ";" << std::endl;
+          }
+
+          ///RHS - Global Memory Pointer
+          else{
+            if(rhs_access_flow==REGULAR)
+              for(unsigned int k = 0 ; k < ks_rhs ; ++k)
+                stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << k << " = " << rhs->name() << " + "
+                       << "(" << k << ")" << "*" << rhs->size2()
+                       << " + " << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs
+                       << ";" << std::endl;
+            else
+              for(unsigned int n = 0 ; n < ns_rhs ; ++n)
+                stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << n << " = " << rhs->name() << " +  "
+                       << "(" << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs << " + " << n << ")" << "*" << rhs->size1()
+                       << ";" << std::endl;
+          }
+
+
+          ///Large Work-group Wise loop
+          std::string block_num = helper_variable(stream,false,"unsigned int", "block_num", "K/" + utils::to_string(cache_width_));
+          stream << "for(unsigned int bl=0 ; bl<" << block_num << " ; ++bl){" << std::endl;
+          stream.inc_tab();
+
+          ///Update LHS Local Memory and pointers (if necessary)
+          if(use_lhs_shared_){
+            fetch_to_local_mem(stream,"lhs_buf",local_lhs_size2,"global_lhs_ptr",ml_lhs,cache_width_lhs,*lhs,lhs_access_flow);
+            for(unsigned int m=0; m<ms_lhs; ++m)
+              stream << "__local " << lhs_value_scalartype << "* lhs_ptr_" << m << " = lhs_buf + "
+                     << "(" << "get_local_id(0)*" << ms_lhs << "+" << m << ")" << "*" << local_lhs_size2
+                     << ";" << std::endl;
+          }
+
+          ///Update RHS Local Memory and pointers (if necessary)
+          if(use_rhs_shared_){
+            fetch_to_local_mem(stream,"rhs_buf", local_rhs_size2, "global_rhs_ptr",cache_width_rhs,nl_rhs,*rhs,rhs_access_flow);
+            for(unsigned int k=0; k<ks_rhs; ++k)
+              stream << "__local " << rhs_value_scalartype << "* rhs_ptr_" << k << " = rhs_buf + "
+                     << k*local_rhs_size2 << " + " << "get_local_id(1)*" << ns_rhs
+                     << ";" << std::endl;
+          }
+
+
+          stream << " for(unsigned int bs=0 ; bs < " << cache_width_/ks_  << " ; ++bs){" << std::endl;
+          stream.inc_tab();
+
+
+          for(unsigned int k = 0 ; k < ks_rhs ; ++k){
+            for(unsigned int n=0 ; n < ns_rhs ; ++n){
+              stream << rhs_value_scalartype << " val_rhs_" << k << "_" << n << " = " ;
+              if(use_rhs_shared_ )
+                stream << "* rhs_ptr_" << k << "++";
+              else{
+                if(rhs_access_flow==REGULAR)
+                  stream << "* rhs_ptr_" << k << "++";
+                else
+                  stream  << "* rhs_ptr_" << n << "++";
+              }
+              stream << ";";
+              stream << std::endl;
+            }
+          }
+
+
+          for(unsigned int k = 0 ; k < ks_lhs ; ++k){
+            for(unsigned int m=0 ; m < ms_lhs ; ++m){
+              stream << lhs_value_scalartype << " " << "val_lhs_" << m << "_" << k << " = ";
+              if(use_lhs_shared_)
+                stream <<  "* lhs_ptr_" << m << "++" ;
+              else if(lhs_access_flow==REGULAR)
+                stream << "* lhs_ptr_" << m << "++";
+              else
+                stream << "* lhs_ptr_" << k << "++";
+              stream << ";";
+              stream << std::endl;
+            }
+          }
+
+
+            for(unsigned int n=0 ; n < ns_res ; ++n){
+             for(unsigned int k = 0 ; k < ks_ ; ++k){
+               for(unsigned int m=0 ; m < ms_res ; ++m){
+                for(unsigned int a = 0; a<vector_size_; ++a){
+
+                  int ind_lhs_1 = m;
+                  int ind_lhs_2 = k;
+                  int ind_s_lhs = a;
+
+                  int ind_rhs_1=k;
+                  int ind_rhs_2=n;
+                  int ind_s_rhs=a;
+
+                  if(result_access_flow==REGULAR){
+                    if(!use_lhs_shared_){
+                      if(lhs_access_flow==REGULAR){
+                        ind_s_lhs = ind_lhs_2%vector_size_;
+                        ind_lhs_2 /= vector_size_;
+                      }
+                      else{
+                        ind_s_lhs = ind_lhs_1%vector_size_;
+                        ind_lhs_1 /= vector_size_;
+                      }
+                    }
+                  }
+                  else{
+                    if(use_lhs_shared_){
+                      ind_lhs_1 = ind_lhs_1*vector_size_+a;
+                    }
+                    else{
+                      if(lhs_access_flow==REGULAR){
+                        ind_lhs_1 = ind_lhs_1*vector_size_+a;
+                        ind_s_lhs = ind_lhs_2%vector_size_;
+                        ind_lhs_2 /= vector_size_;
+                      }
+                    }
+                  }
+
+                  if(result_access_flow==REGULAR){
+                    if(use_rhs_shared_){
+                      ind_rhs_2 = ind_rhs_2*vector_size_+a;
+                    }
+                    else{
+                      if(rhs_access_flow==STRIDED){
+                        ind_rhs_2 = ind_rhs_2*vector_size_+a;
+                        ind_s_rhs = ind_rhs_1%vector_size_;
+                        ind_rhs_1 = ind_rhs_1/vector_size_;
+                      }
+                      else{
+                      }
+                    }
+                  }
+                  else{
+                    if(!use_rhs_shared_){
+                      if(rhs_access_flow==REGULAR){
+                        ind_s_rhs = ind_rhs_2%vector_size_;
+                        ind_rhs_2/=vector_size_;
+                      }
+                      else{
+                        ind_s_rhs = ind_rhs_1%vector_size_;
+                        ind_rhs_1/=vector_size_;
+                      }
+                    }
+                  }
+
+                  std::ostringstream res_oss;
+                  std::ostringstream lhs_oss;
+                  std::ostringstream rhs_oss;
+
+                  res_oss << "res" << m << "_" << n ;
+                  if(vector_size_>1) res_oss << ".s" << a;
+
+                  lhs_oss << "val_lhs_" << ind_lhs_1 << "_" << ind_lhs_2;
+                  if(!use_lhs_shared_ && vector_size_>1) lhs_oss << ".s" << ind_s_lhs;
+
+
+                  rhs_oss << "val_rhs_" << ind_rhs_1 << "_" << ind_rhs_2;
+                  if(!use_rhs_shared_ && vector_size_>1) rhs_oss << ".s" << ind_s_rhs;
+
+
+                  stream << res_oss.str() << "+=" << lhs_oss.str() << "*" << rhs_oss.str() << ";" << std::endl;
+                }
+              }
+            }
+          }
+
+
+          if(use_rhs_shared_){
+            for(unsigned int k=0 ; k<ks_ ; ++k)
+              stream << "rhs_ptr_" << k << " += " << ks_rhs*local_rhs_size2 - ns_rhs << ";" << std::endl;
+          }
+          else{
+            if(rhs_access_flow==REGULAR)
+              for(unsigned int k=0 ; k<ks_ ; ++k)
+                stream << "rhs_ptr_" << k << " += " << ks_rhs << "*" << rhs->size2() << " - " << ns_rhs << ";" << std::endl;
+          }
+
+          if(!use_lhs_shared_){
+            if(lhs_access_flow==STRIDED)
+              for(unsigned int k=0 ; k<ks_lhs ; ++k)
+                stream << "lhs_ptr_" << k << " += " << ks_lhs << "*" << lhs->size1() << " - " << ms_lhs << ";" << std::endl;
+          }
+
+
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+
+          if(use_lhs_shared_){
+            if(lhs_access_flow==REGULAR)
+              stream << "global_lhs_ptr += " << cache_width_lhs << ";" << std::endl;
+            else
+              stream << "global_lhs_ptr += " << cache_width_lhs << "*" << lhs->size1() << ";" << std::endl;
+          }
+
+          if(use_rhs_shared_){
+            if(rhs_access_flow==REGULAR)
+              stream << "global_rhs_ptr += " << cache_width_rhs << "*" << rhs->size2() << ";" << std::endl;
+            else
+              stream << "global_rhs_ptr += " << cache_width_rhs << ";" << std::endl;
+          }
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+
+          for(unsigned int m=0 ; m < ms_res ; ++m){
+            for(unsigned int n=0 ; n < ns_res ; ++n){
+              std::string i = "get_global_id(0)*" + utils::to_string(ms_res) + "+" + utils::to_string(m);
+              std::string j = "get_global_id(1)*" + utils::to_string(ns_res) + "+" + utils::to_string(n);
+              prod->access_name("res"+utils::to_string(m)+"_"+utils::to_string(n));
+              std::string str;
+              detail::traverse(statements.front().first, statements.front().second, detail::expression_generation_traversal(std::make_pair(i, j), -1, str, mapping[0]), false);
+              stream << str << ";" << std::endl;
+            }
+          }
+
+
+        }
+
+      private:
+        vcl_size_t local_size1_;
+        vcl_size_t local_size2_;
+        vcl_size_t cache_width_;
+
+        vcl_size_t ml_;
+        vcl_size_t nl_;
+
+        vcl_size_t ms_;
+        vcl_size_t ks_;
+        vcl_size_t ns_;
+
+        bool use_lhs_shared_;
+        bool use_rhs_shared_;
+    };
+
+  }
+
+}
+
+#endif
diff --git a/viennacl/generator/profile_base.hpp b/viennacl/generator/profile_base.hpp
new file mode 100644
index 0000000..305fa7d
--- /dev/null
+++ b/viennacl/generator/profile_base.hpp
@@ -0,0 +1,194 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_TEMPLATE_BASE_BASE
+#define VIENNACL_GENERATOR_GENERATE_TEMPLATE_BASE_BASE
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/profile_base.hpp
+ *
+ * @brief Base classes for the profiles
+*/
+
+#include <list>
+#include <set>
+
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/device_utils.hpp"
+#include "viennacl/ocl/infos.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/map_functor.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+
+    /** @brief Base class for an operation profile */
+    class profile_base{
+      public:
+        typedef std::list< std::pair<scheduler::statement, scheduler::statement_node> > statements_type;
+
+      protected:
+        friend std::ostream & operator<<(std::ostream &, profile_base const &);
+
+        virtual bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const { return false; }
+        virtual bool is_slow_impl(viennacl::ocl::device const &) const { return false; }
+
+        virtual vcl_size_t lmem_used(vcl_size_t /*scalartype_size*/) const { return 0; }
+
+        void configure_local_sizes(viennacl::ocl::kernel & k, vcl_size_t /*kernel_id*/) const {
+          k.local_work_size(0,local_size_1_);
+          k.local_work_size(1,local_size_2_);
+        }
+
+        virtual void print(std::ostream & s) const{
+          s << csv_representation();
+        }
+
+        /** @brief Generates the body of the associated kernel function
+         *
+         *  @param kernel_id  If this profile requires multiple kernel, the index for which the core should be generated
+         *  @param stream     The output stream the kernel is written to
+         *  @param statements the statements for which the code should be generated
+         *  @param mapping    the mapping of the statement_nodes to the mapped_objects
+         */
+        virtual void core(vcl_size_t kernel_id, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const = 0;
+
+      public:
+        /** @brief The constructor */
+        profile_base(unsigned int vectorization, vcl_size_t local_size_1, vcl_size_t local_size_2, vcl_size_t num_kernels) : vector_size_(vectorization), local_size_1_(local_size_1), local_size_2_(local_size_2), num_kernels_(num_kernels){ }
+
+        /** @brief The destructor */
+        virtual ~profile_base(){ }
+
+        /** @brief Configures the range and enqueues the arguments associated with the profile */
+        virtual void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const = 0;
+
+        virtual void kernel_arguments(statements_type  const & statements, std::string & arguments_string) const = 0;
+
+        /** @brief Get the vector size of the kernel */
+        unsigned int vector_size() const { return vector_size_; }
+
+        /** @brief csv representation of an operation
+         *
+         *  Useful when writing to a file */
+        virtual std::string csv_representation() const = 0;
+
+        /** @brief returns whether or not the profile is likely to be slow on a particular device
+         *  @param dev the given device*/
+        bool is_slow(viennacl::ocl::device const & dev) const{
+          bool res = false;
+          if(dev.type()==CL_DEVICE_TYPE_GPU){
+            vcl_size_t warp_size = 32;
+            if(dev.vendor_id()==4098)
+              warp_size = 64;
+            res = static_cast<bool>(((local_size_1_*local_size_2_)%warp_size)>0);
+          }
+          return res || is_slow_impl(dev);
+        }
+
+        /** @brief returns whether or not the profile leads to undefined behavior on particular device
+         *  @param dev               the given device
+         *  @param scalartype_size   Local memory required to execute the kernel
+         */
+        bool is_invalid(viennacl::ocl::device const & dev, vcl_size_t scalartype_size) const{
+          //Query device informations
+          vcl_size_t lmem_available = static_cast<vcl_size_t>(dev.local_mem_size());
+          vcl_size_t max_workgroup_size = dev.max_work_group_size();
+
+          std::vector<vcl_size_t> max_work_item_sizes = dev.max_work_item_sizes();
+          bool invalid_work_group_sizes = local_size_1_*local_size_2_ > max_workgroup_size
+              || local_size_1_ > max_work_item_sizes[0]
+              || local_size_2_ > max_work_item_sizes[1]; // uses too much resources
+
+          return  invalid_work_group_sizes
+              || lmem_used(scalartype_size)>lmem_available
+              || invalid_impl(dev, scalartype_size);
+        }
+
+        /** @brief Returns the number of kernels needed by this operation */
+        vcl_size_t num_kernels() const{ return num_kernels_; }
+
+        /** @brief Generates the code associated with this profile onto the provided stream
+         *  Redirects to the virtual core() method
+         *
+         *  @param stream Stream onto which the code should be generated
+         *  @param device_offset the index of the device in the context (used for the kernel name)
+         *  @param statements the statements associated with this profile */
+        virtual void operator()(utils::kernel_generation_stream & stream, vcl_size_t device_offset, statements_type const & statements) const {
+          std::vector<detail::mapping_type> mapping(statements.size());
+
+          ///Get Prototype, initialize mapping
+          std::string prototype;
+          std::set<std::string> already_generated;
+          kernel_arguments(statements, prototype);
+
+          {
+            std::map<void *, vcl_size_t> memory;
+            unsigned int current_arg = 0;
+            vcl_size_t i = 0;
+            for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it)
+              detail::traverse(it->first, it->second, detail::map_functor(memory,current_arg,mapping[i++]));
+          }
+
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            detail::traverse(it->first, it->second, detail::prototype_generation_traversal(already_generated, prototype, vector_size(), mapping[std::distance(statements.begin(), it)]));
+          }
+
+          prototype.erase(prototype.size()-1); //Last comma pruned
+
+          //Generate
+          for(vcl_size_t n = 0 ; n < num_kernels() ; ++n){
+            //stream << "__attribute__((vec_type_hint()))" << std::endl;
+            stream << " __attribute__((reqd_work_group_size(" << local_size_1_ << "," << local_size_2_ << "," << 1 << ")))" << std::endl;
+            stream << "__kernel " << "void " << "kernel_" << device_offset << "_" << n << "(" << std::endl;
+            stream << prototype << std::endl;
+            stream << ")" << std::endl;
+
+            //core:
+            stream << "{" << std::endl;
+            stream.inc_tab();
+            core(n, stream, statements, mapping);
+            stream.dec_tab();
+            stream << "}" << std::endl;
+          }
+        }
+
+      protected:
+        unsigned int vector_size_;
+        vcl_size_t local_size_1_;
+        vcl_size_t local_size_2_;
+        vcl_size_t num_kernels_;
+    };
+
+
+    inline std::ostream & operator<<(std::ostream & os, profile_base const & profile){
+      profile.print(os);
+      return os;
+    }
+
+  }
+
+}
+
+#endif
diff --git a/viennacl/generator/profiles.hpp b/viennacl/generator/profiles.hpp
new file mode 100644
index 0000000..3755fdd
--- /dev/null
+++ b/viennacl/generator/profiles.hpp
@@ -0,0 +1,340 @@
+#ifndef VIENNACL_GENERATOR_PROFILES_HPP
+#define VIENNACL_GENERATOR_PROFILES_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+           Institute for Analysis and Scientific Computing,
+           TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+           -----------------
+ ViennaCL - The Vienna Computing Library
+           -----------------
+
+   Project Head:    Karl Rupp  rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/profiles.hpp
+ *
+ * @brief Vendor-specific parameters for the generated kernels
+*/
+
+#include <map>
+
+#include "viennacl/ocl/device.hpp"
+
+#include "viennacl/generator/forwards.h"
+
+#include "viennacl/tools/shared_ptr.hpp"
+
+#include "viennacl/generator/profile_base.hpp"
+#include "viennacl/generator/saxpy.hpp"
+#include "viennacl/generator/scalar_reduction.hpp"
+#include "viennacl/generator/vector_reduction.hpp"
+#include "viennacl/generator/matrix_product.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace profiles{
+
+      typedef cl_uint vendor_id_type;
+      typedef cl_device_type device_type;
+
+      typedef std::string device_name_type;
+      typedef viennacl::tools::shared_ptr<profile_base> profile_base_ptr;
+
+      /** @brief Helper struct wrapping a std::map<KeyType, ValueType>. Used to avoid type-length explosion when nesting std::map directly */
+      template<class KeyType, class ValueType>
+      struct map_wrapper{
+          typedef std::map<KeyType,ValueType> map_type;
+          map_type map;
+          ValueType & operator[](KeyType const & key){ return map[key]; }
+      };
+
+      /** @brief Represents expression->profile in the map hierarchy vendor->device_type->device_arch->device->expression->profile. */
+      struct expression_map : public map_wrapper<expression_key_type, profile_base_ptr>{ };
+
+      /** @brief Represents device->expression in the map hierarchy vendor->device_type->device_arch->device->expression->profile. */
+      struct device_name_map : public map_wrapper<device_name_type, expression_map>{ };
+
+      /** @brief Represents device_arch->device in the map hierarchy vendor->device_type->device_arch->device->expression->profile. */
+      struct device_architecture_map : public map_wrapper<viennacl::ocl::device_architecture_family, device_name_map>{ };
+
+      /** @brief Represents device_type->device_arch in the map hierarchy vendor->device_type->device_arch->device->expression->profile. */
+      struct device_type_map : public map_wrapper<device_type,device_architecture_map>{ };
+
+      /** @brief Represents vendor->device_type in the map hierarchy vendor->device_type->device_arch->device->expression->profile. */
+      struct database_type : public map_wrapper<vendor_id_type, device_type_map>{ };
+
+      /** @brief Sets the device-generation default (entry with empty device name "") to the profile of a particular device, for a particular operation */
+        inline void set_generation_default_to(database_type & map, vendor_id_type vendor_id, viennacl::ocl::device_architecture_family family, expression_key_type expression, std::string const & device_name){
+            map[vendor_id][CL_DEVICE_TYPE_GPU][family][""][expression] = map[vendor_id][CL_DEVICE_TYPE_GPU][family][device_name][expression];
+        }
+
+        /** @brief Sets the device-generation default (entry with empty device name "") to the profiles of a particular device, for all supported operations */
+        inline void set_all_generation_default_to(database_type & map, vendor_id_type vendor_id, viennacl::ocl::device_architecture_family family, std::string const & device_name){
+            set_generation_default_to(map,vendor_id,family,std::make_pair(VECTOR_SAXPY_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_SAXPY_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(SCALAR_REDUCE_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(VECTOR_REDUCE_Nx_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(VECTOR_REDUCE_Tx_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_NN_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_TN_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_NT_TYPE,4),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_TT_TYPE,4),device_name);
+
+            set_generation_default_to(map,vendor_id,family,std::make_pair(VECTOR_SAXPY_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_SAXPY_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(SCALAR_REDUCE_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(VECTOR_REDUCE_Nx_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(VECTOR_REDUCE_Tx_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_NN_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_TN_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_NT_TYPE,8),device_name);
+            set_generation_default_to(map,vendor_id,family,std::make_pair(MATRIX_PRODUCT_TT_TYPE,8),device_name);
+        }
+
+        /** @brief Initialize the database */
+      static database_type init_database(){
+        database_type map;
+
+        /*---------------------------*/
+        /*     GPU Defaults          */
+        /*---------------------------*/
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_SAXPY_TYPE,4)] = profile_base_ptr(new vector_saxpy(1,128,128,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_SAXPY_TYPE,4)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(SCALAR_REDUCE_TYPE,4)] = profile_base_ptr(new scalar_reduction(1, 128, 128, true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Nx_TYPE,4)] = profile_base_ptr(new vector_reduction(1, 1, 256, 32));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Tx_TYPE,4)] = profile_base_ptr(new vector_reduction(1, 1, 256, 32));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NN_TYPE,4)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TN_TYPE,4)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NT_TYPE,4)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TT_TYPE,4)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+
+
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_SAXPY_TYPE,8)] = profile_base_ptr(new vector_saxpy(1,128,128,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_SAXPY_TYPE,8)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(SCALAR_REDUCE_TYPE,8)] = profile_base_ptr(new scalar_reduction(1, 128, 128, true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Nx_TYPE,8)] = profile_base_ptr(new vector_reduction(1, 1, 256, 32));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Tx_TYPE,8)] = profile_base_ptr(new vector_reduction(1, 1, 256, 32));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NN_TYPE,8)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TN_TYPE,8)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NT_TYPE,8)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TT_TYPE,8)] = profile_base_ptr(new matrix_product(1,8,32,8,4,4,4,1,0));
+
+        /*---------------------------*/
+        /*     CPU Defaults          */
+        /*---------------------------*/
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_SAXPY_TYPE,4)] = profile_base_ptr(new vector_saxpy(8,16,256,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_SAXPY_TYPE,4)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(SCALAR_REDUCE_TYPE,4)] = profile_base_ptr(new scalar_reduction(8,8,512,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Nx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,2,1,8));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Tx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,16,8,8));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NN_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TN_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NT_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TT_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+
+
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_SAXPY_TYPE,8)] = profile_base_ptr(new vector_saxpy(8,16,32,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_SAXPY_TYPE,8)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(SCALAR_REDUCE_TYPE,8)] = profile_base_ptr(new scalar_reduction(8,8,512,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Nx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,1,1,8));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Tx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,8,16,16));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NN_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TN_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NT_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_CPU][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TT_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+
+
+        /*---------------------------*/
+        /*     ACCELERATOR Defaults  */
+        /*---------------------------*/
+        //same as CPU for now
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_SAXPY_TYPE,4)] = profile_base_ptr(new vector_saxpy(8,16,256,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_SAXPY_TYPE,4)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(SCALAR_REDUCE_TYPE,4)] = profile_base_ptr(new scalar_reduction(8,8,512,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Nx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,2,1,8));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Tx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,16,8,8));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NN_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TN_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NT_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TT_TYPE,4)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+
+
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_SAXPY_TYPE,8)] = profile_base_ptr(new vector_saxpy(8,16,32,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_SAXPY_TYPE,8)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(SCALAR_REDUCE_TYPE,8)] = profile_base_ptr(new scalar_reduction(8,8,512,true));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Nx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,1,1,8));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(VECTOR_REDUCE_Tx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,8,16,16));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NN_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TN_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_NT_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+        map[viennacl::ocl::unknown_id][CL_DEVICE_TYPE_ACCELERATOR][viennacl::ocl::UNKNOWN][""][std::make_pair(MATRIX_PRODUCT_TT_TYPE,8)] = profile_base_ptr(new matrix_product(1,16,64,1,8,4,32,0,0));
+
+
+
+        /*---------------------------*/
+        /*     AMD                   */
+        /*---------------------------*/
+
+        //Evergreen
+
+            //Cypress
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(VECTOR_SAXPY_TYPE,4)] = profile_base_ptr(new vector_saxpy(1,4,64,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_SAXPY_TYPE,4)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(SCALAR_REDUCE_TYPE,4)] = profile_base_ptr(new scalar_reduction(8,128,128,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(VECTOR_REDUCE_Nx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,1,256,1024));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(VECTOR_REDUCE_Tx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,32,8,256));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_NN_TYPE,4)] = profile_base_ptr(new matrix_product(4,4,64,16,4,4,8,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_TN_TYPE,4)] = profile_base_ptr(new matrix_product(4,4,64,16,4,4,8,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_NT_TYPE,4)] = profile_base_ptr(new matrix_product(4,8,64,16,4,4,8,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_TT_TYPE,4)] = profile_base_ptr(new matrix_product(4,8,128,8,8,4,4,0,0));
+
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(VECTOR_SAXPY_TYPE,8)] = profile_base_ptr(new vector_saxpy(2,1,64,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_SAXPY_TYPE,8)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(SCALAR_REDUCE_TYPE,8)] = profile_base_ptr(new scalar_reduction(2,256,64,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(VECTOR_REDUCE_Nx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,1,256,1024));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(VECTOR_REDUCE_Tx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,64,4,256));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_NN_TYPE,8)] = profile_base_ptr(new matrix_product(2,16,32,16,2,2,8,0,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_TN_TYPE,8)] = profile_base_ptr(new matrix_product(2,4,64,32,4,2,2,0,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_NT_TYPE,8)] = profile_base_ptr(new matrix_product(4,2,64,32,8,8,4,0,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Evergreen]["Cypress"][std::make_pair(MATRIX_PRODUCT_TT_TYPE,8)] = profile_base_ptr(new matrix_product(2,16,64,8,2,2,4,0,0));
+
+            //Default
+            set_all_generation_default_to(map,viennacl::ocl::amd_id,viennacl::ocl::Evergreen,"Cypress");
+
+
+         //Southern Islands
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(VECTOR_SAXPY_TYPE,4)] = profile_base_ptr(new vector_saxpy(1,4,64,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_SAXPY_TYPE,4)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(SCALAR_REDUCE_TYPE,4)] = profile_base_ptr(new scalar_reduction(8,128,128,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(VECTOR_REDUCE_Nx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,1,256,1024));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(VECTOR_REDUCE_Tx_TYPE,4)] = profile_base_ptr(new vector_reduction(1,32,8,256));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_NN_TYPE,4)] = profile_base_ptr(new matrix_product(4,8,128,32,4,4,4,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_TN_TYPE,4)] = profile_base_ptr(new matrix_product(1,8,64,16,4,2,8,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_NT_TYPE,4)] = profile_base_ptr(new matrix_product(4,16,64,16,4,4,8,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_TT_TYPE,4)] = profile_base_ptr(new matrix_product(4,16,64,16,4,4,8,1,0));
+
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(VECTOR_SAXPY_TYPE,8)] = profile_base_ptr(new vector_saxpy(2,1,64,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_SAXPY_TYPE,8)] = profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(SCALAR_REDUCE_TYPE,8)] = profile_base_ptr(new scalar_reduction(2,256,64,true));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(VECTOR_REDUCE_Nx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,1,256,1024));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(VECTOR_REDUCE_Tx_TYPE,8)] = profile_base_ptr(new vector_reduction(1,64,4,256));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_NN_TYPE,8)] = profile_base_ptr(new matrix_product(2,4,128,64,4,2,2,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_TN_TYPE,8)] = profile_base_ptr(new matrix_product(2,2,128,32,4,2,2,0,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_NT_TYPE,8)] = profile_base_ptr(new matrix_product(2,8,128,32,2,2,2,1,0));
+            map[viennacl::ocl::amd_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::SouthernIslands]["Tahiti"][std::make_pair(MATRIX_PRODUCT_TT_TYPE,8)] = profile_base_ptr(new matrix_product(2,8,128,32,2,2,2,1,0));
+
+
+            //Default
+            set_all_generation_default_to(map,viennacl::ocl::amd_id,viennacl::ocl::SouthernIslands,"Tahiti");
+
+
+        /*---------------------------*/
+        /*     NVidia                */
+        /*---------------------------*/
+
+        //-----Fermi
+
+          //Geforce GTX 470
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(VECTOR_SAXPY_TYPE,4)]      =    profile_base_ptr(new vector_saxpy(1,1,256,true));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_SAXPY_TYPE,4)]      =    profile_base_ptr(new matrix_saxpy(1,16,16,16,16,true));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(SCALAR_REDUCE_TYPE,4)]     =    profile_base_ptr(new scalar_reduction(4,64,512,true));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(VECTOR_REDUCE_Nx_TYPE,4)]  =    profile_base_ptr(new vector_reduction(1,1,256,1024));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(VECTOR_REDUCE_Tx_TYPE,4)]  =    profile_base_ptr(new vector_reduction(1,64,4,64));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_NN_TYPE,4)] =    profile_base_ptr(new matrix_product(1,2,64,64,8,4,2,1,0));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_TN_TYPE,4)] =    profile_base_ptr(new matrix_product(1,8,32,16,4,4,8,0,0));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_NT_TYPE,4)] =    profile_base_ptr(new matrix_product(1,4,128,32,4,8,4,1,0));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_TT_TYPE,4)] =    profile_base_ptr(new matrix_product(1,4,32,16,8,4,8,0,0));
+
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(VECTOR_SAXPY_TYPE,8)]      =    profile_base_ptr(new vector_saxpy(2,1,64,true));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_SAXPY_TYPE,8)]      =    profile_base_ptr(new matrix_saxpy(2,16,16,16,16,true));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(SCALAR_REDUCE_TYPE,8)]     =    profile_base_ptr(new scalar_reduction(2,64,512,true));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(VECTOR_REDUCE_Nx_TYPE,8)]  =    profile_base_ptr(new vector_reduction(1,1,128,1024));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(VECTOR_REDUCE_Tx_TYPE,8)]  =    profile_base_ptr(new vector_reduction(1,16,32,1024));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_NN_TYPE,8)] =    profile_base_ptr(new matrix_product(1,8,64,32,2,2,8,1,0));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_TN_TYPE,8)] =    profile_base_ptr(new matrix_product(1,64,128,4,2,2,8,0,1));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_NT_TYPE,8)] =    profile_base_ptr(new matrix_product(1,4,128,32,4,8,4,1,0));
+          map[viennacl::ocl::nvidia_id][CL_DEVICE_TYPE_GPU][viennacl::ocl::Fermi]["GeForce GTX 470"][std::make_pair(MATRIX_PRODUCT_TT_TYPE,8)] =    profile_base_ptr(new matrix_product(1,4,32,16,8,4,8,0,0));
+
+          //default
+          set_all_generation_default_to(map,viennacl::ocl::nvidia_id,viennacl::ocl::Fermi,"GeForce GTX 470");
+
+
+
+        return map;
+      }
+      static database_type database = init_database();
+
+      /** @brief Falls back to the very conservative global default profile when the given profile is invalid for the device */
+      static profile_base * handle_failure(viennacl::ocl::device const & device, expression_descriptor const & descriptor, tools::shared_ptr<profile_base> const & profile){
+        //Returns default if the profile is invalid
+        if(profile->is_invalid(device, descriptor.scalartype_size))
+          return at(at(at(at(at(database.map, viennacl::ocl::unknown_id).map, device.type()).map, viennacl::ocl::UNKNOWN).map, std::string("")).map, descriptor.make_key()).get();
+        return profile.get();
+      }
+
+      /** @brief Get the profile for a device and a descriptor */
+      static profile_base * get(viennacl::ocl::device const & device, expression_descriptor const & descriptor){
+        device_type dev_type = device.type();
+        vendor_id_type vendor_id = device.vendor_id();
+        viennacl::ocl::device_architecture_family device_architecture = device.architecture_family();
+        std::string const & device_name = device.name();
+        expression_key_type expression_key = descriptor.make_key();
+
+        //std::cout << "Looking up vendor ID..." << std::endl;
+        /*-Vendor ID-*/
+        database_type::map_type::iterator vendor_it = database.map.find(vendor_id);
+        //Vendor not recognized => global default:
+        if(vendor_it==database.map.end())
+          return handle_failure(device, descriptor, at(at(at(at(at(database.map, viennacl::ocl::unknown_id).map, dev_type).map, viennacl::ocl::UNKNOWN).map, std::string("")).map, expression_key));
+
+        /*-Device Type-*/
+        //std::cout << "Looking up device type..." << std::endl;
+        device_type_map::map_type::iterator device_type_it = vendor_it->second.map.find(dev_type);
+        //Device type not recognized for this vendor => global default
+        if(device_type_it==vendor_it->second.map.end())
+          return handle_failure(device, descriptor, at(at(at(at(at(database.map, viennacl::ocl::unknown_id).map, dev_type).map, viennacl::ocl::UNKNOWN).map, std::string("")).map, expression_key));
+
+        /*-Device Architecture-*/
+        //std::cout << "Looking up device architecture..." << std::endl;
+        device_architecture_map::map_type::iterator architecture_it = device_type_it->second.map.find(device_architecture);
+        if(architecture_it==device_type_it->second.map.end())
+          return handle_failure(device, descriptor, at(at(at(at(at(database.map, viennacl::ocl::unknown_id).map, dev_type).map, viennacl::ocl::UNKNOWN).map, std::string("")).map, expression_key));
+
+        /*-Device Name-*/
+        //std::cout << "Looking up device name..." << std::endl;
+        device_name_map::map_type::iterator device_name_it = architecture_it->second.map.find(device_name);
+        //Name not found => Vendor default
+        if(device_name_it==architecture_it->second.map.end())
+          return handle_failure(device, descriptor, at(at(at(at(at(database.map, vendor_id).map, dev_type).map, device_architecture).map, std::string("")).map, expression_key));
+
+        //std::cout << "Looking up expression name.." << std::endl;
+        /*-Expression-*/
+        expression_map::map_type::iterator expression_it = device_name_it->second.map.find(expression_key);
+        //Expression not found => Vendor default
+        if(expression_it==device_name_it->second.map.end())
+          return handle_failure(device, descriptor, at(at(at(at(at(database.map, vendor_id).map, dev_type).map, device_architecture).map, std::string("")).map, expression_key));
+
+        //std::cout << "Device found in the database! Getting profile..." << std::endl;
+        //Everything okay. NOTE(review): the lookup below keys on the default device name "" instead of returning the device-specific entry just found (expression_it->second) — confirm this is intended.//
+        return handle_failure(device, descriptor, at(at(at(at(at(database.map, vendor_id).map, dev_type).map, device_architecture).map, std::string("")).map, expression_key));
+      }
+
+    }
+
+  }
+
+}
+
+
+#endif
+
diff --git a/viennacl/generator/saxpy.hpp b/viennacl/generator/saxpy.hpp
new file mode 100644
index 0000000..530240a
--- /dev/null
+++ b/viennacl/generator/saxpy.hpp
@@ -0,0 +1,210 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_SAXPY_HPP
+#define VIENNACL_GENERATOR_GENERATE_SAXPY_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/saxpy.hpp
+ *
+ * @brief Kernel template for the saxpy-like operation
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/mapped_objects.hpp"
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/utils.hpp"
+
+#include "viennacl/generator/profile_base.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    /** @brief OpenCL kernel generation class for vector expressions of AXPY type, i.e. x = alpha * y + beta * z, where the number of summands can in principle be arbitrarily large. */
+    class vector_saxpy : public profile_base{
+      public:
+        static std::string csv_format() {
+          return "Vec,LSize1,NumGroups1,GlobalDecomposition";
+        }
+
+        std::string csv_representation() const{
+          std::ostringstream oss;
+          oss << vector_size_
+              << "," << local_size_1_
+              << "," << num_groups_
+              << "," << decomposition_;
+          return oss.str();
+        }
+
+        vector_saxpy(unsigned int v, vcl_size_t gs, vcl_size_t ng, unsigned int d) : profile_base(v, gs, 1, 1), num_groups_(ng), decomposition_(d){ }
+
+        void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const{
+          configure_local_sizes(k, kernel_id);
+
+          k.global_work_size(0,local_size_1_*num_groups_);
+          k.global_work_size(1,1);
+
+          scheduler::statement_node const & first_node = statements.front().second;
+          viennacl::vcl_size_t N = utils::call_on_vector(first_node.lhs, utils::internal_size_fun());
+          k.arg(n_arg++, cl_uint(N/vector_size_));
+        }
+        void kernel_arguments(statements_type  const & /*statements*/, std::string & arguments_string) const{
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
+        }
+
+      private:
+
+        void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
+          stream << "for(unsigned int i = get_global_id(0) ; i < N ; i += get_global_size(0))" << std::endl;
+          stream << "{" << std::endl;
+          stream.inc_tab();
+
+          //Fetches entries to registers
+          std::set<std::string>  fetched;
+          for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it)
+            for(detail::mapping_type::const_reverse_iterator iit = it->rbegin() ; iit != it->rend() ; ++iit)
+              //Useless to fetch cpu scalars into registers
+              if(detail::mapped_handle * p = dynamic_cast<detail::mapped_handle *>(iit->second.get()))
+                p->fetch( std::make_pair("i","0"), vector_size_, fetched, stream);
+
+          //Generates all the expression, in order
+          vcl_size_t i = 0;
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            std::string str;
+            detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("i","0"), -1, str, mapping[i++]));
+            stream << str << ";" << std::endl;
+          }
+
+          //Writes back
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it)
+             //Gets the mapped object at the LHS of each expression
+            if(detail::mapped_handle * p = dynamic_cast<detail::mapped_handle *>(at(mapping.at(std::distance(statements.begin(),it)), std::make_pair(&it->second, detail::LHS_NODE_TYPE)).get()))
+              p->write_back( std::make_pair("i", "0"), fetched, stream);
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+        }
+
+      private:
+        vcl_size_t num_groups_;
+        unsigned int decomposition_;
+
+    };
+
+
+
+    /** @brief OpenCL kernel generation class for matrix expressions of AXPY type, i.e. A = alpha * B + beta * C, where the number of summands can in principle be arbitrarily large. */
+    class matrix_saxpy : public profile_base{
+
+        bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const{ return false; }
+        bool is_slow_impl(viennacl::ocl::device const &) const { return false; }
+
+      public:
+        matrix_saxpy(unsigned int v, vcl_size_t gs1, vcl_size_t gs2, vcl_size_t ng1, vcl_size_t ng2, unsigned int d) : profile_base(v, gs1, gs2, 1), num_groups_row_(ng1), num_groups_col_(ng2), decomposition_(d){ }
+
+        static std::string csv_format() {
+          return "Vec,LSize1,LSize2,NumGroups1,NumGroups2,GlobalDecomposition";
+        }
+
+        std::string csv_representation() const{
+          std::ostringstream oss;
+          oss << vector_size_
+                 << "," << local_size_1_
+                 << "," << local_size_2_
+                 << "," << num_groups_row_
+                 << "," << num_groups_col_
+                 << "," << decomposition_;
+          return oss.str();
+        }
+
+        void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const{
+          configure_local_sizes(k, kernel_id);
+
+          k.global_work_size(0,local_size_1_*num_groups_row_);
+          k.global_work_size(1,local_size_2_*num_groups_col_);
+
+          scheduler::statement_node const & first_node = statements.front().second;
+          k.arg(n_arg++, cl_uint(utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun())));
+          k.arg(n_arg++, cl_uint(utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun())));
+        }
+
+        void kernel_arguments(statements_type  const & /*statements*/, std::string & arguments_string) const{
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
+        }
+
+      private:
+        void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
+
+          for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it){
+            for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit){
+              if(detail::mapped_matrix * p = dynamic_cast<detail::mapped_matrix*>(iit->second.get()))
+                p->bind_sizes("M","N");
+            }
+          }
+
+          stream << "for(unsigned int i = get_global_id(0) ; i < M ; i += get_global_size(0))" << std::endl;
+          stream << "{" << std::endl;
+          stream.inc_tab();
+          stream << "for(unsigned int j = get_global_id(1) ; j < N ; j += get_global_size(1))" << std::endl;
+          stream << "{" << std::endl;
+          stream.inc_tab();
+
+          //Fetches entries to registers
+          std::set<std::string>  fetched;
+          for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it)
+            for(detail::mapping_type::const_reverse_iterator it2 = it->rbegin() ; it2 != it->rend() ; ++it2)
+              if(detail::mapped_matrix * p = dynamic_cast<detail::mapped_matrix *>(it2->second.get()))
+                p->fetch(std::make_pair("i", "j"), vector_size_, fetched, stream);
+
+
+          vcl_size_t i = 0;
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            std::string str;
+            detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("i", "j"), -1, str, mapping[i++]));
+            stream << str << ";" << std::endl;
+          }
+
+          //Writes back
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            if(detail::mapped_handle * p = dynamic_cast<detail::mapped_handle *>(at(mapping.at(std::distance(statements.begin(),it)), std::make_pair(&it->second,detail::LHS_NODE_TYPE)).get()))
+              p->write_back(std::make_pair("i", "j"), fetched, stream);
+          }
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+          stream.dec_tab();
+          stream << "}" << std::endl;
+        }
+
+      private:
+        vcl_size_t num_groups_row_;
+        vcl_size_t num_groups_col_;
+
+        unsigned int decomposition_;
+    };
+  }
+
+}
+
+#endif
diff --git a/viennacl/generator/scalar_reduction.hpp b/viennacl/generator/scalar_reduction.hpp
new file mode 100644
index 0000000..6621e54
--- /dev/null
+++ b/viennacl/generator/scalar_reduction.hpp
@@ -0,0 +1,362 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
+#define VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/scalar_reduction.hpp
+ *
+ * @brief Kernel template for the scalar reduction operation
+*/
+
+#include <vector>
+
+#include "viennacl/backend/opencl.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/utils.hpp"
+
+#include "viennacl/generator/profile_base.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    /** @brief OpenCL kernel generation template for scalar reduction operations such as s = norm_2(x). */
+    class scalar_reduction : public profile_base{
+      private:
+        typedef std::vector<std::pair<const char *, viennacl::ocl::handle<cl_mem> > > temporaries_type;
+
+        static void fill_scalartypes(statements_type statements, std::vector<const char *> & res){
+          res.reserve(statements.size());
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            if (it->second.lhs.type_family == scheduler::SCALAR_TYPE_FAMILY)
+            {
+              switch(it->second.lhs.numeric_type){
+                case scheduler::FLOAT_TYPE:
+                  res.push_back("float");
+                  break;
+                case scheduler::DOUBLE_TYPE:
+                  res.push_back("double");
+                  break;
+                default:
+                  res.push_back("");
+                  break;
+              }
+            }
+            else
+            {
+              res.push_back("");
+            }
+          }
+        }
+
+      public:
+
+        vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
+          return local_size_1_*scalartype_size;
+        }
+
+        void init_temporaries(statements_type const & statements) const {
+          if(temporaries_.empty()){
+            //set temporary buffer argument
+            for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+              scheduler::statement::container_type const & array = it->first.array();
+              vcl_size_t size_of_scalartype;
+              const char * scalartype_name;
+              if (array[0].lhs.type_family != scheduler::SCALAR_TYPE_FAMILY) throw "not implemented";
+              switch(array[0].lhs.numeric_type){
+                case scheduler::FLOAT_TYPE: scalartype_name = "float"; size_of_scalartype = sizeof(float); break;
+                case scheduler::DOUBLE_TYPE: scalartype_name = "double"; size_of_scalartype = sizeof(double); break;
+                default: throw "not implemented";
+              }
+              for(scheduler::statement::container_type::const_iterator iit = array.begin() ; iit != array.end() ; ++iit){
+                if(iit->op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE){
+                  temporaries_.push_back(std::make_pair(scalartype_name, viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast<unsigned int>(num_groups_*size_of_scalartype))));
+                }
+              }
+            }
+          }
+        }
+
+        void set_size_argument(viennacl::scheduler::statement const & s, viennacl::scheduler::statement_node const & /*root_node*/, unsigned int & n_arg, viennacl::ocl::kernel & k) const {
+          scheduler::statement::container_type exprs = s.array();
+          for(scheduler::statement::container_type::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
+            if(it->op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE){
+              //set size argument
+              scheduler::statement_node const * current_node = &(*it);
+
+              vcl_size_t vector_size = 0;
+              //The LHS of the prod is a vector
+              if(current_node->lhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
+              {
+                vector_size = utils::call_on_vector(current_node->lhs, utils::internal_size_fun());
+              }
+              else{
+                //The LHS of the prod is a vector expression
+                current_node = &exprs[current_node->lhs.node_index];
+                if(current_node->lhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
+                {
+                  vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun()));
+                }
+                else if(current_node->rhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
+                {
+                  vector_size = cl_uint(utils::call_on_vector(current_node->rhs, utils::internal_size_fun()));
+                }
+                else{
+                  assert(false && bool("unexpected expression tree"));
+                }
+              }
+              k.arg(n_arg++, cl_uint(vector_size/vector_size_));
+            }
+          }
+        }
+
+      public:
+        /** @brief The user constructor */
+        scalar_reduction(unsigned int vectorization, unsigned int local_size, unsigned int num_groups, unsigned int decomposition) : profile_base(vectorization, local_size, 1, 2), num_groups_(num_groups), decomposition_(decomposition){ }
+
+
+        static std::string csv_format() {
+          return "Vec,LSize,NumGroups,GlobalDecomposition";
+        }
+
+        std::string csv_representation() const{
+          std::ostringstream oss;
+          oss << vector_size_
+                 << "," << local_size_1_
+                 << "," << num_groups_
+                 << "," << decomposition_;
+          return oss.str();
+        }
+
+        unsigned int num_groups() const { return num_groups_; }
+
+
+        unsigned int decomposition() const { return decomposition_; }
+
+
+        void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const{
+
+          //create temporaries
+          init_temporaries(statements);
+
+          //configure ND range
+          if(kernel_id==0){
+            configure_local_sizes(k, 0);
+
+            vcl_size_t gsize = local_size_1_*num_groups_;
+            k.global_work_size(0,gsize);
+            k.global_work_size(1,1);
+          }
+          else{
+            configure_local_sizes(k, 1);
+
+            k.global_work_size(0,local_size_1_);
+            k.global_work_size(1,1);
+          }
+
+          //set arguments
+          set_size_argument(statements.front().first, statements.front().second, n_arg, k);
+          for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
+            k.arg(n_arg++, it->second);
+          }
+        }
+
+        void kernel_arguments(statements_type  const & statements, std::string & arguments_string) const{
+          init_temporaries(statements);
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
+          for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
+            arguments_string += detail::generate_pointer_kernel_argument("__global", it->first, "temp" + utils::to_string(std::distance(temporaries_.begin(), it)));
+          }
+        }
+
+      private:
+
+        void core_0(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> const & scalartypes, statements_type const & /*statements*/, std::vector<detail::mapping_type> const & /*mapping*/) const {
+
+          stream << "unsigned int lid = get_local_id(0);" << std::endl;
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << scalartypes[k] << " sum" << k << " = 0;" << std::endl;
+
+          if(decomposition_){
+            stream << "for(unsigned int i = get_global_id(0) ; i < N ; i += get_global_size(0)){" << std::endl;
+          }
+          else{
+            stream << "unsigned int chunk_size = (N + get_num_groups(0)-1)/get_num_groups(0);" << std::endl;
+            stream << "unsigned int chunk_start = get_group_id(0)*chunk_size;" << std::endl;
+            stream << "unsigned int chunk_end = min(chunk_start+chunk_size, N);" << std::endl;
+            stream << "for(unsigned int i = chunk_start + get_local_id(0) ; i < chunk_end ; i += get_local_size(0)){" << std::endl;
+          }
+          stream.inc_tab();
+
+          //Fetch vector entry
+          std::set<std::string>  fetched;
+
+          for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
+            viennacl::scheduler::statement const & statement = (*it)->statement();
+            viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
+            detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping());
+            detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping());
+          }
+
+
+          //Update sums;
+          for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
+            viennacl::scheduler::statement const & statement = (*it)->statement();
+            viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
+            if(vector_size_ > 1){
+              for(unsigned int a = 0 ; a < vector_size_ ; ++a){
+                std::string str;
+                detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping());
+                str += "*";
+                detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping());
+                stream << " sum" << std::distance(exprs.begin(),it) << " += "  << str << ";" << std::endl;
+              }
+            }
+            else{
+              std::string str;
+              detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
+              str += "*";
+              detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
+              stream << " sum" << std::distance(exprs.begin(),it) << " += "  << str << ";" << std::endl;
+            }
+          }
+
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+          //Declare and fill local memory
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl;
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl;
+
+          //Reduce local memory
+          for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){
+            stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
+            stream << "if(lid < " << stride << "){" << std::endl;
+            stream.inc_tab();
+            for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
+              stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl;
+            }
+            stream.dec_tab();
+            stream << "}" << std::endl;
+          }
+
+          //Last reduction and write back to temporary buffer
+          stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
+          stream << "if(lid==0){" << std::endl;
+          stream.inc_tab();
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl;
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "temp"<< k << "[get_group_id(0)] = buf" << k << "[0];" << std::endl;
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+        }
+
+
+        void core_1(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> scalartypes, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
+          stream << "unsigned int lid = get_local_id(0);" << std::endl;
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl;
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << scalartypes[k] << " sum" << k << " = 0;" << std::endl;
+
+          stream << "for(unsigned int i = lid ; i < " << num_groups_ << " ; i += get_local_size(0)){" << std::endl;
+          stream.inc_tab();
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "sum" << k << " += temp" << k << "[i];" << std::endl;
+          stream.dec_tab();
+          stream << "}" << std::endl;
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl;
+
+          //Reduce local memory
+          for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){
+            stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
+            stream << "if(lid < " << stride << "){" << std::endl;
+            stream.inc_tab();
+            for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
+              stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl;
+            }
+            stream.dec_tab();
+            stream << "}" << std::endl;
+          }
+
+          stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
+          stream << "if(lid==0){" << std::endl;
+          stream.inc_tab();
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
+            stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl;
+            exprs[k]->access_name("buf"+utils::to_string(k)+"[0]");
+          }
+
+          vcl_size_t i = 0;
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            std::string str;
+            detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("0", "0"), -1, str, mapping[i++]), false);
+            stream << str << ";" << std::endl;
+          }
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+        }
+
+        void core(vcl_size_t kernel_id, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
+          std::vector<detail::mapped_scalar_reduction*> exprs;
+          for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it)
+            for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit)
+              if(detail::mapped_scalar_reduction * p = dynamic_cast<detail::mapped_scalar_reduction*>(iit->second.get()))
+                exprs.push_back(p);
+
+          std::vector<const char *> scalartypes;
+          fill_scalartypes(statements, scalartypes);
+
+          if(kernel_id==0){
+            core_0(stream,exprs,scalartypes,statements,mapping);
+          }
+          else{
+            core_1(stream,exprs,scalartypes,statements,mapping);
+          }
+        }
+
+      private:
+        unsigned int num_groups_;
+        unsigned int decomposition_;
+        mutable temporaries_type temporaries_;
+    };
+
+
+  }
+
+}
+
+#endif
diff --git a/viennacl/generator/set_arguments_functor.hpp b/viennacl/generator/set_arguments_functor.hpp
new file mode 100644
index 0000000..d434cae
--- /dev/null
+++ b/viennacl/generator/set_arguments_functor.hpp
@@ -0,0 +1,139 @@
+#ifndef VIENNACL_GENERATOR_ENQUEUE_TREE_HPP
+#define VIENNACL_GENERATOR_ENQUEUE_TREE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/set_arguments_functor.hpp
+    @brief Functor to set the arguments of a statement into a kernel
+*/
+
+#include <set>
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/generator/forwards.h"
+
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/tools/shared_ptr.hpp"
+
+#include "viennacl/ocl/kernel.hpp"
+
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/utils.hpp"
+#include "viennacl/generator/mapped_objects.hpp"
+
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace detail{
+
+      /** @brief Helper class for setting the arguments of a kernel. */
+      class set_arguments_functor : public traversal_functor{
+        public:
+          typedef void result_type;
+
+          set_arguments_functor(std::set<void *> & memory, unsigned int & current_arg, viennacl::ocl::kernel & kernel) : memory_(memory), current_arg_(current_arg), kernel_(kernel){ }
+
+          template<class ScalarType>
+          result_type operator()(ScalarType const & scal) const {
+            typedef typename viennacl::result_of::cl_type<ScalarType>::type cl_scalartype;
+            kernel_.arg(current_arg_++, cl_scalartype(scal));
+          }
+
+          /** @brief Scalar mapping */
+          template<class ScalarType>
+          result_type operator()(scalar<ScalarType> const & scal) const {
+            if(memory_.insert((void*)&scal).second)
+              kernel_.arg(current_arg_++, scal.handle().opencl_handle());
+          }
+
+          /** @brief Vector mapping */
+          template<class ScalarType>
+          result_type operator()(vector_base<ScalarType> const & vec) const {
+            if(memory_.insert((void*)&vec).second){
+              kernel_.arg(current_arg_++, vec.handle().opencl_handle());
+              if(viennacl::traits::start(vec)>0)
+                kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec)));
+              if(vec.stride()>1)
+                kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec)));
+            }
+          }
+
+          /** @brief Implicit vector mapping */
+          template<class ScalarType>
+          result_type operator()(implicit_vector_base<ScalarType> const & vec) const {
+            typedef typename viennacl::result_of::cl_type<ScalarType>::type cl_scalartype;
+            if(memory_.insert((void*)&vec).second){
+              if(vec.is_value_static()==false)
+                kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
+              if(vec.has_index())
+                kernel_.arg(current_arg_++, cl_uint(vec.index()));
+            }
+          }
+
+          /** @brief Matrix mapping */
+          template<class ScalarType, class Layout>
+          result_type operator()(matrix_base<ScalarType, Layout> const & mat) const {
+            //typedef typename matrix_base<ScalarType, Layout>::size_type size_type;
+            if(memory_.insert((void*)&mat).second){
+              kernel_.arg(current_arg_++, mat.handle().opencl_handle());
+              if(viennacl::traits::start1(mat)>0)
+                kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
+              if(viennacl::traits::stride1(mat)>1)
+                kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
+              if(viennacl::traits::start2(mat)>0)
+                kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
+              if(viennacl::traits::stride2(mat)>1)
+                kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
+            }
+          }
+
+          /** @brief Implicit matrix mapping */
+          template<class ScalarType>
+          result_type operator()(implicit_matrix_base<ScalarType> const & mat) const {
+            if(mat.is_value_static()==false)
+              kernel_.arg(current_arg_++, mat.value());
+          }
+
+          /** @brief Traversal functor: */
+          void operator()(scheduler::statement const * /*statement*/, scheduler::statement_node const * root_node, detail::node_type node_type) const {
+            if(node_type==LHS_NODE_TYPE && root_node->lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+              utils::call_on_element(root_node->lhs, *this);
+            else if(node_type==RHS_NODE_TYPE && root_node->rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+              utils::call_on_element(root_node->rhs, *this);
+          }
+
+        private:
+          std::set<void *> & memory_;
+          unsigned int & current_arg_;
+          viennacl::ocl::kernel & kernel_;
+      };
+
+    }
+
+  }
+
+}
+#endif
diff --git a/viennacl/generator/statement_representation_functor.hpp b/viennacl/generator/statement_representation_functor.hpp
new file mode 100644
index 0000000..86a5d66
--- /dev/null
+++ b/viennacl/generator/statement_representation_functor.hpp
@@ -0,0 +1,172 @@
+#ifndef VIENNACL_GENERATOR_STATEMENT_REPRESENTATION_HPP
+#define VIENNACL_GENERATOR_STATEMENT_REPRESENTATION_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/statement_representation_functor.hpp
+    @brief Functor to generate the string id of a statement
+*/
+
+#include <set>
+#include <cstring>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/generator/forwards.h"
+
+#include "viennacl/tools/shared_ptr.hpp"
+
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/ocl/kernel.hpp"
+
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/utils.hpp"
+#include "viennacl/generator/mapped_objects.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace detail{
+
+      /** @brief Helper class for the OpenCL kernel generator, representing a statement.
+       *
+       * Traverses a scheduler::statement and appends, for every leaf operand and every
+       * operator, a short string token to the caller-supplied character buffer. The
+       * resulting string identifies the statement's structure and the objects it
+       * involves, and is used as an opaque cache key for generated kernels. */
+      class statement_representation_functor : public traversal_functor{
+        private:
+          /** @brief Returns a small integer id identifying the given buffer handle.
+           *
+           * Linearly scans the NULL-terminated table of previously seen handles and
+           * registers an unseen handle in the first free slot. Previously a 65th
+           * distinct handle caused an out-of-bounds write to memory_[64]; this
+           * condition now throws instead (string-literal throw, consistent with the
+           * error style used elsewhere in the generator). */
+          unsigned int get_id(void * handle) const{
+            unsigned int i = 0;
+            for( ; i < 64 ; ++i){
+              void* current = memory_[i];
+              if(current==NULL)
+                break;
+              if(current==handle)
+                return i;
+            }
+            if(i==64)
+              throw "statement_representation_functor: more than 64 distinct handles in one statement";
+            memory_[i] = handle;
+            return i;
+          }
+
+          /** @brief Appends the decimal digits of val to ptr.
+           *
+           * NOTE: digits are emitted least-significant first, so e.g. 12 becomes "21".
+           * The mapping is still injective and the result is only used as an opaque
+           * cache key (never parsed back), so this is harmless. */
+          static void append_id(char * & ptr, unsigned int val){
+            if(val==0)
+              *ptr++='0';
+            else
+              while(val>0)
+              {
+                  *ptr++=static_cast<char>('0') + static_cast<char>(val % 10);
+                  val /= 10;
+              }
+          }
+
+        public:
+          typedef void result_type;
+
+          /** @brief Constructor. The unnamed unsigned int is unused; it is kept for
+           *  interface compatibility with the sibling extraction functor. */
+          statement_representation_functor(void* (&memory)[64], unsigned int , char *& ptr) : memory_(memory), ptr_(ptr){ }
+
+          /** @brief Host scalar mapping: "hs" + first letter of the scalar type. */
+          template<class ScalarType>
+          result_type operator()(ScalarType const & /*scal*/) const {
+            *ptr_++='h'; //host
+            *ptr_++='s'; //scalar
+            *ptr_++=utils::first_letter_of_type<ScalarType>::value();
+          }
+
+          /** @brief Device scalar mapping: "s" + type letter + handle id. */
+          template<class ScalarType>
+          result_type operator()(scalar<ScalarType> const & scal) const {
+            *ptr_++='s'; //scalar
+            *ptr_++=utils::first_letter_of_type<ScalarType>::value();
+            append_id(ptr_, get_id((void*)&scal));
+          }
+
+          /** @brief Vector mapping: "v" [+ 'r' if ranged] [+ 's' if strided] + type letter + handle id. */
+          template<class ScalarType>
+          result_type operator()(vector_base<ScalarType> const & vec) const {
+            *ptr_++='v'; //vector
+            if(viennacl::traits::start(vec)>0)
+              *ptr_++='r';
+            if(vec.stride()>1)
+              *ptr_++='s';
+            *ptr_++=utils::first_letter_of_type<ScalarType>::value();
+            append_id(ptr_, get_id((void*)&vec));
+          }
+
+          /** @brief Implicit vector mapping: "iv" [+ 'v' if the value is static] [+ 'i' if indexed] + type letter. */
+          template<class ScalarType>
+          result_type operator()(implicit_vector_base<ScalarType> const & vec) const {
+            *ptr_++='i'; //implicit
+            *ptr_++='v'; //vector
+            if(vec.is_value_static())
+              *ptr_++='v'; //value
+            if(vec.has_index())
+              *ptr_++='i';
+            *ptr_++=utils::first_letter_of_type<ScalarType>::value();
+          }
+
+          /** @brief Matrix mapping: "m" + per-dimension range/stride flags + type and layout letters + handle id. */
+          template<class ScalarType, class Layout>
+          result_type operator()(matrix_base<ScalarType, Layout> const & mat) const {
+            *ptr_++='m'; //matrix
+            if(viennacl::traits::start1(mat)>0)
+              *ptr_++='r';
+            if(viennacl::traits::stride1(mat)>1)
+              *ptr_++='s';
+            if(viennacl::traits::start2(mat)>0)
+              *ptr_++='r';
+            if(viennacl::traits::stride2(mat)>1)
+              *ptr_++='s';
+            *ptr_++=utils::first_letter_of_type<ScalarType>::value();
+            *ptr_++=utils::first_letter_of_type<Layout>::value();
+            append_id(ptr_, get_id((void*)&mat));
+          }
+
+          /** @brief Implicit matrix mapping: "im" [+ 'v' if the value is static] + type letter. */
+          template<class ScalarType>
+          result_type operator()(implicit_matrix_base<ScalarType> const & mat) const {
+            *ptr_++='i'; //implicit
+            *ptr_++='m'; //matrix
+            if(mat.is_value_static())
+              *ptr_++='v'; //value
+            *ptr_++=utils::first_letter_of_type<ScalarType>::value();
+          }
+
+          /** @brief Traversal entry point: appends leaf tokens and operator mnemonics to the buffer. */
+          void operator()(scheduler::statement const *, scheduler::statement_node const * root_node, detail::node_type node_type) const {
+            if(node_type==LHS_NODE_TYPE && root_node->lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+              utils::call_on_element(root_node->lhs, *this);
+            else if(node_type==RHS_NODE_TYPE && root_node->rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+              utils::call_on_element(root_node->rhs, *this);
+            else if(node_type==PARENT_NODE_TYPE){
+              const char * op_expr = detail::generate(root_node->op.type);
+              vcl_size_t n = std::strlen(op_expr);
+              std::memcpy(ptr_, op_expr, n);
+              ptr_+=n;
+            }
+          }
+
+        private:
+          void* (&memory_)[64]; // handle -> id lookup table, shared with the caller
+          char *& ptr_;         // write cursor into the caller's representation buffer
+      };
+
+    }
+
+  }
+
+}
+#endif
diff --git a/viennacl/generator/utils.hpp b/viennacl/generator/utils.hpp
new file mode 100644
index 0000000..950098f
--- /dev/null
+++ b/viennacl/generator/utils.hpp
@@ -0,0 +1,274 @@
+#ifndef VIENNACL_GENERATOR_UTILS_HPP
+#define VIENNACL_GENERATOR_UTILS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/utils.hpp
+    @brief Internal utils for a dynamic OpenCL kernel generation.
+*/
+
+#include <sstream>
+
+#include "viennacl/ocl/forwards.h"
+
+#include "viennacl/traits/size.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+namespace viennacl{
+
+  namespace generator{
+
+    namespace utils{
+
+    /** @brief Invokes fun on the host scalar (float or double) stored inside element.
+     *  @throws char const* ("not implemented") for any other numeric type. */
+    template<class Fun>
+    static typename Fun::result_type call_on_host_scalar(scheduler::lhs_rhs_element element, Fun const & fun){
+        assert(element.type_family == scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a host scalar"));
+        switch(element.numeric_type){
+        case scheduler::FLOAT_TYPE :
+            return fun(element.host_float);
+        case scheduler::DOUBLE_TYPE :
+            return fun(element.host_double);
+        default :
+            throw "not implemented";
+        }
+    }
+
+    /** @brief Invokes fun on the device scalar (viennacl::scalar<float/double>) referenced by element.
+     *  @throws char const* ("not implemented") for any other numeric type. */
+    template<class Fun>
+    static typename Fun::result_type call_on_scalar(scheduler::lhs_rhs_element element, Fun const & fun){
+        assert(element.type_family == scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a scalar"));
+        switch(element.numeric_type){
+        case scheduler::FLOAT_TYPE :
+            return fun(*element.scalar_float);
+        case scheduler::DOUBLE_TYPE :
+            return fun(*element.scalar_double);
+        default :
+            throw "not implemented";
+        }
+    }
+
+    /** @brief Invokes fun on the vector (float or double precision) referenced by element.
+     *  @throws char const* ("not implemented") for any other numeric type. */
+    template<class Fun>
+    static typename Fun::result_type call_on_vector(scheduler::lhs_rhs_element element, Fun const & fun){
+        assert(element.type_family == scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a vector"));
+        switch(element.numeric_type){
+        case scheduler::FLOAT_TYPE :
+            return fun(*element.vector_float);
+        case scheduler::DOUBLE_TYPE :
+            return fun(*element.vector_double);
+        default :
+            throw "not implemented";
+        }
+    }
+
+    /** @brief Invokes fun on the implicit vector (e.g. scalar_vector, unit_vector) referenced by element.
+     *  @throws char const* ("not implemented") for any other numeric type. */
+    template<class Fun>
+    static typename Fun::result_type call_on_implicit_vector(scheduler::lhs_rhs_element element, Fun const & fun){
+        assert(element.type_family == scheduler::VECTOR_TYPE_FAMILY   && bool("Must be called on a implicit_vector"));
+        assert(element.subtype     == scheduler::IMPLICIT_VECTOR_TYPE && bool("Must be called on a implicit_vector"));
+        switch(element.numeric_type){
+        case scheduler::FLOAT_TYPE :
+            return fun(*element.implicit_vector_float);
+        case scheduler::DOUBLE_TYPE :
+            return fun(*element.implicit_vector_double);
+        default :
+            throw "not implemented";
+        }
+    }
+
+    /** @brief Invokes fun on the dense matrix referenced by element.
+     *
+     * DENSE_ROW_MATRIX_TYPE dispatches to the row-major pointer; every other subtype
+     * is treated as dense column-major. NOTE(review): this assumes no further matrix
+     * subtypes can reach this function — confirm against the scheduler's type list.
+     *  @throws char const* ("not implemented") for any other numeric type. */
+    template<class Fun>
+    static typename Fun::result_type call_on_matrix(scheduler::lhs_rhs_element element, Fun const & fun){
+        assert(element.type_family == scheduler::MATRIX_TYPE_FAMILY && bool("Must be called on a matrix"));
+        if (element.subtype == scheduler::DENSE_ROW_MATRIX_TYPE)
+        {
+            switch(element.numeric_type){
+            case scheduler::FLOAT_TYPE :
+                return fun(*element.matrix_row_float);
+            case scheduler::DOUBLE_TYPE :
+                return fun(*element.matrix_row_double);
+            default :
+                throw "not implemented";
+            }
+        }
+        else
+        {
+            switch(element.numeric_type){
+            case scheduler::FLOAT_TYPE :
+                return fun(*element.matrix_col_float);
+            case scheduler::DOUBLE_TYPE :
+                return fun(*element.matrix_col_double);
+            default :
+                throw "not implemented";
+            }
+        }
+    }
+
+
+    /** @brief Invokes fun on the implicit matrix (e.g. scalar_matrix) referenced by element.
+     *
+     * NOTE(review): the assert messages say "matrix_vector"; they presumably mean
+     * "implicit_matrix" (copy-paste from the vector overload).
+     *  @throws char const* ("not implemented") for any other numeric type. */
+    template<class Fun>
+    static typename Fun::result_type call_on_implicit_matrix(scheduler::lhs_rhs_element element, Fun const & fun){
+        assert(element.type_family == scheduler::MATRIX_TYPE_FAMILY   && bool("Must be called on a matrix_vector"));
+        assert(element.subtype     == scheduler::IMPLICIT_MATRIX_TYPE && bool("Must be called on a matrix_vector"));
+        switch(element.numeric_type){
+        case scheduler::FLOAT_TYPE :
+            return fun(*element.implicit_matrix_float);
+        case scheduler::DOUBLE_TYPE :
+            return fun(*element.implicit_matrix_double);
+        default :
+            throw "not implemented";
+        }
+    }
+
+      /** @brief Central dispatcher: routes an lhs_rhs_element to the call_on_* helper
+       *  matching its type family and subtype, forwarding fun's result.
+       *
+       * Scalars split into host scalars vs device scalars; vectors and matrices split
+       * into implicit vs explicit. Each branch returns, so there is no fall-through.
+       *  @throws char const* ("not implemented") for an unknown type family. */
+      template<class Fun>
+      static typename Fun::result_type call_on_element(scheduler::lhs_rhs_element const & element, Fun const & fun){
+        switch(element.type_family){
+          case scheduler::SCALAR_TYPE_FAMILY:
+            if (element.subtype == scheduler::HOST_SCALAR_TYPE)
+              return call_on_host_scalar(element, fun);
+            else
+              return call_on_scalar(element, fun);
+          case scheduler::VECTOR_TYPE_FAMILY :
+            if (element.subtype == scheduler::IMPLICIT_VECTOR_TYPE)
+              return call_on_implicit_vector(element, fun);
+            else
+              return call_on_vector(element, fun);
+          case scheduler::MATRIX_TYPE_FAMILY:
+            if (element.subtype == scheduler::IMPLICIT_MATRIX_TYPE)
+              return call_on_implicit_matrix(element, fun);
+            else
+              return call_on_matrix(element,fun);
+          default:
+            throw "not implemented";
+        }
+      }
+
+      /** @brief Functor for returning the size of the underlying scalar type in bytes. */
+      struct scalartype_size_fun{
+          typedef vcl_size_t result_type;
+          result_type operator()(float const &) const { return sizeof(float); }
+          result_type operator()(double const &) const { return sizeof(double); }
+          // Generic fallback: size of T's cpu value type (e.g. viennacl::vector<float> -> sizeof(float)).
+          template<class T> result_type operator()(T const &) const { return sizeof(typename viennacl::result_of::cpu_value_type<T>::type); }
+      };
+
+      /** @brief Functor for returning the internal size of a vector, as reported by viennacl::traits::internal_size. */
+      struct internal_size_fun{
+          typedef vcl_size_t result_type;
+          template<class T>
+          result_type operator()(T const &t) const { return viennacl::traits::internal_size(t); }
+      };
+
+      /** @brief Functor for obtaining the raw OpenCL handle (cl_mem) from ViennaCL objects (vector, matrix, etc.). */
+      struct handle_fun{
+          typedef cl_mem result_type;
+          template<class T>
+          result_type operator()(T const &t) const { return t.handle().opencl_handle(); }
+      };
+
+      /** @brief Functor for obtaining the internal number of rows of a ViennaCL matrix. */
+      struct internal_size1_fun{
+          typedef vcl_size_t result_type;
+          template<class T>
+          result_type operator()(T const &t) const { return viennacl::traits::internal_size1(t); }
+      };
+
+      /** @brief Functor for obtaining the internal number of columns of a ViennaCL matrix. */
+      struct internal_size2_fun{
+          typedef vcl_size_t result_type;
+          template<class T>
+          result_type operator()(T const &t) const { return viennacl::traits::internal_size2(t); }
+      };
+
+      /** @brief Helper metafunction for checking whether two types are the same.
+       *  C++03-compatible stand-in for std::is_same (this header predates C++11). */
+      template<class T, class U>
+      struct is_same_type { enum { value = 0 }; };
+
+      /** \cond */
+      template<class T>
+      struct is_same_type<T,T> { enum { value = 1 }; };
+      /** \endcond */
+
+      /** @brief Converts any streamable value to its textual representation
+       *  via the default operator<< formatting. */
+      template <class T>
+      inline std::string to_string ( T const t )
+      {
+        std::ostringstream os;
+        os << t;
+        return os.str();
+      }
+
+      /** @brief Helper struct for converting a numerical type to its string representation.
+       *  The returned name is the OpenCL C spelling of the type; only float and double
+       *  are specialized — other types fail to compile, by design. */
+      template<class T>
+      struct type_to_string;
+
+
+      /** \cond */
+      template<> struct type_to_string<float> { static const char * value() { return "float"; } };
+      template<> struct type_to_string<double> { static const char * value() { return "double"; } };
+      /** \endcond */
+
+      /** @brief Helper struct for obtaining the first letter of a type. Used internally by the generator only.
+       *  'f'/'d' encode the scalar type, 'r'/'c' the row-/column-major matrix layout;
+       *  the letters are concatenated into kernel cache keys. */
+      template<class T>
+      struct first_letter_of_type;
+
+      /** \cond */
+      template<> struct first_letter_of_type<float> { static char value() { return 'f'; } };
+      template<> struct first_letter_of_type<double> { static char value() { return 'd'; } };
+      template<> struct first_letter_of_type<viennacl::row_major> { static char value() { return 'r'; } };
+      template<> struct first_letter_of_type<viennacl::column_major> { static char value() { return 'c'; } };
+      /** \endcond */
+
+      /** @brief A stream class where the kernel sources are streamed to. Takes care of indentation of the sources. */
+      class kernel_generation_stream : public std::ostream{
+        private:
+
+          /** @brief stringbuf that, on every sync (flush), writes the current
+           *  indentation followed by the buffered text to the target stream. */
+          class kgenstream : public std::stringbuf{
+            public:
+              kgenstream(std::ostringstream& oss,unsigned int const & tab_count) : oss_(oss), tab_count_(tab_count){ }
+              int sync() {
+                // Emit 4 spaces per indentation level, then the accumulated text.
+                for(unsigned int i=0 ; i<tab_count_;++i)
+                  oss_ << "    ";
+                oss_ << str();
+                str("");
+                // sync() contract: 0 on success; !oss_ is false (0) while the stream is good.
+                return !oss_;
+              }
+              // Flush any pending text before the buffer goes away.
+              ~kgenstream() {  pubsync(); }
+            private:
+              std::ostream& oss_;
+              unsigned int const & tab_count_;
+          };
+
+        public:
+          // NOTE: the base std::ostream is constructed with references to oss and
+          // tab_count_ before those members are themselves constructed. This is safe
+          // only because kgenstream merely stores the references and does not use
+          // them until streaming begins.
+          kernel_generation_stream() : std::ostream(new kgenstream(oss,tab_count_)), tab_count_(0){ }
+
+          /** @brief Returns everything streamed so far. */
+          std::string str(){ return oss.str(); }
+
+          void inc_tab(){ ++tab_count_; }
+
+          void dec_tab(){ --tab_count_; }
+
+          // The kgenstream allocated in the constructor is owned here; its destructor
+          // flushes pending text into oss before deletion.
+          ~kernel_generation_stream(){ delete rdbuf(); }
+
+        private:
+          unsigned int tab_count_;      // current indentation level
+          std::ostringstream oss;       // accumulated kernel source
+      };
+
+
+    }
+
+  }
+
+}
+#endif
diff --git a/viennacl/generator/vector_reduction.hpp b/viennacl/generator/vector_reduction.hpp
new file mode 100644
index 0000000..677bb2b
--- /dev/null
+++ b/viennacl/generator/vector_reduction.hpp
@@ -0,0 +1,243 @@
+#ifndef VIENNACL_GENERATOR_GENERATE_VECTOR_REDUCTION_HPP
+#define VIENNACL_GENERATOR_GENERATE_VECTOR_REDUCTION_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/generator/vector_reduction.hpp
+ *
+ * @brief Kernel template for the vector reduction operation
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/mapped_objects.hpp"
+#include "viennacl/generator/helpers.hpp"
+#include "viennacl/generator/utils.hpp"
+
+#include "viennacl/generator/profile_base.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl{
+
+  namespace generator{
+
+    /** @brief OpenCL kernel template for reductions resulting in a vector. Example: Computing the row norms of a matrix concurrently. */
+    class vector_reduction : public profile_base{
+
+        /** @brief Local memory used per work-group: one m_ x (k_+1) buffer per
+         *  expression; the +1 column pads each row (presumably to reduce local-memory
+         *  bank conflicts — confirm against the target hardware). */
+        vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
+          return m_*(k_+1)*scalartype_size;
+        }
+
+      public:
+        /** @brief The user constructor
+         * @param vectorization  vector width of the profile
+         * @param m              rows handled per work-group (local size 0)
+         * @param k              columns reduced in parallel (local size 1)
+         * @param num_groups     number of work-groups along dimension 0 */
+        vector_reduction(unsigned int vectorization, unsigned int m, unsigned int k, unsigned int num_groups) : profile_base(vectorization, m, k, 1), m_(m), k_(k), num_groups_(num_groups){ }
+
+        /** @brief Column headers matching csv_representation(). */
+        static std::string csv_format() {
+          return "Vec,M,K,NumGroups";
+        }
+
+        /** @brief Serializes the profile parameters in csv_format() order. */
+        std::string csv_representation() const{
+          std::ostringstream oss;
+          oss << vector_size_
+                 << "," << m_
+                 << "," << k_
+                 << "," << num_groups_;
+          return oss.str();
+        }
+
+        unsigned int m() const { return m_; }
+
+        unsigned int k() const { return k_; }
+
+        unsigned int num_groups() const { return num_groups_; }
+
+        /** @brief Sets the NDRange and binds the matrix size arguments (M, N) of the
+         *  first matrix-vector product found in the statements, then returns. */
+        void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & kernel, unsigned int & n_arg)  const{
+
+          configure_local_sizes(kernel, kernel_id);
+          kernel.global_work_size(0,m_*num_groups_);
+          kernel.global_work_size(1,k_);
+
+
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            //Bind a const reference: avoids copying the whole expression array per statement.
+            scheduler::statement::container_type const & exprs = it->first.array();
+            for(scheduler::statement::container_type::const_iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
+              if(iit->op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE){
+                scheduler::statement_node const * current_node = &(*iit);
+                //The LHS of the prod is a matrix
+                if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
+                {
+                  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
+                  kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
+                  return;
+                }
+                else{
+                  //The LHS of the prod is a matrix expression: descend one level
+                  current_node = &exprs[current_node->lhs.node_index];
+                  if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
+                  {
+                    kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
+                    kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
+                    return;
+                  }
+                  else if(current_node->rhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
+                  {
+                    //Fixed: previously queried current_node->lhs in this branch even
+                    //though the matrix sits on the RHS of the sub-expression.
+                    kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->rhs, utils::internal_size1_fun())));
+                    kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->rhs, utils::internal_size2_fun())));
+                    return;
+                  }
+                  else{
+                    assert(false && bool("unexpected expression tree"));
+                  }
+                }
+                return;
+              }
+            }
+          }
+        }
+
+        /** @brief Declares the two size arguments (M, N) that every generated kernel takes. */
+        void kernel_arguments(statements_type  const & /*statements*/, std::string & arguments_string) const{
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
+          arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
+        }
+
+      private:
+        /** @brief Emits the kernel body: each row is reduced by k_ work-items that
+         *  accumulate partial sums in a padded local-memory buffer, followed by a
+         *  tree reduction over lid1. Assumes k_ is a power of two (the reduction
+         *  loop halves the stride and the tail combines indices 0 and 1). */
+        void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
+
+          //Collect all vector-reduction nodes and bind matrix sizes to the M/N kernel arguments.
+          std::vector<detail::mapped_vector_reduction*> exprs;
+          for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it){
+            for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit){
+              if(detail::mapped_vector_reduction * p = dynamic_cast<detail::mapped_vector_reduction*>(iit->second.get()))
+                exprs.push_back(p);
+              if(detail::mapped_matrix * p = dynamic_cast<detail::mapped_matrix*>(iit->second.get()))
+                p->bind_sizes("M","N");
+            }
+          }
+
+          vcl_size_t lsize1 = m_;
+          vcl_size_t lsize2 = k_+1; //padded row length of the local buffers
+          //Fixed: previously hard-coded to "float", which generated single-precision
+          //accumulators for double-precision statements; derive it from the mapped
+          //expression instead.
+          std::string scalartype = exprs.empty() ? "float" : exprs.front()->scalartype();
+          bool is_lhs_transposed = false;
+          if(exprs.front()->root_node().lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+            if(exprs.front()->statement().array()[exprs.front()->root_node().lhs.node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
+              is_lhs_transposed = true;
+
+          //For trans(A)*x, rows and columns swap roles in the generated loops.
+          std::string size1 = "M", size2 = "N";
+          if(is_lhs_transposed)
+            std::swap(size1, size2);
+
+          for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
+            stream << "__local " <<  (*it)->scalartype() << " buf" << std::distance(exprs.begin(), it) << '[' << lsize1*lsize2 << "];" << std::endl;
+          }
+
+          stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
+          stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
+
+
+          stream << "for(unsigned int r = get_global_id(0) ; r < " << size1 << " ; r += get_global_size(0)){" << std::endl;
+          stream.inc_tab();
+
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
+            stream << scalartype << " sum" << k << " = 0;" << std::endl;
+
+          stream << "for(unsigned int c = get_local_id(1) ; c < " << size2 << " ; c += get_local_size(1)){" << std::endl;
+          stream.inc_tab();
+
+          //Track fetched operands so each is loaded only once per iteration.
+          std::set<std::string>  fetched;
+
+          for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
+            viennacl::scheduler::statement const & statement = (*it)->statement();
+            viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
+            if(is_lhs_transposed)
+              detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("c", "r"),vector_size_,stream,(*it)->mapping());
+            else
+              detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("r", "c"),vector_size_,stream,(*it)->mapping());
+
+            detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("c", "0"),vector_size_,stream,(*it)->mapping());
+          }
+
+
+          //Update sums;
+          for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
+            viennacl::scheduler::statement const & statement = (*it)->statement();
+            viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
+            std::string str;
+            detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
+            str += "*";
+            detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
+            stream << " sum" << std::distance(exprs.begin(),it) << " += "  << str << ";" << std::endl;
+          }
+
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+
+          //Write the per-work-item partial sums into the padded local buffers.
+          for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
+            stream << "buf" << k << "[lid0*" << lsize2 << "+ lid1] = sum" << k << ";" << std::endl;
+          }
+
+          //Tree reduction over lid1; the final step (stride 1) is handled below.
+          for(unsigned int stride = k_/2 ; stride>1 ; stride /=2){
+            stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
+            stream <<  "if(lid1 < " << stride << ")" ;
+            stream << "{" << std::endl;
+            stream.inc_tab();
+
+            for(vcl_size_t i = 0 ; i < exprs.size() ; ++i)
+              stream << "buf" << i << "[lid0*" << lsize2 << "+ lid1] += buf" << i << "[lid0*" << lsize2 << "+ lid1 + " << stride << "];" << std::endl;
+
+            stream.dec_tab();
+            stream << "}" << std::endl;
+          }
+
+
+          //Combine the last two partials and write the result back.
+          stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
+          stream <<  "if(lid1 == 0)" ;
+          stream << "{" << std::endl;
+          stream.inc_tab();
+          for(vcl_size_t i = 0 ; i < exprs.size() ; ++i){
+            stream << "buf" << i << "[lid0*" << lsize2 << "] += buf" << i << "[lid0*" << lsize2 << "+ 1];" << std::endl;
+            exprs[i]->access_name("buf"+utils::to_string(i)+"[lid0*"+utils::to_string(lsize2)+"]");
+          }
+          vcl_size_t i = 0;
+          for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
+            std::string str;
+            detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("r","0"), -1, str, mapping[i++]), false);
+            stream << str << ";" << std::endl;
+          }
+          stream.dec_tab();
+          stream << "}" << std::endl;
+
+
+          stream.dec_tab();
+          stream << "}" << std::endl;
+
+        }
+
+      private:
+        unsigned int m_;          // rows per work-group
+        unsigned int k_;          // parallel reduction width
+        unsigned int num_groups_; // work-groups along dimension 0
+    };
+  }
+}
+
+#endif
diff --git a/viennacl/hankel_matrix.hpp b/viennacl/hankel_matrix.hpp
index 9c2c8e3..e884901 100644
--- a/viennacl/hankel_matrix.hpp
+++ b/viennacl/hankel_matrix.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_HANKEL_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,12 +20,12 @@
 
 
 /** @file hankel_matrix.hpp
-    @brief Implementation of the hankel_matrix class for efficient manipulation of Hankel matrices.  Experimental in 1.2.x.
+    @brief Implementation of the hankel_matrix class for efficient manipulation of Hankel matrices.  Experimental.
 */
 
 #include "viennacl/forwards.h"
 #include "viennacl/vector.hpp"
-#include "viennacl/ocl/context.hpp"
+#include "viennacl/ocl/backend.hpp"
 
 #include "viennacl/toeplitz_matrix.hpp"
 #include "viennacl/fft.hpp"
@@ -38,17 +39,17 @@ namespace viennacl {
     * @tparam ALIGNMENT    The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values or usually 4, 8 or 16, higher values are usually a waste of memory.
     */
     template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class hankel_matrix 
+    class hankel_matrix
     {
-    public:
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+
         /**
          * @brief The default constructor. Does not allocate any memory.
          *
          */
-        explicit hankel_matrix()
-        {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-        }
+        explicit hankel_matrix() {}
 
         /**
          * @brief         Creates the matrix with the given size
@@ -56,10 +57,10 @@ namespace viennacl {
          * @param rows      Number of rows of the matrix
          * @param cols      Number of columns of the matrix
          */
-        explicit hankel_matrix(std::size_t rows, std::size_t cols) : elements_(rows, cols)
+        explicit hankel_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows, cols)
         {
-          assert(rows == cols && "Hankel matrix must be square!");
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
+          assert(rows == cols && bool("Hankel matrix must be square!"));
+          (void)cols;  // avoid 'unused parameter' warning in optimized builds
         }
 
         /** @brief Resizes the matrix.
@@ -68,7 +69,7 @@ namespace viennacl {
         * @param sz         New size of matrix
         * @param preserve   If true, existing values are preserved.
         */
-        void resize(size_t sz, bool preserve = true)
+        void resize(vcl_size_t sz, bool preserve = true)
         {
             elements_.resize(sz, preserve);
         }
@@ -77,7 +78,7 @@ namespace viennacl {
         *
         *   @return OpenCL handle
         */
-        viennacl::ocl::handle<cl_mem> handle() const { return elements_.handle(); }
+        handle_type const & handle() const { return elements_.handle(); }
 
         /**
          * @brief Returns an internal viennacl::toeplitz_matrix, which represents a Hankel matrix elements
@@ -89,19 +90,19 @@ namespace viennacl {
         /**
          * @brief Returns the number of rows of the matrix
          */
-        std::size_t size1() const { return elements_.size1(); }
-        
+        vcl_size_t size1() const { return elements_.size1(); }
+
         /**
          * @brief Returns the number of columns of the matrix
          */
-        std::size_t size2() const { return elements_.size2(); }
+        vcl_size_t size2() const { return elements_.size2(); }
 
         /** @brief Returns the internal size of matrix representtion.
         *   Usually required for launching OpenCL kernels only
         *
         *   @return Internal size of matrix representation
         */
-        std::size_t internal_size() const { return elements_.internal_size(); }
+        vcl_size_t internal_size() const { return elements_.internal_size(); }
 
         /**
          * @brief Read-write access to a element of the matrix
@@ -112,8 +113,8 @@ namespace viennacl {
          */
         entry_proxy<SCALARTYPE> operator()(unsigned int row_index, unsigned int col_index)
         {
-            assert(row_index < size1() && col_index < size2() && "Invalid access");
-            
+            assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
             return elements_(size1() - row_index - 1, col_index);
         }
 
@@ -130,9 +131,9 @@ namespace viennacl {
         }
 
     private:
-        hankel_matrix(hankel_matrix const & t) {}
-        hankel_matrix & operator=(hankel_matrix const & t) {}
-      
+        hankel_matrix(hankel_matrix const &) {}
+        hankel_matrix & operator=(hankel_matrix const & t);
+
         toeplitz_matrix<SCALARTYPE, ALIGNMENT> elements_;
     };
 
@@ -145,7 +146,7 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(std::vector<SCALARTYPE> const & cpu_vec, hankel_matrix<SCALARTYPE, ALIGNMENT> & gpu_mat)
     {
-        assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && "Size mismatch");
+        assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && bool("Size mismatch"));
 
         copy(cpu_vec, gpu_mat.elements());
     }
@@ -159,7 +160,7 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(hankel_matrix<SCALARTYPE, ALIGNMENT> const & gpu_mat, std::vector<SCALARTYPE> & cpu_vec)
     {
-        assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && "Size mismatch");
+        assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && bool("Size mismatch"));
 
         copy(gpu_mat.elements(), cpu_vec);
     }
@@ -173,17 +174,18 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
     void copy(hankel_matrix<SCALARTYPE, ALIGNMENT> const & han_src, MATRIXTYPE& com_dst)
     {
-        std::size_t size = han_src.size1();
-        assert(size == com_dst.size1() && "Size mismatch");
-        assert(size == com_dst.size2() && "Size mismatch");
+        assert( (viennacl::traits::size1(com_dst) == han_src.size1()) && bool("Size mismatch") );
+        assert( (viennacl::traits::size2(com_dst) == han_src.size2()) && bool("Size mismatch") );
+
+        vcl_size_t size = han_src.size1();
         std::vector<SCALARTYPE> tmp(size * 2 - 1);
         copy(han_src, tmp);
 
-        for (std::size_t i = 0; i < size; i++)
-            for (std::size_t j = 0; j < size; j++)
+        for (vcl_size_t i = 0; i < size; i++)
+            for (vcl_size_t j = 0; j < size; j++)
                 com_dst(i, j) = tmp[i + j];
     }
-    
+
     /** @brief Copies a the matrix-like object to the Hankel matrix from the OpenCL device (either GPU or multi-core CPU)
     *
     *
@@ -193,16 +195,18 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
     void copy(MATRIXTYPE const & com_src, hankel_matrix<SCALARTYPE, ALIGNMENT>& han_dst)
     {
-        std::size_t size = han_dst.size1();
-        assert(size == com_src.size1() && "Size mismatch");
-        assert(size == com_src.size2() && "Size mismatch");
+        assert( (han_dst.size1() == 0 || viennacl::traits::size1(com_src) == han_dst.size1()) && bool("Size mismatch") );
+        assert( (han_dst.size2() == 0 || viennacl::traits::size2(com_src) == han_dst.size2()) && bool("Size mismatch") );
+        assert( viennacl::traits::size2(com_src) == viennacl::traits::size1(com_src) && bool("Logic error: non-square Hankel matrix!") );
+
+        vcl_size_t size = viennacl::traits::size1(com_src);
 
         std::vector<SCALARTYPE> tmp(2*size - 1);
 
-        for (std::size_t i = 0; i < size; i++)
+        for (vcl_size_t i = 0; i < size; i++)
             tmp[i] = com_src(0, i);
 
-        for (std::size_t i = 1; i < size; i++)
+        for (vcl_size_t i = 1; i < size; i++)
             tmp[size + i - 1] = com_src(size - 1, i);
 
         viennacl::copy(tmp, han_dst);
@@ -220,14 +224,14 @@ namespace viennacl {
     template<class SCALARTYPE, unsigned int ALIGNMENT>
     std::ostream & operator<<(std::ostream & s, hankel_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix)
     {
-        std::size_t size = gpu_matrix.size1();
+        vcl_size_t size = gpu_matrix.size1();
         std::vector<SCALARTYPE> tmp(2*size - 1);
         copy(gpu_matrix, tmp);
         s << "[" << size << "," << size << "](";
 
-        for(std::size_t i = 0; i < size; i++) {
+        for(vcl_size_t i = 0; i < size; i++) {
             s << "(";
-            for(std::size_t j = 0; j < size; j++) {
+            for(vcl_size_t j = 0; j < size; j++) {
                 s << tmp[i + j];
                 //s << (int)i - (int)j;
                 if(j < (size - 1)) s << ",";
@@ -237,5 +241,100 @@ namespace viennacl {
         s << ")";
         return s;
     }
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x: the result buffer must not
+              // alias the product argument, so compute into a temporary first
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        // x += A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        // x -= A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x += A * vec_op
+        // Fixed: the inner vector_expression was missing its 'const' qualifiers,
+        // so this specialization could never match the expression type built by
+        // the library (compare the op_assign / op_inplace_sub siblings and the
+        // hyb_matrix counterparts, which all use 'const vector_expression<...>').
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x -= A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+
+
+     } // namespace detail
+   } // namespace linalg
+
+   /** \endcond */
 }
-#endif // _VIENNACL_HANKEL_MATRIX_HPP
+#endif // VIENNACL_HANKEL_MATRIX_HPP
diff --git a/viennacl/hyb_matrix.hpp b/viennacl/hyb_matrix.hpp
new file mode 100644
index 0000000..d04de34
--- /dev/null
+++ b/viennacl/hyb_matrix.hpp
@@ -0,0 +1,368 @@
+#ifndef VIENNACL_HYB_MATRIX_HPP_
+#define VIENNACL_HYB_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/hyb_matrix.hpp
+    @brief Implementation of the hyb_matrix class
+
+    Contributed by Volodymyr Kysenko.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+    /** @brief Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros. */
+    template<typename SCALARTYPE, unsigned int ALIGNMENT  /* see forwards.h for default argument */>
+    class hyb_matrix
+    {
+      public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+
+        /** @brief Default constructor. Does not allocate any memory.
+        *
+        * Fixed: ellnnz_ and csrnnz_ were previously left uninitialized, so
+        * ell_nnz(), csr_nnz() and internal_ellnnz() returned indeterminate
+        * values on a default-constructed matrix.
+        */
+        hyb_matrix() : csr_threshold_(SCALARTYPE(0.8)), rows_(0), cols_(0), ellnnz_(0), csrnnz_(0) {}
+
+        /** @brief Constructor which binds all memory buffers to the memory domain of the provided context. */
+        hyb_matrix(viennacl::context ctx) : csr_threshold_(SCALARTYPE(0.8)), rows_(0), cols_(0), ellnnz_(0), csrnnz_(0)
+        {
+          ell_coords_.switch_active_handle_id(ctx.memory_type());
+          ell_elements_.switch_active_handle_id(ctx.memory_type());
+
+          csr_rows_.switch_active_handle_id(ctx.memory_type());
+          csr_cols_.switch_active_handle_id(ctx.memory_type());
+          csr_elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+          if (ctx.memory_type() == OPENCL_MEMORY)
+          {
+            ell_coords_.opencl_handle().context(ctx.opencl_context());
+            ell_elements_.opencl_handle().context(ctx.opencl_context());
+
+            csr_rows_.opencl_handle().context(ctx.opencl_context());
+            csr_cols_.opencl_handle().context(ctx.opencl_context());
+            csr_elements_.opencl_handle().context(ctx.opencl_context());
+          }
+#endif
+        }
+
+        /** @brief Fraction of rows (in [0,1]) whose entries should be fully held by the ELL part; the remaining entries spill into the CSR part. */
+        SCALARTYPE  csr_threshold()  const { return csr_threshold_; }
+        void csr_threshold(SCALARTYPE thr) { csr_threshold_ = thr; }
+
+        /** @brief Number of rows padded to a multiple of ALIGNMENT (used for kernel launches). */
+        vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, ALIGNMENT); }
+        /** @brief Number of columns padded to a multiple of ALIGNMENT. */
+        vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, ALIGNMENT); }
+
+        vcl_size_t size1() const { return rows_; }
+        vcl_size_t size2() const { return cols_; }
+
+        /** @brief Entries-per-row of the ELL part, padded to a multiple of ALIGNMENT. */
+        vcl_size_t internal_ellnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(ellnnz_, ALIGNMENT); }
+        vcl_size_t ell_nnz() const { return ellnnz_; }
+        vcl_size_t csr_nnz() const { return csrnnz_; }
+
+        // Raw memory handles: ELL part first (values, coordinates), then the CSR spill structure.
+        const handle_type & handle() const { return ell_elements_; }
+        const handle_type & handle2() const { return ell_coords_; }
+        const handle_type & handle3() const { return csr_rows_; }
+        const handle_type & handle4() const { return csr_cols_; }
+        const handle_type & handle5() const { return csr_elements_; }
+
+      public:
+      #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
+        template <typename CPU_MATRIX>
+        friend void copy(const CPU_MATRIX & cpu_matrix, hyb_matrix & gpu_matrix );
+      #else
+        template <typename CPU_MATRIX, typename T, unsigned int ALIGN>
+        friend void copy(const CPU_MATRIX & cpu_matrix, hyb_matrix<T, ALIGN> & gpu_matrix );
+      #endif
+
+      private:
+        SCALARTYPE  csr_threshold_;   // target fraction of rows fully covered by the ELL part
+        vcl_size_t rows_;
+        vcl_size_t cols_;
+        vcl_size_t ellnnz_;           // entries per row stored in ELL format
+        vcl_size_t csrnnz_;           // number of spilled entries stored in CSR format
+
+        handle_type ell_coords_;      // ELL column indices
+        handle_type ell_elements_;    // ELL values
+
+        handle_type csr_rows_;
+        handle_type csr_cols_;
+        handle_type csr_elements_;
+    };
+
+    /** @brief Copies a sparse host matrix to the compute device, splitting nonzeros
+    *          into an ELL part (up to a common per-row width) and a CSR overflow part.
+    *
+    * @param cpu_matrix   Host sparse matrix exposing the iterator1/iterator2 interface
+    * @param gpu_matrix   The hyb_matrix to be populated
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const CPU_MATRIX& cpu_matrix, hyb_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix )
+    {
+      assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if(cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+      {
+        //determine max capacity for row:
+        //histogram over the number of entries per row. A row can hold up to
+        //size2() entries, so the histogram needs size2()+1 bins. Fixed: the
+        //previous size of size1()+1 wrote out of bounds for matrices with
+        //more columns than rows.
+        vcl_size_t max_entries_per_row = 0;
+        std::vector<vcl_size_t> hist_entries(cpu_matrix.size2() + 1, 0);
+
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+        {
+            vcl_size_t num_entries = 0;
+            for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+            {
+                ++num_entries;
+            }
+
+            hist_entries[num_entries] += 1;
+            max_entries_per_row = std::max(max_entries_per_row, num_entries);
+        }
+
+        //pick the smallest ELL width such that at least csr_threshold * size1()
+        //rows fit entirely into the ELL part
+        vcl_size_t sum = 0;
+        for(vcl_size_t ind = 0; ind <= max_entries_per_row; ind++)
+        {
+            sum += hist_entries[ind];
+
+            if(sum >= gpu_matrix.csr_threshold() * cpu_matrix.size1())
+            {
+                max_entries_per_row = ind;
+                break;
+            }
+        }
+
+        //setup GPU matrix
+        gpu_matrix.ellnnz_ = max_entries_per_row;
+        gpu_matrix.rows_ = cpu_matrix.size1();
+        gpu_matrix.cols_ = cpu_matrix.size2();
+
+        vcl_size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
+
+        viennacl::backend::typesafe_host_array<unsigned int>  ell_coords(gpu_matrix.ell_coords_, nnz);
+        viennacl::backend::typesafe_host_array<unsigned int>  csr_rows(gpu_matrix.csr_rows_, cpu_matrix.size1() + 1);
+        std::vector<unsigned int> csr_cols;
+
+        std::vector<SCALARTYPE> ell_elements(nnz);
+        std::vector<SCALARTYPE> csr_elements;
+
+        vcl_size_t csr_index = 0;
+
+        for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+        {
+          vcl_size_t data_index = 0;
+
+          csr_rows.set(row_it.index1(), csr_index);
+
+          for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+          {
+            //first max_entries_per_row entries of a row go to the ELL part
+            //(column-major layout with internal_size1() stride), the rest
+            //spills into the CSR arrays
+            if(data_index < max_entries_per_row)
+            {
+                ell_coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+                ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+            }
+            else
+            {
+                csr_cols.push_back(static_cast<unsigned int>(col_it.index2()));
+                csr_elements.push_back(*col_it);
+
+                csr_index++;
+            }
+
+            data_index++;
+          }
+
+        }
+
+        //CSR buffers must not be empty when created on the device, so insert a
+        //single explicit zero entry if nothing spilled over
+        if(csr_cols.empty())
+        {
+          csr_cols.push_back(0);
+          csr_elements.push_back(0);
+        }
+
+        csr_rows.set(csr_rows.size() - 1, csr_index);
+
+        gpu_matrix.csrnnz_ = csr_cols.size();
+
+        viennacl::backend::typesafe_host_array<unsigned int> csr_cols_for_gpu(gpu_matrix.csr_cols_, csr_cols.size());
+        for (vcl_size_t i=0; i<csr_cols.size(); ++i)
+          csr_cols_for_gpu.set(i, csr_cols[i]);
+
+        viennacl::backend::memory_create(gpu_matrix.ell_coords_,   ell_coords.raw_size(),                    traits::context(gpu_matrix.ell_coords_), ell_coords.get());
+        viennacl::backend::memory_create(gpu_matrix.ell_elements_, sizeof(SCALARTYPE) * ell_elements.size(), traits::context(gpu_matrix.ell_elements_), &(ell_elements[0]));
+
+        viennacl::backend::memory_create(gpu_matrix.csr_rows_,     csr_rows.raw_size(),                      traits::context(gpu_matrix.csr_rows_), csr_rows.get());
+        viennacl::backend::memory_create(gpu_matrix.csr_cols_,     csr_cols_for_gpu.raw_size(),              traits::context(gpu_matrix.csr_cols_), csr_cols_for_gpu.get());
+        viennacl::backend::memory_create(gpu_matrix.csr_elements_, sizeof(SCALARTYPE) * csr_elements.size(), traits::context(gpu_matrix.csr_elements_), &(csr_elements[0]));
+      }
+    }
+
+    /** @brief Copies a hyb_matrix from the compute device back into a host matrix.
+    *
+    * Reads the ELL and CSR buffers from the device and writes each nonzero into
+    * cpu_matrix via operator()(row, col). Zero-valued padding entries are skipped.
+    *
+    * @param gpu_matrix   The device hyb_matrix (source)
+    * @param cpu_matrix   Host matrix (destination); must already have matching dimensions
+    */
+    template <typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
+    void copy(const hyb_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix, CPU_MATRIX& cpu_matrix)
+    {
+      assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+      assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+      if(gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+      {
+        // host-side staging buffers for the ELL part ...
+        std::vector<SCALARTYPE> ell_elements(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+        viennacl::backend::typesafe_host_array<unsigned int> ell_coords(gpu_matrix.handle2(), gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+
+        // ... and for the CSR spill part
+        std::vector<SCALARTYPE> csr_elements(gpu_matrix.csr_nnz());
+        viennacl::backend::typesafe_host_array<unsigned int> csr_rows(gpu_matrix.handle3(), gpu_matrix.size1() + 1);
+        viennacl::backend::typesafe_host_array<unsigned int> csr_cols(gpu_matrix.handle4(), gpu_matrix.csr_nnz());
+
+        viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE) * ell_elements.size(), &(ell_elements[0]));
+        viennacl::backend::memory_read(gpu_matrix.handle2(), 0, ell_coords.raw_size(), ell_coords.get());
+        viennacl::backend::memory_read(gpu_matrix.handle3(), 0, csr_rows.raw_size(),   csr_rows.get());
+        viennacl::backend::memory_read(gpu_matrix.handle4(), 0, csr_cols.raw_size(),   csr_cols.get());
+        viennacl::backend::memory_read(gpu_matrix.handle5(), 0, sizeof(SCALARTYPE) * csr_elements.size(), &(csr_elements[0]));
+
+
+        for(vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+        {
+          // ELL part: column-major layout, stride internal_size1(); exact zeros
+          // are padding and therefore skipped
+          for(vcl_size_t ind = 0; ind < gpu_matrix.internal_ellnnz(); ind++)
+          {
+            vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+
+            if(ell_elements[offset] == static_cast<SCALARTYPE>(0.0))
+            {
+              continue;
+            }
+
+            // defensive check against corrupted device data
+            if(ell_coords[offset] >= gpu_matrix.size2())
+            {
+              std::cerr << "ViennaCL encountered invalid data " << offset << " " << ind << " " << row << " " << ell_coords[offset] << " " << gpu_matrix.size2() << std::endl;
+              return;
+            }
+
+            cpu_matrix(row, ell_coords[offset]) = ell_elements[offset];
+          }
+
+          // CSR part: entries of this row spilled beyond the ELL width
+          for(vcl_size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
+          {
+            if(csr_elements[ind] == static_cast<SCALARTYPE>(0.0))
+            {
+              continue;
+            }
+
+            if(csr_cols[ind] >= gpu_matrix.size2())
+            {
+              std::cerr << "ViennaCL encountered invalid data " << std::endl;
+              return;
+            }
+
+            cpu_matrix(row, csr_cols[ind]) = csr_elements[ind];
+          }
+        }
+      }
+    }
+
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        /** @brief Dispatch for x = A * y with a hybrid ELL/CSR matrix A. */
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // Fast path when the result does not alias the argument;
+              // otherwise x = A * x is routed through a scratch vector.
+              if (viennacl::traits::handle(lhs) != viennacl::traits::handle(rhs.rhs()))
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+              else
+              {
+                viennacl::vector<T> scratch(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), scratch);
+                lhs = scratch;
+              }
+            }
+        };
+
+        /** @brief Dispatch for x += A * y. */
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> scratch(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), scratch);
+              lhs += scratch;
+            }
+        };
+
+        /** @brief Dispatch for x -= A * y. */
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> scratch(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), scratch);
+              lhs -= scratch;
+            }
+        };
+
+
+        /** @brief Dispatch for x = A * vec_expr: the vector expression is evaluated into a temporary first. */
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> evaluated_arg(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::linalg::prod_impl(rhs.lhs(), evaluated_arg, lhs);
+            }
+        };
+
+        /** @brief Dispatch for x += A * vec_expr. */
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> evaluated_arg(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> accum(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), evaluated_arg, accum);
+              lhs += accum;
+            }
+        };
+
+        /** @brief Dispatch for x -= A * vec_expr. */
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> evaluated_arg(rhs.rhs(), viennacl::traits::context(rhs));
+              viennacl::vector<T> accum(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), evaluated_arg, accum);
+              lhs -= accum;
+            }
+        };
+
+      } // namespace detail
+    } // namespace linalg
+
+    /** \endcond */
+}
+
+#endif
diff --git a/viennacl/io/kernel_parameters.hpp b/viennacl/io/kernel_parameters.hpp
deleted file mode 100644
index 21f4152..0000000
--- a/viennacl/io/kernel_parameters.hpp
+++ /dev/null
@@ -1,446 +0,0 @@
-#ifndef VIENNACL_IO_KERNEL_PARAMETERS_HPP
-#define VIENNACL_IO_KERNEL_PARAMETERS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-
-/** @file kernel_parameters.hpp
-    @brief This file holds the code necessary for reading kernel parameters from XML files using pugixml
-*/
-
-#include "viennacl/ocl/backend.hpp"
-#include "pugixml/src/pugixml.hpp"
-
-namespace viennacl
-{
-  namespace io 
-  {
-    namespace tag 
-    {
-      static std::string root     = "parameters";
-      static std::string devices  = "devices";   
-      static std::string device   = "device";   
-      static std::string name     = "name";
-      static std::string driver   = "driver";
-      static std::string compun   = "computeunits";      
-      static std::string workgrp  = "workgroupsize";            
-      static std::string tests    = "tests";
-      static std::string test     = "test";                     
-      static std::string numeric  = "numeric";
-      static std::string kernels  = "kernels";
-      static std::string kernel   = "kernel";
-      static std::string params   = "params";
-      static std::string param    = "param";
-      static std::string value    = "value";   
-      static std::string alignment = "alignment";   
-    } // end namespace tag
-
-    namespace val {
-      static std::string globsize = "globalsize";
-      static std::string locsize  = "localsize";   
-      static std::string vec      = "vector";   
-      static std::string matrix   = "matrix";   
-      static std::string compmat  = "compressed_matrix";
-      static std::string fl       = "float";   
-      static std::string dbl      = "double";      
-    }
-
-    /** @brief  A XML parameter database using PugiXML. Allows to add tests for different devices and the like */
-    struct parameter_database 
-    {
-      parameter_database ()
-      {
-          root = doc.append_child();
-          root.set_name(tag::root.c_str());
-          last = root;
-          
-          devices_open = false;
-          tests_open = false;      
-          kernels_open = false;
-          parameters_open = false;      
-      }   
-      
-      void add_device()
-      {
-          pugi::xml_node dev;
-          if(devices_open)
-          {
-            dev = devices.append_child();
-            dev.set_name(tag::device.c_str());      
-          }
-          else
-          {
-            devices = last.append_child();
-            devices.set_name(tag::devices.c_str());
-            
-            dev = devices.append_child();
-            dev.set_name(tag::device.c_str());
-            
-            devices_open = true;
-          }
-          last = dev;
-      }
-      
-      void add_test()
-      {
-          pugi::xml_node test;
-          if(tests_open)
-          {
-            test = tests.append_child();
-            test.set_name(tag::test.c_str());      
-          }
-          else
-          {
-            tests = last.append_child();
-            tests.set_name(tag::tests.c_str());
-            
-            test = tests.append_child();
-            test.set_name(tag::test.c_str());
-            
-            tests_open = true;
-          }
-          last = test;
-          // close the current kernels section
-          // so a new one is created for this new test
-          kernels_open = false;      
-      }   
-
-      void add_kernel()
-      {
-          pugi::xml_node kern;
-          if(kernels_open)
-          {
-            kern = kernels.append_child();
-            kern.set_name(tag::kernel.c_str());      
-          }
-          else
-          {
-            kernels = last.append_child();
-            kernels.set_name(tag::kernels.c_str());
-            
-            kern = kernels.append_child();
-            kern.set_name(tag::kernel.c_str());
-            
-            kernels_open = true;
-          }
-          last = kern;
-          
-          // close the current parameters section
-          // so a new one is created for this new kernel
-          parameters_open = false;
-      }      
-      
-      void add_parameter()
-      {
-          pugi::xml_node para;
-          
-          if(parameters_open)
-          {
-            para = parameters.append_child();
-            para.set_name(tag::param.c_str());      
-          }
-          else
-          {
-            parameters = last.append_child();
-            parameters.set_name(tag::params.c_str());
-            
-            para = parameters.append_child();
-            para.set_name(tag::param.c_str());
-            
-            parameters_open = true;
-          }
-          last = para;
-      }         
-      
-      template<typename ValueT>
-      void add_data_node(std::string tagstr, ValueT data)
-      {
-          std::stringstream ss;
-          ss << data;
-          add_data_node(tagstr, ss.str());
-      }   
-      
-      void add_data_node(std::string tagstr, std::string data)
-      {
-          pugi::xml_node node = last.append_child();
-          
-          if(tagstr == tag::name)
-            node.set_name(tag::name.c_str());
-          else if(tagstr == tag::driver)
-            node.set_name(tag::driver.c_str());      
-          else if(tagstr == tag::numeric)
-            node.set_name(tag::numeric.c_str());      
-          else if(tagstr == tag::alignment)
-            node.set_name(tag::alignment.c_str());      
-          else if(tagstr == tag::value)
-            node.set_name(tag::value.c_str());      
-          else if(tagstr == tag::compun)
-            node.set_name(tag::compun.c_str());      
-          else if(tagstr == tag::workgrp)
-            node.set_name(tag::workgrp.c_str());                        
-          else
-            std::cout << "# Error adding data node: node tag not recognized .." << std::endl;
-          node.append_child(pugi::node_pcdata).set_value(data.c_str());
-      }
-
-      void load(std::string filename)
-      {
-          doc.load_file(filename.c_str());
-      }
-
-      void dump(std::string filename)
-      {
-          std::ofstream outstream(filename.c_str());
-          this->dump(outstream);
-          outstream.close();
-      }
-      
-      void dump(std::ostream& stream = std::cout)
-      {
-          doc.save(stream, "  ");
-      }
-
-      pugi::xml_document   doc;
-      pugi::xml_node       root;
-      pugi::xml_node       devices;
-      pugi::xml_node       tests;   
-      pugi::xml_node       kernels;      
-      pugi::xml_node       parameters;         
-      pugi::xml_node       last;   
-      
-      bool devices_open;
-      bool tests_open;   
-      bool kernels_open;      
-      bool parameters_open;         
-
-    };
-    
-    /** @brief Helper meta class that returns the first letter of a particular type (float or double) */
-    template <typename T>
-    struct first_letter_of_type
-    {
-      static char get(); //intentionally not implemented, class must be specialized
-    };
-    
-    template <>
-    struct first_letter_of_type <float>
-    {
-      static char get() { return 'f'; } 
-    };
-    
-    template <>
-    struct first_letter_of_type <double>
-    {
-      static char get() { return 'd'; } 
-    };
-    
-    template <typename T>
-    struct program_for_vcltype
-    {
-      static std::string get();  //intentionally not implemented, class must be specialized
-    };
-    
-    template <typename T, unsigned int ALIGNMENT>
-    struct program_for_vcltype < viennacl::vector<T, ALIGNMENT> >
-    {
-      static std::string get()
-      {
-        std::stringstream ss;
-        ss << first_letter_of_type<T>::get() << "_vector_" << ALIGNMENT;
-        return ss.str();
-      } 
-    };
-    
-    template <typename T, unsigned int ALIGNMENT>
-    struct program_for_vcltype < viennacl::matrix<T, row_major, ALIGNMENT> >
-    {
-      static std::string get()
-      {
-        std::stringstream ss;
-        ss << first_letter_of_type<T>::get() << "_matrix_row_" << ALIGNMENT;
-        return ss.str();
-      } 
-    };
-
-    template <typename T, unsigned int ALIGNMENT>
-    struct program_for_vcltype < viennacl::matrix<T, column_major, ALIGNMENT> >
-    {
-      static std::string get()
-      {
-        std::stringstream ss;
-        ss << first_letter_of_type<T>::get() << "_matrix_col_" << ALIGNMENT;
-        return ss.str();
-      } 
-    };
-    
-    template <typename T, unsigned int ALIGNMENT>
-    struct program_for_vcltype < viennacl::compressed_matrix<T,  ALIGNMENT> >
-    {
-      static std::string get()
-      {
-        std::stringstream ss;
-        ss << first_letter_of_type<T>::get() << "_compressed_matrix_" << ALIGNMENT;
-        return ss.str();
-      } 
-    };
-
-    template<typename SCALARTYPE, unsigned int ALIGNMENT>
-    void set_kernel_params(std::string program_name,
-                          std::string kernel_name,
-                          unsigned int glob, //total no. of threads
-                          unsigned int loc)  //threads per work group
-    {
-      //get kernel from pool and set work sizes:
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(program_name, kernel_name);
-      k.global_work_size(0, glob);
-      k.local_work_size(0, loc);
-      
-      //std::cout << "Setting [" << glob << ", " << loc << "] for kernel " << kernel_name << std::endl;
-    }
-
-    template<typename VclBasicType>
-    void tune_impl(parameter_database& paras, std::string parent)
-    {
-      typedef typename VclBasicType::value_type::value_type   SCALARTYPE;
-      
-      // create dummy vectors; the kernels have to be created ..
-      VclBasicType    dummy;
-
-      // extract the kernels for which parameters are present
-      std::string          kernel_str = parent+"/kernels/kernel/name/text()";
-      pugi::xpath_node_set kernel_res = paras.doc.select_nodes(kernel_str.c_str());      
-
-      typedef std::vector<std::string>   kernels_type;
-      kernels_type kernels;
-      std::cout << "Retrieving kernels..." << std::endl;
-      for (pugi::xpath_node_set::const_iterator it = kernel_res.begin(); it != kernel_res.end(); ++it)
-      {
-          std::stringstream ss;
-          it->node().print(ss, "  ");
-          std::string kern(ss.str());
-          kern.erase(std::remove(kern.begin(), kern.end(), '\n'), kern.end()); //trim trailing linebreak
-          kernels.push_back(kern);
-      }
-      
-      // retrieve the actual parameters
-      std::cout << "Retrieving actual parameters..." << std::endl;
-      for(typename kernels_type::iterator iter = kernels.begin();
-          iter != kernels.end(); iter++)
-      {
-          // retrieving the work group ..
-          std::string          wg_str = parent+"/kernels/kernel[name='"+*iter+"']/params/param[name='"+val::globsize+"']/value/text()";
-          pugi::xpath_node_set wg_res = paras.doc.select_nodes(wg_str.c_str());  
-
-          unsigned int global_size(0);
-          
-          std::stringstream ss;
-          ss << wg_res[0].node().value();
-          ss >> global_size;
-          
-          // retrieving the local_workers ..
-          std::string          lw_str = parent+"/kernels/kernel[name='"+*iter+"']/params/param[name='"+val::locsize+"']/value/text()";
-          pugi::xpath_node_set lw_res = paras.doc.select_nodes(lw_str.c_str());  
-
-          unsigned int local_workers(0);
-          
-          ss.clear();
-          ss << lw_res[0].node().value();
-          ss >> local_workers;         
-          
-          //std::cout << "kernel: " << *iter << " wg: " << work_group << " lw: " << local_workers << std::endl;
-
-          // set the parameters
-          set_kernel_params<SCALARTYPE,1> (program_for_vcltype<VclBasicType>::get(), *iter, global_size, local_workers);
-          //set_kernel_params<SCALARTYPE,4> (*iter, work_group * local_workers, local_workers);         
-          //set_kernel_params<SCALARTYPE,16>(*iter, work_group * local_workers, local_workers);                 
-      }
-    }
-
-    /** @brief Helper meta-class that converts a type to a string */
-    template <typename T>
-    struct to_string {};
-
-    template <>
-    struct to_string<float>
-    {
-      static std::string get() { return "float"; }
-    };
-
-    template <>
-    struct to_string<double>
-    {
-      static std::string get() { return "double"; }
-    };
-
-    /** @brief The interface function for reading kernel parameters
-    *
-    * @tparam VclBasicType  The ViennaCL type for which parameters should be read
-    * @param filename       Relative filename to the XML file where the parameters are located in
-    */
-    template<typename VclBasicType>
-    void read_kernel_parameters(std::string filename)
-    {
-      typedef typename VclBasicType::value_type::value_type   SCALARTYPE;
-      
-      parameter_database  paras;
-      paras.load(filename);
-      
-      std::string devname   = viennacl::ocl::current_device().name();
-      
-      // check if tune parameters for the current device are present
-      std::string          device_str = "/parameters/devices/device[name='"+devname+"']";
-      pugi::xpath_node_set device_res = paras.doc.select_nodes(device_str.c_str());
-      
-      if(device_res.size() == 0)
-      {
-          std::cout << "Tuner: There are no parameters for this device present!" << std::endl;
-          // evaluate the parameters for this device?
-      }
-      
-      // check if tune parameters for float exist
-      std::string          numeric_str = device_str+"/tests/test[numeric='"+to_string<SCALARTYPE>::get()+"']";
-      pugi::xpath_node_set numeric_res = paras.doc.select_nodes(numeric_str.c_str());
-
-      if(numeric_res.size() > 0)
-      {
-          tune_impl<VclBasicType>(paras, numeric_str);
-      }
-      else
-      {
-          std::cout << "Tuner: There are no parameters for numeric type float present!" << std::endl;   
-      }
-
-  //    // check if tune parameters for double exist
-  //    std::string          double_str = device_str+"/tests/test[numeric='"+val::dbl+"']";
-  //    pugi::xpath_node_set double_res = paras.doc.select_nodes(double_str.c_str());
-  // 
-  //    if(double_res.size() > 0)
-  //    {
-  //       tune_impl<double>(paras, double_str);
-  //    }
-  //    else
-  //    {
-  //       std::cout << "Tuner: There are no parameters for numeric type double present!" << std::endl;   
-  //    }
-
-    }
-
-  } // end namespace io
-
-} // end namespace viennacl
-
-#endif
diff --git a/viennacl/io/matrix_market.hpp b/viennacl/io/matrix_market.hpp
index 823f553..71b1b08 100644
--- a/viennacl/io/matrix_market.hpp
+++ b/viennacl/io/matrix_market.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_IO_MATRIX_MARKET_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -39,9 +40,9 @@ namespace viennacl
   namespace io
   {
     //helper
-    namespace
+    namespace detail
     {
-      void trim(char * buffer, long max_size)
+      inline void trim(char * buffer, long max_size)
       {
         //trim at beginning of string
         long start = 0;
@@ -59,7 +60,7 @@ namespace viennacl
         {
           if (buffer[i] == 0)   //end of string
             break;
-          
+
           if (buffer[i] != ' ')
             stop = i;
         }
@@ -68,25 +69,25 @@ namespace viennacl
         {
           buffer[i] = buffer[start + i];
         }
-        
+
         if (buffer[0] != ' ')
           buffer[stop - start + 1] = 0; //terminate string
         else
           buffer[0] = 0;
-      }      
-      
-      std::string tolower(std::string & s)
+      }
+
+      inline std::string tolower(std::string & s)
       {
         std::transform(s.begin(), s.end(), s.begin(), static_cast < int(*)(int) > (std::tolower));
         return s;
       }
-      
-      
-      
-    } //namespace 
-    
+
+
+
+    } //namespace
+
     ///////// reader ////////////
-    
+
     /** @brief Reads a sparse or dense matrix from a file (MatrixMarket format)
     *
     * @param mat The matrix that is to be read
@@ -100,7 +101,9 @@ namespace viennacl
                                       const char * file,
                                       long index_base)
     {
-      std::cout << "Reading matrix market file" << std::endl;
+      typedef typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<MatrixType>::type>::type    ScalarType;
+
+      //std::cout << "Reading matrix market file" << std::endl;
       char buffer[1025];
       std::ifstream reader(file);
       std::string token;
@@ -113,12 +116,12 @@ namespace viennacl
       long valid_entries = 0;
       long nnz = 0;
 
-      
+
       if (!reader){
         std::cerr << "ViennaCL: Matrix Market Reader: Cannot open file " << file << std::endl;
-        return 0;
+        return EXIT_FAILURE;
       }
-      
+
       while (reader.good())
       {
         // get a non-empty line
@@ -126,10 +129,10 @@ namespace viennacl
         {
           reader.getline(buffer, 1024);
           ++linenum;
-          trim(buffer, 1024);
+          detail::trim(buffer, 1024);
         }
         while (reader.good() && buffer[0] == 0);
-        
+
         if (buffer[0] == '%')
         {
           if (buffer[1] == '%')
@@ -137,23 +140,23 @@ namespace viennacl
             //parse header:
             std::stringstream line(std::string(buffer + 2));
             line >> token;
-            if (tolower(token) != "matrixmarket")
+            if (detail::tolower(token) != "matrixmarket")
             {
               std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": Expected 'MatrixMarket', got '" << token << "'" << std::endl;
               return 0;
             }
 
             line >> token;
-            if (tolower(token) != "matrix")
+            if (detail::tolower(token) != "matrix")
             {
               std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": Expected 'matrix', got '" << token << "'" << std::endl;
               return 0;
             }
 
             line >> token;
-            if (tolower(token) != "coordinate")
+            if (detail::tolower(token) != "coordinate")
             {
-              if (tolower(token) == "array")
+              if (detail::tolower(token) == "array")
               {
                 dense_format = true;
                 std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": 'array' type is not supported yet!" << std::endl;
@@ -167,21 +170,21 @@ namespace viennacl
             }
 
             line >> token;
-            if (tolower(token) != "real")
+            if (detail::tolower(token) != "real")
             {
               std::cerr << "Error in file " << file << ": The MatrixMarket reader provided with ViennaCL supports only real valued floating point arithmetic." << std::endl;
               return 0;
             }
 
             line >> token;
-            if (tolower(token) == "general"){ }
-            else if (tolower(token) == "symmetric"){ symmetric = true; }
+            if (detail::tolower(token) == "general"){ }
+            else if (detail::tolower(token) == "symmetric"){ symmetric = true; }
             else
             {
               std::cerr << "Error in file " << file << ": The MatrixMarket reader provided with ViennaCL supports only general or symmetric matrices." << std::endl;
               return 0;
             }
-            
+
           }
         }
         else
@@ -194,7 +197,7 @@ namespace viennacl
             //read header line
             long rows;
             long cols;
-            
+
             if (line.good())
               line >> rows;
             else
@@ -202,7 +205,7 @@ namespace viennacl
               std::cerr << "Error in file " << file << ": Could not get matrix dimensions (rows) in line " << linenum << std::endl;
               return 0;
             }
-            
+
             if (line.good())
               line >> cols;
             else
@@ -220,10 +223,10 @@ namespace viennacl
                 return 0;
               }
             }
-            
+
             if (rows > 0 && cols > 0)
               viennacl::traits::resize(mat, rows, cols);
-            
+
             is_header = false;
           }
           else
@@ -231,10 +234,10 @@ namespace viennacl
             //read data
             if (dense_format)
             {
-              double value;
+              ScalarType value;
               line >> value;
               viennacl::traits::fill(mat, cur_row, cur_col, value);
-              
+
               if (++cur_row == static_cast<long>(viennacl::traits::size1(mat)))
               {
                 //next column
@@ -246,8 +249,8 @@ namespace viennacl
             {
               long row;
               long col;
-              double value;
-              
+              ScalarType value;
+
               //parse data:
               if (line.good())
                 line >> row;
@@ -256,7 +259,7 @@ namespace viennacl
                 std::cerr << "Error in file " << file << ": Parse error for matrix entry in line " << linenum << std::endl;
                 return 0;
               }
-              
+
               if (line.good())
                 line >> col;
               else
@@ -264,11 +267,11 @@ namespace viennacl
                 std::cerr << "Error in file " << file << ": Parse error for matrix entry in line " << linenum << std::endl;
                 return 0;
               }
-              
+
               //take index_base base into account:
               row -= index_base;
               col -= index_base;
-              
+
               if (line.good())
                 line >> value;
               else
@@ -276,36 +279,36 @@ namespace viennacl
                 std::cerr << "Error in file " << file << ": Parse error for matrix entry in line " << linenum << std::endl;
                 return 0;
               }
-              
+
               if (row >= static_cast<long>(viennacl::traits::size1(mat)) || row < 0)
               {
                 std::cerr << "Error in file " << file << " at line " << linenum << ": Row index out of bounds: " << row << " (matrix dim: " << viennacl::traits::size1(mat) << " x " << viennacl::traits::size2(mat) << ")" << std::endl;
                 return 0;
               }
-                  
+
               if (col >= static_cast<long>(viennacl::traits::size2(mat)) || col < 0)
               {
                 std::cerr << "Error in file " << file << " at line " << linenum << ": Column index out of bounds: " << col << " (matrix dim: " << viennacl::traits::size1(mat) << " x " << viennacl::traits::size2(mat) << ")" << std::endl;
                 return 0;
               }
-              
+
               viennacl::traits::fill(mat, row, col, value); //basically equivalent to mat(row, col) = value;
               if (symmetric)
                 viennacl::traits::fill(mat, col, row, value); //basically equivalent to mat(col, row) = value;
-              
+
               if (++valid_entries == nnz)
                 break;
-              
+
             } //else dense_format
           }
         }
       }
-      
-      std::cout << linenum << " lines read." << std::endl;
+
+      //std::cout << linenum << " lines read." << std::endl;
       reader.close();
       return linenum;
     }
-    
+
 
     /** @brief Reads a sparse matrix from a file (MatrixMarket format)
     *
@@ -320,7 +323,7 @@ namespace viennacl
                                  const char * file,
                                  long index_base = 1)
     {
-      return read_matrix_market_file_impl(mat, file, index_base);  
+      return read_matrix_market_file_impl(mat, file, index_base);
     }
 
     template <typename MatrixType>
@@ -330,7 +333,7 @@ namespace viennacl
     {
       return read_matrix_market_file_impl(mat, file.c_str(), index_base);
     }
-    
+
     template <typename ScalarType>
     long read_matrix_market_file(std::vector< std::map<unsigned int, ScalarType> > & mat,
                                  const char * file,
@@ -339,7 +342,7 @@ namespace viennacl
       viennacl::tools::sparse_matrix_adapter<ScalarType> adapted_matrix(mat);
       return read_matrix_market_file_impl(adapted_matrix, file, index_base);
     }
-    
+
     template <typename ScalarType>
     long read_matrix_market_file(std::vector< std::map<unsigned int, ScalarType> > & mat,
                                  const std::string & file,
@@ -355,7 +358,7 @@ namespace viennacl
     void write_matrix_market_file_impl(MatrixType const & mat, const char * file, long index_base)
     {
       std::ofstream writer(file);
-      
+
       long num_entries = 0;
       for (typename MatrixType::const_iterator1 row_it = mat.begin1();
             row_it != mat.end1();
@@ -367,7 +370,7 @@ namespace viennacl
 
       writer << "%%MatrixMarket matrix coordinate real general" << std::endl;
       writer << mat.size1() << " " << mat.size2() << " " << num_entries << std::endl;
-      
+
       for (typename MatrixType::const_iterator1 row_it = mat.begin1();
             row_it != mat.end1();
             ++row_it)
@@ -375,7 +378,7 @@ namespace viennacl
               col_it != row_it.end();
               ++col_it)
           writer << col_it.index1() + index_base << " " << col_it.index2() + index_base << " " << *col_it << std::endl;
-      
+
       writer.close();
     }
 
@@ -413,7 +416,7 @@ namespace viennacl
       write_matrix_market_file_impl(mat, file.c_str(), index_base);
     }
 
-    
+
   } //namespace io
 } //namespace viennacl
 
diff --git a/viennacl/linalg/amg.hpp b/viennacl/linalg/amg.hpp
old mode 100755
new mode 100644
index 6d82530..cf635fe
--- a/viennacl/linalg/amg.hpp
+++ b/viennacl/linalg/amg.hpp
@@ -2,24 +2,25 @@
 #define VIENNACL_LINALG_AMG_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/amg.hpp
-    @brief Main include file for algebraic multigrid (AMG) preconditioners.  Experimental in 1.2.x.
-    
+    @brief Main include file for algebraic multigrid (AMG) preconditioners.  Experimental.
+
     Implementation contributed by Markus Wagner
 */
 
@@ -29,7 +30,7 @@
 #include <boost/numeric/ublas/vector_proxy.hpp>
 #include <boost/numeric/ublas/matrix_proxy.hpp>
 #include <boost/numeric/ublas/vector.hpp>
-#include <boost/numeric/ublas/triangular.hpp> 
+#include <boost/numeric/ublas/triangular.hpp>
 #include <vector>
 #include <cmath>
 #include "viennacl/forwards.h"
@@ -43,7 +44,7 @@
 
 #include <map>
 
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
  #include <omp.h>
 #endif
 
@@ -55,45 +56,41 @@
 namespace viennacl
 {
   namespace linalg
-  {    
+  {
     typedef detail::amg::amg_tag          amg_tag;
-    
-    
-    
+
+
+
     /** @brief Setup AMG preconditioner
     *
     * @param A      Operator matrices on all levels
     * @param P      Prolongation/Interpolation operators on all levels
     * @param Pointvector  Vector of points on all levels
-    * @param tag    AMG preconditioner tag 
+    * @param tag    AMG preconditioner tag
     */
     template <typename InternalType1, typename InternalType2>
     void amg_setup(InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
     {
-      typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;     
-      typedef typename SparseMatrixType::value_type ScalarType;   
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+      typedef typename InternalType2::value_type PointVectorType;
+
       unsigned int i, iterations, c_points, f_points;
       detail::amg::amg_slicing<InternalType1,InternalType2> Slicing;
-      
+
       // Set number of iterations. If automatic coarse grid construction is chosen (0), then set a maximum size and stop during the process.
       iterations = tag.get_coarselevels();
       if (iterations == 0)
         iterations = VIENNACL_AMG_MAX_LEVELS;
-             
+
       // For parallel coarsenings build data structures (number of threads set automatically).
       if (tag.get_coarse() == VIENNACL_AMG_COARSE_RS0 || tag.get_coarse() == VIENNACL_AMG_COARSE_RS3)
         Slicing.init(iterations);
-      
+
       for (i=0; i<iterations; ++i)
-      {  
+      {
         // Initialize Pointvector on level i and construct points.
-        Pointvector[i] = PointVectorType(A[i].size1());
+        Pointvector[i] = PointVectorType(static_cast<unsigned int>(A[i].size1()));
         Pointvector[i].init_points();
-        
+
         // Construct C and F points on coarse level (i is fine level, i+1 coarse level).
         detail::amg::amg_coarse (i, A, Pointvector, Slicing, tag);
 
@@ -101,32 +98,32 @@ namespace viennacl
         c_points = Pointvector[i].get_cpoints();
         f_points = Pointvector[i].get_fpoints();
 
-        #if defined (DEBUG) //or defined(DEBUGBENCH) 
+        #if defined (VIENNACL_AMG_DEBUG) //or defined(VIENNACL_AMG_DEBUGBENCH)
         std::cout << "Level " << i << ": ";
         std::cout << "No of C points = " << c_points << ", ";
         std::cout << "No of F points = " << f_points << std::endl;
         #endif
-        
+
         // Stop routine when the maximal coarse level is found (no C or F point). Coarsest level is level i.
         if (c_points == 0 || f_points == 0)
           break;
-          
+
         // Construct interpolation matrix for level i.
         detail::amg::amg_interpol (i, A, P, Pointvector, tag);
-        
+
         // Compute coarse grid operator (A[i+1] = R * A[i] * P) with R = trans(P).
         detail::amg::amg_galerkin_prod(A[i], P[i], A[i+1]);
-        
+
         // Test triple matrix product. Very slow for large matrix sizes (ublas).
         // test_triplematprod(A[i],P[i],A[i+1]);
-        
+
         Pointvector[i].delete_points();
-        
-        #ifdef DEBUG
+
+        #ifdef VIENNACL_AMG_DEBUG
         std::cout << "Coarse Grid Operator Matrix:" << std::endl;
         printmatrix (A[i+1]);
-        #endif  
-        
+        #endif
+
         // If Limit of coarse points is reached then stop. Coarsest level is level i+1.
         if (tag.get_coarselevels() == 0 && c_points <= VIENNACL_AMG_COARSE_LIMIT)
         {
@@ -134,23 +131,23 @@ namespace viennacl
           return;
         }
       }
-      tag.set_coarselevels(i);  
+      tag.set_coarselevels(i);
     }
-    
+
     /** @brief Initialize AMG preconditioner
     *
     * @param mat    System matrix
     * @param A      Operator matrices on all levels
     * @param P      Prolongation/Interpolation operators on all levels
     * @param Pointvector  Vector of points on all levels
-    * @param tag    AMG preconditioner tag 
+    * @param tag    AMG preconditioner tag
     */
     template <typename MatrixType, typename InternalType1, typename InternalType2>
     void amg_init(MatrixType const & mat, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
     {
-      typedef typename MatrixType::value_type ScalarType;
+      //typedef typename MatrixType::value_type ScalarType;
       typedef typename InternalType1::value_type SparseMatrixType;
-      
+
       if (tag.get_coarselevels() > 0)
       {
         A.resize(tag.get_coarselevels()+1);
@@ -163,12 +160,12 @@ namespace viennacl
         P.resize(VIENNACL_AMG_MAX_LEVELS);
         Pointvector.resize(VIENNACL_AMG_MAX_LEVELS);
       }
-      
+
       // Insert operator matrix as operator for finest level.
       SparseMatrixType A0 (mat);
-      A.insert_element (0, A0);  
+      A.insert_element (0, A0);
     }
-    
+
     /** @brief Save operators after setup phase for CPU computation.
     *
     * @param A      Operator matrices on all levels on the CPU
@@ -176,18 +173,18 @@ namespace viennacl
     * @param R      Restriction operators on all levels on the CPU
     * @param A_setup    Operators matrices on all levels from setup phase
     * @param P_setup    Prolongation/Interpolation operators on all levels from setup phase
-    * @param tag    AMG preconditioner tag 
+    * @param tag    AMG preconditioner tag
     */
     template <typename InternalType1, typename InternalType2>
     void amg_transform_cpu (InternalType1 & A, InternalType1 & P, InternalType1 & R, InternalType2 & A_setup, InternalType2 & P_setup, amg_tag & tag)
-    {  
-      typedef typename InternalType1::value_type MatrixType;
-      
+    {
+      //typedef typename InternalType1::value_type MatrixType;
+
       // Resize internal data structures to actual size.
       A.resize(tag.get_coarselevels()+1);
       P.resize(tag.get_coarselevels());
       R.resize(tag.get_coarselevels());
-      
+
       // Transform into matrix type.
       for (unsigned int i=0; i<tag.get_coarselevels()+1; ++i)
       {
@@ -207,7 +204,7 @@ namespace viennacl
         P_setup[i].set_trans(false);
       }
     }
-    
+
     /** @brief Save operators after setup phase for GPU computation.
     *
     * @param A      Operator matrices on all levels on the GPU
@@ -215,57 +212,58 @@ namespace viennacl
     * @param R      Restriction operators on all levels on the GPU
     * @param A_setup    Operators matrices on all levels from setup phase
     * @param P_setup    Prolongation/Interpolation operators on all levels from setup phase
-    * @param tag    AMG preconditioner tag 
+    * @param tag    AMG preconditioner tag
+    * @param ctx      Optional context in which the auxiliary objects are created (one out of multiple OpenCL contexts, CUDA, host)
     */
     template <typename InternalType1, typename InternalType2>
-    void amg_transform_gpu (InternalType1 & A, InternalType1 & P, InternalType1 & R, InternalType2 & A_setup, InternalType2 & P_setup, amg_tag & tag)
+    void amg_transform_gpu (InternalType1 & A, InternalType1 & P, InternalType1 & R, InternalType2 & A_setup, InternalType2 & P_setup, amg_tag & tag, viennacl::context ctx)
     {
-      typedef typename InternalType1::value_type MatrixType;
-      typedef typename InternalType2::value_type::value_type ScalarType;
-      
       // Resize internal data structures to actual size.
       A.resize(tag.get_coarselevels()+1);
       P.resize(tag.get_coarselevels());
       R.resize(tag.get_coarselevels());
-      
+
       // Copy to GPU using the internal sparse matrix structure: std::vector<std::map>.
       for (unsigned int i=0; i<tag.get_coarselevels()+1; ++i)
       {
-        A[i].resize(A_setup[i].size1(),A_setup[i].size2(),false);
+        viennacl::switch_memory_context(A[i], ctx);
+        //A[i].resize(A_setup[i].size1(),A_setup[i].size2(),false);
         viennacl::copy(*(A_setup[i].get_internal_pointer()),A[i]);
       }
       for (unsigned int i=0; i<tag.get_coarselevels(); ++i)
       {
-        P[i].resize(P_setup[i].size1(),P_setup[i].size2(),false);
+        viennacl::switch_memory_context(P[i], ctx);
+        //P[i].resize(P_setup[i].size1(),P_setup[i].size2(),false);
         viennacl::copy(*(P_setup[i].get_internal_pointer()),P[i]);
         //viennacl::copy((boost::numeric::ublas::compressed_matrix<ScalarType>)P_setup[i],P[i]);
       }
       for (unsigned int i=0; i<tag.get_coarselevels(); ++i)
       {
-        R[i].resize(P_setup[i].size2(),P_setup[i].size1(),false);
+        viennacl::switch_memory_context(R[i], ctx);
+        //R[i].resize(P_setup[i].size2(),P_setup[i].size1(),false);
         P_setup[i].set_trans(true);
         viennacl::copy(*(P_setup[i].get_internal_pointer()),R[i]);
         P_setup[i].set_trans(false);
       }
     }
-    
+
     /** @brief Setup data structures for precondition phase.
     *
     * @param result    Result vector on all levels
     * @param rhs    RHS vector on all levels
     * @param residual    Residual vector on all levels
     * @param A      Operators matrices on all levels from setup phase
-    * @param tag    AMG preconditioner tag 
+    * @param tag    AMG preconditioner tag
     */
     template <typename InternalVectorType, typename SparseMatrixType>
     void amg_setup_apply (InternalVectorType & result, InternalVectorType & rhs, InternalVectorType & residual, SparseMatrixType const & A, amg_tag const & tag)
-    {        
+    {
       typedef typename InternalVectorType::value_type VectorType;
-      
+
       result.resize(tag.get_coarselevels()+1);
       rhs.resize(tag.get_coarselevels()+1);
       residual.resize(tag.get_coarselevels());
-            
+
       for (unsigned int level=0; level < tag.get_coarselevels()+1; ++level)
       {
         result[level] = VectorType(A[level].size1());
@@ -279,6 +277,38 @@ namespace viennacl
         residual[level].clear();
       }
     }
+
+
+    /** @brief Setup data structures for precondition phase for later use on the GPU
+    *
+    * @param result    Result vector on all levels
+    * @param rhs    RHS vector on all levels
+    * @param residual    Residual vector on all levels
+    * @param A      Operators matrices on all levels from setup phase
+    * @param tag    AMG preconditioner tag
+    * @param ctx      Optional context in which the auxiliary objects are created (one out of multiple OpenCL contexts, CUDA, host)
+    */
+    template <typename InternalVectorType, typename SparseMatrixType>
+    void amg_setup_apply (InternalVectorType & result, InternalVectorType & rhs, InternalVectorType & residual, SparseMatrixType const & A, amg_tag const & tag, viennacl::context ctx)
+    {
+      typedef typename InternalVectorType::value_type VectorType;
+
+      result.resize(tag.get_coarselevels()+1);
+      rhs.resize(tag.get_coarselevels()+1);
+      residual.resize(tag.get_coarselevels());
+
+      for (unsigned int level=0; level < tag.get_coarselevels()+1; ++level)
+      {
+        result[level] = VectorType(A[level].size1(), ctx);
+        rhs[level] = VectorType(A[level].size1(), ctx);
+      }
+      for (unsigned int level=0; level < tag.get_coarselevels(); ++level)
+      {
+        residual[level] = VectorType(A[level].size1(), ctx);
+      }
+    }
+
+
     /** @brief Pre-compute LU factorization for direct solve (ublas library).
      *  @brief Speeds up precondition phase as this is computed only once overall instead of once per iteration.
     *
@@ -287,22 +317,22 @@ namespace viennacl
     * @param A      Operator matrix on coarsest level
     */
     template <typename ScalarType, typename SparseMatrixType>
-    void amg_lu(boost::numeric::ublas::compressed_matrix<ScalarType> & op, boost::numeric::ublas::permutation_matrix<ScalarType> & Permutation, SparseMatrixType const & A)
+    void amg_lu(boost::numeric::ublas::compressed_matrix<ScalarType> & op, boost::numeric::ublas::permutation_matrix<> & Permutation, SparseMatrixType const & A)
     {
       typedef typename SparseMatrixType::const_iterator1 ConstRowIterator;
       typedef typename SparseMatrixType::const_iterator2 ConstColIterator;
-    
-      // Copy to operator matrix. Needed 
+
+      // Copy to operator matrix. Needed
       op.resize(A.size1(),A.size2(),false);
       for (ConstRowIterator row_iter = A.begin1(); row_iter != A.end1(); ++row_iter)
         for (ConstColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
           op (col_iter.index1(), col_iter.index2()) = *col_iter;
-      
+
       // Permutation matrix has to be reinitialized with actual size. Do not clear() or resize()!
-      Permutation = boost::numeric::ublas::permutation_matrix<ScalarType> (op.size1());
+      Permutation = boost::numeric::ublas::permutation_matrix<> (op.size1());
       boost::numeric::ublas::lu_factorize(op,Permutation);
     }
-    
+
     /** @brief AMG preconditioner class, can be supplied to solve()-routines
     */
     template <typename MatrixType>
@@ -312,31 +342,31 @@ namespace viennacl
       typedef boost::numeric::ublas::vector<ScalarType> VectorType;
       typedef detail::amg::amg_sparsematrix<ScalarType> SparseMatrixType;
       typedef detail::amg::amg_pointvector PointVectorType;
-      
+
       typedef typename SparseMatrixType::const_iterator1 InternalConstRowIterator;
       typedef typename SparseMatrixType::const_iterator2 InternalConstColIterator;
       typedef typename SparseMatrixType::iterator1 InternalRowIterator;
       typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+
       boost::numeric::ublas::vector <SparseMatrixType> A_setup;
       boost::numeric::ublas::vector <SparseMatrixType> P_setup;
       boost::numeric::ublas::vector <MatrixType> A;
       boost::numeric::ublas::vector <MatrixType> P;
       boost::numeric::ublas::vector <MatrixType> R;
       boost::numeric::ublas::vector <PointVectorType> Pointvector;
-       
+
       mutable boost::numeric::ublas::compressed_matrix<ScalarType> op;
-      mutable boost::numeric::ublas::permutation_matrix<ScalarType> Permutation;  
-      
+      mutable boost::numeric::ublas::permutation_matrix<> Permutation;
+
       mutable boost::numeric::ublas::vector <VectorType> result;
       mutable boost::numeric::ublas::vector <VectorType> rhs;
       mutable boost::numeric::ublas::vector <VectorType> residual;
-      
+
       mutable bool done_init_apply;
-          
-      amg_tag _tag;
+
+      amg_tag tag_;
     public:
-    
+
       amg_precond(): Permutation(0) {}
       /** @brief The constructor. Saves system matrix, tag and builds data structures for setup.
       *
@@ -345,25 +375,25 @@ namespace viennacl
       */
       amg_precond(MatrixType const & mat, amg_tag const & tag): Permutation(0)
       {
-        _tag = tag;
+        tag_ = tag;
         // Initialize data structures.
-        amg_init (mat,A_setup,P_setup,Pointvector,_tag);
-        
+        amg_init (mat,A_setup,P_setup,Pointvector,tag_);
+
         done_init_apply = false;
       }
-      
+
       /** @brief Start setup phase for this class and copy data structures.
       */
       void setup()
       {
         // Start setup phase.
-        amg_setup(A_setup,P_setup,Pointvector,_tag);
+        amg_setup(A_setup,P_setup,Pointvector,tag_);
         // Transform to CPU-Matrixtype for precondition phase.
-        amg_transform_cpu(A,P,R,A_setup,P_setup,_tag);
-        
+        amg_transform_cpu(A,P,R,A_setup,P_setup,tag_);
+
         done_init_apply = false;
       }
-      
+
       /** @brief Prepare data structures for preconditioning:
        *  Build data structures for precondition phase.
        *  Do LU factorization on coarsest level.
@@ -371,13 +401,13 @@ namespace viennacl
       void init_apply() const
       {
         // Setup precondition phase (Data structures).
-        amg_setup_apply(result,rhs,residual,A_setup,_tag);
+        amg_setup_apply(result,rhs,residual,A_setup,tag_);
         // Do LU factorization for direct solve.
-        amg_lu(op,Permutation,A_setup[_tag.get_coarselevels()]);
-        
+        amg_lu(op,Permutation,A_setup[tag_.get_coarselevels()]);
+
         done_init_apply = true;
       }
-      
+
       /** @brief Returns complexity measures.
       *
       * @param avgstencil  Average stencil sizes on all levels
@@ -386,14 +416,14 @@ namespace viennacl
       template <typename VectorType>
       ScalarType calc_complexity(VectorType & avgstencil)
       {
-        avgstencil = VectorType (_tag.get_coarselevels()+1);
+        avgstencil = VectorType (tag_.get_coarselevels()+1);
         unsigned int nonzero=0, systemmat_nonzero=0, level_coefficients=0;
-          
-        for (unsigned int level=0; level < _tag.get_coarselevels()+1; ++level)
+
+        for (unsigned int level=0; level < tag_.get_coarselevels()+1; ++level)
         {
           level_coefficients = 0;
           for (InternalRowIterator row_iter = A_setup[level].begin1(); row_iter != A_setup[level].end1(); ++row_iter)
-          { 
+          {
             for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
             {
               if (level == 0)
@@ -402,9 +432,9 @@ namespace viennacl
               level_coefficients++;
             }
           }
-          avgstencil[level] = level_coefficients/(double)A_setup[level].size1();
+          avgstencil[level] = level_coefficients/static_cast<ScalarType>(A_setup[level].size1());
         }
-        return nonzero/static_cast<double>(systemmat_nonzero);
+        return nonzero/static_cast<ScalarType>(systemmat_nonzero);
       }
 
       /** @brief Precondition Operation
@@ -413,82 +443,82 @@ namespace viennacl
       */
       template <typename VectorType>
       void apply(VectorType & vec) const
-      {   
+      {
         // Build data structures and do lu factorization before first iteration step.
         if (!done_init_apply)
           init_apply();
-        
+
         int level;
-        
+
         // Precondition operation (Yang, p.3)
         rhs[0] = vec;
-        for (level=0; level <(signed)_tag.get_coarselevels(); level++)
-        {    
+        for (level=0; level <static_cast<int>(tag_.get_coarselevels()); level++)
+        {
           result[level].clear();
-          
-          // Apply Smoother _presmooth times.
-          smooth_jacobi (level, _tag.get_presmooth(), result[level], rhs[level]);    
-          
-          #ifdef DEBUG
+
+          // Apply Smoother presmooth_ times.
+          smooth_jacobi (level, tag_.get_presmooth(), result[level], rhs[level]);
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "After presmooth:" << std::endl;
           printvector(result[level]);
           #endif
 
           // Compute residual.
           residual[level] = rhs[level] - boost::numeric::ublas::prod (A[level],result[level]);
-          
-          #ifdef DEBUG          
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Residual:" << std::endl;
           printvector(residual[level]);
           #endif
-          
+
           // Restrict to coarse level. Restricted residual is RHS of coarse level.
           rhs[level+1] = boost::numeric::ublas::prod (R[level],residual[level]);
-          
-          #ifdef DEBUG
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Restricted Residual: " << std::endl;
           printvector(rhs[level+1]);
           #endif
         }
-          
+
         // On highest level use direct solve to solve equation.
         result[level] = rhs[level];
         boost::numeric::ublas::lu_substitute(op,Permutation,result[level]);
 
-        #ifdef DEBUG
+        #ifdef VIENNACL_AMG_DEBUG
         std::cout << "After direct solve: " << std::endl;
         printvector (result[level]);
         #endif
-          
-        for (level=_tag.get_coarselevels()-1; level >= 0; level--)
-        {       
-          #ifdef DEBUG
+
+        for (level=tag_.get_coarselevels()-1; level >= 0; level--)
+        {
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Coarse Error: " << std::endl;
           printvector(result[level+1]);
           #endif
-          
+
           // Interpolate error to fine level. Correct solution by adding error.
           result[level] += boost::numeric::ublas::prod (P[level], result[level+1]);
-              
-          #ifdef DEBUG
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Corrected Result: " << std::endl;
           printvector (result[level]);
           #endif
-          
-          // Apply Smoother _postsmooth times.
-          smooth_jacobi (level, _tag.get_postsmooth(), result[level], rhs[level]);
-          
-          #ifdef DEBUG
+
+          // Apply Smoother postsmooth_ times.
+          smooth_jacobi (level, tag_.get_postsmooth(), result[level], rhs[level]);
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "After postsmooth: " << std::endl;
           printvector (result[level]);
           #endif
-        }    
+        }
         vec = result[0];
       }
-      
+
       /** @brief (Weighted) Jacobi Smoother (CPU version)
       * @param level    Coarse level to which smoother is applied to
-      * @param iterations  Number of smoother iterations    
+      * @param iterations  Number of smoother iterations
       * @param x     The vector smoothing is applied to
       * @param rhs    The right hand side of the equation for the smoother
       */
@@ -496,20 +526,20 @@ namespace viennacl
       void smooth_jacobi(int level, int const iterations, VectorType & x, VectorType const & rhs) const
       {
         VectorType old_result (x.size());
-        unsigned int index;
+        long index;
         ScalarType sum = 0, diag = 1;
-        
+
         for (int i=0; i<iterations; ++i)
         {
           old_result = x;
           x.clear();
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
           #pragma omp parallel for private (sum,diag) shared (rhs,x)
-#endif          
-          for (index=0; index<A_setup[level].size1(); ++index)  
+#endif
+          for (index=0; index < static_cast<long>(A_setup[level].size1()); ++index)
           {
             InternalConstRowIterator row_iter = A_setup[level].begin1();
-            row_iter += index; 
+            row_iter += index;
             sum = 0;
             diag = 1;
             for (InternalConstColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
@@ -519,14 +549,14 @@ namespace viennacl
               else
                 sum += *col_iter * old_result[col_iter.index2()];
             }
-            x[index]= _tag.get_jacobiweight() * (rhs[index] - sum) / diag + (1-_tag.get_jacobiweight()) * old_result[index];
+            x[index]= static_cast<ScalarType>(tag_.get_jacobiweight()) * (rhs[index] - sum) / diag + (1-static_cast<ScalarType>(tag_.get_jacobiweight())) * old_result[index];
           }
         }
       }
-      
-      amg_tag & tag() { return _tag; }
+
+      amg_tag & tag() { return tag_; }
     };
-    
+
     /** @brief AMG preconditioner class, can be supplied to solve()-routines.
     *
     *  Specialization for compressed_matrix
@@ -538,65 +568,67 @@ namespace viennacl
       typedef viennacl::vector<ScalarType> VectorType;
       typedef detail::amg::amg_sparsematrix<ScalarType> SparseMatrixType;
       typedef detail::amg::amg_pointvector PointVectorType;
-      
+
       typedef typename SparseMatrixType::const_iterator1 InternalConstRowIterator;
       typedef typename SparseMatrixType::const_iterator2 InternalConstColIterator;
       typedef typename SparseMatrixType::iterator1 InternalRowIterator;
       typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+
       boost::numeric::ublas::vector <SparseMatrixType> A_setup;
       boost::numeric::ublas::vector <SparseMatrixType> P_setup;
       boost::numeric::ublas::vector <MatrixType> A;
       boost::numeric::ublas::vector <MatrixType> P;
       boost::numeric::ublas::vector <MatrixType> R;
       boost::numeric::ublas::vector <PointVectorType> Pointvector;
-      
+
       mutable boost::numeric::ublas::compressed_matrix<ScalarType> op;
-      mutable boost::numeric::ublas::permutation_matrix<ScalarType> Permutation;  
-      
+      mutable boost::numeric::ublas::permutation_matrix<> Permutation;
+
       mutable boost::numeric::ublas::vector <VectorType> result;
       mutable boost::numeric::ublas::vector <VectorType> rhs;
       mutable boost::numeric::ublas::vector <VectorType> residual;
-          
+
+      viennacl::context ctx_;
+
       mutable bool done_init_apply;
-    
-      amg_tag _tag;
-      
+
+      amg_tag tag_;
+
     public:
-      
+
       amg_precond(): Permutation(0) {}
-      
+
       /** @brief The constructor. Builds data structures.
       *
       * @param mat  System matrix
       * @param tag  The AMG tag
       */
-      amg_precond(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat, amg_tag const & tag): Permutation(0)
+      amg_precond(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat, amg_tag const & tag): Permutation(0), ctx_(viennacl::traits::context(mat))
       {
-        _tag = tag;
-        
+        tag_ = tag;
+
         // Copy to CPU. Internal structure of sparse matrix is used for copy operation.
         std::vector<std::map<unsigned int, ScalarType> > mat2 = std::vector<std::map<unsigned int, ScalarType> >(mat.size1());
         viennacl::copy(mat, mat2);
-        
+
         // Initialize data structures.
-        amg_init (mat2,A_setup,P_setup,Pointvector,_tag);
-          
+        amg_init (mat2,A_setup,P_setup,Pointvector,tag_);
+
         done_init_apply = false;
       }
-      
+
       /** @brief Start setup phase for this class and copy data structures.
       */
       void setup()
       {
         // Start setup phase.
-        amg_setup(A_setup,P_setup,Pointvector,_tag);  
+        amg_setup(A_setup,P_setup,Pointvector, tag_);
         // Transform to GPU-Matrixtype for precondition phase.
-        amg_transform_gpu(A,P,R,A_setup,P_setup,_tag);  
-        
+        amg_transform_gpu(A,P,R,A_setup,P_setup, tag_, ctx_);
+
         done_init_apply = false;
       }
-      
+
       /** @brief Prepare data structures for preconditioning:
        *  Build data structures for precondition phase.
        *  Do LU factorization on coarsest level.
@@ -604,10 +636,10 @@ namespace viennacl
       void init_apply() const
       {
         // Setup precondition phase (Data structures).
-        amg_setup_apply(result,rhs,residual,A_setup,_tag);
+        amg_setup_apply(result,rhs,residual,A_setup,tag_, ctx_);
         // Do LU factorization for direct solve.
-        amg_lu(op,Permutation,A_setup[_tag.get_coarselevels()]);
-        
+        amg_lu(op,Permutation,A_setup[tag_.get_coarselevels()]);
+
         done_init_apply = true;
       }
 
@@ -619,14 +651,14 @@ namespace viennacl
       template <typename VectorType>
       ScalarType calc_complexity(VectorType & avgstencil)
       {
-        avgstencil = VectorType (_tag.get_coarselevels()+1);
+        avgstencil = VectorType (tag_.get_coarselevels()+1);
         unsigned int nonzero=0, systemmat_nonzero=0, level_coefficients=0;
-        
-        for (unsigned int level=0; level < _tag.get_coarselevels()+1; ++level)
+
+        for (unsigned int level=0; level < tag_.get_coarselevels()+1; ++level)
         {
           level_coefficients = 0;
           for (InternalRowIterator row_iter = A_setup[level].begin1(); row_iter != A_setup[level].end1(); ++row_iter)
-          { 
+          {
             for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
             {
               if (level == 0)
@@ -642,48 +674,50 @@ namespace viennacl
 
       /** @brief Precondition Operation
       *
-      * @param vec The vector to which preconditioning is applied to 
+      * @param vec The vector to which preconditioning is applied to
       */
       template <typename VectorType>
       void apply(VectorType & vec) const
       {
         if (!done_init_apply)
-          init_apply();  
-        
+          init_apply();
+
         int level;
-        
+
         // Precondition operation (Yang, p.3).
         rhs[0] = vec;
-        for (level=0; level <(signed)_tag.get_coarselevels(); level++)
-        {    
+        for (level=0; level <static_cast<int>(tag_.get_coarselevels()); level++)
+        {
           result[level].clear();
-          
-          // Apply Smoother _presmooth times.
-          smooth_jacobi (level, _tag.get_presmooth(), result[level], rhs[level]);
 
-          #ifdef DEBUG
+          // Apply Smoother presmooth_ times.
+          smooth_jacobi (level, tag_.get_presmooth(), result[level], rhs[level]);
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "After presmooth: " << std::endl;
           printvector(result[level]);
           #endif
-          
+
           // Compute residual.
-          residual[level] = rhs[level] - viennacl::linalg::prod (A[level],result[level]);
-          
-          #ifdef DEBUG             
+          //residual[level] = rhs[level] - viennacl::linalg::prod (A[level],result[level]);
+          residual[level] = viennacl::linalg::prod (A[level],result[level]);
+          residual[level] = rhs[level] - residual[level];
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Residual: " << std::endl;
           printvector(residual[level]);
           #endif
-          
+
           // Restrict to coarse level. Result is RHS of coarse level equation.
           //residual_coarse[level] = viennacl::linalg::prod(R[level],residual[level]);
           rhs[level+1] = viennacl::linalg::prod(R[level],residual[level]);
-          
-          #ifdef DEBUG
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Restricted Residual: " << std::endl;
           printvector(rhs[level+1]);
           #endif
         }
-          
+
         // On highest level use direct solve to solve equation (on the CPU)
         //TODO: Use GPU direct solve!
         result[level] = rhs[level];
@@ -692,71 +726,69 @@ namespace viennacl
         copy (result[level],result_cpu);
         boost::numeric::ublas::lu_substitute(op,Permutation,result_cpu);
         copy (result_cpu, result[level]);
-        
-        #ifdef DEBUG
+
+        #ifdef VIENNACL_AMG_DEBUG
         std::cout << "After direct solve: " << std::endl;
         printvector (result[level]);
         #endif
-          
-        for (level=_tag.get_coarselevels()-1; level >= 0; level--)
-        {   
-          #ifdef DEBUG
+
+        for (level=tag_.get_coarselevels()-1; level >= 0; level--)
+        {
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Coarse Error: " << std::endl;
           printvector(result[level+1]);
           #endif
-          
+
           // Interpolate error to fine level and correct solution.
           result[level] += viennacl::linalg::prod(P[level],result[level+1]);
-              
-          #ifdef DEBUG
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Corrected Result: " << std::endl;
           printvector (result[level]);
           #endif
-          
-          // Apply Smoother _postsmooth times.
-          smooth_jacobi (level, _tag.get_postsmooth(), result[level], rhs[level]);
-          
-          #ifdef DEBUG
+
+          // Apply Smoother postsmooth_ times.
+          smooth_jacobi (level, tag_.get_postsmooth(), result[level], rhs[level]);
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "After postsmooth: " << std::endl;
           printvector (result[level]);
           #endif
-        }    
+        }
         vec = result[0];
       }
-      
+
       /** @brief Jacobi Smoother (GPU version)
       * @param level       Coarse level to which smoother is applied to
-      * @param iterations  Number of smoother iterations    
+      * @param iterations  Number of smoother iterations
       * @param x           The vector smoothing is applied to
       * @param rhs         The right hand side of the equation for the smoother
       */
       template <typename VectorType>
       void smooth_jacobi(int level, unsigned int iterations, VectorType & x, VectorType const & rhs) const
-      {     
-        VectorType old_result (x.size());
-  
-  //viennacl::ocl::program & p = viennacl::ocl::current_context().add_program
-  //          (viennacl::tools::make_double_kernel(jacobi_kernel,viennacl::ocl::current_device().info()), "jacobi_kernel");
-  //viennacl::ocl::kernel & k = p.add_kernel("jacobi");
-  
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<ScalarType, MAT_ALIGNMENT>::program_name(),
-                    "jacobi");
-  
+      {
+        VectorType old_result = x;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::init(ctx);
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::program_name(), "jacobi");
+
         for (unsigned int i=0; i<iterations; ++i)
         {
-          old_result = x;    
+          if (i > 0)
+            old_result = x;
           x.clear();
-          viennacl::ocl::enqueue(k(A[level].handle1(), A[level].handle2(), A[level].handle(),
-                                  static_cast<ScalarType>(_tag.get_jacobiweight()), 
-                                  old_result,
-                                  x,
-                                  rhs,
-                                  static_cast<cl_uint>(rhs.size()))); 
-          
+          viennacl::ocl::enqueue(k(A[level].handle1().opencl_handle(), A[level].handle2().opencl_handle(), A[level].handle().opencl_handle(),
+                                  static_cast<ScalarType>(tag_.get_jacobiweight()),
+                                  viennacl::traits::opencl_handle(old_result),
+                                  viennacl::traits::opencl_handle(x),
+                                  viennacl::traits::opencl_handle(rhs),
+                                  static_cast<cl_uint>(rhs.size())));
+
         }
       }
-      
-      amg_tag & tag() { return _tag; }
+
+      amg_tag & tag() { return tag_; }
     };
 
   }
diff --git a/viennacl/linalg/bicgstab.hpp b/viennacl/linalg/bicgstab.hpp
index 82ca73a..642b490 100644
--- a/viennacl/linalg/bicgstab.hpp
+++ b/viennacl/linalg/bicgstab.hpp
@@ -1,17 +1,18 @@
-#ifndef VIENNACL_BICGSTAB_HPP_
-#define VIENNACL_BICGSTAB_HPP_
+#ifndef VIENNACL_LINALG_BICGSTAB_HPP_
+#define VIENNACL_LINALG_BICGSTAB_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -27,6 +28,7 @@
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
 #include "viennacl/traits/clear.hpp"
 #include "viennacl/traits/size.hpp"
 #include "viennacl/meta/result_of.hpp"
@@ -35,7 +37,7 @@ namespace viennacl
 {
   namespace linalg
   {
-    
+
     /** @brief A tag for the stabilized Bi-conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
     */
     class bicgstab_tag
@@ -44,33 +46,38 @@ namespace viennacl
         /** @brief The constructor
         *
         * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
-        * @param max_iterations   The maximum number of iterations
+        * @param max_iters        The maximum number of iterations
+        * @param max_iters_before_restart   The maximum number of iterations before BiCGStab is reinitialized (to avoid accumulation of round-off errors)
         */
-        bicgstab_tag(double tol = 1e-8, unsigned int max_iterations = 300) : _tol(tol), _iterations(max_iterations) {};
-      
+        bicgstab_tag(double tol = 1e-8, vcl_size_t max_iters = 400, vcl_size_t max_iters_before_restart = 200)
+          : tol_(tol), iterations_(max_iters), iterations_before_restart_(max_iters_before_restart) {}
+
         /** @brief Returns the relative tolerance */
-        double tolerance() const { return _tol; }
+        double tolerance() const { return tol_; }
         /** @brief Returns the maximum number of iterations */
-        unsigned int max_iterations() const { return _iterations; }
-        
+        vcl_size_t max_iterations() const { return iterations_; }
+        /** @brief Returns the maximum number of iterations before a restart*/
+        vcl_size_t max_iterations_before_restart() const { return iterations_before_restart_; }
+
         /** @brief Return the number of solver iterations: */
-        unsigned int iters() const { return iters_taken_; }
-        void iters(unsigned int i) const { iters_taken_ = i; }
-        
+        vcl_size_t iters() const { return iters_taken_; }
+        void iters(vcl_size_t i) const { iters_taken_ = i; }
+
         /** @brief Returns the estimated relative error at the end of the solver run */
         double error() const { return last_error_; }
         /** @brief Sets the estimated relative error at the end of the solver run */
         void error(double e) const { last_error_ = e; }
-        
+
       private:
-        double _tol;
-        unsigned int _iterations;
+        double tol_;
+        vcl_size_t iterations_;
+        vcl_size_t iterations_before_restart_;
 
         //return values from solver
-        mutable unsigned int iters_taken_;
+        mutable vcl_size_t iters_taken_;
         mutable double last_error_;
     };
-    
+
 
     /** @brief Implementation of the stabilized Bi-conjugate gradient solver
     *
@@ -86,72 +93,78 @@ namespace viennacl
     {
       typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-      unsigned int problem_size = viennacl::traits::size(rhs);
-      VectorType result(problem_size);
+      VectorType result = rhs;
       viennacl::traits::clear(result);
 
       VectorType residual = rhs;
       VectorType p = rhs;
       VectorType r0star = rhs;
-      VectorType tmp0(problem_size);
-      VectorType tmp1(problem_size);
-      VectorType s(problem_size);
+      VectorType tmp0 = rhs;
+      VectorType tmp1 = rhs;
+      VectorType s = rhs;
 
-      CPU_ScalarType ip_rr0star = viennacl::linalg::inner_prod(rhs,r0star);
-      CPU_ScalarType norm_rhs_host = ip_rr0star;
+      CPU_ScalarType norm_rhs_host = viennacl::linalg::norm_2(residual);
+      CPU_ScalarType ip_rr0star = norm_rhs_host * norm_rhs_host;
       CPU_ScalarType beta;
       CPU_ScalarType alpha;
       CPU_ScalarType omega;
-      ScalarType inner_prod_temp; //temporary variable for inner product computation
-      ScalarType new_ip_rr0star = 0;
-      
-      for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+      //ScalarType inner_prod_temp; //temporary variable for inner product computation
+      CPU_ScalarType new_ip_rr0star = 0;
+      CPU_ScalarType residual_norm = norm_rhs_host;
+
+      if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
+        return result;
+
+      bool restart_flag = true;
+      vcl_size_t last_restart = 0;
+      for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
       {
+        if (restart_flag)
+        {
+          residual = rhs;
+          residual -= viennacl::linalg::prod(matrix, result);
+          p = residual;
+          r0star = residual;
+          ip_rr0star = viennacl::linalg::norm_2(residual);
+          ip_rr0star *= ip_rr0star;
+          restart_flag = false;
+          last_restart = i;
+        }
+
         tag.iters(i+1);
         tmp0 = viennacl::linalg::prod(matrix, p);
-        //alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
-        inner_prod_temp = viennacl::linalg::inner_prod(tmp0, r0star);
-        alpha = ip_rr0star / static_cast<CPU_ScalarType>(inner_prod_temp);
-
-        //s = residual - alpha*tmp0;
-        s = residual;
-        s -= alpha*tmp0;
-        
+        alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+        s = residual - alpha*tmp0;
+
         tmp1 = viennacl::linalg::prod(matrix, s);
-        //omega = viennacl::linalg::inner_prod(tmp1, s) / viennacl::linalg::inner_prod(tmp1, tmp1);
-        inner_prod_temp = viennacl::linalg::inner_prod(tmp1, s);
-        omega = inner_prod_temp;
-        inner_prod_temp = viennacl::linalg::inner_prod(tmp1, tmp1);
-        omega /= inner_prod_temp;
-        
-        //result += alpha * p + omega * s;
-        result += alpha * p;
-        result += omega * s;
-        
-        //residual = s - omega * tmp1;
-        residual = s;
-        residual -= omega*tmp1;
-        
-        new_ip_rr0star = viennacl::linalg::inner_prod(residual,r0star);
-        if (fabs(CPU_ScalarType(viennacl::linalg::inner_prod(residual, residual)) / norm_rhs_host) < tag.tolerance() * tag.tolerance())
+        CPU_ScalarType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+        omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+        result += alpha * p + omega * s;
+        residual = s - omega * tmp1;
+
+        new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+        residual_norm = viennacl::linalg::norm_2(residual);
+        if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance())
           break;
-        
-        //beta = new_ip_rr0star / ip_rr0star * alpha/omega;
-        CPU_ScalarType cpu_temp = new_ip_rr0star; //read from device only once
-        beta = cpu_temp / ip_rr0star * alpha/omega;
-        ip_rr0star = cpu_temp;
+
+        beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+        ip_rr0star = new_ip_rr0star;
+
+        if (ip_rr0star == 0 || omega == 0 || i - last_restart > tag.max_iterations_before_restart()) //search direction degenerate. A restart might help
+          restart_flag = true;
 
         // Execution of
         //  p = residual + beta * (p - omega*tmp0);
         // without introducing temporary vectors:
         p -= omega * tmp0;
-        p *= beta;
-        p += residual;
+        p = residual + beta * p;
       }
-      
+
       //store last error estimate:
-      tag.error(std::sqrt(fabs(CPU_ScalarType(viennacl::linalg::inner_prod(residual, residual)) / norm_rhs_host)));
-      
+      tag.error(residual_norm / norm_rhs_host);
+
       return result;
     }
 
@@ -176,77 +189,84 @@ namespace viennacl
     {
       typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-      unsigned int problem_size = viennacl::traits::size(rhs);
-      VectorType result(problem_size);
-      result.clear();
+      VectorType result = rhs;
+      viennacl::traits::clear(result);
 
       VectorType residual = rhs;
-      precond.apply(residual);
       VectorType r0star = residual;  //can be chosen arbitrarily in fact
-      VectorType tmp0(problem_size);
-      VectorType tmp1(problem_size);
-      VectorType s(problem_size);
-      
+      VectorType tmp0 = rhs;
+      VectorType tmp1 = rhs;
+      VectorType s = rhs;
+
       VectorType p = residual;
 
-      CPU_ScalarType ip_rr0star = viennacl::linalg::inner_prod(residual,r0star);
-      CPU_ScalarType norm_rhs_host = ip_rr0star;
+      CPU_ScalarType ip_rr0star = viennacl::linalg::norm_2(residual);
+      CPU_ScalarType norm_rhs_host = viennacl::linalg::norm_2(residual);
       CPU_ScalarType beta;
       CPU_ScalarType alpha;
       CPU_ScalarType omega;
-      ScalarType new_ip_rr0star = 0;
-      ScalarType inner_prod_temp; //temporary variable for inner product
-      
+      CPU_ScalarType new_ip_rr0star = 0;
+      CPU_ScalarType residual_norm = norm_rhs_host;
+
+      if (norm_rhs_host == 0) //solution is zero if RHS norm is zero
+        return result;
+
+      bool restart_flag = true;
+      vcl_size_t last_restart = 0;
       for (unsigned int i = 0; i < tag.max_iterations(); ++i)
       {
+        if (restart_flag)
+        {
+          residual = rhs;
+          residual -= viennacl::linalg::prod(matrix, result);
+          precond.apply(residual);
+          p = residual;
+          r0star = residual;
+          ip_rr0star = viennacl::linalg::norm_2(residual);
+          ip_rr0star *= ip_rr0star;
+          restart_flag = false;
+          last_restart = i;
+        }
+
         tag.iters(i+1);
         tmp0 = viennacl::linalg::prod(matrix, p);
         precond.apply(tmp0);
-        //alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
-        inner_prod_temp = viennacl::linalg::inner_prod(tmp0, r0star);
-        alpha = ip_rr0star / static_cast<CPU_ScalarType>(inner_prod_temp);
+        alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
 
-        //s = residual - alpha*tmp0;
-        s = residual;
-        s -= alpha*tmp0;
+        s = residual - alpha*tmp0;
 
         tmp1 = viennacl::linalg::prod(matrix, s);
         precond.apply(tmp1);
-        //omega = viennacl::linalg::inner_prod(tmp1, s) / viennacl::linalg::inner_prod(tmp1, tmp1);
-        inner_prod_temp = viennacl::linalg::inner_prod(tmp1, s);
-        omega = inner_prod_temp;
-        inner_prod_temp = viennacl::linalg::inner_prod(tmp1, tmp1);
-        omega /= inner_prod_temp;
-        
-        //result += alpha * p + omega * s;
-        result += alpha * p;
-        result += omega * s;
-        //residual = s - omega * tmp1;
-        residual = s;
-        residual -= omega*tmp1;
-        
-        new_ip_rr0star = viennacl::linalg::inner_prod(residual,r0star);
-        if (fabs(CPU_ScalarType(viennacl::linalg::inner_prod(residual, residual) / norm_rhs_host)) < tag.tolerance() * tag.tolerance() )
+        CPU_ScalarType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+        omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+        result += alpha * p + omega * s;
+        residual = s - omega * tmp1;
+
+        residual_norm = viennacl::linalg::norm_2(residual);
+        if (residual_norm / norm_rhs_host < tag.tolerance())
           break;
-        
-        //beta = new_ip_rr0star / ip_rr0star * alpha/omega;
-        CPU_ScalarType cpu_temp = new_ip_rr0star; //read from device only once
-        beta = cpu_temp / ip_rr0star * alpha/omega;
-        ip_rr0star = cpu_temp;
+
+        new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+
+        beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+        ip_rr0star = new_ip_rr0star;
+
+        if (ip_rr0star == 0 || omega == 0 || i - last_restart > tag.max_iterations_before_restart()) //search direction degenerate. A restart might help
+          restart_flag = true;
 
         // Execution of
         //  p = residual + beta * (p - omega*tmp0);
         // without introducing temporary vectors:
         p -= omega * tmp0;
-        p *= beta;
-        p += residual;
-        
+        p = residual + beta * p;
+
         //std::cout << "Rel. Residual in current step: " << std::sqrt(std::fabs(viennacl::linalg::inner_prod(residual, residual) / norm_rhs_host)) << std::endl;
       }
-      
+
       //store last error estimate:
-      tag.error(std::sqrt(fabs(CPU_ScalarType(viennacl::linalg::inner_prod(residual, residual)) / norm_rhs_host)));
-      
+      tag.error(residual_norm / norm_rhs_host);
+
       return result;
     }
 
diff --git a/viennacl/linalg/bisect.hpp b/viennacl/linalg/bisect.hpp
new file mode 100644
index 0000000..3c04917
--- /dev/null
+++ b/viennacl/linalg/bisect.hpp
@@ -0,0 +1,176 @@
+#ifndef VIENNACL_LINALG_BISECT_HPP_
+#define VIENNACL_LINALG_BISECT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/bisect.hpp
+*   @brief Implementation of the algorithm for finding eigenvalues of a tridiagonal matrix.
+*
+*   Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include <cstddef>
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    namespace detail
+    {
+      /**
+      *    @brief overloaded function for copying vectors
+      */
+      template <typename T, typename OtherVectorType>
+      void copy_vec_to_vec(viennacl::vector<T> const & src, OtherVectorType & dest)
+      {
+        viennacl::copy(src, dest);
+      }
+
+      template <typename OtherVectorType, typename T>
+      void copy_vec_to_vec(OtherVectorType const & src, viennacl::vector<T> & dest)
+      {
+        viennacl::copy(src, dest);
+      }
+
+      template <typename VectorType1, typename VectorType2>
+      void copy_vec_to_vec(VectorType1 const & src, VectorType2 & dest)
+      {
+        for (vcl_size_t i=0; i<src.size(); ++i)
+          dest[i] = src[i];
+      }
+    }
+
+    /**
+    *   @brief Implementation of the bisect-algorithm for the calculation of the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
+    *
+    *   @param alphas       Elements of the main diagonal
+    *   @param betas        Elements of the secondary diagonal
+    *   @return             Returns the eigenvalues of the tridiagonal matrix defined by alpha and beta
+    */
+    template< typename VectorT >
+    std::vector<
+            typename viennacl::result_of::cpu_value_type<typename VectorT::value_type>::type
+            >
+    bisect(VectorT const & alphas, VectorT const & betas)
+    {
+      typedef typename viennacl::result_of::value_type<VectorT>::type           ScalarType;
+      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+      vcl_size_t size = betas.size();
+      std::vector<CPU_ScalarType>  x_temp(size);
+
+
+      std::vector<CPU_ScalarType> beta_bisect;
+      std::vector<CPU_ScalarType> wu;
+
+      double rel_error = std::numeric_limits<CPU_ScalarType>::epsilon();
+      beta_bisect.push_back(0);
+
+      for(vcl_size_t i = 1; i < size; i++){
+              beta_bisect.push_back(betas[i] * betas[i]);
+      }
+
+      double xmin = alphas[size - 1] - std::fabs(betas[size - 1]);
+      double xmax = alphas[size - 1] + std::fabs(betas[size - 1]);
+
+      for(vcl_size_t i = 0; i < size - 1; i++)
+      {
+        double h = std::fabs(betas[i]) + std::fabs(betas[i + 1]);
+        if (alphas[i] + h > xmax)
+          xmax = alphas[i] + h;
+        if (alphas[i] - h < xmin)
+          xmin = alphas[i] - h;
+      }
+
+
+      double eps1 = 1e-6;
+      /*double eps2 = (xmin + xmax > 0) ? (rel_error * xmax) : (-rel_error * xmin);
+      if(eps1 <= 0)
+        eps1 = eps2;
+      else
+        eps2 = 0.5 * eps1 + 7.0 * eps2; */
+
+      double x0 = xmax;
+
+      for(vcl_size_t i = 0; i < size; i++)
+      {
+        x_temp[i] = xmax;
+        wu.push_back(xmin);
+      }
+
+      for(long k = static_cast<long>(size) - 1; k >= 0; --k)
+      {
+        double xu = xmin;
+        for(long i = k; i >= 0; --i)
+        {
+          if(xu < wu[k-i])
+          {
+            xu = wu[i];
+            break;
+          }
+        }
+
+        if(x0 > x_temp[k])
+          x0 = x_temp[k];
+
+        double x1 = (xu + x0) / 2.0;
+        while (x0 - xu > 2.0 * rel_error * (std::fabs(xu) + std::fabs(x0)) + eps1)
+        {
+          vcl_size_t a = 0;
+          double q = 1;
+          for(vcl_size_t i = 0; i < size; i++)
+          {
+            if(q != 0)
+              q = alphas[i] - x1 - beta_bisect[i] / q;
+            else
+              q = alphas[i] - x1 - std::fabs(betas[i] / rel_error);
+
+            if(q < 0)
+              a++;
+          }
+
+          if (a <= static_cast<vcl_size_t>(k))
+          {
+            xu = x1;
+            if(a < 1)
+              wu[0] = x1;
+            else
+            {
+              wu[a] = x1;
+              if(x_temp[a - 1] > x1)
+                  x_temp[a - 1] = x1;
+            }
+          }
+          else
+            x0 = x1;
+
+          x1 = (xu + x0) / 2.0;
+        }
+        x_temp[k] = x1;
+      }
+      return x_temp;
+    }
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
diff --git a/viennacl/linalg/cg.hpp b/viennacl/linalg/cg.hpp
index c7b7b57..e981239 100644
--- a/viennacl/linalg/cg.hpp
+++ b/viennacl/linalg/cg.hpp
@@ -1,23 +1,24 @@
-#ifndef VIENNACL_CG_HPP_
-#define VIENNACL_CG_HPP_
+#ifndef VIENNACL_LINALG_CG_HPP_
+#define VIENNACL_LINALG_CG_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file cg.hpp
+/** @file viennacl/linalg/cg.hpp
     @brief The conjugate gradient method is implemented here
 */
 
@@ -29,6 +30,7 @@
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
 #include "viennacl/traits/clear.hpp"
 #include "viennacl/traits/size.hpp"
 #include "viennacl/meta/result_of.hpp"
@@ -37,7 +39,7 @@ namespace viennacl
 {
   namespace linalg
   {
-    
+
     /** @brief A tag for the conjugate gradient Used for supplying solver parameters and for dispatching the solve() function
     */
     class cg_tag
@@ -48,32 +50,32 @@ namespace viennacl
         * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
         * @param max_iterations   The maximum number of iterations
         */
-        cg_tag(double tol = 1e-8, unsigned int max_iterations = 300) : _tol(tol), _iterations(max_iterations) {};
-      
+        cg_tag(double tol = 1e-8, unsigned int max_iterations = 300) : tol_(tol), iterations_(max_iterations) {}
+
         /** @brief Returns the relative tolerance */
-        double tolerance() const { return _tol; }
+        double tolerance() const { return tol_; }
         /** @brief Returns the maximum number of iterations */
-        unsigned int max_iterations() const { return _iterations; }
-        
+        unsigned int max_iterations() const { return iterations_; }
+
         /** @brief Return the number of solver iterations: */
         unsigned int iters() const { return iters_taken_; }
         void iters(unsigned int i) const { iters_taken_ = i; }
-        
+
         /** @brief Returns the estimated relative error at the end of the solver run */
         double error() const { return last_error_; }
         /** @brief Sets the estimated relative error at the end of the solver run */
         void error(double e) const { last_error_ = e; }
-        
-        
+
+
       private:
-        double _tol;
-        unsigned int _iterations;
-        
+        double tol_;
+        unsigned int iterations_;
+
         //return values from solver
         mutable unsigned int iters_taken_;
         mutable double last_error_;
     };
-    
+
 
     /** @brief Implementation of the conjugate gradient solver without preconditioner
     *
@@ -91,52 +93,46 @@ namespace viennacl
       typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
       //std::cout << "Starting CG" << std::endl;
-      std::size_t problem_size = viennacl::traits::size(rhs);
-      VectorType result(problem_size);
+      VectorType result = rhs;
       viennacl::traits::clear(result);
 
       VectorType residual = rhs;
       VectorType p = rhs;
-      VectorType tmp(problem_size);
+      VectorType tmp = rhs;
 
-      ScalarType tmp_in_p;
-      ScalarType residual_norm_squared;
       CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(rhs,rhs);
       CPU_ScalarType alpha;
       CPU_ScalarType new_ip_rr = 0;
       CPU_ScalarType beta;
-      CPU_ScalarType norm_rhs_squared = ip_rr;
-      
+      CPU_ScalarType norm_rhs = std::sqrt(ip_rr);
+
       //std::cout << "Starting CG solver iterations... " << std::endl;
-      
+      if (norm_rhs == 0) //solution is zero if RHS norm is zero
+        return result;
+
       for (unsigned int i = 0; i < tag.max_iterations(); ++i)
       {
         tag.iters(i+1);
         tmp = viennacl::linalg::prod(matrix, p);
 
-        //alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);
-        tmp_in_p = viennacl::linalg::inner_prod(tmp, p);
-        alpha = ip_rr / static_cast<CPU_ScalarType>(tmp_in_p);
-        
+        alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);
         result += alpha * p;
         residual -= alpha * tmp;
-        
-        residual_norm_squared = viennacl::linalg::inner_prod(residual,residual);
-        new_ip_rr = static_cast<CPU_ScalarType>(residual_norm_squared);
-        if (new_ip_rr / norm_rhs_squared < tag.tolerance() *  tag.tolerance())//squared norms involved here
+
+        new_ip_rr = viennacl::linalg::norm_2(residual);
+        if (new_ip_rr / norm_rhs < tag.tolerance())
           break;
-        
+        new_ip_rr *= new_ip_rr;
+
         beta = new_ip_rr / ip_rr;
         ip_rr = new_ip_rr;
 
-        //p = residual + beta*p;
-        p *= beta;
-        p += residual;
-      } 
-      
+        p = residual + beta * p;
+      }
+
       //store last error estimate:
-      tag.error(sqrt(new_ip_rr / norm_rhs_squared));
-      
+      tag.error(std::sqrt(new_ip_rr) / norm_rhs);
+
       return result;
     }
 
@@ -161,56 +157,52 @@ namespace viennacl
     {
       typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-      unsigned int problem_size = viennacl::traits::size(rhs);
-      
-      VectorType result(problem_size);
-      result.clear();
-      
+
+      VectorType result = rhs;
+      viennacl::traits::clear(result);
+
       VectorType residual = rhs;
-      VectorType tmp(problem_size);
+      VectorType tmp = rhs;
       VectorType z = rhs;
 
       precond.apply(z);
       VectorType p = z;
 
-      ScalarType tmp_in_p;
-      ScalarType residual_in_z;
       CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(residual, z);
       CPU_ScalarType alpha;
       CPU_ScalarType new_ip_rr = 0;
       CPU_ScalarType beta;
       CPU_ScalarType norm_rhs_squared = ip_rr;
       CPU_ScalarType new_ipp_rr_over_norm_rhs;
-      
+
+      if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
+        return result;
+
       for (unsigned int i = 0; i < tag.max_iterations(); ++i)
       {
         tag.iters(i+1);
         tmp = viennacl::linalg::prod(matrix, p);
-        
-        tmp_in_p = viennacl::linalg::inner_prod(tmp, p);
-        alpha = ip_rr / static_cast<CPU_ScalarType>(tmp_in_p);
-        
+
+        alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);
+
         result += alpha * p;
         residual -= alpha * tmp;
         z = residual;
         precond.apply(z);
-        
-        residual_in_z = viennacl::linalg::inner_prod(residual, z);
-        new_ip_rr = static_cast<CPU_ScalarType>(residual_in_z);
+
+        new_ip_rr = viennacl::linalg::inner_prod(residual, z);
         new_ipp_rr_over_norm_rhs = new_ip_rr / norm_rhs_squared;
         if (std::fabs(new_ipp_rr_over_norm_rhs) < tag.tolerance() *  tag.tolerance())    //squared norms involved here
           break;
-        
+
         beta = new_ip_rr / ip_rr;
         ip_rr = new_ip_rr;
-        
-        //p = z + beta*p;
-        p *= beta;
-        p += z;
-      } 
+
+        p = z + beta*p;
+      }
 
       //store last error estimate:
-      tag.error(sqrt(std::fabs(new_ip_rr / norm_rhs_squared)));
+      tag.error(std::sqrt(std::fabs(new_ip_rr / norm_rhs_squared)));
 
       return result;
     }
diff --git a/viennacl/linalg/circulant_matrix_operations.hpp b/viennacl/linalg/circulant_matrix_operations.hpp
index 49d42d1..4d612e4 100644
--- a/viennacl/linalg/circulant_matrix_operations.hpp
+++ b/viennacl/linalg/circulant_matrix_operations.hpp
@@ -2,29 +2,28 @@
 #define VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file circulant_matrix_operations.hpp
-    @brief Implementations of operations using circulant_matrix
+/** @file viennacl/linalg/circulant_matrix_operations.hpp
+    @brief Implementations of operations using circulant_matrix. Experimental.
 */
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/backend.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/tools.hpp"
@@ -35,48 +34,9 @@ namespace viennacl
 {
   namespace linalg
   {
-    
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const circulant_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const circulant_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const circulant_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
+
     // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a circulant_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::circulant_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                viennacl::op_prod > prod_impl(const viennacl::circulant_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                                              const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                                              size_t NUM_THREADS)
-    {
-      return viennacl::vector_expression<const viennacl::circulant_matrix<SCALARTYPE, ALIGNMENT>,
-                               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               viennacl::op_prod >(mat, vec);
-    }
-    
+
     /** @brief Carries out matrix-vector multiplication with a circulant_matrix
     *
     * Implementation of the convenience expression result = prod(mat, vec);
@@ -85,22 +45,22 @@ namespace viennacl
     * @param vec    The vector
     * @param result The result vector
     */
-      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(const viennacl::circulant_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                     const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
-                           viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
+      template<class SCALARTYPE, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::circulant_matrix<SCALARTYPE, ALIGNMENT> & mat,
+                     const viennacl::vector_base<SCALARTYPE> & vec,
+                           viennacl::vector_base<SCALARTYPE> & result)
       {
         assert(mat.size1() == result.size());
         assert(mat.size2() == vec.size());
         //result.clear();
-        
+
         //std::cout << "prod(circulant_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
-        
-        viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> circ(mat.elements().size() * 2);
+
+        viennacl::vector<SCALARTYPE> circ(mat.elements().size() * 2);
         viennacl::detail::fft::real_to_complex(mat.elements(), circ, mat.elements().size());
 
-        viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp(vec.size() * 2);
-        viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp2(vec.size() * 2);
+        viennacl::vector<SCALARTYPE> tmp(vec.size() * 2);
+        viennacl::vector<SCALARTYPE> tmp2(vec.size() * 2);
 
         viennacl::detail::fft::real_to_complex(vec, tmp, vec.size());
         viennacl::linalg::convolve(circ, tmp, tmp2);
@@ -111,107 +71,6 @@ namespace viennacl
   } //namespace linalg
 
 
-
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
-    }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.get_lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
-
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
-
 } //namespace viennacl
 
 
diff --git a/viennacl/linalg/compressed_matrix_operations.hpp b/viennacl/linalg/compressed_matrix_operations.hpp
deleted file mode 100644
index f31af4b..0000000
--- a/viennacl/linalg/compressed_matrix_operations.hpp
+++ /dev/null
@@ -1,265 +0,0 @@
-#ifndef VIENNACL_COMPRESSED_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_COMPRESSED_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file compressed_matrix_operations.hpp
-    @brief Implementations of operations using compressed_matrix
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/kernels/compressed_matrix_kernels.h"
-
-namespace viennacl
-{
-  namespace linalg
-  {
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const compressed_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const compressed_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const compressed_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
-    /** @brief Carries out matrix-vector multiplication with a compressed_matrix
-    *
-    * Implementation of the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param result The result vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & mat, 
-                   const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec,
-                         viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result, 
-                   size_t NUM_THREADS = 0)
-    {
-      assert(mat.size1() == result.size());
-      assert(mat.size2() == vec.size());
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
-      
-      viennacl::ocl::enqueue(k(mat.handle1(), mat.handle2(), mat.handle(), vec, result, static_cast<cl_uint>(mat.size1())));
-    }
-
-    /** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
-    *
-    * @param L    The matrix
-    * @param vec    The vector
-    */
-    template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT, unsigned int VEC_ALIGNMENT>
-    void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L, vector<SCALARTYPE, VEC_ALIGNMENT> & vec, viennacl::linalg::unit_lower_tag)
-    {
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>::program_name(), "lu_forward");
-      unsigned int threads = k.local_work_size();
-
-      k.global_work_size(k.local_work_size());
-      viennacl::ocl::enqueue(k(L.handle1(), L.handle2(), L,
-                                                              viennacl::ocl::local_mem(sizeof(int) * (threads+1)),
-                                                              viennacl::ocl::local_mem(sizeof(SCALARTYPE) * threads),
-                                                              vec, L.size1()));        
-    }
-    
-    /** @brief Convenience functions for result = solve(trans(mat), vec, unit_lower_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
-    *
-    * @param L      The lower triangular sparse matrix
-    * @param vec    The load vector, where the solution is directly written to
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT, unsigned int VEC_ALIGNMENT, typename TAG>
-    vector<SCALARTYPE, VEC_ALIGNMENT> solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L,
-                                        const vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
-                                        const viennacl::linalg::unit_lower_tag & tag)
-    {
-      // do an inplace solve on the result vector:
-      vector<SCALARTYPE, VEC_ALIGNMENT> result(vec.size());
-      result = vec;
-
-      inplace_solve(L, result, tag);
-    
-      return result;
-    }
-    
-    
-    /** @brief Inplace solution of a upper triangular compressed_matrix. Typically used for LU substitutions
-    *
-    * @param U      The upper triangular matrix
-    * @param vec    The vector
-    */
-    template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT, unsigned int VEC_ALIGNMENT>
-    void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & U, vector<SCALARTYPE, VEC_ALIGNMENT> & vec, viennacl::linalg::upper_tag)
-    {
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>::program_name(), "lu_backward");
-      unsigned int threads = k.local_work_size();
-      
-      k.global_work_size(k.local_work_size());
-      viennacl::ocl::enqueue(k(U.handle1().get(), U.handle2().get(), U.handle().get(),
-                                                              viennacl::ocl::local_mem(sizeof(int) * (threads+2)),
-                                                              viennacl::ocl::local_mem(sizeof(SCALARTYPE) * (threads+2)),
-                                                              vec, U.size1()));        
-    }
-
-    /** @brief Convenience functions for result = solve(trans(mat), vec, unit_lower_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
-    *
-    * @param L      The lower triangular sparse matrix
-    * @param vec    The load vector, where the solution is directly written to
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT, unsigned int VEC_ALIGNMENT, typename TAG>
-    vector<SCALARTYPE, VEC_ALIGNMENT> solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L,
-                                        const vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
-                                        viennacl::linalg::upper_tag const & tag)
-    {
-      // do an inplace solve on the result vector:
-      vector<SCALARTYPE, VEC_ALIGNMENT> result(vec.size());
-      result = vec;
-    
-      inplace_solve(L, result, tag);
-    
-      return result;
-    }
-
-    
-  } //namespace linalg
-
-
-
-    //v = A * x
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
-    }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
-
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
-
-} //namespace viennacl
-
-
-#endif
diff --git a/viennacl/linalg/coordinate_matrix_operations.hpp b/viennacl/linalg/coordinate_matrix_operations.hpp
deleted file mode 100644
index 8b46b7f..0000000
--- a/viennacl/linalg/coordinate_matrix_operations.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-#ifndef VIENNACL_COORDINATE_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_COORDINATE_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file coordinate_matrix_operations.hpp
-    @brief Implementations of operations using coordinate_matrix
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
-
-namespace viennacl
-{
-  namespace linalg
-  {
-    
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a coordinate_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                viennacl::op_prod > prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                                              const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                                              size_t NUM_THREADS)
-    {
-      return viennacl::vector_expression<const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT>,
-                               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               viennacl::op_prod >(mat, vec);
-    }
-    
-    //namespace {
-    /** @brief Carries out matrix-vector multiplication with a coordinate_matrix
-    *
-    * Implementation of the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param result The result vector
-    */
-      template<class TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(const viennacl::coordinate_matrix<TYPE, ALIGNMENT> & mat, 
-                     const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec,
-                           viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-        result.clear();
-        
-        //std::cout << "prod(coordinate_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::coordinate_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
-        unsigned int thread_num = 256; //k.local_work_size(0);
-        
-        k.local_work_size(0, thread_num);
-        
-        k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
-        //k.global_work_size(0, thread_num);  //Only one work group
-        viennacl::ocl::enqueue(k(mat.handle12(), mat, mat.handle3(),
-                                 vec,
-                                 result,
-                                 viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
-                                 viennacl::ocl::local_mem(sizeof(TYPE)*thread_num)) );
-
-      }
-    //};
-
-  } //namespace linalg
-
-
-
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
-    }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.get_lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
-
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
-
-} //namespace viennacl
-
-
-#endif
diff --git a/viennacl/linalg/cuda/common.hpp b/viennacl/linalg/cuda/common.hpp
new file mode 100644
index 0000000..6962d43
--- /dev/null
+++ b/viennacl/linalg/cuda/common.hpp
@@ -0,0 +1,189 @@
+#ifndef VIENNACL_LINALG_CUDA_COMMON_HPP_
+#define VIENNACL_LINALG_CUDA_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/common.hpp
+    @brief Common routines for CUDA execution
+*/
+
+#include "viennacl/traits/handle.hpp"
+
+#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)  detail::cuda_last_error_check (message, __FILE__, __LINE__)
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+      namespace detail
+      {
+        inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+        {
+          return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+        }
+
+        inline void cuda_last_error_check(const char * message, const char * file, const int line )
+        {
+          cudaError_t error_code = cudaGetLastError();
+
+          if(cudaSuccess != error_code)
+          {
+            std::cerr << file << "(" << line << "): " << ": getLastCudaError() CUDA error " << error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message << std::endl;
+            throw "CUDA error";
+          }
+        }
+
+        template <typename T, typename U>
+        T * cuda_arg(vector_base<U> & obj)
+        {
+          return reinterpret_cast<T *>(viennacl::traits::handle(obj).cuda_handle().get());
+        }
+
+        template <typename T, typename U>
+        const T * cuda_arg(vector_base<U> const & obj)
+        {
+          return reinterpret_cast<const T *>(viennacl::traits::handle(obj).cuda_handle().get());
+        }
+
+        template <typename NumericT, typename F>
+        NumericT * cuda_arg(matrix_base<NumericT, F> & obj)
+        {
+          return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+        }
+
+        template <typename NumericT, typename F>
+        const NumericT * cuda_arg(matrix_base<NumericT, F> const & obj)
+        {
+          return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+        }
+
+
+        template <typename ScalarType, typename T>
+        typename viennacl::enable_if< viennacl::is_scalar<T>::value,
+                                      ScalarType *>::type
+        cuda_arg(T & obj)
+        {
+          return reinterpret_cast<ScalarType *>(viennacl::traits::handle(obj).cuda_handle().get());
+        }
+
+        template <typename ScalarType, typename T>
+        typename viennacl::enable_if< viennacl::is_scalar<T>::value,
+                                      const ScalarType *>::type
+        cuda_arg(T const & obj)
+        {
+          return reinterpret_cast<const ScalarType *>(viennacl::traits::handle(obj).cuda_handle().get());
+        }
+
+        template <typename ScalarType>
+        ScalarType *  cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h)
+        {
+          return reinterpret_cast<ScalarType *>(h.get());
+        }
+
+        template <typename ScalarType>
+        ScalarType const *  cuda_arg(viennacl::backend::mem_handle::cuda_handle_type const & h)
+        {
+          return reinterpret_cast<const ScalarType *>(h.get());
+        }
+
+        //template <typename ScalarType>
+        //ScalarType cuda_arg(ScalarType const & val)  { return val; }
+
+        inline unsigned int cuda_arg(unsigned int val)  { return val; }
+
+        template <typename T> char           cuda_arg(char val)           { return val; }
+        template <typename T> unsigned char  cuda_arg(unsigned char val)  { return val; }
+
+        template <typename T> short          cuda_arg(short val)          { return val; }
+        template <typename T> unsigned short cuda_arg(unsigned short val) { return val; }
+
+        template <typename T> int            cuda_arg(int val)            { return val; }
+        template <typename T> unsigned int   cuda_arg(unsigned int val)   { return val; }
+
+        template <typename T> long           cuda_arg(long val)           { return val; }
+        template <typename T> unsigned long  cuda_arg(unsigned long val)  { return val; }
+
+        template <typename T> float          cuda_arg(float val)          { return val; }
+        template <typename T> double         cuda_arg(double val)         { return val; }
+
+        template <typename T, typename U>
+        typename viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar<T> & s, U) { return s.handle().cuda_handle(); }
+
+        template <typename T, typename U>
+        typename viennacl::backend::mem_handle::cuda_handle_type const & arg_reference(viennacl::scalar<T> const & s, U) { return s.handle().cuda_handle(); }
+
+        // all other cases where T is not a ViennaCL scalar
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      char const &>::type
+        arg_reference(T, char const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      unsigned char const &>::type
+        arg_reference(T, unsigned char const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      short const &>::type
+        arg_reference(T, short const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      unsigned short const &>::type
+        arg_reference(T, unsigned short const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      int const &>::type
+        arg_reference(T, int const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      unsigned int const &>::type
+        arg_reference(T, unsigned int const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      long const &>::type
+        arg_reference(T, long const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      unsigned long const &>::type
+        arg_reference(T, unsigned long const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      float const &>::type
+        arg_reference(T, float const & val)  { return val; }
+
+        template <typename T>
+        typename viennacl::enable_if< viennacl::is_cpu_scalar<T>::value,
+                                      double const &>::type
+        arg_reference(T, double const & val)  { return val; }
+      } //namespace detail
+
+    } //namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/direct_solve.hpp b/viennacl/linalg/cuda/direct_solve.hpp
new file mode 100644
index 0000000..8c6a1d4
--- /dev/null
+++ b/viennacl/linalg/cuda/direct_solve.hpp
@@ -0,0 +1,523 @@
+#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/direct_solve.hpp
+    @brief Implementations of dense direct solvers using CUDA are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+
      /** @brief CUDA kernel for an in-place upper triangular solve op(A) \ op(B) with multiple right-hand sides.
      *
      * Each CUDA block handles one right-hand side of B (selected via blockIdx.x);
      * the threads of the block eliminate the entries of one column of op(A) in parallel.
      * Rows are processed from the last to the first (backward substitution).
      * The row_major_* and transpose_* flags select the proper indexing scheme at run
      * time, so this single kernel covers all layout/transposition combinations.
      */
      template <typename T>
      __global__ void matrix_matrix_upper_solve_kernel(
                const T * A,
                unsigned int A_start1, unsigned int A_start2,
                unsigned int A_inc1,   unsigned int A_inc2,
                unsigned int A_size1,  unsigned int A_size2,
                unsigned int A_internal_size1, unsigned int A_internal_size2,
                bool row_major_A,
                bool transpose_A,

                T * B,
                unsigned int B_start1, unsigned int B_start2,
                unsigned int B_inc1,   unsigned int B_inc2,
                unsigned int B_size1,  unsigned int B_size2,
                unsigned int B_internal_size1, unsigned int B_internal_size2,
                bool row_major_B,
                bool transpose_B,

                bool unit_diagonal)
      {
        T temp;
        T entry_A;

        // Backward substitution: visit rows of op(A) from the last one down to the first.
        for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
        {
          unsigned int row = A_size1 - 1 - row_cnt;

          // Divide by the diagonal entry first (skipped for unit-diagonal matrices).
          // Only thread 0 performs the division; the barriers around it make the
          // update visible to all threads of the block.
          if (!unit_diagonal)
          {
            __syncthreads();

            if (threadIdx.x == 0)
            {
              if (row_major_B && transpose_B)
                B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
              else if (row_major_B && !transpose_B)
                B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
              else if (!row_major_B && transpose_B)
                B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
              else //if (!row_major_B && !transpose_B)
                B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
            }
          }

          __syncthreads();

          // Load the (already divided) solution entry for this row of the block's right-hand side.
          if (row_major_B && transpose_B)
            temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
          else if (row_major_B && !transpose_B)
            temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
          else if (!row_major_B && transpose_B)
            temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
          else //if (!row_major_B && !transpose_B)
            temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];

          // Eliminate the entries of column 'row' of op(A) (indices 0..row-1) in parallel.
          for  (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
          {
            if (row_major_A && transpose_A)
              entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
            else if (row_major_A && !transpose_A)
              entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
            else if (!row_major_A && transpose_A)
              entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
            else //if (!row_major_A && !transpose_A)
              entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];

            if (row_major_B && transpose_B)
              B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
            else if (row_major_B && !transpose_B)
              B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
            else if (!row_major_B && transpose_B)
              B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
            else //if (!row_major_B && !transpose_B)
              B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;

          }
        }
      }
+
+
+
      /** @brief CUDA kernel for an in-place lower triangular solve op(A) \ op(B) with multiple right-hand sides.
      *
      * Each CUDA block handles one right-hand side of B (selected via blockIdx.x);
      * the threads of the block eliminate the entries of one column of op(A) in parallel.
      * Rows are processed from the first to the last (forward substitution).
      * The row_major_* and transpose_* flags select the proper indexing scheme at run
      * time, so this single kernel covers all layout/transposition combinations.
      */
      template <typename T>
      __global__ void matrix_matrix_lower_solve_kernel(
                const T * A,
                unsigned int A_start1, unsigned int A_start2,
                unsigned int A_inc1,   unsigned int A_inc2,
                unsigned int A_size1,  unsigned int A_size2,
                unsigned int A_internal_size1, unsigned int A_internal_size2,
                bool row_major_A,
                bool transpose_A,

                T * B,
                unsigned int B_start1, unsigned int B_start2,
                unsigned int B_inc1,   unsigned int B_inc2,
                unsigned int B_size1,  unsigned int B_size2,
                unsigned int B_internal_size1, unsigned int B_internal_size2,
                bool row_major_B,
                bool transpose_B,

                bool unit_diagonal)
      {
        T temp;
        T entry_A;

        // Forward substitution: visit rows of op(A) from the first to the last.
        for (unsigned int row = 0; row < A_size1; ++row)
        {

          // Divide by the diagonal entry first (skipped for unit-diagonal matrices).
          // Only thread 0 performs the division; the barriers around it make the
          // update visible to all threads of the block.
          if (!unit_diagonal)
          {
            __syncthreads();

            if (threadIdx.x == 0)
            {
              if (row_major_B && transpose_B)
                B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
              else if (row_major_B && !transpose_B)
                B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
              else if (!row_major_B && transpose_B)
                B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
              else //if (!row_major_B && !transpose_B)
                B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
                                                                                                                    : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
            }
          }

          __syncthreads();

          // Load the (already divided) solution entry for this row of the block's right-hand side.
          if (row_major_B && transpose_B)
            temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
          else if (row_major_B && !transpose_B)
            temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
          else if (!row_major_B && transpose_B)
            temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
          else //if (!row_major_B && !transpose_B)
            temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];

          // Eliminate the entries of column 'row' of op(A) (indices row+1..A_size1-1) in parallel.
          for  (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
          {
            if (row_major_A && transpose_A)
              entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
            else if (row_major_A && !transpose_A)
              entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
            else if (!row_major_A && transpose_A)
              entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
            else //if (!row_major_A && !transpose_A)
              entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];

            if (row_major_B && transpose_B)
              B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
            else if (row_major_B && !transpose_B)
              B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
            else if (!row_major_B && transpose_B)
              B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
            else //if (!row_major_B && !transpose_B)
              B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;

          }
        }
      }
+
+
+
+
+
+
      namespace detail
      {
        // Tag dispatch: true only for the unit-diagonal solver tags (generic fallback: false).
        template <typename T>
        bool is_unit_solve(T const & tag) { return false; }

        inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
        inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }

        // Tag dispatch: true only for the upper triangular solver tags (generic fallback: false).
        template <typename T>
        bool is_upper_solve(T const & tag) { return false; }

        inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
        inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }

        /** @brief Launches the upper or lower matrix-matrix triangular solve kernel for op(A) \ op(B).
        *
        * One CUDA block is launched per right-hand side of B (rows of B if transpose_B,
        * columns otherwise); 128 threads per block cooperate on each elimination step.
        * Layout and transposition are forwarded to the kernel as runtime flags.
        */
        template <typename M1, typename M2, typename SolverTag>
        void inplace_solve_impl(M1 const & A, bool transpose_A,
                                M2 & B,       bool transpose_B,
                                SolverTag const & tag)
        {
          typedef typename viennacl::result_of::cpu_value_type<M1>::type        value_type;

          dim3 threads(128);
          dim3 grid( transpose_B ? B.size1() : B.size2() );

          if (is_upper_solve(tag))
          {
            matrix_matrix_upper_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
                                                               static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                               static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                               bool(viennacl::is_row_major<M1>::value),
                                                               transpose_A,

                                                               detail::cuda_arg<value_type>(B),
                                                               static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                               static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                               static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                               static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                               bool(viennacl::is_row_major<M2>::value),
                                                               transpose_B,

                                                               is_unit_solve(tag)
                                                              );
          }
          else
          {
            matrix_matrix_lower_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
                                                               static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
                                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                                               static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
                                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
                                                               bool(viennacl::is_row_major<M1>::value),
                                                               transpose_A,

                                                               detail::cuda_arg<value_type>(B),
                                                               static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                               static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                               static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                               static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
                                                               bool(viennacl::is_row_major<M2>::value),
                                                               transpose_B,

                                                               is_unit_solve(tag)
                                                              );
          }

        }
      }
+
+
+      //
+      // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+      //
+
+      ////////////////// triangular solver //////////////////////////////////////
+      /** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation)
+      *
+      * @param A      The system matrix
+      * @param B      The matrix of row vectors, where the solution is directly written to
+      * @param tag    Solver tag for identifying the respective triangular solver
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F1> & A, matrix_base<NumericT, F2> & B, SOLVERTAG tag)
+      {
+        detail::inplace_solve_impl(A, false,
+                                   B, false, tag);
+      }
+
+      /** @brief Direct inplace solver for triangular systems with multiple transposed right hand sides, i.e. A \ B^T   (MATLAB notation)
+      *
+      * @param A       The system matrix
+      * @param proxy_B The proxy for the transposed matrix of row vectors, where the solution is directly written to
+      * @param tag    Solver tag for identifying the respective triangular solver
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F1> & A,
+                         matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B,
+                         SOLVERTAG tag)
+      {
+        detail::inplace_solve_impl(A, false,
+                                   const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag);
+      }
+
+      //upper triangular solver for transposed lower triangular matrices
+      /** @brief Direct inplace solver for transposed triangular systems with multiple right hand sides, i.e. A^T \ B   (MATLAB notation)
+      *
+      * @param proxy_A  The transposed system matrix proxy
+      * @param B        The matrix holding the load vectors, where the solution is directly written to
+      * @param tag    Solver tag for identifying the respective triangular solver
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                         matrix_base<NumericT, F2> & B,
+                         SOLVERTAG tag)
+      {
+        detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true,
+                                   B, false, tag);
+      }
+
+      /** @brief Direct inplace solver for transposed triangular systems with multiple transposed right hand sides, i.e. A^T \ B^T   (MATLAB notation)
+      *
+      * @param proxy_A    The transposed system matrix proxy
+      * @param proxy_B    The transposed matrix holding the load vectors, where the solution is directly written to
+      * @param tag    Solver tag for identifying the respective triangular solver
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                               matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans>   proxy_B,
+                         SOLVERTAG tag)
+      {
+        detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true,
+                                   const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag);
+      }
+
+
+
+      //
+      //  Solve on vector
+      //
+
+      template <typename T>
+      __global__ void triangular_substitute_inplace_row_kernel(
+                T const * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+                T * v,
+                unsigned int v_start,
+                unsigned int v_inc,
+                unsigned int v_size,
+
+                unsigned int options)
+      {
+        T temp;
+        unsigned int unit_diagonal_flag  = (options & (1 << 0));
+        unsigned int transposed_access_A = (options & (1 << 1));
+        unsigned int is_lower_solve      = (options & (1 << 2));
+        unsigned int row;
+        for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
+        {
+          row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
+          if (!unit_diagonal_flag)
+          {
+            __syncthreads();
+            if (threadIdx.x == 0)
+              v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+          }
+
+          __syncthreads();
+
+          temp = v[row * v_inc + v_start];
+
+          for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
+                  elim < (is_lower_solve ? A_size1 : row);
+                  elim += blockDim.x)
+            v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2))
+                                                                      : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))];
+        }
+      }
+
+
+      template <typename T>
+      __global__ void triangular_substitute_inplace_col_kernel(
+                T const * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+                T * v,
+                unsigned int v_start,
+                unsigned int v_inc,
+                unsigned int v_size,
+                unsigned int options)
+      {
+        T temp;
+        unsigned int unit_diagonal_flag  = (options & (1 << 0));
+        unsigned int transposed_access_A = (options & (1 << 1));
+        unsigned int is_lower_solve      = (options & (1 << 2));
+        unsigned int row;
+        for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
+        {
+          row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
+          if (!unit_diagonal_flag)
+          {
+            __syncthreads();
+            if (threadIdx.x == 0)
+              v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+          }
+
+          __syncthreads();
+
+          temp = v[row * v_inc + v_start];
+
+          for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
+                  elim < (is_lower_solve ? A_size1 : row);
+                  elim += blockDim.x)
+            v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1)
+                                                                      : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)];
+        }
+      }
+
+
      namespace detail
      {
        // Encode the solver tag as the kernel's option bit field:
        // bit 0 = unit diagonal, bit 2 = lower (forward) solve.
        // (Bit 1, transposed access, is added by the transposed-solve frontend.)
        inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
        inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
        inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
        inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }

        /** @brief Launches the triangular substitution kernel for mat \ vec.
        *
        * Dispatches on the storage layout of 'mat' at compile time; a single
        * CUDA block with 128 threads performs the substitution.
        */
        template <typename MatrixType, typename VectorType>
        void inplace_solve_vector_impl(MatrixType const & mat,
                                       VectorType & vec,
                                       unsigned int options)
        {
          typedef typename viennacl::result_of::cpu_value_type<MatrixType>::type        value_type;

          if (viennacl::is_row_major<MatrixType>::value)
          {
            triangular_substitute_inplace_row_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
                                                                 static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                                 static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                                 static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                                 detail::cuda_arg<value_type>(vec),
                                                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                                 options
                                                                );
          }
          else
          {
            triangular_substitute_inplace_col_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
                                                                 static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                                 static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                                 static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                                 detail::cuda_arg<value_type>(vec),
                                                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
                                                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
                                                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
                                                                 options
                                                                );
          }
        }

      }
+
+      /** @brief Direct inplace solver for dense triangular systems (non-transposed version)
+      *
+      * @param mat    The system matrix proxy
+      * @param vec    The load vector, where the solution is directly written to
+      */
+      template <typename NumericT, typename F, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F> & mat,
+                               vector_base<NumericT> & vec,
+                         SOLVERTAG)
+      {
+        unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG());
+
+        detail::inplace_solve_vector_impl(mat, vec, options);
+      }
+
+
+
+
+      /** @brief Direct inplace solver for dense triangular systems (transposed version)
+      *
+      * @param proxy    The system matrix proxy
+      * @param vec    The load vector, where the solution is directly written to
+      */
+      template <typename NumericT, typename F, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & proxy,
+                         vector_base<NumericT> & vec,
+                         SOLVERTAG)
+      {
+        unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG()) | 0x02;  //add transpose-flag
+
+        detail::inplace_solve_vector_impl(proxy.lhs(), vec, options);
+      }
+
+
+
+    }
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/cuda/matrix_operations.hpp b/viennacl/linalg/cuda/matrix_operations.hpp
new file mode 100644
index 0000000..d5109b9
--- /dev/null
+++ b/viennacl/linalg/cuda/matrix_operations.hpp
@@ -0,0 +1,2539 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+      //
+      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+      //
+
+      /** @brief Generic dense matrix assignment with scalar factor: mat1 = mat2 * alpha (or / alpha, and/or with flipped sign, per the flags).
+      *
+      * @param mat1              The result matrix
+      * @param mat2              The source matrix
+      * @param alpha             The scaling factor (host scalar or device scalar)
+      * @param len_alpha         Scalar length, forwarded to detail::make_options()
+      * @param reciprocal_alpha  If true, the kernel is instructed to divide by alpha instead of multiplying
+      * @param flip_sign_alpha   If true, the kernel is instructed to use -alpha
+      */
+      template <typename NumericT, typename F,
+                typename ScalarType1>
+      void am(matrix_base<NumericT, F> & mat1,
+              matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef NumericT        value_type;
+
+        // pack (length, reciprocal, sign-flip) into the option bit field consumed by the kernel
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // host-side scalars are stashed in a temporary; detail::arg_reference(alpha, temporary_alpha)
+        // below presumably selects device handle vs. host value — cf. linalg/cuda/common.hpp
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+        // dispatch on memory layout; both kernels take the identical positional argument list
+        // defined in matrix_operations_row.hpp / matrix_operations_col.hpp — do not reorder!
+        if (viennacl::is_row_major<F>::value)
+        {
+          am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                      static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                      static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                      static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                      static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                      options_alpha,
+                                      detail::cuda_arg<value_type>(mat2),
+                                      static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                      static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                      static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                                    );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
+        }
+        else
+        {
+          am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                      static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                      static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                      static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                      static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                      options_alpha,
+                                      detail::cuda_arg<value_type>(mat2),
+                                      static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                      static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                      static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                                    );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
+        }
+      }
+
+
+      /** @brief Fused two-term matrix operation on (mat2, alpha) and (mat3, beta), e.g. mat1 = alpha*mat2 + beta*beta-term.
+      *
+      * NOTE(review): the exact combination (assignment vs. accumulation, +/-, reciprocal) is
+      * implemented by ambm_row_kernel/ambm_col_kernel and steered by the option flags — the
+      * semantics live in matrix_operations_row.hpp / matrix_operations_col.hpp, not here.
+      *
+      * @param mat1              The result matrix
+      * @param mat2              First source matrix
+      * @param alpha             Scaling factor for mat2 (host or device scalar)
+      * @param len_alpha         Scalar length, forwarded to detail::make_options()
+      * @param reciprocal_alpha  If true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha   If true, use -alpha
+      * @param mat3              Second source matrix
+      * @param beta              Scaling factor for mat3 (host or device scalar)
+      * @param len_beta          Scalar length, forwarded to detail::make_options()
+      * @param reciprocal_beta   If true, divide by beta instead of multiplying
+      * @param flip_sign_beta    If true, use -beta
+      */
+      template <typename NumericT, typename F,
+                typename ScalarType1, typename ScalarType2>
+      void ambm(matrix_base<NumericT, F> & mat1,
+                matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef NumericT        value_type;
+
+        // encode the alpha modifiers as kernel option flags
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // host scalars are copied to a temporary so arg_reference() can hand the kernel a uniform argument
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+
+        // same treatment for beta
+        unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<ScalarType2>::value)
+          temporary_beta = beta;
+
+
+        // layout dispatch; the positional argument order is the contract with the kernels — do not reorder
+        if (viennacl::is_row_major<F>::value)
+        {
+          ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                        static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                        detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                        options_alpha,
+                                        detail::cuda_arg<value_type>(mat2),
+                                        static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                        detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                        options_beta,
+                                        detail::cuda_arg<value_type>(mat3),
+                                        static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                      );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
+        }
+        else
+        {
+          ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                        static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                        detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                        options_alpha,
+                                        detail::cuda_arg<value_type>(mat2),
+                                        static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                        detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                        options_beta,
+                                        detail::cuda_arg<value_type>(mat3),
+                                        static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                      );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
+        }
+
+      }
+
+
+      /** @brief In-place variant of the fused two-term matrix operation (dispatches to ambm_m_row_kernel/ambm_m_col_kernel).
+      *
+      * NOTE(review): presumably mat1 is updated (+=) rather than overwritten — the actual
+      * semantics are defined by the kernels in matrix_operations_row.hpp / matrix_operations_col.hpp.
+      * Parameters are identical to ambm(); see there for the meaning of the scalar modifier flags.
+      */
+      template <typename NumericT, typename F,
+                typename ScalarType1, typename ScalarType2>
+      void ambm_m(matrix_base<NumericT, F> & mat1,
+                  matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef NumericT        value_type;
+
+        // encode the alpha modifiers as kernel option flags
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // host scalars are copied to a temporary so arg_reference() can hand the kernel a uniform argument
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+
+        // same treatment for beta
+        unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<ScalarType2>::value)
+          temporary_beta = beta;
+
+
+        // layout dispatch; the positional argument order is the contract with the kernels — do not reorder
+        if (viennacl::is_row_major<F>::value)
+        {
+          ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                          static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                          static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                          static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                          static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                          detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                          options_alpha,
+                                          detail::cuda_arg<value_type>(mat2),
+                                          static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                          static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                          static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                          detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                          options_beta,
+                                          detail::cuda_arg<value_type>(mat3),
+                                          static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                          static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                          static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                        );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
+        }
+        else
+        {
+          ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                          static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                          static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                          static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                          static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                          detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                          options_alpha,
+                                          detail::cuda_arg<value_type>(mat2),
+                                          static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                          static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                          static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                          detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                          options_beta,
+                                          detail::cuda_arg<value_type>(mat3),
+                                          static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                          static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                          static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                        );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
+        }
+
+      }
+
+
+
+
+      /** @brief Assigns the scalar s to all entries of the matrix
+      *
+      * @param mat    The matrix to be written
+      * @param s      The value to assign to each entry
+      * @param clear  If true, the full internal (padded) buffer is written, not just the
+      *               logical size1 x size2 range — used when clearing the matrix
+      */
+      template <typename NumericT, typename F>
+      void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
+      {
+        typedef NumericT        value_type;
+        value_type alpha = s;
+
+        // explicit casts avoid implicit narrowing from vcl_size_t to unsigned int
+        // (all other size arguments in this file are passed through static_cast as well)
+        unsigned int s1  = clear ? static_cast<unsigned int>(viennacl::traits::internal_size1(mat)) : static_cast<unsigned int>(viennacl::traits::size1(mat));
+        unsigned int s2  = clear ? static_cast<unsigned int>(viennacl::traits::internal_size2(mat)) : static_cast<unsigned int>(viennacl::traits::size2(mat));
+
+        if (viennacl::is_row_major<F>::value)
+        {
+
+          matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                 s1,                                                                 s2,
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                 alpha);
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
+        }
+        else
+        {
+          matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                                  static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                  static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                  s1,                                                                 s2,
+                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                  alpha);
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
+        }
+      }
+
+      /** @brief Assigns the scalar s to the diagonal entries of the matrix
+      *
+      * @param mat    The matrix whose diagonal is written
+      * @param s      The value to assign to each diagonal entry
+      */
+      template <typename NumericT, typename F>
+      void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s)
+      {
+        typedef NumericT        value_type;
+        value_type alpha = s;
+
+        // gather the layout parameters once; both kernels take the same argument list
+        unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start1(mat));
+        unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start2(mat));
+        unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride1(mat));
+        unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride2(mat));
+        unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size1(mat));
+        unsigned int size2  = static_cast<unsigned int>(viennacl::traits::size2(mat));
+        unsigned int internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(mat));
+        unsigned int internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(mat));
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                                          start1, start2,
+                                                          inc1, inc2,
+                                                          size1, size2,
+                                                          internal_size1, internal_size2,
+                                                          alpha);
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
+        }
+        else
+        {
+          matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                                          start1, start2,
+                                                          inc1, inc2,
+                                                          size1, size2,
+                                                          internal_size1, internal_size2,
+                                                          alpha);
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
+        }
+      }
+
+
+      /** @brief Writes the vector vec onto the k-th diagonal of mat (all other entries are set to zero)
+      *
+      * Sign convention for k: k < 0 selects the subdiagonal starting at row -k,
+      * k > 0 the superdiagonal starting at column k, k == 0 the main diagonal.
+      *
+      * @param vec    The source vector holding the diagonal entries
+      * @param k      Diagonal index (see sign convention above)
+      * @param mat    The destination matrix
+      */
+      template <typename NumericT, typename F>
+      void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat)
+      {
+        typedef NumericT        value_type;
+
+        // Step 1: assign zero matrix:
+        matrix_assign(mat, NumericT(0));
+
+        // Step 2: Assign diagonal:
+        unsigned int options_alpha = 0;
+
+        // the diagonal is addressed as a strided 1d range inside the matrix buffer:
+        vcl_size_t mat_start = 0;
+        vcl_size_t mat_stride = 0;
+        vcl_size_t mat_size = viennacl::traits::size(vec);
+        if (viennacl::is_row_major<F>::value)
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          // linear offset of entry (first_row_index, first_col_index) in row-major storage
+          mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                       + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+          // each diagonal step advances one row (stride1 * internal_size2) plus one column (stride2)
+          mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+        }
+        else
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          // linear offset of entry (first_row_index, first_col_index) in column-major storage
+          mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                       + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+          // each diagonal step advances one column (stride2 * internal_size1) plus one row (stride1)
+          mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+        }
+
+        // reuse the strided vector-assignment kernel: diagonal-of-mat = 1 * vec
+        av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                static_cast<unsigned int>(mat_start),
+                                static_cast<unsigned int>(mat_stride),
+                                static_cast<unsigned int>(mat_size),
+
+                                detail::cuda_arg<value_type>(NumericT(1)),
+                                options_alpha,
+                                detail::cuda_arg<value_type>(vec),
+                                static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                static_cast<unsigned int>(viennacl::traits::stride(vec)) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+      }
+
+      /** @brief Extracts the k-th diagonal of mat into the vector vec
+      *
+      * Sign convention for k: k < 0 selects the subdiagonal starting at row -k,
+      * k > 0 the superdiagonal starting at column k, k == 0 the main diagonal.
+      *
+      * @param mat    The source matrix
+      * @param k      Diagonal index (see sign convention above)
+      * @param vec    The destination vector
+      */
+      template <typename NumericT, typename F>
+      void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec)
+      {
+        typedef NumericT        value_type;
+
+        unsigned int options_alpha = 0;
+
+        // the diagonal is addressed as a strided 1d range inside the matrix buffer
+        // (same offset arithmetic as in matrix_diag_from_vector, read direction reversed):
+        vcl_size_t mat_start = 0;
+        vcl_size_t mat_stride = 0;
+        if (viennacl::is_row_major<F>::value)
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          // linear offset of entry (first_row_index, first_col_index) in row-major storage
+          mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                       + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+          // each diagonal step advances one row (stride1 * internal_size2) plus one column (stride2)
+          mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+        }
+        else
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          // linear offset of entry (first_row_index, first_col_index) in column-major storage
+          mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                       + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+          // each diagonal step advances one column (stride2 * internal_size1) plus one row (stride1)
+          mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+        }
+
+        // reuse the strided vector-assignment kernel: vec = 1 * diagonal-of-mat
+        av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
+                                static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                detail::cuda_arg<value_type>(NumericT(1)),
+                                options_alpha,
+                                detail::cuda_arg<value_type>(mat),
+                                static_cast<unsigned int>(mat_start),
+                                static_cast<unsigned int>(mat_stride));
+        VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+      }
+
+      /** @brief Extracts row i of the matrix mat into the vector vec
+      *
+      * @param mat    The source matrix
+      * @param i      The (logical) row index to extract
+      * @param vec    The destination vector
+      */
+      template <typename NumericT, typename F>
+      void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec)
+      {
+        typedef NumericT        value_type;
+
+        unsigned int options_alpha = 0;  // plain copy: no reciprocal, no sign flip
+
+        // offset of entry (i, 0) and the step between consecutive entries of row i
+        vcl_size_t src_start = 0;
+        vcl_size_t src_inc = 0;
+        if (viennacl::is_row_major<F>::value)
+        {
+          src_start = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
+          src_inc   = viennacl::traits::stride2(mat);
+        }
+        else
+        {
+          src_start = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
+          src_inc   = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
+        }
+
+        // reuse the strided vector-assignment kernel: vec = 1 * row-of-mat
+        av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
+                                static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                detail::cuda_arg<value_type>(NumericT(1)),
+                                options_alpha,
+                                detail::cuda_arg<value_type>(mat),
+                                static_cast<unsigned int>(src_start),
+                                static_cast<unsigned int>(src_inc));
+        VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+      }
+
+      /** @brief Extracts column j of the matrix mat into the vector vec
+      *
+      * @param mat    The source matrix
+      * @param j      The (logical) column index to extract
+      * @param vec    The destination vector
+      */
+      template <typename NumericT, typename F>
+      void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec)
+      {
+        typedef NumericT        value_type;
+
+        unsigned int options_alpha = 0;
+
+        // offset of entry (0, j) and the step between consecutive entries of column j.
+        // Traversing a column varies the *row* index, so the step uses stride1
+        // (previously stride2 was used in both branches, which is only correct
+        //  when stride1 == stride2, e.g. for unstrided matrices).
+        vcl_size_t mat_start = 0;
+        vcl_size_t mat_stride = 0;
+        if (viennacl::is_row_major<F>::value)
+        {
+          mat_start  = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
+          mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat);
+        }
+        else
+        {
+          mat_start  = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+          mat_stride = viennacl::traits::stride1(mat);
+        }
+
+        // reuse the strided vector-assignment kernel: vec = 1 * column-of-mat
+        av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
+                                static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                detail::cuda_arg<value_type>(NumericT(1)),
+                                options_alpha,
+                                detail::cuda_arg<value_type>(mat),
+                                static_cast<unsigned int>(mat_start),
+                                static_cast<unsigned int>(mat_stride));
+        VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+      }
+
+
+      //
+      /////////////////////////   binary element-wise operations    /////////////////////////////////
+      //
+
+
+      /** @brief Implementation of the binary element-wise operation A = OP(B, C) for dense matrices (generic/integer fallback).
+      *
+      * op_type encodes the operation for the kernel: 0 = element-wise product, 1 = division, 2 = power (default).
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      * Note: assumes element_op_int_row_kernel / element_op_int_col_kernel are instantiable for T — TODO confirm for non-integral T.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper holding the two operand matrices and the operation tag OP
+      */
+      template <typename T, typename F, typename OP>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+      {
+        typedef T        value_type;
+
+        unsigned int op_type = 2; //0: product, 1: division, 2: power
+        if (viennacl::is_division<OP>::value)
+          op_type = 1;
+        else if (viennacl::is_product<OP>::value)
+          op_type = 0;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Kernel arguments: result A (pointer, then start/stride/size/internal_size per dimension),
+          // followed by both operands (pointer, then start/stride/internal_size per dimension) and op_type.
+          element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+                                              static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                              detail::cuda_arg<value_type>(proxy.rhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                              op_type
+                                            );
+          // Fix: diagnostic label now names the kernel actually launched (was "element_op_row_kernel").
+          VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_row_kernel");
+        }
+        else
+        {
+          element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+                                              static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                              detail::cuda_arg<value_type>(proxy.rhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                              op_type
+                                            );
+          // Fix: diagnostic label now names the kernel actually launched (was "element_op_col_kernel").
+          VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_col_kernel");
+        }
+      }
+
+      /** @brief Overload of the binary element-wise operation A = OP(B, C) for single-precision (float) dense matrices.
+      *
+      * op_type encodes the operation for the kernel: 0 = element-wise product, 1 = division, 2 = power (default).
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper holding the two operand matrices and the operation tag OP
+      */
+      template <typename F, typename OP>
+      void element_op(matrix_base<float, F> & A,
+                      matrix_expression<const matrix_base<float, F>, const matrix_base<float, F>, op_element_binary<OP> > const & proxy)
+      {
+        typedef float        value_type;
+
+        unsigned int op_type = 2; //0: product, 1: division, 2: power
+        if (viennacl::is_division<OP>::value)
+          op_type = 1;
+        else if (viennacl::is_product<OP>::value)
+          op_type = 0;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Kernel arguments: result A (pointer, then start/stride/size/internal_size per dimension),
+          // followed by both operands (pointer, then start/stride/internal_size per dimension) and op_type.
+          element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+                                              static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                              detail::cuda_arg<value_type>(proxy.rhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                              op_type
+                                            );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+        }
+        else
+        {
+          element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+                                              static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                              detail::cuda_arg<value_type>(proxy.rhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                              op_type
+                                            );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+        }
+      }
+
+      /** @brief Overload of the binary element-wise operation A = OP(B, C) for double-precision dense matrices.
+      *
+      * op_type encodes the operation for the kernel: 0 = element-wise product, 1 = division, 2 = power (default).
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper holding the two operand matrices and the operation tag OP
+      */
+      template <typename F, typename OP>
+      void element_op(matrix_base<double, F> & A,
+                      matrix_expression<const matrix_base<double, F>, const matrix_base<double, F>, op_element_binary<OP> > const & proxy)
+      {
+        typedef double        value_type;
+
+        unsigned int op_type = 2; //0: product, 1: division, 2: power
+        if (viennacl::is_division<OP>::value)
+          op_type = 1;
+        else if (viennacl::is_product<OP>::value)
+          op_type = 0;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Kernel arguments: result A (pointer, then start/stride/size/internal_size per dimension),
+          // followed by both operands (pointer, then start/stride/internal_size per dimension) and op_type.
+          element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+                                              static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                              detail::cuda_arg<value_type>(proxy.rhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                              op_type
+                                            );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+        }
+        else
+        {
+          element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+                                              static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                              detail::cuda_arg<value_type>(proxy.rhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                              static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                              op_type
+                                            );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+        }
+      }
+
+      //
+      /////////////////////////   unary element-wise operations    /////////////////////////////////
+      //
+
+      // Note: Due to interference between CUDA and the C preprocessor (token concatenation appears to be broken in at least CUDA 4.2),
+      //       we could not find a more 'automatic' way of generating the overloads below...
+
+      // abs
+      /** @brief Implementation of the unary element-wise operation A = abs(B) for dense matrices.
+      *
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper whose lhs() (== rhs() for unary ops) is the operand matrix
+      */
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_abs> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Arguments: result A (pointer, start/stride/size/internal_size per dimension),
+          // then the operand (pointer, start/stride/internal_size per dimension).
+          matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
+        }
+        else
+        {
+          matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
+        }
+      }
+
+
+      // acos
+      /** @brief Implementation of the unary element-wise operation A = acos(B) for dense matrices.
+      *
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper whose lhs() is the operand matrix
+      */
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_acos> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
+        }
+        else
+        {
+          matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
+        }
+      }
+
+
+      // asin
+      /** @brief Implementation of the unary element-wise operation A = asin(B) for dense matrices.
+      *
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper whose lhs() is the operand matrix
+      */
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_asin> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
+        }
+        else
+        {
+          matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          // Fix: copy-paste error — label previously said "matrix_col_element_sin_kernel"
+          // although the asin kernel is launched; the diagnostic now names the right kernel.
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_asin_kernel");
+        }
+      }
+
+
+      // atan
+      /** @brief Implementation of the unary element-wise operation A = atan(B) for dense matrices.
+      *
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper whose lhs() is the operand matrix
+      */
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_atan> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
+        }
+        else
+        {
+          matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
+        }
+      }
+
+
+      // ceil
+      /** @brief Implementation of the unary element-wise operation A = ceil(B) for dense matrices.
+      *
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper whose lhs() is the operand matrix
+      */
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_ceil> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
+        }
+        else
+        {
+          matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
+        }
+      }
+
+
+      // cos
+      /** @brief Implementation of the unary element-wise operation A = cos(B) for dense matrices.
+      *
+      * Dispatches to a row-major or column-major CUDA kernel depending on the layout tag F.
+      *
+      * @param A      The result matrix (also determines layout and sizes used for the launch)
+      * @param proxy  Expression wrapper whose lhs() is the operand matrix
+      */
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_cos> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
+        }
+        else
+        {
+          matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
+        }
+      }
+
+
+      // cosh
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_cosh> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
+        }
+        else
+        {
+          matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
+        }
+      }
+
+
+      // exp
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_exp> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
+        }
+        else
+        {
+          matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
+        }
+      }
+
+
+      // fabs
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_fabs> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
+        }
+        else
+        {
+          matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
+        }
+      }
+
+
+      // floor
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_floor> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
+        }
+        else
+        {
+          matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
+        }
+      }
+
+
+      // log
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_log> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
+        }
+        else
+        {
+          matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
+        }
+      }
+
+
+      // log10
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_log10> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
+        }
+        else
+        {
+          matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
+        }
+      }
+
+
+      // sin
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sin> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
+        }
+        else
+        {
+          matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+        }
+      }
+
+
+      // sinh
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sinh> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
+        }
+        else
+        {
+          matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
+        }
+      }
+
+
+      // sqrt
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sqrt> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sqrt_kernel");
+        }
+        else
+        {
+          matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sqrt_kernel");
+        }
+      }
+
+
+      // tan
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_tan> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tan_kernel");
+        }
+        else
+        {
+          matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+            detail::cuda_arg<value_type>(proxy.lhs()),
+            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tan_kernel");
+        }
+      }
+
+
+      // tanh
+      template <typename T, typename F>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_tanh> > const & proxy)
+      {
+        typedef T        value_type;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tanh_kernel");
+        }
+        else
+        {
+          matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
+           static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+           static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+           static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+           detail::cuda_arg<value_type>(proxy.lhs()),
+           static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+           static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tanh_kernel");
+        }
+      }
+
+
+      //
+      /////////////////////////   matrix-vector products /////////////////////////////////
+      //
+
+      // A * x
+
+      /** @brief Carries out matrix-vector multiplication
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * Dispatches at compile time on the layout tag F to either the row-major
+      * or the column-major CUDA kernel; both launches use a fixed
+      * <<<128, 128>>> (blocks, threads) configuration.
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template <typename NumericT, typename F>
+      void prod_impl(const matrix_base<NumericT, F> & mat,
+                     const vector_base<NumericT> & vec,
+                           vector_base<NumericT> & result)
+      {
+        typedef NumericT        value_type;
+
+        // In-place products like x = prod(A, x) are not supported: the input
+        // vector and the result must refer to distinct memory handles.
+        assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));
+
+        // NOTE(review): the transposed overload asserts size compatibility of
+        // matrix, vec and result; no such size checks are done here -- confirm
+        // whether that omission is intentional.
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Kernel arguments: matrix pointer plus its view description
+          // (start1/2, stride1/2, size1/2, internal_size1/2), then a
+          // (pointer, start, stride, size) quadruple for each vector.
+          vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+
+                                           detail::cuda_arg<value_type>(vec),
+                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                           detail::cuda_arg<value_type>(result),
+                                           static_cast<unsigned int>(viennacl::traits::start(result)),
+                                           static_cast<unsigned int>(viennacl::traits::stride(result)),
+                                           static_cast<unsigned int>(viennacl::traits::size(result))
+                                          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_row_kernel");
+        }
+        else
+        {
+          // Column-major path: identical argument layout, different kernel.
+          vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
+                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+
+                                           detail::cuda_arg<value_type>(vec),
+                                           static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                           static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                           static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                           detail::cuda_arg<value_type>(result),
+                                           static_cast<unsigned int>(viennacl::traits::start(result)),
+                                           static_cast<unsigned int>(viennacl::traits::stride(result)),
+                                           static_cast<unsigned int>(viennacl::traits::size(result))
+                                          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_col_kernel");
+        }
+      }
+
+
+      // trans(A) * x
+
+      /** @brief Carries out matrix-vector multiplication with a transposed matrix
+      *
+      * Implementation of the convenience expression result = trans(mat) * vec;
+      *
+      * The kernels receive the view parameters of the *untransposed* operand
+      * (mat_trans.lhs()); the transposition itself is handled inside the
+      * trans_vec_mul_{row,col}_kernel implementations. Launch configuration is
+      * a fixed <<<128, 128>>> (blocks, threads).
+      *
+      * @param mat_trans  The transposed matrix proxy
+      * @param vec        The vector
+      * @param result     The result vector
+      */
+      template <typename NumericT, typename F>
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                     const vector_base<NumericT> & vec,
+                           vector_base<NumericT> & result)
+      {
+        // Size checks are formulated on the proxy, i.e. on the dimensions of A^T.
+        assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)"));
+        assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)"));  //remember: mat is transposed!
+
+        typedef NumericT    value_type;
+
+
+        // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
+        assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Kernel arguments: the underlying (untransposed) matrix pointer plus
+          // its view description (start1/2, stride1/2, size1/2,
+          // internal_size1/2), then (pointer, start, stride, size) per vector.
+          trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(mat_trans.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(mat_trans.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(mat_trans.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(mat_trans.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(mat_trans.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(mat_trans.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(mat_trans.lhs())),
+
+                                                 detail::cuda_arg<value_type>(vec),
+                                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                                 detail::cuda_arg<value_type>(result),
+                                                 static_cast<unsigned int>(viennacl::traits::start(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::size(result))
+                                                );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_row_kernel");
+        }
+        else
+        {
+          // Column-major path: identical argument layout, different kernel.
+          trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
+                                                 static_cast<unsigned int>(viennacl::traits::start1(mat_trans.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(mat_trans.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::stride1(mat_trans.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(mat_trans.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::size1(mat_trans.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(mat_trans.lhs())),
+                                                 static_cast<unsigned int>(viennacl::traits::internal_size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(mat_trans.lhs())),
+
+                                                 detail::cuda_arg<value_type>(vec),
+                                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                                                 detail::cuda_arg<value_type>(result),
+                                                 static_cast<unsigned int>(viennacl::traits::start(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride(result)),
+                                                 static_cast<unsigned int>(viennacl::traits::size(result))
+                                                );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_col_kernel");
+        }
+      }
+
+
+      //
+      /////////////////////////   matrix-matrix products /////////////////////////////////
+      //
+
+      namespace detail
+      {
+        // C = A * B and possibly transposed variants
+        template <typename T1, typename T2, typename T3, typename ScalarType >
+        void prod_slow_kernel(const T1 & A, bool transposed_A,
+                              const T2 & B, bool transposed_B,
+                              T3 & C,
+                              ScalarType alpha,
+                              ScalarType beta)
+        {
+          typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
+
+          cpu_value_type converted_alpha = static_cast<cpu_value_type>(alpha);
+          cpu_value_type converted_beta  = static_cast<cpu_value_type>(beta);
+
+          dim3 threads(16, 16);
+          dim3 grid( (viennacl::traits::size1(C) - 1) / 16 + 1,
+                     (viennacl::traits::size2(C) - 1) / 16 + 1);
+
+          bool row_major_A = viennacl::is_row_major<T1>::value;
+          bool row_major_B = viennacl::is_row_major<T2>::value;
+          bool row_major_C = viennacl::is_row_major<T3>::value;
+
+
+          if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          /////////////////////////////////
+
+          else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          /////////////////////////////////
+
+          else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          /////////////////////////////////
+
+          else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          /////////////////////////////////
+
+          else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          /////////////////////////////////
+
+          else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          /////////////////////////////////
+
+          else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+
+
+          /////////////////////////////////
+
+          else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
+          {
+            matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
+          {
+            matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+          else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
+          {
+            matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
+              (converted_alpha,
+                detail::cuda_arg<cpu_value_type>(A),
+                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
+                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                detail::cuda_arg<cpu_value_type>(B),
+                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
+                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
+                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+
+                converted_beta,
+                detail::cuda_arg<cpu_value_type>(C),
+                static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
+                static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
+                static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
+                static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
+          }
+
+        }
+
+        // C = A * B, using fast kernel
+        template <typename T1, typename T2, typename T3, typename ScalarType >
+        void prod_fast_kernel(const T1 & A,
+                              const T2 & B,
+                              T3 & C,
+                              ScalarType alpha,
+                              ScalarType beta,
+                              std::string kernel_name)
+        {
+          typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
+
+          cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
+          cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);
+
+          /*viennacl::ocl::enqueue(k(cl_alpha,
+                                  viennacl::traits::opencl_handle(A),
+                                  cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                                  cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                                  cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                                  cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                                  viennacl::traits::opencl_handle(B),
+                                  cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
+                                  cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
+                                  cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
+                                  cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
+
+                                  cl_beta,
+                                  viennacl::traits::opencl_handle(C),
+                                  cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
+                                  cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
+                                  cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
+                                  cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
+                                  )
+                                );*/
+
+          throw "not implemented yet";
+        }
+
+        template <typename T1, typename T2, typename T3, typename ScalarType >
+        void prod(const T1 & A, bool transposed_A,
+                  const T2 & B, bool transposed_B,
+                  T3 & C,
+                  ScalarType alpha,
+                  ScalarType beta)
+        {
+          if (   (viennacl::traits::size1(A) < 64)
+              || (viennacl::traits::size2(A) < 64)
+              || (viennacl::traits::size1(B) < 64) )   //there is most likely not enough to compute, rendering kernel launch overhead considerable
+          {
+            prod_slow_kernel(A, transposed_A,
+                             B, transposed_B,
+                             C, alpha, beta);
+          }
+          /*else if (   (viennacl::traits::size1(A) % 64 == 0)
+                  && (viennacl::traits::size2(A) % 64 == 0)
+                  && (viennacl::traits::size1(B) % 64 == 0) )   // allows the use of the fast kernel only
+          {
+            prod_fast_kernel(A, B, C, alpha, beta);
+            //prod_slow_kernel(A, B, C, slow_kernel_name);
+          }*/
+          else //TODO: use four kernels
+          {
+            prod_slow_kernel(A, transposed_A,
+                             B, transposed_B,
+                             C, alpha, beta);
+          }
+
+        }
+      } // namespace detail
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(A, B);
+      *
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const matrix_base<NumericT, F1> & A,
+                     const matrix_base<NumericT, F2> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)"));
+        assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(A, B): size2(A) != size1(B)"));
+        assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)"));
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
+
+
+        detail::prod(A, false,
+                     B, false,
+                     C, alpha, beta);
+      }
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(trans(A), B);
+      *
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
+                                                        const matrix_base<NumericT, F1>,
+                                                        op_trans> & A,
+                     const matrix_base<NumericT, F2> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
+        //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
+        assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)"));
+        assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)"));
+        assert( (viennacl::traits::size2(B)       == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)"));
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));
+
+        detail::prod(A.lhs(), true,
+                     B, false,
+                     C, alpha, beta);
+      }
+
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(A, trans(B));
+      *
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const matrix_base<NumericT, F1> & A,
+                     const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        assert( (viennacl::traits::size1(A)       == viennacl::traits::size1(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)"));
+        assert( (viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)"));
+        assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)"));
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        detail::prod(A, false,
+                     B.lhs(), true,
+                     C, alpha, beta);
+      }
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(trans(A), trans(B));
+      *
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
+                     const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                     matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
+        assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
+        assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));
+
+        detail::prod(A.lhs(), true,
+                     B.lhs(), true,
+                     C, alpha, beta);
+      }
+
+
+
+
+      //
+      /////////////////////////   miscellaneous operations /////////////////////////////////
+      //
+
+
+      /** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+      *
+      * Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+      *
+      * @param mat1    The matrix to be updated
+      * @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+      * @param len_alpha        Length of the buffer for an eventual final reduction step (currently always '1')
+      * @param reciprocal_alpha Use 1/alpha instead of alpha
+      * @param flip_sign_alpha  Use -alpha instead of alpha
+      * @param vec1    The first vector
+      * @param vec2    The second vector
+      */
+      template <typename NumericT, typename F, typename S1>
+      void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
+                                S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                                const vector_base<NumericT> & vec1,
+                                const vector_base<NumericT> & vec2)
+      {
+        assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
+        assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
+
+        typedef NumericT        value_type;
+
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<S1>::value)
+          temporary_alpha = alpha;
+
+        if (viennacl::is_row_major<F>::value)
+        {
+          scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                                       static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                                       static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                                       detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                                       options_alpha,
+
+                                                       detail::cuda_arg<value_type>(vec1),
+                                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                                       detail::cuda_arg<value_type>(vec2),
+                                                       static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                                       static_cast<unsigned int>(viennacl::traits::size(vec2))
+                                                     );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_row_kernel");
+        }
+        else
+        {
+          scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
+                                                       static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                                       static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                                       detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                                       options_alpha,
+
+                                                       detail::cuda_arg<value_type>(vec1),
+                                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                                       detail::cuda_arg<value_type>(vec2),
+                                                       static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                                       static_cast<unsigned int>(viennacl::traits::size(vec2))
+                                                      );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_col_kernel");
+        }
+      }
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/matrix_operations_col.hpp b/viennacl/linalg/cuda/matrix_operations_col.hpp
new file mode 100644
index 0000000..3099607
--- /dev/null
+++ b/viennacl/linalg/cuda/matrix_operations_col.hpp
@@ -0,0 +1,1423 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations_col.hpp
+    @brief Implementations of column-major dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+      //
+      // am
+      //
+
+      // alpha on CPU
+      /** @brief CUDA kernel computing A = alpha * B (or A = B / alpha) for column-major sub-matrices;
+       *         the scalar is passed by value from the host.
+       *
+       * options2 bits: bit 0 -> negate the scalar; bit 1 -> divide by it instead of multiplying.
+       * The *_start/*_inc parameters select a range/slice; *_internal_size1 is the padded row count
+       * used as the column-major leading dimension (indexing: row + col * internal_size1).
+       */
+      template <typename T>
+      __global__ void am_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))   // bit 0: flip the scalar's sign
+          alpha = -alpha;
+
+        // Fixed: '/' and '%' were swapped here relative to every sibling kernel in this file.
+        // Rows advance by blockDim.x, so the per-thread row offset must be tid % blockDim.x
+        // (== threadIdx.x); columns advance by gridDim.x, so the column offset is
+        // tid / blockDim.x (== blockIdx.x). The old mapping skipped rows whenever
+        // gridDim.x < blockDim.x and duplicated work otherwise.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        if (options2 & (1 << 1))   // bit 1: divide instead of multiply
+        {
+          for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+            for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+              A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
+        }
+        else
+        {
+          for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+            for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+              A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
+        }
+      }
+
+      // alpha on GPU
+      /** @brief CUDA kernel computing A = alpha * B (or A = B / alpha) for column-major sub-matrices;
+       *         the scalar resides in GPU memory and is dereferenced on the device (fac2 is a pointer).
+       *
+       * options2 bits: bit 0 -> negate the scalar; bit 1 -> divide by it instead of multiplying.
+       * The *_start/*_inc parameters select a range/slice; *_internal_size1 is the padded row count
+       * used as the column-major leading dimension (indexing: row + col * internal_size1).
+       */
+      template <typename T>
+      __global__ void am_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        // rows advance by blockDim.x and columns by gridDim.x, so the per-thread
+        // starting offsets are (tid % blockDim.x, tid / blockDim.x)
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+            for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+              A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
+        }
+        else
+        {
+          for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+            for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+              A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
+        }
+      }
+
+
+      //
+      // ambm
+      //
+
+      // alpha and beta on CPU
+      /** @brief CUDA kernel computing A = alpha * B + beta * C for column-major sub-matrices,
+       *         with per-scalar negate/divide variants; both scalars are passed by value from the host.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply. *_internal_size1 is the column-major leading dimension.
+       */
+      template <typename T>
+      __global__ void ambm_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        // four branch combinations: {multiply, divide}(alpha) x {multiply, divide}(beta)
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+
+      // alpha on CPU, beta on GPU
+      /** @brief CUDA kernel computing A = alpha * B + beta * C for column-major sub-matrices;
+       *         alpha is passed by value from the host, beta is dereferenced from GPU memory.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply. *_internal_size1 is the column-major leading dimension.
+       */
+      template <typename T>
+      __global__ void ambm_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        // four branch combinations: {multiply, divide}(alpha) x {multiply, divide}(beta)
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+      // alpha on GPU, beta on CPU
+      /** @brief CUDA kernel computing A = alpha * B + beta * C for column-major sub-matrices;
+       *         alpha is dereferenced from GPU memory, beta is passed by value from the host.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply. *_internal_size1 is the column-major leading dimension.
+       */
+      template <typename T>
+      __global__ void ambm_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        // four branch combinations: {multiply, divide}(alpha) x {multiply, divide}(beta)
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+
+      // alpha and beta on GPU
+      /** @brief CUDA kernel computing A = alpha * B + beta * C for column-major sub-matrices;
+       *         both scalars reside in GPU memory and are dereferenced on the device.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply. *_internal_size1 is the column-major leading dimension.
+       */
+      template <typename T>
+      __global__ void ambm_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        // four branch combinations: {multiply, divide}(alpha) x {multiply, divide}(beta)
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+              = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+
+      //
+      // ambm_m
+      //
+
+      // alpha and beta on CPU
+      /** @brief CUDA kernel computing the accumulating update A += alpha * B + beta * C for
+       *         column-major sub-matrices; both scalars are passed by value from the host.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply. Note the '+=' — unlike ambm_col_kernel, this variant adds to A.
+       */
+      template <typename T>
+      __global__ void ambm_m_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        // four branch combinations: {multiply, divide}(alpha) x {multiply, divide}(beta)
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+
+      // alpha on CPU, beta on GPU
+      /** @brief CUDA kernel computing the accumulating update A += alpha * B + beta * C for
+       *         column-major sub-matrices; alpha passed by value from the host, beta read from GPU memory.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply.
+       *
+       * Fixed: all four branches used '=' instead of '+='. ambm_m is the accumulating variant
+       * (see the CPU/CPU overload above), so plain assignment silently discarded A's contents.
+       */
+      template <typename T>
+      __global__ void ambm_m_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+      // alpha on GPU, beta on CPU
+      /** @brief CUDA kernel computing the accumulating update A += alpha * B + beta * C for
+       *         column-major sub-matrices; alpha read from GPU memory, beta passed by value from the host.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply.
+       *
+       * Fixed: all four branches used '=' instead of '+='. ambm_m is the accumulating variant
+       * (see the CPU/CPU overload above), so plain assignment silently discarded A's contents.
+       */
+      template <typename T>
+      __global__ void ambm_m_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+
+      // alpha and beta on GPU
+      /** @brief CUDA kernel computing the accumulating update A += alpha * B + beta * C for
+       *         column-major sub-matrices; both scalars reside in GPU memory.
+       *
+       * options2/options3 bits (for alpha/beta respectively): bit 0 -> negate; bit 1 -> divide
+       * instead of multiply.
+       *
+       * Fixed: all four branches used '=' instead of '+='. ambm_m is the accumulating variant
+       * (see the CPU/CPU overload above), so plain assignment silently discarded A's contents.
+       */
+      template <typename T>
+      __global__ void ambm_m_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        // per-thread start offsets: rows stride by blockDim.x, columns by gridDim.x
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+          }
+          else
+          {
+            for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+              for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+                A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+             += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+              + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+          }
+        }
+      }
+
+
+
+      //
+      // assignments
+      //
+
+      // Fills the (sub)matrix of the column-major matrix A, addressed via the
+      // (start, inc, internal_size) triples, with the scalar value alpha.
+      template <typename T>
+      __global__ void matrix_col_assign_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+                T alpha)
+      {
+        unsigned int global_id = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int row_start = global_id % blockDim.x;   // == threadIdx.x
+        unsigned int col_start = global_id / blockDim.x;   // == blockIdx.x
+
+        // One block per column slice, one thread per row slice.
+        for (unsigned int j = col_start; j < A_size2; j += gridDim.x)
+        {
+          unsigned int col_offset = (j * A_inc2 + A_start2) * A_internal_size1;
+          for (unsigned int i = row_start; i < A_size1; i += blockDim.x)
+            A[col_offset + (i * A_inc1 + A_start1)] = alpha;
+        }
+      }
+
+
+      // Writes the scalar alpha to every diagonal entry of the (sub)matrix of
+      // the column-major matrix A (entries with equal row and column index).
+      template <typename T>
+      __global__ void matrix_col_diagonal_assign_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+                T alpha)
+      {
+        unsigned int idx    = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int stride = blockDim.x * gridDim.x;
+
+        for (unsigned int d = idx; d < A_size1; d += stride)
+          A[(d * A_inc1 + A_start1) + (d * A_inc2 + A_start2) * A_internal_size1] = alpha;
+      }
+
+      //
+      // binary element-wise operations
+      //
+
+      // Elementwise A = B op C on column-major (sub)matrices, where op is
+      // selected at runtime: 0 -> product, 1 -> division, 2 -> pow.
+      // Any other op_type value leaves A untouched.
+      template <typename T>
+      __global__ void element_op_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+                unsigned int op_type) //0: product, 1: division, 2: pow
+      {
+        unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int row0 = gid % blockDim.x;   // == threadIdx.x
+        unsigned int col0 = gid / blockDim.x;   // == blockIdx.x
+
+        switch (op_type)
+        {
+        case 0: // elementwise product
+          for (unsigned int j = col0; j < A_size2; j += gridDim.x)
+            for (unsigned int i = row0; i < A_size1; i += blockDim.x)
+              A[(i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1]
+                = B[(i * B_inc1 + B_start1) + (j * B_inc2 + B_start2) * B_internal_size1]
+                * C[(i * C_inc1 + C_start1) + (j * C_inc2 + C_start2) * C_internal_size1];
+          break;
+
+        case 1: // elementwise division
+          for (unsigned int j = col0; j < A_size2; j += gridDim.x)
+            for (unsigned int i = row0; i < A_size1; i += blockDim.x)
+              A[(i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1]
+                = B[(i * B_inc1 + B_start1) + (j * B_inc2 + B_start2) * B_internal_size1]
+                / C[(i * C_inc1 + C_start1) + (j * C_inc2 + C_start2) * C_internal_size1];
+          break;
+
+        case 2: // elementwise power
+          for (unsigned int j = col0; j < A_size2; j += gridDim.x)
+            for (unsigned int i = row0; i < A_size1; i += blockDim.x)
+              A[(i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1]
+                = pow(B[(i * B_inc1 + B_start1) + (j * B_inc2 + B_start2) * B_internal_size1],
+                      C[(i * C_inc1 + C_start1) + (j * C_inc2 + C_start2) * C_internal_size1]);
+          break;
+
+        default:
+          break;
+        }
+      }
+
+      // Integer variant of element_op_col_kernel: elementwise A = B op C on
+      // column-major (sub)matrices. op_type 0 -> product, 1 -> division; the
+      // pow case (2) is intentionally absent for integer types, so any other
+      // op_type value leaves A untouched.
+      template <typename T>
+      __global__ void element_op_int_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+                unsigned int op_type) //0: product, 1: division, 2: pow
+      {
+        unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int row0 = gid % blockDim.x;   // == threadIdx.x
+        unsigned int col0 = gid / blockDim.x;   // == blockIdx.x
+
+        switch (op_type)
+        {
+        case 0: // elementwise product
+          for (unsigned int j = col0; j < A_size2; j += gridDim.x)
+            for (unsigned int i = row0; i < A_size1; i += blockDim.x)
+              A[(i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1]
+                = B[(i * B_inc1 + B_start1) + (j * B_inc2 + B_start2) * B_internal_size1]
+                * C[(i * C_inc1 + C_start1) + (j * C_inc2 + C_start2) * C_internal_size1];
+          break;
+
+        case 1: // elementwise division
+          for (unsigned int j = col0; j < A_size2; j += gridDim.x)
+            for (unsigned int i = row0; i < A_size1; i += blockDim.x)
+              A[(i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1]
+                = B[(i * B_inc1 + B_start1) + (j * B_inc2 + B_start2) * B_internal_size1]
+                / C[(i * C_inc1 + C_start1) + (j * C_inc2 + C_start2) * C_internal_size1];
+          break;
+
+        default:
+          break;
+        }
+      }
+
+
+      //
+      // unary element-wise operations
+      //
+
+      // abs
+      // A = abs(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_abs_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: rows are strided by blockDim.x and columns by gridDim.x, so rows must
+        // start at threadIdx.x ('%') and columns at blockIdx.x ('/'), as in the
+        // element_op/assign kernels above. The previous swapped '/'/'%' covered the
+        // matrix correctly only when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // acos
+      // A = acos(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_acos_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // asin
+      // A = asin(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_asin_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // atan
+      // A = atan(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_atan_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // ceil
+      // A = ceil(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_ceil_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // cos
+      // A = cos(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_cos_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // cosh
+      // A = cosh(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_cosh_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // exp
+      // A = exp(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_exp_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // fabs
+      // A = fabs(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_fabs_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // floor
+      // A = floor(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_floor_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // log
+      // A = log(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_log_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // log10
+      // A = log10(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_log10_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // sin
+      // A = sin(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_sin_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // sinh
+      // A = sinh(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_sinh_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // sqrt
+      // A = sqrt(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_sqrt_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // tan
+      // A = tan(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_tan_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+      // tanh
+      // A = tanh(B), elementwise, on column-major (sub)matrices addressed via
+      // (start, inc, internal_size) triples.
+      template <typename T>
+      __global__ void matrix_col_element_tanh_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        // Fix: start rows at threadIdx.x ('%') and columns at blockIdx.x ('/') to match
+        // the row/col loop strides (blockDim.x / gridDim.x); the previous swapped
+        // '/'/'%' was only correct when gridDim.x == blockDim.x.
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+        for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+          for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+            A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+      }
+
+
+
+      //
+      // matrix-vector product
+      //
+
+      // result = A * v for a column-major (sub)matrix A: a grid-stride loop over
+      // the rows, each thread accumulating the full dot product of its row with v.
+      template <typename T>
+      __global__ void vec_mul_col_kernel(
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * v,
+                unsigned int v_start,
+                unsigned int v_inc,
+                unsigned int v_size,
+                T * result,
+                unsigned int result_start,
+                unsigned int result_inc,
+                unsigned int result_size)
+      {
+        unsigned int stride = gridDim.x * blockDim.x;
+
+        for (unsigned int r = blockIdx.x * blockDim.x + threadIdx.x; r < A_row_size; r += stride)
+        {
+          T acc = 0;
+          unsigned int row_base = r * A_row_inc + A_row_start;
+          for (unsigned int c = 0; c < A_col_size; ++c)
+            acc += A[row_base + (c * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * c];
+          result[r * result_inc + result_start] = acc;
+        }
+      }
+
+
+      // result = A^T * v for a column-major (sub)matrix A: one block per result
+      // entry (i.e. per column of A), with the threads of the block accumulating
+      // partial dot products that are combined via a shared-memory tree reduction.
+      template <typename T>
+      __global__ void trans_vec_mul_col_kernel(
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * v,
+                unsigned int v_start,
+                unsigned int v_inc,
+                unsigned int v_size,
+                T * result,
+                unsigned int result_start,
+                unsigned int result_inc,
+                unsigned int result_size)
+      {
+        // Scratch space for the block-level reduction.
+        // NOTE(review): fixed capacity of 128 entries and the halving reduction
+        // below assume blockDim.x is a power of two and at most 128 — confirm
+        // against the launch configuration at the call site.
+        __shared__ T work[128];
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
+        unsigned int lid = threadIdx.x;
+
+        // 'row' indexes the result entry, i.e. a column of A (A^T row).
+        for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
+        {
+          // Each thread accumulates a strided slice of the dot product.
+          T dot_prod = 0;
+          for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
+            dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
+          work[lid] = dot_prod;
+
+          // Tree reduction over the block; __syncthreads() before each step also
+          // separates iterations of the outer loop (work[] is reused per row).
+          for(unsigned int stride = blockDim.x/2 ; stride>0 ; stride>>=1){
+            __syncthreads();
+            if(lid < stride)
+              work[lid] += work[lid+stride];
+          }
+
+          // Thread 0 holds the complete dot product.
+          if(lid == 0)
+            result[row * result_inc + result_start] = work[0];
+        }
+      }
+
+
+      //
+      // matrix-matrix products
+      //
+
+
+
+
+      //
+      // scaled rank-1-update
+      //
+
+      // alpha on CPU
+      // A += alpha * vec1 * vec2^T (scaled outer product) on a column-major
+      // (sub)matrix. options2: bit 0 negates alpha, bit 1 replaces it by its
+      // reciprocal (applied in that order).
+      template <typename T>
+      __global__ void scaled_rank1_update_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T val,
+                unsigned int options2,
+
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+
+                const T * vec2,
+                unsigned int start2,
+                unsigned int inc2,
+                unsigned int size2)
+      {
+        T a = val;
+        if (options2 & (1 << 0))
+          a = -a;
+        if (options2 & (1 << 1))
+          a = ((T)(1)) / a;
+
+        unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int r0  = gid / blockDim.x;   // == blockIdx.x
+        unsigned int c0  = gid % blockDim.x;   // == threadIdx.x
+
+        // One block per row slice; threads of the block sweep the columns.
+        for (unsigned int r = r0; r < A_size1; r += gridDim.x)
+        {
+          T scaled_v1 = a * vec1[r * inc1 + start1];
+          for (unsigned int c = c0; c < A_size2; c += blockDim.x)
+            A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] += scaled_v1 * vec2[c * inc2 + start2];
+        }
+      }
+
+
+      // alpha on GPU
+      // Same as the kernel above, but the scaling factor is read from device
+      // memory (*val). options2: bit 0 negates alpha, bit 1 replaces it by its
+      // reciprocal (applied in that order).
+      template <typename T>
+      __global__ void scaled_rank1_update_col_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * val,
+                unsigned int options2,
+
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+
+                const T * vec2,
+                unsigned int start2,
+                unsigned int inc2,
+                unsigned int size2)
+      {
+        T a = *val;
+        if (options2 & (1 << 0))
+          a = -a;
+        if (options2 & (1 << 1))
+          a = ((T)(1)) / a;
+
+        unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int r0  = gid / blockDim.x;   // == blockIdx.x
+        unsigned int c0  = gid % blockDim.x;   // == threadIdx.x
+
+        // One block per row slice; threads of the block sweep the columns.
+        for (unsigned int r = r0; r < A_size1; r += gridDim.x)
+        {
+          T scaled_v1 = a * vec1[r * inc1 + start1];
+          for (unsigned int c = c0; c < A_size2; c += blockDim.x)
+            A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] += scaled_v1 * vec2[c * inc2 + start2];
+        }
+      }
+
+
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/matrix_operations_prod.hpp b/viennacl/linalg/cuda/matrix_operations_prod.hpp
new file mode 100644
index 0000000..a33e7c2
--- /dev/null
+++ b/viennacl/linalg/cuda/matrix_operations_prod.hpp
@@ -0,0 +1,2886 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations_prod.hpp
+    @brief Dense matrix-matrix product CUDA kernels reside here.
+
+    Note: File created semi-automatically from OpenCL kernels.
+*/
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...col_major, A...col_major, B...col_major
+      /** @brief Computes C = alpha * A * B + beta * C for column-major C, A, and B,
+      *          using a blocked algorithm with 16x16 operand tiles staged in shared memory.
+      *
+      *  Assumes a 16x16 thread block (blockDim.x == blockDim.y == 16) and a 2D grid
+      *  covering C in 16x16 output tiles (blockIdx.x: row tile, blockIdx.y: column tile).
+      *  Each operand's (row_start, col_start, row_inc, col_inc, row_size, col_size,
+      *  internal_rows, internal_cols) tuple describes a strided submatrix view inside a
+      *  padded column-major buffer; element (i,j) of the view sits at linear offset
+      *  (i*row_inc + row_start) + (j*col_inc + col_start) * internal_rows.
+      */
+      template <typename T>
+      __global__ void matrix_matrix_col_col_col_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // One 16x16 tile per operand, stored with a row stride of 17 (16 + 1 padding
+        // element, presumably to avoid shared-memory bank conflicts): 16 * 17 = 272.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // Tile edge length, fixed to 16; the trailing comment is a relic of the
+        // semi-automatic port from the OpenCL kernel (see file header).
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin: offset of A(row_block_id*16, 0) in view coordinates; aStep moves the
+        // A tile 16 columns to the right.
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        // bBegin: offset of B(0, col_block_id*16); bStep moves the B tile 16 rows down.
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // Number of 16-wide tiles along the reduction dimension (columns of A == rows of B).
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread element offsets inside the current tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // Precomputed shared-memory row bases (stride 17, see padding note above).
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of each operand; out-of-range elements are padded with 0.
+          // bufA keeps rows of the A tile contiguous, bufB keeps columns of the B tile
+          // contiguous, so the unrolled inner product below reads both at unit stride.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          // Wait until both tiles are fully staged.
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled 16-term inner product over the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Ensure all threads are done reading before the tiles are overwritten.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Write the result (column-major C index), guarded against partial tiles at the
+        // matrix border. The beta == 0 branch avoids reading C, so C may be uninitialized
+        // in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...col_major, A...col_major, B...col_major
+      /** @brief Computes C = alpha * A * B^T + beta * C for column-major C, A, and B,
+      *          using a blocked algorithm with 16x16 operand tiles staged in shared memory.
+      *
+      *  Assumes a 16x16 thread block (blockDim.x == blockDim.y == 16) and a 2D grid
+      *  covering C in 16x16 output tiles (blockIdx.x: row tile, blockIdx.y: column tile;
+      *  the latter selects rows of B, since B enters transposed). Each operand's
+      *  start/inc/size/internal_rows parameters describe a strided submatrix view inside
+      *  a padded column-major buffer.
+      */
+      template <typename T>
+      __global__ void matrix_matrix_col_col_col_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // One 16x16 tile per operand, stored with a row stride of 17 (16 + 1 padding
+        // element, presumably to avoid shared-memory bank conflicts): 16 * 17 = 272.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // Tile edge length, fixed to 16; the trailing comment is a relic of the
+        // semi-automatic port from the OpenCL kernel (see file header).
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin: offset of A(row_block_id*16, 0); aStep moves the A tile 16 columns right.
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        // bBegin: offset of B(col_block_id*16, 0); bStep moves the B tile 16 columns right
+        // (walking along a row of B == a column of B^T).
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // Number of 16-wide tiles along the reduction dimension (columns of A == columns of B).
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread element offsets inside the current tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // Precomputed shared-memory row bases (stride 17, see padding note above).
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of each operand; out-of-range elements are padded with 0.
+          // Both tiles are stored with their rows contiguous (a row of B is a column of
+          // B^T), so the unrolled inner product below reads both at unit stride.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          // Wait until both tiles are fully staged.
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled 16-term inner product over the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Ensure all threads are done reading before the tiles are overwritten.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Write the result (column-major C index); the result extent is
+        // A_row_size x B_row_size because B enters transposed. The beta == 0 branch
+        // avoids reading C, so C may be uninitialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...col_major, A...col_major, B...col_major
+      /** @brief Computes C = alpha * A^T * B + beta * C for column-major C, A, and B,
+      *          using a blocked algorithm with 16x16 operand tiles staged in shared memory.
+      *
+      *  Assumes a 16x16 thread block (blockDim.x == blockDim.y == 16) and a 2D grid
+      *  covering C in 16x16 output tiles (blockIdx.x selects columns of A, since A
+      *  enters transposed; blockIdx.y selects columns of B). Each operand's
+      *  start/inc/size/internal_rows parameters describe a strided submatrix view inside
+      *  a padded column-major buffer.
+      */
+      template <typename T>
+      __global__ void matrix_matrix_col_col_col_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // One 16x16 tile per operand, stored with a row stride of 17 (16 + 1 padding
+        // element, presumably to avoid shared-memory bank conflicts): 16 * 17 = 272.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // Tile edge length, fixed to 16; the trailing comment is a relic of the
+        // semi-automatic port from the OpenCL kernel (see file header).
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin: offset of A(0, row_block_id*16); aStep moves the A tile 16 rows down
+        // (walking along a column of A == a row of A^T).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        // bBegin: offset of B(0, col_block_id*16); bStep moves the B tile 16 rows down.
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // Number of 16-wide tiles along the reduction dimension (rows of A == rows of B).
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread element offsets inside the current tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // Precomputed shared-memory row bases (stride 17, see padding note above).
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of each operand; out-of-range elements are padded with 0.
+          // Both tiles keep their columns contiguous (a column of A is a row of A^T),
+          // so the unrolled inner product below reads both at unit stride.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          // Wait until both tiles are fully staged.
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled 16-term inner product over the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Ensure all threads are done reading before the tiles are overwritten.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Write the result (column-major C index); the result extent is
+        // A_col_size x B_col_size because A enters transposed. The beta == 0 branch
+        // avoids reading C, so C may be uninitialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...col_major, A...col_major, B...col_major
+      /** @brief Computes C = alpha * A^T * B^T + beta * C for column-major C, A, and B,
+      *          using a blocked algorithm with 16x16 operand tiles staged in shared memory.
+      *
+      *  Assumes a 16x16 thread block (blockDim.x == blockDim.y == 16) and a 2D grid
+      *  covering C in 16x16 output tiles (blockIdx.x selects columns of A, blockIdx.y
+      *  selects rows of B, since both operands enter transposed). Each operand's
+      *  start/inc/size/internal_rows parameters describe a strided submatrix view inside
+      *  a padded column-major buffer.
+      */
+      template <typename T>
+      __global__ void matrix_matrix_col_col_col_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // One 16x16 tile per operand, stored with a row stride of 17 (16 + 1 padding
+        // element, presumably to avoid shared-memory bank conflicts): 16 * 17 = 272.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // Tile edge length, fixed to 16; the trailing comment is a relic of the
+        // semi-automatic port from the OpenCL kernel (see file header).
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin: offset of A(0, row_block_id*16); aStep moves the A tile 16 rows down
+        // (walking along a column of A == a row of A^T).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        // bBegin: offset of B(col_block_id*16, 0); bStep moves the B tile 16 columns
+        // right (walking along a row of B == a column of B^T).
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // Number of 16-wide tiles along the reduction dimension (rows of A == columns of B).
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread element offsets inside the current tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // Precomputed shared-memory row bases (stride 17, see padding note above).
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of each operand; out-of-range elements are padded with 0.
+          // bufA holds the A tile transposed, bufB holds the B tile row-contiguous,
+          // so the unrolled inner product below reads both at unit stride.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          // Wait until both tiles are fully staged.
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled 16-term inner product over the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Ensure all threads are done reading before the tiles are overwritten.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Write the result (column-major C index); the result extent is
+        // A_col_size x B_row_size because both operands enter transposed. The beta == 0
+        // branch avoids reading C, so C may be uninitialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...row_major, A...col_major, B...col_major
+      /** @brief Computes C = alpha * A * B + beta * C for row-major C and column-major
+      *          A and B, using a blocked algorithm with 16x16 tiles staged in shared memory.
+      *
+      *  Identical to the all-column-major AA variant except for the final store: C is
+      *  row-major, so element (i,j) of the C view sits at
+      *  (i*C_row_inc + C_row_start) * C_internal_cols + j*C_col_inc + C_col_start.
+      *  Assumes a 16x16 thread block (blockDim.x == blockDim.y == 16) and a 2D grid
+      *  covering C in 16x16 output tiles.
+      */
+      template <typename T>
+      __global__ void matrix_matrix_row_col_col_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // One 16x16 tile per operand, stored with a row stride of 17 (16 + 1 padding
+        // element, presumably to avoid shared-memory bank conflicts): 16 * 17 = 272.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // Tile edge length, fixed to 16; the trailing comment is a relic of the
+        // semi-automatic port from the OpenCL kernel (see file header).
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin: offset of A(row_block_id*16, 0); aStep moves the A tile 16 columns right.
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        // bBegin: offset of B(0, col_block_id*16); bStep moves the B tile 16 rows down.
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // Number of 16-wide tiles along the reduction dimension (columns of A == rows of B).
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread element offsets inside the current tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // Precomputed shared-memory row bases (stride 17, see padding note above).
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of each operand; out-of-range elements are padded with 0.
+          // bufA keeps rows of the A tile contiguous, bufB keeps columns of the B tile
+          // contiguous, so the unrolled inner product below reads both at unit stride.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          // Wait until both tiles are fully staged.
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled 16-term inner product over the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Ensure all threads are done reading before the tiles are overwritten.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Write the result using the row-major C index, guarded against partial tiles
+        // at the matrix border. The beta == 0 branch avoids reading C, so C may be
+        // uninitialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...row_major, A...col_major, B...col_major
+      /** @brief Computes C = alpha * A * B^T + beta * C for row-major C and column-major
+      *          A and B, using a blocked algorithm with 16x16 tiles staged in shared memory.
+      *
+      *  Identical to the all-column-major AT variant except for the final store: C is
+      *  row-major, so element (i,j) of the C view sits at
+      *  (i*C_row_inc + C_row_start) * C_internal_cols + j*C_col_inc + C_col_start.
+      *  Assumes a 16x16 thread block (blockDim.x == blockDim.y == 16) and a 2D grid
+      *  covering C in 16x16 output tiles (blockIdx.y selects rows of B, since B enters
+      *  transposed).
+      */
+      template <typename T>
+      __global__ void matrix_matrix_row_col_col_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // One 16x16 tile per operand, stored with a row stride of 17 (16 + 1 padding
+        // element, presumably to avoid shared-memory bank conflicts): 16 * 17 = 272.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // Tile edge length, fixed to 16; the trailing comment is a relic of the
+        // semi-automatic port from the OpenCL kernel (see file header).
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin: offset of A(row_block_id*16, 0); aStep moves the A tile 16 columns right.
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        // bBegin: offset of B(col_block_id*16, 0); bStep moves the B tile 16 columns
+        // right (walking along a row of B == a column of B^T).
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // Number of 16-wide tiles along the reduction dimension (columns of A == columns of B).
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread element offsets inside the current tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // Precomputed shared-memory row bases (stride 17, see padding note above).
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of each operand; out-of-range elements are padded with 0.
+          // Both tiles are stored with their rows contiguous (a row of B is a column of
+          // B^T), so the unrolled inner product below reads both at unit stride.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          // Wait until both tiles are fully staged.
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled 16-term inner product over the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Ensure all threads are done reading before the tiles are overwritten.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Write the result using the row-major C index; the result extent is
+        // A_row_size x B_row_size because B enters transposed. The beta == 0 branch
+        // avoids reading C, so C may be uninitialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...row_major, A...col_major, B...col_major
+      //
+      // Tiled GEMM: computes C = alpha * A^T * B + beta * C on strided
+      // sub-matrices; each operand is described by its (row/col)_start,
+      // (row/col)_inc, (row/col)_size and internal_(rows/cols) of the padded
+      // backing storage. Each thread block stages 16x16 tiles of A and B
+      // through shared memory and each thread accumulates one element of C.
+      // Assumes a 16x16 launch configuration (block_size is hard-coded to 16
+      // below) -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_col_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles are stored with a row stride of block_size + 1
+        // (see *_times_block_size below), presumably to avoid shared-memory
+        // bank conflicts -- TODO(review) confirm.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // aBegin/bBegin: linear offset of this block's first tile (col-major
+        // addressing: column index is scaled by internal_rows); aStep/bStep
+        // advance one 16-wide tile along the reduction dimension (A's rows,
+        // since A enters transposed).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // number of 16-wide tiles needed to cover the reduction dimension
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // each thread loads one element per tile; out-of-range elements are
+          // zero-padded so partial edge tiles contribute nothing to the sum
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // second barrier: every thread must finish with the tiles before
+          // they are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back, guarded against threads outside the result extent
+        // (A_col_size x B_col_size for A^T * B); beta == 0 is special-cased so
+        // C is never read when it is write-only
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...row_major, A...col_major, B...col_major
+      //
+      // Tiled GEMM: computes C = alpha * A^T * B^T + beta * C on strided
+      // sub-matrices (start/inc/size/internal_* describe each operand's
+      // window into padded storage). Each thread block stages 16x16 tiles of
+      // A and B in shared memory; each thread accumulates one C element.
+      // Assumes a 16x16 launch configuration (block_size hard-coded below)
+      // -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_col_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles stored with row stride block_size + 1,
+        // presumably to avoid shared-memory bank conflicts -- TODO(review)
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // offsets of this block's first tile (col-major addressing); the
+        // steps advance one 16-wide tile along the reduction dimension
+        // (A's rows / B's columns, since both enter transposed)
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // number of 16-wide tiles covering the reduction dimension
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one element per thread; note bufB is written transposed
+          // relative to bufA (row/col indices swapped). Out-of-range loads
+          // are zero-padded so partial edge tiles contribute nothing.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // barrier before the tiles are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back, guarded to the result extent (A_col_size x B_row_size
+        // for A^T * B^T); beta == 0 is special-cased so C is never read
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...col_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: computes C = alpha * A * B + beta * C on strided
+      // sub-matrices (start/inc/size/internal_* describe each operand's
+      // window into padded storage). Each thread block stages 16x16 tiles of
+      // A and B in shared memory; each thread accumulates one C element.
+      // Assumes a 16x16 launch configuration (block_size hard-coded below)
+      // -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_col_col_row_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles stored with row stride block_size + 1,
+        // presumably to avoid shared-memory bank conflicts -- TODO(review)
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // offsets of this block's first tile (A col-major, B row-major
+        // addressing); the steps advance one 16-wide tile along the
+        // reduction dimension (A's columns == B's rows)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // number of 16-wide tiles covering the reduction dimension
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one element per thread; out-of-range loads are zero-padded
+          // so partial edge tiles contribute nothing to the sum
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // barrier before the tiles are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back to col-major C (row index + column index * internal_rows),
+        // guarded to the A_row_size x B_col_size result extent; beta == 0 is
+        // special-cased so C is never read when it is write-only
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...col_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: computes C = alpha * A * B^T + beta * C on strided
+      // sub-matrices (start/inc/size/internal_* describe each operand's
+      // window into padded storage). Each thread block stages 16x16 tiles of
+      // A and B in shared memory; each thread accumulates one C element.
+      // Assumes a 16x16 launch configuration (block_size hard-coded below)
+      // -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_row_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles stored with row stride block_size + 1,
+        // presumably to avoid shared-memory bank conflicts -- TODO(review)
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // offsets of this block's first tile (A col-major, B row-major
+        // addressing; B enters transposed); the steps advance one 16-wide
+        // tile along the reduction dimension (A's columns == B's columns)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // number of 16-wide tiles covering the reduction dimension
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one element per thread; note bufB is written transposed
+          // relative to bufA (row/col indices swapped). Out-of-range loads
+          // are zero-padded so partial edge tiles contribute nothing.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // barrier before the tiles are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back to col-major C, guarded to the A_row_size x B_row_size
+        // result extent (B enters transposed); beta == 0 is special-cased so
+        // C is never read when it is write-only
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...col_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: computes C = alpha * A^T * B + beta * C on strided
+      // sub-matrices (start/inc/size/internal_* describe each operand's
+      // window into padded storage). Each thread block stages 16x16 tiles of
+      // A and B in shared memory; each thread accumulates one C element.
+      // Assumes a 16x16 launch configuration (block_size hard-coded below)
+      // -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_col_col_row_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles stored with row stride block_size + 1,
+        // presumably to avoid shared-memory bank conflicts -- TODO(review)
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // offsets of this block's first tile (A col-major and transposed,
+        // B row-major); the steps advance one 16-wide tile along the
+        // reduction dimension (A's rows == B's rows)
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // number of 16-wide tiles covering the reduction dimension
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one element per thread; note bufA is written transposed
+          // relative to bufB (row/col indices swapped). Out-of-range loads
+          // are zero-padded so partial edge tiles contribute nothing.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // barrier before the tiles are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back to col-major C, guarded to the A_col_size x B_col_size
+        // result extent (A enters transposed); beta == 0 is special-cased so
+        // C is never read when it is write-only
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...col_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: computes C = alpha * A^T * B^T + beta * C on strided
+      // sub-matrices (start/inc/size/internal_* describe each operand's
+      // window into padded storage). Each thread block stages 16x16 tiles of
+      // A and B in shared memory; each thread accumulates one C element.
+      // Assumes a 16x16 launch configuration (block_size hard-coded below)
+      // -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_col_col_row_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles stored with row stride block_size + 1,
+        // presumably to avoid shared-memory bank conflicts -- TODO(review)
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // offsets of this block's first tile (A col-major, B row-major; both
+        // enter transposed); the steps advance one 16-wide tile along the
+        // reduction dimension (A's rows == B's columns)
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // number of 16-wide tiles covering the reduction dimension
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one element per thread; out-of-range loads are zero-padded
+          // so partial edge tiles contribute nothing to the sum
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // barrier before the tiles are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back to col-major C, guarded to the A_col_size x B_row_size
+        // result extent (both operands transposed); beta == 0 is special-cased
+        // so C is never read when it is write-only
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...row_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: computes C = alpha * A * B + beta * C on strided
+      // sub-matrices (start/inc/size/internal_* describe each operand's
+      // window into padded storage). Each thread block stages 16x16 tiles of
+      // A and B in shared memory; each thread accumulates one C element.
+      // Assumes a 16x16 launch configuration (block_size hard-coded below)
+      // -- TODO(review) confirm at call sites.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_row_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles stored with row stride block_size + 1,
+        // presumably to avoid shared-memory bank conflicts -- TODO(review)
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // offsets of this block's first tile (A col-major, B row-major
+        // addressing); the steps advance one 16-wide tile along the
+        // reduction dimension (A's columns == B's rows)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // number of 16-wide tiles covering the reduction dimension
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for one element of the result tile
+        T Csub = 0;
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row offsets into the padded (stride block_size + 1) shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one element per thread; out-of-range loads are zero-padded
+          // so partial edge tiles contribute nothing to the sum
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // manually unrolled 16-term dot product over the staged tiles
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // barrier before the tiles are overwritten in the next iteration
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // write-back to row-major C (row index * internal_cols + column
+        // index), guarded to the A_row_size x B_col_size result extent;
+        // beta == 0 is special-cased so C is never read when write-only
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...row_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of
+      // C = alpha * A * B^T + beta * C, marching over 16-wide tiles of the
+      // reduction dimension (A_col_size, which must equal B_col_size) and
+      // staging each tile pair in shared memory. Every matrix is addressed
+      // through a (start, inc, size, internal_size) quadruple per dimension,
+      // so strided sub-matrix views can be multiplied in place.
+      // NOTE(review): block_size is hard-coded to 16 below, so this assumes a
+      // 16x16 launch configuration -- confirm against the host-side launch.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_row_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles are stored with a row stride of block_size + 1
+        // (see the *_times_block_size variables below), i.e. one padding
+        // element per row -- presumably to avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Linear offset of this block's first tile: A is column-major
+        // (consecutive columns are A_internal_rows elements apart), B is
+        // row-major (consecutive rows are B_internal_cols elements apart).
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+        vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // Number of tiles along the reduction dimension, rounded up.
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of the one element this thread loads per tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one tile of B; out-of-range elements are
+          // zero-padded so the unrolled loop below needs no bounds checks.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled dot product over the 16 entries of the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          __syncthreads();
+          // Advance both matrices to the next tile of the reduction dimension.
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Guarded write-back of alpha * Csub + beta * C (result is
+        // A_row_size x B_row_size); the beta == 0 branch skips reading C, so
+        // C need not be initialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...row_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of
+      // C = alpha * A^T * B + beta * C. Because A enters transposed, the
+      // reduction dimension is A_row_size (rows of A == rows of B). Matrices
+      // are addressed through (start, inc, size, internal_size) quadruples,
+      // so strided sub-matrix views work in place.
+      // NOTE(review): block_size is hard-coded to 16 below, so this assumes a
+      // 16x16 launch configuration -- confirm against the host-side launch.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_row_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles use a row stride of block_size + 1 (one padding
+        // element per row) -- presumably to avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // A is column-major (columns A_internal_rows apart); because A is
+        // transposed in the product, aBegin walks A's columns per C-row block
+        // and aStep advances along A's rows. B is row-major.
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // Number of tiles along the reduction dimension (A's rows), rounded up.
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of the one element this thread loads per tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A (written transposed into shared memory) and
+          // one tile of B; out-of-range elements are zero-padded so the
+          // unrolled loop below needs no bounds checks.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled dot product over the 16 entries of the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          __syncthreads();
+          // Advance both matrices to the next tile of the reduction dimension.
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Guarded write-back of alpha * Csub + beta * C (result is
+        // A_col_size x B_col_size); the beta == 0 branch skips reading C, so
+        // C need not be initialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...row_major, A...col_major, B...row_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of
+      // C = alpha * A^T * B^T + beta * C. Both operands enter transposed, so
+      // the reduction dimension is A_row_size (rows of A == columns of B).
+      // Matrices are addressed through (start, inc, size, internal_size)
+      // quadruples, so strided sub-matrix views work in place.
+      // NOTE(review): block_size is hard-coded to 16 below, so this assumes a
+      // 16x16 launch configuration -- confirm against the host-side launch.
+      template <typename T>
+      __global__ void matrix_matrix_row_col_row_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles use a row stride of block_size + 1 (one padding
+        // element per row) -- presumably to avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // A is column-major (columns A_internal_rows apart); B is row-major
+        // (rows B_internal_cols apart). Both are walked transposed: aStep
+        // advances along A's rows, bStep along B's columns.
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+        vcl_size_t aStep = block_size * A_row_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // Number of tiles along the reduction dimension (A's rows), rounded up.
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of the one element this thread loads per tile.
+        vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and of B (both written transposed into shared
+          // memory); out-of-range elements are zero-padded so the unrolled
+          // loop below needs no bounds checks.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled dot product over the 16 entries of the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          __syncthreads();
+          // Advance both matrices to the next tile of the reduction dimension.
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Guarded write-back of alpha * Csub + beta * C (result is
+        // A_col_size x B_row_size); the beta == 0 branch skips reading C, so
+        // C need not be initialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...col_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of
+      // C = alpha * A * B + beta * C, marching over 16-wide tiles of the
+      // reduction dimension (A_col_size, which must equal B_row_size) and
+      // staging each tile pair in shared memory. Matrices are addressed
+      // through (start, inc, size, internal_size) quadruples per dimension,
+      // so strided sub-matrix views can be multiplied in place.
+      // NOTE(review): block_size is hard-coded to 16 below, so this assumes a
+      // 16x16 launch configuration -- confirm against the host-side launch.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_col_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles use a row stride of block_size + 1 (one padding
+        // element per row) -- presumably to avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Linear offset of this block's first tile: A is row-major
+        // (consecutive rows are A_internal_cols elements apart), B is
+        // column-major (consecutive columns are B_internal_rows apart).
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // Number of tiles along the reduction dimension, rounded up.
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of the one element this thread loads per tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and of B; out-of-range elements are
+          // zero-padded so the unrolled loop below needs no bounds checks.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled dot product over the 16 entries of the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          __syncthreads();
+          // Advance both matrices to the next tile of the reduction dimension.
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Guarded write-back of alpha * Csub + beta * C into column-major C
+        // (result is A_row_size x B_col_size); the beta == 0 branch skips
+        // reading C, so C need not be initialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...col_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of
+      // C = alpha * A * B^T + beta * C, marching over 16-wide tiles of the
+      // reduction dimension (A_col_size == columns of B) and staging each
+      // tile pair in shared memory. Matrices are addressed through
+      // (start, inc, size, internal_size) quadruples per dimension, so
+      // strided sub-matrix views can be multiplied in place.
+      // NOTE(review): block_size is hard-coded to 16 below, so this assumes a
+      // 16x16 launch configuration -- confirm against the host-side launch.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_col_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles use a row stride of block_size + 1 (one padding
+        // element per row) -- presumably to avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // A is row-major (rows A_internal_cols apart); B is column-major
+        // (columns B_internal_rows apart). Because B is transposed in the
+        // product, bStep advances along B's columns.
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // Number of tiles along the reduction dimension, rounded up.
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of the one element this thread loads per tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and of B; out-of-range elements are
+          // zero-padded so the unrolled loop below needs no bounds checks.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled dot product over the 16 entries of the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          __syncthreads();
+          // Advance both matrices to the next tile of the reduction dimension.
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Guarded write-back of alpha * Csub + beta * C into column-major C
+        // (result is A_row_size x B_row_size); the beta == 0 branch skips
+        // reading C, so C need not be initialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...col_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of
+      // C = alpha * A^T * B + beta * C. Because A enters transposed, the
+      // reduction dimension is A_row_size (rows of A == rows of B). Matrices
+      // are addressed through (start, inc, size, internal_size) quadruples
+      // per dimension, so strided sub-matrix views work in place.
+      // NOTE(review): block_size is hard-coded to 16 below, so this assumes a
+      // 16x16 launch configuration -- confirm against the host-side launch.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_col_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: tiles use a row stride of block_size + 1 (one padding
+        // element per row) -- presumably to avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // A is row-major (rows A_internal_cols apart); because A is
+        // transposed in the product, aBegin walks A's columns per C-row block
+        // and aStep advances along A's rows. B is column-major.
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // Number of tiles along the reduction dimension (A's rows), rounded up.
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of the one element this thread loads per tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A (written transposed into shared memory) and
+          // one tile of B; out-of-range elements are zero-padded so the
+          // unrolled loop below needs no bounds checks.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // Manually unrolled dot product over the 16 entries of the staged tiles.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          __syncthreads();
+          // Advance both matrices to the next tile of the reduction dimension.
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Guarded write-back of alpha * Csub + beta * C into column-major C
+        // (result is A_col_size x B_col_size); the beta == 0 branch skips
+        // reading C, so C need not be initialized in that case.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...col_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C.
+      // Tiles of A and B are staged in shared memory and slid along the
+      // common dimension.  The *_start/*_inc/*_size/*_internal_* parameters
+      // describe a strided sub-matrix view (range/slice) of a padded buffer;
+      // alpha/beta are the usual GEMM scaling factors.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_col_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile and the per-tile stride;
+        // since A is taken transposed, the tiles slide down A's rows
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        // likewise for B (transposed, col-major storage): tiles slide along B's columns
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (col-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...row_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C via
+      // shared-memory staging of A- and B-tiles that slide along the common
+      // dimension.  The *_start/*_inc/*_size/*_internal_* parameters describe
+      // a strided sub-matrix view (range/slice) of a padded buffer.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_col_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile; tiles slide along A's columns (row-major)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        // first element of this block's first B-tile; tiles slide along B's rows (col-major)
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (row-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...row_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C via
+      // shared-memory staging of A- and B-tiles that slide along the common
+      // dimension.  The *_start/*_inc/*_size/*_internal_* parameters describe
+      // a strided sub-matrix view (range/slice) of a padded buffer.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_col_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile; tiles slide along A's columns (row-major)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        // first element of this block's first B-tile; B is used transposed, so
+        // the tiles slide along B's columns (col-major storage)
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (row-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...row_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C via
+      // shared-memory staging of A- and B-tiles that slide along the common
+      // dimension.  The *_start/*_inc/*_size/*_internal_* parameters describe
+      // a strided sub-matrix view (range/slice) of a padded buffer.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_col_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile; A is used transposed, so
+        // the tiles slide down A's rows (row-major storage)
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        // first element of this block's first B-tile; tiles slide along B's rows (col-major)
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+        vcl_size_t bStep = block_size * B_row_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (row-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...row_major, A...row_major, B...col_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C via
+      // shared-memory staging of A- and B-tiles that slide along the common
+      // dimension.  The *_start/*_inc/*_size/*_internal_* parameters describe
+      // a strided sub-matrix view (range/slice) of a padded buffer.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_col_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile; A is used transposed, so
+        // the tiles slide down A's rows (row-major storage)
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        // first element of this block's first B-tile; B is used transposed, so
+        // the tiles slide along B's columns (col-major storage)
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+        vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (row-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...col_major, A...row_major, B...row_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C via
+      // shared-memory staging of A- and B-tiles that slide along the common
+      // dimension.  The *_start/*_inc/*_size/*_internal_* parameters describe
+      // a strided sub-matrix view (range/slice) of a padded buffer.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_row_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile; tiles slide along A's columns (row-major)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        // first element of this block's first B-tile; tiles slide down B's rows (row-major)
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (col-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...col_major, A...row_major, B...row_major
+      //
+      // Tiled GEMM: each 16x16 thread block computes one 16x16 tile of C via
+      // shared-memory staging of A- and B-tiles that slide along the common
+      // dimension.  The *_start/*_inc/*_size/*_internal_* parameters describe
+      // a strided sub-matrix view (range/slice) of a padded buffer.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_row_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // 272 = 16 * 17: one padding column per tile row to avoid shared-memory bank conflicts
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        // fixed tile edge; kernel assumes a 16x16 thread block
+        vcl_size_t block_size = 16;//get_local_size(0);
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // first element of this block's first A-tile; tiles slide along A's columns (row-major)
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        // first element of this block's first B-tile; B is used transposed, so
+        // the tiles slide along B's columns (row-major storage)
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // number of 16-wide tiles covering the common dimension (rounded up)
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        // per-thread accumulator for a single entry of C
+        T Csub = 0;
+        // this thread's offset within the current tile of each operand
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // row strides into the padded (16+1)-wide shared tiles
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // stage one tile of each operand, zero-padding entries outside the sub-matrix
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          // manually unrolled 16-term dot product over the staged tiles
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // wait until all threads are done before the tiles are overwritten
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // bounds-checked write of C(i,j) (col-major); when beta == 0, C is written
+        // without being read (safe for uninitialized output buffers)
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...col_major, A...row_major, B...row_major
+      //
+      // Blocked GEMM: each 16x16 thread block accumulates one 16x16 tile of C,
+      // staging tiles of A and B in shared memory. The *_start/*_inc/*_size
+      // parameters describe a sub-matrix view (range/slice) inside a padded
+      // buffer of *_internal_rows x *_internal_cols elements.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_row_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // Shared-memory tiles; 272 = 16 * (16 + 1). The extra element per row
+        // matches the (block_size + 1) stride below -- presumably padding to
+        // avoid shared-memory bank conflicts (mirrors the OpenCL kernels).
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0); -- kernel assumes a fixed 16x16 thread block
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Linear start index of this block's first tile of A resp. B, and the
+        // stride from one tile to the next along the reduction dimension
+        // (rows of A and rows of B, since A enters transposed).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // Number of 16-wide tiles covering the reduction dimension (rounded up).
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of this thread's element within the current tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // Row bases into the padded 16x17 shared-memory tiles.
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one of B; out-of-range entries are
+          // zero-filled so the unrolled inner product needs no bounds checks.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // 16-element dot product over the staged tiles, manually unrolled.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Second barrier: no thread may overwrite the tiles (next iteration)
+          // while another is still reading them.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Bounds-checked write-back using column-major C addressing; the
+        // (beta == 0) branch avoids reading possibly uninitialized C memory.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...col_major, A...row_major, B...row_major
+      //
+      // Blocked GEMM: each 16x16 thread block accumulates one 16x16 tile of C,
+      // staging tiles of A and B in shared memory. The *_start/*_inc/*_size
+      // parameters describe a sub-matrix view (range/slice) inside a padded
+      // buffer of *_internal_rows x *_internal_cols elements.
+      template <typename T>
+      __global__ void matrix_matrix_col_row_row_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // Shared-memory tiles; 272 = 16 * (16 + 1). The extra element per row
+        // matches the (block_size + 1) stride below -- presumably padding to
+        // avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0); -- kernel assumes a fixed 16x16 thread block
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Start index and per-tile stride for A (traversed along its rows,
+        // since A enters transposed) and for B (traversed along its columns,
+        // since B enters transposed).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // Number of 16-wide tiles covering the reduction dimension (rounded up).
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of this thread's element within the current tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // Row bases into the padded 16x17 shared-memory tiles.
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one of B; out-of-range entries are
+          // zero-filled so the unrolled inner product needs no bounds checks.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // 16-element dot product over the staged tiles, manually unrolled.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Second barrier: no thread may overwrite the tiles (next iteration)
+          // while another is still reading them.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Bounds-checked write-back using column-major C addressing; the
+        // (beta == 0) branch avoids reading possibly uninitialized C memory.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+      }
+
+
+
+
+
+      ////////////////////////////////////////////////////////////////////////////
+
+
+
+
+      // matrix-matrix multiplication C = A * B
+      // matrix layouts: C...row_major, A...row_major, B...row_major
+      //
+      // Blocked GEMM: each 16x16 thread block accumulates one 16x16 tile of C,
+      // staging tiles of A and B in shared memory. The *_start/*_inc/*_size
+      // parameters describe a sub-matrix view (range/slice) inside a padded
+      // buffer of *_internal_rows x *_internal_cols elements.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_row_prod_AA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // Shared-memory tiles; 272 = 16 * (16 + 1). The extra element per row
+        // matches the (block_size + 1) stride below -- presumably padding to
+        // avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0); -- kernel assumes a fixed 16x16 thread block
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Start index and per-tile stride for A (traversed along its columns)
+        // and for B (traversed along its rows): reduction is over A's columns
+        // == B's rows.
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // Number of 16-wide tiles covering the reduction dimension (rounded up).
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of this thread's element within the current tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // Row bases into the padded 16x17 shared-memory tiles.
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one of B; out-of-range entries are
+          // zero-filled so the unrolled inner product needs no bounds checks.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // 16-element dot product over the staged tiles, manually unrolled.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Second barrier: no thread may overwrite the tiles (next iteration)
+          // while another is still reading them.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Bounds-checked write-back using row-major C addressing; the
+        // (beta == 0) branch avoids reading possibly uninitialized C memory.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A * B^T
+      // matrix layouts: C...row_major, A...row_major, B...row_major
+      //
+      // Blocked GEMM: each 16x16 thread block accumulates one 16x16 tile of C,
+      // staging tiles of A and B in shared memory. The *_start/*_inc/*_size
+      // parameters describe a sub-matrix view (range/slice) inside a padded
+      // buffer of *_internal_rows x *_internal_cols elements.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_row_prod_AT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // Shared-memory tiles; 272 = 16 * (16 + 1). The extra element per row
+        // matches the (block_size + 1) stride below -- presumably padding to
+        // avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0); -- kernel assumes a fixed 16x16 thread block
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Start index and per-tile stride for A (traversed along its columns)
+        // and for B (traversed along its columns, since B enters transposed).
+        vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+        vcl_size_t aStep = block_size * A_col_inc;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // Number of 16-wide tiles covering the reduction dimension (rounded up).
+        vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of this thread's element within the current tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // Row bases into the padded 16x17 shared-memory tiles.
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one of B; out-of-range entries are
+          // zero-filled so the unrolled inner product needs no bounds checks.
+          bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // 16-element dot product over the staged tiles, manually unrolled.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Second barrier: no thread may overwrite the tiles (next iteration)
+          // while another is still reading them.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Bounds-checked write-back using row-major C addressing; the
+        // (beta == 0) branch avoids reading possibly uninitialized C memory.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B
+      // matrix layouts: C...row_major, A...row_major, B...row_major
+      //
+      // Blocked GEMM: each 16x16 thread block accumulates one 16x16 tile of C,
+      // staging tiles of A and B in shared memory. The *_start/*_inc/*_size
+      // parameters describe a sub-matrix view (range/slice) inside a padded
+      // buffer of *_internal_rows x *_internal_cols elements.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_row_prod_TA_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // Shared-memory tiles; 272 = 16 * (16 + 1). The extra element per row
+        // matches the (block_size + 1) stride below -- presumably padding to
+        // avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0); -- kernel assumes a fixed 16x16 thread block
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Start index and per-tile stride for A (traversed along its rows,
+        // since A enters transposed) and for B (traversed along its rows).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+        vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+        // Number of 16-wide tiles covering the reduction dimension (rounded up).
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of this thread's element within the current tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // Row bases into the padded 16x17 shared-memory tiles.
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one of B; out-of-range entries are
+          // zero-filled so the unrolled inner product needs no bounds checks.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // 16-element dot product over the staged tiles, manually unrolled.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Second barrier: no thread may overwrite the tiles (next iteration)
+          // while another is still reading them.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Bounds-checked write-back using row-major C addressing; the
+        // (beta == 0) branch avoids reading possibly uninitialized C memory.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+      // matrix-matrix multiplication C = A^T * B^T
+      // matrix layouts: C...row_major, A...row_major, B...row_major
+      //
+      // Blocked GEMM: each 16x16 thread block accumulates one 16x16 tile of C,
+      // staging tiles of A and B in shared memory. The *_start/*_inc/*_size
+      // parameters describe a sub-matrix view (range/slice) inside a padded
+      // buffer of *_internal_rows x *_internal_cols elements.
+      template <typename T>
+      __global__ void matrix_matrix_row_row_row_prod_TT_kernel(
+                T alpha,
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * B,
+                unsigned int B_row_start,
+                unsigned int B_col_start,
+                unsigned int B_row_inc,
+                unsigned int B_col_inc,
+                unsigned int B_row_size,
+                unsigned int B_col_size,
+                unsigned int B_internal_rows,
+                unsigned int B_internal_cols,
+                T beta,
+                T * C,
+                unsigned int C_row_start,
+                unsigned int C_col_start,
+                unsigned int C_row_inc,
+                unsigned int C_col_inc,
+                unsigned int C_row_size,
+                unsigned int C_col_size,
+                unsigned int C_internal_rows,
+                unsigned int C_internal_cols)
+      {
+
+        // Shared-memory tiles; 272 = 16 * (16 + 1). The extra element per row
+        // matches the (block_size + 1) stride below -- presumably padding to
+        // avoid shared-memory bank conflicts.
+        __shared__ T bufA[272];
+        __shared__ T bufB[272];
+
+        vcl_size_t block_size = 16;//get_local_size(0); -- kernel assumes a fixed 16x16 thread block
+        vcl_size_t row_block_id = blockIdx.x;
+        vcl_size_t col_block_id = blockIdx.y;
+        vcl_size_t row_thread_id = threadIdx.x;
+        vcl_size_t col_thread_id = threadIdx.y;
+        // Start index and per-tile stride for A (traversed along its rows,
+        // since A enters transposed) and for B (traversed along its columns,
+        // since B enters transposed).
+        vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+        vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+        vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+        vcl_size_t bStep = block_size * B_col_inc;
+        // Number of 16-wide tiles covering the reduction dimension (rounded up).
+        vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+        T Csub = 0;
+        // Per-thread offset of this thread's element within the current tile.
+        vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+        vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+        // Row bases into the padded 16x17 shared-memory tiles.
+        vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+        vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+        for (vcl_size_t block = 0;
+                block < block_num;
+                ++block)
+        {
+          // Stage one tile of A and one of B; out-of-range entries are
+          // zero-filled so the unrolled inner product needs no bounds checks.
+          bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+          bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+          __syncthreads();
+          T * bufAptr = bufA + row_thread_id_times_block_size;
+          T * bufBptr = bufB + col_thread_id_times_block_size;
+          // 16-element dot product over the staged tiles, manually unrolled.
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+            Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+          // Second barrier: no thread may overwrite the tiles (next iteration)
+          // while another is still reading them.
+          __syncthreads();
+          aBegin += aStep;
+          bBegin += bStep;
+        }
+        // Bounds-checked write-back using row-major C addressing; the
+        // (beta == 0) branch avoids reading possibly uninitialized C memory.
+        if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+          C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+      }
+
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/matrix_operations_row.hpp b/viennacl/linalg/cuda/matrix_operations_row.hpp
new file mode 100644
index 0000000..d035507
--- /dev/null
+++ b/viennacl/linalg/cuda/matrix_operations_row.hpp
@@ -0,0 +1,1419 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations_row.hpp
+    @brief Implementations of row-major dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+      //
+      // am
+      //
+
+      // alpha on CPU
+      template <typename T>
+      __global__ void am_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+            for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+              A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
+        }
+        else
+        {
+          for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+            for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+              A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
+        }
+      }
+
+      // alpha on GPU
+      template <typename T>
+      __global__ void am_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+            for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+              A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
+        }
+        else
+        {
+          for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+            for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+              A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
+        }
+      }
+
+
+      //
+      // ambm
+      //
+
+      // alpha and beta on CPU
+      template <typename T>
+      __global__ void ambm_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+
+      // alpha on CPU, beta on GPU
+      template <typename T>
+      __global__ void ambm_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+      // alpha on GPU, beta on CPU
+      template <typename T>
+      __global__ void ambm_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+
+      // alpha and beta on GPU
+      template <typename T>
+      __global__ void ambm_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+              = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+
+      //
+      // ambm_m
+      //
+
+      // alpha and beta on CPU
+      template <typename T>
+      __global__ void ambm_m_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+
+      // alpha on CPU, beta on GPU
+      template <typename T>
+      __global__ void ambm_m_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+      // alpha on GPU, beta on CPU
+      template <typename T>
+      __global__ void ambm_m_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                T fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+
+      // alpha and beta on GPU
+      template <typename T>
+      __global__ void ambm_m_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * fac2,
+                unsigned int options2,
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * fac3,
+                unsigned int options3,
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+          }
+          else
+          {
+            for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+              for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+                A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+             += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+              + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+          }
+        }
+      }
+
+      //
+      // assignments
+      //
+
+      // Fills the selected row-major sub-matrix of A with the constant alpha.
+      // Blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_assign_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+                T alpha)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int row_offset = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[row_offset + c * A_inc2] = alpha;
+        }
+      }
+
+
+      // Writes alpha onto the main diagonal of the selected sub-matrix of A
+      // (row-major layout), using a flat grid-stride loop over diagonal entries.
+      // NOTE(review): iterates A_size1 entries; appears to assume the selected
+      // region satisfies A_size1 <= A_size2 -- confirm at call sites.
+      template <typename T>
+      __global__ void matrix_row_diagonal_assign_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+                T alpha)
+      {
+        unsigned int step = blockDim.x * gridDim.x;
+
+        for (unsigned int d = blockIdx.x * blockDim.x + threadIdx.x; d < A_size1; d += step)
+          A[(d * A_inc1 + A_start1) * A_internal_size2 + d * A_inc2 + A_start2] = alpha;
+      }
+
+      //
+      // binary element-wise operations
+      //
+
+      // Elementwise binary operation A = B op C on row-major sub-matrices,
+      // with op selected at runtime: 0 -> product, 1 -> division, 2 -> pow.
+      // Any other op_type leaves A untouched.
+      template <typename T>
+      __global__ void element_op_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+                unsigned int op_type) //0: product, 1: division, 2: pow
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        switch (op_type)
+        {
+          case 2: // elementwise power
+            for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+            {
+              unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+              unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+              unsigned int c_row = (r * C_inc1 + C_start1) * C_internal_size2 + C_start2;
+              for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+                A[a_row + c * A_inc2] = pow(B[b_row + c * B_inc2], C[c_row + c * C_inc2]);
+            }
+            break;
+
+          case 1: // elementwise division
+            for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+            {
+              unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+              unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+              unsigned int c_row = (r * C_inc1 + C_start1) * C_internal_size2 + C_start2;
+              for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+                A[a_row + c * A_inc2] = B[b_row + c * B_inc2] / C[c_row + c * C_inc2];
+            }
+            break;
+
+          case 0: // elementwise product
+            for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+            {
+              unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+              unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+              unsigned int c_row = (r * C_inc1 + C_start1) * C_internal_size2 + C_start2;
+              for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+                A[a_row + c * A_inc2] = B[b_row + c * B_inc2] * C[c_row + c * C_inc2];
+            }
+            break;
+
+          default: // unknown op: no-op, matching the original if/else chain
+            break;
+        }
+      }
+
+      // Integer variant of the elementwise binary operation A = B op C:
+      // 0 -> product, 1 -> division (no pow case for integral types).
+      // Any other op_type leaves A untouched.
+      template <typename T>
+      __global__ void element_op_int_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2,
+
+                const T * C,
+                unsigned int C_start1, unsigned int C_start2,
+                unsigned int C_inc1,   unsigned int C_inc2,
+                unsigned int C_internal_size1,  unsigned int C_internal_size2,
+
+                unsigned int op_type) //0: product, 1: division, 2: pow
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        switch (op_type)
+        {
+          case 1: // elementwise division
+            for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+            {
+              unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+              unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+              unsigned int c_row = (r * C_inc1 + C_start1) * C_internal_size2 + C_start2;
+              for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+                A[a_row + c * A_inc2] = B[b_row + c * B_inc2] / C[c_row + c * C_inc2];
+            }
+            break;
+
+          case 0: // elementwise product
+            for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+            {
+              unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+              unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+              unsigned int c_row = (r * C_inc1 + C_start1) * C_internal_size2 + C_start2;
+              for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+                A[a_row + c * A_inc2] = B[b_row + c * B_inc2] * C[c_row + c * C_inc2];
+            }
+            break;
+
+          default: // unknown op (incl. pow): no-op, matching the original
+            break;
+        }
+      }
+
+      //
+      // unary element-wise operations
+      //
+
+      // abs
+      // A(i,j) = abs(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_abs_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = abs(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // acos
+      // A(i,j) = acos(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_acos_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = acos(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // asin
+      // A(i,j) = asin(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_asin_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = asin(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // atan
+      // A(i,j) = atan(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_atan_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = atan(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // ceil
+      // A(i,j) = ceil(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_ceil_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = ceil(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // cos
+      // A(i,j) = cos(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_cos_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = cos(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // cosh
+      // A(i,j) = cosh(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_cosh_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = cosh(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // exp
+      // A(i,j) = exp(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_exp_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = exp(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // fabs
+      // A(i,j) = fabs(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_fabs_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = fabs(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // floor
+      // A(i,j) = floor(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_floor_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = floor(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // log
+      // A(i,j) = log(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_log_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = log(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // log10
+      // A(i,j) = log10(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_log10_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = log10(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // sin
+      // A(i,j) = sin(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_sin_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = sin(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // sinh
+      // A(i,j) = sinh(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_sinh_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = sinh(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // sqrt
+      // A(i,j) = sqrt(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_sqrt_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = sqrt(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // tan
+      // A(i,j) = tan(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_tan_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = tan(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+      // tanh
+      // A(i,j) = tanh(B(i,j)) on the selected row-major sub-matrices;
+      // blocks stride over rows, threads within a block stride over columns.
+      template <typename T>
+      __global__ void matrix_row_element_tanh_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * B,
+                unsigned int B_start1, unsigned int B_start2,
+                unsigned int B_inc1,   unsigned int B_inc2,
+                unsigned int B_internal_size1,  unsigned int B_internal_size2)
+      {
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          unsigned int a_row = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          unsigned int b_row = (r * B_inc1 + B_start1) * B_internal_size2 + B_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[a_row + c * A_inc2] = tanh(B[b_row + c * B_inc2]);
+        }
+      }
+
+
+
+      //
+      // matrix-vector product
+      //
+
+      // result = A * v for a row-major sub-matrix A.
+      // One block processes one row at a time: each thread accumulates a
+      // partial dot product over a column stripe, then the partials are
+      // combined by a shared-memory tree reduction and thread 0 writes the
+      // row's result.
+      //
+      // Requirements (not checked here -- must be guaranteed by the launch
+      // configuration): blockDim.x <= 128 (capacity of work[]) and
+      // blockDim.x a power of two (the reduction halves the stride).
+      template <typename T>
+      __global__ void vec_mul_row_kernel(
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * v,
+                unsigned int v_start,
+                unsigned int v_inc,
+                unsigned int v_size,
+                T * result,
+                unsigned int result_start,
+                unsigned int result_inc,
+                unsigned int result_size)
+      {
+        __shared__ T work[128];
+
+        unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+        unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+        unsigned int lid = threadIdx.x;
+
+        for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
+        {
+          T dot_prod = 0;
+          for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
+            dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
+          work[lid] = dot_prod;
+
+          for(unsigned int stride = blockDim.x/2 ; stride>0 ; stride>>=1){
+            __syncthreads();
+            if(lid < stride)
+              work[lid] += work[lid+stride];
+          }
+
+          if(lid == 0)
+            result[row * result_inc + result_start] = work[0];
+
+          // Bug fix: barrier before the next row iteration. Without it, a
+          // fast thread can overwrite its work[] slot for the next row while
+          // thread 0 is still reading the tail of the current reduction.
+          __syncthreads();
+        }
+      }
+
+
+      // result = A^T * v for a row-major sub-matrix A: each thread owns
+      // result entries in a flat grid-stride fashion and walks one column
+      // of A sequentially to form the dot product.
+      template <typename T>
+      __global__ void trans_vec_mul_row_kernel(
+                const T * A,
+                unsigned int A_row_start,
+                unsigned int A_col_start,
+                unsigned int A_row_inc,
+                unsigned int A_col_inc,
+                unsigned int A_row_size,
+                unsigned int A_col_size,
+                unsigned int A_internal_rows,
+                unsigned int A_internal_cols,
+                const T * v,
+                unsigned int v_start,
+                unsigned int v_inc,
+                unsigned int v_size,
+                T * result,
+                unsigned int result_start,
+                unsigned int result_inc,
+                unsigned int result_size)
+      {
+        unsigned int step = gridDim.x * blockDim.x;
+
+        for (unsigned int out_idx = blockIdx.x * blockDim.x + threadIdx.x; out_idx < A_col_size; out_idx += step)
+        {
+          T acc = 0;
+          unsigned int col_offset = out_idx * A_col_inc + A_col_start;
+          for (unsigned int k = 0; k < A_row_size; ++k)
+            acc += A[col_offset + (k * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * k];
+          result[out_idx * result_inc + result_start] = acc;
+        }
+      }
+
+
+      //
+      // matrix-matrix products
+      //
+
+
+
+
+      //
+      // scaled rank-1-update
+      //
+
+      // alpha on CPU
+      // A += alpha * vec1 * vec2^T on the selected row-major sub-matrix.
+      // options2 bit 0 flips the sign of alpha, bit 1 takes its reciprocal.
+      template <typename T>
+      __global__ void scaled_rank1_update_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                T val,
+                unsigned int options2,
+
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+
+                const T * vec2,
+                unsigned int start2,
+                unsigned int inc2,
+                unsigned int size2)
+      {
+        T alpha = (options2 & (1 << 0)) ? -val : val;
+        if (options2 & (1 << 1))
+          alpha = ((T)(1)) / alpha;
+
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          T scaled = alpha * vec1[r * inc1 + start1];
+          unsigned int row_offset = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[row_offset + c * A_inc2] += scaled * vec2[c * inc2 + start2];
+        }
+      }
+
+
+      // alpha on GPU
+      // Same rank-1 update as above, but alpha is read from device memory.
+      // options2 bit 0 flips the sign of alpha, bit 1 takes its reciprocal.
+      template <typename T>
+      __global__ void scaled_rank1_update_row_kernel(
+                T * A,
+                unsigned int A_start1, unsigned int A_start2,
+                unsigned int A_inc1,   unsigned int A_inc2,
+                unsigned int A_size1,  unsigned int A_size2,
+                unsigned int A_internal_size1,  unsigned int A_internal_size2,
+
+                const T * val,
+                unsigned int options2,
+
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+
+                const T * vec2,
+                unsigned int start2,
+                unsigned int inc2,
+                unsigned int size2)
+      {
+        T alpha = (options2 & (1 << 0)) ? -*val : *val;
+        if (options2 & (1 << 1))
+          alpha = ((T)(1)) / alpha;
+
+        unsigned int flat_id  = blockIdx.x * blockDim.x + threadIdx.x;
+        unsigned int block_id = flat_id / blockDim.x;   // equals blockIdx.x
+        unsigned int local_id = flat_id % blockDim.x;   // equals threadIdx.x
+
+        for (unsigned int r = block_id; r < A_size1; r += gridDim.x)
+        {
+          T scaled = alpha * vec1[r * inc1 + start1];
+          unsigned int row_offset = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
+          for (unsigned int c = local_id; c < A_size2; c += blockDim.x)
+            A[row_offset + c * A_inc2] += scaled * vec2[c * inc2 + start2];
+        }
+      }
+
+
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/misc_operations.hpp b/viennacl/linalg/cuda/misc_operations.hpp
new file mode 100644
index 0000000..ee58b04
--- /dev/null
+++ b/viennacl/linalg/cuda/misc_operations.hpp
@@ -0,0 +1,93 @@
+#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/misc_operations.hpp
+    @brief Implementations of miscellaneous operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+
+      namespace detail
+      {
+
+        /** @brief Substitution step of a level-scheduling triangular solve, performed in place on vec.
+        *
+        * For each CSR row handled in a grid-stride loop, row_index_array names the
+        * vector entry it updates; the row's off-entries are subtracted from it.
+        * NOTE(review): rows within one launch are presumably independent (one
+        * elimination level) -- confirm with the caller.
+        */
+        template <typename T>
+        __global__ void level_scheduling_substitute_kernel(
+                  const unsigned int * row_index_array,
+                  const unsigned int * row_indices,
+                  const unsigned int * column_indices,
+                  const T * elements,
+                  T * vec,
+                  unsigned int size)
+        {
+          for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                            row  < size;
+                            row += gridDim.x * blockDim.x)
+          {
+            unsigned int eq_row = row_index_array[row];
+            T vec_entry = vec[eq_row];
+            unsigned int row_end = row_indices[row+1];
+
+            for (unsigned int j = row_indices[row]; j < row_end; ++j)
+              vec_entry -= vec[column_indices[j]] * elements[j];
+
+            vec[eq_row] = vec_entry;
+          }
+        }
+
+
+
+        /** @brief Host wrapper: substitutes one elimination level into vec via level_scheduling_substitute_kernel.
+        *
+        * @param vec              The vector updated in place
+        * @param row_index_array  For each CSR row, the index of the vector entry it updates
+        * @param row_buffer       CSR row start offsets
+        * @param col_buffer       CSR column indices
+        * @param element_buffer   CSR nonzero values
+        * @param num_rows         Number of rows processed in this level
+        */
+        template <typename ScalarType>
+        void level_scheduling_substitute(vector<ScalarType> & vec,
+                                     viennacl::backend::mem_handle const & row_index_array,
+                                     viennacl::backend::mem_handle const & row_buffer,
+                                     viennacl::backend::mem_handle const & col_buffer,
+                                     viennacl::backend::mem_handle const & element_buffer,
+                                     vcl_size_t num_rows
+                                    )
+        {
+          level_scheduling_substitute_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(row_index_array.cuda_handle()),
+                                                       detail::cuda_arg<unsigned int>(row_buffer.cuda_handle()),
+                                                       detail::cuda_arg<unsigned int>(col_buffer.cuda_handle()),
+                                                       detail::cuda_arg<ScalarType>(element_buffer.cuda_handle()),
+                                                       detail::cuda_arg<ScalarType>(vec),
+                                                       static_cast<unsigned int>(num_rows)
+                                                      );
+          // consistency fix: every other kernel launch in these CUDA backends checks
+          // for launch errors; this wrapper was missing the check
+          VIENNACL_CUDA_LAST_ERROR_CHECK("level_scheduling_substitute_kernel");
+        }
+
+      }
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/scalar_operations.hpp b/viennacl/linalg/cuda/scalar_operations.hpp
new file mode 100644
index 0000000..d51c8f4
--- /dev/null
+++ b/viennacl/linalg/cuda/scalar_operations.hpp
@@ -0,0 +1,380 @@
+#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/scalar_operations.hpp
+    @brief Implementations of scalar operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+
+      namespace detail
+      {
+
+      }
+
+      /////////////////// as /////////////////////////////
+
+      /** @brief Kernel computing *s1 = *s2 * alpha, with alpha read from GPU memory (bit 0 of options2: negate alpha, bit 1: use 1/alpha). */
+      template <typename T>
+      __global__ void as_kernel(T * s1, const T * fac2, unsigned int options2, const T * s2)
+      {
+          T alpha = *fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          *s1 = *s2 * alpha;
+      }
+
+      /** @brief Kernel computing *s1 = *s2 * alpha, with alpha passed by value from the host (bit 0 of options2: negate alpha, bit 1: use 1/alpha). */
+      template <typename T>
+      __global__ void as_kernel(T * s1, T fac2, unsigned int options2, const T * s2)
+      {
+          T alpha = fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          *s1 = *s2 * alpha;
+      }
+
+      /** @brief Computes s1 = alpha * s2 for GPU scalars s1, s2; alpha may be a CPU or a GPU scalar.
+      *
+      * reciprocal_alpha/flip_sign_alpha are folded into an options bitfield which the
+      * kernels decode (bit 0: flip sign, bit 1: reciprocal).
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                  >::type
+      as(S1 & s1,
+         S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // host-side copy; only meaningful when alpha is a CPU scalar
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+        // arg_reference() forwards either the device scalar or the host copy, which
+        // selects the matching as_kernel overload; one thread suffices for scalars
+        as_kernel<<<1, 1>>>(detail::cuda_arg<value_type>(s1),
+                            detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                            options_alpha,
+                            detail::cuda_arg<value_type>(s2));
+        VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel");
+      }
+
+      //////////////////// asbs ////////////////////////////
+
+      // alpha and beta on GPU
+      /** @brief Kernel computing *s1 = *s2 * alpha + *s3 * beta (bit 0 of each options word: negate the factor, bit 1: use its reciprocal). */
+      template <typename T>
+      __global__ void asbs_kernel(T * s1,
+                                  const T * fac2, unsigned int options2, const T * s2,
+                                  const T * fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = *fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = *fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 = *s2 * alpha + *s3 * beta;
+      }
+
+      // alpha on CPU, beta on GPU
+      /** @brief Kernel computing *s1 = *s2 * alpha + *s3 * beta; alpha passed by value, beta read from GPU memory. */
+      template <typename T>
+      __global__ void asbs_kernel(T * s1,
+                                  T fac2, unsigned int options2, const T * s2,
+                                  const T * fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = *fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 = *s2 * alpha + *s3 * beta;
+      }
+
+      // alpha on GPU, beta on CPU
+      /** @brief Kernel computing *s1 = *s2 * alpha + *s3 * beta; alpha read from GPU memory, beta passed by value. */
+      template <typename T>
+      __global__ void asbs_kernel(T * s1,
+                                  const T * fac2, unsigned int options2, const T * s2,
+                                  T fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = *fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 = *s2 * alpha + *s3 * beta;
+      }
+
+      // alpha and beta on CPU
+      /** @brief Kernel computing *s1 = *s2 * alpha + *s3 * beta; both factors passed by value from the host. */
+      template <typename T>
+      __global__ void asbs_kernel(T * s1,
+                                  T fac2, unsigned int options2, const T * s2,
+                                  T fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 = *s2 * alpha + *s3 * beta;
+      }
+
+
+      /** @brief Computes s1 = alpha * s2 + beta * s3 for GPU scalars; alpha and beta may each be CPU or GPU scalars.
+      *
+      * arg_reference() forwards either a device pointer or the host copy for each
+      * factor, selecting the matching asbs_kernel overload above.
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1,
+                typename S3, typename ScalarType2>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_scalar<S3>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                    && viennacl::is_any_scalar<ScalarType2>::value
+                                  >::type
+      asbs(S1 & s1,
+           S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+           S3 const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        // host-side copies; only meaningful for CPU scalars
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<ScalarType2>::value)
+          temporary_beta = beta;
+
+        asbs_kernel<<<1, 1>>>(detail::cuda_arg<value_type>(s1),
+                              detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                              options_alpha,
+                              detail::cuda_arg<value_type>(s2),
+                              detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                              options_beta,
+                              detail::cuda_arg<value_type>(s3) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel");
+      }
+
+      //////////////////// asbs_s ////////////////////
+
+      // alpha and beta on GPU
+      /** @brief Kernel computing the in-place update *s1 += *s2 * alpha + *s3 * beta (note the +=), both factors read from GPU memory. */
+      template <typename T>
+      __global__ void asbs_s_kernel(T * s1,
+                                    const T * fac2, unsigned int options2, const T * s2,
+                                    const T * fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = *fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = *fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 += *s2 * alpha + *s3 * beta;
+      }
+
+      // alpha on CPU, beta on GPU
+      /** @brief Kernel computing *s1 += *s2 * alpha + *s3 * beta; alpha passed by value, beta read from GPU memory. */
+      template <typename T>
+      __global__ void asbs_s_kernel(T * s1,
+                                    T fac2, unsigned int options2, const T * s2,
+                                    const T * fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = *fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 += *s2 * alpha + *s3 * beta;
+      }
+
+      // alpha on GPU, beta on CPU
+      /** @brief Kernel computing *s1 += *s2 * alpha + *s3 * beta; alpha read from GPU memory, beta passed by value. */
+      template <typename T>
+      __global__ void asbs_s_kernel(T * s1,
+                                    const T * fac2, unsigned int options2, const T * s2,
+                                    T fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = *fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 += *s2 * alpha + *s3 * beta;
+      }
+
+      // alpha and beta on CPU
+      /** @brief Kernel computing *s1 += *s2 * alpha + *s3 * beta; both factors passed by value from the host. */
+      template <typename T>
+      __global__ void asbs_s_kernel(T * s1,
+                                    T fac2, unsigned int options2, const T * s2,
+                                    T fac3, unsigned int options3, const T * s3)
+      {
+          T alpha = fac2;
+          if (options2 & (1 << 0))
+            alpha = -alpha;
+          if (options2 & (1 << 1))
+            alpha = ((T)(1)) / alpha;
+
+          T beta = fac3;
+          if (options3 & (1 << 0))
+            beta = -beta;
+          if (options3 & (1 << 1))
+            beta = ((T)(1)) / beta;
+
+          *s1 += *s2 * alpha + *s3 * beta;
+      }
+
+
+      /** @brief Computes s1 += alpha * s2 + beta * s3 for GPU scalars; alpha and beta may each be CPU or GPU scalars.
+      *
+      * arg_reference() forwards either a device pointer or the host copy for each
+      * factor, selecting the matching asbs_s_kernel overload above.
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1,
+                typename S3, typename ScalarType2>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_scalar<S3>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                    && viennacl::is_any_scalar<ScalarType2>::value
+                                  >::type
+      asbs_s(S1 & s1,
+             S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+             S3 const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        // host-side copies; only meaningful for CPU scalars
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<ScalarType2>::value)
+          temporary_beta = beta;
+
+        // fix: removed leftover debug output (std::cout << "Launching asbs_s_kernel...")
+        // -- a header-only library must not write to stdout on every scalar operation
+        asbs_s_kernel<<<1, 1>>>(detail::cuda_arg<value_type>(s1),
+                                detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                options_alpha,
+                                detail::cuda_arg<value_type>(s2),
+                                detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                options_beta,
+                                detail::cuda_arg<value_type>(s3) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel");
+      }
+
+      ///////////////// swap //////////////////
+
+      /** @brief Kernel exchanging the values pointed to by s1 and s2 through a temporary. */
+      template <typename T>
+      __global__ void scalar_swap_kernel(T * s1, T * s2)
+      {
+        T tmp = *s2;
+        *s2 = *s1;
+        *s1 = tmp;
+      }
+
+      /** @brief Swaps the contents of two scalars, data is copied
+      *
+      * @param s1   The first scalar
+      * @param s2   The second scalar
+      */
+      template <typename S1, typename S2>
+      typename viennacl::enable_if<    viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                  >::type
+      swap(S1 & s1, S2 & s2)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        scalar_swap_kernel<<<1, 1>>>(detail::cuda_arg<value_type>(s1), detail::cuda_arg<value_type>(s2));
+        // consistency fix: check for launch errors like every other kernel launch here
+        VIENNACL_CUDA_LAST_ERROR_CHECK("scalar_swap_kernel");
+      }
+
+
+
+    } //namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/sparse_matrix_operations.hpp b/viennacl/linalg/cuda/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..06e7a7d
--- /dev/null
+++ b/viennacl/linalg/cuda/sparse_matrix_operations.hpp
@@ -0,0 +1,1831 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+      //
+      // Compressed matrix
+      //
+
+      namespace detail
+      {
+
+        /** @brief Extracts per-row information from a CSR matrix in a grid-stride loop over rows.
+        *
+        * option selects what result[row] receives: 0 = row inf-norm, 1 = row 1-norm,
+        * 2 = row 2-norm, 3 = diagonal entry (0 if the row has no diagonal entry).
+        */
+        template <typename T>
+        __global__ void csr_row_info_extractor_kernel(
+                  const unsigned int * row_indices,
+                  const unsigned int * column_indices,
+                  const T * elements,
+                  T * result,
+                  unsigned int size,
+                  unsigned int option)
+        {
+          for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                            row  < size;
+                            row += gridDim.x * blockDim.x)
+          {
+            T value = 0;
+            unsigned int row_end = row_indices[row+1];
+
+            switch (option)
+            {
+              case 0: //inf-norm
+                for (unsigned int i = row_indices[row]; i < row_end; ++i)
+                  value = max(value, fabs(elements[i]));
+                break;
+
+              case 1: //1-norm
+                for (unsigned int i = row_indices[row]; i < row_end; ++i)
+                  value += fabs(elements[i]);
+                break;
+
+              case 2: //2-norm
+                for (unsigned int i = row_indices[row]; i < row_end; ++i)
+                  value += elements[i] * elements[i];
+                value = sqrt(value);
+                break;
+
+              case 3: //diagonal entry
+                for (unsigned int i = row_indices[row]; i < row_end; ++i)
+                {
+                  if (column_indices[i] == row)
+                  {
+                    value = elements[i];
+                    break;
+                  }
+                }
+                break;
+
+              default:
+                break;
+            }
+            result[row] = value;
+          }
+        }
+
+
+        /** @brief Host wrapper: fills vec with per-row information (norms or diagonal) of a compressed_matrix.
+        *
+        * info_selector is forwarded as the kernel's 'option' code; handle1/handle2/handle
+        * are the CSR row-offset, column-index, and value buffers respectively.
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
+                      vector_base<ScalarType> & vec,
+                      viennacl::linalg::detail::row_info_types info_selector)
+        {
+          csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                                      detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(vec),
+                                                      static_cast<unsigned int>(mat.size1()),
+                                                      static_cast<unsigned int>(info_selector)
+                                                     );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
+        }
+
+      } //namespace detail
+
+
+      /** @brief CSR sparse matrix-vector product kernel: result = mat * x, one row per thread in a grid-stride loop.
+      *
+      * x and result are strided vectors described by their (start, inc) pairs.
+      */
+      template <typename T>
+      __global__ void compressed_matrix_vec_mul_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                const T * x,
+                unsigned int start_x,
+                unsigned int inc_x,
+                T * result,
+                unsigned int start_result,
+                unsigned int inc_result,
+                unsigned int size_result)
+      {
+        for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
+                          row  < size_result;
+                          row += gridDim.x * blockDim.x)
+        {
+          // accumulate the dot product of the sparse row with x
+          T dot_prod = (T)0;
+          unsigned int row_end = row_indices[row+1];
+          for (unsigned int i = row_indices[row]; i < row_end; ++i)
+            dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
+          result[row * inc_result + start_result] = dot_prod;
+        }
+      }
+
+
+
+
+
+      /** @brief Carries out matrix-vector multiplication with a compressed_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        // fixed 128x128 launch; the kernel's grid-stride loop covers all rows
+        compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                                       detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                                       detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                       detail::cuda_arg<ScalarType>(vec),
+                                                       static_cast<unsigned int>(vec.start()),
+                                                       static_cast<unsigned int>(vec.stride()),
+                                                       detail::cuda_arg<ScalarType>(result),
+                                                       static_cast<unsigned int>(result.start()),
+                                                       static_cast<unsigned int>(result.stride()),
+                                                       static_cast<unsigned int>(result.size())
+                                                      );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+      }
+
+      /** @brief Helper struct for accessing an element of a row- or column-major matrix.
+        *
+        * @tparam LayoutT   The layout tag: Either row_major or column_major
+        */
+      template <typename LayoutT>
+      struct mat_mult_matrix_index
+      {
+        // Primary (row-major) case: linear index of element (i, j) of the logical
+        // sub-matrix described by the start/inc pairs; internal_rows is unused here.
+        static __device__ unsigned int apply(unsigned int i, unsigned int j,
+                                      unsigned int row_start, unsigned int row_inc,
+                                      unsigned int col_start, unsigned int col_inc,
+                                      unsigned int internal_rows, unsigned int internal_cols)
+        {
+          return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
+        }
+      };
+
+      /** \cond */
+      // Column-major specialization: same interface, column stride is internal_rows.
+      template <>
+      struct mat_mult_matrix_index<viennacl::column_major>
+      {
+        static __device__ unsigned int apply(unsigned int i, unsigned int j,
+                                      unsigned int row_start, unsigned int row_inc,
+                                      unsigned int col_start, unsigned int col_inc,
+                                      unsigned int internal_rows, unsigned int internal_cols)
+        {
+          return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
+        }
+      };
+      /** \endcond */
+
+
+      /** @brief Kernel for result = sp_mat * d_mat (CSR times dense matrix).
+      *
+      * Blocks stride over the sparse rows (row = blockIdx.x), the threads of a block
+      * stride over the result columns; DMatIndexT/ResultIndexT abstract the dense
+      * layouts. d_mat_row_size/d_mat_col_size are part of the uniform signature but
+      * unused in the loop bounds here.
+      */
+      template <typename DMatIndexT, typename ResultIndexT, typename T>
+      __global__ void compressed_matrix_d_mat_mul_kernel(
+                const unsigned int * sp_mat_row_indices,
+                const unsigned int * sp_mat_col_indices,
+                const T * sp_mat_elements,
+                const T * d_mat,
+                unsigned int d_mat_row_start,
+                unsigned int d_mat_col_start,
+                unsigned int d_mat_row_inc,
+                unsigned int d_mat_col_inc,
+                unsigned int d_mat_row_size,
+                unsigned int d_mat_col_size,
+                unsigned int d_mat_internal_rows,
+                unsigned int d_mat_internal_cols,
+                T * result,
+                unsigned int result_row_start,
+                unsigned int result_col_start,
+                unsigned int result_row_inc,
+                unsigned int result_col_inc,
+                unsigned int result_row_size,
+                unsigned int result_col_size,
+                unsigned int result_internal_rows,
+                unsigned int result_internal_cols) {
+
+        for (unsigned int row  = blockIdx.x; row  < result_row_size; row += gridDim.x) {
+
+          unsigned int row_start = sp_mat_row_indices[row];
+          unsigned int row_end = sp_mat_row_indices[row+1];
+
+          for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) {
+
+            T r = 0;
+
+            // dot product of sparse row 'row' with dense column 'col'
+            for (unsigned int k = row_start; k < row_end; k++) {
+
+              unsigned int j = sp_mat_col_indices[k];
+              T x = sp_mat_elements[k];
+              T y = d_mat[ DMatIndexT::apply(j, col,
+                                             d_mat_row_start, d_mat_row_inc,
+                                             d_mat_col_start, d_mat_col_inc,
+                                             d_mat_internal_rows, d_mat_internal_cols) ];
+
+              r += x * y;
+            }
+
+            result [ ResultIndexT::apply(row, col,
+                                        result_row_start, result_row_inc,
+                                        result_col_start, result_col_inc,
+                                        result_internal_rows, result_internal_cols) ] = r;
+          }
+
+        }
+
+      }
+
+
+      /** @brief Carries out sparse_matrix-dense_matrix multiplication first matrix being compressed
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param sp_mat   The sparse matrix
+      * @param d_mat    The dense matrix
+      * @param result   The result matrix
+      */
+      template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<TYPE, F1> & d_mat,
+                           viennacl::matrix_base<TYPE, F2> & result) {
+        // F1/F2 layout tags are mapped to the index helpers at compile time
+        compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
+                                                      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
+                                                       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
+                                                       detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),
+
+                                                       detail::cuda_arg<TYPE>(d_mat),
+                                                       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                       detail::cuda_arg<TYPE>(result),
+                                                       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                      );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+      }
+
+
+      /** @brief Kernel for result = sp_mat * trans(d_mat): like compressed_matrix_d_mat_mul_kernel,
+      *   but the dense matrix is addressed with swapped indices (apply(col, j, ...)),
+      *   i.e. its transpose is multiplied.
+      */
+      template <typename DMatIndexT, typename ResultIndexT, typename T>
+      __global__ void compressed_matrix_d_tr_mat_mul_kernel(
+                const unsigned int * sp_mat_row_indices,
+                const unsigned int * sp_mat_col_indices,
+                const T * sp_mat_elements,
+                const T * d_mat,
+                unsigned int d_mat_row_start,
+                unsigned int d_mat_col_start,
+                unsigned int d_mat_row_inc,
+                unsigned int d_mat_col_inc,
+                unsigned int d_mat_row_size,
+                unsigned int d_mat_col_size,
+                unsigned int d_mat_internal_rows,
+                unsigned int d_mat_internal_cols,
+                T * result,
+                unsigned int result_row_start,
+                unsigned int result_col_start,
+                unsigned int result_row_inc,
+                unsigned int result_col_inc,
+                unsigned int result_row_size,
+                unsigned int result_col_size,
+                unsigned int result_internal_rows,
+                unsigned int result_internal_cols) {
+
+        for (unsigned int row  = blockIdx.x; row  < result_row_size; row += gridDim.x) {
+
+          unsigned int row_start = sp_mat_row_indices[row];
+          unsigned int row_end = sp_mat_row_indices[row+1];
+
+          for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) {
+
+            T r = 0;
+
+            // dot product of sparse row 'row' with row 'col' of d_mat (transposed access)
+            for (unsigned int k = row_start; k < row_end; k++) {
+
+              unsigned int j = sp_mat_col_indices[k];
+              T x = sp_mat_elements[k];
+              T y = d_mat[ DMatIndexT::apply(col, j,
+                                             d_mat_row_start, d_mat_row_inc,
+                                             d_mat_col_start, d_mat_col_inc,
+                                             d_mat_internal_rows, d_mat_internal_cols) ];
+
+              r += x * y;
+            }
+
+            result [ ResultIndexT::apply(row, col,
+                                         result_row_start, result_row_inc,
+                                         result_col_start, result_col_inc,
+                                         result_internal_rows, result_internal_cols) ] = r;
+          }
+        }
+
+      }
+
+      /** @brief Carries out matrix-trans(matrix) multiplication, the first matrix being compressed
+      *          (CSR) and the second a transposed dense matrix
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      *
+      * @param sp_mat             The sparse matrix
+      * @param d_mat              The transposed dense matrix proxy (d_mat.lhs() is the wrapped dense matrix)
+      * @param result             The result matrix
+      */
+      template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>,
+                                                        const viennacl::matrix_base<TYPE, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                      viennacl::matrix_base<TYPE, F2> & result) {
+
+        // mat_mult_matrix_index<F> supplies the (row, col) -> linear offset mapping for layout F
+        compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
+                                                      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
+                                                       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
+                                                       detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),
+
+                                                       detail::cuda_arg<TYPE>(d_mat.lhs()),
+                                                       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                       detail::cuda_arg<TYPE>(result),
+                                                       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                      );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+      }
+
+
+      //
+      // triangular solves for compressed_matrix
+      //
+
+      /** @brief CUDA kernel extracting the diagonal entries of a CSR matrix into a dense vector.
+      *
+      * Rows without an explicitly stored diagonal entry yield a zero in 'result'.
+      */
+      template <typename T>
+      __global__ void compressed_matrix_diagonal_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                T * result,
+                unsigned int size)
+      {
+        unsigned int const glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+        unsigned int const glb_sz = gridDim.x * blockDim.x;
+
+        for (unsigned int row = glb_id; row < size; row += glb_sz)
+        {
+          T diag_entry = (T)0;
+          unsigned int const nnz_stop = row_indices[row+1];
+          for (unsigned int nnz_index = row_indices[row]; nnz_index < nnz_stop; ++nnz_index)
+          {
+            if (column_indices[nnz_index] == row)
+            {
+              diag_entry = elements[nnz_index];
+              break;  // at most one stored entry per (row, row) position
+            }
+          }
+          result[row] = diag_entry;
+        }
+      }
+
+
+      /** @brief Inplace solve of the unit lower triangular system mat * x = vec.
+      *
+      * @param mat    The sparse system matrix
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const SparseMatrixType & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::unit_lower_tag)
+      {
+        unsigned int const num_rows = static_cast<unsigned int>(mat.size1());
+        csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                               detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                               detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                               detail::cuda_arg<ScalarType>(vec),
+                                               num_rows);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
+      }
+
+
+      /** @brief Inplace solve of the lower triangular system mat * x = vec (non-unit diagonal).
+      *
+      * @param mat    The sparse system matrix
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const SparseMatrixType & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::lower_tag)
+      {
+        unsigned int const num_rows = static_cast<unsigned int>(mat.size1());
+        csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                          detail::cuda_arg<ScalarType>(vec),
+                                          num_rows);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
+      }
+
+
+
+      /** @brief Inplace solve of the unit upper triangular system mat * x = vec.
+      *
+      * @param mat    The sparse system matrix
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const SparseMatrixType & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::unit_upper_tag)
+      {
+        unsigned int const num_rows = static_cast<unsigned int>(mat.size1());
+        csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                                detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(vec),
+                                                num_rows);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
+      }
+
+
+      /** @brief Inplace solve of the upper triangular system mat * x = vec (non-unit diagonal).
+      *
+      * @param mat    The sparse system matrix
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const SparseMatrixType & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::upper_tag)
+      {
+        unsigned int const num_rows = static_cast<unsigned int>(mat.size1());
+        csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                           detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                           detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                           detail::cuda_arg<ScalarType>(vec),
+                                           num_rows);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
+      }
+
+
+
+      // transposed
+
+      /** @brief Inplace solve of the transposed unit lower triangular system trans(mat) * x = vec.
+      *
+      * @param mat    Transposed sparse matrix proxy (mat.lhs() is the wrapped matrix)
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::unit_lower_tag)
+      {
+        unsigned int const num_rows = static_cast<unsigned int>(mat.lhs().size1());
+        csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
+                                                     detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
+                                                     detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
+                                                     detail::cuda_arg<ScalarType>(vec),
+                                                     num_rows);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
+      }
+
+
+      /** @brief Inplace solve of the transposed lower triangular system trans(mat) * x = vec
+      *         (non-unit diagonal).
+      *
+      * The diagonal entries are extracted into a temporary vector first, since the
+      * substitution kernel for the transposed CSR layout takes them as a separate argument.
+      *
+      * @param mat    Transposed sparse matrix proxy (mat.lhs() is the wrapped matrix)
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::lower_tag)
+      {
+        viennacl::vector<ScalarType> diagonal(vec.size());
+
+        // size queried from the wrapped matrix (mat.lhs()) for consistency with all other launches here
+        compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
+                                                      detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(diagonal),
+                                                      static_cast<unsigned int>(mat.lhs().size1())
+                                                     );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_diagonal_kernel");  // added: launch errors would otherwise only surface at the next check
+
+        csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
+                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(diagonal),
+                                                detail::cuda_arg<ScalarType>(vec),
+                                                static_cast<unsigned int>(mat.lhs().size1())
+                                               );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
+      }
+
+
+      /** @brief Inplace solve of the transposed unit upper triangular system trans(mat) * x = vec.
+      *
+      * @param mat    Transposed sparse matrix proxy (mat.lhs() is the wrapped matrix)
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::unit_upper_tag)
+      {
+        unsigned int const num_rows = static_cast<unsigned int>(mat.lhs().size1());
+        csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
+                                                      detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(vec),
+                                                      num_rows);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
+      }
+
+
+      /** @brief Inplace solve of the transposed upper triangular system trans(mat) * x = vec
+      *         (non-unit diagonal).
+      *
+      * The diagonal entries are extracted into a temporary vector first, since the
+      * substitution kernel for the transposed CSR layout takes them as a separate argument.
+      *
+      * @param mat    Transposed sparse matrix proxy (mat.lhs() is the wrapped matrix)
+      * @param vec    The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SparseMatrixType, class ScalarType>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
+                    viennacl::vector_base<ScalarType> & vec,
+                    viennacl::linalg::upper_tag)
+      {
+        viennacl::vector<ScalarType> diagonal(vec.size());
+
+        // size queried from the wrapped matrix (mat.lhs()) for consistency with all other launches here
+        compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
+                                                      detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(diagonal),
+                                                      static_cast<unsigned int>(mat.lhs().size1())
+                                                     );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_diagonal_kernel");  // added: launch errors would otherwise only surface at the next check
+
+        csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
+                                                 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
+                                                 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
+                                                 detail::cuda_arg<ScalarType>(diagonal),
+                                                 detail::cuda_arg<ScalarType>(vec),
+                                                 static_cast<unsigned int>(mat.lhs().size1())
+                                                );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
+      }
+
+      namespace detail
+      {
+        //
+        // block solves
+        //
+
+        /** @brief Inplace solve of trans(L) * x = vec for several independent diagonal blocks,
+        *          L unit lower triangular; the CUDA grid holds one thread block per matrix block.
+        *
+        * @param L              Transposed unit lower triangular sparse matrix proxy
+        * @param block_indices  Handle with the per-block index data consumed by the kernel
+        * @param num_blocks     Number of independent blocks (also the CUDA grid size)
+        * @param vec            Right hand side; overwritten by the solution
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & L,
+                                 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                                 vector_base<ScalarType> const & /* L_diagonal */,  //ignored
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::unit_lower_tag)
+        {
+          csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
+                                                               detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
+                                                               detail::cuda_arg<ScalarType>(L.lhs().handle().cuda_handle()),
+                                                               detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
+                                                               detail::cuda_arg<ScalarType>(vec),
+                                                               static_cast<unsigned int>(L.lhs().size1())
+                                                              );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_unit_lu_forward");  // added: every other launch in this file checks for errors
+        }
+
+
+        /** @brief Inplace solve of trans(U) * x = vec for several independent diagonal blocks,
+        *          U upper triangular with the diagonal supplied separately.
+        *
+        * @param U              Transposed upper triangular sparse matrix proxy
+        * @param block_indices  Handle with the per-block index data consumed by the kernel
+        * @param num_blocks     Number of independent blocks (also the CUDA grid size)
+        * @param U_diagonal     Diagonal entries of U
+        * @param vec            Right hand side; overwritten by the solution
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & U,
+                                 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                                 vector_base<ScalarType> const & U_diagonal,
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::upper_tag)
+        {
+          csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
+                                                           detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
+                                                           detail::cuda_arg<ScalarType>(U.lhs().handle().cuda_handle()),
+                                                           detail::cuda_arg<ScalarType>(U_diagonal.handle().cuda_handle()),
+                                                           detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
+                                                           detail::cuda_arg<ScalarType>(vec),
+                                                           static_cast<unsigned int>(U.lhs().size1())
+                                                          );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_lu_backward");  // added: every other launch in this file checks for errors
+        }
+
+
+      }
+
+
+      //
+      // Compressed Compressed Matrix
+      //
+
+      /** @brief CUDA kernel computing result = prod(mat, vec) for a compressed_compressed_matrix
+      *         (CSR storing only the nonzero rows, listed in row_indices).
+      *
+      * The first grid-stride loop zeroes the full result vector; the second
+      * writes a dot product for each row that carries nonzeros.
+      *
+      * NOTE(review): there is no grid-wide synchronization between the two
+      * loops, so a block still zeroing could in principle interleave with a
+      * block already writing dot products to the same result entries — confirm
+      * this cannot clobber a computed value.
+      */
+      template <typename T>
+      __global__ void compressed_compressed_matrix_vec_mul_kernel(
+                const unsigned int * row_jumper,
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                unsigned int nonzero_rows,
+                const T * x,
+                unsigned int start_x,
+                unsigned int inc_x,
+                T * result,
+                unsigned int start_result,
+                unsigned int inc_result,
+                unsigned int size_result)
+      {
+        // clear the result vector (rows without stored nonzeros must end up zero)
+        for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
+                          i  < size_result;
+                          i += gridDim.x * blockDim.x)
+        {
+          result[i * inc_result + start_result] = 0;
+        }
+
+        // one dot product per stored (nonzero) row; row_indices maps the
+        // compressed row slot i to the actual row index in 'result'
+        for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
+                          i  < nonzero_rows;
+                          i += gridDim.x * blockDim.x)
+        {
+          T dot_prod = (T)0;
+          unsigned int row_end = row_jumper[i+1];
+          for (unsigned int j = row_jumper[i]; j < row_end; ++j)
+            dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
+          result[row_indices[i] * inc_result + start_result] = dot_prod;
+        }
+      }
+
+
+      /** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType>
+      void prod_impl(const viennacl::compressed_compressed_matrix<ScalarType> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        unsigned int const nonzero_rows = static_cast<unsigned int>(mat.nnz1());
+        unsigned int const start_x      = static_cast<unsigned int>(vec.start());
+        unsigned int const inc_x        = static_cast<unsigned int>(vec.stride());
+        unsigned int const start_result = static_cast<unsigned int>(result.start());
+        unsigned int const inc_result   = static_cast<unsigned int>(result.stride());
+        unsigned int const size_result  = static_cast<unsigned int>(result.size());
+
+        compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
+                                                                  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
+                                                                  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                                                  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                                  nonzero_rows,
+                                                                  detail::cuda_arg<ScalarType>(vec),
+                                                                  start_x, inc_x,
+                                                                  detail::cuda_arg<ScalarType>(result),
+                                                                  start_result, inc_result, size_result);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
+      }
+
+      //
+      // Coordinate Matrix
+      //
+
+
+      namespace detail
+      {
+
+        /** @brief CUDA kernel gathering per-row information from a COO matrix.
+        *
+        * 'option' selects the quantity per row: 0 = inf-norm, 1 = 1-norm,
+        * 2 = 2-norm, 3 = diagonal entry (values mirror the case labels below).
+        * Each block processes one group of nonzeros, delimited by
+        * group_boundaries, in chunks of blockDim.x entries; partial per-row
+        * results are combined by a segmented reduction in shared memory keyed
+        * on the row index, with a carry between consecutive chunks.
+        */
+        template <typename T>
+        __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
+                                                const T * elements,
+                                                const unsigned int * group_boundaries,
+                                                T * result,
+                                                unsigned int option)
+        {
+          __shared__ unsigned int shared_rows[128];
+          __shared__ T inter_results[128];
+
+          uint2 tmp;
+          T val;
+          unsigned int last_index  = blockDim.x - 1;
+          unsigned int group_start = group_boundaries[blockIdx.x];
+          unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+          unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+          unsigned int local_index = 0;
+
+          for (unsigned int k = 0; k < k_end; ++k)
+          {
+            local_index = group_start + k * blockDim.x + threadIdx.x;
+
+            // out-of-range threads contribute a neutral (0, 0)/0 entry;
+            // for option 3 only diagonal entries (row == col) are picked up
+            tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+            val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
+
+            //check for carry from previous loop run:
+            if (threadIdx.x == 0 && k > 0)
+            {
+              if (tmp.x == shared_rows[last_index])
+              {
+                // same row continues across the chunk boundary: merge the previous partial result into val
+                switch (option)
+                {
+                  case 0: //inf-norm
+                  case 3: //diagonal entry
+                    // NOTE(review): fabs() is applied only to the carried partial, not to
+                    // val itself (the reduction below also uses a plain max) — looks
+                    // asymmetric for negative entries; confirm intended behavior
+                    val = max(val, fabs(inter_results[last_index]));
+                    break;
+
+                  case 1: //1-norm
+                    val = fabs(val) + inter_results[last_index];
+                    break;
+
+                  case 2: //2-norm
+                    val = sqrt(val * val + inter_results[last_index]);
+                    break;
+
+                  default:
+                    break;
+                }
+              }
+              else
+              {
+                // previous chunk's last row is complete: write its final value out
+                switch (option)
+                {
+                  case 0: //inf-norm
+                  case 1: //1-norm
+                  case 3: //diagonal entry
+                    result[shared_rows[last_index]] = inter_results[last_index];
+                    break;
+
+                  case 2: //2-norm
+                    result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
+                  default:   // (no break above: falls through to the empty default)
+                    break;
+                }
+              }
+            }
+
+            //segmented parallel reduction begin
+            __syncthreads();
+            shared_rows[threadIdx.x] = tmp.x;
+            switch (option)
+            {
+              case 0:
+              case 3:
+                inter_results[threadIdx.x] = val;
+                break;
+              case 1:
+                inter_results[threadIdx.x] = fabs(val);
+                break;
+              case 2:
+                inter_results[threadIdx.x] = val * val;   // accumulated as squares; sqrt applied on write-out
+              default:   // (no break above: falls through to the empty default)
+                break;
+            }
+            T left = 0;
+            __syncthreads();
+
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+              // only combine with the left neighbor if it belongs to the same row
+              left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+              __syncthreads();
+              switch (option)
+              {
+                case 0: //inf-norm
+                case 3: //diagonal entry
+                  inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
+                  break;
+
+                case 1: //1-norm
+                  inter_results[threadIdx.x] += left;
+                  break;
+
+                case 2: //2-norm
+                  inter_results[threadIdx.x] += left;
+                  break;
+
+                default:
+                  break;
+              }
+              __syncthreads();
+            }
+            //segmented parallel reduction end
+
+            // last thread of each completed row segment (except the chunk's final
+            // thread, whose value may carry over) writes the row result
+            if (threadIdx.x != last_index &&
+                shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
+                inter_results[threadIdx.x] != 0)
+            {
+              result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+            }
+
+            __syncthreads();
+          } //for k
+
+          // flush the final partial result after the last chunk
+          if (threadIdx.x == last_index && inter_results[last_index] != 0)
+            result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
+        }
+
+        /** @brief Host wrapper launching coo_row_info_extractor for a coordinate_matrix.
+        *
+        * @param mat            The COO matrix (handle12: packed (row, col) index pairs, handle3: group boundaries)
+        * @param vec            Output vector receiving one value per row
+        * @param info_selector  Forwarded as the kernel's 'option' switch (see kernel above)
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void row_info(coordinate_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
+                      vector_base<ScalarType> & vec,
+                      viennacl::linalg::detail::row_info_types info_selector)
+        {
+          coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
+                                               detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                               detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
+                                               detail::cuda_arg<ScalarType>(vec),
+                                               static_cast<unsigned int>(info_selector)
+                                              );
+          VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
+        }
+
+      } //namespace detail
+
+
+      /** @brief CUDA kernel computing result = prod(mat, vec) for a coordinate_matrix (COO layout).
+      *
+      * Each block processes one group of nonzeros (delimited by group_boundaries)
+      * in chunks of blockDim.x entries.  Per-row partial sums are combined by a
+      * segmented reduction in shared memory keyed on the row index, with a carry
+      * between consecutive chunks.  Completed row sums are assigned (not
+      * accumulated), so the caller clears 'result' before the launch (see the
+      * prod_impl wrapper below).
+      */
+      template <typename T>
+      __global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                       const T * elements,
+                                                       const unsigned int * group_boundaries,
+                                                       const T * x,
+                                                       unsigned int start_x,
+                                                       unsigned int inc_x,
+                                                             T * result,
+                                                       unsigned int start_result,
+                                                       unsigned int inc_result
+                                                       )
+      {
+        __shared__ unsigned int shared_rows[128];
+        __shared__ T inter_results[128];
+
+        uint2 tmp;
+        T val;
+        unsigned int group_start = group_boundaries[blockIdx.x];
+        unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+        unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+        unsigned int local_index = 0;
+
+        for (unsigned int k = 0; k < k_end; ++k)
+        {
+          local_index = group_start + k * blockDim.x + threadIdx.x;
+
+          // out-of-range threads contribute a neutral (0, 0)/0 entry
+          tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+          val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
+
+          //check for carry from previous loop run:
+          if (threadIdx.x == 0 && k > 0)
+          {
+            // same row continues across the chunk boundary: merge the partial sum;
+            // otherwise the previous chunk's last row is complete — write it out
+            if (tmp.x == shared_rows[blockDim.x-1])
+              val += inter_results[blockDim.x-1];
+            else
+              result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
+          }
+
+          //segmented parallel reduction begin
+          __syncthreads();
+          shared_rows[threadIdx.x] = tmp.x;
+          inter_results[threadIdx.x] = val;
+          T left = 0;
+          __syncthreads();
+
+          for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+          {
+            // only combine with the left neighbor if it belongs to the same row
+            left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+            __syncthreads();
+            inter_results[threadIdx.x] += left;
+            __syncthreads();
+          }
+          //segmented parallel reduction end
+
+          // last thread of each completed row segment writes the row sum
+          if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+              shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+          {
+            result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
+          }
+
+          __syncthreads();
+        } //for k
+
+        // flush the last row of the group
+        if (local_index + 1 == group_end)
+          result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
+      }
+
+
+      /** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix (COO storage: handle12() = packed (row, col) pairs,
+      *               handle() = nonzero values, handle3() = per-block group boundaries)
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        // Zero the output first: the kernel only writes rows for which nonzeros
+        // are encountered.
+        result.clear();
+
+        // 64 blocks of 128 threads; 128 matches the kernel's shared-array size.
+        coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                      detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
+                                                      detail::cuda_arg<ScalarType>(vec),
+                                                      static_cast<unsigned int>(vec.start()),
+                                                      static_cast<unsigned int>(vec.stride()),
+                                                      detail::cuda_arg<ScalarType>(result),
+                                                      static_cast<unsigned int>(result.start()),
+                                                      static_cast<unsigned int>(result.stride())
+                                                     );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
+      }
+
+
+
+
+      // Kernel behind prod(COO sparse matrix, dense matrix): computes `result`
+      // one output column at a time, reusing the same shared-memory segmented
+      // parallel reduction scheme as coordinate_matrix_vec_mul_kernel.
+      //
+      // DMatIndexT::apply / ResultIndexT::apply translate a (row, col) pair plus
+      // start/stride/internal-size information into a linear index — presumably
+      // abstracting the dense matrices' memory layout (row- vs column-major);
+      // confirm against mat_mult_matrix_index.
+      //
+      // The shared buffers fix the maximum usable block size at 128 threads.
+      template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
+      __global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                         const ScalarType * elements,
+                                                         const unsigned int * group_boundaries,
+                                                         const NumericT * d_mat,
+                                                         unsigned int d_mat_row_start,
+                                                         unsigned int d_mat_col_start,
+                                                         unsigned int d_mat_row_inc,
+                                                         unsigned int d_mat_col_inc,
+                                                         unsigned int d_mat_row_size,
+                                                         unsigned int d_mat_col_size,
+                                                         unsigned int d_mat_internal_rows,
+                                                         unsigned int d_mat_internal_cols,
+                                                         NumericT * result,
+                                                         unsigned int result_row_start,
+                                                         unsigned int result_col_start,
+                                                         unsigned int result_row_inc,
+                                                         unsigned int result_col_inc,
+                                                         unsigned int result_row_size,
+                                                         unsigned int result_col_size,
+                                                         unsigned int result_internal_rows,
+                                                         unsigned int result_internal_cols)
+      {
+        __shared__ unsigned int shared_rows[128];
+        __shared__ NumericT inter_results[128];
+
+        uint2 tmp;
+        NumericT val;
+        unsigned int group_start = group_boundaries[blockIdx.x];
+        unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+        unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+        unsigned int local_index = 0;
+
+        // outer loop: one pass over this block's nonzeros per output column
+        for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+        {
+          for (unsigned int k = 0; k < k_end; ++k)
+          {
+            local_index = group_start + k * blockDim.x + threadIdx.x;
+
+            // out-of-range threads contribute a neutral (row 0, value 0) entry
+            tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+            val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
+                                                                                              d_mat_row_start, d_mat_row_inc,
+                                                                                              d_mat_col_start, d_mat_col_inc,
+                                                                                              d_mat_internal_rows, d_mat_internal_cols) ] : 0;
+
+            //check for carry from previous loop run:
+            if (threadIdx.x == 0 && k > 0)
+            {
+              if (tmp.x == shared_rows[blockDim.x-1])
+                val += inter_results[blockDim.x-1];
+              else
+                result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
+                                           result_row_start, result_row_inc,
+                                           result_col_start, result_col_inc,
+                                           result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
+            }
+
+            //segmented parallel reduction begin
+            __syncthreads();
+            shared_rows[threadIdx.x] = tmp.x;
+            inter_results[threadIdx.x] = val;
+            NumericT left = 0;
+            __syncthreads();
+
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+              left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+              __syncthreads();
+              inter_results[threadIdx.x] += left;
+              __syncthreads();
+            }
+            //segmented parallel reduction end
+
+            // interior row boundary: this thread holds the completed sum for row tmp.x
+            if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+                shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+            {
+              result[ResultIndexT::apply(tmp.x, result_col,
+                                         result_row_start, result_row_inc,
+                                         result_col_start, result_col_inc,
+                                         result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+            }
+
+            __syncthreads();
+          } //for k
+
+          // flush the final row of this block's nonzero range for this column
+          if (local_index + 1 == group_end)
+            result[ResultIndexT::apply(tmp.x, result_col,
+                                       result_row_start, result_row_inc,
+                                       result_col_start, result_col_inc,
+                                       result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+        }
+      }
+
+
+      /** @brief Carries out Compressed Matrix(COO)-Dense Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      *
+      * @param sp_mat     The Sparse Matrix (Coordinate format: handle12() = packed
+      *                   (row, col) pairs, handle() = values, handle3() = group boundaries)
+      * @param d_mat      The Dense Matrix
+      * @param result     The Result Matrix
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // NOTE(review): unlike the vector variant, `result` is not cleared here;
+        // rows of the sparse matrix without nonzeros keep their previous contents
+        // — verify callers zero the result, or add result.clear() if needed.
+        // 64 blocks of 128 threads; 128 matches the kernel's shared-array size.
+        coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
+                                                        (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
+                                                         detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
+                                                         detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
+
+                                                         detail::cuda_arg<NumericT>(d_mat),
+                                                         static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                         static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                         static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                         detail::cuda_arg<NumericT>(result),
+                                                         static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                         static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                         static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                         static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                         );
+        // Surface launch/runtime failures, consistent with the sibling wrappers
+        // (was missing here while present in every other prod_impl overload).
+        VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+      }
+
+      // Kernel behind prod(COO sparse matrix, trans(dense matrix)).  Identical in
+      // structure to coordinate_matrix_d_mat_mul_kernel except that the dense
+      // operand is read with swapped indices — DMatIndexT::apply(result_col, tmp.y,
+      // ...) — which realizes the transpose without materializing it.
+      //
+      // The shared buffers fix the maximum usable block size at 128 threads.
+      template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
+      __global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+                                                           const ScalarType * elements,
+                                                           const unsigned int * group_boundaries,
+                                                           const NumericT * d_mat,
+                                                           unsigned int d_mat_row_start,
+                                                           unsigned int d_mat_col_start,
+                                                           unsigned int d_mat_row_inc,
+                                                           unsigned int d_mat_col_inc,
+                                                           unsigned int d_mat_row_size,
+                                                           unsigned int d_mat_col_size,
+                                                           unsigned int d_mat_internal_rows,
+                                                           unsigned int d_mat_internal_cols,
+                                                           NumericT * result,
+                                                           unsigned int result_row_start,
+                                                           unsigned int result_col_start,
+                                                           unsigned int result_row_inc,
+                                                           unsigned int result_col_inc,
+                                                           unsigned int result_row_size,
+                                                           unsigned int result_col_size,
+                                                           unsigned int result_internal_rows,
+                                                           unsigned int result_internal_cols)
+      {
+        __shared__ unsigned int shared_rows[128];
+        __shared__ NumericT inter_results[128];
+
+        uint2 tmp;
+        NumericT val;
+        unsigned int group_start = group_boundaries[blockIdx.x];
+        unsigned int group_end   = group_boundaries[blockIdx.x + 1];
+        unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+        unsigned int local_index = 0;
+
+        // outer loop: one pass over this block's nonzeros per output column
+        for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+        {
+          for (unsigned int k = 0; k < k_end; ++k)
+          {
+            local_index = group_start + k * blockDim.x + threadIdx.x;
+
+            // out-of-range threads contribute a neutral (row 0, value 0) entry;
+            // note the (result_col, tmp.y) index order: transposed access
+            tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+            val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
+                                                                                              d_mat_row_start, d_mat_row_inc,
+                                                                                              d_mat_col_start, d_mat_col_inc,
+                                                                                              d_mat_internal_rows, d_mat_internal_cols)] : 0;
+
+            //check for carry from previous loop run:
+            if (threadIdx.x == 0 && k > 0)
+            {
+              if (tmp.x == shared_rows[blockDim.x-1])
+                val += inter_results[blockDim.x-1];
+              else
+                result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
+                                           result_row_start, result_row_inc,
+                                           result_col_start, result_col_inc,
+                                           result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
+            }
+
+            //segmented parallel reduction begin
+            __syncthreads();
+            shared_rows[threadIdx.x] = tmp.x;
+            inter_results[threadIdx.x] = val;
+            NumericT left = 0;
+            __syncthreads();
+
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+              left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+              __syncthreads();
+              inter_results[threadIdx.x] += left;
+              __syncthreads();
+            }
+            //segmented parallel reduction end
+
+            // interior row boundary: this thread holds the completed sum for row tmp.x
+            if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+                shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+            {
+              result[ ResultIndexT::apply(tmp.x, result_col,
+                                          result_row_start, result_row_inc,
+                                          result_col_start, result_col_inc,
+                                          result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
+            }
+
+            __syncthreads();
+          } //for k
+
+          // flush the final row of this block's nonzero range for this column
+          if (local_index + 1 == group_end)
+            result[ ResultIndexT::apply(tmp.x, result_col,
+                                        result_row_start, result_row_inc,
+                                        result_col_start, result_col_inc,
+                                        result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
+        }
+      }
+
+      /** @brief Carries out Compressed Matrix(COO)-Dense Transposed Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      *
+      * The transpose is folded into the kernel's index computation, so only the
+      * untransposed operand d_mat.lhs() is handed to the device.
+      *
+      * @param sp_mat     The Sparse Matrix (Coordinate format)
+      * @param d_mat      The Dense Transposed Matrix
+      * @param result     The Result Matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
+      void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // 64 blocks of 128 threads; 128 matches the kernel's shared-array size.
+        coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
+                                                          (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
+                                                           detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
+                                                           detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
+
+                                                           detail::cuda_arg<NumericT>(d_mat.lhs()),
+                                                           static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                           static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                           static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                           detail::cuda_arg<NumericT>(result),
+                                                           static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                           static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                           static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                                          );
+
+        VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
+      }
+
+
+      //
+      // ELL Matrix
+      //
+
+      // Computes result = A * x for a sparse matrix A in ELL format.
+      //
+      // ELL storage: the column index (in `coords`) and value (in `elements`) of
+      // entry k of row r live at offset k * internal_row_num + r, hence the inner
+      // loop advances `offset` by internal_row_num per item.  Zero-valued padding
+      // entries are skipped.  A grid-stride loop covers all row_num rows for any
+      // launch configuration.
+      //
+      // col_num and aligned_items_per_row are accepted but unused here.
+      template <typename T>
+      __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
+                                                const T * elements,
+                                                const T * x,
+                                                unsigned int start_x,
+                                                unsigned int inc_x,
+                                                      T * result,
+                                                unsigned int start_result,
+                                                unsigned int inc_result,
+                                                unsigned int row_num,
+                                                unsigned int col_num,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                unsigned int aligned_items_per_row
+                                               )
+      {
+        unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+        unsigned int glb_sz = gridDim.x * blockDim.x;
+
+        for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
+        {
+          T sum = 0;
+
+          unsigned int offset = row_id;
+          for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+          {
+            T val = elements[offset];
+
+            // skip ELL padding (stored as explicit zeros)
+            if(val != (T)0)
+            {
+              int col = coords[offset];
+              sum += (x[col * inc_x + start_x] * val);
+            }
+          }
+
+          result[row_id * inc_result + start_result] = sum;
+        }
+      }
+
+
+      /** @brief Carries out matrix-vector multiplication with a ell_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix (ELL storage: handle2() = column indices,
+      *               handle() = values)
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        // Fixed 256x128 launch; the kernel's grid-stride loop covers any size1().
+        // No result.clear() needed: the kernel writes every row unconditionally.
+        ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(vec),
+                                                static_cast<unsigned int>(vec.start()),
+                                                static_cast<unsigned int>(vec.stride()),
+                                                detail::cuda_arg<ScalarType>(result),
+                                                static_cast<unsigned int>(result.start()),
+                                                static_cast<unsigned int>(result.stride()),
+                                                static_cast<unsigned int>(mat.size1()),
+                                                static_cast<unsigned int>(mat.size2()),
+                                                static_cast<unsigned int>(mat.internal_size1()),
+                                                static_cast<unsigned int>(mat.maxnnz()),
+                                                static_cast<unsigned int>(mat.internal_maxnnz())
+                                               );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
+      }
+
+      // Kernel behind prod(ELL sparse matrix, dense matrix).  Each thread computes
+      // one (row, col) entry of `result`; the flat index rc enumerates all
+      // sp_mat_row_num * d_mat_col_size output entries in a grid-stride loop, and
+      // every entry is written unconditionally (r starts at 0).
+      //
+      // ELL storage: entry k of row r lives at offset k * sp_mat_internal_row_num + r;
+      // zero-valued padding is skipped.  DMatIndexT::apply / ResultIndexT::apply
+      // map (row, col) plus start/stride/internal-size data to a linear index.
+      //
+      // sp_mat_col_num, sp_mat_aligned_items_per_row, the d_mat sizes and the
+      // result sizes other than those used above are accepted but unused here.
+      template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT >
+      __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
+                                                  const ScalarType * sp_mat_elements,
+                                                  unsigned int sp_mat_row_num,
+                                                  unsigned int sp_mat_col_num,
+                                                  unsigned int sp_mat_internal_row_num,
+                                                  unsigned int sp_mat_items_per_row,
+                                                  unsigned int sp_mat_aligned_items_per_row,
+                                                  const NumericT * d_mat,
+                                                  unsigned int d_mat_row_start,
+                                                  unsigned int d_mat_col_start,
+                                                  unsigned int d_mat_row_inc,
+                                                  unsigned int d_mat_col_inc,
+                                                  unsigned int d_mat_row_size,
+                                                  unsigned int d_mat_col_size,
+                                                  unsigned int d_mat_internal_rows,
+                                                  unsigned int d_mat_internal_cols,
+                                                  NumericT * result,
+                                                  unsigned int result_row_start,
+                                                  unsigned int result_col_start,
+                                                  unsigned int result_row_inc,
+                                                  unsigned int result_col_inc,
+                                                  unsigned int result_row_size,
+                                                  unsigned int result_col_size,
+                                                  unsigned int result_internal_rows,
+                                                  unsigned int result_internal_cols) {
+
+
+        unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+        unsigned int glb_sz = gridDim.x * blockDim.x;
+        for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz) {
+          unsigned int row = rc % sp_mat_row_num;
+          unsigned int col = rc / sp_mat_row_num;
+
+          unsigned int offset = row;
+          NumericT r = (NumericT)0;
+
+          for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) {
+
+            unsigned int j = sp_mat_coords[offset];
+            NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
+
+            // skip ELL padding (stored as explicit zeros)
+            if(x != (NumericT)0) {
+
+                NumericT y = d_mat[ DMatIndexT::apply(j, col,
+                                                      d_mat_row_start, d_mat_row_inc,
+                                                      d_mat_col_start, d_mat_col_inc,
+                                                      d_mat_internal_rows, d_mat_internal_cols) ];
+
+                r += x*y;
+              }
+            }
+          result [ ResultIndexT::apply(row, col,
+                                       result_row_start, result_row_inc,
+                                       result_col_start, result_col_inc,
+                                       result_internal_rows, result_internal_cols) ] = r;
+        }
+
+      }
+
+      /** @brief Carries out Sparse Matrix(ELL)-Dense Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      * sp_mat being in ELL format
+      *
+      * No result.clear() is required: the kernel writes every output entry
+      * unconditionally.
+      *
+      * @param sp_mat     The sparse matrix (ELL: handle2() = column indices,
+      *                   handle() = values)
+      * @param d_mat      The dense matrix
+      * @param result     The result matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // Fixed 128x128 launch; the kernel's grid-stride loop covers all entries.
+        ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
+                                                 (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
+                                                  detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
+                                                  static_cast<unsigned int>(sp_mat.size1()),
+                                                  static_cast<unsigned int>(sp_mat.size2()),
+                                                  static_cast<unsigned int>(sp_mat.internal_size1()),
+                                                  static_cast<unsigned int>(sp_mat.maxnnz()),
+                                                  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
+                                                  detail::cuda_arg<NumericT>(d_mat),
+                                                  static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+                                                  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+                                                  static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+                                                  detail::cuda_arg<NumericT>(result),
+                                                  static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                  static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                  static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                               );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
+      }
+
+      template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT >
+      __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
+                                                  const ScalarType * sp_mat_elements,
+                                                  unsigned int sp_mat_row_num,
+                                                  unsigned int sp_mat_col_num,
+                                                  unsigned int sp_mat_internal_row_num,
+                                                  unsigned int sp_mat_items_per_row,
+                                                  unsigned int sp_mat_aligned_items_per_row,
+                                                  const NumericT * d_mat,
+                                                  unsigned int d_mat_row_start,
+                                                  unsigned int d_mat_col_start,
+                                                  unsigned int d_mat_row_inc,
+                                                  unsigned int d_mat_col_inc,
+                                                  unsigned int d_mat_row_size,
+                                                  unsigned int d_mat_col_size,
+                                                  unsigned int d_mat_internal_rows,
+                                                  unsigned int d_mat_internal_cols,
+                                                  NumericT * result,
+                                                  unsigned int result_row_start,
+                                                  unsigned int result_col_start,
+                                                  unsigned int result_row_inc,
+                                                  unsigned int result_col_inc,
+                                                  unsigned int result_row_size,
+                                                  unsigned int result_col_size,
+                                                  unsigned int result_internal_rows,
+                                                  unsigned int result_internal_cols) {
+
+
+        unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+        unsigned int glb_sz = gridDim.x * blockDim.x;
+        for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz) {
+          unsigned int row = rc % sp_mat_row_num;
+          unsigned int col = rc / sp_mat_row_num;
+
+          unsigned int offset = row;
+          NumericT r = (NumericT)0;
+
+          for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) {
+
+            unsigned int j = sp_mat_coords[offset];
+            NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
+
+            if(x != (NumericT)0) {
+
+                NumericT y = d_mat[ DMatIndexT::apply(col, j,
+                                                      d_mat_row_start, d_mat_row_inc,
+                                                      d_mat_col_start, d_mat_col_inc,
+                                                      d_mat_internal_rows, d_mat_internal_cols) ];
+
+                r += x*y;
+              }
+            }
+          result [ ResultIndexT::apply(row, col,
+                                       result_row_start, result_row_inc,
+                                       result_col_start, result_col_inc,
+                                       result_internal_rows, result_internal_cols) ] = r;
+        }
+
+      }
+
+      /** @brief Carries out Sparse Matrix(ELL)-Dense Transposed Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      * sp_mat being in ELL format
+      *
+      * @param sp_mat     The sparse matrix (ELL)
+      * @param d_mat      Transposed matrix proxy; the untransposed dense operand is d_mat.lhs()
+      * @param result     The result matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // Launch with 128 blocks of 128 threads; the kernel uses a grid-stride loop, so
+        // any result size is covered. Layout parameters (start/stride/size/internal_size)
+        // are taken from the untransposed operand d_mat.lhs(); the kernel itself swaps
+        // the index roles to realize the transposition. Arguments are strictly positional.
+        ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
+                                                    (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
+                                                     detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
+                                                     static_cast<unsigned int>(sp_mat.size1()),
+                                                     static_cast<unsigned int>(sp_mat.size2()),
+                                                     static_cast<unsigned int>(sp_mat.internal_size1()),
+                                                     static_cast<unsigned int>(sp_mat.maxnnz()),
+                                                     static_cast<unsigned int>(sp_mat.internal_maxnnz()),
+
+                                                     detail::cuda_arg<NumericT>(d_mat.lhs()),
+                                                     static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+                                                     static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+                                                     static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+                                                     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+                                                     detail::cuda_arg<NumericT>(result),
+                                                     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+                                                     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+                                                     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+                                                     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+                                               );
+
+        VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
+      }
+
+      //
+      // Hybrid Matrix
+      //
+
+
+      /** @brief Kernel for the sparse matrix-vector product with a HYB (ELL + CSR remainder) matrix.
+      *
+      * Each row is processed by one work item via a grid-stride loop: first the regular
+      * ELL part of the row is accumulated, then the row's overflow entries stored in CSR.
+      */
+      template <typename T>
+      __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
+                                                const T * ell_elements,
+                                                const unsigned int * csr_rows,
+                                                const unsigned int * csr_cols,
+                                                const T * csr_elements,
+                                                const T * x,
+                                                unsigned int start_x,
+                                                unsigned int inc_x,
+                                                      T * result,
+                                                unsigned int start_result,
+                                                unsigned int inc_result,
+                                                unsigned int row_num,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                unsigned int aligned_items_per_row
+                                               )
+      {
+        unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
+        unsigned int glb_sz = gridDim.x * blockDim.x;                 // total number of threads
+
+        for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
+        {
+          T sum = 0;
+
+          // ELL part: consecutive slots of one row are internal_row_num apart in memory.
+          unsigned int offset = row_id;
+          for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+          {
+            T val = ell_elements[offset];
+
+
+            if(val != 0.0f)   // skip zero entries (ELL padding)
+            {
+              int col = ell_coords[offset];
+              sum += (x[col * inc_x + start_x] * val);   // strided read of x
+            }
+          }
+
+          // CSR part: remaining entries of this row, delimited by the row offsets.
+          unsigned int col_begin = csr_rows[row_id];
+          unsigned int col_end   = csr_rows[row_id + 1];
+
+          for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
+          {
+            sum += (x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id]);
+          }
+
+          result[row_id * inc_result + start_result] = sum;   // strided write honoring start/inc
+        }
+      }
+
+
+
+      /** @brief Carries out matrix-vector multiplication with a hyb_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix (HYB format)
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::hyb_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        // Handle mapping (by kernel argument position): handle2 = ELL column indices,
+        // handle = ELL elements, handle3 = CSR row offsets, handle4 = CSR column
+        // indices, handle5 = CSR elements. Arguments are strictly positional.
+        hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
+                                                detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
+                                                detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(mat.handle5().cuda_handle()),
+                                                detail::cuda_arg<ScalarType>(vec),
+                                                static_cast<unsigned int>(vec.start()),
+                                                static_cast<unsigned int>(vec.stride()),
+                                                detail::cuda_arg<ScalarType>(result),
+                                                static_cast<unsigned int>(result.start()),
+                                                static_cast<unsigned int>(result.stride()),
+                                                static_cast<unsigned int>(mat.size1()),
+                                                static_cast<unsigned int>(mat.internal_size1()),
+                                                static_cast<unsigned int>(mat.ell_nnz()),
+                                                static_cast<unsigned int>(mat.internal_ellnnz())
+                                               );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
+      }
+
+
+
+      /** @brief Kernel computing result = prod(mat, d_mat) with mat stored in HYB format.
+      *
+      * Every thread iterates over ALL result columns sequentially; within one column,
+      * the rows are distributed over threads via a grid-stride loop. Per row, the
+      * regular ELL part is accumulated first, then the CSR overflow entries.
+      */
+      template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
+      __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
+                                                const NumericT * ell_elements,
+                                                const unsigned int * csr_rows,
+                                                const unsigned int * csr_cols,
+                                                const NumericT * csr_elements,
+                                                unsigned int row_num,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                unsigned int aligned_items_per_row,
+                                                const NumericT * d_mat,
+                                                unsigned int d_mat_row_start,
+                                                unsigned int d_mat_col_start,
+                                                unsigned int d_mat_row_inc,
+                                                unsigned int d_mat_col_inc,
+                                                unsigned int d_mat_row_size,
+                                                unsigned int d_mat_col_size,
+                                                unsigned int d_mat_internal_rows,
+                                                unsigned int d_mat_internal_cols,
+                                                NumericT * result,
+                                                unsigned int result_row_start,
+                                                unsigned int result_col_start,
+                                                unsigned int result_row_inc,
+                                                unsigned int result_col_inc,
+                                                unsigned int result_row_size,
+                                                unsigned int result_col_size,
+                                                unsigned int result_internal_rows,
+                                                unsigned int result_internal_cols)
+      {
+        unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
+        unsigned int glb_sz = gridDim.x * blockDim.x;                 // total number of threads
+
+        for(unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+        {
+          // Grid-stride loop over the rows of the sparse factor:
+          for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
+          {
+            NumericT sum = 0;
+
+            // ELL part: consecutive slots of a row are internal_row_num apart in memory.
+            unsigned int offset = row_id;
+            for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+            {
+              NumericT val = ell_elements[offset];
+
+              if(val != 0.0f)   // skip zero entries (ELL padding)
+              {
+                sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
+                                               d_mat_row_start, d_mat_row_inc,
+                                               d_mat_col_start, d_mat_col_inc,
+                                               d_mat_internal_rows, d_mat_internal_cols)] * val;
+              }
+            }
+
+            // CSR part: remaining entries of this row, delimited by the row offsets.
+            unsigned int col_begin = csr_rows[row_id];
+            unsigned int col_end   = csr_rows[row_id + 1];
+
+            for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
+            {
+              sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
+                                             d_mat_row_start, d_mat_row_inc,
+                                             d_mat_col_start, d_mat_col_inc,
+                                             d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
+            }
+
+            result[ResultIndexT::apply(row_id, result_col,
+                                       result_row_start, result_row_inc,
+                                       result_col_start, result_col_inc,
+                                       result_internal_rows, result_internal_cols)] = sum;
+          }
+        }
+      }
+
+
+
+      /** @brief Carries out sparse (HYB) matrix - dense matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(mat, d_mat);
+      *
+      * @param mat      The sparse matrix (HYB format)
+      * @param d_mat    The dense matrix (row- or column-major)
+      * @param result   The dense result matrix (row- or column-major)
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        // Handle mapping (by kernel argument position): handle2 = ELL column indices,
+        // handle = ELL elements, handle3 = CSR row offsets, handle4 = CSR column
+        // indices, handle5 = CSR elements; then the ELL geometry, followed by the
+        // (start, stride, size, internal_size) layout descriptors of d_mat and result.
+        hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>(
+          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
+          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
+          detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
+          detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
+          static_cast<unsigned int>(mat.size1()),
+          static_cast<unsigned int>(mat.internal_size1()),
+          static_cast<unsigned int>(mat.ell_nnz()),
+          static_cast<unsigned int>(mat.internal_ellnnz()),
+
+          detail::cuda_arg<NumericT>(d_mat),
+          static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+          static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+          static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+          static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+          detail::cuda_arg<NumericT>(result),
+          static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+          static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+          static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+          static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+         );
+        // Error label now names the kernel actually launched (was "hyb_matrix_vec_mul_kernel").
+        VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
+      }
+
+
+
+      /** @brief Kernel computing result = prod(mat, trans(d_mat)) with mat stored in HYB format.
+      *
+      * Every thread iterates over ALL result columns sequentially; within one column,
+      * the rows are distributed over threads via a grid-stride loop. Since the dense
+      * factor is transposed, d_mat is indexed as (result_col, sparse_col) - i.e. with
+      * the index roles swapped compared to the non-transposed kernel.
+      */
+      template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
+      __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
+                                                const NumericT * ell_elements,
+                                                const unsigned int * csr_rows,
+                                                const unsigned int * csr_cols,
+                                                const NumericT * csr_elements,
+                                                unsigned int row_num,
+                                                unsigned int internal_row_num,
+                                                unsigned int items_per_row,
+                                                unsigned int aligned_items_per_row,
+                                                const NumericT * d_mat,
+                                                unsigned int d_mat_row_start,
+                                                unsigned int d_mat_col_start,
+                                                unsigned int d_mat_row_inc,
+                                                unsigned int d_mat_col_inc,
+                                                unsigned int d_mat_row_size,
+                                                unsigned int d_mat_col_size,
+                                                unsigned int d_mat_internal_rows,
+                                                unsigned int d_mat_internal_cols,
+                                                NumericT * result,
+                                                unsigned int result_row_start,
+                                                unsigned int result_col_start,
+                                                unsigned int result_row_inc,
+                                                unsigned int result_col_inc,
+                                                unsigned int result_row_size,
+                                                unsigned int result_col_size,
+                                                unsigned int result_internal_rows,
+                                                unsigned int result_internal_cols)
+      {
+        unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
+        unsigned int glb_sz = gridDim.x * blockDim.x;                 // total number of threads
+
+        for(unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+        {
+          // Grid-stride loop over the rows of the sparse factor:
+          for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
+          {
+            NumericT sum = 0;
+
+            // ELL part: consecutive slots of a row are internal_row_num apart in memory.
+            unsigned int offset = row_id;
+            for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+            {
+              NumericT val = ell_elements[offset];
+
+              if(val != 0.0f)   // skip zero entries (ELL padding)
+              {
+                // Transposed access: (result_col, coord) rather than (coord, result_col).
+                sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
+                                               d_mat_row_start, d_mat_row_inc,
+                                               d_mat_col_start, d_mat_col_inc,
+                                               d_mat_internal_rows, d_mat_internal_cols)] * val;
+              }
+            }
+
+            // CSR part: remaining entries of this row, delimited by the row offsets.
+            unsigned int col_begin = csr_rows[row_id];
+            unsigned int col_end   = csr_rows[row_id + 1];
+
+            for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
+            {
+              sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
+                                             d_mat_row_start, d_mat_row_inc,
+                                             d_mat_col_start, d_mat_col_inc,
+                                             d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
+            }
+
+            result[ResultIndexT::apply(row_id, result_col,
+                                       result_row_start, result_row_inc,
+                                       result_col_start, result_col_inc,
+                                       result_internal_rows, result_internal_cols)] = sum;
+          }
+        }
+      }
+
+
+
+      /** @brief Carries out sparse (HYB) matrix - transposed dense matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(mat, trans(d_mat));
+      *
+      * @param mat      The sparse matrix (HYB format)
+      * @param d_mat    Transposed matrix proxy object for the rhs dense matrix (row- or column-major)
+      * @param result   The dense result matrix (row- or column-major)
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        // Layout descriptors are taken from the untransposed operand d_mat.lhs();
+        // the kernel swaps the index roles to realize the transposition.
+        hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>(
+          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
+          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
+          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
+          detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
+          detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
+          static_cast<unsigned int>(mat.size1()),
+          static_cast<unsigned int>(mat.internal_size1()),
+          static_cast<unsigned int>(mat.ell_nnz()),
+          static_cast<unsigned int>(mat.internal_ellnnz()),
+
+          detail::cuda_arg<NumericT>(d_mat.lhs()),
+          static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+          static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+          static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+          static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+          detail::cuda_arg<NumericT>(result),
+          static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
+          static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
+          static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
+          static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+         );
+        // Error label now names the kernel actually launched (was "hyb_matrix_vec_mul_kernel").
+        VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
+      }
+
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp b/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
new file mode 100644
index 0000000..6349247
--- /dev/null
+++ b/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
@@ -0,0 +1,761 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
+    @brief Implementations of direct triangular solvers for sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+      //
+      // Compressed matrix
+      //
+
+      //
+      // non-transposed
+      //
+
+      /** @brief In-place forward substitution with a unit-lower-triangular CSR matrix.
+      *
+      * All threads cooperatively stage windows of blockDim.x nonzeros (values, column
+      * indices and the corresponding vector entries) in shared memory; thread 0 then
+      * performs the substitution serially on the staged data. No division is performed,
+      * i.e. the diagonal is treated as 1 (compare csr_lu_forward_kernel below, which
+      * divides by the diagonal entry). NOTE(review): only thread 0 updates the solution
+      * state, so the kernel appears to assume a single-block launch - confirm at call site.
+      */
+      template <typename T>
+      __global__ void csr_unit_lu_forward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int col_index_buffer[128];  // staged column indices
+        __shared__  T element_buffer[128];               // staged matrix values
+        __shared__  T vector_buffer[128];                // vector entries at window-load time
+
+        unsigned int nnz = row_indices[size];            // total number of nonzeros (last CSR offset)
+        unsigned int current_row = 0;                    // row currently being solved
+        unsigned int row_at_window_start = 0;            // rows below this were final when the window was loaded
+        T current_vector_entry = vector[0];              // running rhs/solution value of current_row
+        unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;  // nnz rounded up to full windows
+        unsigned int next_row = row_indices[1];          // nonzero index at which the next row starts
+
+        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+        {
+          //load into shared memory (coalesced access):
+          if (i < nnz)
+          {
+            element_buffer[threadIdx.x] = elements[i];
+            unsigned int tmp = column_indices[i];
+            col_index_buffer[threadIdx.x] = tmp;
+            vector_buffer[threadIdx.x] = vector[tmp];
+          }
+
+          __syncthreads();
+
+          //now a single thread does the remaining work in shared memory:
+          if (threadIdx.x == 0)
+          {
+            // traverse through all the loaded data:
+            for (unsigned int k=0; k<blockDim.x; ++k)
+            {
+              if (current_row < size && i+k == next_row) //current row is finished. Write back result
+              {
+                vector[current_row] = current_vector_entry;  // unit diagonal: no division
+                ++current_row;
+                if (current_row < size) //load next row's data
+                {
+                  next_row = row_indices[current_row+1];
+                  current_vector_entry = vector[current_row];
+                }
+              }
+
+              if (current_row < size && col_index_buffer[k] < current_row) //substitute
+              {
+                if (col_index_buffer[k] < row_at_window_start) //use recently computed results
+                  current_vector_entry -= element_buffer[k] * vector_buffer[k];
+                else if (col_index_buffer[k] < current_row) //use buffered data
+                  current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+              }
+
+            } // for k
+
+            row_at_window_start = current_row;
+          } // if (get_local_id(0) == 0)
+
+          __syncthreads();
+        } //for i
+      }
+
+
+
+      /** @brief In-place forward substitution with a lower-triangular CSR matrix (general diagonal).
+      *
+      * Same windowed shared-memory scheme as csr_unit_lu_forward_kernel, but the diagonal
+      * entry of each row is captured while scanning and the finished row is divided by it.
+      * NOTE(review): only thread 0 updates the solution state, so the kernel appears to
+      * assume a single-block launch - confirm at call site.
+      */
+      template <typename T>
+      __global__ void csr_lu_forward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int col_index_buffer[128];  // staged column indices
+        __shared__  T element_buffer[128];               // staged matrix values
+        __shared__  T vector_buffer[128];                // vector entries at window-load time
+
+        unsigned int nnz = row_indices[size];            // total number of nonzeros (last CSR offset)
+        unsigned int current_row = 0;                    // row currently being solved
+        unsigned int row_at_window_start = 0;            // rows below this were final when the window was loaded
+        T current_vector_entry = vector[0];              // running rhs/solution value of current_row
+        T diagonal_entry = 0;                            // diagonal of current_row, set when encountered in the scan
+        unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;  // nnz rounded up to full windows
+        unsigned int next_row = row_indices[1];          // nonzero index at which the next row starts
+
+        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+        {
+          //load into shared memory (coalesced access):
+          if (i < nnz)
+          {
+            element_buffer[threadIdx.x] = elements[i];
+            unsigned int tmp = column_indices[i];
+            col_index_buffer[threadIdx.x] = tmp;
+            vector_buffer[threadIdx.x] = vector[tmp];
+          }
+
+          __syncthreads();
+
+          //now a single thread does the remaining work in shared memory:
+          if (threadIdx.x == 0)
+          {
+            // traverse through all the loaded data:
+            for (unsigned int k=0; k<blockDim.x; ++k)
+            {
+              if (current_row < size && i+k == next_row) //current row is finished. Write back result
+              {
+                vector[current_row] = current_vector_entry / diagonal_entry;  // divide by captured diagonal
+                ++current_row;
+                if (current_row < size) //load next row's data
+                {
+                  next_row = row_indices[current_row+1];
+                  current_vector_entry = vector[current_row];
+                }
+              }
+
+              if (current_row < size && col_index_buffer[k] < current_row) //substitute
+              {
+                if (col_index_buffer[k] < row_at_window_start) //use recently computed results
+                  current_vector_entry -= element_buffer[k] * vector_buffer[k];
+                else if (col_index_buffer[k] < current_row) //use buffered data
+                  current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+              }
+              else if (col_index_buffer[k] == current_row)  // remember the diagonal for the division above
+                diagonal_entry = element_buffer[k];
+
+            } // for k
+
+            row_at_window_start = current_row;
+          } // if (get_local_id(0) == 0)
+
+          __syncthreads();
+        } //for i
+      }
+
+
+      /** @brief In-place backward substitution with a unit-upper-triangular CSR matrix.
+      *
+      * Mirror image of csr_unit_lu_forward_kernel: windows of blockDim.x nonzeros are
+      * staged in shared memory and walked from the LAST window towards the first, with
+      * thread 0 scanning each window back-to-front. The diagonal is treated as 1 (no
+      * division). NOTE(review): only thread 0 updates the solution state, so the kernel
+      * appears to assume a single-block launch - confirm at call site.
+      */
+      template <typename T>
+      __global__ void csr_unit_lu_backward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int col_index_buffer[128];  // staged column indices
+        __shared__  T element_buffer[128];               // staged matrix values
+        __shared__  T vector_buffer[128];                // vector entries at window-load time
+
+        unsigned int nnz = row_indices[size];            // total number of nonzeros (last CSR offset)
+        unsigned int current_row = size-1;               // start from the last row
+        unsigned int row_at_window_start = size-1;       // rows above this were final when the window was loaded
+        T current_vector_entry = vector[size-1];         // running rhs/solution value of current_row
+        unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;  // base index of the last window
+        unsigned int next_row = row_indices[size-1];     // nonzero index at which the current (last) row starts
+
+        unsigned int i = loop_end + threadIdx.x;         // walk windows from the end towards the start
+        while (1)
+        {
+          //load into shared memory (coalesced access):
+          if (i < nnz)
+          {
+            element_buffer[threadIdx.x] = elements[i];
+            unsigned int tmp = column_indices[i];
+            col_index_buffer[threadIdx.x] = tmp;
+            vector_buffer[threadIdx.x] = vector[tmp];
+          }
+
+          __syncthreads();
+
+          //now a single thread does the remaining work in shared memory:
+          if (threadIdx.x == 0)
+          {
+            // traverse through all the loaded data from back to front:
+            for (unsigned int k2=0; k2<blockDim.x; ++k2)
+            {
+              unsigned int k = (blockDim.x - k2) - 1;   // scan the window in reverse
+
+              if (i+k >= nnz)   // skip padding beyond the last nonzero
+                continue;
+
+              if (col_index_buffer[k] > row_at_window_start) //use recently computed results
+                current_vector_entry -= element_buffer[k] * vector_buffer[k];
+              else if (col_index_buffer[k] > current_row) //use buffered data
+                current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+
+              if (i+k == next_row) //current row is finished. Write back result
+              {
+                vector[current_row] = current_vector_entry;  // unit diagonal: no division
+                if (current_row > 0) //load next row's data
+                {
+                  --current_row;
+                  next_row = row_indices[current_row];
+                  current_vector_entry = vector[current_row];
+                }
+              }
+
+
+            } // for k
+
+            row_at_window_start = current_row;
+          } // if (get_local_id(0) == 0)
+
+          __syncthreads();
+
+          if (i < blockDim.x)   // first window processed: done
+            break;
+
+          i -= blockDim.x;      // step back to the previous window
+        } //for i
+      }
+
+
+
+      /** @brief In-place backward substitution x <- U^{-1} x for an upper triangular CSR
+      *         matrix U; unlike the unit-diagonal variant, each finished row is divided by
+      *         the diagonal entry picked up during the sweep.
+      *
+      * Same cooperative scheme as csr_unit_lu_backward_kernel: all threads stage a shared
+      * memory window (back-to-front over the nonzeros), then thread 0 eliminates sequentially.
+      *
+      * NOTE(review): shared buffers are 128 entries wide and blockIdx is not used, so this
+      * appears to assume a single-block launch with blockDim.x <= 128 -- confirm at call sites.
+      *
+      * @param row_indices     CSR row pointer array (size+1 entries)
+      * @param column_indices  CSR column indices
+      * @param elements        CSR nonzero values (including the diagonal)
+      * @param vector          right-hand side on entry, solution on exit
+      * @param size            number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_lu_backward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int col_index_buffer[128];
+        __shared__  T element_buffer[128];
+        __shared__  T vector_buffer[128];
+
+        unsigned int nnz = row_indices[size];
+        unsigned int current_row = size-1;
+        unsigned int row_at_window_start = size-1;
+        T current_vector_entry = vector[size-1];
+        // NOTE(review): only assigned when a nonzero with column == row is encountered; a
+        // structurally missing diagonal would leave this indeterminate at the division below
+        // -- confirm that the factorization always stores a full diagonal.
+        T diagonal_entry;
+        unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
+        unsigned int next_row = row_indices[size-1];
+
+        unsigned int i = loop_end + threadIdx.x;
+        while (1)
+        {
+          //load into shared memory (coalesced access):
+          if (i < nnz)
+          {
+            element_buffer[threadIdx.x] = elements[i];
+            unsigned int tmp = column_indices[i];
+            col_index_buffer[threadIdx.x] = tmp;
+            vector_buffer[threadIdx.x] = vector[tmp];
+          }
+
+          __syncthreads();
+
+          //now a single thread does the remaining work in shared memory:
+          if (threadIdx.x == 0)
+          {
+            // traverse through all the loaded data from back to front:
+            for (unsigned int k2=0; k2<blockDim.x; ++k2)
+            {
+              unsigned int k = (blockDim.x - k2) - 1;
+
+              if (i+k >= nnz)
+                continue;
+
+              if (col_index_buffer[k] > row_at_window_start) //use recently computed results
+                current_vector_entry -= element_buffer[k] * vector_buffer[k];
+              else if (col_index_buffer[k] > current_row) //use buffered data
+                current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+              else if (col_index_buffer[k] == current_row)
+                diagonal_entry = element_buffer[k];
+
+              if (i+k == next_row) //current row is finished. Write back result
+              {
+                vector[current_row] = current_vector_entry / diagonal_entry;
+                if (current_row > 0) //load next row's data
+                {
+                  --current_row;
+                  next_row = row_indices[current_row];
+                  current_vector_entry = vector[current_row];
+                }
+              }
+
+
+            } // for k
+
+            row_at_window_start = current_row;
+          } // if (threadIdx.x == 0)
+
+          __syncthreads();
+
+          if (i < blockDim.x)
+            break;
+
+          i -= blockDim.x;
+        } //for i
+      }
+
+
+
+      //
+      // transposed
+      //
+
+
+      /** @brief In-place forward substitution for a unit-diagonal triangular factor stored
+      *         transposed in CSR format (each CSR "row" holds the entries of one column).
+      *
+      * The whole block walks the rows sequentially; within a row, the updates to the
+      * remaining right-hand-side entries are distributed across the threads of the block.
+      *
+      * @param row_indices     CSR row pointer array (size+1 entries)
+      * @param column_indices  CSR column indices
+      * @param elements        CSR nonzero values
+      * @param vector          right-hand side on entry, solution on exit
+      * @param size            number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_trans_lu_forward_kernel2(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        for (unsigned int row = 0; row < size; ++row)
+        {
+          T pivot_value = vector[row];
+
+          unsigned int begin = row_indices[row];
+          unsigned int end   = row_indices[row + 1];
+
+          // each thread handles every blockDim.x-th nonzero of this row
+          unsigned int idx = begin + threadIdx.x;
+          while (idx < end)
+          {
+            unsigned int target = column_indices[idx];
+            if (target > row)
+              vector[target] -= pivot_value * elements[idx];
+            idx += blockDim.x;
+          }
+
+          __syncthreads();
+        }
+      }
+
+      /** @brief In-place forward substitution x <- L^{-1} x for a unit-diagonal triangular
+      *         factor stored transposed in CSR format (each CSR "row" is one column of L).
+      *
+      * Each thread owns one nonzero of the current window and first determines the row that
+      * nonzero belongs to by linearly scanning a shared-memory lookahead of row pointers;
+      * the elimination then proceeds row by row with every thread applying its own update.
+      *
+      * NOTE(review): the lookahead/index buffers are 256 entries wide and blockIdx is unused,
+      * so this appears to assume a single-block launch with blockDim.x <= 256 -- confirm.
+      *
+      * @param row_indices     CSR row pointer array (size+1 entries)
+      * @param column_indices  CSR column indices
+      * @param elements        CSR nonzero values
+      * @param vector          right-hand side on entry, solution on exit
+      * @param size            number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_trans_unit_lu_forward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int row_index_lookahead[256];
+        __shared__  unsigned int row_index_buffer[256];
+
+        unsigned int row_index;
+        unsigned int col_index;
+        T matrix_entry;
+        unsigned int nnz = row_indices[size];
+        unsigned int row_at_window_start = 0;
+        unsigned int row_at_window_end = 0;
+        // round the iteration count up so every thread takes the same number of loop trips
+        // (required because __syncthreads() is executed inside the loop)
+        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+        {
+          col_index    = (i < nnz) ? column_indices[i] : 0;
+          matrix_entry = (i < nnz) ? elements[i]       : 0;
+          row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;
+
+          __syncthreads();
+
+          if (i < nnz)
+          {
+            // linear scan: find the row owning nonzero i within the lookahead window.
+            // NOTE(review): relies on the padded tail entries (size - 1) to stop the scan
+            // inside the buffer -- confirm the window always covers the owning row.
+            unsigned int row_index_inc = 0;
+            while (i >= row_index_lookahead[row_index_inc + 1])
+              ++row_index_inc;
+            row_index = row_at_window_start + row_index_inc;
+            row_index_buffer[threadIdx.x] = row_index;
+          }
+          else
+          {
+            row_index = size+1;
+            row_index_buffer[threadIdx.x] = size - 1;
+          }
+
+          __syncthreads();
+
+          row_at_window_start = row_index_buffer[0];
+          row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+          //forward elimination
+          for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
+          {
+            T result_entry = vector[row];
+
+            if ( (row_index == row) && (col_index > row) )
+              vector[col_index] -= result_entry * matrix_entry;
+
+            __syncthreads();
+          }
+
+          row_at_window_start = row_at_window_end;
+        }
+
+      }
+
+      /** @brief In-place forward substitution for a triangular factor stored transposed in
+      *         CSR format, with explicit diagonal entries supplied separately.
+      *
+      * Same windowed scheme as csr_trans_unit_lu_forward_kernel; during elimination each
+      * update uses vector[row] / diagonal_entries[row] on the fly, and the actual division
+      * of the solution vector by the diagonal is deferred to the final loop.
+      *
+      * NOTE(review): buffers are 256 entries wide and blockIdx is unused, so this appears to
+      * assume a single-block launch with blockDim.x <= 256 -- confirm at call sites.
+      *
+      * @param row_indices       CSR row pointer array (size+1 entries)
+      * @param column_indices    CSR column indices
+      * @param elements          CSR nonzero values (off-diagonal part)
+      * @param diagonal_entries  diagonal of the factor, one entry per row
+      * @param vector            right-hand side on entry, solution on exit
+      * @param size              number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_trans_lu_forward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                const T * diagonal_entries,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int row_index_lookahead[256];
+        __shared__  unsigned int row_index_buffer[256];
+
+        unsigned int row_index;
+        unsigned int col_index;
+        T matrix_entry;
+        unsigned int nnz = row_indices[size];
+        unsigned int row_at_window_start = 0;
+        unsigned int row_at_window_end = 0;
+        // rounded up so all threads reach the in-loop __syncthreads() the same number of times
+        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+        {
+          col_index    = (i < nnz) ? column_indices[i] : 0;
+          matrix_entry = (i < nnz) ? elements[i]       : 0;
+          row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;
+
+          __syncthreads();
+
+          if (i < nnz)
+          {
+            // linear scan over the shared row-pointer lookahead to find the owning row
+            unsigned int row_index_inc = 0;
+            while (i >= row_index_lookahead[row_index_inc + 1])
+              ++row_index_inc;
+            row_index = row_at_window_start + row_index_inc;
+            row_index_buffer[threadIdx.x] = row_index;
+          }
+          else
+          {
+            row_index = size+1;
+            row_index_buffer[threadIdx.x] = size - 1;
+          }
+
+          __syncthreads();
+
+          row_at_window_start = row_index_buffer[0];
+          row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+          //forward elimination
+          for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
+          {
+            // vector[row] is not yet divided by the diagonal; the division is re-done here
+            // per update and committed for the whole vector in the final loop below
+            T result_entry = vector[row] / diagonal_entries[row];
+
+            if ( (row_index == row) && (col_index > row) )
+              vector[col_index] -= result_entry * matrix_entry;
+
+            __syncthreads();
+          }
+
+          row_at_window_start = row_at_window_end;
+        }
+
+        // final step: Divide vector by diagonal entries:
+        for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
+          vector[i] /= diagonal_entries[i];
+
+      }
+
+
+      /** @brief In-place backward substitution for a unit-diagonal triangular factor stored
+      *         transposed in CSR format, processing nonzeros from the back to the front.
+      *
+      * Mirror image of csr_trans_unit_lu_forward_kernel: each thread owns one nonzero of the
+      * current (reversed) window, finds its row via a shared row-pointer lookahead scanned
+      * downwards, and applies its update when the sequential row sweep reaches that row.
+      *
+      * NOTE(review): buffers are 256 entries wide, blockIdx is unused, and the guard
+      * 'row_at_window_start >= threadIdx.x' together with the unsigned loop bound
+      * (row_at_window_start - row_at_window_end) relies on start >= end -- confirm this
+      * invariant and blockDim.x <= 256 at the call sites.
+      *
+      * @param row_indices     CSR row pointer array (size+1 entries)
+      * @param column_indices  CSR column indices
+      * @param elements        CSR nonzero values
+      * @param vector          right-hand side on entry, solution on exit
+      * @param size            number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_trans_unit_lu_backward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int row_index_lookahead[256];
+        __shared__  unsigned int row_index_buffer[256];
+
+        unsigned int row_index;
+        unsigned int col_index;
+        T matrix_entry;
+        unsigned int nnz = row_indices[size];
+        unsigned int row_at_window_start = size;
+        unsigned int row_at_window_end;
+        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+        for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
+        {
+          unsigned int i = (nnz - i2) - 1;     // walk the nonzero array back to front
+          col_index    = (i2 < nnz) ? column_indices[i] : 0;
+          matrix_entry = (i2 < nnz) ? elements[i]       : 0;
+          row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
+
+          __syncthreads();
+
+          if (i2 < nnz)
+          {
+            // scan the lookahead downwards to find the row owning nonzero i
+            unsigned int row_index_dec = 0;
+            while (row_index_lookahead[row_index_dec] > i)
+              ++row_index_dec;
+            row_index = row_at_window_start - row_index_dec;
+            row_index_buffer[threadIdx.x] = row_index;
+          }
+          else
+          {
+            row_index = size+1;
+            row_index_buffer[threadIdx.x] = 0;
+          }
+
+          __syncthreads();
+
+          row_at_window_start = row_index_buffer[0];
+          row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+          //backward elimination
+          for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
+          {
+            unsigned int row = row_at_window_start - row2;
+            T result_entry = vector[row];
+
+            if ( (row_index == row) && (col_index < row) )
+              vector[col_index] -= result_entry * matrix_entry;
+
+            __syncthreads();
+          }
+
+          row_at_window_start = row_at_window_end;
+        }
+
+      }
+
+
+
+      /** @brief In-place backward substitution for a triangular factor stored transposed in
+      *         CSR format, with explicit diagonal entries; one block walks the rows from the
+      *         last to the first, distributing each row's updates across its threads.
+      *
+      * @param row_indices       CSR row pointer array (size+1 entries)
+      * @param column_indices    CSR column indices
+      * @param elements          CSR nonzero values (off-diagonal part)
+      * @param diagonal_entries  diagonal of the factor, one entry per row
+      * @param vector            right-hand side on entry, solution on exit
+      * @param size              number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_trans_lu_backward_kernel2(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                const T * diagonal_entries,
+                      T * vector,
+                unsigned int size)
+      {
+        T result_entry = 0;
+
+        //backward elimination, using U and D:
+        for (unsigned int row2 = 0; row2 < size; ++row2)
+        {
+          unsigned int row = (size - row2) - 1;
+          result_entry = vector[row] / diagonal_entries[row];
+
+          unsigned int row_start = row_indices[row];
+          unsigned int row_stop  = row_indices[row + 1];
+          // Bug fix: advance by blockDim.x (as in csr_trans_lu_forward_kernel2) instead of by
+          // one. The previous '++entry_index' made thread t process every entry from
+          // row_start + t to row_stop, so the threads' index ranges overlapped and the same
+          // update was applied multiple times concurrently (a data race / wrong result for
+          // any blockDim.x > 1).
+          for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
+          {
+            unsigned int col_index = column_indices[entry_index];
+            if (col_index < row)
+              vector[col_index] -= result_entry * elements[entry_index];
+          }
+
+          __syncthreads();
+
+          if (threadIdx.x == 0)
+            vector[row] = result_entry;
+        }
+      }
+
+
+      /** @brief In-place backward substitution for a triangular factor stored transposed in
+      *         CSR format with explicit diagonal entries, using the windowed back-to-front
+      *         scheme of csr_trans_unit_lu_backward_kernel.
+      *
+      * During elimination each update recomputes vector[row] / diagonal_entries[row] on the
+      * fly; the division of the full solution vector by the diagonal happens in the final loop.
+      *
+      * NOTE(review): buffers are 256 entries wide, blockIdx is unused, and the unsigned loop
+      * bound (row_at_window_start - row_at_window_end) relies on start >= end -- confirm this
+      * invariant and blockDim.x <= 256 at the call sites.
+      *
+      * @param row_indices       CSR row pointer array (size+1 entries)
+      * @param column_indices    CSR column indices
+      * @param elements          CSR nonzero values (off-diagonal part)
+      * @param diagonal_entries  diagonal of the factor, one entry per row
+      * @param vector            right-hand side on entry, solution on exit
+      * @param size              number of rows/columns of the system
+      */
+      template <typename T>
+      __global__ void csr_trans_lu_backward_kernel(
+                const unsigned int * row_indices,
+                const unsigned int * column_indices,
+                const T * elements,
+                const T * diagonal_entries,
+                      T * vector,
+                unsigned int size)
+      {
+        __shared__  unsigned int row_index_lookahead[256];
+        __shared__  unsigned int row_index_buffer[256];
+
+        unsigned int row_index;
+        unsigned int col_index;
+        T matrix_entry;
+        unsigned int nnz = row_indices[size];
+        unsigned int row_at_window_start = size;
+        unsigned int row_at_window_end;
+        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+        for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
+        {
+          unsigned int i = (nnz - i2) - 1;     // walk the nonzero array back to front
+          col_index    = (i2 < nnz) ? column_indices[i] : 0;
+          matrix_entry = (i2 < nnz) ? elements[i]       : 0;
+          row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
+
+          __syncthreads();
+
+          if (i2 < nnz)
+          {
+            // scan the lookahead downwards to find the row owning nonzero i
+            unsigned int row_index_dec = 0;
+            while (row_index_lookahead[row_index_dec] > i)
+              ++row_index_dec;
+            row_index = row_at_window_start - row_index_dec;
+            row_index_buffer[threadIdx.x] = row_index;
+          }
+          else
+          {
+            row_index = size+1;
+            row_index_buffer[threadIdx.x] = 0;
+          }
+
+          __syncthreads();
+
+          row_at_window_start = row_index_buffer[0];
+          row_at_window_end   = row_index_buffer[blockDim.x - 1];
+
+          //backward elimination
+          for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
+          {
+            unsigned int row = row_at_window_start - row2;
+            // diagonal division not yet committed to vector[row]; see final loop below
+            T result_entry = vector[row] / diagonal_entries[row];
+
+            if ( (row_index == row) && (col_index < row) )
+              vector[col_index] -= result_entry * matrix_entry;
+
+            __syncthreads();
+          }
+
+          row_at_window_start = row_at_window_end;
+        }
+
+
+        // final step: Divide vector by diagonal entries:
+        for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
+          vector[i] /= diagonal_entries[i];
+
+      }
+
+
+      /** @brief In-place forward substitution with a unit-diagonal triangular factor L stored
+      *         transposed in CSR format; each thread block independently handles one of the
+      *         column ranges described by block_offsets.
+      *
+      * @param row_jumper_L      CSR row pointer of L (L is transposed in memory)
+      * @param column_indices_L  CSR column indices of L
+      * @param elements_L        CSR nonzero values of L
+      * @param block_offsets     per-block [start, stop) column ranges (two entries per block)
+      * @param result            right-hand side on entry, solution on exit
+      * @param size              system size (not referenced in the body; kept so all solve
+      *                          kernels share a uniform parameter list)
+      */
+      template <typename T>
+      __global__ void csr_block_trans_unit_lu_forward(
+                const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
+                const unsigned int * column_indices_L,
+                const T * elements_L,
+                const unsigned int * block_offsets,
+                T * result,
+                unsigned int size)
+      {
+        unsigned int col_start = block_offsets[2*blockIdx.x];
+        unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
+        unsigned int row_start = row_jumper_L[col_start];
+        unsigned int row_stop;
+        T result_entry = 0;
+
+        // empty block: nothing to eliminate (all threads of the block return together)
+        if (col_start >= col_stop)
+          return;
+
+        //forward elimination, using L:
+        for (unsigned int col = col_start; col < col_stop; ++col)
+        {
+          result_entry = result[col];
+          row_stop = row_jumper_L[col + 1];
+          for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
+            result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
+          row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
+          __syncthreads();
+        }
+
+      } // fix: dropped stray ';' after the function body (ill-formed pre-C++11, -pedantic warning)
+
+
+      /** @brief In-place backward substitution with a triangular factor U stored transposed
+      *         in CSR format (explicit diagonal supplied separately); each thread block
+      *         independently handles one of the column ranges described by block_offsets.
+      *
+      * @param row_jumper_U      CSR row pointer of U (U is transposed in memory)
+      * @param column_indices_U  CSR column indices of U
+      * @param elements_U        CSR nonzero values of U (off-diagonal part)
+      * @param diagonal_U        diagonal of U, one entry per column
+      * @param block_offsets     per-block [start, stop) column ranges (two entries per block)
+      * @param result            right-hand side on entry, solution on exit
+      * @param size              system size (not referenced in the body; kept so all solve
+      *                          kernels share a uniform parameter list)
+      */
+      template <typename T>
+      __global__ void csr_block_trans_lu_backward(
+                const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
+                const unsigned int * column_indices_U,
+                const T * elements_U,
+                const T * diagonal_U,
+                const unsigned int * block_offsets,
+                T * result,
+                unsigned int size)
+      {
+        unsigned int col_start = block_offsets[2*blockIdx.x];
+        unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
+        unsigned int row_start;
+        unsigned int row_stop;
+        T result_entry = 0;
+
+        // empty block: nothing to eliminate (all threads of the block return together)
+        if (col_start >= col_stop)
+          return;
+
+        //backward elimination, using U and diagonal_U
+        for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
+        {
+          unsigned int col = (col_stop - iter) - 1;
+          result_entry = result[col] / diagonal_U[col];
+          row_start = row_jumper_U[col];
+          row_stop  = row_jumper_U[col + 1];
+          for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
+            result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
+          __syncthreads();
+        }
+
+        //divide result vector by diagonal:
+        for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
+          result[col] /= diagonal_U[col];
+      } // fix: dropped stray ';' after the function body (ill-formed pre-C++11, -pedantic warning)
+
+
+
+      //
+      // Coordinate Matrix
+      //
+
+
+
+
+      //
+      // ELL Matrix
+      //
+
+
+
+      //
+      // Hybrid Matrix
+      //
+
+
+
+    } // namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/cuda/vector_operations.hpp b/viennacl/linalg/cuda/vector_operations.hpp
new file mode 100644
index 0000000..62c527f
--- /dev/null
+++ b/viennacl/linalg/cuda/vector_operations.hpp
@@ -0,0 +1,2790 @@
+#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/vector_operations.hpp
+    @brief Implementations of vector operations using CUDA
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace cuda
+    {
+
+      //
+      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+      //
+
+
+      //////////////////////// av /////////////////////////////
+
+      // gpu scalar
+      /** @brief Kernel for vec1 = vec2 * alpha (or vec2 / alpha), alpha residing on the GPU.
+      *
+      * Flag bits in options2: bit 0 flips the sign of alpha, bit 1 selects division
+      * instead of multiplication. Elements are visited in a grid-wide strided loop.
+      */
+      template <typename T>
+      __global__ void av_kernel(T * vec1,
+                                unsigned int start1,
+                                unsigned int inc1,
+                                unsigned int size1,
+
+                                const T * fac2,
+                                unsigned int options2,
+                                const T * vec2,
+                                unsigned int start2,
+                                unsigned int inc2)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        const bool use_division = options2 & (1 << 1);
+        const unsigned int stride = gridDim.x * blockDim.x;
+
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
+        {
+          T value = vec2[i*inc2+start2];
+          vec1[i*inc1+start1] = use_division ? (value / alpha) : (value * alpha);
+        }
+      }
+
+      // cpu scalar
+      /** @brief Kernel for vec1 = vec2 * alpha (or vec2 / alpha), alpha passed by value.
+      *
+      * Flag bits in options2: bit 0 flips the sign of alpha, bit 1 selects division
+      * instead of multiplication. Elements are visited in a grid-wide strided loop.
+      */
+      template <typename T>
+      __global__ void av_kernel(T * vec1,
+                                unsigned int start1,
+                                unsigned int inc1,
+                                unsigned int size1,
+
+                                T fac2,
+                                unsigned int options2,
+                                const T * vec2,
+                                unsigned int start2,
+                                unsigned int inc2)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        const bool use_division = options2 & (1 << 1);
+        const unsigned int stride = gridDim.x * blockDim.x;
+
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
+        {
+          T value = vec2[i*inc2+start2];
+          vec1[i*inc1+start1] = use_division ? (value / alpha) : (value * alpha);
+        }
+      }
+
+
+
+      /** @brief Host entry point for vec1 = alpha * vec2 (optionally sign-flipped and/or
+      *         divided, as encoded by the flag arguments); dispatches to an av_kernel overload.
+      *
+      * @param vec1             result vector
+      * @param vec2             input vector
+      * @param alpha            scaling factor (CPU scalar or GPU scalar)
+      * @param len_alpha        length component packed into the options word (see detail::make_options)
+      * @param reciprocal_alpha if true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha  if true, use -alpha
+      */
+      template <typename T, typename ScalarType1>
+      void av(vector_base<T> & vec1,
+              vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef T        value_type;
+
+        // Sign-flip and reciprocal handling are done inside the kernel; the flags are
+        // forwarded via this options bitfield.
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // Fix: removed the dead 'data_alpha' computation that used to sit here. It applied
+        // the sign/reciprocal flags to a host-side copy of alpha that was never passed to
+        // the kernel, and for GPU scalars it forced a needless device->host read.
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;   // CPU scalar: kernel receives the value directly
+
+        av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                options_alpha,
+                                detail::cuda_arg<value_type>(vec2),
+                                static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+      }
+
+
+      ///////////////////// avbv //////////////////////////////////
+
+      // alpha and beta on GPU
+      /** @brief Kernel for vec1 = alpha * vec2 + beta * vec3 with both factors on the GPU.
+      *
+      * For each factor, flag bit 0 flips the sign and flag bit 1 selects division instead
+      * of multiplication. Elements are visited in a grid-wide strided loop.
+      */
+      template <typename T>
+      __global__ void avbv_kernel(T * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+
+                                  const T * fac2,
+                                  unsigned int options2,
+                                  const T * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+
+                                  const T * fac3,
+                                  unsigned int options3,
+                                  const T * vec3,
+                                  unsigned int start3,
+                                  unsigned int inc3)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        const bool divide_by_alpha = options2 & (1 << 1);
+        const bool divide_by_beta  = options3 & (1 << 1);
+        const unsigned int stride  = gridDim.x * blockDim.x;
+
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
+        {
+          T lhs = vec2[i*inc2+start2];
+          T rhs = vec3[i*inc3+start3];
+          vec1[i*inc1+start1] = (divide_by_alpha ? lhs / alpha : lhs * alpha)
+                              + (divide_by_beta  ? rhs / beta  : rhs * beta);
+        }
+      }
+
+      // alpha on CPU, beta on GPU
+      /** @brief Kernel for vec1 = alpha * vec2 + beta * vec3, alpha by value, beta on the GPU.
+      *
+      * For each factor, flag bit 0 flips the sign and flag bit 1 selects division instead
+      * of multiplication. Elements are visited in a grid-wide strided loop.
+      */
+      template <typename T>
+      __global__ void avbv_kernel(T * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+
+                                  T fac2,
+                                  unsigned int options2,
+                                  const T * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+
+                                  const T * fac3,
+                                  unsigned int options3,
+                                  const T * vec3,
+                                  unsigned int start3,
+                                  unsigned int inc3)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        const bool divide_by_alpha = options2 & (1 << 1);
+        const bool divide_by_beta  = options3 & (1 << 1);
+        const unsigned int stride  = gridDim.x * blockDim.x;
+
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
+        {
+          T lhs = vec2[i*inc2+start2];
+          T rhs = vec3[i*inc3+start3];
+          vec1[i*inc1+start1] = (divide_by_alpha ? lhs / alpha : lhs * alpha)
+                              + (divide_by_beta  ? rhs / beta  : rhs * beta);
+        }
+      }
+
+      // alpha on GPU, beta on CPU
+      /** @brief Kernel for vec1 = alpha * vec2 + beta * vec3, alpha on the GPU, beta by value.
+      *
+      * For each factor, flag bit 0 flips the sign and flag bit 1 selects division instead
+      * of multiplication. Elements are visited in a grid-wide strided loop.
+      */
+      template <typename T>
+      __global__ void avbv_kernel(T * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+
+                                  const T * fac2,
+                                  unsigned int options2,
+                                  const T * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+
+                                  T fac3,
+                                  unsigned int options3,
+                                  const T * vec3,
+                                  unsigned int start3,
+                                  unsigned int inc3)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        const bool divide_by_alpha = options2 & (1 << 1);
+        const bool divide_by_beta  = options3 & (1 << 1);
+        const unsigned int stride  = gridDim.x * blockDim.x;
+
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
+        {
+          T lhs = vec2[i*inc2+start2];
+          T rhs = vec3[i*inc3+start3];
+          vec1[i*inc1+start1] = (divide_by_alpha ? lhs / alpha : lhs * alpha)
+                              + (divide_by_beta  ? rhs / beta  : rhs * beta);
+        }
+      }
+
+      // alpha and beta on CPU
+      // Kernel for vec1 = alpha * vec2 + beta * vec3 on strided sub-vectors;
+      // element j of vecN is addressed as vecN[j*incN + startN], j in [0, size1).
+      // Bit 0 of options2/options3 negates the respective scalar; bit 1 selects
+      // division by the scalar instead of multiplication.
+      template <typename T>
+      __global__ void avbv_kernel(T * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+
+                                  T fac2,
+                                  unsigned int options2,
+                                  const T * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+
+                                  T fac3,
+                                  unsigned int options3,
+                                  const T * vec3,
+                                  unsigned int start3,
+                                  unsigned int inc3)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))   // bit 0: flip sign of alpha
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))   // bit 0: flip sign of beta
+          beta = -beta;
+
+        // The four branches below differ only in multiplying vs. dividing by
+        // alpha/beta (bit 1 of each options word); each runs a grid-stride loop.
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+      }
+
+
+
+
+      /** @brief Host dispatcher for vec1 = alpha * vec2 + beta * vec3.
+      *
+      * Sign flip and reciprocal handling are performed inside the kernel,
+      * steered by the option bits produced by detail::make_options().
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param vec2   First operand vector; alpha may be a CPU or GPU scalar
+      * @param vec3   Second operand vector; beta may be a CPU or GPU scalar
+      */
+      template <typename T, typename ScalarType1, typename ScalarType2>
+      void avbv(vector_base<T> & vec1,
+                vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef T        value_type;
+
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // Host-side copy of alpha, used by arg_reference() only when alpha is a
+        // CPU scalar (GPU scalars are passed as device pointers instead).
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+        unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<ScalarType2>::value)
+          temporary_beta = beta;
+
+
+        avbv_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                  options_alpha,
+                                  detail::cuda_arg<value_type>(vec2),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                                  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                  options_beta,
+                                  detail::cuda_arg<value_type>(vec3),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
+      }
+
+
+      ////////////////////////// avbv_v //////////////////////////////////////
+
+
+      // alpha and beta on GPU
+      // Kernel for vec1 += alpha * vec2 + beta * vec3 on strided sub-vectors;
+      // both scalars are read from device memory (fac2, fac3).
+      // Bit 0 of options2/options3 negates the scalar, bit 1 selects division
+      // by the scalar instead of multiplication.
+      template <typename T>
+      __global__ void avbv_v_kernel(T * vec1,
+                                    unsigned int start1,
+                                    unsigned int inc1,
+                                    unsigned int size1,
+
+                                    const T * fac2,
+                                    unsigned int options2,
+                                    const T * vec2,
+                                    unsigned int start2,
+                                    unsigned int inc2,
+
+                                    const T * fac3,
+                                    unsigned int options3,
+                                    const T * vec3,
+                                    unsigned int start3,
+                                    unsigned int inc3)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))   // bit 0: flip sign of alpha
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))   // bit 0: flip sign of beta
+          beta = -beta;
+
+        // Four branches: multiply vs. divide by alpha/beta (bit 1 of options).
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+      }
+
+      // alpha on CPU, beta on GPU
+      // Kernel for vec1 += alpha * vec2 + beta * vec3; alpha is passed by value,
+      // beta is read from device memory.  Option bits as in the overloads above:
+      // bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
+      template <typename T>
+      __global__ void avbv_v_kernel(T * vec1,
+                                    unsigned int start1,
+                                    unsigned int inc1,
+                                    unsigned int size1,
+
+                                    T fac2,
+                                    unsigned int options2,
+                                    const T * vec2,
+                                    unsigned int start2,
+                                    unsigned int inc2,
+
+                                    const T * fac3,
+                                    unsigned int options3,
+                                    const T * vec3,
+                                    unsigned int start3,
+                                    unsigned int inc3)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = *fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+      }
+
+      // alpha on GPU, beta on CPU
+      // Kernel for vec1 += alpha * vec2 + beta * vec3; alpha is read from device
+      // memory, beta is passed by value.  Option bits: bit 0 negates the scalar,
+      // bit 1 divides by it instead of multiplying.
+      template <typename T>
+      __global__ void avbv_v_kernel(T * vec1,
+                                    unsigned int start1,
+                                    unsigned int inc1,
+                                    unsigned int size1,
+
+                                    const T * fac2,
+                                    unsigned int options2,
+                                    const T * vec2,
+                                    unsigned int start2,
+                                    unsigned int inc2,
+
+                                    T fac3,
+                                    unsigned int options3,
+                                    const T * vec3,
+                                    unsigned int start3,
+                                    unsigned int inc3)
+      {
+        T alpha = *fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+      }
+
+      // alpha and beta on CPU
+      // Kernel for vec1 += alpha * vec2 + beta * vec3; both scalars are passed
+      // by value.  Option bits: bit 0 negates the scalar, bit 1 divides by it
+      // instead of multiplying.
+      template <typename T>
+      __global__ void avbv_v_kernel(T * vec1,
+                                    unsigned int start1,
+                                    unsigned int inc1,
+                                    unsigned int size1,
+
+                                    T fac2,
+                                    unsigned int options2,
+                                    const T * vec2,
+                                    unsigned int start2,
+                                    unsigned int inc2,
+
+                                    T fac3,
+                                    unsigned int options3,
+                                    const T * vec3,
+                                    unsigned int start3,
+                                    unsigned int inc3)
+      {
+        T alpha = fac2;
+        if (options2 & (1 << 0))
+          alpha = -alpha;
+
+        T beta = fac3;
+        if (options3 & (1 << 0))
+          beta = -beta;
+
+        if (options2 & (1 << 1))
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+        else
+        {
+          if (options3 & (1 << 1))
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+          }
+          else
+          {
+            for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                              i < size1;
+                              i += gridDim.x * blockDim.x)
+              vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+          }
+        }
+      }
+
+
+      /** @brief Host dispatcher for vec1 += alpha * vec2 + beta * vec3.
+      *
+      * Sign flip and reciprocal handling are performed inside the kernel,
+      * steered by the option bits produced by detail::make_options().
+      *
+      * @param vec1   The result vector (or -range, or -slice), accumulated into
+      * @param vec2   First operand vector; alpha may be a CPU or GPU scalar
+      * @param vec3   Second operand vector; beta may be a CPU or GPU scalar
+      */
+      template <typename T, typename ScalarType1, typename ScalarType2>
+      void avbv_v(vector_base<T> & vec1,
+                  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef T        value_type;
+
+        unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // Host-side copy of alpha, used by arg_reference() only when alpha is a
+        // CPU scalar (GPU scalars are passed as device pointers instead).
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value)
+          temporary_alpha = alpha;
+
+        unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<ScalarType2>::value)
+          temporary_beta = beta;
+
+
+        avbv_v_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                    static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                    static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                    detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                    options_alpha,
+                                    detail::cuda_arg<value_type>(vec2),
+                                    static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                                    detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                    options_beta,
+                                    detail::cuda_arg<value_type>(vec3),
+                                    static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                                    static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");  // was missing: launch errors went undetected here
+      }
+
+
+      //////////////////////////
+
+      // Kernel assigning the constant alpha to every addressed element of a
+      // strided sub-vector (element i lives at vec1[i*inc1 + start1]).
+      template <typename T>
+      __global__ void vector_assign_kernel(T * vec1,
+                                           unsigned int start1,
+                                           unsigned int inc1,
+                                           unsigned int size1,
+                                           unsigned int internal_size1,  // NOTE(review): unused inside the kernel — confirm intent
+
+                                           T alpha)
+      {
+        // NOTE(review): (i < size1) is always true under the loop condition, so
+        // the 0-branch is dead; possibly the loop was meant to run up to
+        // internal_size1 with zero-padding beyond size1 — confirm upstream.
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                          i < size1;
+                          i += gridDim.x * blockDim.x)
+          vec1[i*inc1+start1] =  (i < size1) ? alpha : 0;
+      }
+
+      /** @brief Assign a constant value to a vector (-range/-slice)
+      *
+      * @param vec1   The vector to which the value should be assigned
+      * @param alpha  The value to be assigned
+      * @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+      */
+      template <typename T, typename S1>
+      void vector_assign(vector_base<T> & vec1, const S1 & alpha, bool up_to_internal_size = false)
+      {
+        typedef T        value_type;
+
+        // Host-side copy of alpha for the by-value kernel argument when alpha is a CPU scalar.
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<S1>::value)
+          temporary_alpha = alpha;
+
+        // When clearing the whole buffer, iterate over the padded length as well.
+        unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
+
+        vector_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                           static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                           static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                           size,
+                                           static_cast<unsigned int>(vec1.internal_size()),  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+                                           detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");  // fixed: tag previously said "avbv_v_kernel"
+      }
+
+      //////////////////////////
+
+      // Kernel exchanging the i-th entries of two strided sub-vectors
+      // (element i of vecN lives at vecN[i*incN + startN]).
+      template <typename T>
+      __global__ void vector_swap_kernel(T * vec1,
+                                         unsigned int start1,
+                                         unsigned int inc1,
+                                         unsigned int size1,
+
+                                         T * vec2,
+                                         unsigned int start2,
+                                         unsigned int inc2)
+      {
+        unsigned int const stride = gridDim.x * blockDim.x;
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
+        {
+          T const entry2 = vec2[i*inc2+start2];
+          vec2[i*inc2+start2] = vec1[i*inc1+start1];
+          vec1[i*inc1+start1] = entry2;
+        }
+      }
+
+
+      /** @brief Swaps the contents of two vectors, data is copied
+      *
+      * @param vec1   The first vector (or -range, or -slice)
+      * @param vec2   The second vector (or -range, or -slice)
+      */
+      template <typename T>
+      void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
+      {
+        typedef T      value_type;
+
+        // Gather the strided-layout descriptors of both operands up front.
+        unsigned int const start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
+        unsigned int const inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
+        unsigned int const size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
+        unsigned int const start2 = static_cast<unsigned int>(viennacl::traits::start(vec2));
+        unsigned int const inc2   = static_cast<unsigned int>(viennacl::traits::stride(vec2));
+
+        vector_swap_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), start1, inc1, size1,
+                                         detail::cuda_arg<value_type>(vec2), start2, inc2);
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
+      }
+
+      ///////////////////////// Binary Elementwise operations /////////////
+
+      // Kernel for binary element-wise operations on strided sub-vectors:
+      //   op_type 2: vec1[i] = pow(vec2[i], vec3[i])
+      //   op_type 1: vec1[i] = vec2[i] / vec3[i]
+      //   op_type 0: vec1[i] = vec2[i] * vec3[i]
+      // Any other op_type value leaves vec1 untouched.
+      template <typename T>
+      __global__ void element_op_kernel(T * vec1,
+                                         unsigned int start1,
+                                         unsigned int inc1,
+                                         unsigned int size1,
+
+                                         T const * vec2,
+                                         unsigned int start2,
+                                         unsigned int inc2,
+
+                                         T const * vec3,
+                                         unsigned int start3,
+                                         unsigned int inc3,
+
+                                         unsigned int op_type
+                                       )
+      {
+        if (op_type == 2)  // element-wise power
+        {
+          for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                            i < size1;
+                            i += gridDim.x * blockDim.x)
+          {
+            vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
+          }
+        }
+        else if (op_type == 1)  // element-wise division
+        {
+          for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                            i < size1;
+                            i += gridDim.x * blockDim.x)
+          {
+            vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+          }
+        }
+        else if (op_type == 0)  // element-wise product
+        {
+          for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                            i < size1;
+                            i += gridDim.x * blockDim.x)
+          {
+            vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+          }
+        }
+      }
+
+      // Integer variant of element_op_kernel: supports only
+      //   op_type 1: vec1[i] = vec2[i] / vec3[i]
+      //   op_type 0: vec1[i] = vec2[i] * vec3[i]
+      // There is no branch for op_type 2 (power), so a power request makes this
+      // kernel a no-op — vec1 is left unmodified.
+      template <typename T>
+      __global__ void element_op_int_kernel(T * vec1,
+                                         unsigned int start1,
+                                         unsigned int inc1,
+                                         unsigned int size1,
+
+                                         T const * vec2,
+                                         unsigned int start2,
+                                         unsigned int inc2,
+
+                                         T const * vec3,
+                                         unsigned int start3,
+                                         unsigned int inc3,
+
+                                         unsigned int op_type
+                                       )
+      {
+        if (op_type == 1)  // element-wise division
+        {
+          for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                            i < size1;
+                            i += gridDim.x * blockDim.x)
+          {
+            vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+          }
+        }
+        else if (op_type == 0)  // element-wise product
+        {
+          for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                            i < size1;
+                            i += gridDim.x * blockDim.x)
+          {
+            vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+          }
+        }
+      }
+
+      /** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+      *
+      * Generic overload; dispatches to element_op_int_kernel, which implements
+      * product (op_type 0) and division (op_type 1) only.  The float/double
+      * overloads below use element_op_kernel, which additionally supports the
+      * element-wise power (op_type 2); with this kernel op_type 2 is a no-op.
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2, v3 and the operation
+      */
+      template <typename T, typename OP>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+      {
+        typedef T        value_type;
+
+        unsigned int op_type = 2; //0: product, 1: division, 2: power
+        if (viennacl::is_division<OP>::value)
+          op_type = 1;
+        else if (viennacl::is_product<OP>::value)
+          op_type = 0;
+
+        element_op_int_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                        detail::cuda_arg<value_type>(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                        detail::cuda_arg<value_type>(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                        op_type
+                                       );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");  // fixed: tag previously named element_op_kernel
+      }
+
+      /** @brief Single-precision overload of the binary element-wise operation;
+      *         uses element_op_kernel, which also supports element-wise power.
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2, v3 and the operation
+      */
+      template <typename OP>
+      void element_op(vector_base<float> & vec1,
+                      vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OP> > const & proxy)
+      {
+        typedef float        value_type;
+
+        // Operation code: 0 -> product, 1 -> division, 2 -> power (default)
+        unsigned int const op_type = viennacl::is_division<OP>::value ? 1
+                                   : (viennacl::is_product<OP>::value ? 0 : 2);
+
+        element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                        detail::cuda_arg<value_type>(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                        detail::cuda_arg<value_type>(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                        op_type
+                                       );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+      }
+
+      /** @brief Double-precision overload of the binary element-wise operation;
+      *         uses element_op_kernel, which also supports element-wise power.
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2, v3 and the operation
+      */
+      template <typename OP>
+      void element_op(vector_base<double> & vec1,
+                      vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OP> > const & proxy)
+      {
+        typedef double        value_type;
+
+        // Operation code: 0 -> product, 1 -> division, 2 -> power (default)
+        unsigned int const op_type = viennacl::is_division<OP>::value ? 1
+                                   : (viennacl::is_product<OP>::value ? 0 : 2);
+
+        element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                        detail::cuda_arg<value_type>(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                        detail::cuda_arg<value_type>(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                        op_type
+                                       );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+      }
+
+      ///////////////////////// Unary Elementwise operations /////////////
+
+// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
+// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
+
+      // acos
+      // Kernel: vec1[i] = acos(vec2[i]) element-wise on strided sub-vectors.
+      template <typename T> __global__ void vec_element_acos_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+          vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
+      }
+
+      /** @brief Implementation of the unary element-wise operation v1 = acos(v2)
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2 and the acos operation tag
+      */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_acos> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
+      }
+
+      // asin
+      // Kernel: vec1[i] = asin(vec2[i]) element-wise on strided sub-vectors.
+      template <typename T> __global__ void vec_element_asin_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+          vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
+      }
+
+      /** @brief Implementation of the unary element-wise operation v1 = asin(v2)
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2 and the asin operation tag
+      */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_asin> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
+      }
+
+
+      // atan
+      /** @brief CUDA kernel: vec1[i] = atan(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_atan_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = atan(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = atan(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_atan> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
+      }
+
+
+      // ceil
+      /** @brief CUDA kernel: vec1[i] = ceil(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_ceil_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = ceil(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = ceil(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_ceil> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
+      }
+
+
+      // cos
+      /** @brief CUDA kernel: vec1[i] = cos(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_cos_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = cos(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = cos(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cos> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
+      }
+
+
+      // cosh
+      /** @brief CUDA kernel: vec1[i] = cosh(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_cosh_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = cosh(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = cosh(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cosh> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
+      }
+
+
+      // exp
+      /** @brief CUDA kernel: vec1[i] = exp(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_exp_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = exp(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = exp(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_exp> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
+      }
+
+
+      // fabs
+      /** @brief CUDA kernel: vec1[i] = fabs(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_fabs_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = fabs(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = fabs(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_fabs> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
+      }
+
+      // abs
+      /** @brief CUDA kernel: vec1[i] = abs(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_abs_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = abs(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = abs(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_abs> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                             static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                             static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                             static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                             detail::cuda_arg<value_type>(proxy.lhs()),
+                                             static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                             static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                            );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
+      }
+
+
+
+      // floor
+      /** @brief CUDA kernel: vec1[i] = floor(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_floor_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = floor(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = floor(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_floor> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
+      }
+
+
+      // log
+      /** @brief CUDA kernel: vec1[i] = log(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_log_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = log(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = log(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
+      }
+
+
+      // log10
+      /** @brief CUDA kernel: vec1[i] = log10(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_log10_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = log10(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = log10(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log10> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
+      }
+
+
+      // sin
+      /** @brief CUDA kernel: vec1[i] = sin(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_sin_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = sin(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = sin(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sin> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
+      }
+
+
+      // sinh
+      /** @brief CUDA kernel: vec1[i] = sinh(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_sinh_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = sinh(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = sinh(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sinh> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
+      }
+
+
+      // sqrt
+      /** @brief CUDA kernel: vec1[i] = sqrt(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_sqrt_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = sqrt(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = sqrt(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sqrt> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
+      }
+
+
+      // tan
+      /** @brief CUDA kernel: vec1[i] = tan(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_tan_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = tan(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = tan(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tan> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
+      }
+
+
+      // tanh
+      /** @brief CUDA kernel: vec1[i] = tanh(vec2[i]) element-wise, honoring start/inc of both vectors. */
+      template <typename T> __global__ void vec_element_tanh_kernel(
+          T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+          T const * vec2, unsigned int start2, unsigned int inc2)
+      {
+        unsigned int const step = gridDim.x * blockDim.x;  // total threads in the grid
+        for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += step)
+          vec1[idx * inc1 + start1] = tanh(vec2[idx * inc2 + start2]);
+      }
+
+      /** @brief Host entry: computes vec1 = tanh(proxy.lhs()) element-wise on the CUDA device (128x128 launch). */
+      template <typename T>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tanh> > const & proxy)
+      {
+        typedef T        value_type;
+
+        vec_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                              detail::cuda_arg<value_type>(proxy.lhs()),
+                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                             );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
+      }
+
+
+
+      ///////////////////////// Norms and inner product ///////////////////
+
+
+      /** @brief First stage of the inner product: block b reduces its chunk of <vec1, vec2> into group_buffer[b].
+      *
+      * Block b processes entries [b*size1/gridDim.x, (b+1)*size1/gridDim.x); the per-block
+      * partial sums written to group_buffer must be summed by a follow-up kernel on the host side.
+      * Assumes blockDim.x <= 128 (shared-buffer size) and a power of two (halving reduction).
+      */
+      template <typename T>
+      __global__ void inner_prod_kernel(const T * vec1,
+                                        unsigned int start1,
+                                        unsigned int inc1,
+                                        unsigned int size1,
+                                        const T * vec2,
+                                        unsigned int start2,
+                                        unsigned int inc2,
+                                        unsigned int size2,
+                                        T * group_buffer)
+      {
+        __shared__ T tmp_buffer[128];
+        // Memory offset of this block's first entry in each vector (integer division partitions the range).
+        unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
+        unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
+
+        // Number of entries assigned to this block.
+        unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
+                                     - (  blockIdx.x * size1) / (gridDim.x);
+
+
+        // Each thread accumulates a strided slice of the block's chunk.
+        T tmp = 0;
+        for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
+          tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
+        tmp_buffer[threadIdx.x] = tmp;
+
+        // parallel reduction
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+        {
+          __syncthreads();
+          if (threadIdx.x < stride)
+            tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+        }
+
+        // Thread 0 wrote tmp_buffer[0] itself in the last step, so no further sync is needed here.
+        if (threadIdx.x == 0)
+          group_buffer[blockIdx.x] = tmp_buffer[0];
+
+      }
+
+
+
+      // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
+      /** @brief Single-block reduction for floating-point types.
+      *
+      * option == 0: max-norm (fmax over absolute values); option == 1: plain sum;
+      * option == 2: sum followed by sqrt (2-norm finalization).
+      * Assumes blockDim.x <= 128 (shared-buffer size) and a power of two; launched as <<<1, 128>>>.
+      */
+      template <typename T>
+      __global__ void vector_sum_kernel_floats(
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+                T * result)
+      {
+        __shared__ T tmp_buffer[128];
+        T thread_sum = 0;
+        for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+        {
+          if (option > 0)
+            thread_sum += vec1[i*inc1+start1];
+          else
+            thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
+        }
+
+        tmp_buffer[threadIdx.x] = thread_sum;
+
+        // Tree reduction over the shared buffer; combine op mirrors the accumulation above.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+        {
+          __syncthreads();
+          if (threadIdx.x < stride)
+          {
+            if (option > 0)
+              tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+            else
+              tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
+          }
+        }
+
+        if (threadIdx.x == 0)
+        {
+          if (option == 2)
+            *result = sqrt(tmp_buffer[0]);
+          else
+            *result = tmp_buffer[0];
+        }
+      }
+
+      /** @brief Single-block reduction for signed integer types.
+      *
+      * option == 0: maximum of absolute values; option > 0: plain sum.
+      * Assumes blockDim.x <= 128 (shared-buffer size) and a power of two; launched as <<<1, 128>>>.
+      */
+      template <typename T>
+      __global__ void vector_sum_kernel_integers(
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                unsigned int option, //0: use max, 1: just sum
+                T * result)
+      {
+        __shared__ T tmp_buffer[128];
+        T thread_sum = 0;
+        for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+        {
+          if (option > 0)
+            thread_sum += vec1[i*inc1+start1];
+          else
+            thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
+        }
+
+        tmp_buffer[threadIdx.x] = thread_sum;
+
+        // Tree reduction over the shared buffer; combine op mirrors the accumulation above.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+        {
+          __syncthreads();
+          if (threadIdx.x < stride)
+          {
+            if (option > 0)
+              tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+            else
+              tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+          }
+        }
+
+        if (threadIdx.x == 0)
+          *result = tmp_buffer[0];
+      }
+
+      /** @brief Single-block reduction for unsigned integer types.
+      *
+      * option == 0: plain maximum (no abs needed for unsigned values); option > 0: plain sum.
+      * Assumes blockDim.x <= 128 (shared-buffer size) and a power of two; launched as <<<1, 128>>>.
+      */
+      template <typename T>
+      __global__ void vector_sum_kernel_unsigned_integers(
+                const T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                unsigned int option, //0: use max, 1: just sum
+                T * result)
+      {
+        __shared__ T tmp_buffer[128];
+        T thread_sum = 0;
+        for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+        {
+          if (option > 0)
+            thread_sum += vec1[i*inc1+start1];
+          else
+            thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
+        }
+
+        tmp_buffer[threadIdx.x] = thread_sum;
+
+        // Tree reduction over the shared buffer; combine op mirrors the accumulation above.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+        {
+          __syncthreads();
+          if (threadIdx.x < stride)
+          {
+            if (option > 0)
+              tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+            else
+              tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+          }
+        }
+
+        if (threadIdx.x == 0)
+          *result = tmp_buffer[0];
+      }
+
+      namespace detail
+      {
+        /** \cond */
+        struct vector_sum_kernel_launcher_integers
+        {
+          template <typename T, typename S3>
+          static void apply(vector_base<T> const & temp,
+                            unsigned int option,
+                            S3 & result)
+          {
+            typedef T        value_type;
+            vector_sum_kernel_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
+                                                  static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                                  static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                                  static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                                  static_cast<unsigned int>(option),
+                                                  detail::cuda_arg<value_type>(result) );
+            VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+          }
+        };
+
+        struct vector_sum_kernel_launcher_unsigned_integers
+        {
+          template <typename T, typename S3>
+          static void apply(vector_base<T> const & temp,
+                            unsigned int option,
+                            S3 & result)
+          {
+            typedef T        value_type;
+            vector_sum_kernel_unsigned_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
+                                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                                            static_cast<unsigned int>(option),
+                                                            detail::cuda_arg<value_type>(result) );
+            VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+          }
+        };
+
+        /** @brief Launcher for the second reduction stage on vectors of floating point value type.
+        *
+        * Reduces the buffer of per-block partial results 'temp' into 'result'
+        * with a single block of 128 threads of vector_sum_kernel_floats.
+        */
+        struct vector_sum_kernel_launcher_floats
+        {
+          template <typename NumericT, typename ScalarT>
+          static void apply(vector_base<NumericT> const & temp,
+                            unsigned int option,
+                            ScalarT & result)
+          {
+            vector_sum_kernel_floats<<<1, 128>>>(detail::cuda_arg<NumericT>(temp),
+                                                 static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                                 static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                                 static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                                 option,
+                                                 detail::cuda_arg<NumericT>(result) );
+            VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+          }
+        };
+
+        // Compile-time dispatch from the vector's value type to the matching
+        // second-stage reduction kernel launcher. Signed integral types (and any
+        // type without a specialization below) fall back to the integer launcher.
+        template <typename T>
+        struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
+
+        // Unsigned integral types use the unsigned-integer kernel variant.
+        template <>
+        struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct vector_sum_kernel_launcher<unsigned short>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct vector_sum_kernel_launcher<unsigned int>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+        // float and double share the floating point kernel variant.
+        template <>
+        struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};
+
+        template <>
+        struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
+
+        /** \endcond */
+      }
+
+
+      //implementation of inner product:
+      //namespace {
+      /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+      *
+      * Stage 1 launches inner_prod_kernel, which writes one partial result per
+      * block into a temporary device buffer; stage 2 reduces that buffer into
+      * 'result' via the kernel launcher matching the value type T.
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the gpu)
+      */
+      template <typename T, typename S3>
+      void inner_prod_impl(vector_base<T> const & vec1,
+                           vector_base<T> const & vec2,
+                           S3 & result)
+      {
+        typedef T        value_type;
+
+        const unsigned int work_groups = 128;
+        // Deliberately NOT 'static': a static device buffer is shared by all host
+        // threads (data race on concurrent inner products) and remains bound to
+        // the CUDA context that was active on the first call.
+        viennacl::vector<value_type> temp(work_groups);
+
+        // One block per entry of 'temp' keeps grid size and buffer size in sync.
+        inner_prod_kernel<<<work_groups, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                                static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                                static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                                static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                                detail::cuda_arg<value_type>(vec2),
+                                                static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                                static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                                static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                                detail::cuda_arg<value_type>(temp)
+                                               );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+        // Stage 2: reduce the per-block partial results into 'result' (option = 1).
+        detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
+      }
+
+
+      /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+      *
+      * Device stage: inner_prod_kernel produces one partial result per block.
+      * Host stage: the partial results are copied back and accumulated here.
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the host)
+      */
+      template <typename T>
+      void inner_prod_cpu(vector_base<T> const & vec1,
+                          vector_base<T> const & vec2,
+                          T & result)
+      {
+        typedef T        value_type;
+
+        const unsigned int work_groups = 128;
+        viennacl::vector<value_type> partial_device(work_groups);
+
+        // Device stage: one partial inner product per work group.
+        inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        detail::cuda_arg<value_type>(vec2),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                        detail::cuda_arg<value_type>(partial_device)
+                                       );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+        // Host stage: fetch the partial results and accumulate them on the CPU.
+        std::vector<value_type> partial_host(work_groups);
+        viennacl::fast_copy(partial_device.begin(), partial_device.end(), partial_host.begin());
+
+        result = 0;
+        for (vcl_size_t i = 0; i < partial_host.size(); ++i)
+          result += partial_host[i];
+      }
+
+      ///////////////////////////////////
+
+#define VIENNACL_MDOT_WORKGROUP_SIZE  128
+#define VIENNACL_MDOT_WORKGROUP_NUM   128
+      // M = 2:
+      /** @brief First reduction stage for computing the two inner products <x,y0> and <x,y1> in a single pass over x.
+      *
+      * Each block processes one contiguous chunk of x, keeps one private
+      * accumulator per thread and per inner product, then reduces the
+      * per-thread sums in shared memory. Block b writes its partial result for
+      * y0 to group_results[b] and for y1 to group_results[b + gridDim.x];
+      * a second kernel (vector_multi_sum_kernel) sums the per-block partials.
+      */
+      template <typename NumericT>
+      __global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                          const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                          const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                          NumericT *group_results)
+      {
+        // Two shared-memory reduction buffers of blockDim.x entries each, one per inner product.
+        __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
+        // Round up so the whole of x is covered even if sizex does not divide evenly.
+        unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+        unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+        unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
+
+        NumericT entry_x    = 0;
+        NumericT group_sum0 = 0;
+        NumericT group_sum1 = 0;
+        for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+          entry_x     = x[i * stridex + startx];   // load only once from global memory!
+          group_sum0 += entry_x * y0[i * stride0 + start0];
+          group_sum1 += entry_x * y1[i * stride1 + start1];
+        }
+        tmp_buffer[threadIdx.x]              = group_sum0;
+        tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+
+        // parallel reduction
+        // Tree reduction in shared memory; __syncthreads() before each step makes
+        // the previous step's writes visible to all threads of the block.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+          __syncthreads();
+          if (threadIdx.x < stride) {
+            tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
+            tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+          }
+        }
+
+        // write result of group to group_results
+        // No sync needed here: thread 0 performed the final additions itself.
+        if (threadIdx.x == 0) {
+          group_results[blockIdx.x]             = tmp_buffer[0];
+          group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
+        }
+      }
+
+      // M = 3:
+      /** @brief First reduction stage for computing <x,y0>, <x,y1> and <x,y2> in a single pass over x.
+      *
+      * Same scheme as inner_prod_2_kernel, with three accumulators: block b
+      * writes its partial result for inner product k (k = 0, 1, 2) to
+      * group_results[b + k * gridDim.x].
+      */
+      template <typename NumericT>
+      __global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                          const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                          const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                          const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                          NumericT *group_results)
+      {
+        // Three shared-memory reduction buffers of blockDim.x entries each.
+        __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
+        unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+        unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+        unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+        NumericT entry_x    = 0;
+        NumericT group_sum0 = 0;
+        NumericT group_sum1 = 0;
+        NumericT group_sum2 = 0;
+        for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+          entry_x     = x[i * stridex + startx];   // load only once from global memory!
+          group_sum0 += entry_x * y0[i * stride0 + start0];
+          group_sum1 += entry_x * y1[i * stride1 + start1];
+          group_sum2 += entry_x * y2[i * stride2 + start2];
+        }
+        tmp_buffer[threadIdx.x]                  = group_sum0;
+        tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+        tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+
+        // parallel reduction
+        // Tree reduction in shared memory over all three buffers simultaneously.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+          __syncthreads();
+          if (threadIdx.x < stride) {
+            tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+            tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+            tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+          }
+        }
+
+        // write result of group to group_results
+        if (threadIdx.x == 0) {
+          group_results[blockIdx.x                ] = tmp_buffer[0];
+          group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+          group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+        }
+      }
+
+      // M = 4:
+      /** @brief First reduction stage for computing <x,y0> ... <x,y3> in a single pass over x.
+      *
+      * Same scheme as inner_prod_2_kernel, with four accumulators: block b
+      * writes its partial result for inner product k (k = 0..3) to
+      * group_results[b + k * gridDim.x].
+      */
+      template <typename NumericT>
+      __global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                          const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                          const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                          const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                          const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                          NumericT *group_results)
+      {
+        // Four shared-memory reduction buffers of blockDim.x entries each.
+        __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
+        unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+        unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+        unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+        NumericT entry_x    = 0;
+        NumericT group_sum0 = 0;
+        NumericT group_sum1 = 0;
+        NumericT group_sum2 = 0;
+        NumericT group_sum3 = 0;
+        for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+          entry_x     = x[i * stridex + startx];   // load only once from global memory!
+          group_sum0 += entry_x * y0[i * stride0 + start0];
+          group_sum1 += entry_x * y1[i * stride1 + start1];
+          group_sum2 += entry_x * y2[i * stride2 + start2];
+          group_sum3 += entry_x * y3[i * stride3 + start3];
+        }
+        tmp_buffer[threadIdx.x]                  = group_sum0;
+        tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+        tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+        tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+
+        // parallel reduction
+        // Tree reduction in shared memory over all four buffers simultaneously.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+          __syncthreads();
+          if (threadIdx.x < stride) {
+            tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+            tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+            tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+            tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+          }
+        }
+
+        // write result of group to group_results
+        if (threadIdx.x == 0) {
+          group_results[blockIdx.x                ] = tmp_buffer[0];
+          group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+          group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+          group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+        }
+      }
+
+      // M = 8:
+      /** @brief First reduction stage for computing <x,y0> ... <x,y7> in a single pass over x.
+      *
+      * Same scheme as inner_prod_2_kernel, with eight accumulators: block b
+      * writes its partial result for inner product k (k = 0..7) to
+      * group_results[b + k * gridDim.x].
+      */
+      template <typename NumericT>
+      __global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                          const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                          const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                          const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                          const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                          const NumericT *y4, unsigned int start4, unsigned int stride4,
+                                          const NumericT *y5, unsigned int start5, unsigned int stride5,
+                                          const NumericT *y6, unsigned int start6, unsigned int stride6,
+                                          const NumericT *y7, unsigned int start7, unsigned int stride7,
+                                          NumericT *group_results)
+      {
+        // Eight shared-memory reduction buffers of blockDim.x entries each.
+        __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
+        unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+        unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+        unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+        NumericT entry_x    = 0;
+        NumericT group_sum0 = 0;
+        NumericT group_sum1 = 0;
+        NumericT group_sum2 = 0;
+        NumericT group_sum3 = 0;
+        NumericT group_sum4 = 0;
+        NumericT group_sum5 = 0;
+        NumericT group_sum6 = 0;
+        NumericT group_sum7 = 0;
+        for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+          entry_x     = x[i * stridex + startx];   // load only once from global memory!
+          group_sum0 += entry_x * y0[i * stride0 + start0];
+          group_sum1 += entry_x * y1[i * stride1 + start1];
+          group_sum2 += entry_x * y2[i * stride2 + start2];
+          group_sum3 += entry_x * y3[i * stride3 + start3];
+          group_sum4 += entry_x * y4[i * stride4 + start4];
+          group_sum5 += entry_x * y5[i * stride5 + start5];
+          group_sum6 += entry_x * y6[i * stride6 + start6];
+          group_sum7 += entry_x * y7[i * stride7 + start7];
+        }
+        tmp_buffer[threadIdx.x]                  = group_sum0;
+        tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+        tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+        tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+        tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
+        tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
+        tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
+        tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
+
+        // parallel reduction
+        // Tree reduction in shared memory over all eight buffers simultaneously.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+          __syncthreads();
+          if (threadIdx.x < stride) {
+            tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+            tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+            tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+            tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+            tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
+            tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
+            tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
+            tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
+          }
+        }
+
+        // write result of group to group_results
+        if (threadIdx.x == 0) {
+          group_results[blockIdx.x                ] = tmp_buffer[0];
+          group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+          group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+          group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+          group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
+          group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
+          group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
+          group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
+        }
+      }
+
+      // Second reduction stage: block b sums the VIENNACL_MDOT_WORKGROUP_SIZE consecutive
+      // entries of 'vec1' starting at b * VIENNACL_MDOT_WORKGROUP_SIZE (the per-block
+      // partial results of one inner product) and writes the total to
+      // result[start_result + inc_result * b]. One block is launched per inner product.
+      // NOTE(review): the load index assumes blockDim.x == VIENNACL_MDOT_WORKGROUP_SIZE;
+      // the launch sites in this file use 128 threads per block, which matches.
+      template <typename T>
+      __global__ void vector_multi_sum_kernel(
+                T const * vec1,
+                T * result,
+                unsigned int start_result,
+                unsigned int inc_result)
+      {
+        __shared__ T tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
+
+        tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
+
+        // Tree reduction in shared memory; sync before each step guards the
+        // reads of the previous step's writes.
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+        {
+          __syncthreads();
+          if (threadIdx.x < stride)
+            tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+        }
+
+        if (threadIdx.x == 0)
+          result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
+      }
+
+      template <typename T>
+      void inner_prod_impl(vector_base<T> const & x,
+                           vector_tuple<T> const & vec_tuple,
+                           vector_base<T> & result)
+      {
+        typedef T        value_type;
+
+        static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
+
+        vcl_size_t current_index = 0;
+        while (vec_tuple.const_size() > current_index)
+        {
+          switch (vec_tuple.const_size() - current_index)
+          {
+            case 7:
+            case 6:
+            case 5:
+            case 4:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index);
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
+              vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
+
+              inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+                                    VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::size(x)),
+                                                                     detail::cuda_arg<value_type>(y0),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                                                     detail::cuda_arg<value_type>(y1),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y1)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y1)),
+                                                                     detail::cuda_arg<value_type>(y2),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y2)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y2)),
+                                                                     detail::cuda_arg<value_type>(y3),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y3)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y3)),
+                                                                     detail::cuda_arg<value_type>(temp)
+                                                                    );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
+              vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
+                                                                          detail::cuda_arg<value_type>(result),
+                                                                          static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
+                                                                          static_cast<unsigned int>(viennacl::traits::stride(result))
+                                                                         );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
+            }
+              current_index += 4;
+              break;
+            case 3:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index);
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
+
+              inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+                                    VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::size(x)),
+                                                                     detail::cuda_arg<value_type>(y0),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                                                     detail::cuda_arg<value_type>(y1),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y1)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y1)),
+                                                                     detail::cuda_arg<value_type>(y2),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y2)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y2)),
+                                                                     detail::cuda_arg<value_type>(temp)
+                                                                    );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
+              vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
+                                                                          detail::cuda_arg<value_type>(result),
+                                                                          static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
+                                                                          static_cast<unsigned int>(viennacl::traits::stride(result))
+                                                                         );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
+            }
+              current_index += 3;
+              break;
+            case 2:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index);
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+
+              inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+                                    VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::size(x)),
+                                                                     detail::cuda_arg<value_type>(y0),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                                                     detail::cuda_arg<value_type>(y1),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y1)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y1)),
+                                                                     detail::cuda_arg<value_type>(temp)
+                                                                    );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
+              vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
+                                                                          detail::cuda_arg<value_type>(result),
+                                                                          static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
+                                                                          static_cast<unsigned int>(viennacl::traits::stride(result))
+                                                                         );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
+            }
+              current_index += 2;
+              break;
+            case 1:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index);
+              inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(x),
+                                              static_cast<unsigned int>(viennacl::traits::start(x)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                              static_cast<unsigned int>(viennacl::traits::size(x)),
+                                              detail::cuda_arg<value_type>(y0),
+                                              static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                              static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                              static_cast<unsigned int>(viennacl::traits::size(y0)),
+                                              detail::cuda_arg<value_type>(temp)
+                                             );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+              vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
+                                                  detail::cuda_arg<value_type>(result),
+                                                  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
+                                                  static_cast<unsigned int>(viennacl::traits::stride(result))
+                                                 );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
+            }
+              current_index += 1;
+              break;
+
+            default:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index);
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
+              vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
+              vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
+              vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
+              vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
+              vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
+
+              inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+                                    VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                                                     static_cast<unsigned int>(viennacl::traits::size(x)),
+                                                                     detail::cuda_arg<value_type>(y0),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                                                     detail::cuda_arg<value_type>(y1),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y1)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y1)),
+                                                                     detail::cuda_arg<value_type>(y2),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y2)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y2)),
+                                                                     detail::cuda_arg<value_type>(y3),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y3)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y3)),
+                                                                     detail::cuda_arg<value_type>(y4),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y4)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y4)),
+                                                                     detail::cuda_arg<value_type>(y5),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y5)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y5)),
+                                                                     detail::cuda_arg<value_type>(y6),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y6)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y6)),
+                                                                     detail::cuda_arg<value_type>(y7),
+                                                                     static_cast<unsigned int>(viennacl::traits::start(y7)),
+                                                                     static_cast<unsigned int>(viennacl::traits::stride(y7)),
+                                                                     detail::cuda_arg<value_type>(temp)
+                                                                    );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
+              vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
+                                                                          detail::cuda_arg<value_type>(result),
+                                                                          static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
+                                                                          static_cast<unsigned int>(viennacl::traits::stride(result))
+                                                                         );
+              VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
+            }
+              current_index += 8;
+              break;
+          }
+        }
+      }
+
+#undef VIENNACL_MDOT_WORKGROUP_NUM
+#undef VIENNACL_MDOT_WORKGROUP_SIZE
+
+      ///////////////////////////////////
+
+      /** @brief Partial norm computation for floating point vectors.
+      *
+      * Each CUDA block reduces one contiguous chunk of the (strided) vector into
+      * shared memory and writes a single partial result to group_buffer[blockIdx.x];
+      * the final reduction over blocks is performed by a separate kernel or on the host.
+      * The tree reduction assumes blockDim.x is a power of two and at most 128
+      * (the size of tmp_buffer); launchers in this file use <<<128, 128>>>.
+      *
+      * norm_selector: 0 -> sup-norm, 1 -> l^1-norm, 2 -> l^2-norm (only the sum of
+      * squares is produced here; the square root is taken by the caller).
+      */
+      template <typename T>
+      __global__ void norm_kernel_floats(
+                 const T * vec,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                unsigned int norm_selector,
+                T * group_buffer)
+      {
+        __shared__ T tmp_buffer[128];
+
+        T tmp = 0;
+        // Partition [0, size1) into one contiguous chunk per block:
+        unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
+        unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
+        unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
+        group_stop = (group_stop > size1) ? size1 : group_stop;
+
+        if (norm_selector == 1) //norm_1
+        {
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+            tmp += fabs(vec[i*inc1 + start1]);
+        }
+        else if (norm_selector == 2) //norm_2
+        {
+          T vec_entry = 0;
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+          {
+            vec_entry = vec[i*inc1 + start1];
+            tmp += vec_entry * vec_entry;
+          }
+        }
+        else if (norm_selector == 0) //norm_inf
+        {
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+            tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
+        }
+
+        // Per-thread partial result into shared memory; the __syncthreads() at the
+        // top of each reduction iteration makes these writes visible block-wide.
+        tmp_buffer[threadIdx.x] = tmp;
+
+        if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
+        {
+          for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+          {
+            __syncthreads();
+            if (threadIdx.x < stride)
+              tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+          }
+        }
+        else
+        {
+          //norm_inf: same tree reduction, but combining with max instead of sum
+          for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+          {
+            __syncthreads();
+            if (threadIdx.x < stride)
+              tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
+          }
+        }
+
+        // One partial result per block:
+        if (threadIdx.x == 0)
+          group_buffer[blockIdx.x] = tmp_buffer[0];
+      }
+
+      /** @brief Partial norm computation for signed integer vectors.
+      *
+      * Same block-per-chunk scheme as norm_kernel_floats, using integer abs().
+      * Note: there is no norm_selector == 2 branch here, so requesting norm_2
+      * leaves every partial result at 0 (norm_2 is not supported for this type).
+      * The tree reduction assumes blockDim.x is a power of two and <= 128.
+      */
+      template <typename T>
+      __global__ void norm_kernel_integers(
+                 const T * vec,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                unsigned int norm_selector,
+                T * group_buffer)
+      {
+        __shared__ T tmp_buffer[128];
+
+        T tmp = 0;
+        // Partition [0, size1) into one contiguous chunk per block:
+        unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
+        unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
+        unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
+        group_stop = (group_stop > size1) ? size1 : group_stop;
+
+        if (norm_selector == 1) //norm_1
+        {
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+            tmp += abs(vec[i*inc1 + start1]);
+        }
+        else if (norm_selector == 0) //norm_inf
+        {
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+            tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
+        }
+
+        tmp_buffer[threadIdx.x] = tmp;
+
+        if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
+        {
+          for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+          {
+            __syncthreads();
+            if (threadIdx.x < stride)
+              tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+          }
+        }
+        else
+        {
+          //norm_inf: tree reduction with max (spelled out via ?: for integer types)
+          for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+          {
+            __syncthreads();
+            if (threadIdx.x < stride)
+              tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
+          }
+        }
+
+        // One partial result per block:
+        if (threadIdx.x == 0)
+          group_buffer[blockIdx.x] = tmp_buffer[0];
+      }
+
+      /** @brief Partial norm computation for unsigned integer vectors.
+      *
+      * Identical to norm_kernel_integers except that no abs() is needed
+      * (values are already non-negative). As with the signed variant, there is
+      * no norm_selector == 2 branch: requesting norm_2 yields 0 partials.
+      * The tree reduction assumes blockDim.x is a power of two and <= 128.
+      */
+      template <typename T>
+      __global__ void norm_kernel_unsigned_integers(
+                 const T * vec,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                unsigned int norm_selector,
+                T * group_buffer)
+      {
+        __shared__ T tmp_buffer[128];
+
+        T tmp = 0;
+        // Partition [0, size1) into one contiguous chunk per block:
+        unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
+        unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
+        unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
+        group_stop = (group_stop > size1) ? size1 : group_stop;
+
+        if (norm_selector == 1) //norm_1
+        {
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+            tmp += vec[i*inc1 + start1];
+        }
+        else if (norm_selector == 0) //norm_inf
+        {
+          for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
+            tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
+        }
+
+        tmp_buffer[threadIdx.x] = tmp;
+
+        if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
+        {
+          for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+          {
+            __syncthreads();
+            if (threadIdx.x < stride)
+              tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+          }
+        }
+        else
+        {
+          //norm_inf: tree reduction with max (spelled out via ?:)
+          for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+          {
+            __syncthreads();
+            if (threadIdx.x < stride)
+              tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
+          }
+        }
+
+        // One partial result per block:
+        if (threadIdx.x == 0)
+          group_buffer[blockIdx.x] = tmp_buffer[0];
+      }
+
+      /** \cond */
+      namespace detail
+      {
+        // Launcher functors selecting the appropriate norm kernel per element type.
+        // Each launches 128 blocks of 128 threads, writing 128 per-block partial
+        // results into 'temp'; 'option' is forwarded as the kernel's norm_selector
+        // (0: norm_inf, 1: norm_1, 2: norm_2 where supported).
+
+        // For signed integral element types (uses integer abs in the kernel):
+        struct norm_kernel_launcher_integers
+        {
+          template <typename T>
+          static void apply(vector_base<T> const & vec1,
+                            vector_base<T> & temp,
+                            unsigned int option)
+          {
+            typedef T        value_type;
+            norm_kernel_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                               static_cast<unsigned int>(option),
+                                               detail::cuda_arg<value_type>(temp)
+                                              );
+            VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
+          }
+        };
+
+        // For unsigned integral element types (no abs needed):
+        struct norm_kernel_launcher_unsigned_integers
+        {
+          template <typename T>
+          static void apply(vector_base<T> const & vec1,
+                            vector_base<T> & temp,
+                            unsigned int option)
+          {
+            typedef T        value_type;
+            norm_kernel_unsigned_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                                       static_cast<unsigned int>(option),
+                                                       detail::cuda_arg<value_type>(temp)
+                                                      );
+            VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
+          }
+        };
+
+
+        // For float/double element types (uses fabs/fmax, supports norm_2):
+        struct norm_kernel_launcher_floats
+        {
+          template <typename T>
+          static void apply(vector_base<T> const & vec1,
+                            vector_base<T> & temp,
+                            unsigned int option)
+          {
+            typedef T        value_type;
+            norm_kernel_floats<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                             static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                             static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                             static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                             static_cast<unsigned int>(option),
+                                             detail::cuda_arg<value_type>(temp)
+                                            );
+            VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
+          }
+        };
+
+        // Dispatch on element type: signed integers are the default; unsigned and
+        // floating point types are explicit specializations below.
+        template <typename T>
+        struct norm_kernel_launcher : public norm_kernel_launcher_integers {};
+
+        template <>
+        struct norm_kernel_launcher<unsigned char>  : public norm_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct norm_kernel_launcher<unsigned short>  : public norm_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct norm_kernel_launcher<unsigned int>  : public norm_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct norm_kernel_launcher<unsigned long>  : public norm_kernel_launcher_unsigned_integers {};
+
+        template <>
+        struct norm_kernel_launcher<float>  : public norm_kernel_launcher_floats {};
+
+        template <>
+        struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
+
+      }
+      /** \endcond */
+
+
+      /** @brief Computes the l^1-norm of a vector
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_1_impl(vector_base<T> const & vec1,
+                       scalar<T> & result)
+      {
+        typedef T        value_type;
+
+        // Stage 1: 128 per-block partial sums of |x_i| into 'temp' (option 1 = norm_1);
+        // Stage 2: reduce the 128 partials into the GPU scalar 'result'.
+        vcl_size_t work_groups = 128;
+        viennacl::vector<value_type> temp(work_groups);
+
+        detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
+        detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
+      }
+
+      /** @brief Computes the l^1-norm of a vector
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_1_cpu(vector_base<T> const & vec1,
+                      T & result)
+      {
+        typedef T        value_type;
+
+        // Stage 1 on the GPU: 128 per-block partial sums of |x_i| (option 1 = norm_1).
+        vcl_size_t work_groups = 128;
+        viennacl::vector<value_type> temp(work_groups);
+
+        detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
+
+        // Now copy partial results from GPU back to CPU and run reduction there:
+        std::vector<value_type> temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result += *it;
+      }
+
+      ///// norm_2
+
+      /** @brief Computes the l^2-norm of a vector - implementation
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_2_impl(vector_base<T> const & vec1,
+                       scalar<T> & result)
+      {
+        typedef T       value_type;
+
+        // Stage 1: 128 per-block partial sums of x_i^2 (option 2 = norm_2);
+        // Stage 2: final reduction into 'result' (option 2 — the square root is
+        // expected to be applied in that summation stage, not here).
+        vcl_size_t work_groups = 128;
+        viennacl::vector<value_type> temp(work_groups);
+
+        detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
+
+        detail::vector_sum_kernel_launcher<T>::apply(temp, 2, result);
+      }
+
+      /** @brief Computes the l^2-norm of a vector - implementation
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_2_cpu(vector_base<T> const & vec1,
+                      T & result)
+      {
+        typedef T        value_type;
+
+        // Stage 1 on the GPU: 128 per-block partial sums of squares (option 2 = norm_2).
+        vcl_size_t work_groups = 128;
+        viennacl::vector<value_type> temp(work_groups);
+
+        detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
+
+        // Sum the partials on the CPU and take the square root of the total:
+        std::vector<value_type> temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result += *it;
+        result = std::sqrt(result);
+      }
+
+
+      ////// norm_inf
+
+      /** @brief Computes the supremum-norm of a vector
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_inf_impl(vector_base<T> const & vec1,
+                         scalar<T> & result)
+      {
+        typedef T      value_type;
+
+        // Stage 1: 128 per-block partial maxima of |x_i| (option 0 = norm_inf);
+        // Stage 2: reduce the partials into 'result' (option 0 — presumably a
+        // max-reduction in the summation kernel; defined elsewhere in this file).
+        vcl_size_t work_groups = 128;
+        viennacl::vector<value_type> temp(work_groups);
+
+        detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
+        detail::vector_sum_kernel_launcher<T>::apply(temp, 0, result);
+      }
+
+
+
+      /** @brief Computes the supremum-norm of a vector
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_inf_cpu(vector_base<T> const & vec1,
+                        T & result)
+      {
+        typedef T        value_type;
+
+        // Stage 1 on the GPU: 128 per-block partial maxima of |x_i| (option 0 = norm_inf).
+        vcl_size_t work_groups = 128;
+        viennacl::vector<value_type> temp(work_groups);
+
+        detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
+
+        // Take the maximum of the partials on the CPU:
+        std::vector<value_type> temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result = std::max(result, *it);
+      }
+
+
+      //////////////////////////////////////
+
+
+
+      //index_norm_inf:
+
+      // fixes the problem of not having (f)abs available in a consistent manner
+      // Generic absolute value for device code; the unsigned overloads are
+      // identities so that cuda_abs can be applied uniformly to any element type
+      // without triggering overload ambiguities or sign warnings.
+      template <typename T>
+      __device__ T              cuda_abs(T val) { return (val < 0) ? -val : val; }
+      __device__ inline unsigned long  cuda_abs(unsigned long  val) { return val; }
+      __device__ inline unsigned int   cuda_abs(unsigned int   val) { return val; }
+      __device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
+      __device__ inline unsigned char  cuda_abs(unsigned char  val) { return val; }
+
+      /** @brief Single-block kernel finding the index of an entry with maximal
+      *         absolute value in a strided vector (launched as <<<1, 128>>>).
+      *
+      * Each thread scans a strided subset of the vector, keeping the index of the
+      * first maximum it sees; a shared-memory tree reduction then combines the
+      * per-thread candidates. The reduction assumes blockDim.x is a power of two
+      * and <= 128. NOTE(review): on exact ties between two candidates the strict
+      * '<' keeps the lower-half thread's entry, which is not necessarily the
+      * smallest vector index — "first occurring" below is approximate.
+      */
+      template <typename T>
+      __global__ void index_norm_inf_kernel(const T * vec,
+                                            unsigned int start1,
+                                            unsigned int inc1,
+                                            unsigned int size1,
+                                            unsigned int * result)
+      {
+        __shared__ T float_buffer[128];
+        __shared__ unsigned int index_buffer[128];
+
+        float_buffer[threadIdx.x] = 0;
+        index_buffer[threadIdx.x] = 0;
+
+        //step 1: fill buffer:
+        T cur_max = (T)0;
+        T tmp;
+        for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
+        {
+          tmp = vec[i*inc1+start1];
+          tmp = cuda_abs(tmp);
+          if (cur_max < tmp)   // strict '<' keeps the earliest maximum per thread
+          {
+            float_buffer[threadIdx.x] = tmp;
+            index_buffer[threadIdx.x] = i;
+            cur_max = tmp;
+          }
+        }
+
+        //step 2: parallel reduction:
+        for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+        {
+          __syncthreads();
+          if (threadIdx.x < stride)
+          {
+            //find the first occurring index
+            if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
+            {
+              index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
+              float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
+            }
+          }
+        }
+
+        // Winning index ends up in slot 0:
+        if (threadIdx.x == 0)
+          *result = index_buffer[0];
+      }
+
+      //This function should return a CPU scalar, otherwise statements like
+      // vcl_rhs[index_norm_inf(vcl_rhs)]
+      // are ambiguous
+      /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+      *
+      * @param vec1 The vector
+      * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+      */
+      template <typename T>
+      vcl_size_t index_norm_inf(vector_base<T> const & vec1)
+      {
+        typedef T       value_type;
+
+        // Allocate a single unsigned int on the device to receive the index:
+        viennacl::backend::mem_handle h;
+        viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));
+
+        // Single block of 128 threads — the kernel's reduction spans one block only:
+        index_norm_inf_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                          //detail::cuda_arg<unsigned int>(h.cuda_handle())
+                                          reinterpret_cast<unsigned int *>(h.cuda_handle().get())
+                                        );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");
+
+        // Read the result back to the host (implicitly synchronizes with the kernel):
+        unsigned int ret = 0;
+        viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
+        return static_cast<vcl_size_t>(ret);
+      }
+
+      ///////////////////////////////////////////
+
+      /** @brief Kernel applying a Givens plane rotation to two strided vectors
+      *         in place: (x, y) <- (alpha*x + beta*y, alpha*y - beta*x).
+      *
+      * Uses a grid-stride loop over size1 entries; both input values are read
+      * into temporaries before either vector is written, so the update is safe
+      * even though vec1 and vec2 are modified in the same iteration.
+      * size2 is accepted for signature symmetry but not used (size1 governs).
+      */
+      template <typename T>
+      __global__ void plane_rotation_kernel(
+                T * vec1,
+                unsigned int start1,
+                unsigned int inc1,
+                unsigned int size1,
+                T * vec2,
+                unsigned int start2,
+                unsigned int inc2,
+                unsigned int size2,
+                T alpha,
+                T beta)
+      {
+        T tmp1 = 0;
+        T tmp2 = 0;
+
+        for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
+        {
+          tmp1 = vec1[i*inc1+start1];
+          tmp2 = vec2[i*inc2+start2];
+
+          vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
+          vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
+        }
+
+      }
+
+      /** @brief Computes a plane rotation of two vectors.
+      *
+      * Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+      *
+      * @param vec1   The first vector
+      * @param vec2   The second vector
+      * @param alpha  The first transformation coefficient
+      * @param beta   The second transformation coefficient
+      */
+      template <typename T>
+      void plane_rotation(vector_base<T> & vec1,
+                          vector_base<T> & vec2,
+                          T alpha, T beta)
+      {
+        typedef T     value_type;
+
+        // Copy CPU scalars into local temporaries so a device-passable reference
+        // exists; arg_reference (defined elsewhere) presumably picks the temporary
+        // for CPU scalars and the original for GPU scalars — TODO confirm.
+        value_type temporary_alpha = 0;
+        if (viennacl::is_cpu_scalar<value_type>::value)
+          temporary_alpha = alpha;
+
+        value_type temporary_beta = 0;
+        if (viennacl::is_cpu_scalar<value_type>::value)
+          temporary_beta = beta;
+
+        // Fixed launch configuration; the kernel's grid-stride loop covers any size:
+        plane_rotation_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
+                                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                            static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                            detail::cuda_arg<value_type>(vec2),
+                                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                            static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                            detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                            detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
+        VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
+      }
+
+    } //namespace cuda
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/detail/amg/amg_base.hpp b/viennacl/linalg/detail/amg/amg_base.hpp
index 25cd084..3e73be4 100644
--- a/viennacl/linalg/detail/amg/amg_base.hpp
+++ b/viennacl/linalg/detail/amg/amg_base.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,10 +20,11 @@
 
 /** @file amg_base.hpp
     @brief Helper classes and functions for the AMG preconditioner. Experimental.
-    
+
     AMG code contributed by Markus Wagner
 */
 
+#include <boost/numeric/ublas/operation.hpp>
 #include <boost/numeric/ublas/vector.hpp>
 #include <cmath>
 #include <set>
@@ -30,7 +32,7 @@
 #include <algorithm>
 
 #include <map>
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
 #include <omp.h>
 #endif
 
@@ -79,41 +81,41 @@ namespace viennacl
                     unsigned int presmooth = 1,
                     unsigned int postsmooth = 1,
                     unsigned int coarselevels = 0)
-            : _coarse(coarse), _interpol(interpol),
-              _threshold(threshold), _interpolweight(interpolweight), _jacobiweight(jacobiweight), 
-              _presmooth(presmooth), _postsmooth(postsmooth), _coarselevels(coarselevels) {}; 
+            : coarse_(coarse), interpol_(interpol),
+              threshold_(threshold), interpolweight_(interpolweight), jacobiweight_(jacobiweight),
+              presmooth_(presmooth), postsmooth_(postsmooth), coarselevels_(coarselevels) {}
 
             // Getter-/Setter-Functions
-            void set_coarse(unsigned int coarse) { if (coarse > 0) _coarse = coarse; }
-            unsigned int get_coarse() const { return _coarse; }
-            
-            void set_interpol(unsigned int interpol) { if (interpol > 0) _interpol = interpol; }
-            unsigned int get_interpol() const { return _interpol; }
-            
-            void set_threshold(double threshold) { if (threshold > 0 && threshold <= 1) _threshold = threshold; }
-            double get_threshold() const{ return _threshold; }
-            
-            void set_as(double jacobiweight) { if (jacobiweight > 0 && jacobiweight <= 2) _jacobiweight = jacobiweight; }
-            double get_interpolweight() const { return _interpolweight; }
-            
-            void set_interpolweight(double interpolweight) { if (interpolweight > 0 && interpolweight <= 2) _interpolweight = interpolweight; }
-            double get_jacobiweight() const { return _jacobiweight; }
-            
-            void set_presmooth(int presmooth) { if (presmooth >= 0) _presmooth = presmooth; }
-            unsigned int get_presmooth() const { return _presmooth; }
-            
-            void set_postsmooth(int postsmooth) { if (postsmooth >= 0) _postsmooth = postsmooth; }
-            unsigned int get_postsmooth() const { return _postsmooth; }
-            
-            void set_coarselevels(int coarselevels)  { if (coarselevels >= 0) _coarselevels = coarselevels; }
-            unsigned int get_coarselevels() const { return _coarselevels; }
+            void set_coarse(unsigned int coarse) { if (coarse > 0) coarse_ = coarse; }
+            unsigned int get_coarse() const { return coarse_; }
+
+            void set_interpol(unsigned int interpol) { if (interpol > 0) interpol_ = interpol; }
+            unsigned int get_interpol() const { return interpol_; }
+
+            void set_threshold(double threshold) { if (threshold > 0 && threshold <= 1) threshold_ = threshold; }
+            double get_threshold() const{ return threshold_; }
+
+            void set_as(double jacobiweight) { if (jacobiweight > 0 && jacobiweight <= 2) jacobiweight_ = jacobiweight; }
+            double get_interpolweight() const { return interpolweight_; }
+
+            void set_interpolweight(double interpolweight) { if (interpolweight > 0 && interpolweight <= 2) interpolweight_ = interpolweight; }
+            double get_jacobiweight() const { return jacobiweight_; }
+
+            void set_presmooth(int presmooth) { if (presmooth >= 0) presmooth_ = presmooth; }
+            unsigned int get_presmooth() const { return presmooth_; }
+
+            void set_postsmooth(int postsmooth) { if (postsmooth >= 0) postsmooth_ = postsmooth; }
+            unsigned int get_postsmooth() const { return postsmooth_; }
+
+            void set_coarselevels(int coarselevels)  { if (coarselevels >= 0) coarselevels_ = coarselevels; }
+            unsigned int get_coarselevels() const { return coarselevels_; }
 
           private:
-            unsigned int _coarse, _interpol;
-            double _threshold, _interpolweight, _jacobiweight;
-            unsigned int _presmooth, _postsmooth, _coarselevels;
+            unsigned int coarse_, interpol_;
+            double threshold_, interpolweight_, jacobiweight_;
+            unsigned int presmooth_, postsmooth_, coarselevels_;
         };
-        
+
         /** @brief A class for a scalar that can be written to the sparse matrix or sparse vector datatypes.
         *  @brief Values are only written to those datatypes if non-zero to optimize memory usage and performance.
         *  @brief Needed for the []- and ()-operators.
@@ -122,14 +124,14 @@ namespace viennacl
         class amg_nonzero_scalar
         {
           private:
-            InternalType *_m;
-            IteratorType _iter;
-            unsigned int _i,_j;
-            ScalarType _s;
+            InternalType *m_;
+            IteratorType iter_;
+            unsigned int i_,j_;
+            ScalarType s_;
 
           public:
             amg_nonzero_scalar();
-            
+
             /** @brief The constructor.
             *  @param m    Pointer to the sparse vector/matrix the scalar will be written to
             *  @param iter    Iterator pointing to the respective element in the vector/matrix if available
@@ -141,21 +143,21 @@ namespace viennacl
                               IteratorType & iter,
                               unsigned int i,
                               unsigned int j,
-                              ScalarType s = 0): _m(m), _iter(iter), _i(i), _j(j), _s(s) {}
-            
+                              ScalarType s = 0): m_(m), iter_(iter), i_(i), j_(j), s_(s) {}
+
             /** @brief Assignment operator. Writes value into matrix at the given position.
             *  @param value  Value that will be written
             */
             ScalarType operator = (const ScalarType value)
             {
-              _s = value;
+              s_ = value;
               // Only write if scalar is nonzero
-              if (_s == 0) return _s;
-              // Write to _m using iterator _iter or indices (_i,_j)
-              _m->addscalar (_iter,_i,_j,_s);
-              return _s;
+              if (s_ == 0) return s_;
+              // Write to m_ using iterator iter_ or indices (i_,j_)
+              m_->addscalar (iter_,i_,j_,s_);
+              return s_;
             }
-            
+
             /** @brief Addition operator. Adds a constant.
             *  @param value  Value that will be written
             */
@@ -163,52 +165,52 @@ namespace viennacl
             {
               // If zero is added, then no change necessary
               if (value == 0)
-                return _s;
-              
-              _s += value;
+                return s_;
+
+              s_ += value;
               // Remove entry if resulting scalar is zero
-              if (_s == 0)
+              if (s_ == 0)
               {
-                _m->removescalar(_iter,_i);
-                return _s;
+                m_->removescalar(iter_,i_);
+                return s_;
               }
-              //Write to _m using iterator _iter or indices (_i,_j)
-              _m->addscalar (_iter,_i,_j,_s);
-              return _s;
+              //Write to m_ using iterator iter_ or indices (i_,j_)
+              m_->addscalar (iter_,i_,j_,s_);
+              return s_;
             }
             ScalarType operator ++ (int)
             {
-              _s++;
-              if (_s == 0)
-                _m->removescalar(_iter,_i);
-              _m->addscalar (_iter,_i,_j,_s);
-              return _s;
+              s_++;
+              if (s_ == 0)
+                m_->removescalar(iter_,i_);
+              m_->addscalar (iter_,i_,j_,s_);
+              return s_;
             }
             ScalarType operator ++ ()
             {
-              _s++;
-              if (_s == 0)
-                _m->removescalar(_iter,_i);
-              _m->addscalar (_iter,_i,_j,_s);
-              return _s;
+              s_++;
+              if (s_ == 0)
+                m_->removescalar(iter_,i_);
+              m_->addscalar (iter_,i_,j_,s_);
+              return s_;
             }
-            operator ScalarType (void) { return _s;  }
+            operator ScalarType (void) { return s_;  }
         };
-    
+
         /** @brief Defines an iterator for the sparse vector type.
         */
         template <typename InternalType>
         class amg_sparsevector_iterator
         {
           private:
-            typedef amg_sparsevector_iterator<InternalType> self_type;  
+            typedef amg_sparsevector_iterator<InternalType> self_type;
             typedef typename InternalType::mapped_type ScalarType;
-            
+
             InternalType & internal_vec;
             typename InternalType::iterator iter;
-              
+
           public:
-            
+
             /** @brief The constructor.
             *  @param vec    Internal sparse vector
             *  @param begin  Whether the iterator starts at the beginning or end of vec
@@ -220,7 +222,7 @@ namespace viennacl
               else
                 iter = internal_vec.end();
             }
-            
+
             bool operator == (self_type other)
             {
               if (iter == other.iter)
@@ -235,17 +237,17 @@ namespace viennacl
               else
                 return false;
             }
-            
+
             self_type & operator ++ () const { iter++; return *this; }
             self_type & operator ++ () { iter++; return *this; }
             self_type & operator -- () const { iter--; return *this; }
-            self_type & operator -- () { iter--; return *this; }  
+            self_type & operator -- () { iter--; return *this; }
             ScalarType & operator * () const { return (*iter).second; }
             ScalarType & operator * () { return (*iter).second; }
             unsigned int index() const { return (*iter).first; }
             unsigned int index() { return (*iter).first; }
         };
-    
+
         /** @brief A class for the sparse vector type.
         */
         template <typename ScalarType>
@@ -253,40 +255,40 @@ namespace viennacl
         {
           public:
             typedef ScalarType value_type;
-      
+
           private:
             // A map is used internally which saves all non-zero elements with pairs of (index,value)
             typedef std::map<unsigned int,ScalarType> InternalType;
             typedef amg_sparsevector<ScalarType> self_type;
             typedef amg_nonzero_scalar<self_type,typename InternalType::iterator,ScalarType> NonzeroScalarType;
-              
+
             // Size is only a dummy variable. Not needed for internal map structure but for compatible vector interface.
-            unsigned int _size;
+            unsigned int size_;
             InternalType internal_vector;
-      
+
           public:
             typedef amg_sparsevector_iterator<InternalType> iterator;
             typedef typename InternalType::const_iterator const_iterator;
-      
+
           public:
             /** @brief The constructor.
             *  @param size    Size of the vector
             */
-            amg_sparsevector(unsigned int size = 0): _size(size)
+            amg_sparsevector(unsigned int size = 0): size_(size)
             {
               internal_vector = InternalType();
             }
-            
-            void resize(unsigned int size) { _size = size; }
-            unsigned int size() const { return _size;}
-            
+
+            void resize(unsigned int size) { size_ = size; }
+            unsigned int size() const { return size_;}
+
             // Returns number of non-zero entries in vector equal to the size of the underlying map.
-            unsigned int internal_size() const { return internal_vector.size(); }
+            unsigned int internal_size() const { return static_cast<unsigned int>(internal_vector.size()); }
             // Delete underlying map.
             void clear() { internal_vector.clear();  }
             // Remove entry at position i.
             void remove(unsigned int i) { internal_vector.erase(i); }
-            
+
             // Add s to the entry at position i
             void add (unsigned int i, ScalarType s)
             {
@@ -304,26 +306,26 @@ namespace viennacl
                   internal_vector.erase(iter);
               }
             }
-            
+
             // Write to the map. Is called from non-zero scalar type.
             template <typename IteratorType>
-            void addscalar(IteratorType & iter, unsigned int i, unsigned int j, ScalarType s)
+            void addscalar(IteratorType & iter, unsigned int i, unsigned int /* j */, ScalarType s)
             {
               // Don't write if value is zero
               if (s == 0)
                 return;
-              
+
               // If entry is already present, overwrite value, otherwise make new entry
-              if (iter != internal_vector.end())  
+              if (iter != internal_vector.end())
                 (*iter).second = s;
               else
                 internal_vector[i] = s;
             }
-            
+
             // Remove value from the map. Is called from non-zero scalar type.
             template <typename IteratorType>
-            void removescalar(IteratorType & iter, unsigned int i) { internal_vector.erase(iter); }   
-            
+            void removescalar(IteratorType & iter, unsigned int /* i */) { internal_vector.erase(iter); }
+
             // Bracket operator. Returns non-zero scalar type with actual values of the respective entry which calls addscalar/removescalar after value is altered.
             NonzeroScalarType operator [] (unsigned int i)
             {
@@ -332,39 +334,39 @@ namespace viennacl
               if (it != internal_vector.end())
                 return NonzeroScalarType (this,it,i,i,(*it).second);
               else
-                return NonzeroScalarType (this,it,i,i,0);  
+                return NonzeroScalarType (this,it,i,i,0);
             }
-            
+
             // Use internal data structure directly for read-only access. No need to use non-zero scalar as no write access possible.
             ScalarType operator [] (unsigned int i) const
             {
               const_iterator it = internal_vector.find(i);
-              
+
               if (it != internal_vector.end())
                 return (*it).second;
               else
                 return 0;
             }
-            
+
             // Iterator functions.
             iterator begin() { return iterator(internal_vector); }
             const_iterator begin() const { return internal_vector.begin(); }
             iterator end() { return iterator(internal_vector,false); }
             const_iterator end() const { return internal_vector.end(); }
-            
+
             // checks whether value at index i is nonzero. More efficient than doing [] == 0.
             bool isnonzero(unsigned int i) const { return internal_vector.find(i) != internal_vector.end();  }
-            
+
             // Copies data into a ublas vector type.
             operator boost::numeric::ublas::vector<ScalarType> (void)
             {
-              boost::numeric::ublas::vector<ScalarType> vec (_size);    
+              boost::numeric::ublas::vector<ScalarType> vec (size_);
               for (iterator iter = begin(); iter != end(); ++iter)
-                vec [iter.index()] = *iter;        
+                vec [iter.index()] = *iter;
               return vec;
-            } 
+            }
          };
-    
+
         /** @brief A class for the sparse matrix type.
         *  Uses vector of maps as data structure for higher performance and lower memory usage.
         *  Uses similar interface as ublas::compressed_matrix.
@@ -379,40 +381,40 @@ namespace viennacl
             typedef std::map<unsigned int,ScalarType> RowType;
             typedef std::vector<RowType> InternalType;
             typedef amg_sparsematrix<ScalarType> self_type;
-            
+
             // Adapter is used for certain functionality, especially iterators.
             typedef typename viennacl::tools::sparse_matrix_adapter<ScalarType> AdapterType;
             typedef typename viennacl::tools::const_sparse_matrix_adapter<ScalarType> ConstAdapterType;
-            
+
             // Non-zero scalar is used to write to the matrix.
             typedef amg_nonzero_scalar<self_type,typename RowType::iterator,ScalarType> NonzeroScalarType;
 
             // Holds matrix coefficients.
             InternalType internal_mat;
-            // Holds matrix coefficient of transposed matrix if built. 
+            // Holds matrix coefficient of transposed matrix if built.
             // Note: Only internal_mat is written using operators and methods while internal_mat_trans is built from internal_mat using do_trans().
             InternalType internal_mat_trans;
             // Saves sizes.
-            size_t s1, s2;
-            
+            vcl_size_t s1, s2;
+
             // True if the transposed of the matrix is used (for calculations, iteration, etc.).
             bool transposed_mode;
             // True if the transposed is already built (saved in internal_mat_trans) and also up to date (no changes to internal_mat).
             bool transposed;
-            
-          public:          
+
+          public:
             typedef typename AdapterType::iterator1 iterator1;
             typedef typename AdapterType::iterator2 iterator2;
             typedef typename ConstAdapterType::const_iterator1 const_iterator1;
             typedef typename ConstAdapterType::const_iterator2 const_iterator2;
-            
+
             /** @brief Standard constructor. */
             amg_sparsematrix ()
             {
               transposed_mode = false;
               transposed = false;
             }
-            
+
             /** @brief Constructor. Builds matrix of size (i,j).
               * @param i  Size of first dimension
               * @param j  Size of second dimension
@@ -430,49 +432,49 @@ namespace viennacl
               transposed_mode = false;
               transposed = false;
             }
-            
+
             /** @brief Constructor. Builds matrix via std::vector<std::map> by copying memory
             * (Only necessary feature of this other matrix type is to have const iterators)
             * @param mat  Vector of maps
             */
             amg_sparsematrix (std::vector<std::map<unsigned int, ScalarType> > const & mat)
-            {  
+            {
               AdapterType a (internal_mat, mat.size(), mat.size());
               AdapterType a_trans (internal_mat_trans, mat.size(), mat.size());
               a.resize(mat.size(), mat.size());
               a_trans.resize(mat.size(), mat.size());
-              
-              internal_mat = mat;  
+
+              internal_mat = mat;
               s1 = s2 = mat.size();
-              
+
               transposed_mode = false;
               transposed = false;
             }
-            
+
             /** @brief Constructor. Builds matrix via another matrix type.
               * (Only necessary feature of this other matrix type is to have const iterators)
               * @param mat  Matrix
               */
             template <typename MatrixType>
             amg_sparsematrix (MatrixType const & mat)
-            {  
+            {
               AdapterType a (internal_mat, mat.size1(), mat.size2());
               AdapterType a_trans (internal_mat_trans, mat.size2(), mat.size1());
               a.resize(mat.size1(), mat.size2());
-              a_trans.resize (mat.size2(), mat.size1());
+              a_trans.resize(mat.size2(), mat.size1());
               s1 = mat.size1();
               s2 = mat.size2();
               a.clear();
               a_trans.clear();
-              
+
               for (typename MatrixType::const_iterator1 row_iter = mat.begin1(); row_iter != mat.end1(); ++row_iter)
               {
                 for (typename MatrixType::const_iterator2 col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
                 {
                   if (*col_iter != 0)
                   {
-                    unsigned int x = col_iter.index1();
-                    unsigned int y = col_iter.index2();
+                    unsigned int x = static_cast<unsigned int>(col_iter.index1());
+                    unsigned int y = static_cast<unsigned int>(col_iter.index2());
                     a (x,y) = *col_iter;
                     a_trans (y,x) = *col_iter;
                   }
@@ -481,32 +483,32 @@ namespace viennacl
               transposed_mode = false;
               transposed = true;
             }
-                  
+
             // Build transposed of the current matrix.
             void do_trans()
             {
               // Do it only once if called in a parallel section
-            #ifdef _OPENMP
+            #ifdef VIENNACL_WITH_OPENMP
               #pragma omp critical
             #endif
-              { 
+              {
                 // Only build transposed if it is not built or not up to date
                 if (!transposed)
                 {
                   // Mode has to be set to standard mode temporarily
                   bool save_mode = transposed_mode;
                   transposed_mode = false;
-                  
+
                   for (iterator1 row_iter = begin1(); row_iter != end1(); ++row_iter)
-                for (iterator2 col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-                  internal_mat_trans[col_iter.index2()][col_iter.index1()] = *col_iter;
-                
+                    for (iterator2 col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+                      internal_mat_trans[col_iter.index2()][static_cast<unsigned int>(col_iter.index1())] = *col_iter;
+
                   transposed_mode = save_mode;
                   transposed = true;
                 }
               }
             } //do_trans()
-            
+
             // Set transposed mode (true=transposed, false=regular)
             void set_trans(bool mode)
             {
@@ -514,9 +516,9 @@ namespace viennacl
               if (mode)
                 do_trans();
             }
-            
-            bool get_trans() const { return transposed_mode; }     
-                  
+
+            bool get_trans() const { return transposed_mode; }
+
             // Checks whether coefficient (i,j) is non-zero. More efficient than using (i,j) == 0.
             bool isnonzero (unsigned int i, unsigned int j) const
             {
@@ -535,14 +537,14 @@ namespace viennacl
                   return false;
               }
             } //isnonzero()
-                
+
             // Add s to value at (i,j)
             void add (unsigned int i, unsigned int j, ScalarType s)
             {
               // If zero is added then do nothing.
               if (s == 0)
                 return;
-              
+
               typename RowType::iterator col_iter = internal_mat[i].find(j);
               // If there is no entry at position (i,j), then make new entry.
               if (col_iter == internal_mat[i].end())
@@ -558,7 +560,7 @@ namespace viennacl
               }
               transposed = false;
             } //add()
-            
+
             // Write to the internal data structure. Is called from non-zero scalar type.
             template <typename IteratorType>
             void addscalar(IteratorType & iter, unsigned int i, unsigned int j, ScalarType s)
@@ -566,28 +568,28 @@ namespace viennacl
               // Don't write if value is zero
               if (s == 0)
                 return;
-              
-              if (iter != internal_mat[i].end())  
+
+              if (iter != internal_mat[i].end())
                 (*iter).second = s;
               else
                 internal_mat[i][j] = s;
-              
+
               transposed = false;
             }
-            
+
             // Remove entry from internal data structure. Is called from non-zero scalar type.
             template <typename IteratorType>
             void removescalar(IteratorType & iter, unsigned int i)
             {
               internal_mat[i].erase(iter);
               transposed = false;
-            }   
-            
+            }
+
             // Return non-zero scalar at position (i,j). Value is written to the non-zero scalar and updated via addscalar()/removescalar().
             NonzeroScalarType operator()(unsigned int i, unsigned int j)
             {
               typename RowType::iterator iter;
-              
+
               if (!transposed_mode)
               {
                 iter = internal_mat[i].find(j);
@@ -605,12 +607,12 @@ namespace viennacl
                   return NonzeroScalarType (this,iter,j,i,0);
               }
             }
-            
+
             // For read-only access return the actual value directly. Non-zero datatype not needed as no write access possible.
             ScalarType operator()(unsigned int i, unsigned int j) const
             {
               typename RowType::const_iterator iter;
-              
+
               if (!transposed_mode)
               {
                 iter = internal_mat[i].find(j);
@@ -628,7 +630,7 @@ namespace viennacl
                   return 0;
               }
             }
-              
+
             void resize(unsigned int i, unsigned int j, bool preserve = true)
             {
               AdapterType a (internal_mat);
@@ -638,8 +640,8 @@ namespace viennacl
               s1 = i;
               s2 = j;
             }
-            
-            void clear() 
+
+            void clear()
             {
               AdapterType a (internal_mat, s1, s2);
               a.clear();
@@ -648,38 +650,39 @@ namespace viennacl
               transposed = true;
             }
 
-            size_t size1()
+            vcl_size_t size1()
             {
               if (!transposed_mode)
                 return s1;
               else
                 return s2;
             }
-            
-            size_t size1() const
+
+            vcl_size_t size1() const
             {
               if (!transposed_mode)
                 return s1;
               else
                 return s2;
             }
-            
-            
-            size_t size2()
+
+
+            vcl_size_t size2()
             {
               if (!transposed_mode)
                 return s2;
               else
                 return s1;
             }
-            size_t size2() const
+
+            vcl_size_t size2() const
             {
               if (!transposed_mode)
                 return s2;
               else
                 return s1;
             }
-            
+
             iterator1 begin1(bool trans = false)
             {
               if (!trans && !transposed_mode)
@@ -694,7 +697,7 @@ namespace viennacl
                 return a_trans.begin1();
               }
             }
-            
+
             iterator1 end1(bool trans = false)
             {
               if (!trans && !transposed_mode)
@@ -709,7 +712,7 @@ namespace viennacl
                 return a_trans.end1();
               }
             }
-            
+
             iterator2 begin2(bool trans = false)
             {
               if (!trans && !transposed_mode)
@@ -724,7 +727,7 @@ namespace viennacl
                 return a_trans.begin2();
               }
             }
-            
+
             iterator2 end2(bool trans = false)
             {
               if (!trans && !transposed_mode)
@@ -739,42 +742,42 @@ namespace viennacl
                 return a_trans.end2();
               }
             }
-            
+
             const_iterator1 begin1() const
             {
               // Const_iterator of transposed can only be used if transposed matrix is already built and up to date.
-              assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
+              assert((!transposed_mode || (transposed_mode && transposed)) && bool("Error: Cannot build const_iterator when transposed has not been built yet!"));
                     ConstAdapterType a_const (internal_mat, s1, s2);
               return a_const.begin1();
             }
-            
+
             const_iterator1 end1(bool trans = false) const
             {
-              assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
+              assert((!transposed_mode || (transposed_mode && transposed)) && bool("Error: Cannot build const_iterator when transposed has not been built yet!"));
               ConstAdapterType a_const (internal_mat, trans ? s2 : s1, trans ? s1 : s2);
               return a_const.end1();
             }
-            
+
             const_iterator2 begin2(bool trans = false) const
             {
-              assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
+              assert((!transposed_mode || (transposed_mode && transposed)) && bool("Error: Cannot build const_iterator when transposed has not been built yet!"));
               ConstAdapterType a_const (internal_mat, trans ? s2 : s1, trans ? s1 : s2);
               return a_const.begin2();
             }
-            
+
             const_iterator2 end2(bool trans = false) const
             {
-              assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
+              assert((!transposed_mode || (transposed_mode && transposed)) && bool("Error: Cannot build const_iterator when transposed has not been built yet!"));
               ConstAdapterType a_const (internal_mat, trans ? s2 : s1, trans ? s1 : s2);
               return a_const.end2();
             }
-            
+
             // Returns pointer to the internal data structure. Improves performance of copy operation to GPU.
             std::vector<std::map<unsigned int, ScalarType> > * get_internal_pointer()
-            {    
+            {
               if (!transposed_mode)
                 return &internal_mat;
-              
+
               if (!transposed)
                 do_trans();
               return &internal_mat_trans;
@@ -784,27 +787,27 @@ namespace viennacl
               boost::numeric::ublas::compressed_matrix<ScalarType> mat;
               mat.resize(size1(),size2(),false);
               mat.clear();
-              
+
               for (iterator1 row_iter = begin1(); row_iter != end1(); ++row_iter)
                   for (iterator2 col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
                     mat (col_iter.index1(), col_iter.index2()) = *col_iter;
-                  
+
               return mat;
-            } 
+            }
             operator boost::numeric::ublas::matrix<ScalarType> (void)
             {
               boost::numeric::ublas::matrix<ScalarType> mat;
               mat.resize(size1(),size2(),false);
               mat.clear();
-              
+
               for (iterator1 row_iter = begin1(); row_iter != end1(); ++row_iter)
                   for (iterator2 col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
                     mat (col_iter.index1(), col_iter.index2()) = *col_iter;
-                  
+
               return mat;
             }
         };
-          
+
         /** @brief A class for the AMG points.
         *   Saves point index and influence measure
         *  Holds information whether point is undecided, C or F point.
@@ -814,92 +817,92 @@ namespace viennacl
         {
           private:
             typedef amg_sparsevector<amg_point*> ListType;
-            
-            unsigned int _index;
-            unsigned int _influence;
+
+            unsigned int index_;
+            unsigned int influence_;
             // Determines whether point is undecided.
-            bool _undecided;
+            bool undecided_;
             // Determines wheter point is C point (true) or F point (false).
-            bool _cpoint;
-            unsigned int _coarse_index;
-            // Index offset of parallel coarsening. In that case a point acts as if it had an index of _index-_offset and treats other points as if they had an index of index+_offset
-            unsigned int _offset;
+            bool cpoint_;
+            unsigned int coarse_index_;
+            // Index offset of parallel coarsening. In that case a point acts as if it had an index of index_-offset_ and treats other points as if they had an index of index+offset_
+            unsigned int offset_;
             // Aggregate the point belongs to.
-            unsigned int _aggregate;
-            
+            unsigned int aggregate_;
+
             // Holds all points influencing this point.
             ListType influencing_points;
             // Holds all points that are influenced by this point.
             ListType influenced_points;
-      
+
           public:
             typedef ListType::iterator iterator;
             typedef ListType::const_iterator const_iterator;
-            
+
             /** @brief The constructor.
             */
-            amg_point (unsigned int index, unsigned int size): _index(index), _influence(0), _undecided(true), _cpoint(false), _coarse_index(0), _offset(0), _aggregate(0)
+            amg_point (unsigned int index, unsigned int size): index_(index), influence_(0), undecided_(true), cpoint_(false), coarse_index_(0), offset_(0), aggregate_(0)
             {
               influencing_points = ListType(size);
               influenced_points = ListType(size);
             }
-            
-            void set_offset(unsigned int offset) { _offset = offset; }
-            unsigned int get_offset() { return _offset; }
-            void set_index(unsigned int index) { _index = index+_offset; }
-            unsigned int get_index() const { return _index-_offset;  }
-            unsigned int get_influence() const { return _influence;  }
-            void set_aggregate(unsigned int aggregate) { _aggregate = aggregate; }
-            unsigned int get_aggregate () { return _aggregate; }
-            
-            bool is_cpoint() const { return _cpoint && !_undecided;  }
-            bool is_fpoint() const { return !_cpoint && !_undecided; }
-            bool is_undecided() const { return _undecided; }
-            
+
+            void set_offset(unsigned int offset) { offset_ = offset; }
+            unsigned int get_offset() { return offset_; }
+            void set_index(unsigned int index) { index_ = index+offset_; }
+            unsigned int get_index() const { return index_-offset_;  }
+            unsigned int get_influence() const { return influence_;  }
+            void set_aggregate(unsigned int aggregate) { aggregate_ = aggregate; }
+            unsigned int get_aggregate () { return aggregate_; }
+
+            bool is_cpoint() const { return cpoint_ && !undecided_;  }
+            bool is_fpoint() const { return !cpoint_ && !undecided_; }
+            bool is_undecided() const { return undecided_; }
+
             // Returns number of influencing points
             unsigned int number_influencing() const  { return influencing_points.internal_size(); }
             // Returns true if *point is influencing this point
-            bool is_influencing(amg_point* point) const { return influencing_points.isnonzero(point->get_index()+_offset); }
+            bool is_influencing(amg_point* point) const { return influencing_points.isnonzero(point->get_index()+offset_); }
             // Add *point to influencing points
-            void add_influencing_point(amg_point* point) { influencing_points[point->get_index()+_offset] = point;  }
+            void add_influencing_point(amg_point* point) { influencing_points[point->get_index()+offset_] = point;  }
             // Add *point to influenced points
-            void add_influenced_point(amg_point* point) { influenced_points[point->get_index()+_offset] = point; }
-            
+            void add_influenced_point(amg_point* point) { influenced_points[point->get_index()+offset_] = point; }
+
             // Clear influencing points
             void clear_influencing() { influencing_points.clear(); }
             // Clear influenced points
             void clear_influenced() {influenced_points.clear(); }
-            
-            
-            unsigned int get_coarse_index() const { return _coarse_index; }
-            void set_coarse_index(unsigned int index) { _coarse_index = index; }
-            
+
+
+            unsigned int get_coarse_index() const { return coarse_index_; }
+            void set_coarse_index(unsigned int index) { coarse_index_ = index; }
+
             // Calculates the initial influence measure equal to the number of influenced points.
-            void calc_influence() { _influence = influenced_points.internal_size();  }
-            
+            void calc_influence() { influence_ = influenced_points.internal_size();  }
+
             // Add to influence measure.
             unsigned int add_influence(int add)
             {
-              _influence += add;
-              return _influence;
+              influence_ += add;
+              return influence_;
             }
             // Make this point C point. Only call via amg_pointvector.
-            void make_cpoint() 
-            { 
-              _undecided = false;
-              _cpoint = true; 
-              _influence = 0;
+            void make_cpoint()
+            {
+              undecided_ = false;
+              cpoint_ = true;
+              influence_ = 0;
             }
             // Make this point F point. Only call via amg_pointvector.
             void make_fpoint()
             {
-              _undecided = false;
-              _cpoint = false;
-              _influence = 0;
+              undecided_ = false;
+              cpoint_ = false;
+              influence_ = 0;
             }
             // Switch point from F to C point. Only call via amg_pointvector.
-            void switch_ftoc() { _cpoint = true; }  
-            
+            void switch_ftoc() { cpoint_ = true; }
+
             // Iterator handling for influencing and influenced points.
             iterator begin_influencing() { return influencing_points.begin(); }
             iterator end_influencing() { return influencing_points.end(); }
@@ -910,7 +913,7 @@ namespace viennacl
             const_iterator begin_influenced() const { return influenced_points.begin(); }
             const_iterator end_influenced() const { return influenced_points.end(); }
         };
-        
+
         /** @brief Comparison class for the sorted set of points in amg_pointvector. Set is sorted by influence measure from lower to higher with the point-index as tie-breaker.
         */
         struct classcomp
@@ -923,7 +926,7 @@ namespace viennacl
             return (l->get_influence() < r->get_influence() || (l->get_influence() == r->get_influence() && l->get_index() > r->get_index()));
           }
         };
-      
+
         /** @brief A class for the AMG points.
         *  Holds pointers of type amg_point in a vector that can be accessed using [point-index].
         *  Additional list of pointers sorted by influence number and index to improve coarsening performance (see amg_coarse_classic_onepass() in amg_coarse.hpp)
@@ -938,31 +941,31 @@ namespace viennacl
             typedef std::vector<amg_point*> VectorType;
             VectorType pointvector;
             ListType pointlist;
-            unsigned int _size;
+            unsigned int size_;
             unsigned int c_points, f_points;
-      
+
           public:
             typedef VectorType::iterator iterator;
             typedef VectorType::const_iterator const_iterator;
-            
+
             /** @brief The constructor.
             *  @param size    Number of points
             */
-            amg_pointvector(unsigned int size = 0): _size(size)
+            amg_pointvector(unsigned int size = 0): size_(size)
             {
               pointvector = VectorType(size);
               c_points = f_points = 0;
             }
-            
+
             // Construct all the points dynamically and save pointers into vector.
             void init_points()
-            {  
+            {
               for (unsigned int i=0; i<size(); ++i)
                 pointvector[i] = new amg_point(i,size());
             }
             // Delete all the points.
             void delete_points()
-            {  
+            {
               for (unsigned int i=0; i<size(); ++i)
                 delete pointvector[i];
             }
@@ -976,14 +979,14 @@ namespace viennacl
 
             // Update C and F count for point *point.
             // Necessary if C and F points were constructed outside this data structure (e.g. by parallel coarsening RS0 or RS3).
-            void update_cf(amg_point *point) 
+            void update_cf(amg_point *point)
             {
               if (point->is_cpoint()) c_points++;
               else if (point->is_fpoint()) f_points++;
             }
             // Clear the C and F point count.
             void clear_cf() { c_points = f_points = 0; }
-            
+
             // Clear both point lists.
             void clear_influencelists()
             {
@@ -993,25 +996,25 @@ namespace viennacl
                 (*iter)->clear_influenced();
               }
             }
-            
+
             amg_point* operator [] (unsigned int i) const { return pointvector[i]; }
             iterator begin() { return pointvector.begin(); }
             iterator end() { return pointvector.end(); }
             const_iterator begin() const { return pointvector.begin(); }
             const_iterator end() const { return pointvector.end(); }
-            
+
             void resize(unsigned int size)
             {
-              _size = size;
+              size_ = size;
               pointvector = VectorType(size);
             }
-            unsigned int size() const { return _size; }
-            
+            unsigned int size() const { return size_; }
+
             // Returns number of C points
             unsigned int get_cpoints() const { return c_points; }
             // Returns number of F points
             unsigned int get_fpoints() const { return f_points; }
-            
+
             // Does the initial sorting of points into the list. Sorting is automatically done by the std::set data type.
             void sort()
             {
@@ -1037,15 +1040,15 @@ namespace viennacl
               ListType::iterator iter = pointlist.find(point);
               // If point is not in the list then stop.
               if (iter == pointlist.end()) return;
-              
+
               // Save iterator and decrement
               ListType::iterator iter2 = iter;
               iter2--;
-              
+
               // Point has to be erased first as changing the value does not re-order the std::set
               pointlist.erase(iter);
               point->add_influence(add);
-              
+
               // Insert point back into the list. Using the iterator improves performance. The new position has to be at the same position or to the right of the old.
               pointlist.insert(iter2,point);
             }
@@ -1065,7 +1068,7 @@ namespace viennacl
             }
             // Swich *point from F to C point
             void switch_ftoc(amg_point* point)
-            { 
+            {
               point->switch_ftoc();
               c_points++;
               f_points--;
@@ -1086,24 +1089,24 @@ namespace viennacl
                 }
               }
             }
-            
+
             // Return information for debugging purposes
             template <typename MatrixType>
             void get_influence_matrix(MatrixType & mat) const
             {
               mat = MatrixType(size(),size());
               mat.clear();
-              
+
               for (const_iterator row_iter = begin(); row_iter != end(); ++row_iter)
                 for (amg_point::iterator col_iter = (*row_iter)->begin_influencing(); col_iter != (*row_iter)->end_influencing(); ++col_iter)
-                  mat((*row_iter)->get_index(),(*col_iter)->get_index()) = true;  
+                  mat((*row_iter)->get_index(),(*col_iter)->get_index()) = true;
             }
             template <typename VectorType>
             void get_influence(VectorType & vec) const
             {
-              vec = VectorType(_size);
+              vec = VectorType(size_);
               vec.clear();
-              
+
               for (const_iterator iter = begin(); iter != end(); ++iter)
                 vec[(*iter)->get_index()] = (*iter)->get_influence();
             }
@@ -1113,7 +1116,7 @@ namespace viennacl
               vec = VectorType(pointlist.size());
               vec.clear();
               unsigned int i=0;
-              
+
               for (ListType::const_iterator iter = pointlist.begin(); iter != pointlist.end(); ++iter)
               {
                 vec[i] = (*iter)->get_index();
@@ -1123,9 +1126,9 @@ namespace viennacl
             template <typename VectorType>
             void get_C(VectorType & vec) const
             {
-              vec = VectorType(_size);
+              vec = VectorType(size_);
               vec.clear();
-              
+
               for (const_iterator iter = begin(); iter != end(); ++iter)
               {
                 if ((*iter)->is_cpoint())
@@ -1135,9 +1138,9 @@ namespace viennacl
             template <typename VectorType>
             void get_F(VectorType & vec) const
             {
-              vec = VectorType(_size);
+              vec = VectorType(size_);
               vec.clear();
-              
+
               for (const_iterator iter = begin(); iter != end(); ++iter)
               {
                 if ((*iter)->is_fpoint())
@@ -1147,9 +1150,9 @@ namespace viennacl
             template <typename MatrixType>
             void get_Aggregates(MatrixType & mat) const
             {
-              mat = MatrixType(_size,_size);
+              mat = MatrixType(size_,size_);
               mat.clear();
-              
+
               for (const_iterator iter = begin(); iter != end(); ++iter)
               {
                 if (!(*iter)->is_undecided())
@@ -1157,7 +1160,7 @@ namespace viennacl
               }
             }
         };
-        
+
         /** @brief A class for the matrix slicing for parallel coarsening schemes (RS0/RS3).
           * @brief Holds information on a per-processor basis and offers functionality to slice and join the data structures.
           */
@@ -1165,54 +1168,54 @@ namespace viennacl
         class amg_slicing
         {
             typedef typename InternalType1::value_type SparseMatrixType;
-            typedef typename InternalType2::value_type PointVectorType;    
-            
+            typedef typename InternalType2::value_type PointVectorType;
+
           public:
             // Data structures on a per-processor basis.
             boost::numeric::ublas::vector<InternalType1> A_slice;
             boost::numeric::ublas::vector<InternalType2> Pointvector_slice;
             // Holds the offsets showing the indices for which a new slice begins.
             boost::numeric::ublas::vector<boost::numeric::ublas::vector<unsigned int> > Offset;
-            
-            unsigned int _threads;
-            unsigned int _levels;
-            
+
+            unsigned int threads_;
+            unsigned int levels_;
+
             void init(unsigned int levels, unsigned int threads = 0)
             {
               // Either use the number of threads chosen by the user or the maximum number of threads available on the processor.
               if (threads == 0)
-            #ifdef _OPENMP
-                _threads = omp_get_num_procs();
+            #ifdef VIENNACL_WITH_OPENMP
+                threads_ = omp_get_num_procs();
             #else
-              _threads = 1;
-            #endif   
-              else 
-                _threads = threads;
-              
-              _levels = levels;
-              
-              A_slice.resize(_threads);
-              Pointvector_slice.resize(_threads);
-              // Offset has _threads+1 entries to also hold the total size
-              Offset.resize(_threads+1);
-              
-              for (unsigned int i=0; i<_threads; ++i)
+              threads_ = 1;
+            #endif
+              else
+                threads_ = threads;
+
+              levels_ = levels;
+
+              A_slice.resize(threads_);
+              Pointvector_slice.resize(threads_);
+              // Offset has threads_+1 entries to also hold the total size
+              Offset.resize(threads_+1);
+
+              for (unsigned int i=0; i<threads_; ++i)
               {
-                A_slice[i].resize(_levels);
-                Pointvector_slice[i].resize(_levels);
+                A_slice[i].resize(levels_);
+                Pointvector_slice[i].resize(levels_);
                 // Offset needs one more level for the build-up of the next offset
-                Offset[i].resize(_levels+1);
+                Offset[i].resize(levels_+1);
               }
-              Offset[_threads].resize(_levels+1);
+              Offset[threads_].resize(levels_+1);
             } //init()
-            
+
             // Slice matrix A into as many parts as threads are used.
             void slice (unsigned int level, InternalType1 const & A, InternalType2 const & Pointvector)
             {
               // On the finest level, build a new slicing first.
               if (level == 0)
                 slice_new (level, A);
-              
+
               // On coarser levels use the same slicing as on the finest level (Points stay together on the same thread on all levels).
               // This is necessary as due to interpolation and galerkin product there only exist connections between points on the same thread on coarser levels.
               // Note: Offset is determined in amg_coarse_rs0() after fine level was built.
@@ -1223,7 +1226,7 @@ namespace viennacl
             void join (unsigned int level, InternalType2 & Pointvector) const
             {
               typedef typename InternalType2::value_type PointVectorType;
-              
+
               // Reset index offset of all points and update overall C and F point count
               Pointvector[level].clear_cf();
               for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
@@ -1232,30 +1235,27 @@ namespace viennacl
                 Pointvector[level].update_cf(*iter);
               }
             }
-              
-          private:     
+
+          private:
             /** @brief Slices mat into this->threads parts of (almost) equal size
             * @param level    Level for which slicing is requested
             * @param A     System matrix on all levels
             */
             void slice_new (unsigned int level, InternalType1 const & A)
-            {  
-              typedef typename SparseMatrixType::const_iterator1 ConstRowIterator;
-              typedef typename SparseMatrixType::const_iterator2 ConstColIterator;
-              
+            {
               // Determine index offset of all the slices (index of A[level] when the respective slice starts).
-            #ifdef _OPENMP
-              #pragma omp parallel for 
+            #ifdef VIENNACL_WITH_OPENMP
+              #pragma omp parallel for
             #endif
-              for (unsigned int i=0; i<=_threads; ++i)
+              for (long i=0; i<=static_cast<long>(threads_); ++i)
               {
                 // Offset of first piece is zero. Pieces 1,...,threads-1 have equal size while the last one might be greater.
                 if (i == 0) Offset[i][level] = 0;
-                else if (i == _threads) Offset[i][level] = A[level].size1();
-                else Offset[i][level] = i*(A[level].size1()/_threads);
+                else if (i == threads_) Offset[i][level] = static_cast<unsigned int>(A[level].size1());
+                else Offset[i][level] = static_cast<unsigned int>(i*(A[level].size1()/threads_));
               }
-            }   
-            
+            }
+
             /** @brief Slices mat into pieces determined by this->Offset
             * @param level    Level to which Slices are saved
             * @param A     System matrix on all levels
@@ -1265,51 +1265,51 @@ namespace viennacl
             {
               typedef typename SparseMatrixType::const_iterator1 ConstRowIterator;
               typedef typename SparseMatrixType::const_iterator2 ConstColIterator;
-              
+
               unsigned int x, y;
               amg_point *point;
-              
-            #ifdef _OPENMP
+
+            #ifdef VIENNACL_WITH_OPENMP
               #pragma omp parallel for private (x,y,point)
             #endif
-              for (unsigned int i=0; i<_threads; ++i)
+              for (long i=0; i<static_cast<long>(threads_); ++i)
               {
                 // Allocate space for the matrix slice and the pointvector.
                 A_slice[i][level] = SparseMatrixType(Offset[i+1][level]-Offset[i][level],Offset[i+1][level]-Offset[i][level]);
                 Pointvector_slice[i][level] = PointVectorType(Offset[i+1][level]-Offset[i][level]);
-                
+
                 // Iterate over the part that belongs to thread i (from Offset[i][level] to Offset[i+1][level]).
                 ConstRowIterator row_iter = A[level].begin1();
                 row_iter += Offset[i][level];
-                x = row_iter.index1();
-                    
+                x = static_cast<unsigned int>(row_iter.index1());
+
                 while (x < Offset[i+1][level] && row_iter != A[level].end1())
                 {
                   // Set offset for point index and save point for the respective thread
                   point = Pointvector[level][x];
                   point->set_offset(Offset[i][level]);
                   Pointvector_slice[i][level].add_point(point);
-                  
+
                   ConstColIterator col_iter = row_iter.begin();
-                  y = col_iter.index2();
-                  
+                  y = static_cast<unsigned int>(col_iter.index2());
+
                   // Save all coefficients from the matrix slice
                   while (y < Offset[i+1][level] && col_iter != row_iter.end())
                   {
                     if (y >= Offset[i][level])
                 A_slice[i][level](x-Offset[i][level],y-Offset[i][level]) = *col_iter;
-                    
+
                     ++col_iter;
-                    y = col_iter.index2();
+                    y = static_cast<unsigned int>(col_iter.index2());
                   }
-                  
+
                   ++row_iter;
-                  x = row_iter.index1();
+                  x = static_cast<unsigned int>(row_iter.index1());
                 }
               }
             }
-        };  
-        
+        };
+
         /** @brief Sparse matrix product. Calculates RES = A*B.
           * @param A    Left Matrix
           * @param B    Right Matrix
@@ -1321,35 +1321,35 @@ namespace viennacl
           typedef typename SparseMatrixType::value_type ScalarType;
           typedef typename SparseMatrixType::iterator1 InternalRowIterator;
           typedef typename SparseMatrixType::iterator2 InternalColIterator;
-          
-          unsigned int x,y,z;
+
+          long x,y,z;
           ScalarType prod;
-          RES = SparseMatrixType(A.size1(), B.size2());
+          RES = SparseMatrixType(static_cast<unsigned int>(A.size1()), static_cast<unsigned int>(B.size2()));
           RES.clear();
-          
-    #ifdef _OPENMP
-          #pragma omp parallel for private (x,y,z,prod) shared (A,B,RES)
+
+    #ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for private (x,y,z,prod)
     #endif
-          for (x=0; x<A.size1(); ++x)
+          for (x=0; x<static_cast<long>(A.size1()); ++x)
           {
             InternalRowIterator row_iter = A.begin1();
             row_iter += x;
             for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
             {
-              y = col_iter.index2(); 
+              y = static_cast<unsigned int>(col_iter.index2());
               InternalRowIterator row_iter2 = B.begin1();
               row_iter2 += y;
 
               for(InternalColIterator col_iter2 = row_iter2.begin(); col_iter2 != row_iter2.end(); ++col_iter2)
               {
-                z = col_iter2.index2();
+                z = static_cast<unsigned int>(col_iter2.index2());
                 prod = *col_iter * *col_iter2;
                 RES.add(x,z,prod);
               }
             }
           }
         }
-        
+
         /** @brief Sparse Galerkin product: Calculates RES = trans(P)*A*P
           * @param A    Operator matrix (quadratic)
           * @param P    Prolongation/Interpolation matrix
@@ -1361,30 +1361,30 @@ namespace viennacl
           typedef typename SparseMatrixType::value_type ScalarType;
           typedef typename SparseMatrixType::iterator1 InternalRowIterator;
           typedef typename SparseMatrixType::iterator2 InternalColIterator;
-          
-          unsigned int x,y1,y2,z;
+
+          long x,y1,y2,z;
           amg_sparsevector<ScalarType> row;
-          RES = SparseMatrixType(P.size2(), P.size2());
+          RES = SparseMatrixType(static_cast<unsigned int>(P.size2()), static_cast<unsigned int>(P.size2()));
           RES.clear();
-          
-    #ifdef _OPENMP
-          #pragma omp parallel for private (x,y1,y2,z,row) shared (A,P,RES)
-    #endif      
-          for (x=0; x<P.size2(); ++x)
+
+    #ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for private (x,y1,y2,z,row)
+    #endif
+          for (x=0; x<static_cast<long>(P.size2()); ++x)
           {
-            row = amg_sparsevector<ScalarType>(A.size2());
+            row = amg_sparsevector<ScalarType>(static_cast<unsigned int>(A.size2()));
             InternalRowIterator row_iter = P.begin1(true);
             row_iter += x;
 
             for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
             {
-              y1 = col_iter.index2(); 
+              y1 = static_cast<long>(col_iter.index2());
               InternalRowIterator row_iter2 = A.begin1();
               row_iter2 += y1;
-              
+
               for(InternalColIterator col_iter2 = row_iter2.begin(); col_iter2 != row_iter2.end(); ++col_iter2)
               {
-                y2 = col_iter2.index2();
+                y2 = static_cast<long>(col_iter2.index2());
                 row.add (y2, *col_iter * *col_iter2);
               }
             }
@@ -1393,21 +1393,21 @@ namespace viennacl
               y2 = iter.index();
               InternalRowIterator row_iter3 = P.begin1();
               row_iter3 += y2;
-              
+
               for (InternalColIterator col_iter3 = row_iter3.begin(); col_iter3 != row_iter3.end(); ++col_iter3)
               {
-                z = col_iter3.index2();
+                z = static_cast<long>(col_iter3.index2());
                 RES.add (x, z, *col_iter3 * *iter);
               }
             }
           }
-          
-          #ifdef DEBUG
+
+          #ifdef VIENNACL_AMG_DEBUG
           std::cout << "Galerkin Operator: " << std::endl;
           printmatrix (RES);
           #endif
         }
-        
+
         /** @brief Test triple-matrix product by comparing it to ublas functions. Very slow for large matrices!
           * @param A    Operator matrix (quadratic)
           * @param P    Prolongation/Interpolation matrix
@@ -1417,7 +1417,7 @@ namespace viennacl
         void test_triplematprod(SparseMatrixType & A, SparseMatrixType & P, SparseMatrixType  & A_i1)
         {
           typedef typename SparseMatrixType::value_type ScalarType;
-          
+
           boost::numeric::ublas::compressed_matrix<ScalarType> A_temp (A.size1(), A.size2());
           A_temp = A;
           boost::numeric::ublas::compressed_matrix<ScalarType> P_temp (P.size1(), P.size2());
@@ -1426,22 +1426,22 @@ namespace viennacl
           boost::numeric::ublas::compressed_matrix<ScalarType> R_temp (P.size1(), P.size2());
           R_temp = P;
           P.set_trans(false);
-          
+
           boost::numeric::ublas::compressed_matrix<ScalarType> RA (R_temp.size1(),A_temp.size2());
           RA = boost::numeric::ublas::prod(R_temp,A_temp);
           boost::numeric::ublas::compressed_matrix<ScalarType> RAP (RA.size1(),P_temp.size2());
           RAP = boost::numeric::ublas::prod(RA,P_temp);
-          
+
           for (unsigned int x=0; x<RAP.size1(); ++x)
           {
             for (unsigned int y=0; y<RAP.size2(); ++y)
             {
-              if (abs((ScalarType)RAP(x,y) - (ScalarType)A_i1(x,y)) > 0.0001)
+              if (std::fabs(static_cast<ScalarType>(RAP(x,y)) - static_cast<ScalarType>(A_i1(x,y))) > 0.0001)
                 std::cout << x << " " << y << " " << RAP(x,y) << " " << A_i1(x,y) << std::endl;
-            } 
+            }
           }
         }
-        
+
         /** @brief Test if interpolation matrix makes sense. Only vanilla test though! Only checks if basic requirements are met!
           * @param A    Operator matrix (quadratic)
           * @param P    Prolongation/Interpolation matrix
@@ -1468,7 +1468,7 @@ namespace viennacl
                 }
               }
             }
-            
+
             if (Pointvector.is_fpoint(i))
               for (unsigned int j=0; j<P.size2(); ++j)
               {
@@ -1482,7 +1482,7 @@ namespace viennacl
                     if (P.isnonzero(k,j))
                     {
                       if (Pointvector.is_cpoint(k) && P(k,j) == 1 && A.isnonzero(i,k))
-                        set = true;      
+                        set = true;
                     }
                   }
                   if (!set)
@@ -1491,8 +1491,8 @@ namespace viennacl
               }
             }
         }
-        
-        
+
+
       } //namespace amg
     }
   }
diff --git a/viennacl/linalg/detail/amg/amg_coarse.hpp b/viennacl/linalg/detail/amg/amg_coarse.hpp
index 3a73534..6dc1260 100644
--- a/viennacl/linalg/detail/amg/amg_coarse.hpp
+++ b/viennacl/linalg/detail/amg/amg_coarse.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_COARSE_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -22,24 +23,24 @@
 */
 
 #include <cmath>
-#include "viennacl/linalg/amg.hpp"
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
 
 #include <map>
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
 #include <omp.h>
 #endif
 
-#include "amg_debug.hpp"
+#include "viennacl/linalg/detail/amg/amg_debug.hpp"
 
 namespace viennacl
 {
   namespace linalg
-  {    
+  {
     namespace detail
     {
       namespace amg
       {
-    
+
     /** @brief Calls the right coarsening procedure
       * @param level    Coarse level identifier
       * @param A    Operator matrix on all levels
@@ -50,16 +51,16 @@ namespace viennacl
     template <typename InternalType1, typename InternalType2, typename InternalType3>
     void amg_coarse(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, InternalType3 & Slicing, amg_tag & tag)
     {
-  switch (tag.get_coarse())
-  {
-    case VIENNACL_AMG_COARSE_RS: amg_coarse_classic (level, A, Pointvector, tag); break;
-    case VIENNACL_AMG_COARSE_ONEPASS: amg_coarse_classic_onepass (level, A, Pointvector, tag); break;
-    case VIENNACL_AMG_COARSE_RS0: amg_coarse_rs0 (level, A, Pointvector, Slicing, tag); break;
-    case VIENNACL_AMG_COARSE_RS3: amg_coarse_rs3 (level, A, Pointvector, Slicing, tag); break;
-    case VIENNACL_AMG_COARSE_AG:   amg_coarse_ag (level, A, Pointvector, tag); break;
-  }
-    } 
-    
+      switch (tag.get_coarse())
+      {
+        case VIENNACL_AMG_COARSE_RS: amg_coarse_classic (level, A, Pointvector, tag); break;
+        case VIENNACL_AMG_COARSE_ONEPASS: amg_coarse_classic_onepass (level, A, Pointvector, tag); break;
+        case VIENNACL_AMG_COARSE_RS0: amg_coarse_rs0 (level, A, Pointvector, Slicing, tag); break;
+        case VIENNACL_AMG_COARSE_RS3: amg_coarse_rs3 (level, A, Pointvector, Slicing, tag); break;
+        case VIENNACL_AMG_COARSE_AG:   amg_coarse_ag (level, A, Pointvector, tag); break;
+      }
+    }
+
     /** @brief Determines strong influences in system matrix, classical approach (RS). Multithreaded!
     * @param level    Coarse level identifier
     * @param A      Operator matrix on all levels
@@ -75,67 +76,67 @@ namespace viennacl
       typedef typename SparseMatrixType::value_type ScalarType;
       typedef typename SparseMatrixType::const_iterator1 ConstRowIterator;
       typedef typename SparseMatrixType::const_iterator2 ConstColIterator;
-      
+
       ScalarType max;
       int diag_sign;
       //unsigned int i;
-        
-#ifdef _OPENMP
-      #pragma omp parallel for private (max,diag_sign) shared (A,Pointvector)
-#endif      
-      for (unsigned int i=0; i<A[level].size1(); ++i)
-      {  
-  diag_sign = 1;
-  if (A[level](i,i) < 0)
-    diag_sign = -1;
-  
-  ConstRowIterator row_iter = A[level].begin1();
-  row_iter += i;
-  // Find greatest non-diagonal negative value (positive if diagonal is negative) in row
-  max = 0;
-  for (ConstColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-  {
-      if (i == (unsigned int) col_iter.index2()) continue;
-      if (diag_sign == 1)
-        if (max > *col_iter)  max = *col_iter;
-      if (diag_sign == -1)
-        if (max < *col_iter)  max = *col_iter;
-  }
-  
-  // If maximum is 0 then the row is independent of the others
-  if (max == 0)
-    continue;
-  
-  // Find all points that strongly influence current point (Yang, p.5)
-  for (ConstColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-  {
-    unsigned int j = col_iter.index2();  
-    if (i == j) continue;
-    if (diag_sign * (-*col_iter) >= tag.get_threshold() * (diag_sign * (-max)))
-    {
-      // Strong influence from j to i found, save information
-      Pointvector[level][i]->add_influencing_point(Pointvector[level][j]);
-    }
-  }
+
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for private (max,diag_sign)
+#endif
+      for (long i=0; i<static_cast<long>(A[level].size1()); ++i)
+      {
+        diag_sign = 1;
+        if (A[level](i,i) < 0)
+          diag_sign = -1;
+
+        ConstRowIterator row_iter = A[level].begin1();
+        row_iter += i;
+        // Find greatest non-diagonal negative value (positive if diagonal is negative) in row
+        max = 0;
+        for (ConstColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+        {
+            if (i == (unsigned int) col_iter.index2()) continue;
+            if (diag_sign == 1)
+              if (max > *col_iter)  max = *col_iter;
+            if (diag_sign == -1)
+              if (max < *col_iter)  max = *col_iter;
+        }
+
+        // If maximum is 0 then the row is independent of the others
+        if (max == 0)
+          continue;
+
+        // Find all points that strongly influence current point (Yang, p.5)
+        for (ConstColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+        {
+          unsigned int j = static_cast<unsigned int>(col_iter.index2());
+          if (i == j) continue;
+          if (diag_sign * (-*col_iter) >= tag.get_threshold() * (diag_sign * (-max)))
+          {
+            // Strong influence from j to i found, save information
+            Pointvector[level][i]->add_influencing_point(Pointvector[level][j]);
+          }
+        }
       }
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Influence Matrix: " << std::endl;
       boost::numeric::ublas::matrix<bool> mat;
       Pointvector[level].get_influence_matrix(mat);
       printmatrix (mat);
       #endif
-      
+
       // Save influenced points
       for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
       {
-  for (typename amg_point::iterator iter2 = (*iter)->begin_influencing(); iter2 != (*iter)->end_influencing(); ++iter2)
-  {
-    (*iter2)->add_influenced_point(*iter);
-  }
+        for (typename amg_point::iterator iter2 = (*iter)->begin_influencing(); iter2 != (*iter)->end_influencing(); ++iter2)
+        {
+          (*iter2)->add_influenced_point(*iter);
+        }
       }
-        
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Influence Measures: " << std::endl;
       boost::numeric::ublas::vector<unsigned int> temp;
       Pointvector[level].get_influence(temp);
@@ -143,9 +144,9 @@ namespace viennacl
       std::cout << "Point Sorting: " << std::endl;
       Pointvector[level].get_sorting(temp);
       printvector (temp);
-      #endif 
+      #endif
     }
-        
+
     /** @brief Classical (RS) one-pass coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_CLASSIC_ONEPASS)
     * @param level     Course level identifier
     * @param A      Operator matrix on all levels
@@ -155,73 +156,66 @@ namespace viennacl
     template <typename InternalType1, typename InternalType2>
     void amg_coarse_classic_onepass(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, amg_tag & tag)
     {
-      typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
-      typedef typename SparseMatrixType::value_type ScalarType;
-      
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
       amg_point* c_point, *point1, *point2;
-      unsigned int i;
-        
+
       // Check and save all strong influences
-      amg_influence (level, A, Pointvector, tag);    
-      
+      amg_influence (level, A, Pointvector, tag);
+
       // Traverse through points and calculate initial influence measure
-#ifdef _OPENMP
-      #pragma omp parallel for private (i) shared (Pointvector)
-#endif      
-      for (i=0; i<Pointvector[level].size(); ++i)
+      long i;
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for private (i)
+#endif
+      for (i=0; i<static_cast<long>(Pointvector[level].size()); ++i)
   Pointvector[level][i]->calc_influence();
-      
+
        // Do initial sorting
       Pointvector[level].sort();
-      
+
       // Get undecided point with highest influence measure
       while ((c_point = Pointvector[level].get_nextpoint()) != NULL)
-      {    
-  // Make this point C point
-  Pointvector[level].make_cpoint(c_point);
-  
-  // All strongly influenced points become F points
-  for (typename amg_point::iterator iter = c_point->begin_influenced(); iter != c_point->end_influenced(); ++iter)
-  {
-    point1 = *iter;
-    // Found strong influence from C point (c_point influences point1), check whether point is still undecided, otherwise skip
-    if (!point1->is_undecided()) continue;
-    // Make this point F point if it is still undecided point
-    Pointvector[level].make_fpoint(point1);
-    
-    // Add +1 to influence measure for all undecided points that strongly influence new F point
-    for (typename amg_point::iterator iter2 = point1->begin_influencing(); iter2 != point1->end_influencing(); ++iter2)
-    {
-      point2 = *iter2;
-      // Found strong influence to F point (point2 influences point1)
-      if (point2->is_undecided())
-        Pointvector[level].add_influence(point2,1);
-    }
-  }
+      {
+        // Make this point C point
+        Pointvector[level].make_cpoint(c_point);
+
+        // All strongly influenced points become F points
+        for (typename amg_point::iterator iter = c_point->begin_influenced(); iter != c_point->end_influenced(); ++iter)
+        {
+          point1 = *iter;
+          // Found strong influence from C point (c_point influences point1), check whether point is still undecided, otherwise skip
+          if (!point1->is_undecided()) continue;
+          // Make this point F point if it is still undecided point
+          Pointvector[level].make_fpoint(point1);
+
+          // Add +1 to influence measure for all undecided points that strongly influence new F point
+          for (typename amg_point::iterator iter2 = point1->begin_influencing(); iter2 != point1->end_influencing(); ++iter2)
+          {
+            point2 = *iter2;
+            // Found strong influence to F point (point2 influences point1)
+            if (point2->is_undecided())
+              Pointvector[level].add_influence(point2,1);
+          }
+        }
       }
-      
+
       // If a point is neither C nor F point but is nevertheless influenced by other points make it F point
       // (this situation can happen when this point does not influence other points and the points that influence this point became F points already)
       /*#pragma omp parallel for private (i,point1)
-      for (i=0; i<Pointvector[level].size(); ++i)
+      for (long i=0; i<static_cast<long>(Pointvector[level].size()); ++i)
       {
-  point1 = Pointvector[level][i];
-  if (point1->is_undecided())
-  {
-    // Undecided point found. Check whether it is influenced by other point and if so: Make it F point.
-    if (point1->number_influencing() > 0)
-    {
-      #pragma omp critical
-      Pointvector[level].make_fpoint(point1);
-    }
-  }
+        point1 = Pointvector[level][i];
+        if (point1->is_undecided())
+        {
+          // Undecided point found. Check whether it is influenced by other point and if so: Make it F point.
+          if (point1->number_influencing() > 0)
+          {
+            #pragma omp critical
+            Pointvector[level].make_fpoint(point1);
+          }
+        }
       }*/
 
-      #if defined (DEBUG)//  or defined (DEBUGBENCH)
+      #if defined (VIENNACL_AMG_DEBUG)//  or defined (VIENNACL_AMG_DEBUGBENCH)
       unsigned int c_points = Pointvector[level].get_cpoints();
       unsigned int f_points = Pointvector[level].get_fpoints();
       std::cout << "1st pass: Level " << level << ": ";
@@ -229,7 +223,7 @@ namespace viennacl
       std::cout << "No of F points = " << f_points << std::endl;
       #endif
 
-      #ifdef DEBUG
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Coarse Points:" << std::endl;
       boost::numeric::ublas::vector<bool> C;
       Pointvector[level].get_C(C);
@@ -239,8 +233,8 @@ namespace viennacl
       Pointvector[level].get_F(F);
       printvector (F);
       #endif
-    }    
-        
+    }
+
     /** @brief Classical (RS) two-pass coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_CLASSIC)
     * @param level    Coarse level identifier
     * @param A      Operator matrix on all levels
@@ -250,99 +244,94 @@ namespace viennacl
     template <typename InternalType1, typename InternalType2>
     void amg_coarse_classic(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, amg_tag & tag)
     {
-      typedef typename InternalType1::value_type SparseMatrixType;
       typedef typename InternalType2::value_type PointVectorType;
-      typedef typename SparseMatrixType::value_type ScalarType;
-      
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+
       bool add_C;
-      amg_point *c_point, *point1, *point2;     
-      
+      amg_point *c_point, *point1, *point2;
+
       // Use one-pass-coarsening as first pass.
       amg_coarse_classic_onepass(level, A, Pointvector, tag);
-    
+
       // 2nd pass: Add more C points if F-F connection does not have a common C point.
       for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
       {
-  point1 = *iter;
-  // If point is F point, check for strong connections.
-  if (point1->is_fpoint())
-  {
-    // Check for strong connections from influencing and influenced points.
-    amg_point::iterator iter2 = point1->begin_influencing();
-    amg_point::iterator iter3 = point1->begin_influenced();
-    
-    // Iterate over both lists at once. This makes sure that points are no checked twice when influence relation is symmetric (which is often the case).
-    // Note: Only works because influencing and influenced lists are sorted by point-index.
-    while(iter2 != point1->end_influencing() || iter3 != point1->end_influenced())
-    {     
-      if (iter2 == point1->end_influencing())
-      {
-        point2 = *iter3;
-        ++iter3;
-      }
-      else if (iter3 == point1->end_influenced())
-      {
-        point2 = *iter2;
-        ++iter2;
-      }
-      else
-      {      
-        if ((*iter2)->get_index() == (*iter3)->get_index())   
+        point1 = *iter;
+        // If point is F point, check for strong connections.
+        if (point1->is_fpoint())
         {
-    point2 = *iter2;
-    ++iter2;
-    ++iter3;
-        }
-        else if ((*iter2)->get_index() < (*iter3)->get_index())
-        {
-    point2 = *iter2;
-    ++iter2;
-        }
-        else
-        {
-    point2 = *iter3;
-    ++iter3;
-        }
-      }
-      // Only check points with higher index as points with lower index have been checked already.
-      if (point2->get_index() < point1->get_index())
-        continue;
-      
-      // If there is a strong connection then it has to either be a C point or a F point with common C point.
-      // C point? Then skip as everything is ok.
-      if (point2->is_cpoint())
-        continue;
-      // F point? Then check whether F points point1 and point2 have a common C point.
-      if (point2->is_fpoint())
-      {
-        add_C = true;
-        // C point is common for two F points if they are both strongly influenced by that C point.
-        // Compare strong influences for point1 and point2.
-        for (amg_point::iterator iter3 = point1->begin_influencing(); iter3 != point1 -> end_influencing(); ++iter3)
-        {
-    c_point = *iter3;
-    // Stop search when strong common influence is found via c_point.
-    if (c_point->is_cpoint())
-    {
-      if (point2->is_influencing(c_point))
-      {
-        add_C = false;
-        break;            
-      }
-    }
+          // Check for strong connections from influencing and influenced points.
+          amg_point::iterator iter2 = point1->begin_influencing();
+          amg_point::iterator iter3 = point1->begin_influenced();
+
+          // Iterate over both lists at once. This makes sure that points are no checked twice when influence relation is symmetric (which is often the case).
+          // Note: Only works because influencing and influenced lists are sorted by point-index.
+          while(iter2 != point1->end_influencing() || iter3 != point1->end_influenced())
+          {
+            if (iter2 == point1->end_influencing())
+            {
+              point2 = *iter3;
+              ++iter3;
+            }
+            else if (iter3 == point1->end_influenced())
+            {
+              point2 = *iter2;
+              ++iter2;
+            }
+            else
+            {
+              if ((*iter2)->get_index() == (*iter3)->get_index())
+              {
+                point2 = *iter2;
+                ++iter2;
+                ++iter3;
+              }
+              else if ((*iter2)->get_index() < (*iter3)->get_index())
+              {
+                point2 = *iter2;
+                ++iter2;
+              }
+              else
+              {
+                point2 = *iter3;
+                ++iter3;
+              }
+            }
+            // Only check points with higher index as points with lower index have been checked already.
+            if (point2->get_index() < point1->get_index())
+              continue;
+
+            // If there is a strong connection then it has to either be a C point or a F point with common C point.
+            // C point? Then skip as everything is ok.
+            if (point2->is_cpoint())
+              continue;
+            // F point? Then check whether F points point1 and point2 have a common C point.
+            if (point2->is_fpoint())
+            {
+              add_C = true;
+              // C point is common for two F points if they are both strongly influenced by that C point.
+              // Compare strong influences for point1 and point2.
+              for (amg_point::iterator iter3 = point1->begin_influencing(); iter3 != point1 -> end_influencing(); ++iter3)
+              {
+                c_point = *iter3;
+                // Stop search when strong common influence is found via c_point.
+                if (c_point->is_cpoint())
+                {
+                  if (point2->is_influencing(c_point))
+                  {
+                    add_C = false;
+                    break;
+                  }
+                }
+              }
+              // No common C point found? Then make second F point to C point.
+              if (add_C == true)
+                Pointvector[level].switch_ftoc(point2);
+            }
+          }
         }
-        // No common C point found? Then make second F point to C point.
-        if (add_C == true)
-    Pointvector[level].switch_ftoc(point2);
-      }
-    }
-  }
       }
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "After 2nd pass:" << std::endl;
       std::cout << "Coarse Points:" << std::endl;
       boost::numeric::ublas::vector<bool> C;
@@ -354,10 +343,10 @@ namespace viennacl
       printvector (F);
       #endif
 
-      #ifdef DEBUG
-#ifdef _OPENMP
+      #ifdef VIENNACL_AMG_DEBUG
+#ifdef VIENNACL_WITH_OPENMP
       #pragma omp critical
-#endif      
+#endif
       {
       std::cout << "No C and no F point: ";
       for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
@@ -378,77 +367,70 @@ namespace viennacl
     template <typename InternalType1, typename InternalType2, typename InternalType3>
     void amg_coarse_rs0(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, InternalType3 & Slicing, amg_tag & tag)
     {
-      typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
-      typedef typename SparseMatrixType::value_type ScalarType;
-      
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
       unsigned int total_points;
-      
+
       // Slice matrix into parts such that points are distributed among threads
-      Slicing.slice(level, A, Pointvector);     
-      
+      Slicing.slice(level, A, Pointvector);
+
       // Run classical coarsening in parallel
       total_points = 0;
-#ifdef _OPENMP
-      #pragma omp parallel for shared (total_points,Slicing,level)
-#endif      
-      for (unsigned int i=0; i<Slicing._threads; ++i)
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for
+#endif
+      for (long i=0; i<static_cast<long>(Slicing.threads_); ++i)
       {
-  amg_coarse_classic(level,Slicing.A_slice[i],Slicing.Pointvector_slice[i],tag);
-  
-  // Save C points (using Slicing.Offset on the next level as temporary memory)
-  // Note: Number of C points for point i is saved in i+1!! (makes it easier later to compute offset)
-  Slicing.Offset[i+1][level+1] = Slicing.Pointvector_slice[i][level].get_cpoints();
-#ifdef _OPENMP
-  #pragma omp critical
-#endif  
-  total_points += Slicing.Pointvector_slice[i][level].get_cpoints();
-      }      
-      
+        amg_coarse_classic(level,Slicing.A_slice[i],Slicing.Pointvector_slice[i],tag);
+
+        // Save C points (using Slicing.Offset on the next level as temporary memory)
+        // Note: Number of C points for point i is saved in i+1!! (makes it easier later to compute offset)
+        Slicing.Offset[i+1][level+1] = Slicing.Pointvector_slice[i][level].get_cpoints();
+      #ifdef VIENNACL_WITH_OPENMP
+        #pragma omp critical
+      #endif
+        total_points += Slicing.Pointvector_slice[i][level].get_cpoints();
+      }
+
       // If no coarser level can be found on any level then resume and coarsening will stop in amg_coarse()
       if (total_points != 0)
-      {    
-#ifdef _OPENMP
-  #pragma omp parallel for shared (Slicing)
-#endif  
-  for (unsigned int i=0; i<Slicing._threads; ++i)
-  {
-    // If no higher coarse level can be found on slice i (saved in Slicing.Offset[i+1][level+1]) then pull C point(s) to the next level
-    if (Slicing.Offset[i+1][level+1] == 0)
-    {
-      // All points become C points
-      for (unsigned int j=0; j<Slicing.A_slice[i][level].size1(); ++j)
-        Slicing.Pointvector_slice[i][level].make_cpoint(Slicing.Pointvector_slice[i][level][j]);
-      Slicing.Offset[i+1][level+1] = Slicing.A_slice[i][level].size1();
-    }
-  }
-    
-  // Build slicing offset from number of C points (offset = total sum of C points on threads with lower number)
-  for (unsigned int i=2; i<=Slicing._threads; ++i)
-    Slicing.Offset[i][level+1] += Slicing.Offset[i-1][level+1];
-      
-  // Join C and F points
-  Slicing.join(level, Pointvector);
+      {
+      #ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+      #endif
+        for (long i=0; i<static_cast<long>(Slicing.threads_); ++i)
+        {
+          // If no higher coarse level can be found on slice i (saved in Slicing.Offset[i+1][level+1]) then pull C point(s) to the next level
+          if (Slicing.Offset[i+1][level+1] == 0)
+          {
+            // All points become C points
+            for (unsigned int j=0; j<Slicing.A_slice[i][level].size1(); ++j)
+              Slicing.Pointvector_slice[i][level].make_cpoint(Slicing.Pointvector_slice[i][level][j]);
+            Slicing.Offset[i+1][level+1] = static_cast<unsigned int>(Slicing.A_slice[i][level].size1());
+          }
+        }
+
+        // Build slicing offset from number of C points (offset = total sum of C points on threads with lower number)
+        for (unsigned int i=2; i<=Slicing.threads_; ++i)
+          Slicing.Offset[i][level+1] += Slicing.Offset[i-1][level+1];
+
+        // Join C and F points
+        Slicing.join(level, Pointvector);
       }
-      
+
       // Calculate global influence measures for interpolation and/or RS3.
-      amg_influence(level, A, Pointvector, tag); 
-      
-      #if defined(DEBUG)// or defined (DEBUGBENCH)
+      amg_influence(level, A, Pointvector, tag);
+
+      #if defined(VIENNACL_AMG_DEBUG)// or defined (VIENNACL_AMG_DEBUGBENCH)
       for (unsigned int i=0; i<Slicing._threads; ++i)
       {
-  unsigned int c_points = Slicing.Pointvector_slice[i][level].get_cpoints();
-  unsigned int f_points = Slicing.Pointvector_slice[i][level].get_fpoints();
-  std::cout << "Thread " << i << ": ";
-  std::cout << "No of C points = " << c_points << ", ";
-  std::cout << "No of F points = " << f_points << std::endl;
+        unsigned int c_points = Slicing.Pointvector_slice[i][level].get_cpoints();
+        unsigned int f_points = Slicing.Pointvector_slice[i][level].get_fpoints();
+        std::cout << "Thread " << i << ": ";
+        std::cout << "No of C points = " << c_points << ", ";
+        std::cout << "No of F points = " << f_points << std::endl;
       }
       #endif
     }
-    
+
     /** @brief RS3 coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_RS3)
     * @param level    Coarse level identifier
     * @param A      Operator matrix on all levels
@@ -459,236 +441,215 @@ namespace viennacl
     template <typename InternalType1, typename InternalType2, typename InternalType3>
     void amg_coarse_rs3(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, InternalType3 & Slicing, amg_tag & tag)
     {
-      typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
-      typedef typename SparseMatrixType::value_type ScalarType;
-      
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
       amg_point *c_point, *point1, *point2;
       bool add_C;
       unsigned int i, j;
-            
+
       // Run RS0 first (parallel).
       amg_coarse_rs0(level, A, Pointvector, Slicing, tag);
-      
+
       // Save slicing offset
       boost::numeric::ublas::vector<unsigned int> Offset = boost::numeric::ublas::vector<unsigned int> (Slicing.Offset.size());
       for (i=0; i<Slicing.Offset.size(); ++i)
-  Offset[i] = Slicing.Offset[i][level];
-      
+        Offset[i] = Slicing.Offset[i][level];
+
       // Correct the coarsening with a third pass: Don't allow strong F-F connections without common C point
-      for (i=0; i<Slicing._threads; ++i)
+      for (i=0; i<Slicing.threads_; ++i)
       {
-  //for (j=Slicing.Offset[i][level]; j<Slicing.Offset[i+1][level]; ++j)
-  for (j=Offset[i]; j<Offset[i+1]; ++j)
-  {
-    point1 = Pointvector[level][j];
-    // If point is F point, check for strong connections.
-    if (point1->is_fpoint())
-    {
-      // Check for strong connections from influencing and influenced points.
-      amg_point::iterator iter2 = point1->begin_influencing();
-      amg_point::iterator iter3 = point1->begin_influenced();
-      
-      // Iterate over both lists at once. This makes sure that points are no checked twice when influence relation is symmetric (which is often the case).
-      // Note: Only works because influencing and influenced lists are sorted by point-index.
-      while(iter2 != point1->end_influencing() || iter3 != point1->end_influenced())
-      {     
-        if (iter2 == point1->end_influencing())
-        {
-    point2 = *iter3;
-    ++iter3;
-        }
-        else if (iter3 == point1->end_influenced())
-        {
-    point2 = *iter2;
-    ++iter2;
-        }
-        else
-        {      
-    if ((*iter2)->get_index() == (*iter3)->get_index())   
-    {
-      point2 = *iter2;
-      ++iter2;
-      ++iter3;
-    }
-    else if ((*iter2)->get_index() < (*iter3)->get_index())
-    {
-      point2 = *iter2;
-      ++iter2;
-    }
-    else
-    {
-      point2 = *iter3;
-      ++iter3;
-    }
-        }
-              
-        // Only check points with higher index as points with lower index have been checked already.
-        if (point2->get_index() < point1->get_index())
-    continue;
-                
-        // Only check points that are outside the slicing boundaries (interior F-F connections have already been checked in second pass)
-        //if (point2->get_index() >= Slicing.Offset[i][level] || point2->get_index() < Slicing.Offset[i+1][level])
-        if (point2->get_index() >= Offset[i] && point2->get_index() < Offset[i+1])
-    continue;
-        
-        // If there is a strong connection then it has to either be a C point or a F point with common C point.
-        // C point? Then skip as everything is ok.
-        if (point2->is_cpoint())
-    continue;
-        // F point? Then check whether F points point1 and point2 have a common C point.
-        if (point2->is_fpoint())
-        {
-    add_C = true;
-    // C point is common for two F points if they are both strongly influenced by that C point.
-    // Compare strong influences for point1 and point2.
-    for (amg_point::iterator iter3 = point1->begin_influencing(); iter3 != point1 -> end_influencing(); ++iter3)
-    {
-      c_point = *iter3;
-      // Stop search when strong common influence is found via c_point.
-      if (c_point->is_cpoint())
+      //for (j=Slicing.Offset[i][level]; j<Slicing.Offset[i+1][level]; ++j)
+      for (j=Offset[i]; j<Offset[i+1]; ++j)
       {
-        if (point2->is_influencing(c_point))
+        point1 = Pointvector[level][j];
+        // If point is F point, check for strong connections.
+        if (point1->is_fpoint())
         {
-          add_C = false;
-          break;            
-        }
-      }
-    }
-    // No common C point found? Then make second F point to C point.
-    if (add_C == true)
-    {
-      Pointvector[level].switch_ftoc(point2);
-      // Add +1 to offsets as one C point has been added.
-      for (unsigned int j=i+1; j<=Slicing._threads; ++j)
-        Slicing.Offset[j][level+1]++;
-    }
+          // Check for strong connections from influencing and influenced points.
+          amg_point::iterator iter2 = point1->begin_influencing();
+          amg_point::iterator iter3 = point1->begin_influenced();
+
+          // Iterate over both lists at once. This makes sure that points are no checked twice when influence relation is symmetric (which is often the case).
+          // Note: Only works because influencing and influenced lists are sorted by point-index.
+          while(iter2 != point1->end_influencing() || iter3 != point1->end_influenced())
+          {
+            if (iter2 == point1->end_influencing())
+            {
+              point2 = *iter3;
+              ++iter3;
+            }
+            else if (iter3 == point1->end_influenced())
+            {
+              point2 = *iter2;
+              ++iter2;
+            }
+            else
+            {
+              if ((*iter2)->get_index() == (*iter3)->get_index())
+              {
+                point2 = *iter2;
+                ++iter2;
+                ++iter3;
+              }
+              else if ((*iter2)->get_index() < (*iter3)->get_index())
+              {
+                point2 = *iter2;
+                ++iter2;
+              }
+              else
+              {
+                point2 = *iter3;
+                ++iter3;
+              }
+            }
+
+            // Only check points with higher index as points with lower index have been checked already.
+            if (point2->get_index() < point1->get_index())
+              continue;
+
+            // Only check points that are outside the slicing boundaries (interior F-F connections have already been checked in second pass)
+            //if (point2->get_index() >= Slicing.Offset[i][level] || point2->get_index() < Slicing.Offset[i+1][level])
+            if (point2->get_index() >= Offset[i] && point2->get_index() < Offset[i+1])
+              continue;
+
+            // If there is a strong connection then it has to either be a C point or a F point with common C point.
+            // C point? Then skip as everything is ok.
+            if (point2->is_cpoint())
+              continue;
+            // F point? Then check whether F points point1 and point2 have a common C point.
+            if (point2->is_fpoint())
+            {
+              add_C = true;
+              // C point is common for two F points if they are both strongly influenced by that C point.
+              // Compare strong influences for point1 and point2.
+              for (amg_point::iterator iter3 = point1->begin_influencing(); iter3 != point1 -> end_influencing(); ++iter3)
+              {
+                c_point = *iter3;
+                // Stop search when strong common influence is found via c_point.
+                if (c_point->is_cpoint())
+                {
+                  if (point2->is_influencing(c_point))
+                  {
+                    add_C = false;
+                    break;
+                  }
+                }
+              }
+              // No common C point found? Then make second F point to C point.
+              if (add_C == true)
+              {
+                Pointvector[level].switch_ftoc(point2);
+                // Add +1 to offsets as one C point has been added.
+                for (unsigned int j=i+1; j<=Slicing.threads_; ++j)
+                  Slicing.Offset[j][level+1]++;
+              }
+                  }
+                }
+              }
+            }
+          }
+
+          #ifdef VIENNACL_AMG_DEBUG
+          std::cout << "After 3rd pass:" << std::endl;
+          std::cout << "Coarse Points:" << std::endl;
+          boost::numeric::ublas::vector<bool> C;
+          Pointvector[level].get_C(C);
+          printvector (C);
+          std::cout << "Fine Points:" << std::endl;
+          boost::numeric::ublas::vector<bool> F;
+          Pointvector[level].get_F(F);
+          printvector (F);
+          #endif
         }
-      }
-    }
-  }
-      }
-      
-      #ifdef DEBUG
-      std::cout << "After 3rd pass:" << std::endl;
-      std::cout << "Coarse Points:" << std::endl;
-      boost::numeric::ublas::vector<bool> C;
-      Pointvector[level].get_C(C);
-      printvector (C);
-      std::cout << "Fine Points:" << std::endl;
-      boost::numeric::ublas::vector<bool> F;
-      Pointvector[level].get_F(F);
-      printvector (F);
-      #endif
 
-      #ifdef DEBUG
-      unsigned int i;
-#ifdef _OPENMP
-      #pragma omp critical
-#endif      
-      {
-      std::cout << "No C and no F point: ";
-      for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
-  if ((*iter)->is_undecided())
-    std::cout << i << " ";
-      std::cout << std::endl;
-      }
-      #endif
-    }
-    
-    /** @brief AG (aggregation based) coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_SA)
-    *
-    * @param level    Coarse level identifier
-    * @param A      Operator matrix on all levels
-    * @param Pointvector   Vector of points on all levels
-    * @param tag    AMG preconditioner tag
-    */
-    template <typename InternalType1, typename InternalType2>
-    void amg_coarse_ag(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, amg_tag & tag)
-    {
-      typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
-      typedef typename SparseMatrixType::value_type ScalarType;
-      
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
-      unsigned int x,y;
-      ScalarType diag;
-      amg_point *pointx, *pointy;
-    
-      // Cannot determine aggregates if size == 1 as then a new aggregate would always consist of this point (infinite loop)
-      if (A[level].size1() == 1) return;
-      
-      // SA algorithm (Vanek et al. p.6)     
-      // Build neighborhoods
-#ifdef _OPENMP
-      #pragma omp parallel for private (x,y,diag) shared (A)
-#endif      
-      for (x=0; x<A[level].size1(); ++x)
-      {
-  InternalRowIterator row_iter = A[level].begin1();
-  row_iter += x;
-  diag = A[level](x,x);
-  for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-  {
-    y = col_iter.index2();
-    if (y == x || (std::abs(*col_iter) >= tag.get_threshold()*pow(0.5,level-1) * sqrt(std::abs(diag*A[level](y,y)))))
-    {
-      // Neighborhood x includes point y
-      Pointvector[level][x]->add_influencing_point(Pointvector[level][y]);
-    }
-  }
-      }
-      
-      #ifdef DEBUG
-      std::cout << "Neighborhoods:" << std::endl;
-      boost::numeric::ublas::matrix<bool> mat;
-      Pointvector[level].get_influence_matrix(mat);
-      printmatrix (mat);
-      #endif
+        /** @brief AG (aggregation based) coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_SA)
+        *
+        * @param level    Coarse level identifier
+        * @param A      Operator matrix on all levels
+        * @param Pointvector   Vector of points on all levels
+        * @param tag    AMG preconditioner tag
+        */
+        template <typename InternalType1, typename InternalType2>
+        void amg_coarse_ag(unsigned int level, InternalType1 & A, InternalType2 & Pointvector, amg_tag & tag)
+        {
+          typedef typename InternalType1::value_type SparseMatrixType;
+          typedef typename InternalType2::value_type PointVectorType;
+          typedef typename SparseMatrixType::value_type ScalarType;
 
-      // Build aggregates from neighborhoods  
-      for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
-      {
-  pointx = (*iter);
-  
-  if (pointx->is_undecided())
-  {
-    // Make center of aggregate to C point and include it to aggregate x.
-    Pointvector[level].make_cpoint(pointx);
-    pointx->set_aggregate (pointx->get_index());
-    for (amg_point::iterator iter2 = pointx->begin_influencing(); iter2 != pointx->end_influencing(); ++iter2)
-    {
-     pointy = (*iter2);
-      
-      if (pointy->is_undecided())
-      {
-        // Make neighbor y to F point and include it to aggregate x.
-        Pointvector[level].make_fpoint(pointy);
-        pointy->set_aggregate (pointx->get_index());
-      }
-    }
-  }
-      }
-      
-      #ifdef DEBUG
-      std::cout << "After aggregation:" << std::endl;
-      std::cout << "Coarse Points:" << std::endl;
-      boost::numeric::ublas::vector<bool> C;
-      Pointvector[level].get_C(C);
-      printvector (C);
-      std::cout << "Fine Points:" << std::endl;
-      boost::numeric::ublas::vector<bool> F;
-      Pointvector[level].get_F(F);
-      printvector (F);
-      std::cout << "Aggregates:" << std::endl;
-      printvector (Aggregates[level]);          
-      #endif
-    }
+          typedef typename SparseMatrixType::iterator1 InternalRowIterator;
+          typedef typename SparseMatrixType::iterator2 InternalColIterator;
+
+          long x,y;
+          ScalarType diag;
+          amg_point *pointx, *pointy;
+
+          // Cannot determine aggregates if size == 1 as then a new aggregate would always consist of this point (infinite loop)
+          if (A[level].size1() == 1) return;
+
+          // SA algorithm (Vanek et al. p.6)
+          // Build neighborhoods
+    #ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for private (x,y,diag)
+    #endif
+          for (x=0; x<static_cast<long>(A[level].size1()); ++x)
+          {
+            InternalRowIterator row_iter = A[level].begin1();
+            row_iter += x;
+            diag = A[level](x,x);
+            for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+            {
+              y = static_cast<long>(col_iter.index2());
+              if (y == x || (std::fabs(*col_iter) >= tag.get_threshold()*pow(0.5, static_cast<double>(level-1)) * std::sqrt(std::fabs(diag*A[level](y,y)))))
+              {
+                // Neighborhood x includes point y
+                Pointvector[level][x]->add_influencing_point(Pointvector[level][y]);
+              }
+            }
+          }
+
+          #ifdef VIENNACL_AMG_DEBUG
+          std::cout << "Neighborhoods:" << std::endl;
+          boost::numeric::ublas::matrix<bool> mat;
+          Pointvector[level].get_influence_matrix(mat);
+          printmatrix (mat);
+          #endif
+
+          // Build aggregates from neighborhoods
+          for (typename PointVectorType::iterator iter = Pointvector[level].begin(); iter != Pointvector[level].end(); ++iter)
+          {
+            pointx = (*iter);
+
+            if (pointx->is_undecided())
+            {
+              // Make center of aggregate to C point and include it to aggregate x.
+              Pointvector[level].make_cpoint(pointx);
+              pointx->set_aggregate (pointx->get_index());
+              for (amg_point::iterator iter2 = pointx->begin_influencing(); iter2 != pointx->end_influencing(); ++iter2)
+              {
+              pointy = (*iter2);
+
+                if (pointy->is_undecided())
+                {
+                  // Make neighbor y to F point and include it to aggregate x.
+                  Pointvector[level].make_fpoint(pointy);
+                  pointy->set_aggregate (pointx->get_index());
+                }
+              }
+            }
+          }
+
+          #ifdef VIENNACL_AMG_DEBUG
+          std::cout << "After aggregation:" << std::endl;
+          std::cout << "Coarse Points:" << std::endl;
+          boost::numeric::ublas::vector<bool> C;
+          Pointvector[level].get_C(C);
+          printvector (C);
+          std::cout << "Fine Points:" << std::endl;
+          boost::numeric::ublas::vector<bool> F;
+          Pointvector[level].get_F(F);
+          printvector (F);
+          std::cout << "Aggregates:" << std::endl;
+          printvector (Aggregates[level]);
+          #endif
+        }
       } //namespace amg
     }
   }
diff --git a/viennacl/linalg/detail/amg/amg_debug.hpp b/viennacl/linalg/detail/amg/amg_debug.hpp
index 9fc3e26..5f297bb 100644
--- a/viennacl/linalg/detail/amg/amg_debug.hpp
+++ b/viennacl/linalg/detail/amg/amg_debug.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_DEBUG_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,7 +20,7 @@
 
 /** @file amg_debug.hpp
     @brief Debug functionality for AMG. To be removed.
-    
+
     AMG code contributed by Markus Wagner
 */
 
@@ -35,45 +36,51 @@
 namespace viennacl
 {
   namespace linalg
-  {    
+  {
     namespace detail
     {
       namespace amg
       {
 
+#ifdef VIENNACL_AMG_DEBUG
         template <typename MatrixType>
         void printmatrix(MatrixType & mat, int const value=-1)
         {
-          typedef typename MatrixType::value_type ScalarType;  
+          typedef typename MatrixType::value_type ScalarType;
           typedef typename VIENNACL_AMG_MATRIXTYPE::iterator1 InternalRowIterator;
           typedef typename VIENNACL_AMG_MATRIXTYPE::iterator2 InternalColIterator;
-          
-          #ifdef DEBUG
+
           VIENNACL_AMG_MATRIXTYPE mat2 = mat;
-          
+
           for (InternalRowIterator row_iter = mat2.begin1(); row_iter != mat2.end1(); ++row_iter)
           {
             for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-            {     
+            {
               std::cout << *col_iter << " ";
             }
             std::cout << std::endl;
           }
           std::cout << std::endl;
-          #endif
         }
 
         template <typename VectorType>
         void printvector(VectorType const & vec)
         {
-          #ifdef DEBUGBENCH
           for (typename VectorType::const_iterator iter = vec.begin(); iter != vec.end(); ++iter)
           {
             std::cout << *iter << " ";
           }
           std::cout << std::endl;
-          #endif
         }
+#else
+        template <typename MatrixType>
+        void printmatrix(MatrixType &, int = -1) {}
+
+        template <typename VectorType>
+        void printvector(VectorType const &) {}
+
+#endif
+
 
       }
     }
diff --git a/viennacl/linalg/detail/amg/amg_interpol.hpp b/viennacl/linalg/detail/amg/amg_interpol.hpp
index bcd574b..b64b964 100644
--- a/viennacl/linalg/detail/amg/amg_interpol.hpp
+++ b/viennacl/linalg/detail/amg/amg_interpol.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_INTERPOL_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -23,14 +24,14 @@
 
 #include <boost/numeric/ublas/vector.hpp>
 #include <cmath>
-#include "viennacl/linalg/amg.hpp"
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
 
 #include <map>
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
 #include <omp.h>
 #endif
 
-#include "amg_debug.hpp"
+#include "viennacl/linalg/detail/amg/amg_debug.hpp"
 
 namespace viennacl
 {
@@ -40,7 +41,7 @@ namespace viennacl
     {
       namespace amg
       {
-    
+
     /** @brief Calls the right function to build interpolation matrix
      * @param level    Coarse level identifier
      * @param A      Operator matrix on all levels
@@ -51,14 +52,14 @@ namespace viennacl
     template <typename InternalType1, typename InternalType2>
     void amg_interpol(unsigned int level, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
     {
-  switch (tag.get_interpol())
-  {
-    case VIENNACL_AMG_INTERPOL_DIRECT: amg_interpol_direct (level, A, P, Pointvector, tag); break;
-    case VIENNACL_AMG_INTERPOL_CLASSIC: amg_interpol_classic (level, A, P, Pointvector, tag); break;
-    case VIENNACL_AMG_INTERPOL_AG: amg_interpol_ag (level, A, P, Pointvector, tag); break;
-    case VIENNACL_AMG_INTERPOL_SA: amg_interpol_sa (level, A, P, Pointvector, tag); break;
-  }
-    } 
+      switch (tag.get_interpol())
+      {
+        case VIENNACL_AMG_INTERPOL_DIRECT: amg_interpol_direct (level, A, P, Pointvector, tag); break;
+        case VIENNACL_AMG_INTERPOL_CLASSIC: amg_interpol_classic (level, A, P, Pointvector, tag); break;
+        case VIENNACL_AMG_INTERPOL_AG: amg_interpol_ag (level, A, P, Pointvector, tag); break;
+        case VIENNACL_AMG_INTERPOL_SA: amg_interpol_sa (level, A, P, Pointvector, tag); break;
+      }
+    }
     /** @brief Direct interpolation. Multi-threaded! (VIENNACL_AMG_INTERPOL_DIRECT)
      * @param level    Coarse level identifier
      * @param A      Operator matrix on all levels
@@ -70,96 +71,97 @@ namespace viennacl
     void amg_interpol_direct(unsigned int level, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
     {
       typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
+      //typedef typename InternalType2::value_type PointVectorType;
       typedef typename SparseMatrixType::value_type ScalarType;
       typedef typename SparseMatrixType::iterator1 InternalRowIterator;
       typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+
       ScalarType temp_res;
       ScalarType row_sum, c_sum, diag;
       //int diag_sign;
-      unsigned int x, y;
+      long x, y;
       amg_point *pointx, *pointy;
       unsigned int c_points = Pointvector[level].get_cpoints();
 
       // Setup Prolongation/Interpolation matrix
-      P[level] = SparseMatrixType(A[level].size1(),c_points);
+      P[level] = SparseMatrixType(static_cast<unsigned int>(A[level].size1()),c_points);
       P[level].clear();
-      
+
       // Assign indices to C points
       Pointvector[level].build_index();
-      
+
       // Direct Interpolation (Yang, p.14)
-#ifdef _OPENMP
-      #pragma omp parallel for private (pointx,pointy,row_sum,c_sum,temp_res,y,x,diag) shared (P,A,Pointvector,tag)
-#endif      
-      for (x=0; x < Pointvector[level].size(); ++x)
-      {
-  pointx = Pointvector[level][x];
-  /*if (A[level](x,x) > 0) 
-    diag_sign = 1;
-  else
-    diag_sign = -1;*/
-  
-  // When the current line corresponds to a C point then the diagonal coefficient is 1 and the rest 0
-  if (pointx->is_cpoint())
-    P[level](x,pointx->get_coarse_index()) = 1;
-  
-  // When the current line corresponds to a F point then the diagonal is 0 and the rest has to be computed (Yang, p.14)
-  if (pointx->is_fpoint())
-  {
-    // Jump to row x
-    InternalRowIterator row_iter = A[level].begin1();
-    row_iter += x;
-    
-    // Row sum of coefficients (without diagonal) and sum of influencing C point coefficients has to be computed
-    row_sum = c_sum = diag = 0;
-    for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-    {
-      y = col_iter.index2();
-      if (x == y)// || *col_iter * diag_sign > 0)
-      {
-        diag += *col_iter;
-        continue;
-      }
-      
-      // Sum all other coefficients in line x
-      row_sum += *col_iter;
-
-      pointy = Pointvector[level][y];
-      // Sum all coefficients that correspond to a strongly influencing C point
-      if (pointy->is_cpoint())
-        if (pointx->is_influencing(pointy))
-    c_sum += *col_iter;        
-    }
-    temp_res = -row_sum/(c_sum*diag);
-
-    // Iterate over all strongly influencing points of point x
-    for (amg_point::iterator iter = pointx->begin_influencing(); iter != pointx->end_influencing(); ++iter)
-    {    
-      pointy = *iter;
-      // The value is only non-zero for columns that correspond to a C point
-      if (pointy->is_cpoint())
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for private (pointx,pointy,row_sum,c_sum,temp_res,y,x,diag)
+#endif
+      for (x=0; x < static_cast<long>(Pointvector[level].size()); ++x)
       {
-        if (temp_res != 0)
-    P[level](x, pointy->get_coarse_index()) = temp_res * A[level](x,pointy->get_index());
-      }
-    }
-    
-    //Truncate interpolation if chosen
-    if (tag.get_interpolweight() != 0)
-      amg_truncate_row(P[level], x, tag);
-  }
+        pointx = Pointvector[level][x];
+        /*if (A[level](x,x) > 0)
+          diag_sign = 1;
+        else
+          diag_sign = -1;*/
+
+        // When the current line corresponds to a C point then the diagonal coefficient is 1 and the rest 0
+        if (pointx->is_cpoint())
+          P[level](x,pointx->get_coarse_index()) = 1;
+
+        // When the current line corresponds to an F point then the diagonal is 0 and the rest has to be computed (Yang, p.14)
+        if (pointx->is_fpoint())
+        {
+          // Jump to row x
+          InternalRowIterator row_iter = A[level].begin1();
+          row_iter += x;
+
+          // Row sum of coefficients (without diagonal) and sum of influencing C point coefficients has to be computed
+          row_sum = c_sum = diag = 0;
+          for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+          {
+            y = static_cast<long>(col_iter.index2());
+            if (x == y)// || *col_iter * diag_sign > 0)
+            {
+              diag += *col_iter;
+              continue;
+            }
+
+            // Sum all other coefficients in line x
+            row_sum += *col_iter;
+
+            pointy = Pointvector[level][y];
+            // Sum all coefficients that correspond to a strongly influencing C point
+            if (pointy->is_cpoint())
+              if (pointx->is_influencing(pointy))
+                c_sum += *col_iter;
+          }
+          temp_res = -row_sum/(c_sum*diag);
+
+          // Iterate over all strongly influencing points of point x
+          for (amg_point::iterator iter = pointx->begin_influencing(); iter != pointx->end_influencing(); ++iter)
+          {
+            pointy = *iter;
+            // The value is only non-zero for columns that correspond to a C point
+            if (pointy->is_cpoint())
+            {
+              if (temp_res != 0)
+                P[level](x, pointy->get_coarse_index()) = temp_res * A[level](x,pointy->get_index());
+            }
+          }
+
+          //Truncate interpolation if chosen
+          if (tag.get_interpolweight() != 0)
+            amg_truncate_row(P[level], x, tag);
+        }
       }
-      
+
       // P test
       //test_interpolation(A[level], P[level], Pointvector[level]);
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Prolongation Matrix:" << std::endl;
       printmatrix (P[level]);
-      #endif  
+      #endif
     }
+
     /** @brief Classical interpolation. Don't use with onepass classical coarsening or RS0 (Yang, p.14)! Multi-threaded! (VIENNACL_AMG_INTERPOL_CLASSIC)
      * @param level    Coarse level identifier
      * @param A      Operator matrix on all levels
@@ -171,120 +173,120 @@ namespace viennacl
     void amg_interpol_classic(unsigned int level, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
     {
       typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
+      //typedef typename InternalType2::value_type PointVectorType;
       typedef typename SparseMatrixType::value_type ScalarType;
       typedef typename SparseMatrixType::iterator1 InternalRowIterator;
       typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+
       ScalarType temp_res;
       ScalarType weak_sum, strong_sum;
       int diag_sign;
       amg_sparsevector<ScalarType> c_sum_row;
       amg_point *pointx, *pointy, *pointk, *pointm;
-      unsigned int x, y, k, m;
-      
+      long x, y, k, m;
+
       unsigned int c_points = Pointvector[level].get_cpoints();
-      
+
       // Setup Prolongation/Interpolation matrix
-      P[level] = SparseMatrixType(A[level].size1(), c_points);
+      P[level] = SparseMatrixType(static_cast<unsigned int>(A[level].size1()), c_points);
       P[level].clear();
-      
+
       // Assign indices to C points
       Pointvector[level].build_index();
-      
+
       // Classical Interpolation (Yang, p.13-14)
-#ifdef _OPENMP
-      #pragma omp parallel for private (pointx,pointy,pointk,pointm,weak_sum,strong_sum,c_sum_row,temp_res,x,y,k,m,diag_sign) shared (A,P,Pointvector)
-#endif      
-      for (x=0; x < Pointvector[level].size(); ++x)
-      {
-  pointx = Pointvector[level][x];
-  if (A[level](x,x) > 0) 
-    diag_sign = 1;
-  else
-    diag_sign = -1;
-  
-  // When the current line corresponds to a C point then the diagonal coefficient is 1 and the rest 0
-  if (pointx->is_cpoint())
-    P[level](x,pointx->get_coarse_index()) = 1;
-
-  // When the current line corresponds to a F point then the diagonal is 0 and the rest has to be computed (Yang, p.14)
-  if (pointx->is_fpoint())
-  {  
-    // Jump to row x
-    InternalRowIterator row_iter = A[level].begin1();
-    row_iter += x;
-    
-    weak_sum = 0;
-    c_sum_row = amg_sparsevector<ScalarType>(A[level].size1());
-    c_sum_row.clear();
-    for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-    {
-      k = col_iter.index2();
-      pointk = Pointvector[level][k];
-      
-      // Sum of weakly influencing neighbors + diagonal coefficient
-      if (x == k || !pointx->is_influencing(pointk))// || *col_iter * diag_sign > 0)
-      {
-        weak_sum += *col_iter;
-        continue;
-      }
-        
-      // Sums of coefficients in row k (strongly influening F neighbors) of C point neighbors of x are calculated
-      if (pointk->is_fpoint() && pointx->is_influencing(pointk))
-      {
-        for (amg_point::iterator iter = pointx->begin_influencing(); iter != pointx->end_influencing(); ++iter)
-        {
-    pointm = *iter;
-    m = pointm->get_index();
-    
-    if (pointm->is_cpoint())
-      // Only use coefficients that have opposite sign of diagonal.
-      if (A[level](k,m) * diag_sign < 0)
-        c_sum_row[k] += A[level](k,m);
-        }
-        continue;
-      }
-    }
-    
-    // Iterate over all strongly influencing points of point x
-    for (amg_point::iterator iter = pointx->begin_influencing(); iter != pointx->end_influencing(); ++iter)
-    {    
-      pointy = *iter;
-      y = pointy->get_index();
-      
-      // The value is only non-zero for columns that correspond to a C point
-      if (pointy->is_cpoint())
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for private (pointx,pointy,pointk,pointm,weak_sum,strong_sum,c_sum_row,temp_res,x,y,k,m,diag_sign)
+#endif
+      for (x=0; x < static_cast<long>(Pointvector[level].size()); ++x)
       {
-        strong_sum = 0;
-        // Calculate term for strongly influencing F neighbors
-        for (typename amg_sparsevector<ScalarType>::iterator iter2 = c_sum_row.begin(); iter2 != c_sum_row.end(); ++iter2)
+        pointx = Pointvector[level][x];
+        if (A[level](x,x) > 0)
+          diag_sign = 1;
+        else
+          diag_sign = -1;
+
+        // When the current line corresponds to a C point then the diagonal coefficient is 1 and the rest 0
+        if (pointx->is_cpoint())
+          P[level](x,pointx->get_coarse_index()) = 1;
+
+        // When the current line corresponds to an F point then the diagonal is 0 and the rest has to be computed (Yang, p.14)
+        if (pointx->is_fpoint())
         {
-    k = iter2.index();
-    // Only use coefficients that have opposite sign of diagonal.
-    if (A[level](k,y) * diag_sign < 0)
-      strong_sum += (A[level](x,k) * A[level](k,y)) / (*iter2);
+          // Jump to row x
+          InternalRowIterator row_iter = A[level].begin1();
+          row_iter += x;
+
+          weak_sum = 0;
+          c_sum_row = amg_sparsevector<ScalarType>(static_cast<unsigned int>(A[level].size1()));
+          c_sum_row.clear();
+          for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+          {
+            k = static_cast<long>(col_iter.index2());
+            pointk = Pointvector[level][k];
+
+            // Sum of weakly influencing neighbors + diagonal coefficient
+            if (x == k || !pointx->is_influencing(pointk))// || *col_iter * diag_sign > 0)
+            {
+              weak_sum += *col_iter;
+              continue;
+            }
+
+            // Sums of coefficients in row k (strongly influencing F neighbors) of C point neighbors of x are calculated
+            if (pointk->is_fpoint() && pointx->is_influencing(pointk))
+            {
+              for (amg_point::iterator iter = pointx->begin_influencing(); iter != pointx->end_influencing(); ++iter)
+              {
+                pointm = *iter;
+                m = pointm->get_index();
+
+                if (pointm->is_cpoint())
+                  // Only use coefficients that have opposite sign of diagonal.
+                  if (A[level](k,m) * diag_sign < 0)
+                    c_sum_row[k] += A[level](k,m);
+              }
+              continue;
+            }
+          }
+
+          // Iterate over all strongly influencing points of point x
+          for (amg_point::iterator iter = pointx->begin_influencing(); iter != pointx->end_influencing(); ++iter)
+          {
+            pointy = *iter;
+            y = pointy->get_index();
+
+            // The value is only non-zero for columns that correspond to a C point
+            if (pointy->is_cpoint())
+            {
+              strong_sum = 0;
+              // Calculate term for strongly influencing F neighbors
+              for (typename amg_sparsevector<ScalarType>::iterator iter2 = c_sum_row.begin(); iter2 != c_sum_row.end(); ++iter2)
+              {
+                k = iter2.index();
+                // Only use coefficients that have opposite sign of diagonal.
+                if (A[level](k,y) * diag_sign < 0)
+                  strong_sum += (A[level](x,k) * A[level](k,y)) / (*iter2);
+              }
+
+              // Calculate coefficient
+              temp_res = - (A[level](x,y) + strong_sum) / (weak_sum);
+              if (temp_res != 0)
+                P[level](x,pointy->get_coarse_index()) = temp_res;
+            }
+          }
+
+          //Truncate interpolation if chosen
+          if (tag.get_interpolweight() != 0)
+            amg_truncate_row(P[level], x, tag);
         }
-        
-        // Calculate coefficient
-        temp_res = - (A[level](x,y) + strong_sum) / (weak_sum);
-        if (temp_res != 0)
-    P[level](x,pointy->get_coarse_index()) = temp_res;   
       }
-    }
-    
-    //Truncate iteration if chosen
-    if (tag.get_interpolweight() != 0)
-      amg_truncate_row(P[level], x, tag);
-  }
-      }
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Prolongation Matrix:" << std::endl;
       printmatrix (P[level]);
-      #endif  
+      #endif
     }
-    
+
     /** @brief Interpolation truncation (for VIENNACL_AMG_INTERPOL_DIRECT and VIENNACL_AMG_INTERPOL_CLASSIC)
     *
     * @param P    Interpolation matrix
@@ -297,103 +299,102 @@ namespace viennacl
       typedef typename SparseMatrixType::value_type ScalarType;
       typedef typename SparseMatrixType::iterator1 InternalRowIterator;
       typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
+
       ScalarType row_max, row_min, row_sum_pos, row_sum_neg, row_sum_pos_scale, row_sum_neg_scale;
-      
+
       InternalRowIterator row_iter = P.begin1();
       row_iter += row;
-      
+
       row_max = 0;
       row_min = 0;
       row_sum_pos = 0;
       row_sum_neg = 0;
-      
+
       // Truncate interpolation by making values to zero that are a lot smaller than the biggest value in a row
       // Determine max entry and sum of row (seperately for negative and positive entries)
       for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
       {
-  if (*col_iter > row_max)
-    row_max = *col_iter;
-  if (*col_iter < row_min)
-    row_min = *col_iter;
-  if (*col_iter > 0)
-    row_sum_pos += *col_iter;
-  if (*col_iter < 0)
-    row_sum_neg += *col_iter;
+        if (*col_iter > row_max)
+          row_max = *col_iter;
+        if (*col_iter < row_min)
+          row_min = *col_iter;
+        if (*col_iter > 0)
+          row_sum_pos += *col_iter;
+        if (*col_iter < 0)
+          row_sum_neg += *col_iter;
       }
-      
+
       row_sum_pos_scale = row_sum_pos;
       row_sum_neg_scale = row_sum_neg;
-      
+
+      // Set certain values to zero (separately for negative and positive entries)
       for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
       {
-  if (*col_iter > 0 && *col_iter < tag.get_interpolweight() * row_max)
-  {
-    row_sum_pos_scale -= *col_iter;
-    *col_iter = 0;
-  }
-  if (*col_iter < 0 && *col_iter > tag.get_interpolweight() * row_min)
-  {
-    row_sum_pos_scale -= *col_iter;
-    *col_iter = 0;
-  }
+        if (*col_iter > 0 && *col_iter < tag.get_interpolweight() * row_max)
+        {
+          row_sum_pos_scale -= *col_iter;
+          *col_iter = 0;
+        }
+        if (*col_iter < 0 && *col_iter > tag.get_interpolweight() * row_min)
+        {
+          row_sum_neg_scale -= *col_iter;
+          *col_iter = 0;
+        }
       }
-      
+
       // Scale remaining values such that row sum is unchanged
       for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
       {
-  if (*col_iter > 0)
-    *col_iter = *col_iter *(row_sum_pos/row_sum_pos_scale);
-  if (*col_iter < 0)
-    *col_iter = *col_iter *(row_sum_neg/row_sum_neg_scale);
+        if (*col_iter > 0)
+          *col_iter = *col_iter *(row_sum_pos/row_sum_pos_scale);
+        if (*col_iter < 0)
+          *col_iter = *col_iter *(row_sum_neg/row_sum_neg_scale);
       }
     }
-    
+
     /** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_SA)
      * @param level    Coarse level identifier
      * @param A      Operator matrix on all levels
      * @param P      Prolongation matrices. P[level] is constructed
      * @param Pointvector  Vector of points on all levels
-     * @param tag    AMG preconditioner tag
     */
     template <typename InternalType1, typename InternalType2>
-    void amg_interpol_ag(unsigned int level, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
+    void amg_interpol_ag(unsigned int level, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag)
     {
       typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
-      typedef typename SparseMatrixType::value_type ScalarType;
-      typedef typename SparseMatrixType::iterator1 InternalRowIterator;
-      typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
-      unsigned int x;
+      //typedef typename InternalType2::value_type PointVectorType;
+      //typedef typename SparseMatrixType::value_type ScalarType;
+      //typedef typename SparseMatrixType::iterator1 InternalRowIterator;
+      //typedef typename SparseMatrixType::iterator2 InternalColIterator;
+
+      long x;
       amg_point *pointx, *pointy;
       unsigned int c_points = Pointvector[level].get_cpoints();
-      
-      P[level] = SparseMatrixType(A[level].size1(), c_points);
+
+      P[level] = SparseMatrixType(static_cast<unsigned int>(A[level].size1()), c_points);
       P[level].clear();
-      
+
       // Assign indices to C points
       Pointvector[level].build_index();
-      
+
       // Set prolongation such that F point is interpolated (weight=1) by the aggregate it belongs to (Vanek et al p.6)
-#ifdef _OPENMP
-      #pragma omp parallel for private (x,pointx) shared (P)
-#endif      
-      for (x=0; x<Pointvector[level].size(); ++x)
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for private (x,pointx)
+#endif
+      for (x=0; x<static_cast<long>(Pointvector[level].size()); ++x)
       {
-  pointx = Pointvector[level][x];
-  pointy = Pointvector[level][pointx->get_aggregate()];
-  // Point x belongs to aggregate y.
-  P[level](x,pointy->get_coarse_index()) = 1;
+        pointx = Pointvector[level][x];
+        pointy = Pointvector[level][pointx->get_aggregate()];
+        // Point x belongs to aggregate y.
+        P[level](x,pointy->get_coarse_index()) = 1;
       }
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Aggregation based Prolongation:" << std::endl;
       printmatrix(P[level]);
       #endif
     }
-      
+
     /** @brief SA (smoothed aggregate) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_SA)
      * @param level    Coarse level identifier
      * @param A      Operator matrix on all levels
@@ -405,75 +406,75 @@ namespace viennacl
     void amg_interpol_sa(unsigned int level, InternalType1 & A, InternalType1 & P, InternalType2 & Pointvector, amg_tag & tag)
     {
       typedef typename InternalType1::value_type SparseMatrixType;
-      typedef typename InternalType2::value_type PointVectorType;
+      //typedef typename InternalType2::value_type PointVectorType;
       typedef typename SparseMatrixType::value_type ScalarType;
       typedef typename SparseMatrixType::iterator1 InternalRowIterator;
       typedef typename SparseMatrixType::iterator2 InternalColIterator;
-      
-      unsigned int x,y;
-      ScalarType diag;
+
+      long x,y;
+      ScalarType diag = 0;
       unsigned int c_points = Pointvector[level].get_cpoints();
-           
+
       InternalType1 P_tentative = InternalType1(P.size());
-      SparseMatrixType Jacobi = SparseMatrixType(A[level].size1(), A[level].size2());
+      SparseMatrixType Jacobi = SparseMatrixType(static_cast<unsigned int>(A[level].size1()), static_cast<unsigned int>(A[level].size2()));
       Jacobi.clear();
-      P[level] = SparseMatrixType(A[level].size1(), c_points);
-      P[level].clear();      
-           
+      P[level] = SparseMatrixType(static_cast<unsigned int>(A[level].size1()), c_points);
+      P[level].clear();
+
       // Build Jacobi Matrix via filtered A matrix (Vanek et al. p.6)
-#ifdef _OPENMP
-      #pragma omp parallel for private (x,y,diag) shared (A,Pointvector)
-#endif      
-      for (x=0; x<A[level].size1(); ++x)
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for private (x,y,diag)
+#endif
+      for (x=0; x<static_cast<long>(A[level].size1()); ++x)
       {
-  diag = 0;
-  InternalRowIterator row_iter = A[level].begin1();
-  row_iter += x;
-  for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-  {
-    y = col_iter.index2();
-    // Determine the structure of the Jacobi matrix by using a filtered matrix of A:
-    // The diagonal consists of the diagonal coefficient minus all coefficients of points not in the neighborhood of x.
-    // All other coefficients are the same as in A.
-    // Already use Jacobi matrix to save filtered A matrix to speed up computation.
-    if (x == y)
-      diag += *col_iter;
-    else if (!Pointvector[level][x]->is_influencing(Pointvector[level][y]))
-      diag += -*col_iter;
-    else
-      Jacobi (x,y) = *col_iter;      
-  }
-  InternalRowIterator row_iter2 = Jacobi.begin1();
-  row_iter2 += x;
-  // Traverse through filtered A matrix and compute the Jacobi filtering
-  for (InternalColIterator col_iter2 = row_iter2.begin(); col_iter2 != row_iter2.end(); ++col_iter2)
-  {
-      *col_iter2 = - tag.get_interpolweight()/diag * *col_iter2;
-  }
-  // Diagonal can be computed seperately.
-  Jacobi (x,x) = 1 - tag.get_interpolweight();
+        diag = 0;
+        InternalRowIterator row_iter = A[level].begin1();
+        row_iter += x;
+        for (InternalColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
+        {
+          y = static_cast<long>(col_iter.index2());
+          // Determine the structure of the Jacobi matrix by using a filtered matrix of A:
+          // The diagonal consists of the diagonal coefficient minus all coefficients of points not in the neighborhood of x.
+          // All other coefficients are the same as in A.
+          // Already use Jacobi matrix to save filtered A matrix to speed up computation.
+          if (x == y)
+            diag += *col_iter;
+          else if (!Pointvector[level][x]->is_influencing(Pointvector[level][y]))
+            diag += -*col_iter;
+          else
+            Jacobi (x,y) = *col_iter;
+        }
+        InternalRowIterator row_iter2 = Jacobi.begin1();
+        row_iter2 += x;
+        // Traverse through filtered A matrix and compute the Jacobi filtering
+        for (InternalColIterator col_iter2 = row_iter2.begin(); col_iter2 != row_iter2.end(); ++col_iter2)
+        {
+            *col_iter2 = - static_cast<ScalarType>(tag.get_interpolweight())/diag * *col_iter2;
+        }
+        // Diagonal can be computed separately.
+        Jacobi (x,x) = 1 - static_cast<ScalarType>(tag.get_interpolweight());
       }
-          
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Jacobi Matrix:" << std::endl;
       printmatrix(Jacobi);
       #endif
-      
+
       // Use AG interpolation as tentative prolongation
       amg_interpol_ag(level, A, P_tentative, Pointvector, tag);
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Tentative Prolongation:" << std::endl;
       printmatrix(P_tentative[level]);
       #endif
-      
+
       // Multiply Jacobi matrix with tentative prolongation to get actual prolongation
       amg_mat_prod(Jacobi,P_tentative[level],P[level]);
-      
-      #ifdef DEBUG
+
+      #ifdef VIENNACL_AMG_DEBUG
       std::cout << "Prolongation Matrix:" << std::endl;
       printmatrix (P[level]);
-      #endif    
+      #endif
     }
       } //namespace amg
     }
diff --git a/viennacl/linalg/detail/ilu/block_ilu.hpp b/viennacl/linalg/detail/ilu/block_ilu.hpp
new file mode 100644
index 0000000..406553a
--- /dev/null
+++ b/viennacl/linalg/detail/ilu/block_ilu.hpp
@@ -0,0 +1,463 @@
+#ifndef VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/block_ilu.hpp
+    @brief Implementations of incomplete block factorization preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace detail
+    {
+      /** @brief Helper range class for representing a subvector of a larger buffer. */
+      template <typename VectorType, typename ValueType, typename SizeType = vcl_size_t>
+      class ilu_vector_range
+      {
+        public:
+          //typedef typename VectorType::value_type      value_type;
+          //typedef typename VectorType::size_type       size_type;
+
+          ilu_vector_range(VectorType & v,
+                           SizeType start_index,
+                           SizeType vec_size
+                          ) : vec_(v), start_(start_index), size_(vec_size) {}
+
+          ValueType & operator()(SizeType index)
+          {
+            assert(index < size_ && bool("Index out of bounds!"));
+            return vec_[start_ + index];
+          }
+
+          ValueType & operator[](SizeType index)
+          {
+            assert(index < size_ && bool("Index out of bounds!"));
+            return vec_[start_ + index];
+          }
+
+          SizeType size() const { return size_; }
+
+        private:
+          VectorType & vec_;
+          SizeType start_;
+          SizeType size_;
+      };
+
+      /** @brief Extracts a diagonal block from a larger system matrix
+        *
+        * @param A                   The full matrix
+        * @param diagonal_block_A    The output matrix, to which the extracted block is written to
+        * @param start_index         First row- and column-index of the block
+        * @param stop_index          First row- and column-index beyond the block
+        */
+      template <typename ScalarType>
+      void extract_block_matrix(viennacl::compressed_matrix<ScalarType> const & A,
+                                viennacl::compressed_matrix<ScalarType> & diagonal_block_A,
+                                vcl_size_t start_index,
+                                vcl_size_t stop_index
+                                )
+      {
+
+        assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+        assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+        assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+        ScalarType   const * A_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+        unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+        unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+        ScalarType   * output_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(diagonal_block_A.handle());
+        unsigned int * output_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle1());
+        unsigned int * output_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle2());
+
+        vcl_size_t output_counter = 0;
+        for (vcl_size_t row = start_index; row < stop_index; ++row)
+        {
+          unsigned int buffer_col_start = A_row_buffer[row];
+          unsigned int buffer_col_end   = A_row_buffer[row+1];
+
+          output_row_buffer[row - start_index] = static_cast<unsigned int>(output_counter);
+
+          for (unsigned int buf_index = buffer_col_start; buf_index < buffer_col_end; ++buf_index)
+          {
+            unsigned int col = A_col_buffer[buf_index];
+            if (col < start_index)
+              continue;
+
+            if (col >= static_cast<unsigned int>(stop_index))
+              continue;
+
+            output_col_buffer[output_counter] = static_cast<unsigned int>(col - start_index);
+            output_elements[output_counter] = A_elements[buf_index];
+            ++output_counter;
+          }
+          output_row_buffer[row - start_index + 1] = static_cast<unsigned int>(output_counter);
+        }
+      }
+
+
+    }
+
+    /** @brief A block ILU preconditioner class, can be supplied to solve()-routines
+     *
+     * @tparam MatrixType   Type of the system matrix
+     * @tparam ILUTag       Type of the tag identifying the ILU preconditioner to be used on each block.
+    */
+    template <typename MatrixType, typename ILUTag>
+    class block_ilu_precond
+    {
+      typedef typename MatrixType::value_type      ScalarType;
+
+      public:
+        typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
+
+
+        block_ilu_precond(MatrixType const & mat,
+                          ILUTag const & tag,
+                          vcl_size_t num_blocks = 8
+                         ) : tag_(tag), LU_blocks(num_blocks)
+        {
+
+          // Set up vector of block indices:
+          block_indices_.resize(num_blocks);
+          for (vcl_size_t i=0; i<num_blocks; ++i)
+          {
+            vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+            vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+
+            block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+          }
+
+          //initialize preconditioner:
+          //std::cout << "Start CPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End CPU precond" << std::endl;
+        }
+
+        block_ilu_precond(MatrixType const & mat,
+                          ILUTag const & tag,
+                          index_vector_type const & block_boundaries
+                         ) : tag_(tag), block_indices_(block_boundaries), LU_blocks(block_boundaries.size())
+        {
+          //initialize preconditioner:
+          //std::cout << "Start CPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End CPU precond" << std::endl;
+        }
+
+
+        template <typename VectorType>
+        void apply(VectorType & vec) const
+        {
+          for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+          {
+            detail::ilu_vector_range<VectorType, ScalarType>  vec_range(vec, block_indices_[i].first, LU_blocks[i].size2());
+
+            unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_blocks[i].handle1());
+            unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_blocks[i].handle2());
+            ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU_blocks[i].handle());
+
+            viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, LU_blocks[i].size2(), unit_lower_tag());
+            viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, LU_blocks[i].size2(), upper_tag());
+
+          }
+        }
+
+      private:
+        void init(MatrixType const & A)
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          viennacl::compressed_matrix<ScalarType> mat(host_context);
+
+          viennacl::copy(A, mat);
+
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+          {
+            // Step 1: Extract blocks
+            vcl_size_t block_size = block_indices_[i].second - block_indices_[i].first;
+            vcl_size_t block_nnz  = row_buffer[block_indices_[i].second] - row_buffer[block_indices_[i].first];
+            viennacl::compressed_matrix<ScalarType> mat_block(block_size, block_size, block_nnz, host_context);
+
+            detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
+
+            // Step 2: Precondition blocks:
+            viennacl::switch_memory_context(LU_blocks[i], host_context);
+            preconditioner_dispatch(mat_block, LU_blocks[i], tag_);
+          }
+
+        }
+
+        void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                     viennacl::compressed_matrix<ScalarType> & LU,
+                                     viennacl::linalg::ilu0_tag)
+        {
+          LU = mat_block;
+          viennacl::linalg::precondition(LU, tag_);
+        }
+
+        void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                     viennacl::compressed_matrix<ScalarType> & LU,
+                                     viennacl::linalg::ilut_tag)
+        {
+          std::vector< std::map<unsigned int, ScalarType> > temp(mat_block.size1());
+
+          viennacl::linalg::precondition(mat_block, temp, tag_);
+
+          viennacl::copy(temp, LU);
+        }
+
+        ILUTag const & tag_;
+        index_vector_type block_indices_;
+        std::vector< viennacl::compressed_matrix<ScalarType> > LU_blocks;
+    };
+
+
+
+
+
+    /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
+    *
+    *  Specialization for compressed_matrix
+    */
+    template <typename ScalarType, unsigned int MAT_ALIGNMENT, typename ILUTag>
+    class block_ilu_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT>, ILUTag >
+    {
+        typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>        MatrixType;
+        //typedef std::vector<ScalarType>                             STLVectorType;
+
+      public:
+        typedef std::vector<std::pair<vcl_size_t, vcl_size_t> >    index_vector_type;   //the pair refers to index range [a, b) of each block
+
+
+        block_ilu_precond(MatrixType const & mat,
+                          ILUTag const & tag,
+                          vcl_size_t num_blocks = 8
+                         ) : tag_(tag),
+                             block_indices_(num_blocks),
+                             gpu_block_indices(),
+                             gpu_L_trans(0,0, viennacl::traits::context(mat)),
+                             gpu_U_trans(0,0, viennacl::traits::context(mat)),
+                             gpu_D(mat.size1(), viennacl::traits::context(mat)),
+                             LU_blocks(num_blocks)
+        {
+          // Set up vector of block indices:
+          block_indices_.resize(num_blocks);
+          for (vcl_size_t i=0; i<num_blocks; ++i)
+          {
+            vcl_size_t start_index = (   i  * mat.size1()) / num_blocks;
+            vcl_size_t stop_index  = ((i+1) * mat.size1()) / num_blocks;
+
+            block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+          }
+
+          //initialize preconditioner:
+          //std::cout << "Start CPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End CPU precond" << std::endl;
+        }
+
+        block_ilu_precond(MatrixType const & mat,
+                          ILUTag const & tag,
+                          index_vector_type const & block_boundaries
+                         ) : tag_(tag),
+                             block_indices_(block_boundaries),
+                             gpu_block_indices(viennacl::traits::context(mat)),
+                             gpu_L_trans(0,0,viennacl::traits::context(mat)),
+                             gpu_U_trans(0,0,viennacl::traits::context(mat)),
+                             gpu_D(0,viennacl::traits::context(mat)),
+                             LU_blocks(block_boundaries.size())
+        {
+          //initialize preconditioner:
+          //std::cout << "Start CPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End CPU precond" << std::endl;
+        }
+
+
+        void apply(vector<ScalarType> & vec) const
+        {
+          viennacl::linalg::detail::block_inplace_solve(trans(gpu_L_trans), gpu_block_indices, block_indices_.size(), gpu_D,
+                                                        vec,
+                                                        viennacl::linalg::unit_lower_tag());
+
+          viennacl::linalg::detail::block_inplace_solve(trans(gpu_U_trans), gpu_block_indices, block_indices_.size(), gpu_D,
+                                                        vec,
+                                                        viennacl::linalg::upper_tag());
+
+          //apply_cpu(vec);
+        }
+
+
+      private:
+
+        void init(MatrixType const & A)
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          viennacl::compressed_matrix<ScalarType> mat(host_context);
+
+          mat = A;
+
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+          {
+            // Step 1: Extract blocks
+            vcl_size_t block_size = block_indices_[i].second - block_indices_[i].first;
+            vcl_size_t block_nnz  = row_buffer[block_indices_[i].second] - row_buffer[block_indices_[i].first];
+            viennacl::compressed_matrix<ScalarType> mat_block(block_size, block_size, block_nnz, host_context);
+
+            detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
+
+            // Step 2: Precondition blocks:
+            viennacl::switch_memory_context(LU_blocks[i], host_context);
+            preconditioner_dispatch(mat_block, LU_blocks[i], tag_);
+          }
+
+          /*
+           * copy resulting preconditioner back to GPU:
+           */
+
+          viennacl::switch_memory_context(gpu_L_trans, viennacl::traits::context(A));
+          viennacl::switch_memory_context(gpu_U_trans, viennacl::traits::context(A));
+          viennacl::switch_memory_context(gpu_D, viennacl::traits::context(A));
+
+          viennacl::backend::typesafe_host_array<unsigned int> block_indices_uint(gpu_block_indices, 2 * block_indices_.size());
+          for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+          {
+            block_indices_uint.set(2*i, block_indices_[i].first);
+            block_indices_uint.set(2*i + 1, block_indices_[i].second);
+          }
+
+          viennacl::backend::memory_create(gpu_block_indices, block_indices_uint.raw_size(), viennacl::traits::context(A), block_indices_uint.get());
+
+          blocks_to_device(mat.size1());
+
+        }
+
+        // Copy computed preconditioned blocks to OpenCL device
+        void blocks_to_device(vcl_size_t matrix_size)
+        {
+          std::vector< std::map<unsigned int, ScalarType> > L_transposed(matrix_size);
+          std::vector< std::map<unsigned int, ScalarType> > U_transposed(matrix_size);
+          std::vector<ScalarType> entries_D(matrix_size);
+
+          //
+          // Transpose individual blocks into a single large matrix:
+          //
+          for (vcl_size_t block_index = 0; block_index < LU_blocks.size(); ++block_index)
+          {
+            MatrixType const & current_block = LU_blocks[block_index];
+
+            unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(current_block.handle1());
+            unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(current_block.handle2());
+            ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(current_block.handle());
+
+            vcl_size_t block_start = block_indices_[block_index].first;
+
+            //transpose L and U:
+            for (vcl_size_t row = 0; row < current_block.size1(); ++row)
+            {
+              unsigned int buffer_col_start = row_buffer[row];
+              unsigned int buffer_col_end   = row_buffer[row+1];
+
+              for (unsigned int buf_index = buffer_col_start; buf_index < buffer_col_end; ++buf_index)
+              {
+                unsigned int col = col_buffer[buf_index];
+
+                if (row > col) //entry for L
+                  L_transposed[col + block_start][static_cast<unsigned int>(row + block_start)] = elements[buf_index];
+                else if (row == col)
+                  entries_D[row + block_start] = elements[buf_index];
+                else //entry for U
+                  U_transposed[col + block_start][static_cast<unsigned int>(row + block_start)] = elements[buf_index];
+              }
+            }
+          }
+
+          //
+          // Move data to GPU:
+          //
+          tools::const_sparse_matrix_adapter<ScalarType, unsigned int> adapted_L_transposed(L_transposed, matrix_size, matrix_size);
+          tools::const_sparse_matrix_adapter<ScalarType, unsigned int> adapted_U_transposed(U_transposed, matrix_size, matrix_size);
+          viennacl::copy(adapted_L_transposed, gpu_L_trans);
+          viennacl::copy(adapted_U_transposed, gpu_U_trans);
+          viennacl::copy(entries_D, gpu_D);
+        }
+
+        void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                     viennacl::compressed_matrix<ScalarType> & LU,
+                                     viennacl::linalg::ilu0_tag)
+        {
+          LU = mat_block;
+          viennacl::linalg::precondition(LU, tag_);
+        }
+
+        void preconditioner_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+                                     viennacl::compressed_matrix<ScalarType> & LU,
+                                     viennacl::linalg::ilut_tag)
+        {
+          std::vector< std::map<unsigned int, ScalarType> > temp(mat_block.size1());
+
+          viennacl::linalg::precondition(mat_block, temp, tag_);
+
+          viennacl::copy(temp, LU);
+        }
+
+
+        ILUTag const & tag_;
+        index_vector_type block_indices_;
+        viennacl::backend::mem_handle gpu_block_indices;
+        viennacl::compressed_matrix<ScalarType> gpu_L_trans;
+        viennacl::compressed_matrix<ScalarType> gpu_U_trans;
+        viennacl::vector<ScalarType> gpu_D;
+
+        std::vector< MatrixType > LU_blocks;
+    };
+
+
+  }
+}
+
+
+
+
+#endif
+
+
+
diff --git a/viennacl/linalg/detail/ilu/common.hpp b/viennacl/linalg/detail/ilu/common.hpp
new file mode 100644
index 0000000..e66e362
--- /dev/null
+++ b/viennacl/linalg/detail/ilu/common.hpp
@@ -0,0 +1,263 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/common.hpp
+    @brief Common routines used within ILU-type preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/misc_operations.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace detail
+    {
+
+
+      //
+      // Level Scheduling Setup for ILU:
+      //
+
+      template <typename ScalarType, unsigned int ALIGNMENT>
+      void level_scheduling_setup_impl(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & LU,
+                                       vector<ScalarType> const & diagonal_LU,
+                                       std::list< viennacl::backend::mem_handle > & row_index_arrays,
+                                       std::list< viennacl::backend::mem_handle > & row_buffers,
+                                       std::list< viennacl::backend::mem_handle > & col_buffers,
+                                       std::list< viennacl::backend::mem_handle > & element_buffers,
+                                       std::list< vcl_size_t > & row_elimination_num_list,
+                                       bool setup_U)
+      {
+        ScalarType   const * diagonal_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(diagonal_LU.handle());
+        ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU.handle());
+        unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+        unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+
+        //
+        // Step 1: Determine row elimination order for each row and build up meta information about the number of entries taking part in each elimination step:
+        //
+        std::vector<vcl_size_t> row_elimination(LU.size1());
+        std::map<vcl_size_t, std::map<vcl_size_t, vcl_size_t> > row_entries_per_elimination_step;
+
+        vcl_size_t max_elimination_runs = 0;
+        for (vcl_size_t row2 = 0; row2 < LU.size1(); ++row2)
+        {
+          vcl_size_t row = setup_U ? (LU.size1() - row2) - 1 : row2;
+
+          vcl_size_t row_begin = row_buffer[row];
+          vcl_size_t row_end   = row_buffer[row+1];
+          vcl_size_t elimination_index = 0;  //Note: first run corresponds to elimination_index = 1 (otherwise, type issues with int <-> unsigned int would arise)
+          for (vcl_size_t i = row_begin; i < row_end; ++i)
+          {
+            unsigned int col = col_buffer[i];
+            if ( (!setup_U && col < row) || (setup_U && col > row) )
+            {
+              elimination_index = std::max<vcl_size_t>(elimination_index, row_elimination[col]);
+              row_entries_per_elimination_step[row_elimination[col]][row] += 1;
+            }
+          }
+          row_elimination[row] = elimination_index + 1;
+          max_elimination_runs = std::max<vcl_size_t>(max_elimination_runs, elimination_index + 1);
+        }
+
+        //std::cout << "Number of elimination runs: " << max_elimination_runs << std::endl;
+
+        //
+        // Step 2: Build row-major elimination matrix for each elimination step
+        //
+
+        //std::cout << "Elimination order: " << std::endl;
+        //for (vcl_size_t i=0; i<row_elimination.size(); ++i)
+        //  std::cout << row_elimination[i] << ", ";
+        //std::cout << std::endl;
+
+        //vcl_size_t summed_rows = 0;
+        for (vcl_size_t elimination_run = 1; elimination_run <= max_elimination_runs; ++elimination_run)
+        {
+          std::map<vcl_size_t, vcl_size_t> const & current_elimination_info = row_entries_per_elimination_step[elimination_run];
+
+          // count cols and entries handled in this elimination step
+          vcl_size_t num_tainted_cols = current_elimination_info.size();
+          vcl_size_t num_entries = 0;
+
+          for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                                  it != current_elimination_info.end();
+                                                                ++it)
+            num_entries += it->second;
+
+          //std::cout << "num_entries: " << num_entries << std::endl;
+          //std::cout << "num_tainted_cols: " << num_tainted_cols << std::endl;
+
+          if (num_tainted_cols > 0)
+          {
+            row_index_arrays.push_back(viennacl::backend::mem_handle());
+            viennacl::backend::switch_memory_context<unsigned int>(row_index_arrays.back(), viennacl::traits::context(LU));
+            viennacl::backend::typesafe_host_array<unsigned int> elim_row_index_array(row_index_arrays.back(), num_tainted_cols);
+
+            row_buffers.push_back(viennacl::backend::mem_handle());
+            viennacl::backend::switch_memory_context<unsigned int>(row_buffers.back(), viennacl::traits::context(LU));
+            viennacl::backend::typesafe_host_array<unsigned int> elim_row_buffer(row_buffers.back(), num_tainted_cols + 1);
+
+            col_buffers.push_back(viennacl::backend::mem_handle());
+            viennacl::backend::switch_memory_context<unsigned int>(col_buffers.back(), viennacl::traits::context(LU));
+            viennacl::backend::typesafe_host_array<unsigned int> elim_col_buffer(col_buffers.back(), num_entries);
+
+            element_buffers.push_back(viennacl::backend::mem_handle());
+            viennacl::backend::switch_memory_context<ScalarType>(element_buffers.back(), viennacl::traits::context(LU));
+            std::vector<ScalarType> elim_elements_buffer(num_entries);
+
+            row_elimination_num_list.push_back(num_tainted_cols);
+
+            vcl_size_t k=0;
+            vcl_size_t nnz_index = 0;
+            elim_row_buffer.set(0, 0);
+
+            for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                                    it != current_elimination_info.end();
+                                                                  ++it)
+            {
+              //vcl_size_t col = setup_U ? (elimination_matrix.size() - it->first) - 1 : col2;
+              vcl_size_t row = it->first;
+              elim_row_index_array.set(k, row);
+
+              vcl_size_t row_begin = row_buffer[row];
+              vcl_size_t row_end   = row_buffer[row+1];
+              for (vcl_size_t i = row_begin; i < row_end; ++i)
+              {
+                unsigned int col = col_buffer[i];
+                if ( (!setup_U && col < row) || (setup_U && col > row) ) //entry of L/U
+                {
+                  if (row_elimination[col] == elimination_run) // this entry is substituted in this run
+                  {
+                    elim_col_buffer.set(nnz_index, col);
+                    elim_elements_buffer[nnz_index] = setup_U ? elements[i] / diagonal_buf[it->first] : elements[i];
+                    ++nnz_index;
+                  }
+                }
+              }
+
+              elim_row_buffer.set(++k, nnz_index);
+            }
+
+            //
+            // Wrap in memory_handles:
+            //
+            viennacl::backend::memory_create(row_index_arrays.back(), elim_row_index_array.raw_size(),                  viennacl::traits::context(row_index_arrays.back()), elim_row_index_array.get());
+            viennacl::backend::memory_create(row_buffers.back(),      elim_row_buffer.raw_size(),                       viennacl::traits::context(row_buffers.back()),      elim_row_buffer.get());
+            viennacl::backend::memory_create(col_buffers.back(),      elim_col_buffer.raw_size(),                       viennacl::traits::context(col_buffers.back()),      elim_col_buffer.get());
+            viennacl::backend::memory_create(element_buffers.back(),  sizeof(ScalarType) * elim_elements_buffer.size(), viennacl::traits::context(element_buffers.back()),  &(elim_elements_buffer[0]));
+          }
+
+          // Print some info:
+          //std::cout << "Eliminated columns in run " << elimination_run << ": " << num_tainted_cols << " (tainted columns: " << num_tainted_cols << ")" << std::endl;
+          //summed_rows += eliminated_rows_in_run;
+          //if (eliminated_rows_in_run == 0)
+          //  break;
+        }
+        //std::cout << "Eliminated rows: " << summed_rows << " out of " << row_elimination.size() << std::endl;
+      }
+
+
+      template <typename ScalarType, unsigned int ALIGNMENT>
+      void level_scheduling_setup_L(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & LU,
+                                vector<ScalarType> const & diagonal_LU,
+                                std::list< viennacl::backend::mem_handle > & row_index_arrays,
+                                std::list< viennacl::backend::mem_handle > & row_buffers,
+                                std::list< viennacl::backend::mem_handle > & col_buffers,
+                                std::list< viennacl::backend::mem_handle > & element_buffers,
+                                std::list< vcl_size_t > & row_elimination_num_list)
+      {
+        level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, false);
+      }
+
+
+      //
+      // Level scheduling setup of U:
+      //
+
+      template <typename ScalarType, unsigned int ALIGNMENT>
+      void level_scheduling_setup_U(viennacl::compressed_matrix<ScalarType, ALIGNMENT> const & LU,
+                                vector<ScalarType> const & diagonal_LU,
+                                std::list< viennacl::backend::mem_handle > & row_index_arrays,
+                                std::list< viennacl::backend::mem_handle > & row_buffers,
+                                std::list< viennacl::backend::mem_handle > & col_buffers,
+                                std::list< viennacl::backend::mem_handle > & element_buffers,
+                                std::list< vcl_size_t > & row_elimination_num_list)
+      {
+        level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, true);
+      }
+
+
+      //
+      // Level scheduling substitution (both L and U). Will partly be moved to single_threaded/opencl/cuda implementations
+      //
+      template <typename ScalarType>
+      void level_scheduling_substitute(vector<ScalarType> & vec,
+                                       std::list< viennacl::backend::mem_handle > const & row_index_arrays,
+                                       std::list< viennacl::backend::mem_handle > const & row_buffers,
+                                       std::list< viennacl::backend::mem_handle > const & col_buffers,
+                                       std::list< viennacl::backend::mem_handle > const & element_buffers,
+                                       std::list< vcl_size_t > const & row_elimination_num_list)
+      {
+        typedef typename std::list< viennacl::backend::mem_handle >::const_iterator  ListIterator;
+        ListIterator row_index_array_it = row_index_arrays.begin();
+        ListIterator row_buffers_it = row_buffers.begin();
+        ListIterator col_buffers_it = col_buffers.begin();
+        ListIterator element_buffers_it = element_buffers.begin();
+        typename std::list< vcl_size_t>::const_iterator row_elimination_num_it = row_elimination_num_list.begin();
+        for (vcl_size_t i=0; i<row_index_arrays.size(); ++i)
+        {
+          viennacl::linalg::detail::level_scheduling_substitute(vec, *row_index_array_it, *row_buffers_it, *col_buffers_it, *element_buffers_it, *row_elimination_num_it);
+
+          ++row_index_array_it;
+          ++row_buffers_it;
+          ++col_buffers_it;
+          ++element_buffers_it;
+          ++row_elimination_num_it;
+        }
+      }
+
+
+
+
+
+    } // namespace detail
+  } // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+
diff --git a/viennacl/linalg/detail/ilu/ilu0.hpp b/viennacl/linalg/detail/ilu/ilu0.hpp
new file mode 100644
index 0000000..d9b11ed
--- /dev/null
+++ b/viennacl/linalg/detail/ilu/ilu0.hpp
@@ -0,0 +1,381 @@
+
+#ifndef VIENNACL_LINALG_DETAIL_ILU0_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU0_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilu0.hpp
+  @brief Implementations of incomplete factorization preconditioners with static nonzero pattern.
+
+  Contributed by Evan Bollig.
+
+  ILU0 (Incomplete LU with zero fill-in)
+  - All preconditioner nonzeros exist at locations that were nonzero in the input matrix.
+  - The number of nonzeros in the output preconditioner is exactly the same as in the input matrix
+
+ Evan Bollig 3/30/12
+
+ Adapted from viennacl/linalg/detail/ilut.hpp
+
+ Low-level reimplementation by Karl Rupp in Nov 2012, increasing performance substantially. Also added level-scheduling.
+
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief A tag for incomplete LU factorization with static pattern (ILU0). Optionally enables level scheduling (multifrontal substitution) for the triangular solves on GPUs.
+    */
+    class ilu0_tag
+    {
+      public:
+        ilu0_tag(bool with_level_scheduling = false) : use_level_scheduling_(with_level_scheduling) {}
+
+        bool use_level_scheduling() const { return use_level_scheduling_; }  // returns whether level scheduling is enabled
+        void use_level_scheduling(bool b) { use_level_scheduling_ = b; }     // enables/disables level scheduling
+
+      private:
+        bool use_level_scheduling_;
+    };
+
+
+    /** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices.
+      *
+      * refer to the Algorithm in Saad's book (1996 edition)
+      *
+      *  @param A       The sparse system matrix in CSR format, residing in main memory. The result is directly written to A.
+      */
+    template<typename ScalarType>
+    void precondition(viennacl::compressed_matrix<ScalarType> & A, ilu0_tag const & /* tag */)
+    {
+      assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+      assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+      assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+      ScalarType         * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+      // Note: Line numbers in the following refer to the algorithm in Saad's book
+
+      for (vcl_size_t i=1; i<A.size1(); ++i)  // Line 1
+      {
+        unsigned int row_i_begin = row_buffer[i];
+        unsigned int row_i_end   = row_buffer[i+1];
+        for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) //Note: We do not assume that the column indices within a row are sorted
+        {
+          unsigned int k = col_buffer[buf_index_k];
+          if (k >= i)
+            continue; //Note: We do not assume that the column indices within a row are sorted
+
+          unsigned int row_k_begin = row_buffer[k];
+          unsigned int row_k_end   = row_buffer[k+1];
+
+          // get a_kk:
+          ScalarType a_kk = 0;
+          for (unsigned int buf_index_akk = row_k_begin; buf_index_akk < row_k_end; ++buf_index_akk)
+          {
+            if (col_buffer[buf_index_akk] == k)
+            {
+              a_kk = elements[buf_index_akk];
+              break;
+            }
+          }
+
+          ScalarType & a_ik = elements[buf_index_k];
+          a_ik /= a_kk;                                 //Line 3
+
+          for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) //Note: We do not assume that the column indices within a row are sorted
+          {
+            unsigned int j = col_buffer[buf_index_j];
+            if (j <= k)
+              continue;
+
+            // determine a_kj:
+            ScalarType a_kj = 0;
+            for (unsigned int buf_index_akj = row_k_begin; buf_index_akj < row_k_end; ++buf_index_akj)
+            {
+              if (col_buffer[buf_index_akj] == j)
+              {
+                a_kj = elements[buf_index_akj];  // bug fix: previously assigned to a_kk here, leaving a_kj == 0 and corrupting the pivot
+                break;
+              }
+            }
+
+            //a_ij -= a_ik * a_kj
+            elements[buf_index_j] -= a_ik * a_kj;  //Line 5
+          }
+        }
+      }
+
+    }
+
+
+    /** @brief ILU0 preconditioner class, can be supplied to solve()-routines
+    */
+    template <typename MatrixType>
+    class ilu0_precond
+    {
+        typedef typename MatrixType::value_type      ScalarType;
+
+      public:
+        ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : tag_(tag), LU()
+        {
+            //initialize preconditioner:
+            //std::cout << "Start CPU precond" << std::endl;
+            init(mat);
+            //std::cout << "End CPU precond" << std::endl;
+        }
+
+        template <typename VectorType>
+        void apply(VectorType & vec) const
+        {
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU.handle());
+
+          viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), unit_lower_tag());
+          viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), upper_tag());
+        }
+
+      private:
+        void init(MatrixType const & mat)
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          viennacl::switch_memory_context(LU, host_context);
+
+          viennacl::copy(mat, LU);
+          viennacl::linalg::precondition(LU, tag_);
+        }
+
+        ilu0_tag tag_;  // stored by value: a reference member would dangle if the caller passes a temporary tag
+
+        viennacl::compressed_matrix<ScalarType> LU;
+    };
+
+
+    /** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
+      *
+      *  Specialization for compressed_matrix
+      */
+    template <typename ScalarType, unsigned int MAT_ALIGNMENT>
+    class ilu0_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
+    {
+        typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
+
+      public:
+        ilu0_precond(MatrixType const & mat, ilu0_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
+        {
+          //initialize preconditioner:
+          //std::cout << "Start GPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End GPU precond" << std::endl;
+        }
+
+        void apply(vector<ScalarType> & vec) const
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+          {
+            if (tag_.use_level_scheduling())
+            {
+              //std::cout << "Using multifrontal on GPU..." << std::endl;
+              detail::level_scheduling_substitute(vec,
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+              vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+              detail::level_scheduling_substitute(vec,
+                                                  multifrontal_U_row_index_arrays_,
+                                                  multifrontal_U_row_buffers_,
+                                                  multifrontal_U_col_buffers_,
+                                                  multifrontal_U_element_buffers_,
+                                                  multifrontal_U_row_elimination_num_list_);
+            }
+            else
+            {
+              viennacl::context old_context = viennacl::traits::context(vec);
+              viennacl::switch_memory_context(vec, host_context);
+              viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+              viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+              viennacl::switch_memory_context(vec, old_context);
+            }
+          }
+          else //apply ILU0 directly on CPU
+          {
+            if (tag_.use_level_scheduling())
+            {
+              //std::cout << "Using multifrontal..." << std::endl;
+              detail::level_scheduling_substitute(vec,
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+              vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+              detail::level_scheduling_substitute(vec,
+                                                  multifrontal_U_row_index_arrays_,
+                                                  multifrontal_U_row_buffers_,
+                                                  multifrontal_U_col_buffers_,
+                                                  multifrontal_U_element_buffers_,
+                                                  multifrontal_U_row_elimination_num_list_);
+            }
+            else
+            {
+              viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+              viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+            }
+          }
+        }
+
+        vcl_size_t levels() const { return multifrontal_L_row_index_arrays_.size(); }
+
+      private:
+        void init(MatrixType const & mat)
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          viennacl::switch_memory_context(LU, host_context);
+          LU = mat;
+          viennacl::linalg::precondition(LU, tag_);
+
+          if (!tag_.use_level_scheduling())
+            return;
+
+          // multifrontal part:
+          viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+          multifrontal_U_diagonal_.resize(LU.size1(), false);
+          host_based::detail::row_info(LU, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+          detail::level_scheduling_setup_L(LU,
+                                           multifrontal_U_diagonal_, //dummy
+                                           multifrontal_L_row_index_arrays_,
+                                           multifrontal_L_row_buffers_,
+                                           multifrontal_L_col_buffers_,
+                                           multifrontal_L_element_buffers_,
+                                           multifrontal_L_row_elimination_num_list_);
+
+
+          detail::level_scheduling_setup_U(LU,
+                                           multifrontal_U_diagonal_,
+                                           multifrontal_U_row_index_arrays_,
+                                           multifrontal_U_row_buffers_,
+                                           multifrontal_U_col_buffers_,
+                                           multifrontal_U_element_buffers_,
+                                           multifrontal_U_row_elimination_num_list_);
+
+          //
+          // Bring to device if necessary:
+          //
+
+          // L:
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                             it != multifrontal_L_row_index_arrays_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                             it != multifrontal_L_row_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                             it != multifrontal_L_col_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                             it != multifrontal_L_element_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+
+
+          // U:
+
+          viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                             it != multifrontal_U_row_index_arrays_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                             it != multifrontal_U_row_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                             it != multifrontal_U_col_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                             it != multifrontal_U_element_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+
+        }
+
+        ilu0_tag tag_;   // stored by value so that a temporary tag passed to the constructor cannot dangle
+        viennacl::compressed_matrix<ScalarType> LU;
+
+        std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+        std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+        std::list< vcl_size_t > multifrontal_L_row_elimination_num_list_;
+
+        viennacl::vector<ScalarType> multifrontal_U_diagonal_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+        std::list< vcl_size_t > multifrontal_U_row_elimination_num_list_;
+
+    };
+
+  }
+}
+
+
+
+
+#endif
+
+
+
diff --git a/viennacl/linalg/detail/ilu/ilut.hpp b/viennacl/linalg/detail/ilu/ilut.hpp
new file mode 100644
index 0000000..311f0c1
--- /dev/null
+++ b/viennacl/linalg/detail/ilu/ilut.hpp
@@ -0,0 +1,486 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILUT_HPP_
+#define VIENNACL_LINALG_DETAIL_ILUT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilut.hpp
+    @brief Implementations of an incomplete factorization preconditioner with threshold (ILUT)
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief A tag for incomplete LU factorization with threshold (ILUT), carrying the fill-in limit and the drop tolerance.
+    */
+    class ilut_tag
+    {
+      public:
+        /** @brief The constructor.
+        *
+        * @param entries_per_row        Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
+        * @param drop_tolerance         The drop tolerance for ILUT
+        * @param with_level_scheduling  Flag for enabling level scheduling on GPUs.
+        */
+        ilut_tag(unsigned int entries_per_row = 20,
+                 double drop_tolerance = 1e-4,
+                 bool with_level_scheduling = false) : entries_per_row_(entries_per_row), drop_tolerance_(drop_tolerance), use_level_scheduling_(with_level_scheduling) {}
+
+        void set_drop_tolerance(double tol)
+        {
+          if (tol > 0)  // non-positive tolerances are silently ignored
+            drop_tolerance_ = tol;
+        }
+        double get_drop_tolerance() const { return drop_tolerance_; }
+
+        void set_entries_per_row(unsigned int e)
+        {
+          if (e > 0)  // zero entries per row is silently ignored
+            entries_per_row_ = e;
+        }
+
+        unsigned int get_entries_per_row() const { return entries_per_row_; }
+
+        bool use_level_scheduling() const { return use_level_scheduling_; }
+        void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+      private:
+        unsigned int entries_per_row_;
+        double drop_tolerance_;
+        bool use_level_scheduling_;
+    };
+
+
+    /** @brief Dispatcher overload for extracting the row of nonzeros of a compressed matrix. Fills w with the row entries and returns the Euclidean norm of the row. */
+    template <typename ScalarType, typename SizeType, typename SparseVector>
+    ScalarType setup_w(viennacl::compressed_matrix<ScalarType> const & A,
+                       SizeType row,
+                       SparseVector & w)
+    {
+      assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILUT") );
+      assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILUT") );
+      assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILUT") );
+
+      ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+      SizeType row_i_begin = static_cast<SizeType>(row_buffer[row]);
+      SizeType row_i_end   = static_cast<SizeType>(row_buffer[row+1]);
+      ScalarType row_norm = 0;
+      for (SizeType buf_index_i = row_i_begin; buf_index_i < row_i_end; ++buf_index_i) //Note: We do not assume that the column indices within a row are sorted
+      {
+        ScalarType entry = elements[buf_index_i];
+        w[col_buffer[buf_index_i]] = entry;
+        row_norm += entry * entry;
+      }
+      return std::sqrt(row_norm);
+    }
+
+    /** @brief Dispatcher overload for extracting the row of nonzeros of an STL-grown sparse matrix. Copies the row into w and returns the Euclidean norm of the row. */
+    template <typename ScalarType, typename SizeType, typename SparseVector>
+    ScalarType setup_w(std::vector< std::map<SizeType, ScalarType> > const & A,
+                       SizeType row,
+                       SparseVector & w)
+    {
+      ScalarType row_norm = 0;
+      w = A[row];  // copy the whole sparse row at once
+      for (typename std::map<SizeType, ScalarType>::const_iterator iter_w  = w.begin(); iter_w != w.end(); ++iter_w)
+        row_norm += iter_w->second * iter_w->second;
+
+      return std::sqrt(row_norm);
+    }
+
+
+    /** @brief Implementation of a ILU-preconditioner with threshold. Optimized implementation for compressed_matrix.
+    *
+    * refer to Algorithm 10.6 in Saad's book (1996 edition)
+    *
+    *  @param A       The input matrix. Either a compressed_matrix or of type std::vector< std::map<T, U> >
+    *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
+    *  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
+    */
+    template<typename SparseMatrixType, typename ScalarType, typename SizeType>
+    void precondition(SparseMatrixType const & A,
+                      std::vector< std::map<SizeType, ScalarType> > & output,
+                      ilut_tag const & tag)
+    {
+      typedef std::map<SizeType, ScalarType>          SparseVector;
+      typedef typename SparseVector::iterator         SparseVectorIterator;
+      typedef typename std::map<SizeType, ScalarType>::const_iterator   OutputRowConstIterator;
+      typedef std::multimap<ScalarType, std::pair<SizeType, ScalarType> >  TemporarySortMap;
+
+      assert(viennacl::traits::size1(A) == output.size() && bool("Output matrix size mismatch") );
+
+      SparseVector w;        // sparse working row (Saad's 'w')
+      TemporarySortMap temp_map;
+
+      for (SizeType i=0; i<viennacl::traits::size1(A); ++i)  // Line 1
+      {
+        // 'Line N' comments below refer to Algorithm 10.6 in Saad's book.
+        // w holds the current sparse row; row_norm is its Euclidean norm.
+
+        //line 2: set up w
+        ScalarType row_norm = setup_w(A, i, w);
+        ScalarType tau_i = static_cast<ScalarType>(tag.get_drop_tolerance()) * row_norm;  // relative drop threshold for row i
+
+        //line 3:
+        for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); ++w_k)
+        {
+          SizeType k = w_k->first;
+          if (k >= i)
+            break;
+
+          //line 4:
+          ScalarType a_kk = output[k][k];
+          if (a_kk == 0)
+          {
+            std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry is zero in row " << k
+                      << " while processing line " << i << "!" << std::endl;
+            throw "ILUT zero diagonal!";
+          }
+
+          ScalarType w_k_entry = w_k->second / a_kk;
+          w_k->second = w_k_entry;
+
+          //line 5: (dropping rule to w_k)
+          if ( std::fabs(w_k_entry) > tau_i)
+          {
+            //line 7:
+            for (OutputRowConstIterator u_k = output[k].begin(); u_k != output[k].end(); ++u_k)
+            {
+              if (u_k->first > k)
+                w[u_k->first] -= w_k_entry * u_k->second;  // note: map insertion does not invalidate w_k
+            }
+          }
+          //else
+          //  w.erase(k);
+
+        } //for w_k
+
+        //Line 10: Apply a dropping rule to w
+        //Sort entries which are kept
+        temp_map.clear();
+        for (SparseVectorIterator w_k = w.begin(); w_k != w.end(); ++w_k)
+        {
+          SizeType k = w_k->first;
+          ScalarType w_k_entry = w_k->second;
+
+          ScalarType abs_w_k = std::fabs(w_k_entry);
+          if ( (abs_w_k > tau_i) || (k == i) )//do not drop diagonal element!
+          {
+
+            if (abs_w_k == 0) // this can only happen for diagonal entry
+              throw "Triangular factor in ILUT singular!";
+
+            temp_map.insert(std::make_pair(abs_w_k, std::make_pair(k, w_k_entry)));
+          }
+        }
+
+        //Lines 10-12: write the largest p values to L and U
+        SizeType written_L = 0;
+        SizeType written_U = 0;
+        for (typename TemporarySortMap::reverse_iterator iter = temp_map.rbegin(); iter != temp_map.rend(); ++iter)  // reverse: largest magnitudes first
+        {
+          std::map<SizeType, ScalarType> & row_i = output[i];
+          SizeType j = (iter->second).first;
+          ScalarType w_j_entry = (iter->second).second;
+
+          if (j < i) // Line 11: entry for L
+          {
+            if (written_L < tag.get_entries_per_row())
+            {
+              row_i[j] = w_j_entry;
+              ++written_L;
+            }
+          }
+          else if (j == i)  // Diagonal entry is always kept
+          {
+            row_i[j] = w_j_entry;
+          }
+          else //Line 12: entry for U
+          {
+            if (written_U < tag.get_entries_per_row())
+            {
+              row_i[j] = w_j_entry;
+              ++written_U;
+            }
+          }
+        }
+
+        w.clear(); //Line 13
+
+      } //for i
+    }
+
+
+    /** @brief ILUT preconditioner class, can be supplied to solve()-routines
+    */
+    template <typename MatrixType>
+    class ilut_precond
+    {
+      typedef typename MatrixType::value_type      ScalarType;
+
+      public:
+        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
+        {
+          //initialize preconditioner:
+          //std::cout << "Start CPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End CPU precond" << std::endl;
+        }
+
+        template <typename VectorType>
+        void apply(VectorType & vec) const
+        {
+          //Note: Since vec can be a rather arbitrary vector type, we call the more generic version in the backend manually:
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LU.handle());
+
+          viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), unit_lower_tag());
+          viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LU.size2(), upper_tag());
+        }
+
+      private:
+        void init(MatrixType const & mat)
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          viennacl::compressed_matrix<ScalarType> temp;
+          viennacl::switch_memory_context(temp, host_context);
+
+          viennacl::copy(mat, temp);
+
+          std::vector< std::map<unsigned int, ScalarType> > LU_temp(mat.size1());
+
+          viennacl::linalg::precondition(temp, LU_temp, tag_);
+
+          viennacl::switch_memory_context(LU, host_context);
+          viennacl::copy(LU_temp, LU);
+        }
+
+        ilut_tag tag_;  // stored by value: a reference member would dangle if the caller passes a temporary tag
+        viennacl::compressed_matrix<ScalarType> LU;
+    };
+
+
+    /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
+    *
+    *  Specialization for compressed_matrix
+    */
+    template <typename ScalarType, unsigned int MAT_ALIGNMENT>
+    class ilut_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
+    {
+      typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
+
+      public:
+        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : tag_(tag), LU(mat.size1(), mat.size2())
+        {
+          //initialize preconditioner:
+          //std::cout << "Start GPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End GPU precond" << std::endl;
+        }
+
+        void apply(vector<ScalarType> & vec) const
+        {
+          if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+          {
+            if (tag_.use_level_scheduling())
+            {
+              //std::cout << "Using multifrontal on GPU..." << std::endl;
+              detail::level_scheduling_substitute(vec,
+                                                  multifrontal_L_row_index_arrays_,
+                                                  multifrontal_L_row_buffers_,
+                                                  multifrontal_L_col_buffers_,
+                                                  multifrontal_L_element_buffers_,
+                                                  multifrontal_L_row_elimination_num_list_);
+
+              vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+              detail::level_scheduling_substitute(vec,
+                                                  multifrontal_U_row_index_arrays_,
+                                                  multifrontal_U_row_buffers_,
+                                                  multifrontal_U_col_buffers_,
+                                                  multifrontal_U_element_buffers_,
+                                                  multifrontal_U_row_elimination_num_list_);
+            }
+            else
+            {
+              viennacl::context host_context(viennacl::MAIN_MEMORY);
+              viennacl::context old_context = viennacl::traits::context(vec);
+              viennacl::switch_memory_context(vec, host_context);
+              viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+              viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+              viennacl::switch_memory_context(vec, old_context);
+            }
+          }
+          else //apply ILUT directly:
+          {
+            viennacl::linalg::inplace_solve(LU, vec, unit_lower_tag());
+            viennacl::linalg::inplace_solve(LU, vec, upper_tag());
+          }
+        }
+
+      private:
+        void init(MatrixType const & mat)
+        {
+          viennacl::context host_context(viennacl::MAIN_MEMORY);
+          viennacl::switch_memory_context(LU, host_context);
+
+          std::vector< std::map<unsigned int, ScalarType> > LU_temp(mat.size1());
+
+          if (viennacl::traits::context(mat).memory_type() == viennacl::MAIN_MEMORY)
+          {
+            viennacl::linalg::precondition(mat, LU_temp, tag_);
+          }
+          else //we need to copy to CPU
+          {
+            viennacl::compressed_matrix<ScalarType> cpu_mat(mat.size1(), mat.size2());
+            viennacl::switch_memory_context(cpu_mat, host_context);
+
+            cpu_mat = mat;
+
+            viennacl::linalg::precondition(cpu_mat, LU_temp, tag_);
+          }
+
+          viennacl::copy(LU_temp, LU);
+
+          if (!tag_.use_level_scheduling())
+            return;
+
+          //
+          // multifrontal part:
+          //
+
+          viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+          multifrontal_U_diagonal_.resize(LU.size1(), false);
+          host_based::detail::row_info(LU, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+          detail::level_scheduling_setup_L(LU,
+                                           multifrontal_U_diagonal_, //dummy
+                                           multifrontal_L_row_index_arrays_,
+                                           multifrontal_L_row_buffers_,
+                                           multifrontal_L_col_buffers_,
+                                           multifrontal_L_element_buffers_,
+                                           multifrontal_L_row_elimination_num_list_);
+
+
+          detail::level_scheduling_setup_U(LU,
+                                           multifrontal_U_diagonal_,
+                                           multifrontal_U_row_index_arrays_,
+                                           multifrontal_U_row_buffers_,
+                                           multifrontal_U_col_buffers_,
+                                           multifrontal_U_element_buffers_,
+                                           multifrontal_U_row_elimination_num_list_);
+
+          //
+          // Bring to device if necessary:
+          //
+
+          // L:
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                             it != multifrontal_L_row_index_arrays_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                             it != multifrontal_L_row_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                             it != multifrontal_L_col_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                             it != multifrontal_L_element_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+
+
+          // U:
+
+          viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                             it != multifrontal_U_row_index_arrays_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                             it != multifrontal_U_row_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                             it != multifrontal_U_col_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+          for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                             it != multifrontal_U_element_buffers_.end();
+                                                                           ++it)
+            viennacl::backend::switch_memory_context<ScalarType>(*it, viennacl::traits::context(mat));
+
+
+        }
+
+        ilut_tag const & tag_;
+        viennacl::compressed_matrix<ScalarType> LU;
+
+        std::list< viennacl::backend::mem_handle > multifrontal_L_row_index_arrays_;
+        std::list< viennacl::backend::mem_handle > multifrontal_L_row_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_L_col_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_L_element_buffers_;
+        std::list< vcl_size_t > multifrontal_L_row_elimination_num_list_;
+
+        viennacl::vector<ScalarType> multifrontal_U_diagonal_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_row_index_arrays_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_row_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_col_buffers_;
+        std::list< viennacl::backend::mem_handle > multifrontal_U_element_buffers_;
+        std::list< vcl_size_t > multifrontal_U_row_elimination_num_list_;
+    };
+
+  }
+}
+
+
+
+
+#endif
+
+
+
diff --git a/viennacl/linalg/detail/op_applier.hpp b/viennacl/linalg/detail/op_applier.hpp
new file mode 100644
index 0000000..b73b1ca
--- /dev/null
+++ b/viennacl/linalg/detail/op_applier.hpp
@@ -0,0 +1,103 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+#define VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_applier.hpp
+ *
+ * @brief Defines the action of certain unary and binary operators and its arguments (for host execution).
+*/
+
+#include "viennacl/forwards.h"
+#include <cmath>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace detail
+    {
+
+      /** @brief Worker class for decomposing expression templates.
+        *
+        * @tparam A    Type to which is assigned to
+        * @tparam OP   One out of {op_assign, op_inplace_add, op_inplace_sub}
+        @ @tparam T    Right hand side of the assignment
+      */
+      template <typename OP>
+      struct op_applier
+      {
+        typedef typename OP::ERROR_UNKNOWN_OP_TAG_PROVIDED    error_type;
+      };
+
+      /** \cond */
+      template <>
+      struct op_applier<op_element_binary<op_prod> >
+      {
+        template <typename T>
+        static void apply(T & result, T const & x, T const & y) { result = x * y; }
+      };
+
+      template <>
+      struct op_applier<op_element_binary<op_div> >
+      {
+        template <typename T>
+        static void apply(T & result, T const & x, T const & y) { result = x / y; }
+      };
+
+      template <>
+      struct op_applier<op_element_binary<op_pow> >
+      {
+        template <typename T>
+        static void apply(T & result, T const & x, T const & y) { result = std::pow(x, y); }
+      };
+
+#define VIENNACL_MAKE_UNARY_OP_APPLIER(funcname)  \
+      template <> \
+      struct op_applier<op_element_unary<op_##funcname> > \
+      { \
+        template <typename T> \
+        static void apply(T & result, T const & x) { using namespace std; result = funcname(x); } \
+      }
+
+      VIENNACL_MAKE_UNARY_OP_APPLIER(abs);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(acos);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(asin);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(atan);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(ceil);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(cos);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(cosh);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(exp);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(fabs);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(floor);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(log);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(log10);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(sin);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(sinh);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(sqrt);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(tan);
+      VIENNACL_MAKE_UNARY_OP_APPLIER(tanh);
+
+#undef VIENNACL_MAKE_UNARY_OP_APPLIER
+      /** \endcond */
+
+    }
+  }
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
diff --git a/viennacl/linalg/detail/op_executor.hpp b/viennacl/linalg/detail/op_executor.hpp
new file mode 100644
index 0000000..327a4eb
--- /dev/null
+++ b/viennacl/linalg/detail/op_executor.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+#define VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_executor.hpp
+ *
+ * @brief Defines the worker class for decomposing an expression tree into small chunks, which can be processed by the predefined operations in ViennaCL.
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace detail
+    {
+      template <typename T, typename B>
+      bool op_aliasing(vector_base<T> const & /*lhs*/, B const & /*b*/)
+      {
+        return false;
+      }
+
+      template <typename T>
+      bool op_aliasing(vector_base<T> const & lhs, vector_base<T> const & b)
+      {
+        return lhs.handle() == b.handle();
+      }
+
+      template <typename T, typename LHS, typename RHS, typename OP>
+      bool op_aliasing(vector_base<T> const & lhs, vector_expression<const LHS, const RHS, OP> const & rhs)
+      {
+        return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+      }
+
+
+      template <typename T, typename F, typename B>
+      bool op_aliasing(matrix_base<T, F> const & /*lhs*/, B const & /*b*/)
+      {
+        return false;
+      }
+
+      template <typename T, typename F>
+      bool op_aliasing(matrix_base<T, F> const & lhs, matrix_base<T, F> const & b)
+      {
+        return lhs.handle() == b.handle();
+      }
+
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      bool op_aliasing(matrix_base<T, F> const & lhs, matrix_expression<const LHS, const RHS, OP> const & rhs)
+      {
+        return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+      }
+
+
+      /** @brief Worker class for decomposing expression templates.
+        *
+        * @tparam A    Type to which is assigned to
+        * @tparam OP   One out of {op_assign, op_inplace_add, op_inplace_sub}
+        @ @tparam T    Right hand side of the assignment
+      */
+      template <typename A, typename OP, typename T>
+      struct op_executor {};
+
+    }
+  }
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
diff --git a/viennacl/linalg/detail/spai/block_matrix.hpp b/viennacl/linalg/detail/spai/block_matrix.hpp
index ce56486..44141ba 100644
--- a/viennacl/linalg/detail/spai/block_matrix.hpp
+++ b/viennacl/linalg/detail/spai/block_matrix.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -28,7 +29,7 @@
 
 /** @file viennacl/linalg/detail/spai/block_matrix.hpp
     @brief Implementation of a bunch of (small) matrices on GPU. Experimental.
-    
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -40,51 +41,49 @@ namespace viennacl
       {
         namespace spai
         {
-          
+
           /**
           * @brief Represents a contigious matrices on GPU
           */
-          
-          class block_matrix{
-          public:
-              block_matrix(){
-                  
-              }
+
+          class block_matrix
+          {
+            public:
+
               /**
               * @brief Returns a handle to the elements
               */
-              viennacl::ocl::handle<cl_mem>& handle(){ return _elements; }
+              viennacl::ocl::handle<cl_mem>& handle(){ return elements_; }
               /**
               * @brief Returns a handle to the matrix dimensions
               */
-              viennacl::ocl::handle<cl_mem>& handle1() { return _matrix_dimensions; }
+              viennacl::ocl::handle<cl_mem>& handle1() { return matrix_dimensions_; }
               /**
               * @brief Returns a handle to the start indices of matrix
               */
-              viennacl::ocl::handle<cl_mem>& handle2() { return _start_block_inds; }
-              
+              viennacl::ocl::handle<cl_mem>& handle2() { return start_block_inds_; }
+
               /**
               * @brief Returns a handle to the const elements
               */
-              const viennacl::ocl::handle<cl_mem>& handle() const { return _elements; }
+              const viennacl::ocl::handle<cl_mem>& handle() const { return elements_; }
               /**
               * @brief Returns a handle to the const matrix dimensions
               */
-              const viennacl::ocl::handle<cl_mem>& handle1() const { return _matrix_dimensions; }
+              const viennacl::ocl::handle<cl_mem>& handle1() const { return matrix_dimensions_; }
               /**
               * @brief Returns a handle to the const start indices of matrix
               */
-              const viennacl::ocl::handle<cl_mem>& handle2() const { return _start_block_inds; }
-          private:
-              //unsigned int _vectorIndex;
-              viennacl::ocl::handle<cl_mem> _elements;
-              viennacl::ocl::handle<cl_mem> _matrix_dimensions;
-              viennacl::ocl::handle<cl_mem> _start_block_inds;
+              const viennacl::ocl::handle<cl_mem>& handle2() const { return start_block_inds_; }
+            private:
+              viennacl::ocl::handle<cl_mem> elements_;
+              viennacl::ocl::handle<cl_mem> matrix_dimensions_;
+              viennacl::ocl::handle<cl_mem> start_block_inds_;
           };
-        
-        
+
+
         }
       }
     }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/detail/spai/block_vector.hpp b/viennacl/linalg/detail/spai/block_vector.hpp
index 2e85b05..2253e38 100644
--- a/viennacl/linalg/detail/spai/block_vector.hpp
+++ b/viennacl/linalg/detail/spai/block_vector.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -28,7 +29,7 @@
 
 /** @file viennacl/linalg/detail/spai/block_vector.hpp
     @brief Implementation of a bunch of vectors on GPU. Experimental.
-    
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -40,39 +41,38 @@ namespace viennacl
       {
         namespace spai
         {
-        
+
           /**
           * @brief Represents a contigious vector on GPU
           */
-          
-          class block_vector{
-          public:
-              block_vector(){
-              }
+
+          class block_vector
+          {
+            public:
+
               /**
               * @brief Return handle to the elements
               */
-              viennacl::ocl::handle<cl_mem>& handle(){ return _elements; }
+              viennacl::ocl::handle<cl_mem>& handle(){ return elements_; }
               /**
               * @brief Return handle to start indices
               */
-              viennacl::ocl::handle<cl_mem>& handle1() { return _start_block_inds; }
-              
+              viennacl::ocl::handle<cl_mem>& handle1() { return start_block_inds_; }
+
               /**
               * @brief Return handle to the const elements
               */
-              const viennacl::ocl::handle<cl_mem>& handle() const { return _elements; }
+              const viennacl::ocl::handle<cl_mem>& handle() const { return elements_; }
               /**
               * @brief Return handle to const start indices
               */
-              const viennacl::ocl::handle<cl_mem>& handle1() const { return _start_block_inds; }
-          private:
-              //unsigned int _vectorIndex;
-              viennacl::ocl::handle<cl_mem> _elements;
-              viennacl::ocl::handle<cl_mem> _start_block_inds;
+              const viennacl::ocl::handle<cl_mem>& handle1() const { return start_block_inds_; }
+            private:
+              viennacl::ocl::handle<cl_mem> elements_;
+              viennacl::ocl::handle<cl_mem> start_block_inds_;
           };
         }
       }
     }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/detail/spai/fspai.hpp b/viennacl/linalg/detail/spai/fspai.hpp
index 85c3d29..76207f9 100644
--- a/viennacl/linalg/detail/spai/fspai.hpp
+++ b/viennacl/linalg/detail/spai/fspai.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -41,7 +42,7 @@
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/compressed_matrix_operations.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
 #include "viennacl/linalg/matrix_operations.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/linalg/cg.hpp"
@@ -61,7 +62,7 @@ namespace viennacl
       {
         namespace spai
         {
-        
+
           /** @brief A tag for FSPAI. Experimental.
           * Contains values for the algorithm.
           * Must be passed to spai_precond constructor
@@ -76,45 +77,45 @@ namespace viennacl
           public:
               fspai_tag(
                       double residual_norm_threshold = 1e-3,
-                      unsigned int iteration_limit = 5, 
+                      unsigned int iteration_limit = 5,
                       bool is_static = false,
                       bool is_right = false) :
-              _residual_norm_threshold(residual_norm_threshold),
-              _iteration_limit(iteration_limit),
-              _is_static(is_static),
-              _is_right(is_right){};
-              
-              inline const double getResidualNormThreshold() const
-              { return _residual_norm_threshold; }
-              inline const unsigned long getIterationLimit () const
-              { return _iteration_limit; }
-              inline const bool getIsStatic() const
-              { return _is_static; }
-              inline const bool getIsRight() const
-              { return _is_right; }
+              residual_norm_threshold_(residual_norm_threshold),
+              iteration_limit_(iteration_limit),
+              is_static_(is_static),
+              is_right_(is_right) {}
+
+              inline double getResidualNormThreshold() const
+              { return residual_norm_threshold_; }
+              inline unsigned long getIterationLimit () const
+              { return iteration_limit_; }
+              inline bool getIsStatic() const
+              { return is_static_; }
+              inline bool getIsRight() const
+              { return is_right_; }
               inline void setResidualNormThreshold(double residual_norm_threshold){
                   if(residual_norm_threshold > 0)
-                      _residual_norm_threshold = residual_norm_threshold;
+                      residual_norm_threshold_ = residual_norm_threshold;
               }
               inline void setIterationLimit(unsigned long iteration_limit){
                   if(iteration_limit > 0)
-                      _iteration_limit = iteration_limit;
+                      iteration_limit_ = iteration_limit;
               }
               inline void setIsRight(bool is_right){
-                  _is_right = is_right;
+                  is_right_ = is_right;
               }
               inline void setIsStatic(bool is_static){
-                  _is_static = is_static;
+                  is_static_ = is_static;
               }
-              
+
           private:
-              double _residual_norm_threshold;
-              unsigned long _iteration_limit;
-              bool _is_static;
-              bool _is_right;
+              double residual_norm_threshold_;
+              unsigned long iteration_limit_;
+              bool is_static_;
+              bool is_right_;
           };
-          
-          
+
+
           //
           // Helper: Store A in an STL container of type, exploiting symmetry
           // Reason: ublas interface does not allow to iterate over nonzeros of a particular row without starting an iterator1 from the very beginning of the matrix...
@@ -132,19 +133,19 @@ namespace viennacl
                                                       ++col_it)
               {
                 if (col_it.index1() >= col_it.index2())
-                  STL_A[col_it.index1()][col_it.index2()] = *col_it;
+                  STL_A[col_it.index1()][static_cast<unsigned int>(col_it.index2())] = *col_it;
                 else
                   break; //go to next row
               }
             }
           }
-          
-          
+
+
           //
           // Generate index sets J_k, k=0,...,N-1
           //
           template <typename MatrixType>
-          void generateJ(MatrixType const & A, std::vector<std::vector<size_t> > & J)
+          void generateJ(MatrixType const & A, std::vector<std::vector<vcl_size_t> > & J)
           {
             for (typename MatrixType::const_iterator1 row_it  = A.begin1();
                                                       row_it != A.end1();
@@ -167,163 +168,161 @@ namespace viennacl
 
 
           //
-          // Extracts the blocks A(\tilde{J}_k, \tilde{J}_k) from A 
+          // Extracts the blocks A(\tilde{J}_k, \tilde{J}_k) from A
           // Sets up y_k = A(\tilde{J}_k, k) for the inplace-solution after Cholesky-factoriation
           //
           template <typename ScalarType, typename MatrixType, typename VectorType>
           void fill_blocks(std::vector< std::map<unsigned int, ScalarType> > & A,
                           std::vector<MatrixType> & blocks,
-                          std::vector<std::vector<size_t> > const & J,
+                          std::vector<std::vector<vcl_size_t> > const & J,
                           std::vector<VectorType> & Y)
           {
-            for (size_t k=0; k<A.size(); ++k)
+            for (vcl_size_t k=0; k<A.size(); ++k)
             {
-              std::vector<size_t> const & Jk = J[k];
+              std::vector<vcl_size_t> const & Jk = J[k];
               VectorType & yk = Y[k];
               MatrixType & block_k = blocks[k];
 
               yk.resize(Jk.size());
               block_k.resize(Jk.size(), Jk.size());
               block_k.clear();
-              
-              for (size_t i=0; i<Jk.size(); ++i)
+
+              for (vcl_size_t i=0; i<Jk.size(); ++i)
               {
-                size_t row_index = Jk[i];
+                vcl_size_t row_index = Jk[i];
                 std::map<unsigned int, ScalarType> & A_row = A[row_index];
-                
+
                 //fill y_k:
-                yk[i] = A_row[k];
-                
-                for (size_t j=0; j<Jk.size(); ++j)
+                yk[i] = A_row[static_cast<unsigned int>(k)];
+
+                for (vcl_size_t j=0; j<Jk.size(); ++j)
                 {
-                  size_t col_index = Jk[j];
-                  if (col_index <= row_index && A_row.find(col_index) != A_row.end()) //block is symmetric, thus store only lower triangular part
-                    block_k(i, j) = A_row[col_index];
+                  vcl_size_t col_index = Jk[j];
+                  if (col_index <= row_index && A_row.find(static_cast<unsigned int>(col_index)) != A_row.end()) //block is symmetric, thus store only lower triangular part
+                    block_k(i, j) = A_row[static_cast<unsigned int>(col_index)];
                 }
               }
             }
           }
-          
-          
+
+
           //
           // Perform Cholesky factorization of A inplace. Cf. Schwarz: Numerische Mathematik, vol 5, p. 58
           //
           template <typename MatrixType>
           void cholesky_decompose(MatrixType & A)
           {
-            for (size_t k=0; k<A.size2(); ++k)
+            for (vcl_size_t k=0; k<A.size2(); ++k)
             {
               if (A(k,k) <= 0)
               {
                 std::cout << "k: " << k << std::endl;
                 std::cout << "A(k,k): " << A(k,k) << std::endl;
               }
-              
+
               assert(A(k,k) > 0);
-              
+
               A(k,k) = std::sqrt(A(k,k));
-              
-              for (size_t i=k+1; i<A.size1(); ++i)
+
+              for (vcl_size_t i=k+1; i<A.size1(); ++i)
               {
                 A(i,k) /= A(k,k);
-                for (size_t j=k+1; j<=i; ++j)
+                for (vcl_size_t j=k+1; j<=i; ++j)
                   A(i,j) -= A(i,k) * A(j,k);
               }
             }
           }
-          
-          
+
+
           //
           // Compute x in Ax = b, where A is already Cholesky factored (A = L L^T)
           //
           template <typename MatrixType, typename VectorType>
           void cholesky_solve(MatrixType const & L, VectorType & b)
           {
-            typedef typename VectorType::value_type  ScalarType;
-            
             // inplace forward solve L x = b
-            for (size_t i=0; i<L.size1(); ++i)
+            for (vcl_size_t i=0; i<L.size1(); ++i)
             {
-              for (size_t j=0; j<i; ++j)
+              for (vcl_size_t j=0; j<i; ++j)
                 b[i] -= L(i,j) * b[j];
               b[i] /= L(i,i);
             }
-            
+
             // inplace backward solve L^T x = b:
-            for (size_t i=L.size1()-1; ; --i)
+            for (vcl_size_t i=L.size1()-1; ; --i)
             {
-              for (size_t k=i+1; k<L.size1(); ++k)
+              for (vcl_size_t k=i+1; k<L.size1(); ++k)
                 b[i] -= L(k,i) * b[k];
               b[i] /= L(i,i);
-              
-              if (i==0) //size_t might be unsigned, therefore manual check for equality with zero here
+
+              if (i==0) //vcl_size_t might be unsigned, therefore manual check for equality with zero here
                 break;
             }
           }
-          
-          
-          
+
+
+
           //
           // Compute the Cholesky factor L from the sparse vectors y_k
           //
           template <typename MatrixType, typename VectorType1>
-          void computeL(MatrixType const & A, 
+          void computeL(MatrixType const & A,
                         MatrixType & L,
                         MatrixType & L_trans,
                         std::vector<VectorType1> & Y,
-                        std::vector<std::vector<size_t> > & J)
+                        std::vector<std::vector<vcl_size_t> > & J)
           {
             typedef typename VectorType1::value_type    ScalarType;
             typedef std::vector<std::map<unsigned int, ScalarType> >     STLSparseMatrixType;
-            
+
             STLSparseMatrixType L_temp(A.size1());
-            
-            for (size_t k=0; k<A.size1(); ++k)
+
+            for (vcl_size_t k=0; k<A.size1(); ++k)
             {
-              std::vector<size_t> const & Jk = J[k];
+              std::vector<vcl_size_t> const & Jk = J[k];
               VectorType1 const & yk = Y[k];
-              
+
               //compute L(k,k):
               ScalarType Lkk = A(k,k);
-              for (size_t i=0; i<Jk.size(); ++i)
+              for (vcl_size_t i=0; i<Jk.size(); ++i)
                 Lkk -= A(Jk[i],k) * yk[i];
-              
-              Lkk = 1.0 / sqrt(Lkk);
-              L_temp[k][k] = Lkk;
+
+              Lkk = ScalarType(1) / std::sqrt(Lkk);
+              L_temp[k][static_cast<unsigned int>(k)] = Lkk;
               L_trans(k,k) = Lkk;
-              
+
               //write lower diagonal entries:
-              for (size_t i=0; i<Jk.size(); ++i)
+              for (vcl_size_t i=0; i<Jk.size(); ++i)
               {
-                L_temp[Jk[i]][k] = -Lkk * yk[i];
+                L_temp[Jk[i]][static_cast<unsigned int>(k)] = -Lkk * yk[i];
                 L_trans(k, Jk[i]) = -Lkk * yk[i];
               }
             } //for k
-            
-            
+
+
             //build L from L_temp
-            for (size_t i=0; i<L_temp.size(); ++i)
+            for (vcl_size_t i=0; i<L_temp.size(); ++i)
               for (typename std::map<unsigned int, ScalarType>::const_iterator it = L_temp[i].begin();
                   it != L_temp[i].end();
                 ++it)
                   L(i, it->first) = it->second;
           }
-          
+
 
           //
           // Top level FSPAI function
           //
           template <typename MatrixType>
-          void computeFSPAI(MatrixType const & A, 
+          void computeFSPAI(MatrixType const & A,
                             MatrixType const & PatternA,
-                            MatrixType & L, 
-                            MatrixType & L_trans, 
-                            fspai_tag const & tag)
+                            MatrixType & L,
+                            MatrixType & L_trans,
+                            fspai_tag)
           {
             typedef typename MatrixType::value_type              ScalarType;
             typedef boost::numeric::ublas::matrix<ScalarType>    DenseMatrixType;
             typedef std::vector<std::map<unsigned int, ScalarType> >     SparseMatrixType;
-            
+
             //
             // preprocessing: Store A in a STL container:
             //
@@ -331,13 +330,13 @@ namespace viennacl
             std::vector<std::vector<ScalarType> >    y_k(A.size1());
             SparseMatrixType   STL_A(A.size1());
             sym_sparse_matrix_to_stl(A, STL_A);
-            
-            
+
+
             //
             // Step 1: Generate pattern indices
             //
             //std::cout << "computeFSPAI(): Generating pattern..." << std::endl;
-            std::vector<std::vector<size_t> > J(A.size1());
+            std::vector<std::vector<vcl_size_t> > J(A.size1());
             generateJ(PatternA, J);
 
             //
@@ -347,42 +346,42 @@ namespace viennacl
             std::vector<DenseMatrixType>  subblocks_A(A.size1());
             fill_blocks(STL_A, subblocks_A, J, y_k);
             STL_A.clear(); //not needed anymore
-            
+
             //
             // Step 3: Cholesky-factor blocks
             //
             //std::cout << "computeFSPAI(): Cholesky-factorization..." << std::endl;
-            for (size_t i=0; i<subblocks_A.size(); ++i)
+            for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
             {
               //std::cout << "Block before: " << subblocks_A[i] << std::endl;
               cholesky_decompose(subblocks_A[i]);
               //std::cout << "Block after: " << subblocks_A[i] << std::endl;
             }
-            
-            
-            /*size_t num_bytes = 0;
-            for (size_t i=0; i<subblocks_A.size(); ++i)
+
+
+            /*vcl_size_t num_bytes = 0;
+            for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
               num_bytes += 8*subblocks_A[i].size1()*subblocks_A[i].size2();*/
             //std::cout << "Memory for FSPAI matrix: " << num_bytes / (1024.0 * 1024.0) << " MB" << std::endl;
-            
+
             //
             // Step 4: Solve for y_k
             //
             //std::cout << "computeFSPAI(): Cholesky-solve..." << std::endl;
-            for (size_t i=0; i<y_k.size(); ++i)
+            for (vcl_size_t i=0; i<y_k.size(); ++i)
             {
               if (subblocks_A[i].size1() > 0) //block might be empty...
               {
                 //y_k[i].resize(subblocks_A[i].size1());
                 //std::cout << "y_k[" << i << "]: ";
-                //for (size_t j=0; j<y_k[i].size(); ++j)
+                //for (vcl_size_t j=0; j<y_k[i].size(); ++j)
                 //  std::cout << y_k[i][j] << " ";
                 //std::cout << std::endl;
                 cholesky_solve(subblocks_A[i], y_k[i]);
               }
             }
-            
-            
+
+
             //
             // Step 5: Set up Cholesky factors L and L_trans
             //
@@ -392,12 +391,12 @@ namespace viennacl
             L_trans.resize(A.size1(), A.size2(), false);
             L_trans.reserve(A.nnz(), false);
             computeL(A, L, L_trans, y_k, J);
-            
+
             //std::cout << "L: " << L << std::endl;
           }
-          
-          
-          
+
+
+
         }
       }
     }
diff --git a/viennacl/linalg/detail/spai/qr.hpp b/viennacl/linalg/detail/spai/qr.hpp
index bedd135..4ec3643 100644
--- a/viennacl/linalg/detail/spai/qr.hpp
+++ b/viennacl/linalg/detail/spai/qr.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,7 +20,7 @@
 
 /** @file viennacl/linalg/detail/spai/qr.hpp
     @brief Implementation of a simultaneous QR factorization of multiple matrices. Experimental.
-    
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -40,15 +41,13 @@
 #include "boost/numeric/ublas/io.hpp"
 #include "boost/numeric/ublas/matrix_expression.hpp"
 #include "boost/numeric/ublas/detail/matrix_assign.hpp"
-//#include "boost/thread/thread.hpp"
 
 #include "viennacl/vector.hpp"
 #include "viennacl/matrix.hpp"
 
 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
 #include "viennacl/linalg/detail/spai/block_vector.hpp"
-#include "viennacl/linalg/kernels/spai_source.h"
-#include "viennacl/linalg/kernels/spai_kernels.h"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
 
 namespace viennacl
 {
@@ -58,9 +57,7 @@ namespace viennacl
       {
         namespace spai
         {
-        
-          
-          
+
           //********** DEBUG FUNCTIONS *****************//
           template< typename T, typename InputIterator>
           void Print(std::ostream& ostr, InputIterator it_begin, InputIterator it_end){
@@ -69,23 +66,23 @@ namespace viennacl
               std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str()));
               ostr<<std::endl;
           }
-          
+
           template<typename VectorType, typename MatrixType>
           void write_to_block(VectorType& con_A_I_J, unsigned int start_ind,  const std::vector<unsigned int>& I, const std::vector<unsigned int>& J, MatrixType& m){
               m.resize(I.size(), J.size(), false);
-              for(size_t i = 0; i < J.size(); ++i){
-                  for(size_t j = 0; j < I.size(); ++j){
+              for(vcl_size_t i = 0; i < J.size(); ++i){
+                  for(vcl_size_t j = 0; j < I.size(); ++j){
                       m(j,i) = con_A_I_J[start_ind + i*I.size() + j];
                   }
               }
           }
-          
+
           template<typename VectorType>
           void print_continious_matrix(VectorType& con_A_I_J, std::vector<cl_uint>& blocks_ind,
                                       const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J){
               typedef typename VectorType::value_type ScalarType;
               std::vector<boost::numeric::ublas::matrix<ScalarType> > com_A_I_J(g_I.size());
-              for(size_t i = 0; i < g_I.size(); ++i){
+              for(vcl_size_t i = 0; i < g_I.size(); ++i){
                   write_to_block( con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]);
                   std::cout<<com_A_I_J[i]<<std::endl;
               }
@@ -95,65 +92,68 @@ namespace viennacl
               typedef typename VectorType::value_type ScalarType;
               std::vector<boost::numeric::ublas::vector<ScalarType> > com_v(g_J.size());
               //Print<ScalarType>(std::cout, con_v.begin(), con_v.end());
-              for(size_t i = 0; i < g_J.size(); ++i){
+              for(vcl_size_t i = 0; i < g_J.size(); ++i){
                   com_v[i].resize(g_J[i].size());
-                  for(size_t j = 0; j < g_J[i].size(); ++j){
+                  for(vcl_size_t j = 0; j < g_J[i].size(); ++j){
                       com_v[i](j) = con_v[block_ind[i] + j];
                   }
                   std::cout<<com_v[i]<<std::endl;
               }
           }
-          
+
           ///**************************************** BLOCK FUNCTIONS ************************************//
           /** @brief Computes size of elements, start indices and matrix dimensions for a certain block
-          * @param g_I container of row indices 
-          * @param g_J container of column indices 
+          * @param g_I container of row indices
+          * @param g_J container of column indices
           * @param sz general size for all elements in a certain block
           * @param blocks_ind start indices in a certain
           * @param matrix_dims matrix dimensions for each block
-          */ 
-          void compute_blocks_size(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J, 
-                                  unsigned int& sz, std::vector<cl_uint>& blocks_ind, std::vector<cl_uint>& matrix_dims){
+          */
+          inline void compute_blocks_size(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J,
+                                          unsigned int& sz, std::vector<cl_uint>& blocks_ind, std::vector<cl_uint>& matrix_dims)
+          {
               sz = 0;
-              for(size_t i = 0; i < g_I.size(); ++i){
+              for(vcl_size_t i = 0; i < g_I.size(); ++i){
                   sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size());
                   matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size());
                   matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size());
                   blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size());
-                  
+
               }
           }
-          /** @brief Computes size of particular container of index set 
-          * @param inds container of index sets 
-          * @param size output size 
-          */ 
-          void get_size(const std::vector<std::vector<unsigned int> >& inds, unsigned int& size){
+          /** @brief Computes size of particular container of index set
+          * @param inds container of index sets
+          * @param size output size
+          */
+          template <typename SizeType>
+          void get_size(const std::vector<std::vector<SizeType> >& inds, SizeType & size){
               size = 0;
-              for (size_t i = 0; i < inds.size(); ++i) {
+              for (vcl_size_t i = 0; i < inds.size(); ++i) {
                   size += static_cast<unsigned int>(inds[i].size());
               }
           }
-          
-          /** @brief Initializes start indices of particular index set 
-          * @param inds container of index sets 
-          * @param start_inds output index set 
+
+          /** @brief Initializes start indices of particular index set
+          * @param inds container of index sets
+          * @param start_inds output index set
           */
-          void init_start_inds(const std::vector<std::vector<unsigned int> >& inds, std::vector<cl_uint>& start_inds){
-              for(size_t i = 0; i < inds.size(); ++i){
+          template <typename SizeType>
+          void init_start_inds(const std::vector<std::vector<SizeType> >& inds, std::vector<cl_uint>& start_inds){
+              for(vcl_size_t i = 0; i < inds.size(); ++i){
                   start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size());
               }
           }
 
           //*************************************  QR FUNCTIONS  ***************************************//
-          /** @brief Dot prod of particular column of martix A with it's self starting at a certain index beg_ind 
-          * @param A init matrix 
+          /** @brief Dot prod of particular column of martix A with it's self starting at a certain index beg_ind
+          * @param A init matrix
           * @param beg_ind starting index
           * @param res result of dot product
           */
           template<typename MatrixType, typename ScalarType>
           void dot_prod(const MatrixType& A,  unsigned int beg_ind, ScalarType& res){
               res = static_cast<ScalarType>(0);
-              for(size_t i = beg_ind; i < A.size1(); ++i){
+              for(vcl_size_t i = beg_ind; i < A.size1(); ++i){
                   res += A(i, beg_ind-1)*A(i, beg_ind-1);
               }
           }
@@ -168,11 +168,11 @@ namespace viennacl
           void custom_inner_prod(const MatrixType& A, const VectorType& v, unsigned int col_ind, unsigned int start_ind, ScalarType& res){
               res = static_cast<ScalarType>(0);
               for(unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i){
-                  res += A(i, col_ind)*v(i);  
+                  res += A(i, col_ind)*v(i);
               }
           }
-          
-          /** @brief Copying part of matrix column 
+
+          /** @brief Copying part of matrix column
           * @param A init matrix
           * @param v output vector
           * @param beg_ind start index for copying
@@ -183,19 +183,20 @@ namespace viennacl
                   v(i) = A( i, beg_ind-1);
               }
           }
-          
+
           //householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210
           /** @brief Coputation of Householder vector, householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210
           * @param A init matrix
-          * @param j start index for computations 
-          * @param v output Householder vector 
+          * @param j start index for computations
+          * @param v output Householder vector
           * @param b beta
           */
           template<typename MatrixType, typename VectorType, typename ScalarType>
-          void householder_vector(const MatrixType& A, unsigned int j, VectorType& v, ScalarType& b){
+          void householder_vector(const MatrixType& A, unsigned int j, VectorType& v, ScalarType& b)
+          {
               ScalarType sg;
               //
-              dot_prod(A, j+1, sg); 
+              dot_prod(A, j+1, sg);
               copy_vector(A, v, j+1);
               ScalarType mu;
               v(j) = static_cast<ScalarType>(1.0);
@@ -216,11 +217,12 @@ namespace viennacl
           /** @brief Inplace application of Householder vector to a matrix A
           * @param A init matrix
           * @param iter_cnt current iteration
-          * @param v Householder vector 
+          * @param v Householder vector
           * @param b beta
           */
           template<typename MatrixType, typename VectorType, typename ScalarType>
-          void apply_householder_reflection(MatrixType& A, unsigned int iter_cnt, VectorType& v, ScalarType b){
+          void apply_householder_reflection(MatrixType& A, unsigned int iter_cnt, VectorType& v, ScalarType b)
+          {
               //update every column of matrix A
               ScalarType in_prod_res;
               for(unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i){
@@ -231,27 +233,29 @@ namespace viennacl
                   }
               }
           }
-          
+
           /** @brief Storage of vector v in column(A, ind), starting from ind-1 index of a column
           * @param A init matrix
           * @param ind index of a column
           * @param v vector that should be stored
           */
           template<typename MatrixType, typename VectorType>
-          void store_householder_vector(MatrixType& A, unsigned int ind, VectorType& v){
+          void store_householder_vector(MatrixType& A, unsigned int ind, VectorType& v)
+          {
               for(unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i){
                   A(i, ind-1) = v(i);
               }
           }
-          
-          
-          //QR algorithm 
+
+
+          //QR algorithm
           /** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224
-          * @param R input matrix 
+          * @param R input matrix
           * @param b_v vector of betas
           */
           template<typename MatrixType, typename VectorType>
-          void single_qr(MatrixType& R, VectorType& b_v){
+          void single_qr(MatrixType& R, VectorType& b_v)
+          {
               typedef typename MatrixType::value_type ScalarType;
               if((R.size1() > 0) && (R.size2() > 0)){
                   VectorType v = (VectorType)boost::numeric::ublas::zero_vector<ScalarType>(R.size1());
@@ -263,47 +267,50 @@ namespace viennacl
                   }
               }
           }
-          
+
           //********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************//
-          /** @brief Reading from text file into string
+          /* * @brief Reading from text file into string
           * @param file_name file name
           * @param kernel_source string that contains file
-          */
+
           void read_kernel_from_file(std::string& file_name, std::string& kernel_source){
               std::ifstream ifs(file_name.c_str(), std::ifstream::in);
-              
+
               if (!ifs)
                 std::cerr << "WARNING: Cannot open file " << file_name << std::endl;
-              
+
               std::string line;
               std::ostringstream ost;
               while (std::getline(ifs, line)) {
                   ost<<line<<std::endl;
               }
               kernel_source = ost.str();
-          }
-          
+          }*/
+
           /** @brief Getting max size of rows/columns from container of index set
           * @param inds container of index set
           * @param max_size max size that corresponds to that container
           */
-          void get_max_block_size(const std::vector<std::vector<unsigned int> >& inds, unsigned int& max_size){
+          template <typename SizeType>
+          void get_max_block_size(const std::vector<std::vector<SizeType> >& inds, SizeType & max_size)
+          {
               max_size = 0;
-              for(unsigned int i = 0; i < inds.size(); ++i){
+              for(vcl_size_t i = 0; i < inds.size(); ++i){
                   if(inds[i].size() > max_size){
-                      max_size = static_cast<unsigned int>(inds[i].size());
+                      max_size = static_cast<SizeType>(inds[i].size());
                   }
               }
           }
-          
-          /** @brief Dot_prod(column(A, ind), v) starting from index ind+1 
+
+          /** @brief Dot_prod(column(A, ind), v) starting from index ind+1
           * @param A input matrix
           * @param v input vector
           * @param ind index
           * @param res result value
           */
           template<typename MatrixType, typename VectorType, typename ScalarType>
-          void custom_dot_prod(const MatrixType& A, const VectorType& v, unsigned int ind, ScalarType& res){
+          void custom_dot_prod(const MatrixType& A, const VectorType& v, unsigned int ind, ScalarType& res)
+          {
               res = static_cast<ScalarType>(0);
               for(unsigned int j = ind; j < A.size1(); ++j){
                   if(j == ind){
@@ -313,19 +320,20 @@ namespace viennacl
                   }
               }
           }
-          
+
           /** @brief Recovery Q from matrix R and vector of betas b_v
           * @param R input matrix
           * @param b_v vector of betas
           * @param y output vector
           */
           template<typename MatrixType, typename VectorType>
-          void apply_q_trans_vec(const MatrixType& R, const VectorType& b_v, VectorType& y){
+          void apply_q_trans_vec(const MatrixType& R, const VectorType& b_v, VectorType& y)
+          {
               typedef typename MatrixType::value_type ScalarType;
               ScalarType inn_prod = static_cast<ScalarType>(0);
-              for(size_t i = 0; i < R.size2(); ++i){
+              for(vcl_size_t i = 0; i < R.size2(); ++i){
                   custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod);
-                  for(size_t j = i; j < R.size1(); ++j){
+                  for(vcl_size_t j = i; j < R.size1(); ++j){
                       if(i == j){
                           y(j) -= b_v(i)*inn_prod;
                       }
@@ -335,104 +343,100 @@ namespace viennacl
                   }
               }
           }
-          
+
           /** @brief Multiplication of Q'*A, where Q is in implicit for lower part of R and vector of betas - b_v
           * @param R input matrix
           * @param b_v vector of betas
           * @param A output matrix
           */
           template<typename MatrixType, typename VectorType>
-          void apply_q_trans_mat(const MatrixType& R, const VectorType& b_v, MatrixType& A){
+          void apply_q_trans_mat(const MatrixType& R, const VectorType& b_v, MatrixType& A)
+          {
               VectorType tmp_v;
-              for(size_t i = 0; i < A.size2(); ++i){
+              for(vcl_size_t i = 0; i < A.size2(); ++i){
                   tmp_v = (VectorType)column(A,i);
                   apply_q_trans_vec(R, b_v, tmp_v);
                   column(A,i) = tmp_v;
               }
           }
-          
+
           //parallel QR for GPU
-          /** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224 performed
-                      on GPU
-          * @param g_I container of row indices
-          * @param g_J container of column indices
+          /** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224 performed on GPU
+          *
+          * @param g_I         container of row indices
+          * @param g_J         container of column indices
           * @param g_A_I_J_vcl contigious matrices, GPU memory is used
-          * @param g_bv_vcl contigios vectors beta, GPU memory is used 
-          * @param g_is_update container of indicators that show active blocks  
-          * @param cur_iter current iteration
+          * @param g_bv_vcl    contigiuos vectors beta, GPU memory is used
+          * @param g_is_update container of indicators that show active blocks
+          * @param ctx         Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
           */
           template<typename ScalarType>
-          void block_qr(std::vector<std::vector<unsigned int> >& g_I, 
-                        std::vector<std::vector<unsigned int> >& g_J, 
+          void block_qr(std::vector<std::vector<unsigned int> >& g_I,
+                        std::vector<std::vector<unsigned int> >& g_J,
                         block_matrix& g_A_I_J_vcl,
                         block_vector& g_bv_vcl,
                         std::vector<cl_uint>& g_is_update,
-                        const unsigned int cur_iter){
-              //typedef typename MatrixType::value_type ScalarType;
-              unsigned int bv_size;
-              unsigned int v_size;
-              //set up arguments for GPU
-              //find maximum size of rows/columns
-              unsigned int local_r_n, local_c_n;
-              //find max size for blocks
-              get_max_block_size(g_I, local_r_n);
-              get_max_block_size(g_J, local_c_n);
-              //get size
-              get_size(g_J, bv_size);
-              get_size(g_I, v_size);
-              //get start indices
-              std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0);
-              std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0);
-              init_start_inds(g_J, start_bv_inds);
-              init_start_inds(g_I, start_v_inds);
-              //init arrays
-              std::vector<ScalarType> b_v(bv_size, static_cast<ScalarType>(0));
-              std::vector<ScalarType> v(v_size, static_cast<ScalarType>(0));
-              //call qr program
-              block_vector v_vcl;
-              /*if(cur_iter == 0)
-              {
-                  //if first run - compile the program
-                  std::string qr_kernel_file_name = "kernels/spai/qr3_a_n.cl";
-                  std::string qr_kernel_source;
-                  read_kernel_from_file(qr_kernel_file_name, qr_kernel_source);
-                  viennacl::ocl::program & qr_prog = viennacl::ocl::current_context().add_program(qr_kernel_source.c_str(), "qr_kernel_source");
-                  qr_prog.add_kernel("block_qr");
-                  //
-              }*/
-              
-              g_bv_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,  
-                                                                      static_cast<unsigned int>(sizeof(ScalarType)*bv_size), 
-                                                                      &(b_v[0]));
-              
-              v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,  
-                                                                                                  static_cast<unsigned int>(sizeof(ScalarType)*v_size), 
-                                                                                                  &(v[0]));
-              //the same as j_start_inds
-              g_bv_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), 
-                                                                                &(start_bv_inds[0]));
-              
-              v_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), 
-                                                                                &(start_v_inds[0]));
-              viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
-                                                                                &(g_is_update[0]));
-              //local memory
-              //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));
-              viennacl::ocl::kernel& qr_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr");
-              qr_kernel.local_work_size(0, local_c_n);
-              qr_kernel.global_work_size(0, 256);
-              viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(), 
-                                              v_vcl.handle(), g_A_I_J_vcl.handle2(), 
-                                              g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl,
-                                              viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
-                                              static_cast<cl_uint>(g_I.size())));
-              
+                        viennacl::context ctx)
+          {
+            viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+
+            //typedef typename MatrixType::value_type ScalarType;
+            unsigned int bv_size = 0;
+            unsigned int v_size = 0;
+            //set up arguments for GPU
+            //find maximum size of rows/columns
+            unsigned int local_r_n = 0;
+            unsigned int local_c_n = 0;
+            //find max size for blocks
+            get_max_block_size(g_I, local_r_n);
+            get_max_block_size(g_J, local_c_n);
+            //get size
+            get_size(g_J, bv_size);
+            get_size(g_I, v_size);
+            //get start indices
+            std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0);
+            std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0);
+            init_start_inds(g_J, start_bv_inds);
+            init_start_inds(g_I, start_v_inds);
+            //init arrays
+            std::vector<ScalarType> b_v(bv_size, static_cast<ScalarType>(0));
+            std::vector<ScalarType> v(v_size, static_cast<ScalarType>(0));
+            //call qr program
+            block_vector v_vcl;
+
+            g_bv_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                         static_cast<unsigned int>(sizeof(ScalarType)*bv_size),
+                                                         &(b_v[0]));
+
+            v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                      static_cast<unsigned int>(sizeof(ScalarType)*v_size),
+                                                      &(v[0]));
+            //the same as j_start_inds
+            g_bv_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                          static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
+                                                          &(start_bv_inds[0]));
+
+            v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                       static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
+                                                       &(start_v_inds[0]));
+            viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+                                                                                     &(g_is_update[0]));
+            //local memory
+            //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));
+            viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
+            viennacl::ocl::kernel& qr_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr");
+            qr_kernel.local_work_size(0, local_c_n);
+            qr_kernel.global_work_size(0, local_c_n*256);
+            viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(),
+                                            v_vcl.handle(), g_A_I_J_vcl.handle2(),
+                                            g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl,
+                                            viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
+                                            static_cast<cl_uint>(g_I.size())));
+
           }
         }
       }
     }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/detail/spai/small_matrix.hpp b/viennacl/linalg/detail/spai/small_matrix.hpp
index 20ff736..436c24f 100644
--- a/viennacl/linalg/detail/spai/small_matrix.hpp
+++ b/viennacl/linalg/detail/spai/small_matrix.hpp
@@ -2,24 +2,25 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/detail/spai/small_matrix.hpp
-    @brief Implementation of a routines for small matrices (helper for SPAI). Experimental in 1.2.x.
-    
+    @brief Implementation of a routines for small matrices (helper for SPAI). Experimental.
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -57,17 +58,17 @@ namespace viennacl
           // Constructs an orthonormal sparse matrix M (with M^T M = Id). Is composed of elementary 2x2 rotation matrices with suitable renumbering.
           //
           template <typename MatrixType>
-          void make_rotation_matrix(MatrixType & mat, size_t new_size, size_t off_diagonal_distance = 4)
+          void make_rotation_matrix(MatrixType & mat, vcl_size_t new_size, vcl_size_t off_diagonal_distance = 4)
           {
             mat.resize(new_size, new_size, false);
             mat.clear();
 
-            double val = 1 / sqrt(2.0);
+            double val = 1.0 / std::sqrt(2.0);
 
-            for (size_t i=0; i<new_size; ++i)
+            for (vcl_size_t i=0; i<new_size; ++i)
               mat(i,i) = val;
 
-            for (size_t i=off_diagonal_distance; i<new_size; ++i)
+            for (vcl_size_t i=off_diagonal_distance; i<new_size; ++i)
             {
               mat(i-off_diagonal_distance, i) = val; mat(i, i-off_diagonal_distance) = -val;
             }
@@ -80,30 +81,30 @@ namespace viennacl
           double determinant(boost::numeric::ublas::matrix_expression<MatrixType> const& mat_r)
           {
               double det = 1.0;
-              
+
               MatrixType mLu(mat_r() );
-              boost::numeric::ublas::permutation_matrix<std::size_t> pivots(mat_r().size1() );
-              
+              boost::numeric::ublas::permutation_matrix<vcl_size_t> pivots(mat_r().size1() );
+
               int is_singular = static_cast<int>(lu_factorize(mLu, pivots));
-              
+
               if (!is_singular)
               {
-                  for (std::size_t i=0; i < pivots.size(); ++i)
+                  for (vcl_size_t i=0; i < pivots.size(); ++i)
                   {
                       if (pivots(i) != i)
                           det *= -1.0;
-                      
+
                       det *= mLu(i,i);
                   }
               }
               else
                   det = 0.0;
-              
+
               return det;
-          } 
-          
+          }
+
         }
       }
     }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/detail/spai/spai-dynamic.hpp b/viennacl/linalg/detail/spai/spai-dynamic.hpp
index a5e8063..9267179 100644
--- a/viennacl/linalg/detail/spai/spai-dynamic.hpp
+++ b/viennacl/linalg/detail/spai/spai-dynamic.hpp
@@ -2,24 +2,25 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/detail/spai/spai-dynamic.hpp
-    @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates Experimental in 1.2.x.
-    
+    @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates Experimental.
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -47,7 +48,7 @@
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/compressed_matrix_operations.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
 #include "viennacl/linalg/matrix_operations.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/linalg/cg.hpp"
@@ -58,9 +59,10 @@
 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
 #include "viennacl/linalg/detail/spai/block_vector.hpp"
 #include "viennacl/linalg/detail/spai/qr.hpp"
+#include "viennacl/linalg/detail/spai/spai-static.hpp"
+#include "viennacl/linalg/detail/spai/spai.hpp"
 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
-#include "viennacl/linalg/kernels/spai_source.h"
-#include "viennacl/linalg/kernels/spai_kernels.h"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
 
 namespace viennacl
 {
@@ -70,200 +72,186 @@ namespace viennacl
       {
         namespace spai
         {
-        
-          typedef std::pair<unsigned int, double> PairT;
-          struct CompareSecond{
-              bool operator()(const PairT& left, const PairT& right)
-              {
-                  return static_cast<double>(left.second) > static_cast<double>(right.second);
-              }
+
+          /** @brief Helper functor for comparing std::pair<> based on the second member. */
+          struct CompareSecond
+          {
+            template <typename T1, typename T2>
+            bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
+            {
+              return static_cast<double>(left.second) > static_cast<double>(right.second);
+            }
           };
-          
-          
-          /** @brief Initializes Dense matrix from sparse one
-          * @param A_in Riginal sparse matrix
-          * @param J Set of column indices
-          * @param I Set of row indices 
-          * @param A_out dense matrix output
-          */
-          template<typename SparseMatrixType, typename DenseMatrixType>
-          void initProjectSubMatrix(const SparseMatrixType& A_in, const std::vector<unsigned int>& J, std::vector<unsigned int>& I,
-                                    DenseMatrixType& A_out){
-              typedef typename DenseMatrixType::value_type ScalarType;
-              A_out.resize(I.size(), J.size(), false);
-              for(size_t j = 0; j < J.size(); ++j){
-                  for(size_t i = 0; i < I.size(); ++i){
-                      A_out(i,j) = A_in(I[i],J[j]);
-                  }
-              }
-          }
-          
-          /** @brief Determines if element ind is in set {J}
-          * @param J current set
-          * @param ind current element
-          */
-          bool isInIndexSet(const std::vector<unsigned int>& J, const unsigned int& ind){
-              return (std::find(J.begin(), J.end(), ind) != J.end());
-          }
-          
+
+
           /** @brief Composition of new matrix R, that is going to be used in Least Square problem solving
           * @param A matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices
           * @param R_n matrix A_Iu_J_u after QR factorization
-          * @param R previously composed matrix R 
+          * @param R previously composed matrix R
           */
           template<typename MatrixType>
-          void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R){
-              typedef typename MatrixType::value_type ScalarType;
-              size_t row_n = R_n.size1() - (A.size1() - R.size2()); 
-              MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(R.size1() + row_n, R.size2() + A.size2());
-              //write original R to new Composite R
-              boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
-              //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2()
-              boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(), 
-                                                                                                                        R.size2() + A.size2())) += 
-              boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
-              //adding decomposed(QR) block to Composite R
-              if(R_n.size1() > 0 && R_n.size2() > 0)
-                  boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
-                                                boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
-              R = C;
+          void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R)
+          {
+            typedef typename MatrixType::value_type ScalarType;
+            vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
+            MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(R.size1() + row_n, R.size2() + A.size2());
+            //write original R to new Composite R
+            boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
+            //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2()
+            boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(),
+                                                                                                                      R.size2() + A.size2())) +=
+            boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
+            //adding decomposed(QR) block to Composite R
+            if(R_n.size1() > 0 && R_n.size2() > 0)
+                boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
+                                              boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
+            R = C;
           }
-          
+
           /** @brief Composition of new vector of coefficients beta from QR factorizations(necessary for Q recovery)
           * @param v_n new vector from last QR factorization
           * @param v composition of previous vectors from QR factorizations
           */
           template<typename VectorType>
-          void composeNewVector(const VectorType& v_n, VectorType& v){
-              typedef typename VectorType::value_type ScalarType;
-              VectorType w  = boost::numeric::ublas::zero_vector<ScalarType>(v.size() + v_n.size());
-              boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
-              boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
-              v = w;
+          void composeNewVector(const VectorType& v_n, VectorType& v)
+          {
+            typedef typename VectorType::value_type ScalarType;
+            VectorType w  = boost::numeric::ublas::zero_vector<ScalarType>(v.size() + v_n.size());
+            boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
+            boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
+            v = w;
           }
-          
+
           /** @brief Computation of Euclidean norm for sparse vector
           * @param v initial sparse vector
           * @param norm scalar that represents Euclidean norm
           */
           template<typename SparseVectorType, typename ScalarType>
-          void sparse_norm_2(const SparseVectorType& v, ScalarType& norm){
-              for(typename SparseVectorType::const_iterator vec_it  = v.begin(); vec_it != v.end(); ++vec_it){
-                  norm += (vec_it->second)*(vec_it->second);
-              }
-              norm = std::sqrt(norm);
+          void sparse_norm_2(const SparseVectorType& v, ScalarType& norm)
+          {
+            for(typename SparseVectorType::const_iterator vec_it  = v.begin(); vec_it != v.end(); ++vec_it)
+              norm += (vec_it->second)*(vec_it->second);
+
+            norm = std::sqrt(norm);
           }
-          
+
           /** @brief Dot product of two sparse vectors
           * @param v1 initial sparse vector
           * @param v2 initial sparse vector
           * @param res_v scalar that represents dot product result
           */
           template<typename SparseVectorType, typename ScalarType>
-          void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v){
-              typename SparseVectorType::const_iterator v_it1 = v1.begin();
-              typename SparseVectorType::const_iterator v_it2 = v2.begin();
-              while((v_it1 != v1.end())&&(v_it2 != v2.end())){
-                  if(v_it1->first == v_it2->first){
-                      res_v += (v_it1->second)*(v_it2->second);
-                      ++v_it1;
-                      ++v_it2;
-                  }
-                  else if(v_it1->first < v_it2->first){
-                      ++v_it1;
-                  }
-                  else 
-                      ++v_it2;
-                      
-                  
+          void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v)
+          {
+            typename SparseVectorType::const_iterator v_it1 = v1.begin();
+            typename SparseVectorType::const_iterator v_it2 = v2.begin();
+            while((v_it1 != v1.end())&&(v_it2 != v2.end()))
+            {
+              if(v_it1->first == v_it2->first)
+              {
+                res_v += (v_it1->second)*(v_it2->second);
+                ++v_it1;
+                ++v_it2;
               }
+              else if(v_it1->first < v_it2->first)
+                ++v_it1;
+              else
+                ++v_it2;
+            }
           }
-          
+
           /** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31
-          * @param A_v_c vectorized column-wise initial matrix 
-          * @param res residual vector 
+          * @param A_v_c vectorized column-wise initial matrix
+          * @param res residual vector
           * @param J set of column indices
-          * @param J_u set of new column indices 
+          * @param J_u set of new column indices
           * @param tag SPAI tag with parameters
           */
           template <typename SparseVectorType, typename ScalarType>
-          bool buildAugmentedIndexSet(const std::vector<SparseVectorType>& A_v_c, 
+          bool buildAugmentedIndexSet(const std::vector<SparseVectorType>& A_v_c,
                                       const SparseVectorType& res,
                                       std::vector<unsigned int>& J,
                                       std::vector<unsigned int>& J_u,
-                                      const spai_tag& tag){
-              std::vector<std::pair<unsigned int, ScalarType> > p;
-              size_t cur_size = 0;
-              ScalarType inprod, norm2;
-              //print_sparse_vector(res);
-              for(typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it){
-                  if(!isInIndexSet(J, res_it->first) && (std::abs(res_it->second) > tag.getResidualThreshold())){
-                      inprod = norm2 = 0;
-                      sparse_inner_prod(res, A_v_c[res_it->first], inprod);
-                      sparse_norm_2(A_v_c[res_it->first], norm2);
-                      p.push_back(std::pair<size_t, ScalarType>(res_it->first, (inprod*inprod)/(norm2*norm2)));
-                  }
-              }
-              
-              std::sort(p.begin(), p.end(), CompareSecond());
-              while ((cur_size < J.size())&&(p.size() > 0)) {
-                  J_u.push_back(p[0].first);
-                  p.erase(p.begin());
-                  cur_size++;
+                                      const spai_tag& tag)
+          {
+            std::vector<std::pair<unsigned int, ScalarType> > p;
+            vcl_size_t cur_size = 0;
+            ScalarType inprod, norm2;
+            //print_sparse_vector(res);
+            for(typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
+            {
+              if(!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
+              {
+                inprod = norm2 = 0;
+                sparse_inner_prod(res, A_v_c[res_it->first], inprod);
+                sparse_norm_2(A_v_c[res_it->first], norm2);
+                p.push_back(std::pair<unsigned int, ScalarType>(res_it->first, (inprod*inprod)/(norm2*norm2)));
               }
-              p.clear();
-              return (cur_size > 0);
+            }
+
+            std::sort(p.begin(), p.end(), CompareSecond());
+            while ((cur_size < J.size())&&(p.size() > 0))
+            {
+              J_u.push_back(p[0].first);
+              p.erase(p.begin());
+              cur_size++;
+            }
+            p.clear();
+            return (cur_size > 0);
           }
-          
+
           /** @brief Building a new indices to current set of row indices I_n, cf. Kallischko dissertation p.32
           * @param A_v_c vectorized column-wise initial matrix
           * @param I set of previous determined row indices
           * @param J_n set of new column indices
-          * @param I_n set of new indices 
+          * @param I_n set of new indices
           */
           template<typename SparseVectorType>
-          void buildNewRowSet(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& I, 
-                              const std::vector<unsigned int>& J_n, std::vector<unsigned int>& I_n){
-              for(size_t i = 0; i < J_n.size(); ++i){
-                  for(typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it){
-                      if(!isInIndexSet(I, col_it->first)&&!isInIndexSet(I_n, col_it->first)){
-                          I_n.push_back(col_it->first);
-                      }
-                  }
+          void buildNewRowSet(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& I,
+                              const std::vector<unsigned int>& J_n, std::vector<unsigned int>& I_n)
+          {
+            for(vcl_size_t i = 0; i < J_n.size(); ++i)
+            {
+              for(typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it)
+              {
+                if(!isInIndexSet(I, col_it->first)&&!isInIndexSet(I_n, col_it->first))
+                  I_n.push_back(col_it->first);
               }
+            }
           }
-          
+
           /** @brief Composition of new block for QR factorization cf. Kallischko dissertation p.82, figure 4.7
           * @param A_I_J previously composed block
           * @param A_I_J_u matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices
           * @param A_I_u_J_u is composition of lower part A(I, \\tilde J) and  A(\\tilde I, \\tilde J) - new block for QR decomposition
           */
           template<typename MatrixType>
-          void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u){
-              typedef typename MatrixType::value_type ScalarType;
-              size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
-              size_t row_n2 = A_I_u_J_u.size1();
-              size_t row_n = row_n1 + row_n2;
-              size_t col_n = A_I_J_u.size2();
-              MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(row_n, col_n);
-              boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1), boost::numeric::ublas::range(0, col_n)) += 
-              boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
-                                            boost::numeric::ublas::range(0, col_n));
-                                            
-              boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
-                                            boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
-              A_I_u_J_u = C;
+          void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u)
+          {
+            typedef typename MatrixType::value_type ScalarType;
+            vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
+            vcl_size_t row_n2 = A_I_u_J_u.size1();
+            vcl_size_t row_n = row_n1 + row_n2;
+            vcl_size_t col_n = A_I_J_u.size2();
+            MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(row_n, col_n);
+            boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1), boost::numeric::ublas::range(0, col_n)) +=
+            boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
+                                          boost::numeric::ublas::range(0, col_n));
+
+            boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
+                                          boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
+            A_I_u_J_u = C;
           }
-          
+
           /** @brief CPU-based dynamic update for SPAI preconditioner
           * @param A initial sparse matrix
           * @param A_v_c vectorized column-wise initial matrix
           * @param g_res container of residuals for all columns
-          * @param g_is_update container with identificators that shows which block should be modified 
+          * @param g_is_update container with identificators that shows which block should be modified
           * @param g_I container of row index sets for all columns
           * @param g_J container of column index sets for all columns
           * @param g_b_v container of vectors of beta for Q recovery(cf. Golub Van Loan "Matrix Computations", 3rd edition p.211)
-          * @param g_A_I_J container of block matrices from previous update  
+          * @param g_A_I_J container of block matrices from previous update
           * @param tag  SPAI configuration tag
           */
           template<typename SparseMatrixType, typename SparseVectorType, typename DenseMatrixType, typename VectorType>
@@ -271,384 +259,381 @@ namespace viennacl
                             std::vector<SparseVectorType>& g_res,
                             std::vector<bool>& g_is_update,
                             std::vector<std::vector<unsigned int> >& g_I,
-                            std::vector<std::vector<unsigned int> >& g_J, 
+                            std::vector<std::vector<unsigned int> >& g_J,
                             std::vector<VectorType>& g_b_v,
                             std::vector<DenseMatrixType>& g_A_I_J,
-                            spai_tag const & tag){
-              typedef typename DenseMatrixType::value_type ScalarType;
-              //set of new column indices
-              std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
-              //set of new row indices 
-              std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
-              //matrix A(I, \tilde J), cf. Kallischko p.31-32
-              std::vector<DenseMatrixType> g_A_I_J_u(g_J.size());
-              //matrix A(\tilde I, \tilde J), cf. Kallischko
-              std::vector<DenseMatrixType> g_A_I_u_J_u(g_J.size());
-              //new vector of beta coefficients from QR factorization
-              std::vector<VectorType> g_b_v_u(g_J.size());
-#ifdef _OPENMP
-              #pragma omp parallel for
-#endif              
-              for(std::size_t i = 0; i < g_J.size(); ++i){
-                  if(g_is_update[i]){
-                      if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag)){
-                          //initialize matrix A_I_\hatJ
-                          initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]);
-                          //multiplication of Q'*A_I_\hatJ
-                          apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]);
-                          //building new rows index set \hatI
-                          buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
-                          initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]);
-                          //composition of block for new QR factorization
-                          QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]);
-                          //QR factorization
-                          single_qr(g_A_I_u_J_u[i], g_b_v_u[i]);
-                          //composition of new R and new vector b_v
-                          composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]);
-                          composeNewVector(g_b_v_u[i], g_b_v[i]);
-                          //composition of new sets: I and J
-                          g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
-                          g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
-                      }else{
-                          g_is_update[i] = false;
-                      }
-                  }
+                            spai_tag const & tag)
+          {
+            typedef typename DenseMatrixType::value_type ScalarType;
+            //set of new column indices
+            std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
+            //set of new row indices
+            std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
+            //matrix A(I, \tilde J), cf. Kallischko p.31-32
+            std::vector<DenseMatrixType> g_A_I_J_u(g_J.size());
+            //matrix A(\tilde I, \tilde J), cf. Kallischko
+            std::vector<DenseMatrixType> g_A_I_u_J_u(g_J.size());
+            //new vector of beta coefficients from QR factorization
+            std::vector<VectorType> g_b_v_u(g_J.size());
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for(long i = 0; i < static_cast<long>(g_J.size()); ++i)
+            {
+              if(g_is_update[i])
+              {
+                if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
+                {
+                  //initialize matrix A_I_\hatJ
+                  initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]);
+                  //multiplication of Q'*A_I_\hatJ
+                  apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]);
+                  //building new rows index set \hatI
+                  buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
+                  initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]);
+                  //composition of block for new QR factorization
+                  QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]);
+                  //QR factorization
+                  single_qr(g_A_I_u_J_u[i], g_b_v_u[i]);
+                  //composition of new R and new vector b_v
+                  composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]);
+                  composeNewVector(g_b_v_u[i], g_b_v[i]);
+                  //composition of new sets: I and J
+                  g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
+                  g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
+                }
+                else
+                {
+                  g_is_update[i] = false;
+                }
               }
+            }
           }
           /**************************************************** GPU SPAI Update ****************************************************************/
-          
-          
+
+
           //performs Q'*A(I, \tilde J) on GPU
           /** @brief Performs multiplication Q'*A(I, \\tilde J) on GPU
           * @param g_J_u container of sets of new column indices
           * @param g_I container of row indices
           * @param g_A_I_J_vcl block matrix composed from previous blocks, they are blocks of R
-          * @param g_bv_vcl block of beta vectors 
-          * @param g_A_I_J_u_vcl block of matrices A(I, \\tilde J) 
+          * @param g_bv_vcl block of beta vectors
+          * @param g_A_I_J_u_vcl block of matrices A(I, \\tilde J)
           * @param g_is_update indicators, that show if a certain block should be processed
-          * @param cur_iter current iteration, used to make sure that kernel compiles just once
+          * @param ctx         Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
           */
           template<typename ScalarType>
-          void block_q_multiplication(const std::vector<std::vector<unsigned int> >& g_J_u, 
-                                      const std::vector<std::vector<unsigned int> >& g_I, 
-                                      block_matrix& g_A_I_J_vcl, 
-                                      block_vector& g_bv_vcl, 
-                                      block_matrix& g_A_I_J_u_vcl, 
+          void block_q_multiplication(const std::vector<std::vector<unsigned int> >& g_J_u,
+                                      const std::vector<std::vector<unsigned int> >& g_I,
+                                      block_matrix& g_A_I_J_vcl,
+                                      block_vector& g_bv_vcl,
+                                      block_matrix& g_A_I_J_u_vcl,
                                       std::vector<cl_uint>& g_is_update,
-                                      const unsigned int cur_iter){
-              unsigned int local_r_n, local_c_n, sz_blocks;
-              get_max_block_size(g_I, local_r_n);
-              get_max_block_size(g_J_u, local_c_n);
-              //for debug 
-              std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
-              std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-              compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
-              std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
-              /*if(cur_iter == 1){
-                  //if first run - compile the program
-                  std::string block_q_kernel_file_name = "kernels/spai/block_q.cl";
-                  std::string block_q_kernel_source;
-                  read_kernel_from_file(block_q_kernel_file_name, block_q_kernel_source);
-                  viennacl::ocl::program & block_q_prog = viennacl::ocl::current_context().add_program(block_q_kernel_source.c_str(), "block_q_kernel_source");
-                  block_q_prog.add_kernel("block_q_mult");
-                  //
-              }*/
-              
-              viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                  static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
-                                                                                                            &(g_is_update[0]));
-              viennacl::ocl::kernel& block_q_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_q_mult");
-              block_q_kernel.local_work_size(0, local_c_n);
-              block_q_kernel.global_work_size(0, 256);
-              viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), 
-                                                    g_bv_vcl.handle(),   
-                                                    g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
-                                                    viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
-                                                    static_cast<cl_uint>(g_I.size())));
+                                      viennacl::context ctx)
+          {
+            viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+            unsigned int local_r_n = 0;
+            unsigned int local_c_n = 0;
+            unsigned int sz_blocks = 0;
+            get_max_block_size(g_I, local_r_n);
+            get_max_block_size(g_J_u, local_c_n);
+            //for debug
+            std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+            std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+            compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
+            //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
+
+            viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                                     &(g_is_update[0]));
+            viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
+            viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_q_mult");
+            block_q_kernel.local_work_size(0, local_c_n);
+            block_q_kernel.global_work_size(0, 128*local_c_n);
+            viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
+                                                  g_bv_vcl.handle(),
+                                                  g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
+                                                  viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
+                                                  static_cast<cl_uint>(g_I.size())));
           }
-          
+
           /** @brief Assembly of container of index row sets: I_q, row indices for new "QR block"
           * @param g_I container of row indices
           * @param g_J container of column indices
-          * @param g_I_u container of new row indices 
+          * @param g_I_u container of new row indices
           * @param g_I_q container of row indices for new QR blocks
           */
-          void assemble_qr_row_inds(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> > g_J, 
-                                    const std::vector<std::vector<unsigned int> >& g_I_u, 
-                                    std::vector<std::vector<unsigned int> >& g_I_q){
-#ifdef _OPENMP
-              #pragma omp parallel for
-#endif              
-              for(std::size_t i = 0; i < g_I.size(); ++i){
-                  for(std::size_t j = g_J[i].size(); j < g_I[i].size(); ++j){
-                      g_I_q[i].push_back(g_I[i][j]);
-                  }
-                  
-                  for(std::size_t j = 0; j < g_I_u[i].size(); ++j){
-                      g_I_q[i].push_back(g_I_u[i][j]);
-                  }
-              }
+          template <typename SizeType>
+          void assemble_qr_row_inds(const std::vector<std::vector<SizeType> >& g_I, const std::vector<std::vector<SizeType> > g_J,
+                                    const std::vector<std::vector<SizeType> >& g_I_u,
+                                    std::vector<std::vector<SizeType> >& g_I_q)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for(long i = 0; i < static_cast<long>(g_I.size()); ++i)
+            {
+              for(vcl_size_t j = g_J[i].size(); j < g_I[i].size(); ++j)
+                  g_I_q[i].push_back(g_I[i][j]);
+
+              for(vcl_size_t j = 0; j < g_I_u[i].size(); ++j)
+                  g_I_q[i].push_back(g_I_u[i][j]);
+            }
           }
 
           /** @brief Performs assembly for new QR block
-          * @param g_J container of column indices 
+          * @param g_J container of column indices
           * @param g_I container of row indices
           * @param g_J_u container of new column indices
-          * @param g_I_u container of new row indices 
-          * @param g_I_q container of row indices for new QR blocks 
+          * @param g_I_u container of new row indices
+          * @param g_I_q container of row indices for new QR blocks
           * @param g_A_I_J_u_vcl blocks of Q'*A(I, \\tilde J)
           * @param matrix_dimensions array with matrix dimensions for all blocks
           * @param g_A_I_u_J_u_vcl blocks A(\\tilde I, \\tilde J)
           * @param g_is_update container with update indicators
           * @param is_empty_block indicator if all previous blocks A(\\tilde I, \\tilde J) - are empty, in case if they are empty kernel with smaller number of
                                   arguments is used
-          * @param cur_iter current iteration, used to make sure that kernel compiles just once 
+          * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
           */
           template<typename ScalarType>
           void assemble_qr_block(
-                                const std::vector<std::vector<unsigned int> >& g_J, 
-                                const std::vector<std::vector<unsigned int> >& g_I, 
+                                const std::vector<std::vector<unsigned int> >& g_J,
+                                const std::vector<std::vector<unsigned int> >& g_I,
                                 const std::vector<std::vector<unsigned int> >& g_J_u,
-                                const std::vector<std::vector<unsigned int> >& g_I_u, 
+                                const std::vector<std::vector<unsigned int> >& g_I_u,
                                 std::vector<std::vector<unsigned int> >& g_I_q,
-                                block_matrix& g_A_I_J_u_vcl, 
-                                viennacl::ocl::handle<cl_mem>& matrix_dimensions, 
-                                block_matrix& g_A_I_u_J_u_vcl, 
+                                block_matrix& g_A_I_J_u_vcl,
+                                viennacl::ocl::handle<cl_mem>& matrix_dimensions,
+                                block_matrix& g_A_I_u_J_u_vcl,
                                 std::vector<cl_uint>& g_is_update,
                                 const bool is_empty_block,
-                                const unsigned int cur_iter){
-              //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
-              assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
-              unsigned int sz_blocks;
-              std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
-              std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-              compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
-              std::vector<ScalarType> con_A_I_J_q(sz_blocks, static_cast<ScalarType>(0));
-              
-              /*if(cur_iter == 1){
-                  std::string qr_block_asm_file_name = "kernels/spai/qr_block_assembly_g.cl";
-                  std::string qr_block_asm_source;
-                  read_kernel_from_file(qr_block_asm_file_name, qr_block_asm_source);
-                  viennacl::ocl::program & qr_block_asm_prog = viennacl::ocl::current_context().add_program(qr_block_asm_source.c_str(), 
-                                                                                                        "qr_block_assembly_kernel_source");
-                  
-                  qr_block_asm_prog.add_kernel("block_qr_assembly");
-                  
-                  
-                  //extra kernel in case of empty block A_I_u_J_u
-                  std::string qr_block_asm_file_name_1 = "kernels/spai/qr_block_assembly_1_g.cl";
-                  std::string qr_block_asm_source_1;
-                  read_kernel_from_file(qr_block_asm_file_name_1, qr_block_asm_source_1);
-                  viennacl::ocl::program & qr_block_asm_prog_1 = viennacl::ocl::current_context().add_program(qr_block_asm_source_1.c_str(), 
-                                                                                                            "qr_block_assembly_kernel_source_1");
-                  
-                  qr_block_asm_prog_1.add_kernel("block_qr_assembly_1");
-                  
-              }*/
-              block_matrix g_A_I_J_q_vcl;
-              //need to allocate memory for QR block
-              g_A_I_J_q_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                      static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
-                                                                                      &(con_A_I_J_q[0]));
-              g_A_I_J_q_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())), 
-                                                                                &(matrix_dims[0]));
-              g_A_I_J_q_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                  static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
-                                                                                      &(blocks_ind[0]));
-              viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
-                                                                                                            &(g_is_update[0]));
-              
-              if(!is_empty_block){
-                  viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr_assembly");
-                  qr_assembly_kernel.local_work_size(0, 1);
-                  qr_assembly_kernel.global_work_size(0, 256);
-                  viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, 
-                                                            g_A_I_J_u_vcl.handle(), 
-                                                            g_A_I_J_u_vcl.handle2(), 
-                                                            g_A_I_J_u_vcl.handle1(), 
-                                                            g_A_I_u_J_u_vcl.handle(),
-                                                            g_A_I_u_J_u_vcl.handle2(), 
-                                                            g_A_I_u_J_u_vcl.handle1(), 
-                                                            g_A_I_J_q_vcl.handle(), 
-                                                            g_A_I_J_q_vcl.handle2(), 
-                                                            g_A_I_J_q_vcl.handle1(),
-                                                            g_is_update_vcl,
-                                                            static_cast<unsigned int>(g_I.size())));
-              }else{
-                  viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr_assembly_1");
-                  qr_assembly_kernel.local_work_size(0, 1);
-                  qr_assembly_kernel.global_work_size(0, 256);
-                  viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), 
-                                                            g_A_I_J_u_vcl.handle1(),
-                                                            g_A_I_J_q_vcl.handle(), 
-                                                            g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
-                                                            g_is_update_vcl,
-                                                            static_cast<unsigned int>(g_I.size())));
-              }
-              g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
-              g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
-              g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
+                                viennacl::context ctx)
+          {
+            viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+
+            //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
+            assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
+            unsigned int sz_blocks;
+            std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+            std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+            compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
+            std::vector<ScalarType> con_A_I_J_q(sz_blocks, static_cast<ScalarType>(0));
+
+            block_matrix g_A_I_J_q_vcl;
+            //need to allocate memory for QR block
+            g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                              static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
+                                                              &(con_A_I_J_q[0]));
+            g_A_I_J_q_vcl.handle().context(opencl_ctx);
+
+            g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                               static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
+                                                               &(matrix_dims[0]));
+            g_A_I_J_q_vcl.handle1().context(opencl_ctx);
+
+            g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
+                                                                &(blocks_ind[0]));
+            g_A_I_J_q_vcl.handle2().context(opencl_ctx);
+
+            viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                                     &(g_is_update[0]));
+
+            viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
+            if(!is_empty_block)
+            {
+              viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr_assembly");
+              qr_assembly_kernel.local_work_size(0, 1);
+              qr_assembly_kernel.global_work_size(0, 256);
+              viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
+                                                        g_A_I_J_u_vcl.handle(),
+                                                        g_A_I_J_u_vcl.handle2(),
+                                                        g_A_I_J_u_vcl.handle1(),
+                                                        g_A_I_u_J_u_vcl.handle(),
+                                                        g_A_I_u_J_u_vcl.handle2(),
+                                                        g_A_I_u_J_u_vcl.handle1(),
+                                                        g_A_I_J_q_vcl.handle(),
+                                                        g_A_I_J_q_vcl.handle2(),
+                                                        g_A_I_J_q_vcl.handle1(),
+                                                        g_is_update_vcl,
+                                                        static_cast<unsigned int>(g_I.size())));
+            }
+            else
+            {
+              viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr_assembly_1");
+              qr_assembly_kernel.local_work_size(0, 1);
+              qr_assembly_kernel.global_work_size(0, 256);
+              viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
+                                                        g_A_I_J_u_vcl.handle1(),
+                                                        g_A_I_J_q_vcl.handle(),
+                                                        g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
+                                                        g_is_update_vcl,
+                                                        static_cast<unsigned int>(g_I.size())));
+            }
+            g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
+            g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
+            g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
           }
 
           /** @brief Performs assembly for new R matrix on GPU
           * @param g_I container of row indices
           * @param g_J container of column indices
-          * @param g_A_I_J_vcl container of block matrices from previous update  
+          * @param g_A_I_J_vcl container of block matrices from previous update
           * @param g_A_I_J_u_vcl container of block matrices Q'*A(I, \\tilde J)
           * @param g_A_I_u_J_u_vcl container of block matrices QR factored on current iteration
           * @param g_bv_vcl block of beta vectors from previous iteration
           * @param g_bv_vcl_u block of updated beta vectors got after recent QR factorization
-          * @param g_is_update container with identificators that shows which block should be modified 
-          * @param cur_iter current iteration, used to make sure that kernel compiles just once 
-          */ 
+          * @param g_is_update container with identificators that shows which block should be modified
+          * @param ctx         Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+          */
           template<typename ScalarType>
           void assemble_r(std::vector<std::vector<unsigned int> >& g_I, std::vector<std::vector<unsigned int> >& g_J,
-                          block_matrix& g_A_I_J_vcl, 
+                          block_matrix& g_A_I_J_vcl,
                           block_matrix& g_A_I_J_u_vcl,
-                          block_matrix& g_A_I_u_J_u_vcl, 
-                          block_vector& g_bv_vcl, 
+                          block_matrix& g_A_I_u_J_u_vcl,
+                          block_vector& g_bv_vcl,
                           block_vector& g_bv_vcl_u,
                           std::vector<cl_uint>& g_is_update,
-                          const unsigned int cur_iter){
-              std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
-              std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-              std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
-              unsigned int sz_blocks, bv_size;
-              compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
-              get_size(g_J, bv_size);
-              init_start_inds(g_J, start_bv_r_inds);
-              std::vector<ScalarType> con_A_I_J_r(sz_blocks, static_cast<ScalarType>(0));
-              std::vector<ScalarType> b_v_r(bv_size, static_cast<ScalarType>(0));
-              /*if(cur_iter == 1){
-                  std::string r_block_asm_file_name = "kernels/spai/r_block_assembly_g.cl";
-                  std::string r_block_asm_source;
-                  read_kernel_from_file(r_block_asm_file_name, r_block_asm_source);
-                  viennacl::ocl::program & r_block_asm_prog = viennacl::ocl::current_context().add_program(r_block_asm_source.c_str(), 
-                                                                                                            "r_block_assembly_kernel_source");
-                  r_block_asm_prog.add_kernel("block_r_assembly");
-                  
-                  std::string bv_block_asm_file_name = "kernels/spai/bv_block_assembly_g.cl";
-                  std::string bv_block_asm_source;
-                  read_kernel_from_file(bv_block_asm_file_name, bv_block_asm_source);
-                  viennacl::ocl::program & bv_block_asm_prog = viennacl::ocl::current_context().add_program(bv_block_asm_source.c_str(), 
-                                                                                                          "bv_block_assembly_kernel_source");
-                  bv_block_asm_prog.add_kernel("block_bv_assembly");
-              }*/
-              block_matrix g_A_I_J_r_vcl;
-              block_vector g_bv_r_vcl;
-              g_A_I_J_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                      static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
-                                                                                      &(con_A_I_J_r[0]));
-              g_A_I_J_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                          static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())), 
-                                                                                      &(matrix_dims[0]));
-              g_A_I_J_r_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                          static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
-                                                                                      &(blocks_ind[0]));
-              g_bv_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,  
-                                                                        static_cast<unsigned int>(sizeof(ScalarType)*bv_size), 
-                                                                        &(b_v_r[0]));
-              g_bv_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
-                                                                                &(start_bv_r_inds[0]));
-              viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                          static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
-                                                                                                            &(g_is_update[0]));
-              viennacl::ocl::kernel& r_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_r_assembly");
-              r_assembly_kernel.local_work_size(0, 1);
-              r_assembly_kernel.global_work_size(0, 256);
-              
-              viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), 
-                                                      g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(), 
-                                                      g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(), 
-                                                      g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
-                                                      g_is_update_vcl, static_cast<cl_uint>(g_I.size())));
-              
-              viennacl::ocl::kernel & bv_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_bv_assembly");
-              bv_assembly_kernel.local_work_size(0, 1);
-              bv_assembly_kernel.global_work_size(0, 256);
-              viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
-                                                        g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
-                                                        g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
-                                                        static_cast<cl_uint>(g_I.size())));
-              g_bv_vcl.handle() = g_bv_r_vcl.handle();
-              g_bv_vcl.handle1() = g_bv_r_vcl.handle1();
-              
-              g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
-              g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
-              g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
+                          viennacl::context ctx)
+          {
+            viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+            std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+            std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+            std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
+            unsigned int sz_blocks, bv_size;
+            compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+            get_size(g_J, bv_size);
+            init_start_inds(g_J, start_bv_r_inds);
+            std::vector<ScalarType> con_A_I_J_r(sz_blocks, static_cast<ScalarType>(0));
+            std::vector<ScalarType> b_v_r(bv_size, static_cast<ScalarType>(0));
+
+            block_matrix g_A_I_J_r_vcl;
+            block_vector g_bv_r_vcl;
+            g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                              static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
+                                                              &(con_A_I_J_r[0]));
+            g_A_I_J_r_vcl.handle().context(opencl_ctx);
+
+            g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                               static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
+                                                               &(matrix_dims[0]));
+            g_A_I_J_r_vcl.handle1().context(opencl_ctx);
+
+            g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                               static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
+                                                               &(blocks_ind[0]));
+            g_A_I_J_r_vcl.handle2().context(opencl_ctx);
+
+            g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                           static_cast<unsigned int>(sizeof(ScalarType)*bv_size),
+                                                           &(b_v_r[0]));
+            g_bv_r_vcl.handle().context(opencl_ctx);
+
+            g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                            &(start_bv_r_inds[0]));
+            g_bv_r_vcl.handle().context(opencl_ctx);
+
+            viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                                     &(g_is_update[0]));
+            viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
+            viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_r_assembly");
+            r_assembly_kernel.local_work_size(0, 1);
+            r_assembly_kernel.global_work_size(0, 256);
+
+            viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
+                                                    g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
+                                                    g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
+                                                    g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
+                                                    g_is_update_vcl, static_cast<cl_uint>(g_I.size())));
+
+            viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_bv_assembly");
+            bv_assembly_kernel.local_work_size(0, 1);
+            bv_assembly_kernel.global_work_size(0, 256);
+            viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
+                                                      g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
+                                                      g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
+                                                      static_cast<cl_uint>(g_I.size())));
+            g_bv_vcl.handle() = g_bv_r_vcl.handle();
+            g_bv_vcl.handle1() = g_bv_r_vcl.handle1();
+
+            g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
+            g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
+            g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
           }
-          
-          /** @brief GPU-based block update 
+
+          /** @brief GPU-based block update
           * @param A sparse matrix
-          * @param A_v_c vectorized column-wise initial matrix 
-          * @param g_is_update container with identificators that shows which block should be modified 
+          * @param A_v_c vectorized column-wise initial matrix
+          * @param g_is_update container with identificators that shows which block should be modified
           * @param g_res container of residuals for all columns
           * @param g_J container of column index sets for all columns
           * @param g_I container of row index sets for all columns
-          * @param g_A_I_J_vcl container of block matrices from previous update  
+          * @param g_A_I_J_vcl container of block matrices from previous update
           * @param g_bv_vcl block of beta vectors from previous iteration
           * @param tag SPAI configuration tag
-          * @param cur_iter current iteration, used to make sure that kernel compiles just once 
-          */ 
+          */
           template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
           void block_update(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<SparseVectorType>& A_v_c,
                             std::vector<cl_uint>& g_is_update,
                             std::vector<SparseVectorType>& g_res,
-                            std::vector<std::vector<unsigned int> >& g_J, 
+                            std::vector<std::vector<unsigned int> >& g_J,
                             std::vector<std::vector<unsigned int> >& g_I,
-                            block_matrix& g_A_I_J_vcl, 
+                            block_matrix& g_A_I_J_vcl,
                             block_vector& g_bv_vcl,
-                            spai_tag const & tag,
-                            const unsigned int cur_iter){
-              //updated index set for columns
-              std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
-              //updated index set for rows
-              std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
-              //mixed index set of old and updated indices for rows 
-              std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
-              //GPU memory for A_I_\hatJ
-              block_matrix g_A_I_J_u_vcl;
-              //GPU memory for A_\hatI_\hatJ
-              block_matrix g_A_I_u_J_u_vcl;
-              bool is_empty_block;
-              //GPU memory for new b_v
-              block_vector g_bv_u_vcl;
-#ifdef _OPENMP
-              #pragma omp parallel for
-#endif              
-              for(std::size_t i = 0; i < g_J.size(); ++i){
-                  if(g_is_update[i]){
-                      if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag)){
-                          buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
-                      }
-                  }
-              }
-              //assemble new A_I_J_u blocks on GPU and multiply them with Q'
-              block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block, cur_iter);
-              //I have matrix A_I_J_u ready..
-              block_q_multiplication<ScalarType>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, cur_iter);
-              //assemble A_\hatI_\hatJ
-              block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter);
-              assemble_qr_block<ScalarType>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
-                                            g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter);
-              
-              block_qr<ScalarType>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, cur_iter);
-              //concatanation of new and old indices
-#ifdef _OPENMP
-              #pragma omp parallel for
-#endif              
-              for(std::size_t i = 0; i < g_J.size(); ++i){
-                  g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
-                  g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
+                            spai_tag const & tag)
+          {
+            viennacl::context ctx = viennacl::traits::context(A);
+            //updated index set for columns
+            std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
+            //updated index set for rows
+            std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
+            //mixed index set of old and updated indices for rows
+            std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
+            //GPU memory for A_I_\hatJ
+            block_matrix g_A_I_J_u_vcl;
+            //GPU memory for A_\hatI_\hatJ
+            block_matrix g_A_I_u_J_u_vcl;
+            bool is_empty_block;
+            //GPU memory for new b_v
+            block_vector g_bv_u_vcl;
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for(long i = 0; i < static_cast<long>(g_J.size()); ++i)
+            {
+              if(g_is_update[i])
+              {
+                if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
+                    buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
               }
-              assemble_r<ScalarType>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, cur_iter);
+            }
+            //assemble new A_I_J_u blocks on GPU and multiply them with Q'
+            block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
+            //I have matrix A_I_J_u ready..
+            block_q_multiplication<ScalarType>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
+            //assemble A_\hatI_\hatJ
+            block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
+            assemble_qr_block<ScalarType>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
+                                          g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);
+
+            block_qr<ScalarType>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);
+            //concatanation of new and old indices
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for(long i = 0; i < static_cast<long>(g_J.size()); ++i)
+            {
+              g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
+              g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
+            }
+            assemble_r<ScalarType>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, ctx);
           }
-        
-        }        
-      }        
+
+        }
+      }
     }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/detail/spai/spai-static.hpp b/viennacl/linalg/detail/spai/spai-static.hpp
index 8680098..dbebed4 100644
--- a/viennacl/linalg/detail/spai/spai-static.hpp
+++ b/viennacl/linalg/detail/spai/spai-static.hpp
@@ -2,24 +2,25 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/detail/spai/spai-static.hpp
-    @brief Implementation of a static SPAI. Experimental in 1.2.x.
-    
+    @brief Implementation of a static SPAI. Experimental.
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -45,122 +46,143 @@
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/compressed_matrix_operations.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
 #include "viennacl/linalg/matrix_operations.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/linalg/cg.hpp"
 #include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
 
 //#include "boost/numeric/ublas/detail/matrix_assign.hpp"
 
 namespace viennacl
 {
-    namespace linalg
+  namespace linalg
+  {
+    namespace detail
     {
-      namespace detail
+      namespace spai
       {
-        namespace spai
+
+        /** @brief Determines if element ind is in set {J}
+        * @param J current set
+        * @param ind current element
+        */
+        template <typename SizeType>
+        bool isInIndexSet(const std::vector<SizeType>& J, SizeType ind)
         {
-        
-          /********************************* STATIC SPAI FUNCTIONS******************************************/
-          
-          /** @brief Projects solution of LS problem onto original column m 
-          * @param m_in solution of LS
-          * @param J set of non-zero columns 
-          * @param m original column of M
-          */
-          template <typename VectorType, typename SparseVectorType>
-          void fanOutVector(const VectorType& m_in, const std::vector<unsigned int>& J, SparseVectorType& m){
-              unsigned int  cnt = 0;
-              for (size_t i = 0; i < J.size(); ++i) {
-                  m[J[i]] = m_in(cnt++);
-              }
-          }
-          /** @brief Solution of linear:R*x=y system by backward substitution
-          * @param R uppertriangular matrix 
-          * @param y right handside vector
-          * @param x solution vector
-          */
-          template <typename MatrixType, typename VectorType>
-          void backwardSolve(const MatrixType& R, const VectorType& y, VectorType& x){
-              typedef typename MatrixType::value_type ScalarType;
-              for (long i = R.size2()-1; i >= 0 ; i--) {
-                  x(i) = y(i);
-                  for (size_t j = i+1; j < R.size2(); ++j) {
-                      x(i) -= R(i,j)*x(j);
-                  }
-                  x(i) /= R(i,i);
-              }
-          }
-          /** @brief Perform projection of set I on the unit-vector
-          * @param I set of non-zero rows
-          * @param y result vector
-          * @param ind index of unit vector
-          */
-          template <typename VectorType, typename ScalarType>
-          void projectI(const std::vector<unsigned int>& I, VectorType& y, unsigned int ind){
-              for(size_t i = 0; i < I.size(); ++i){
-                  //y.resize(y.size()+1);
-                  if(I[i] == ind){
-                      y(i) = static_cast<ScalarType>(1.0);
-                  }
-                  else{
-                      y(i) = static_cast<ScalarType>(0.0);
-                  }
-              }
+          return (std::find(J.begin(), J.end(), ind) != J.end());
+        }
+
+
+
+        /********************************* STATIC SPAI FUNCTIONS******************************************/
+
+        /** @brief Projects solution of LS problem onto original column m
+        * @param m_in solution of LS
+        * @param J set of non-zero columns
+        * @param m original column of M
+        */
+        template <typename VectorType, typename SparseVectorType>
+        void fanOutVector(const VectorType& m_in, const std::vector<unsigned int>& J, SparseVectorType& m)
+        {
+          unsigned int  cnt = 0;
+          for (vcl_size_t i = 0; i < J.size(); ++i)
+            m[J[i]] = m_in(cnt++);
+        }
+        /** @brief Solution of linear:R*x=y system by backward substitution
+        * @param R uppertriangular matrix
+        * @param y right handside vector
+        * @param x solution vector
+        */
+        template <typename MatrixType, typename VectorType>
+        void backwardSolve(const MatrixType& R, const VectorType& y, VectorType& x)
+        {
+          for (long i = static_cast<long>(R.size2())-1; i >= 0 ; i--)
+          {
+            x(i) = y(i);
+            for (vcl_size_t j = i+1; j < R.size2(); ++j)
+                x(i) -= R(i,j)*x(j);
+
+            x(i) /= R(i,i);
           }
-          
-          /** @brief Builds index set of projected columns for current column of preconditioner
-          * @param v current column of preconditioner
-          * @param J output - index set of non-zero columns
-          */
-          template <typename SparseVectorType>
-          void buildColumnIndexSet(const SparseVectorType& v, std::vector<unsigned int>& J){
-              //typedef typename VectorType::value_type ScalarType;
-              unsigned int tmp_v;
-              for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it){
-                  tmp_v = vec_it->first;
-                  J.push_back(vec_it->first);
-              }
-              std::sort(J.begin(), J.end());
+        }
+        /** @brief Perform projection of set I on the unit-vector
+        * @param I set of non-zero rows
+        * @param y result vector
+        * @param ind index of unit vector
+        */
+        template <typename VectorType, typename ScalarType>
+        void projectI(const std::vector<unsigned int>& I, VectorType& y, unsigned int ind)
+        {
+          for(vcl_size_t i = 0; i < I.size(); ++i)
+          {
+            //y.resize(y.size()+1);
+            if(I[i] == ind)
+              y(i) = static_cast<ScalarType>(1.0);
+            else
+              y(i) = static_cast<ScalarType>(0.0);
           }
-          
-          /** @brief Initialize preconditioner with sparcity pattern = p(A)
-          * @param A input matrix
-          * @param M output matrix - initialized preconditioner
-          */
-          template <typename SparseMatrixType>
-          void initPreconditioner(const SparseMatrixType& A, SparseMatrixType& M){
-              typedef typename SparseMatrixType::value_type ScalarType;
-              M.resize(A.size1(), A.size2(), false);
-              for(typename SparseMatrixType::const_iterator1 row_it = A.begin1(); row_it!= A.end1(); ++row_it){
-                  //
-                  for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
-                      M(col_it.index1(),col_it.index2()) = static_cast<ScalarType>(1);
-                  }
-              }
+        }
+
+        /** @brief Builds index set of projected columns for current column of preconditioner
+        * @param v current column of preconditioner
+        * @param J output - index set of non-zero columns
+        */
+        template <typename SparseVectorType>
+        void buildColumnIndexSet(const SparseVectorType& v, std::vector<unsigned int>& J)
+        {
+            //typedef typename VectorType::value_type ScalarType;
+            //unsigned int tmp_v;
+            for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
+            {
+                //tmp_v = vec_it->first;
+                J.push_back(vec_it->first);
+            }
+            std::sort(J.begin(), J.end());
+        }
+
+        /** @brief Initialize preconditioner with sparcity pattern = p(A)
+        * @param A input matrix
+        * @param M output matrix - initialized preconditioner
+        */
+        template <typename SparseMatrixType>
+        void initPreconditioner(const SparseMatrixType& A, SparseMatrixType& M)
+        {
+          typedef typename SparseMatrixType::value_type ScalarType;
+          M.resize(A.size1(), A.size2(), false);
+          for(typename SparseMatrixType::const_iterator1 row_it = A.begin1(); row_it!= A.end1(); ++row_it)
+          {
+            //
+            for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+            {
+              M(col_it.index1(),col_it.index2()) = static_cast<ScalarType>(1);
+            }
           }
-          
-          /** @brief Row projection for matrix A(:,J) -> A(I,J), building index set of non-zero rows
-          * @param A_v_c input matrix
-          * @param J set of non-zero rows
-          * @param I output matrix 
-          */
-          template <typename SparseVectorType>
-          void projectRows(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& J, std::vector<unsigned int>& I){
-              for(size_t i = 0; i < J.size(); ++i){
-                  for(typename SparseVectorType::const_iterator col_it = A_v_c[J[i]].begin(); col_it!=A_v_c[J[i]].end(); ++col_it){
-                      if(!isInIndexSet(I, col_it->first)){
-                          I.push_back(col_it->first);
-                      }
-                  }
-              }
-              std::sort(I.begin(), I.end());
+        }
+
+        /** @brief Row projection for matrix A(:,J) -> A(I,J), building index set of non-zero rows
+        * @param A_v_c input matrix
+        * @param J set of non-zero rows
+        * @param I output matrix
+        */
+        template <typename SparseVectorType>
+        void projectRows(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& J, std::vector<unsigned int>& I)
+        {
+          for(vcl_size_t i = 0; i < J.size(); ++i)
+          {
+            for(typename SparseVectorType::const_iterator col_it = A_v_c[J[i]].begin(); col_it!=A_v_c[J[i]].end(); ++col_it)
+            {
+              if(!isInIndexSet(I, col_it->first))
+                I.push_back(col_it->first);
+            }
           }
+          std::sort(I.begin(), I.end());
         }
-      }
-    }
-}
 
-#endif
\ No newline at end of file
+
+      } //namespace spai
+    } //namespace detail
+  } //namespace linalg
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/linalg/detail/spai/spai.hpp b/viennacl/linalg/detail/spai/spai.hpp
index 7466aba..1358637 100644
--- a/viennacl/linalg/detail/spai/spai.hpp
+++ b/viennacl/linalg/detail/spai/spai.hpp
@@ -2,23 +2,24 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/detail/spai/spai.hpp
-    @brief Main implementation of SPAI (not FSPAI)
+    @brief Main implementation of SPAI (not FSPAI). Experimental.
 */
 
 #include <utility>
@@ -54,14 +55,13 @@
 #include "viennacl/linalg/prod.hpp"
 #include "viennacl/matrix.hpp"
 #include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/compressed_matrix_operations.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
 #include "viennacl/linalg/matrix_operations.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/linalg/inner_prod.hpp"
 #include "viennacl/linalg/ilu.hpp"
 #include "viennacl/ocl/backend.hpp"
-#include "viennacl/linalg/kernels/spai_source.h"
-#include "viennacl/linalg/kernels/spai_kernels.h"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
 
 
 
@@ -75,7 +75,7 @@ namespace viennacl
     {
       namespace spai
       {
-        
+
         //debug function for print
         template<typename SparseVectorType>
         void print_sparse_vector(const SparseVectorType& v){
@@ -92,12 +92,12 @@ namespace viennacl
                 std::cout<<std::endl;
             }
         }
-        
+
         /** @brief Add two sparse vectors res_v = b*v
          * @param v initial sparse vector
          * @param b scalar
-         * @param res_v output vector 
-         */ 
+         * @param res_v output vector
+         */
         template<typename SparseVectorType, typename ScalarType>
         void add_sparse_vectors(const SparseVectorType& v, const ScalarType b,  SparseVectorType& res_v){
             for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
@@ -107,10 +107,10 @@ namespace viennacl
         //sparse-matrix - vector product
         /** @brief Computation of residual res = A*v - e
          * @param A_v_c column major vectorized input sparse matrix
-         * @param v sparse vector, in this case new column of preconditioner matrix 
+         * @param v sparse vector, in this case new column of preconditioner matrix
          * @param ind index for current column
          * @param res residual
-         */ 
+         */
         template<typename SparseVectorType, typename ScalarType>
         void compute_spai_residual(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v,
                                    const unsigned int ind, SparseVectorType& res){
@@ -119,111 +119,129 @@ namespace viennacl
             }
             res[ind] -= static_cast<ScalarType>(1);
         }
-        
+
         /** @brief Setting up index set of columns and rows for certain column
-         * @param A_v_c column major vectorized initial sparse matrix  
+         * @param A_v_c column major vectorized initial sparse matrix
          * @param v current column of preconditioner matrix
          * @param J set of column indices
          * @param I set of row indices
-         */ 
+         */
         template<typename SparseVectorType>
-        void build_index_set(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v, std::vector<unsigned int>& J, 
+        void build_index_set(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v, std::vector<unsigned int>& J,
                              std::vector<unsigned int>& I){
             buildColumnIndexSet(v, J);
             projectRows(A_v_c, J, I);
         }
-        
+
+        /** @brief Initializes a dense matrix from a sparse one
+        * @param A_in Riginal sparse matrix
+        * @param J Set of column indices
+        * @param I Set of row indices
+        * @param A_out dense matrix output
+        */
+        template<typename SparseMatrixType, typename DenseMatrixType>
+        void initProjectSubMatrix(const SparseMatrixType& A_in, const std::vector<unsigned int>& J, std::vector<unsigned int>& I,
+                                  DenseMatrixType& A_out)
+        {
+          A_out.resize(I.size(), J.size(), false);
+          for(vcl_size_t j = 0; j < J.size(); ++j)
+          {
+            for(vcl_size_t i = 0; i < I.size(); ++i)
+              A_out(i,j) = A_in(I[i],J[j]);
+          }
+        }
+
+
         /************************************************** CPU BLOCK SET UP ***************************************/
         /** @brief Setting up blocks and QR factorizing them on CPU
-         * @param A initial sparse matrix 
-         * @param A_v_c column major vectorized initial sparse matrix  
+         * @param A initial sparse matrix
+         * @param A_v_c column major vectorized initial sparse matrix
          * @param M_v initialized preconditioner
          * @param g_I container of row indices
          * @param g_J container of column indices
          * @param g_A_I_J container of dense matrices -> R matrices after QR factorization
          * @param g_b_v container of vectors beta, necessary for Q recovery
-         */ 
+         */
         template<typename SparseMatrixType, typename DenseMatrixType, typename SparseVectorType, typename VectorType>
         void block_set_up(const SparseMatrixType& A,
                           const std::vector<SparseVectorType>& A_v_c,
-                          const std::vector<SparseVectorType>& M_v, 
-                          std::vector<std::vector<unsigned int> >& g_I, 
+                          const std::vector<SparseVectorType>& M_v,
+                          std::vector<std::vector<unsigned int> >& g_I,
                           std::vector<std::vector<unsigned int> >& g_J,
                           std::vector<DenseMatrixType>& g_A_I_J,
                           std::vector<VectorType>& g_b_v){
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
             #pragma omp parallel for
-#endif            
-            for(std::size_t i = 0; i < M_v.size(); ++i){
+#endif
+            for (long i = 0; i < static_cast<long>(M_v.size()); ++i){
                 build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
                 initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
                 //print_matrix(g_A_I_J[i]);
                 single_qr(g_A_I_J[i], g_b_v[i]);
                 //print_matrix(g_A_I_J[i]);
-            }            
+            }
         }
-        
+
         /** @brief Setting up index set of columns and rows for all columns
-         * @param A_v_c column major vectorized initial sparse matrix  
+         * @param A_v_c column major vectorized initial sparse matrix
          * @param M_v initialized preconditioner
          * @param g_J container of column indices
          * @param g_I container of row indices
-         */ 
+         */
         template<typename SparseVectorType>
         void index_set_up(const std::vector<SparseVectorType>& A_v_c,
-                          const std::vector<SparseVectorType>& M_v, 
-                          std::vector<std::vector<unsigned int> >& g_J, 
+                          const std::vector<SparseVectorType>& M_v,
+                          std::vector<std::vector<unsigned int> >& g_J,
                           std::vector<std::vector<unsigned int> >& g_I)
         {
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
             #pragma omp parallel for
-#endif            
-            for(std::size_t i = 0; i < M_v.size(); ++i){
+#endif
+            for (long i = 0; i < static_cast<long>(M_v.size()); ++i){
                 build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
             }
         }
-        
+
         /************************************************** GPU BLOCK SET UP ***************************************/
         /** @brief Setting up blocks and QR factorizing them on GPU
-         * @param A initial sparse matrix 
-         * @param A_v_c column major vectorized initial sparse matrix  
+         * @param A initial sparse matrix
+         * @param A_v_c column major vectorized initial sparse matrix
          * @param M_v initialized preconditioner
          * @param g_is_update container that indicates which blocks are active
          * @param g_I container of row indices
          * @param g_J container of column indices
          * @param g_A_I_J container of dense matrices -> R matrices after QR factorization
          * @param g_bv container of vectors beta, necessary for Q recovery
-         * @param cur_iter number of current iteration
-         */ 
+         */
         template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
         void block_set_up(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A,
-                          const std::vector<SparseVectorType>& A_v_c, 
+                          const std::vector<SparseVectorType>& A_v_c,
                           const std::vector<SparseVectorType>& M_v,
                           std::vector<cl_uint> g_is_update,
-                          std::vector<std::vector<unsigned int> >& g_I, 
-                          std::vector<std::vector<unsigned int> >& g_J, 
-                          block_matrix & g_A_I_J, 
-                          block_vector & g_bv,
-                          const unsigned int cur_iter)
+                          std::vector<std::vector<unsigned int> >& g_I,
+                          std::vector<std::vector<unsigned int> >& g_J,
+                          block_matrix & g_A_I_J,
+                          block_vector & g_bv)
         {
-            bool is_empty_block;
-            //build index set
-            index_set_up(A_v_c, M_v, g_J, g_I);
-            block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block, cur_iter);
-            block_qr<ScalarType>(g_I, g_J, g_A_I_J, g_bv, g_is_update, cur_iter);
-            
+          viennacl::context ctx = viennacl::traits::context(A);
+          bool is_empty_block;
+          //build index set
+          index_set_up(A_v_c, M_v, g_J, g_I);
+          block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
+          block_qr<ScalarType>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
+
         }
-        
-        
+
+
         /***************************************************************************************************/
         /******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
         /***************************************************************************************************/
-        /** @brief Elicitation of sparse vector m for particular column from m_in - contigious vector for all columns  
+        /** @brief Elicitation of sparse vector m for particular column from m_in - contigious vector for all columns
          * @param m_in contigious sparse vector for all columns
-         * @param start_m_ind start index of particular vector   
+         * @param start_m_ind start index of particular vector
          * @param J column index set
          * @param m sparse vector for particular column
-         */ 
+         */
         template<typename ScalarType, typename SparseVectorType>
         void custom_fan_out(const std::vector<ScalarType> & m_in,
                             unsigned int start_m_ind,
@@ -231,117 +249,113 @@ namespace viennacl
                             SparseVectorType & m)
         {
             unsigned int  cnt = 0;
-            for (std::size_t i = 0; i < J.size(); ++i) {
+            for (vcl_size_t i = 0; i < J.size(); ++i) {
                 m[J[i]] = m_in[start_m_ind + cnt++];
             }
         }
-        
-        
+
+
 
         //GPU based least square problem
-        /** @brief Solution of Least square problem on GPU 
-         * @param A_v_c column-major vectorized initial sparse matrix 
+        /** @brief Solution of Least square problem on GPU
+         * @param A_v_c column-major vectorized initial sparse matrix
          * @param M_v column-major vectorized sparse preconditioner matrix
          * @param g_I container of row set indices
          * @param g_J container of column set indices
          * @param g_A_I_J_vcl contigious matrix that consists of blocks A(I_k, J_k)
          * @param g_bv_vcl contigious vector that consists of betas, necessary for Q recovery
-         * @param g_res container of residuals 
+         * @param g_res container of residuals
          * @param g_is_update container with indicators which blocks are active
          * @param tag spai tag
-         * @param cur_iter current iteration number
-         */ 
+         * @param ctx         Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+         */
         template<typename SparseVectorType, typename ScalarType>
-        void least_square_solve(std::vector<SparseVectorType> & A_v_c, 
+        void least_square_solve(std::vector<SparseVectorType> & A_v_c,
                                 std::vector<SparseVectorType> & M_v,
-                                std::vector<std::vector<unsigned int> >& g_I, 
+                                std::vector<std::vector<unsigned int> >& g_I,
                                 std::vector<std::vector<unsigned int> > & g_J,
                                 block_matrix & g_A_I_J_vcl,
                                 block_vector & g_bv_vcl,
                                 std::vector<SparseVectorType> & g_res,
                                 std::vector<cl_uint> & g_is_update,
                                 const spai_tag & tag,
-                                const unsigned int cur_iter){
-            unsigned int y_sz, m_sz;
-            std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
-            std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
-            get_size(g_I, y_sz);
-            init_start_inds(g_I, y_inds);
-            init_start_inds(g_J, m_inds);
-            //create y_v
-            std::vector<ScalarType> y_v(y_sz, static_cast<ScalarType>(0));
-            for(std::size_t i = 0; i < M_v.size(); ++i){
-                for(std::size_t j = 0; j < g_I[i].size(); ++j){
-                    if(g_I[i][j] == i)
-                        y_v[y_inds[i] + j] = static_cast<ScalarType>(1.0);
-                }
+                                viennacl::context ctx)
+        {
+          viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+          unsigned int y_sz, m_sz;
+          std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
+          std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
+          get_size(g_I, y_sz);
+          init_start_inds(g_I, y_inds);
+          init_start_inds(g_J, m_inds);
+          //create y_v
+          std::vector<ScalarType> y_v(y_sz, static_cast<ScalarType>(0));
+          for(vcl_size_t i = 0; i < M_v.size(); ++i)
+          {
+            for(vcl_size_t j = 0; j < g_I[i].size(); ++j)
+            {
+              if(g_I[i][j] == i)
+                y_v[y_inds[i] + j] = static_cast<ScalarType>(1.0);
             }
-            //compute m_v
-            get_size(g_J, m_sz);
-            std::vector<ScalarType> m_v(m_sz, static_cast<cl_uint>(0));
-            
-            //acquire kernel
-            /*if(cur_iter == 0){
-                std::string ls_kernel_file_name = "kernels/spai/ls_g.cl";
-                std::string ls_kernel_source;
-                read_kernel_from_file(ls_kernel_file_name, ls_kernel_source);
-                //compilation of a kernel
-                viennacl::ocl::program & ls_prog = viennacl::ocl::current_context().add_program(ls_kernel_source.c_str(), "ls_kernel_source");
-                //least square kernel
-                ls_prog.add_kernel("block_least_squares");
-            }*/
-            block_vector y_v_vcl;
-            block_vector m_v_vcl;
-            //prepearing memory for least square problem on GPU
-            y_v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                              static_cast<unsigned int>(sizeof(ScalarType)*y_v.size()), 
-                                                                              &(y_v[0]));
-            m_v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                              static_cast<unsigned int>(sizeof(ScalarType)*m_v.size()),
-                                                                              &(m_v[0]));
-            y_v_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                               static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
-                                                                               &(y_inds[0]));
-            viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
-                                                                                        &(g_is_update[0]));
-            viennacl::ocl::kernel& ls_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_least_squares");
-            ls_kernel.local_work_size(0, 1);
-            ls_kernel.global_work_size(0, 256);
-            viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(), 
-                                             y_v_vcl.handle(), y_v_vcl.handle1(), 
-                                             g_A_I_J_vcl.handle1(), g_is_update_vcl,
-                                             //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
-                                             static_cast<unsigned int>(M_v.size())));
-            //copy vector m_v back from GPU to CPU
-            cl_int vcl_err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                                 m_v_vcl.handle().get(), CL_TRUE, 0, 
-                                                 sizeof(ScalarType)*(m_v.size()),
-                                                 &(m_v[0]), 0, NULL, NULL);
-            VIENNACL_ERR_CHECK(vcl_err);
-            //fan out vector in parallel
-            //#pragma omp parallel for
-            for(std::size_t i = 0; i < M_v.size(); ++i){
-                if(g_is_update[i]){
-                    //faned out onto sparse vector
-                    custom_fan_out(m_v, m_inds[i], g_J[i], M_v[i]);
-                    g_res[i].clear();
-                    compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(i), g_res[i]);
-                    ScalarType res_norm = 0;
-                    //compute norm of res - just to make sure that this implementatino works correct
-                    sparse_norm_2(g_res[i], res_norm);
-                    //std::cout<<"Residual norm of column #: "<<i<<std::endl;
-                    //std::cout<<res_norm<<std::endl;
-                    //std::cout<<"************************"<<std::endl;
-                    g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
-                    
-                }
+          }
+          //compute m_v
+          get_size(g_J, m_sz);
+          std::vector<ScalarType> m_v(m_sz, static_cast<cl_uint>(0));
+
+          block_vector y_v_vcl;
+          block_vector m_v_vcl;
+          //preparing memory for least square problem on GPU
+          y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                      static_cast<unsigned int>(sizeof(ScalarType)*y_v.size()),
+                                                      &(y_v[0]));
+          m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                      static_cast<unsigned int>(sizeof(ScalarType)*m_v.size()),
+                                                      &(m_v[0]));
+          y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                       static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                       &(y_inds[0]));
+          viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+                                                                                   &(g_is_update[0]));
+          viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
+          viennacl::ocl::kernel& ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_least_squares");
+          ls_kernel.local_work_size(0, 1);
+          ls_kernel.global_work_size(0, 256);
+          viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
+                                           y_v_vcl.handle(), y_v_vcl.handle1(),
+                                           g_A_I_J_vcl.handle1(), g_is_update_vcl,
+                                           //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
+                                           static_cast<unsigned int>(M_v.size())));
+          //copy vector m_v back from GPU to CPU
+          cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
+                                               m_v_vcl.handle().get(), CL_TRUE, 0,
+                                               sizeof(ScalarType)*(m_v.size()),
+                                               &(m_v[0]), 0, NULL, NULL);
+          VIENNACL_ERR_CHECK(vcl_err);
+          //fan out vector in parallel
+          //#pragma omp parallel for
+          for(long i = 0; i < static_cast<long>(M_v.size()); ++i)
+          {
+            if(g_is_update[i])
+            {
+              //fanned out onto sparse vector
+              custom_fan_out(m_v, m_inds[i], g_J[i], M_v[i]);
+              g_res[i].clear();
+              compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(i), g_res[i]);
+              ScalarType res_norm = 0;
+              //compute norm of res - just to make sure that this implementation works correctly
+              sparse_norm_2(g_res[i], res_norm);
+              //std::cout<<"Residual norm of column #: "<<i<<std::endl;
+              //std::cout<<res_norm<<std::endl;
+              //std::cout<<"************************"<<std::endl;
+              g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
             }
+          }
         }
-        
+
         //CPU based least square problems
-        /** @brief Solution of Least square problem on CPU 
-         * @param A_v_c column-major vectorized initial sparse matrix 
+        /** @brief Solution of Least square problem on CPU
+         * @param A_v_c column-major vectorized initial sparse matrix
          * @param g_R blocks for least square solution
          * @param g_b_v vectors beta, necessary for Q recovery
          * @param g_I container of row index set for all columns of matrix M
@@ -350,23 +364,23 @@ namespace viennacl
          * @param g_is_update container with indicators which blocks are active
          * @param M_v column-major vectorized sparse matrix, final preconditioner
          * @param tag spai tag
-         */ 
+         */
         template<typename SparseVectorType, typename DenseMatrixType, typename VectorType>
         void least_square_solve(const std::vector<SparseVectorType>& A_v_c,
-                                std::vector<DenseMatrixType>& g_R, 
+                                std::vector<DenseMatrixType>& g_R,
                                 std::vector<VectorType>& g_b_v,
-                                std::vector<std::vector<unsigned int> >& g_I, 
+                                std::vector<std::vector<unsigned int> >& g_I,
                                 std::vector<std::vector<unsigned int> >& g_J,
-                                std::vector<SparseVectorType>& g_res, 
-                                std::vector<bool>& g_is_update, 
+                                std::vector<SparseVectorType>& g_res,
+                                std::vector<bool>& g_is_update,
                                 std::vector<SparseVectorType>& M_v,
                                 const spai_tag& tag){
             typedef typename DenseMatrixType::value_type ScalarType;
             //VectorType m_new, y;
-#ifdef _OPENMP
+#ifdef VIENNACL_WITH_OPENMP
             #pragma omp parallel for
-#endif            
-            for(std::size_t i = 0; i < M_v.size(); ++i){
+#endif
+            for (long i = 0; i < static_cast<long>(M_v.size()); ++i){
                 if(g_is_update[i]){
                     VectorType y = boost::numeric::ublas::zero_vector<ScalarType>(g_I[i].size());
                     //std::cout<<y<<std::endl;
@@ -376,7 +390,7 @@ namespace viennacl
                     backwardSolve(g_R[i], y, m_new);
                     fanOutVector(m_new, g_J[i], M_v[i]);
                     g_res[i].clear();
-                    compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + i), g_res[i]); 
+                    compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + i), g_res[i]);
                     ScalarType res_norm = 0;
                     sparse_norm_2(g_res[i], res_norm);
 //                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
@@ -385,86 +399,85 @@ namespace viennacl
                     g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic());
                 }
             }
-        } 
-        
+        }
+
         //************************************ UPDATE CHECK ***************************************************//
         template<typename VectorType>
         bool is_all_update(VectorType& parallel_is_update){
-            
+
             for(unsigned int i = 0; i < parallel_is_update.size(); ++i){
                 if(parallel_is_update[i])
                     return true;
             }
             return false;
         }
-        
+
         //********************************** MATRIX VECTORIZATION ***********************************************//
         //Matrix vectorization, column based approach
-        /** @brief Solution of Least square problem on CPU 
-         * @param M_in input sparse, boost::numeric::ublas::compressed_matrix 
-         * @param M_v array of sparse vectors 
-         */ 
+        /** @brief Solution of Least square problem on CPU
+         * @param M_in input sparse, boost::numeric::ublas::compressed_matrix
+         * @param M_v array of sparse vectors
+         */
         template<typename SparseMatrixType, typename SparseVectorType>
         void vectorize_column_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
             for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it){
                 //
                 for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
-                    M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it; 
+                    M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
                 }
                 //std::cout<<std::endl;
             }
         }
-        
+
         //Matrix vectorization row based approach
         template<typename SparseMatrixType, typename SparseVectorType>
         void vectorize_row_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
             for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it){
                 for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
-                    M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it; 
+                    M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
                 }
             }
         }
-        
+
         //************************************* BLOCK ASSEMBLY CODE *********************************************//
-        
-        
-        
-        void write_set_to_array(const std::vector<std::vector<unsigned int> >& ind_set, std::vector<cl_uint>& a){
-            unsigned int cnt = 0;
+
+
+        template <typename SizeType>
+        void write_set_to_array(const std::vector<std::vector<SizeType> >& ind_set, std::vector<cl_uint>& a){
+            vcl_size_t cnt = 0;
             //unsigned int tmp;
-            for(size_t i = 0; i < ind_set.size(); ++i){
-                for(size_t j = 0; j < ind_set[i].size(); ++j){
-                    a[cnt++] = static_cast<cl_uint>(ind_set[i][j]); 
+            for(vcl_size_t i = 0; i < ind_set.size(); ++i){
+                for(vcl_size_t j = 0; j < ind_set[i].size(); ++j){
+                    a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
                 }
             }
         }
-        
-        
-        
+
+
+
         //assembling blocks on GPU
-        /** @brief Assembly of blocks on GPU by a gived set of row indices: g_I and column indices: g_J 
-         * @param A intial sparse matrix  
+        /** @brief Assembly of blocks on GPU by a given set of row indices: g_I and column indices: g_J
+         * @param A initial sparse matrix
          * @param g_J container of column index set
          * @param g_I container of row index set
          * @param g_A_I_J_vcl contigious blocks A(I, J) using GPU memory
          * @param g_is_update container with indicators which blocks are active
-         * @param is_empty_block parameter that indicates if no block were assembled 
-         * @param cur_iter current iteration number 
-         */ 
+         * @param is_empty_block parameter that indicates if no block were assembled
+         */
         template<typename ScalarType, unsigned int MAT_ALIGNMENT>
-        void block_assembly(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<std::vector<unsigned int> >& g_J, 
+        void block_assembly(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<std::vector<unsigned int> >& g_J,
                             const std::vector<std::vector<unsigned int> >& g_I,
-                            block_matrix& g_A_I_J_vcl, 
+                            block_matrix& g_A_I_J_vcl,
                             std::vector<cl_uint>& g_is_update,
-                            bool& is_empty_block,
-                            const unsigned int cur_iter){
+                            bool& is_empty_block)
+        {
             //computing start indices for index sets and start indices for block matrices
             unsigned int sz_I, sz_J, sz_blocks;
             std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
             std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
             std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
             std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-            // 
+            //
             init_start_inds(g_J, j_ind);
             init_start_inds(g_I, i_ind);
             //
@@ -478,75 +491,84 @@ namespace viennacl
             write_set_to_array(g_I, I_set);
             write_set_to_array(g_J, J_set);
             // if block for assembly does exist
-            if(I_set.size() > 0 && J_set.size() > 0){
-                compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
-                std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
-                
-                /*if(cur_iter == 0){
-                    std::string block_asm_file_name = "kernels/spai/block_assembly_g.cl";
-                    std::string block_asm_source;
-                    read_kernel_from_file(block_asm_file_name, block_asm_source);
-                    viennacl::ocl::program & block_asm_prog = viennacl::ocl::current_context().add_program(block_asm_source.c_str(), 
-                                                                                                           "block_assembly_kernel_source");
-                    
-                    block_asm_prog.add_kernel("assemble_blocks");
-                }*/
-                block_vector set_I_vcl, set_J_vcl;
-                //init memory on GPU
-                //contigious g_A_I_J
-                g_A_I_J_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                      static_cast<unsigned int>(sizeof(ScalarType)*(sz_blocks)), 
-                                                                                      &(con_A_I_J[0]));
-                //matrix_dimensions
-                g_A_I_J_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                   static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())), 
-                                                                                   &(matrix_dims[0]));
-                //start_block inds
-                g_A_I_J_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                       static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
-                                                                                       &(blocks_ind[0]));
-                //set_I
-                set_I_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*sz_I), 
-                                                                                    &(I_set[0]));
-                //set_J
-                set_J_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*sz_J), 
-                                                                                    &(J_set[0]));
-                //i_ind
-                set_I_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
-                                                                                     &(i_ind[0]));
-                //j_ind
-                set_J_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
-                                                                                     &(j_ind[0]));
-                
-                viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                                                               static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
-                                                                                                               &(g_is_update[0]));
-                viennacl::ocl::kernel& assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "assemble_blocks");
-                assembly_kernel.local_work_size(0, 1);
-                assembly_kernel.global_work_size(0, 256);
-                viennacl::ocl::enqueue(assembly_kernel(A.handle1(), A.handle2(), A.handle(), 
-                                                       set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(), 
-                                                       set_J_vcl.handle1(), 
-                                                       g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
-                                                       g_is_update_vcl,
-                                                       static_cast<unsigned int>(g_I.size())));
-                is_empty_block = false;
-            }else{ 
-                is_empty_block = true;
+            if (I_set.size() > 0 && J_set.size() > 0)
+            {
+              viennacl::context ctx = viennacl::traits::context(A);
+              viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+              compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+              std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
+
+              block_vector set_I_vcl, set_J_vcl;
+              //init memory on GPU
+              //contiguous g_A_I_J
+              g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                              static_cast<unsigned int>(sizeof(ScalarType)*(sz_blocks)),
+                                                              &(con_A_I_J[0]));
+              g_A_I_J_vcl.handle().context(opencl_ctx);
+
+              //matrix_dimensions
+              g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                               static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
+                                                               &(matrix_dims[0]));
+              g_A_I_J_vcl.handle1().context(opencl_ctx);
+
+              //start_block inds
+              g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                               static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                               &(blocks_ind[0]));
+              g_A_I_J_vcl.handle2().context(opencl_ctx);
+
+              //set_I
+              set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                            static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
+                                                            &(I_set[0]));
+              set_I_vcl.handle().context(opencl_ctx);
+
+              //set_J
+              set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                            static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
+                                                            &(J_set[0]));
+              set_J_vcl.handle().context(opencl_ctx);
+
+              //i_ind
+              set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                             static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                             &(i_ind[0]));
+              set_I_vcl.handle().context(opencl_ctx);
+
+              //j_ind
+              set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                             static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                             &(j_ind[0]));
+              set_J_vcl.handle().context(opencl_ctx);
+
+              viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                                       static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+                                                                                       &(g_is_update[0]));
+
+              viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
+              viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "assemble_blocks");
+              assembly_kernel.local_work_size(0, 1);
+              assembly_kernel.global_work_size(0, 256);
+              viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                                                     set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
+                                                     set_J_vcl.handle1(),
+                                                     g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
+                                                     g_is_update_vcl,
+                                                     static_cast<unsigned int>(g_I.size())));
+              is_empty_block = false;
             }
+            else
+              is_empty_block = true;
         }
-        
+
         /************************************************************************************************************************/
-        
-        /** @brief Insertion of vectorized matrix column into original sparse matrix 
-         * @param M_v column-major vectorized matrix 
+
+        /** @brief Insertion of vectorized matrix column into original sparse matrix
+         * @param M_v column-major vectorized matrix
          * @param M original sparse matrix
          * @param is_right indicates if matrix should be transposed in the output
-         */ 
+         */
         template<typename SparseMatrixType, typename SparseVectorType>
         void insert_sparse_columns(const std::vector<SparseVectorType>& M_v,
                                    SparseMatrixType& M,
@@ -568,17 +590,17 @@ namespace viennacl
               }
             }
         }
-        
+
         /** @brief Transposition of sparse matrix
-         * @param A_in intial sparse matrix  
-         * @param A output transposed matrix 
-         */ 
+         * @param A_in intial sparse matrix
+         * @param A output transposed matrix
+         */
         template<typename MatrixType>
         void sparse_transpose(const MatrixType& A_in, MatrixType& A){
             typedef typename MatrixType::value_type ScalarType;
-            std::vector<std::map<size_t, ScalarType> >   temp_A(A_in.size2());
+            std::vector<std::map<vcl_size_t, ScalarType> >   temp_A(A_in.size2());
             A.resize(A_in.size2(), A_in.size1(), false);
-            
+
             for (typename MatrixType::const_iterator1 row_it = A_in.begin1();
                  row_it != A_in.end1();
                  ++row_it)
@@ -590,32 +612,32 @@ namespace viennacl
                     temp_A[col_it.index2()][col_it.index1()] = *col_it;
                 }
             }
-            
-            for (size_t i=0; i<temp_A.size(); ++i)
+
+            for (vcl_size_t i=0; i<temp_A.size(); ++i)
             {
-                for (typename std::map<size_t, ScalarType>::const_iterator it = temp_A[i].begin();
+                for (typename std::map<vcl_size_t, ScalarType>::const_iterator it = temp_A[i].begin();
                      it != temp_A[i].end();
                      ++it)
                     A(i, it->first) = it->second;
             }
         }
-        
-        
-        
-        
+
+
+
+
 //        template<typename SparseVectorType>
 //        void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
 //            for(int i = 0; i < l_M_v.size(); ++i){
 //                l_M_v[i] = M_v[i + beg_ind];
 //            }
 //        }
-        
+
         //CPU version
-        /** @brief Construction of SPAI preconditioner on CPU 
-         * @param A initial sparse matrix   
+        /** @brief Construction of SPAI preconditioner on CPU
+         * @param A initial sparse matrix
          * @param M output preconditioner
          * @param tag spai tag
-         */ 
+         */
         template <typename MatrixType>
         void computeSPAI(const MatrixType & A, MatrixType & M, spai_tag & tag){
             typedef typename MatrixType::value_type ScalarType;
@@ -630,8 +652,8 @@ namespace viennacl
             std::vector<SparseVectorType> M_v(M.size2());
             vectorize_column_matrix(A, A_v_c);
             vectorize_column_matrix(M, M_v);
-            
-            
+
+
             while(go_on){
                 go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
                 cur_iter = 0;
@@ -646,7 +668,7 @@ namespace viennacl
                 //print_matrix(M_v);
                 std::vector<SparseVectorType> l_M_v(l_sz);
                 //custom_copy(M_v, l_M_v, beg_ind);
-                std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin()); 
+                std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
                 //print_matrix(l_M_v);
                 //std::vector<SparseVectorType> l_A_v_c(K);
                 //custom_copy(A_v_c, l_A_v_c, beg_ind);
@@ -654,7 +676,7 @@ namespace viennacl
                 //print_matrix(l_A_v_c);
                 //vectorize_row_matrix(A, A_v_r);
                 //working blocks
-                //std::vector<DenseMatrixType> g_A_I_J(M.size2())                
+                //std::vector<DenseMatrixType> g_A_I_J(M.size2())
                 std::vector<DenseMatrixType> g_A_I_J(l_sz);
                 //std::vector<VectorType> g_b_v(M.size2());
                 std::vector<VectorType> g_b_v(l_sz);
@@ -669,44 +691,41 @@ namespace viennacl
                     // PHASE ONE
                     if(cur_iter == 0) block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
                     else block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
-                    
+
                     //PHASE TWO, LEAST SQUARE SOLUTION
                     least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
-                    
+
                     if(tag.getIsStatic()) break;
                     cur_iter++;
-                    
-                    
+
+
                 }
                 std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
                 tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
                 tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
-                //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd()); 
-                
+                //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
+
             }
             M.resize(M.size1(), M.size2(), false);
             insert_sparse_columns(M_v, M, tag.getIsRight());
         }
 
-        
+
         //GPU - based version
         /** @brief Construction of SPAI preconditioner on GPU
-         * @param A initial sparse matrix   
+         * @param A initial sparse matrix
          * @param cpu_A copy of initial matrix on CPU
          * @param cpu_M output preconditioner on CPU
          * @param M output preconditioner
          * @param tag SPAI tag class with parameters
-         */ 
+         */
         template <typename ScalarType, unsigned int MAT_ALIGNMENT>
         void computeSPAI(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, //input
                          const boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_A,
                          boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_M, //output
                          viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& M,
                          const spai_tag& tag){
-            typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
             typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
-            typedef typename boost::numeric::ublas::matrix<ScalarType> DenseMatrixType;
-            typedef typename boost::numeric::ublas::compressed_matrix<ScalarType> CPUMatrixType;
             //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
             //sparse matrix transpose...
             unsigned int cur_iter = 0;
@@ -720,7 +739,7 @@ namespace viennacl
             std::vector<SparseVectorType> g_res(cpu_M.size2());
             std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
             std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
-            
+
             //OpenCL variables
             block_matrix g_A_I_J_vcl;
             block_vector g_bv_vcl;
@@ -729,13 +748,15 @@ namespace viennacl
                 // PHASE ONE..
                 //timer.start();
                 //index set up on CPU
-                if(cur_iter == 0) block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, cur_iter);
-                else block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag, cur_iter);
+                if(cur_iter == 0)
+                  block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
+                else
+                  block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);
                 //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
                 //PERFORM LEAST SQUARE problems solution
                 //PHASE TWO
                 //timer.start();
-                least_square_solve<SparseVectorType, ScalarType>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, cur_iter);
+                least_square_solve<SparseVectorType, ScalarType>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
                 //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
                 if(tag.getIsStatic()) break;
                 cur_iter++;
@@ -746,9 +767,9 @@ namespace viennacl
             M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
             viennacl::copy(cpu_M, M);
         }
-        
-      }        
+
+      }
     }
   }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/detail/spai/spai_tag.hpp b/viennacl/linalg/detail/spai/spai_tag.hpp
index be73102..1cfd41c 100644
--- a/viennacl/linalg/detail/spai/spai_tag.hpp
+++ b/viennacl/linalg/detail/spai/spai_tag.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,14 +20,12 @@
 
 
 /** @file viennacl/linalg/detail/spai/spai_tag.hpp
-    @brief Implementation of the spai tag holding SPAI configuration parameters. Experimental in 1.2.x.
-    
+    @brief Implementation of the spai tag holding SPAI configuration parameters. Experimental.
+
     SPAI code contributed by Nikolay Lukash
 */
 
 
-
-//#define __NO_STD_VECTOR
 #include <utility>
 #include <iostream>
 #include <fstream>
@@ -44,7 +43,7 @@
 #include "boost/numeric/ublas/io.hpp"
 #include "boost/numeric/ublas/matrix_expression.hpp"
 #include "boost/numeric/ublas/detail/matrix_assign.hpp"
-//#include "boost/thread/thread.hpp"
+
 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
 #include "viennacl/linalg/detail/spai/block_vector.hpp"
 
@@ -56,7 +55,7 @@ namespace viennacl
       {
         namespace spai
         {
-        
+
           /** @brief A tag for SPAI
           * Contains values for the algorithm.
           * Must be passed to spai_precond constructor
@@ -77,71 +76,71 @@ namespace viennacl
                       double residual_threshold = 1e-2,
                       bool is_static = false,
                       bool is_right = false) :
-              _residual_norm_threshold(residual_norm_threshold),
-              _iteration_limit(iteration_limit),
-              _residual_threshold(residual_threshold),
-              _is_static(is_static),
-              _is_right(is_right) {};
-              
+              residual_norm_threshold_(residual_norm_threshold),
+              iteration_limit_(iteration_limit),
+              residual_threshold_(residual_threshold),
+              is_static_(is_static),
+              is_right_(is_right) {}
+
               double getResidualNormThreshold() const
-              { return _residual_norm_threshold; }
-              
+              { return residual_norm_threshold_; }
+
               double getResidualThreshold() const
-              { return _residual_threshold; }
-              
+              { return residual_threshold_; }
+
               unsigned int getIterationLimit () const
-              { return _iteration_limit; }
-              
+              { return iteration_limit_; }
+
               bool getIsStatic() const
-              { return _is_static; }
-              
+              { return is_static_; }
+
               bool getIsRight() const
-              { return _is_right; }
-              
+              { return is_right_; }
+
               long getBegInd() const
-              { return _beg_ind; }
-              
-              long getEndInd() const 
-              { return _end_ind; }
-              
-              
-              
+              { return beg_ind_; }
+
+              long getEndInd() const
+              { return end_ind_; }
+
+
+
               void setResidualNormThreshold(double residual_norm_threshold)
               {
                   if(residual_norm_threshold > 0)
-                      _residual_norm_threshold = residual_norm_threshold;
+                      residual_norm_threshold_ = residual_norm_threshold;
               }
-              
+
               void setResidualThreshold(double residual_threshold)
               {
                   if(residual_threshold > 0)
-                      _residual_threshold = residual_threshold;
+                      residual_threshold_ = residual_threshold;
               }
-              
+
               void setIterationLimit(unsigned int iteration_limit)
               {
                   if(iteration_limit > 0)
-                      _iteration_limit = iteration_limit;
+                      iteration_limit_ = iteration_limit;
               }
-              
-              void setIsRight(bool is_right) { _is_right = is_right; }
-              
-              void setIsStatic(bool is_static) { _is_static = is_static; }
-              
-              void setBegInd(long beg_ind) { _beg_ind = beg_ind; }
-              
-              void setEndInd(long end_ind){ _end_ind = end_ind; }
-              
-              
+
+              void setIsRight(bool is_right) { is_right_ = is_right; }
+
+              void setIsStatic(bool is_static) { is_static_ = is_static; }
+
+              void setBegInd(long beg_ind) { beg_ind_ = beg_ind; }
+
+              void setEndInd(long end_ind){ end_ind_ = end_ind; }
+
+
             private:
-              double _residual_norm_threshold;
-              unsigned int _iteration_limit;
-              long _beg_ind, _end_ind;
-              double _residual_threshold;
-              bool _is_static;
-              bool _is_right;
+              double residual_norm_threshold_;
+              unsigned int iteration_limit_;
+              long beg_ind_, end_ind_;
+              double residual_threshold_;
+              bool is_static_;
+              bool is_right_;
           };
-        
+
         }
       }
     }
diff --git a/viennacl/linalg/detail/spai/sparse_vector.hpp b/viennacl/linalg/detail/spai/sparse_vector.hpp
index a832a58..8684b65 100644
--- a/viennacl/linalg/detail/spai/sparse_vector.hpp
+++ b/viennacl/linalg/detail/spai/sparse_vector.hpp
@@ -2,24 +2,25 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/detail/spai/sparse_vector.hpp
-    @brief Implementation of a helper sparse vector class for SPAI. Experimental in 1.2.x.
-    
+    @brief Implementation of a helper sparse vector class for SPAI. Experimental.
+
     SPAI code contributed by Nikolay Lukash
 */
 
@@ -43,7 +44,7 @@ namespace viennacl
     {
       namespace spai
       {
-        
+
         /**
          * @brief Represents sparse vector based on std::map<unsigned int, ScalarType>
          */
@@ -52,55 +53,54 @@ namespace viennacl
         public:
             typedef typename std::map<unsigned int, ScalarType>::iterator iterator;
             typedef typename std::map<unsigned int, ScalarType>::const_iterator const_iterator;
-            sparse_vector(){
-            }
-            
+            sparse_vector() {}
+
             /** @brief Set the index of the vector in the original matrix
              * May only be called once.
              */
             //getter
             ScalarType& operator[] (const unsigned int ind){
-                return _v[ind];
-                
+                return v_[ind];
+
             }
-            
+
             void clear(){
-                _v.clear();
+                v_.clear();
             }
-            
+
             const_iterator find(const unsigned int var) const{
-                return _v.find(var);
+                return v_.find(var);
             }
-            
+
             iterator find(const unsigned int var){
-                return _v.find(var);
+                return v_.find(var);
             }
-            
+
             const_iterator begin() const{
-                return _v.begin();
+                return v_.begin();
             }
-            
+
             const_iterator end() const{
-                return _v.end();
+                return v_.end();
             }
-            
-            
+
+
             iterator begin(){
-                return _v.begin();
+                return v_.begin();
             }
-            
+
             iterator end(){
-                return _v.end();
+                return v_.end();
             }
-            
-            
+
+
         private:
-            unsigned int _size;
-            std::map<unsigned int, ScalarType> _v;
+            unsigned int size_;
+            std::map<unsigned int, ScalarType> v_;
         };
       }
     }
   }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/direct_solve.hpp b/viennacl/linalg/direct_solve.hpp
index 2edb387..0eab6f4 100644
--- a/viennacl/linalg/direct_solve.hpp
+++ b/viennacl/linalg/direct_solve.hpp
@@ -1,396 +1,383 @@
-#ifndef VIENNACL_DIRECT_SOLVE_HPP_
-#define VIENNACL_DIRECT_SOLVE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file direct_solve.hpp
-    @brief Implementations of dense direct solvers are found here.
-*/
-
-#include "viennacl/vector.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
-#include "viennacl/tools/matrix_solve_kernel_class_deducer.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-
-
-namespace viennacl
-{
-  namespace linalg
-  {
-    ////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
-    /** @brief Direct inplace solver for dense upper triangular systems
-    *
-    * @param mat    The system matrix
-    * @param B      The matrix of row vectors, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int A1, unsigned int A2, typename SOLVERTAG>
-    void inplace_solve(const matrix<SCALARTYPE, F1, A1> & mat,
-                       matrix<SCALARTYPE, F2, A2> & B,
-                       SOLVERTAG)
-    {
-      assert(mat.size1() == mat.size2());
-      assert(mat.size2() == B.size1());
-      
-      typedef typename viennacl::tools::MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F1, A1>,
-                                                                           matrix<SCALARTYPE, F2, A2> >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      std::stringstream ss;
-      ss << SOLVERTAG::name() << "_solve";
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), ss.str());
-
-      k.global_work_size(0, B.size2() * k.local_work_size());
-      viennacl::ocl::enqueue(k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
-                                    cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()),
-                               B,   cl_uint(B.size1()), cl_uint(B.size2()),
-                                    cl_uint(B.internal_size1()), cl_uint(B.internal_size2()))
-                            );        
-    }
-    
-    /** @brief Direct inplace solver for dense upper triangular systems
-    *
-    * @param mat    The system matrix
-    * @param B      The (transposed) matrix of row vectors, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int A1, unsigned int A2, typename SOLVERTAG>
-    void inplace_solve(const matrix<SCALARTYPE, F1, A1> & mat,
-                       const matrix_expression< const matrix<SCALARTYPE, F2, A2>,
-                                                const matrix<SCALARTYPE, F2, A2>,
-                                                op_trans> & B,
-                       SOLVERTAG)
-    {
-      assert(mat.size1() == mat.size2());
-      assert(mat.size2() == B.lhs().size2());
-      
-      typedef typename viennacl::tools::MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F1, A1>,
-                                                                           matrix<SCALARTYPE, F2, A2> >::ResultType    KernelClass;
-      KernelClass::init();
-
-      std::stringstream ss;
-      ss << SOLVERTAG::name() << "_trans_solve";
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), ss.str());
-
-      k.global_work_size(0, B.lhs().size1() * k.local_work_size());
-      viennacl::ocl::enqueue(k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
-                                    cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()),
-                               B.lhs(), cl_uint(B.lhs().size1()), cl_uint(B.lhs().size2()),
-                                        cl_uint(B.lhs().internal_size1()), cl_uint(B.lhs().internal_size2()))
-                            );     
-    }
-    
-    //upper triangular solver for transposed lower triangular matrices
-    /** @brief Direct inplace solver for dense upper triangular systems that stem from transposed lower triangular systems
-    *
-    * @param proxy    The system matrix proxy
-    * @param B        The matrix holding the load vectors, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int A1, unsigned int A2, typename SOLVERTAG>
-    void inplace_solve(const matrix_expression< const matrix<SCALARTYPE, F1, A1>,
-                                                const matrix<SCALARTYPE, F1, A1>,
-                                                op_trans> & proxy,
-                       matrix<SCALARTYPE, F2, A2> & B,
-                       SOLVERTAG)
-    {
-      assert(proxy.lhs().size1() == proxy.lhs().size2());
-      assert(proxy.lhs().size2() == B.size1());
-      
-      typedef typename viennacl::tools::MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F1, A1>,
-                                                                           matrix<SCALARTYPE, F2, A2> >::ResultType    KernelClass;
-      KernelClass::init();
-
-      std::stringstream ss;
-      ss << "trans_" << SOLVERTAG::name() << "_solve";
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), ss.str());
-
-      k.global_work_size(0, B.size2() * k.local_work_size());
-      viennacl::ocl::enqueue(k(proxy.lhs(), cl_uint(proxy.lhs().size1()), cl_uint(proxy.lhs().size2()),
-                                            cl_uint(proxy.lhs().internal_size1()), cl_uint(proxy.lhs().internal_size2()),
-                                         B, cl_uint(B.size1()), cl_uint(B.size2()),
-                                            cl_uint(B.internal_size1()), cl_uint(B.internal_size2()))
-                            );        
-    }
-
-    /** @brief Direct inplace solver for dense upper triangular systems that stem from transposed lower triangular systems
-    *
-    * @param proxy    The system matrix proxy
-    * @param B        The matrix holding the load vectors, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int A1, unsigned int A2, typename SOLVERTAG>
-    void inplace_solve(const matrix_expression< const matrix<SCALARTYPE, F1, A1>,
-                                                const matrix<SCALARTYPE, F1, A1>,
-                                                op_trans> & proxy,
-                       const matrix_expression< const matrix<SCALARTYPE, F2, A2>,
-                                                const matrix<SCALARTYPE, F2, A2>,
-                                                op_trans> & B,
-                       SOLVERTAG)
-    {
-      assert(proxy.lhs().size1() == proxy.lhs().size2());
-      assert(proxy.lhs().size2() == B.lhs().size2());
-      
-      typedef typename viennacl::tools::MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F1, A1>,
-                                                                           matrix<SCALARTYPE, F2, A2> >::ResultType    KernelClass;
-      KernelClass::init();
-
-      std::stringstream ss;
-      ss << "trans_" << SOLVERTAG::name() << "_trans_solve";
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), ss.str());
-
-      k.global_work_size(0, B.lhs().size1() * k.local_work_size());
-      viennacl::ocl::enqueue(k(proxy.lhs(), cl_uint(proxy.lhs().size1()), cl_uint(proxy.lhs().size2()),
-                                            cl_uint(proxy.lhs().internal_size1()), cl_uint(proxy.lhs().internal_size2()),
-                               B.lhs(), cl_uint(B.lhs().size1()), cl_uint(B.lhs().size2()),
-                                        cl_uint(B.lhs().internal_size1()), cl_uint(B.lhs().internal_size2()))
-                            );        
-    }
-
-    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VEC_ALIGNMENT, typename SOLVERTAG>
-    void inplace_solve(const matrix<SCALARTYPE, F, ALIGNMENT> & mat,
-                       vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
-                       SOLVERTAG)
-    {
-      assert(mat.size1() == vec.size());
-      assert(mat.size2() == vec.size());
-      
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-
-      std::stringstream ss;
-      ss << SOLVERTAG::name() << "_triangular_substitute_inplace";
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), ss.str());
-
-      k.global_work_size(0, k.local_work_size());
-      viennacl::ocl::enqueue(k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()), 
-                                    cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()), vec));        
-    }
-
-    /** @brief Direct inplace solver for dense upper triangular systems that stem from transposed lower triangular systems
-    *
-    * @param proxy    The system matrix proxy
-    * @param vec    The load vector, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VEC_ALIGNMENT, typename SOLVERTAG>
-    void inplace_solve(const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                op_trans> & proxy,
-                       vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
-                       SOLVERTAG)
-    {
-      assert(proxy.lhs().size1() == vec.size());
-      assert(proxy.lhs().size2() == vec.size());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      std::stringstream ss;
-      ss << "trans_" << SOLVERTAG::name() << "_triangular_substitute_inplace";
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), ss.str());
-      
-      k.global_work_size(0, k.local_work_size());
-      viennacl::ocl::enqueue(k(proxy.lhs(), cl_uint(proxy.lhs().size1()), cl_uint(proxy.lhs().size2()),
-                                            cl_uint(proxy.lhs().internal_size1()), cl_uint(proxy.lhs().internal_size2()), vec));        
-    }
-    
-    /////////////////// general wrappers for non-inplace solution //////////////////////    
-
-    /** @brief Convenience functions for C = solve(A, B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
-    *
-    * @param A    The system matrix
-    * @param B    The matrix of load vectors
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int ALIGNMENT_A, unsigned int ALIGNMENT_B, typename TAG>
-    matrix<SCALARTYPE, F2, ALIGNMENT_B> solve(const matrix<SCALARTYPE, F1, ALIGNMENT_A> & A,
-                                        const matrix<SCALARTYPE, F2, ALIGNMENT_B> & B,
-                                        TAG const & tag)
-    {
-      // do an inplace solve on the result vector:
-      matrix<SCALARTYPE, F2, ALIGNMENT_A> result(B.size1(), B.size2());
-      result = B;
-    
-      inplace_solve(A, result, tag);
-    
-      return result;
-    }
-
-    /** @brief Convenience functions for C = solve(A, B^T, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
-    *
-    * @param A    The system matrix
-    * @param proxy  The transposed load vector
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int ALIGNMENT_A, unsigned int ALIGNMENT_B, typename TAG>
-    matrix<SCALARTYPE, F2, ALIGNMENT_B> solve(const matrix<SCALARTYPE, F1, ALIGNMENT_A> & A,
-                                        const matrix_expression< const matrix<SCALARTYPE, F2, ALIGNMENT_B>,
-                                                                     const matrix<SCALARTYPE, F2, ALIGNMENT_B>,
-                                                                     op_trans> & proxy,
-                                        TAG const & tag)
-    {
-      // do an inplace solve on the result vector:
-      matrix<SCALARTYPE, F2, ALIGNMENT_B> result(proxy.lhs().size2(), proxy.lhs().size1());
-      result = proxy;
-    
-      inplace_solve(A, result, tag);
-    
-      return result;
-    }
-
-    /** @brief Convenience functions for result = solve(mat, vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
-    *
-    * @param mat    The system matrix
-    * @param vec    The load vector
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VEC_ALIGNMENT, typename TAG>
-    vector<SCALARTYPE, VEC_ALIGNMENT> solve(const matrix<SCALARTYPE, F, ALIGNMENT> & mat,
-                                        const vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
-                                        TAG const & tag)
-    {
-      // do an inplace solve on the result vector:
-      vector<SCALARTYPE, VEC_ALIGNMENT> result(vec.size());
-      result = vec;
-    
-      inplace_solve(mat, result, tag);
-    
-      return result;
-    }
-    
-    
-    ///////////// transposed system matrix:
-    /** @brief Convenience functions for result = solve(trans(mat), B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
-    *
-    * @param proxy  The transposed system matrix proxy
-    * @param B      The matrix of load vectors
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int ALIGNMENT_A, unsigned int ALIGNMENT_B, typename TAG>
-    matrix<SCALARTYPE, F2, ALIGNMENT_B> solve(const matrix_expression< const matrix<SCALARTYPE, F1, ALIGNMENT_A>,
-                                                                     const matrix<SCALARTYPE, F1, ALIGNMENT_A>,
-                                                                     op_trans> & proxy,
-                                            const matrix<SCALARTYPE, F2, ALIGNMENT_B> & B,
-                                            TAG const & tag)
-    {
-      // do an inplace solve on the result vector:
-      matrix<SCALARTYPE, F2, ALIGNMENT_B> result(B.size1(), B.size2());
-      result = B;
-    
-      inplace_solve(proxy, result, tag);
-    
-      return result;
-    }
-    
-    
-    /** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
-    *
-    * @param proxy_A  The transposed system matrix proxy
-    * @param proxy_B  The transposed matrix of load vectors, where the solution is directly written to
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int ALIGNMENT_A, unsigned int ALIGNMENT_B, typename TAG>
-    matrix<SCALARTYPE, F2, ALIGNMENT_B> solve(const matrix_expression< const matrix<SCALARTYPE, F1, ALIGNMENT_A>,
-                                                                     const matrix<SCALARTYPE, F1, ALIGNMENT_A>,
-                                                                     op_trans> & proxy_A,
-                                            const matrix_expression< const matrix<SCALARTYPE, F2, ALIGNMENT_B>,
-                                                                     const matrix<SCALARTYPE, F2, ALIGNMENT_B>,
-                                                                     op_trans> & proxy_B,
-                                            TAG const & tag)
-    {
-      // do an inplace solve on the result vector:
-      matrix<SCALARTYPE, F2, ALIGNMENT_B> result(proxy_B.lhs().size2(), proxy_B.lhs().size1());
-      result = trans(proxy_B.lhs());
-    
-      inplace_solve(proxy_A, result, tag);
-    
-      return result;
-    }
-    
-    /** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
-    *
-    * @param proxy  The transposed system matrix proxy
-    * @param vec    The load vector, where the solution is directly written to
-    * @param tag    Dispatch tag
-    */
-    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VEC_ALIGNMENT, typename TAG>
-    vector<SCALARTYPE, VEC_ALIGNMENT> solve(const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                     const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                     op_trans> & proxy,
-                                            const vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
-                                            TAG const & tag)
-    {
-      // do an inplace solve on the result vector:
-      vector<SCALARTYPE, VEC_ALIGNMENT> result(vec.size());
-      result = vec;
-    
-      inplace_solve(proxy, result, tag);
-    
-      return result;
-    }
-    
-    
-    ///////////////////////////// lu factorization ///////////////////////
-    /** @brief LU factorization of a dense matrix.
-    *
-    * @param mat    The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
-    */
-    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void lu_factorize(matrix<SCALARTYPE, F, ALIGNMENT> & mat)
-    {
-      assert(mat.size1() == mat.size2());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "lu_factorize");
-      
-      k.global_work_size(0, k.local_work_size());
-      viennacl::ocl::enqueue(k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
-                                    cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2())) );        
-    }
-
-
-    /** @brief LU substitution for the system LU = rhs.
-    *
-    * @param A    The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
-    * @param B    The matrix of load vectors, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F1, typename F2, unsigned int ALIGNMENT_A, unsigned int ALIGNMENT_B>
-    void lu_substitute(matrix<SCALARTYPE, F1, ALIGNMENT_A> const & A,
-                       matrix<SCALARTYPE, F2, ALIGNMENT_B> & B)
-    {
-      assert(A.size1() == A.size2());
-      assert(A.size1() == A.size2());
-      inplace_solve(A, B, unit_lower_tag());
-      inplace_solve(A, B, upper_tag());
-    }
-
-    /** @brief LU substitution for the system LU = rhs.
-    *
-    * @param mat    The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
-    * @param vec    The load vector, where the solution is directly written to
-    */
-    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VEC_ALIGNMENT>
-    void lu_substitute(matrix<SCALARTYPE, F, ALIGNMENT> const & mat,
-                       vector<SCALARTYPE, VEC_ALIGNMENT> & vec)
-    {
-      assert(mat.size1() == mat.size2());
-      inplace_solve(mat, vec, unit_lower_tag());
-      inplace_solve(mat, vec, upper_tag());
-    }
-
-  }
-}
-
-#endif
+#ifndef VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+#define VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/direct_solve.hpp
+    @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/host_based/direct_solve.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/direct_solve.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/direct_solve.hpp"
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    //
+    // A \ B:
+    //
+
+    /** @brief Direct inplace solver for dense triangular systems. Matlab notation: A \ B
+    *
+    * @param A    The system matrix
+    * @param B    The matrix of row vectors, where the solution is directly written to
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    void inplace_solve(const matrix_base<NumericT, F1> & A, matrix_base<NumericT, F2> & B, SOLVERTAG)
+    {
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A)) && bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size1(B)) && bool("Size check failed in inplace_solve(): size1(A) != size1(B)"));
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(A, B, SOLVERTAG());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(A, B, SOLVERTAG());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(A, B, SOLVERTAG());
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief Direct inplace solver for dense triangular systems with transposed right hand side
+    *
+    * @param A       The system matrix
+    * @param proxy_B The transposed matrix of row vectors, where the solution is directly written to
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    void inplace_solve(const matrix_base<NumericT, F1> & A,
+                       matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B,
+                       SOLVERTAG)
+    {
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A))       && bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size1(proxy_B)) && bool("Size check failed in inplace_solve(): size1(A) != size1(B^T)"));
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(A, proxy_B, SOLVERTAG());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(A, proxy_B, SOLVERTAG());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(A, proxy_B, SOLVERTAG());
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    //upper triangular solver for transposed lower triangular matrices
+    /** @brief Direct inplace solver for dense triangular systems that stem from transposed triangular systems
+    *
+    * @param proxy_A  The system matrix proxy
+    * @param B        The matrix holding the load vectors, where the solution is directly written to
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                       matrix_base<NumericT, F2> & B,
+                       SOLVERTAG)
+    {
+      assert( (viennacl::traits::size1(proxy_A) == viennacl::traits::size2(proxy_A)) && bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+      assert( (viennacl::traits::size1(proxy_A) == viennacl::traits::size1(B))       && bool("Size check failed in inplace_solve(): size1(A^T) != size1(B)"));
+
+      switch (viennacl::traits::handle(proxy_A.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(proxy_A, B, SOLVERTAG());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(proxy_A, B, SOLVERTAG());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(proxy_A, B, SOLVERTAG());
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief Direct inplace solver for dense transposed triangular systems with transposed right hand side. Matlab notation: A' \ B'
+    *
+    * @param proxy_A  The system matrix proxy
+    * @param proxy_B  The matrix holding the load vectors, where the solution is directly written to
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                             matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans>   proxy_B,
+                       SOLVERTAG)
+    {
+      assert( (viennacl::traits::size1(proxy_A) == viennacl::traits::size2(proxy_A)) && bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+      assert( (viennacl::traits::size1(proxy_A) == viennacl::traits::size1(proxy_B)) && bool("Size check failed in inplace_solve(): size1(A^T) != size1(B^T)"));
+
+      switch (viennacl::traits::handle(proxy_A.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(proxy_A, proxy_B, SOLVERTAG());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(proxy_A, proxy_B, SOLVERTAG());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(proxy_A, proxy_B, SOLVERTAG());
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    //
+    // A \ b
+    //
+
+    template <typename NumericT, typename F, typename SOLVERTAG>
+    void inplace_solve(const matrix_base<NumericT, F> & mat,
+                             vector_base<NumericT> & vec,
+                       SOLVERTAG)
+    {
+      assert( (mat.size1() == vec.size()) && bool("Size check failed in inplace_solve(): size1(A) != size(b)"));
+      assert( (mat.size2() == vec.size()) && bool("Size check failed in inplace_solve(): size2(A) != size(b)"));
+
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(mat, vec, SOLVERTAG());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(mat, vec, SOLVERTAG());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(mat, vec, SOLVERTAG());
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief Direct inplace solver for dense upper triangular systems that stem from transposed lower triangular systems
+    *
+    * @param proxy    The system matrix proxy
+    * @param vec    The load vector, where the solution is directly written to
+    */
+    template <typename NumericT, typename F, typename SOLVERTAG>
+    void inplace_solve(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & proxy,
+                       vector_base<NumericT> & vec,
+                       SOLVERTAG)
+    {
+      assert( (proxy.lhs().size1() == vec.size()) && bool("Size check failed in inplace_solve(): size1(A) != size(b)"));
+      assert( (proxy.lhs().size2() == vec.size()) && bool("Size check failed in inplace_solve(): size2(A) != size(b)"));
+
+      switch (viennacl::traits::handle(proxy.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(proxy, vec, SOLVERTAG());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(proxy, vec, SOLVERTAG());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(proxy, vec, SOLVERTAG());
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /////////////////// general wrappers for non-inplace solution //////////////////////
+
+
+    /** @brief Convenience functions for C = solve(A, B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+    *
+    * @param A    The system matrix
+    * @param B    The matrix of load vectors
+    * @param tag    Dispatch tag
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    matrix<NumericT, F2> solve(const matrix_base<NumericT, F1> & A,
+                               const matrix_base<NumericT, F2> & B,
+                               SOLVERTAG tag)
+    {
+      // do an inplace solve on the result vector:
+      matrix<NumericT, F2> result(B);
+
+      inplace_solve(A, result, tag);
+
+      return result;
+    }
+
+
+    //////////
+
+    /** @brief Convenience functions for C = solve(A, B^T, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+    *
+    * @param A    The system matrix
+    * @param proxy  The transposed load vector
+    * @param tag    Dispatch tag
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    matrix<NumericT, F2> solve(const matrix_base<NumericT, F1> & A,
+                               const matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & proxy,
+                               SOLVERTAG tag)
+    {
+      // do an inplace solve on the result vector:
+      matrix<NumericT, F2> result(proxy);
+
+      inplace_solve(A, result, tag);
+
+      return result;
+    }
+
+    /** @brief Convenience functions for result = solve(mat, vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
+    *
+    * @param mat    The system matrix
+    * @param vec    The load vector
+    * @param tag    Dispatch tag
+    */
+    template <typename NumericT, typename F1, typename SOLVERTAG>
+    vector<NumericT> solve(const matrix_base<NumericT, F1> & mat,
+                           const vector_base<NumericT> & vec,
+                           SOLVERTAG const & tag)
+    {
+      // do an inplace solve on the result vector:
+      vector<NumericT> result(vec);
+
+      inplace_solve(mat, result, tag);
+
+      return result;
+    }
+
+
+    ///////////// transposed system matrix:
+    /** @brief Convenience functions for result = solve(trans(mat), B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+    *
+    * @param proxy  The transposed system matrix proxy
+    * @param B      The matrix of load vectors
+    * @param tag    Dispatch tag
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    matrix<NumericT, F2> solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy,
+                               const matrix_base<NumericT, F2> & B,
+                               SOLVERTAG tag)
+    {
+      // do an inplace solve on the result vector:
+      matrix<NumericT, F2> result(B);
+
+      inplace_solve(proxy, result, tag);
+
+      return result;
+    }
+
+
+    /** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
+    *
+    * @param proxy_A  The transposed system matrix proxy
+    * @param proxy_B  The transposed matrix of load vectors, where the solution is directly written to
+    * @param tag    Dispatch tag
+    */
+    template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+    matrix<NumericT, F2> solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                               const matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & proxy_B,
+                               SOLVERTAG tag)
+    {
+      // do an inplace solve on the result vector:
+      matrix<NumericT, F2> result(proxy_B);
+
+      inplace_solve(proxy_A, result, tag);
+
+      return result;
+    }
+
+    /** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
+    *
+    * @param proxy  The transposed system matrix proxy
+    * @param vec    The load vector, where the solution is directly written to
+    * @param tag    Dispatch tag
+    */
+    template <typename NumericT, typename F1, typename SOLVERTAG>
+    vector<NumericT> solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy,
+                           const vector_base<NumericT> & vec,
+                           SOLVERTAG const & tag)
+    {
+      // do an inplace solve on the result vector:
+      vector<NumericT> result(vec);
+
+      inplace_solve(proxy, result, tag);
+
+      return result;
+    }
+
+
+  }
+}
+
+#endif
diff --git a/viennacl/misc/bandwidth_reduction.hpp b/viennacl/linalg/eig.hpp
similarity index 53%
copy from viennacl/misc/bandwidth_reduction.hpp
copy to viennacl/linalg/eig.hpp
index be237b8..8479f94 100644
--- a/viennacl/misc/bandwidth_reduction.hpp
+++ b/viennacl/linalg/eig.hpp
@@ -1,37 +1,29 @@
-#ifndef VIENNACL_MISC_BANDWIDTH_REDUCTION_HPP
-#define VIENNACL_MISC_BANDWIDTH_REDUCTION_HPP
+#ifndef VIENNACL_LINALG_EIG_HPP_
+#define VIENNACL_LINALG_EIG_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-
-/** @file viennacl/misc/bandwidth_reduction.hpp
-    @brief Convenience include for bandwidth reduction algorithms such as Cuthill-McKee or Gibbs-Poole-Stockmeyer.  Experimental in 1.2.x.
+/** @file viennacl/linalg/eig.hpp
+*   @brief Convenience header file including all available eigenvalue algorithms
 */
 
-#include "viennacl/misc/cuthill_mckee.hpp"
-#include "viennacl/misc/gibbs_poole_stockmeyer.hpp"
-
-
-namespace viennacl
-{
-  //TODO: Add convenience overload here. Which should be default?
-  
-  
-} //namespace viennacl
-    
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/lanczos.hpp"
+#include "viennacl/linalg/power_iter.hpp"
 
 #endif
diff --git a/viennacl/linalg/gmres.hpp b/viennacl/linalg/gmres.hpp
index ca3d704..7768763 100644
--- a/viennacl/linalg/gmres.hpp
+++ b/viennacl/linalg/gmres.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_GMRES_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -37,7 +38,7 @@ namespace viennacl
 {
   namespace linalg
   {
-    
+
     /** @brief A tag for the solver GMRES. Used for supplying solver parameters and for dispatching the solve() function
     */
     class gmres_tag       //generalized minimum residual
@@ -49,64 +50,107 @@ namespace viennacl
         * @param max_iterations The maximum number of iterations (including restarts
         * @param krylov_dim     The maximum dimension of the Krylov space before restart (number of restarts is found by max_iterations / krylov_dim)
         */
-        gmres_tag(double tol = 1e-10, unsigned int max_iterations = 300, unsigned int krylov_dim = 20) 
-         : _tol(tol), _iterations(max_iterations), _krylov_dim(krylov_dim), iters_taken_(0) {};
-        
+        gmres_tag(double tol = 1e-10, unsigned int max_iterations = 300, unsigned int krylov_dim = 20)
+         : tol_(tol), iterations_(max_iterations), krylov_dim_(krylov_dim), iters_taken_(0) {}
+
         /** @brief Returns the relative tolerance */
-        double tolerance() const { return _tol; }
+        double tolerance() const { return tol_; }
         /** @brief Returns the maximum number of iterations */
-        unsigned int max_iterations() const { return _iterations; }
+        unsigned int max_iterations() const { return iterations_; }
         /** @brief Returns the maximum dimension of the Krylov space before restart */
-        unsigned int krylov_dim() const { return _krylov_dim; }
+        unsigned int krylov_dim() const { return krylov_dim_; }
         /** @brief Returns the maximum number of GMRES restarts */
         unsigned int max_restarts() const
-        { 
-          unsigned int ret = _iterations / _krylov_dim;
-          if (ret > 0 && (ret * _krylov_dim == _iterations) )
+        {
+          unsigned int ret = iterations_ / krylov_dim_;
+          if (ret > 0 && (ret * krylov_dim_ == iterations_) )
             return ret - 1;
           return ret;
         }
-        
+
         /** @brief Return the number of solver iterations: */
         unsigned int iters() const { return iters_taken_; }
         /** @brief Set the number of solver iterations (should only be modified by the solver) */
         void iters(unsigned int i) const { iters_taken_ = i; }
-        
+
         /** @brief Returns the estimated relative error at the end of the solver run */
         double error() const { return last_error_; }
         /** @brief Sets the estimated relative error at the end of the solver run */
         void error(double e) const { last_error_ = e; }
-        
+
       private:
-        double _tol;
-        unsigned int _iterations;
-        unsigned int _krylov_dim;
-        
+        double tol_;
+        unsigned int iterations_;
+        unsigned int krylov_dim_;
+
         //return values from solver
         mutable unsigned int iters_taken_;
         mutable double last_error_;
     };
-    
-    namespace
+
+    namespace detail
     {
-      
+
       template <typename SRC_VECTOR, typename DEST_VECTOR>
-      void gmres_copy_helper(SRC_VECTOR const & src, DEST_VECTOR & dest, unsigned int len)
+      void gmres_copy_helper(SRC_VECTOR const & src, DEST_VECTOR & dest, vcl_size_t len, vcl_size_t start = 0)
       {
-        for (unsigned int i=0; i<len; ++i)
-          dest[i] = src[i];
+        for (vcl_size_t i=0; i<len; ++i)
+          dest[start+i] = src[start+i];
       }
 
       template <typename ScalarType, typename DEST_VECTOR>
-      void gmres_copy_helper(viennacl::vector<ScalarType> const & src, DEST_VECTOR & dest, unsigned int len)
+      void gmres_copy_helper(viennacl::vector<ScalarType> const & src, DEST_VECTOR & dest, vcl_size_t len, vcl_size_t start = 0)
       {
-        viennacl::copy(src.begin(), src.begin() + len, dest.begin());
+        typedef typename viennacl::vector<ScalarType>::difference_type   difference_type;
+        viennacl::copy( src.begin() + static_cast<difference_type>(start),
+                        src.begin() + static_cast<difference_type>(start + len),
+                       dest.begin() + static_cast<difference_type>(start));
       }
 
-      template <typename ScalarType>
-      void gmres_copy_helper(viennacl::vector<ScalarType> const & src, viennacl::vector<ScalarType> & dest, unsigned int len)
+      /** @brief Computes the householder vector 'hh_vec' which rotates 'input_vec' such that all entries below the j-th entry of 'v' become zero.
+        *
+        * @param input_vec       The input vector
+        * @param hh_vec          The householder vector defining the relection (I - beta * hh_vec * hh_vec^T)
+        * @param beta            The coefficient beta in (I - beta  * hh_vec * hh_vec^T)
+        * @param mu              The norm of the input vector part relevant for the reflection: norm_2(input_vec[j:size])
+        * @param j               Index of the last nonzero index in 'input_vec' after applying the reflection
+      */
+      template <typename VectorType, typename ScalarType>
+      void gmres_setup_householder_vector(VectorType const & input_vec, VectorType & hh_vec, ScalarType & beta, ScalarType & mu, vcl_size_t j)
       {
-        viennacl::copy(src.begin(), src.begin() + len, dest.begin());
+        ScalarType input_j = input_vec(j);
+
+        // copy entries from input vector to householder vector:
+        detail::gmres_copy_helper(input_vec, hh_vec, viennacl::traits::size(hh_vec) - (j+1), j+1);
+
+        ScalarType sigma = viennacl::linalg::norm_2(hh_vec);
+        sigma *= sigma;
+
+        if (sigma == 0)
+        {
+          beta = 0;
+          mu = input_j;
+        }
+        else
+        {
+          mu = std::sqrt(sigma + input_j*input_j);
+
+          ScalarType hh_vec_0 = (input_j <= 0) ? (input_j - mu) : (-sigma / (input_j + mu));
+
+          beta = ScalarType(2) * hh_vec_0 * hh_vec_0 / (sigma + hh_vec_0 * hh_vec_0);
+
+          //divide hh_vec by its diagonal element hh_vec_0
+          hh_vec /= hh_vec_0;
+          hh_vec[j] = ScalarType(1);
+        }
+      }
+
+      // Apply (I - beta h h^T) to x (Householder reflection with Householder vector h)
+      template <typename VectorType, typename ScalarType>
+      void gmres_householder_reflect(VectorType & x, VectorType const & h, ScalarType beta)
+      {
+        ScalarType hT_in_x = viennacl::linalg::inner_prod(h, x);
+        x -= (beta * hT_in_x) * h;
       }
 
     }
@@ -126,72 +170,70 @@ namespace viennacl
     {
       typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
       typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
-      unsigned int problem_size = viennacl::traits::size(rhs);
-      VectorType result(problem_size);
+      unsigned int problem_size = static_cast<unsigned int>(viennacl::traits::size(rhs));
+      VectorType result = rhs;
       viennacl::traits::clear(result);
+
       unsigned int krylov_dim = tag.krylov_dim();
       if (problem_size < tag.krylov_dim())
         krylov_dim = problem_size; //A Krylov space larger than the matrix would lead to seg-faults (mathematically, error is certain to be zero already)
-      
-      VectorType res(problem_size);
-      VectorType v_k_tilde(problem_size);
-      VectorType v_k_tilde_temp(problem_size);
-      
-      std::vector< std::vector<CPU_ScalarType> > R(krylov_dim);
+
+      VectorType res = rhs;
+      VectorType v_k_tilde = rhs;
+      VectorType v_k_tilde_temp = rhs;
+
+      std::vector< std::vector<CPU_ScalarType> > R(krylov_dim, std::vector<CPU_ScalarType>(tag.krylov_dim()));
       std::vector<CPU_ScalarType> projection_rhs(krylov_dim);
-      std::vector<VectorType> U(krylov_dim);
 
-      const CPU_ScalarType gpu_scalar_minus_1 = static_cast<CPU_ScalarType>(-1);    //representing the scalar '-1' on the GPU. Prevents blocking write operations
-      const CPU_ScalarType gpu_scalar_1 = static_cast<CPU_ScalarType>(1);    //representing the scalar '1' on the GPU. Prevents blocking write operations
-      const CPU_ScalarType gpu_scalar_2 = static_cast<CPU_ScalarType>(2);    //representing the scalar '2' on the GPU. Prevents blocking write operations
-      
+      std::vector<VectorType>      householder_reflectors(krylov_dim, rhs);
+      std::vector<CPU_ScalarType>  betas(krylov_dim);
+
       CPU_ScalarType norm_rhs = viennacl::linalg::norm_2(rhs);
-      
-      unsigned int k;
-      for (k = 0; k < krylov_dim; ++k)
-      {
-        R[k].resize(tag.krylov_dim()); 
-        viennacl::traits::resize(U[k], problem_size);
-      }
 
-      //std::cout << "Starting GMRES..." << std::endl;
+      if (norm_rhs == 0) //solution is zero if RHS norm is zero
+        return result;
+
       tag.iters(0);
-      
+
       for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
       {
-        //std::cout << "-- GMRES Start " << it << " -- " << std::endl;
-        
+        //
+        // (Re-)Initialize residual: r = b - A*x (without temporary for the result of A*x)
+        //
         res = rhs;
         res -= viennacl::linalg::prod(matrix, result);  //initial guess zero
         precond.apply(res);
-        //std::cout << "Residual: " << res << std::endl;
-        
-        CPU_ScalarType rho_0 = viennacl::linalg::norm_2(res); 
-        CPU_ScalarType rho = static_cast<CPU_ScalarType>(1.0);
-        //std::cout << "rho_0: " << rho_0 << std::endl;
 
-        if (rho_0 / norm_rhs < tag.tolerance() || (norm_rhs == CPU_ScalarType(0.0)) )
+        CPU_ScalarType rho_0 = viennacl::linalg::norm_2(res);
+
+        //
+        // Check for premature convergence
+        //
+        if (rho_0 / norm_rhs < tag.tolerance() ) // norm_rhs is known to be nonzero here
         {
-          //std::cout << "Allowed Error reached at begin of loop" << std::endl;
           tag.error(rho_0 / norm_rhs);
           return result;
         }
 
+        //
+        // Normalize residual and set 'rho' to 1 as requested in 'A Simpler GMRES' by Walker and Zhou.
+        //
         res /= rho_0;
-        //std::cout << "Normalized Residual: " << res << std::endl;
-        
-        for (k=0; k<krylov_dim; ++k)
-        {
-          viennacl::traits::clear(R[k]);
-          viennacl::traits::clear(U[k]);
-          R[k].resize(krylov_dim); 
-          viennacl::traits::resize(U[k], problem_size);
-        }
+        CPU_ScalarType rho = static_cast<CPU_ScalarType>(1.0);
+
 
+        //
+        // Iterate up until maximal Krylov space dimension is reached:
+        //
+        unsigned int k = 0;
         for (k = 0; k < krylov_dim; ++k)
         {
           tag.iters( tag.iters() + 1 ); //increase iteration counter
 
+          // prepare storage:
+          viennacl::traits::clear(R[k]);
+          viennacl::traits::clear(householder_reflectors[k]);
+
           //compute v_k = A * v_{k-1} via Householder matrices
           if (k == 0)
           {
@@ -201,156 +243,103 @@ namespace viennacl
           else
           {
             viennacl::traits::clear(v_k_tilde);
-            v_k_tilde[k-1] = gpu_scalar_1;
-            //Householder rotations part 1
+            v_k_tilde[k-1] = CPU_ScalarType(1);
+
+            //Householder rotations, part 1: Compute P_1 * P_2 * ... * P_{k-1} * e_{k-1}
             for (int i = k-1; i > -1; --i)
-              v_k_tilde -= U[i] * (viennacl::linalg::inner_prod(U[i], v_k_tilde) * gpu_scalar_2);
+              detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[i], betas[i]);
 
             v_k_tilde_temp = viennacl::linalg::prod(matrix, v_k_tilde);
             precond.apply(v_k_tilde_temp);
             v_k_tilde = v_k_tilde_temp;
 
-            //Householder rotations part 2
+            //Householder rotations, part 2: Compute P_{k-1} * ... * P_{1} * v_k_tilde
             for (unsigned int i = 0; i < k; ++i)
-              v_k_tilde -= U[i] * (viennacl::linalg::inner_prod(U[i], v_k_tilde) * gpu_scalar_2);
+              detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[i], betas[i]);
           }
-          
-          //std::cout << "v_k_tilde: " << v_k_tilde << std::endl;
-
-          viennacl::traits::clear(U[k]);
-          viennacl::traits::resize(U[k], problem_size);
-          //copy first k entries from v_k_tilde to U[k]:
-          gmres_copy_helper(v_k_tilde, U[k], k);
-          
-          U[k][k] = std::sqrt( viennacl::linalg::inner_prod(v_k_tilde, v_k_tilde) - viennacl::linalg::inner_prod(U[k], U[k]) );
-
-          if (fabs(U[k][k]) < CPU_ScalarType(10 * std::numeric_limits<CPU_ScalarType>::epsilon()))
-            break; //Note: Solution is essentially (up to round-off error) already in Krylov space. No need to proceed.
-          
-          //copy first k+1 entries from U[k] to R[k]
-          gmres_copy_helper(U[k], R[k], k+1);
-          
-          U[k] -= v_k_tilde;
-          //std::cout << "U[k] before normalization: " << U[k] << std::endl;
-          U[k] *= gpu_scalar_minus_1 / viennacl::linalg::norm_2( U[k] );
-          //std::cout << "Householder vector U[k]: " << U[k] << std::endl;
-          
-          //DEBUG: Make sure that P_k v_k_tilde equals (rho_{1,k}, ... , rho_{k,k}, 0, 0 )
-#ifdef VIENNACL_GMRES_DEBUG
-          std::cout << "P_k v_k_tilde: " << (v_k_tilde - 2.0 * U[k] * inner_prod(U[k], v_k_tilde)) << std::endl;
-          std::cout << "R[k]: [" << R[k].size() << "](";
-          for (size_t i=0; i<R[k].size(); ++i)
-            std::cout << R[k][i] << ",";
-          std::cout << ")" << std::endl;
-#endif
-          //std::cout << "P_k res: " << (res - 2.0 * U[k] * inner_prod(U[k], res)) << std::endl;
-          res -= U[k] * (viennacl::linalg::inner_prod( U[k], res ) * gpu_scalar_2);
-          //std::cout << "zeta_k: " << viennacl::linalg::inner_prod( U[k], res ) * gpu_scalar_2 << std::endl;
-          //std::cout << "Updated res: " << res << std::endl;
-
-#ifdef VIENNACL_GMRES_DEBUG
-          VectorType v1(U[k].size()); v1.clear(); v1.resize(U[k].size());
-          v1(0) = 1.0;
-          v1 -= U[k] * (viennacl::linalg::inner_prod( U[k], v1 ) * gpu_scalar_2);
-          std::cout << "v1: " << v1 << std::endl;
-          boost::numeric::ublas::matrix<ScalarType> P = -2.0 * outer_prod(U[k], U[k]);
-          P(0,0) += 1.0; P(1,1) += 1.0; P(2,2) += 1.0;
-          std::cout << "P: " << P << std::endl;
-#endif
-          
+
+          //
+          // Compute Householder reflection for v_k_tilde such that all entries below k-th entry are zero:
+          //
+          CPU_ScalarType rho_k_k = 0;
+          detail::gmres_setup_householder_vector(v_k_tilde, householder_reflectors[k], betas[k], rho_k_k, k);
+
+          //
+          // copy first k entries from v_k_tilde to R[k] in order to fill k-th column with result of
+          // P_k * v_k_tilde = (v[0], ... , v[k-1], norm(v), 0, 0, ...) =: (rho_{1,k}, rho_{2,k}, ..., rho_{k,k}, 0, ..., 0);
+          //
+          detail::gmres_copy_helper(v_k_tilde, R[k], k);
+          R[k][k] = rho_k_k;
+
+          //
+          // Update residual: r = P_k r
+          // Set zeta_k = r[k] including machine precision considerations: mathematically we have |r[k]| <= rho
+          // Set rho *= sin(acos(r[k] / rho))
+          //
+          detail::gmres_householder_reflect(res, householder_reflectors[k], betas[k]);
+
           if (res[k] > rho) //machine precision reached
             res[k] = rho;
-
-          if (res[k] < -1.0 * rho) //machine precision reached
-            res[k] = -1.0 * rho;
-          
+          if (res[k] < -rho) //machine precision reached
+            res[k] = -rho;
           projection_rhs[k] = res[k];
-          
+
           rho *= std::sin( std::acos(projection_rhs[k] / rho) );
-          
-#ifdef VIENNACL_GMRES_DEBUG
-          std::cout << "k-th component of r: " << res[k] << std::endl;
-          std::cout << "New rho (norm of res): " << rho << std::endl;
-#endif        
 
-          if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance())
+          if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance())  // Residual is sufficiently reduced, stop here
           {
-            //std::cout << "Krylov space big enough" << endl;
             tag.error( std::fabs(rho*rho_0 / norm_rhs) );
             ++k;
             break;
           }
-          
-          //std::cout << "Current residual: " << rho * rho_0 << std::endl;
-          //std::cout << " - End of Krylov space setup - " << std::endl;
         } // for k
-        
-#ifdef VIENNACL_GMRES_DEBUG
-        //inplace solution of the upper triangular matrix:
-        std::cout << "Upper triangular system:" << std::endl;
-        std::cout << "Size of Krylov space: " << k << std::endl;
-        for (size_t i=0; i<k; ++i)
-        {
-          for (size_t j=0; j<k; ++j)
-          {
-            std::cout << R[j][i] << ", ";
-          }
-          std::cout << " | " << projection_rhs[i] << std::endl;
-        }
-#endif        
-        
+
+        //
+        // Triangular solver stage:
+        //
+
         for (int i=k-1; i>-1; --i)
         {
           for (unsigned int j=i+1; j<k; ++j)
-            //temp_rhs[i] -= R[i][j] * temp_rhs[j];   //if R is not transposed
             projection_rhs[i] -= R[j][i] * projection_rhs[j];     //R is transposed
-            
+
           projection_rhs[i] /= R[i][i];
         }
-        
-#ifdef VIENNACL_GMRES_DEBUG
-        std::cout << "Result of triangular solver: ";
-        for (size_t i=0; i<k; ++i)
-          std::cout << projection_rhs[i] << ", ";
-        std::cout << std::endl;
-#endif        
+
+        //
+        // Note: 'projection_rhs' now holds the solution (eta_1, ..., eta_k)
+        //
+
         res *= projection_rhs[0];
-        
+
         if (k > 0)
         {
           for (unsigned int i = 0; i < k-1; ++i)
-          {
             res[i] += projection_rhs[i+1];
-          }
         }
 
-        for (int i = k-1; i > -1; --i)
-          res -= U[i] * (viennacl::linalg::inner_prod(U[i], res) * gpu_scalar_2);
+        //
+        // Form z inplace in 'res' by applying P_1 * ... * P_{k}
+        //
+        for (int i=k-1; i>=0; --i)
+          detail::gmres_householder_reflect(res, householder_reflectors[i], betas[i]);
 
         res *= rho_0;
-        result += res;
+        result += res;  // x += rho_0 * z    in the paper
 
-        if ( std::fabs(rho*rho_0 / norm_rhs) < tag.tolerance() )
-        {
-          //std::cout << "Allowed Error reached at end of loop" << std::endl;
-          tag.error(std::fabs(rho*rho_0 / norm_rhs));
+        //
+        // Check for convergence:
+        //
+        tag.error(std::fabs(rho*rho_0 / norm_rhs));
+        if ( tag.error() < tag.tolerance() )
           return result;
-        }
-
-        //res = rhs;
-        //res -= viennacl::linalg::prod(matrix, result);
-        //std::cout << "norm_2(r)=" << norm_2(r) << std::endl;
-        //std::cout << "std::abs(rho*rho_0)=" << std::abs(rho*rho_0) << std::endl;
-        //std::cout << r << std::endl; 
-
-        tag.error(std::fabs(rho*rho_0));
       }
 
       return result;
     }
 
     /** @brief Convenience overload of the solve() function using GMRES. Per default, no preconditioner is used
-    */ 
+    */
     template <typename MatrixType, typename VectorType>
     VectorType solve(const MatrixType & matrix, VectorType const & rhs, gmres_tag const & tag)
     {
diff --git a/viennacl/linalg/hankel_matrix_operations.hpp b/viennacl/linalg/hankel_matrix_operations.hpp
index 1f9ca40..d3ba286 100644
--- a/viennacl/linalg/hankel_matrix_operations.hpp
+++ b/viennacl/linalg/hankel_matrix_operations.hpp
@@ -2,82 +2,41 @@
 #define VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file hankel_matrix_operations.hpp
-    @brief Implementations of operations using hankel_matrix
+/** @file viennacl/linalg/hankel_matrix_operations.hpp
+    @brief Implementations of operations using hankel_matrix. Experimental.
 */
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/backend.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/fft.hpp"
 #include "viennacl/linalg/toeplitz_matrix_operations.hpp"
-//#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
 
 namespace viennacl
 {
   namespace linalg
   {
-    
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const hankel_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const hankel_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const hankel_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
+
     // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a hankel_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                viennacl::op_prod > prod_impl(const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                                              const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                                              size_t NUM_THREADS)
-    {
-      return viennacl::vector_expression<const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT>,
-                               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               viennacl::op_prod >(mat, vec);
-    }
-    
+
     /** @brief Carries out matrix-vector multiplication with a hankel_matrix
     *
     * Implementation of the convenience expression result = prod(mat, vec);
@@ -86,121 +45,20 @@ namespace viennacl
     * @param vec    The vector
     * @param result The result vector
     */
-      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                     const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
-                           viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-        
-        prod_impl(mat.elements(), vec, result);
-        viennacl::detail::fft::reverse(result);
-      }
-
-  } //namespace linalg
-
-
-
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
+    template<class SCALARTYPE, unsigned int ALIGNMENT>
+    void prod_impl(const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT> & mat,
+                   const viennacl::vector_base<SCALARTYPE> & vec,
+                         viennacl::vector_base<SCALARTYPE> & result)
     {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
+      assert(mat.size1() == result.size());
+      assert(mat.size2() == vec.size());
 
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
+      prod_impl(mat.elements(), vec, result);
+      viennacl::detail::fft::reverse(result);
     }
 
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.get_lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
+  } //namespace linalg
 
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
 
 } //namespace viennacl
 
diff --git a/viennacl/linalg/host_based/common.hpp b/viennacl/linalg/host_based/common.hpp
new file mode 100644
index 0000000..f7a726a
--- /dev/null
+++ b/viennacl/linalg/host_based/common.hpp
@@ -0,0 +1,166 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
+#define VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/common.hpp
+    @brief Common routines for single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+      namespace detail
+      {
+        template <typename T, typename VectorType>
+        T * extract_raw_pointer(VectorType & vec)
+        {
+          return reinterpret_cast<T *>(viennacl::traits::ram_handle(vec).get());
+        }
+
+        template <typename T, typename VectorType>
+        T const * extract_raw_pointer(VectorType const & vec)
+        {
+          return reinterpret_cast<T const *>(viennacl::traits::ram_handle(vec).get());
+        }
+
+        /** @brief Helper class for accessing a strided subvector of a larger vector. */
+        template <typename NumericT>
+        class vector_array_wrapper
+        {
+          public:
+            typedef NumericT   value_type;
+
+            vector_array_wrapper(value_type * A,
+                                 vcl_size_t start,
+                                 vcl_size_t inc)
+             : A_(A),
+               start_(start),
+               inc_(inc) {}
+
+            value_type & operator()(vcl_size_t i)
+            {
+              return A_[i * inc_ + start_];
+            }
+
+          private:
+            value_type * A_;
+            vcl_size_t start_;
+            vcl_size_t inc_;
+        };
+
+
+        inline bool is_row_major(viennacl::row_major_tag) { return true; }
+        inline bool is_row_major(viennacl::column_major_tag) { return false; }
+
+        /** @brief Returns the row_major or column_major class of a dense matrix based on the majority-tag (layout-tag) provided. */
+        template <typename T>
+        struct majority_struct_for_orientation
+        {
+          typedef typename T::ERROR_UNRECOGNIZED_MAJORITY_CATEGORTY_TAG   type;
+        };
+
+        /** \cond */
+        template <>
+        struct majority_struct_for_orientation<viennacl::row_major_tag>
+        {
+          typedef viennacl::row_major   type;
+        };
+
+        template <>
+        struct majority_struct_for_orientation<viennacl::column_major_tag>
+        {
+          typedef viennacl::column_major type;
+        };
+        /** \endcond */
+
+
+        /** @brief Helper array for accessing a strided submatrix embedded in a larger matrix. */
+        template <typename NumericT, typename MajorityCategory, bool is_transposed>
+        class matrix_array_wrapper
+        {
+            typedef typename majority_struct_for_orientation<MajorityCategory>::type   F;
+
+          public:
+            typedef NumericT   value_type;
+
+            matrix_array_wrapper(value_type * A,
+                                 vcl_size_t start1, vcl_size_t start2,
+                                 vcl_size_t inc1,   vcl_size_t inc2,
+                                 vcl_size_t internal_size1, vcl_size_t internal_size2)
+             : A_(A),
+               start1_(start1), start2_(start2),
+               inc1_(inc1), inc2_(inc2),
+               internal_size1_(internal_size1), internal_size2_(internal_size2) {}
+
+            value_type & operator()(vcl_size_t i, vcl_size_t j)
+            {
+              return A_[F::mem_index(i * inc1_ + start1_, j * inc2_ + start2_, internal_size1_, internal_size2_)];
+            }
+
+          private:
+            value_type * A_;
+            vcl_size_t start1_, start2_;
+            vcl_size_t inc1_, inc2_;
+            vcl_size_t internal_size1_, internal_size2_;
+        };
+
+        /** \cond */
+        template <typename NumericT, typename MajorityCategory>
+        class matrix_array_wrapper<NumericT, MajorityCategory, true>
+        {
+            typedef typename majority_struct_for_orientation<MajorityCategory>::type   F;
+
+          public:
+            typedef NumericT   value_type;
+
+            matrix_array_wrapper(value_type * A,
+                                 vcl_size_t start1, vcl_size_t start2,
+                                 vcl_size_t inc1,   vcl_size_t inc2,
+                                 vcl_size_t internal_size1, vcl_size_t internal_size2)
+             : A_(A),
+               start1_(start1), start2_(start2),
+               inc1_(inc1), inc2_(inc2),
+               internal_size1_(internal_size1), internal_size2_(internal_size2) {}
+
+            value_type & operator()(vcl_size_t i, vcl_size_t j)
+            {
+              return A_[F::mem_index(j * inc1_ + start1_, i * inc2_ + start2_, internal_size1_, internal_size2_)];  //swapping row and column indices here
+            }
+
+          private:
+            value_type * A_;
+            vcl_size_t start1_, start2_;
+            vcl_size_t inc1_, inc2_;
+            vcl_size_t internal_size1_, internal_size2_;
+        };
+        /** \endcond */
+
+      }
+
+    } //namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/host_based/direct_solve.hpp b/viennacl/linalg/host_based/direct_solve.hpp
new file mode 100644
index 0000000..fcf37d9
--- /dev/null
+++ b/viennacl/linalg/host_based/direct_solve.hpp
@@ -0,0 +1,418 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/direct_solve.hpp
+    @brief Implementations of dense direct triangular solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+
+      namespace detail
+      {
+        //
+        // Upper solve:
+        //
+        // Backward substitution for an upper-triangular system with a matrix
+        // right-hand side: solves A * X = B, overwriting B with X.
+        // A_size is the order of A; B_size is the number of columns of B.
+        // If unit_diagonal is true, the diagonal of A is assumed to be 1 and
+        // the division step is skipped.
+        template <typename MatrixType1, typename MatrixType2>
+        void upper_inplace_solve_matrix(MatrixType1 & A, MatrixType2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+        {
+          typedef typename MatrixType2::value_type   value_type;
+
+          for (vcl_size_t i = 0; i < A_size; ++i)
+          {
+            vcl_size_t current_row = A_size - i - 1;  // iterate rows bottom-up
+
+            // subtract contributions of the already-solved rows below
+            for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+            {
+              value_type A_element = A(current_row, j);
+              for (vcl_size_t k=0; k < B_size; ++k)
+                B(current_row, k) -= A_element * B(j, k);
+            }
+
+            if (!unit_diagonal)
+            {
+              value_type A_diag = A(current_row, current_row);
+              for (vcl_size_t k=0; k < B_size; ++k)
+                B(current_row, k) /= A_diag;
+            }
+          }
+        }
+
+        // Tag dispatch: unit upper-triangular solve (implicit unit diagonal).
+        template <typename MatrixType1, typename MatrixType2>
+        void inplace_solve_matrix(MatrixType1 & A, MatrixType2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
+        {
+          upper_inplace_solve_matrix(A, B, A_size, B_size, true);
+        }
+
+        // Tag dispatch: general upper-triangular solve (divides by diagonal).
+        template <typename MatrixType1, typename MatrixType2>
+        void inplace_solve_matrix(MatrixType1 & A, MatrixType2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
+        {
+          upper_inplace_solve_matrix(A, B, A_size, B_size, false);
+        }
+
+        //
+        // Lower solve:
+        //
+        // Forward substitution for a lower-triangular system with a matrix
+        // right-hand side: solves A * X = B, overwriting B with X.
+        // A_size is the order of A; B_size is the number of columns of B.
+        // If unit_diagonal is true, the diagonal of A is assumed to be 1.
+        template <typename MatrixType1, typename MatrixType2>
+        void lower_inplace_solve_matrix(MatrixType1 & A, MatrixType2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+        {
+          typedef typename MatrixType2::value_type   value_type;
+
+          for (vcl_size_t i = 0; i < A_size; ++i)  // iterate rows top-down
+          {
+            // subtract contributions of the already-solved rows above
+            for (vcl_size_t j = 0; j < i; ++j)
+            {
+              value_type A_element = A(i, j);
+              for (vcl_size_t k=0; k < B_size; ++k)
+                B(i, k) -= A_element * B(j, k);
+            }
+
+            if (!unit_diagonal)
+            {
+              value_type A_diag = A(i, i);
+              for (vcl_size_t k=0; k < B_size; ++k)
+                B(i, k) /= A_diag;
+            }
+          }
+        }
+
+        // Tag dispatch: unit lower-triangular solve (implicit unit diagonal).
+        template <typename MatrixType1, typename MatrixType2>
+        void inplace_solve_matrix(MatrixType1 & A, MatrixType2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
+        {
+          lower_inplace_solve_matrix(A, B, A_size, B_size, true);
+        }
+
+        // Tag dispatch: general lower-triangular solve (divides by diagonal).
+        template <typename MatrixType1, typename MatrixType2>
+        void inplace_solve_matrix(MatrixType1 & A, MatrixType2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
+        {
+          lower_inplace_solve_matrix(A, B, A_size, B_size, false);
+        }
+
+      }
+
+      //
+      // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+      //
+
+      ////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+      /** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B   (MATLAB notation)
+      *
+      * @param A      The system matrix
+      * @param B      The matrix of row vectors, where the solution is directly written to
+      *
+      * The SOLVERTAG template parameter (lower_tag, unit_lower_tag, upper_tag,
+      * unit_upper_tag) selects the substitution variant via tag dispatch.
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F1> & A, matrix_base<NumericT, F2> & B, SOLVERTAG)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+        value_type       * data_B = detail::extract_raw_pointer<value_type>(B);
+
+        // gather offsets, strides and padded sizes for raw-buffer addressing
+        vcl_size_t A_start1 = viennacl::traits::start1(A);
+        vcl_size_t A_start2 = viennacl::traits::start2(A);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+        vcl_size_t A_size2  = viennacl::traits::size2(A);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(B);
+        vcl_size_t B_start2 = viennacl::traits::start2(B);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+        vcl_size_t B_size2  = viennacl::traits::size2(B);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+
+        // non-transposed wrappers (third template argument 'false') for both operands
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F2::orientation_category, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+        detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SOLVERTAG());
+      }
+
+      /** @brief Direct inplace solver for triangular systems with multiple transposed right hand sides, i.e. A \ B^T   (MATLAB notation)
+      *
+      * @param A       The system matrix
+      * @param proxy_B The proxy for the transposed matrix of row vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F1> & A,
+                         matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B,
+                         SOLVERTAG)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+        // const_cast is required here: the proxy only exposes its wrapped
+        // matrix as const, but the solve writes the result into it.
+        value_type       * data_B = const_cast<value_type *>(detail::extract_raw_pointer<value_type>(proxy_B.lhs()));
+
+        vcl_size_t A_start1 = viennacl::traits::start1(A);
+        vcl_size_t A_start2 = viennacl::traits::start2(A);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+        vcl_size_t A_size2  = viennacl::traits::size2(A);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(proxy_B.lhs());
+        vcl_size_t B_start2 = viennacl::traits::start2(proxy_B.lhs());
+        vcl_size_t B_inc1   = viennacl::traits::stride1(proxy_B.lhs());
+        vcl_size_t B_inc2   = viennacl::traits::stride2(proxy_B.lhs());
+        vcl_size_t B_size1  = viennacl::traits::size1(proxy_B.lhs());
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(proxy_B.lhs());
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(proxy_B.lhs());
+
+
+        // wrapper_B uses transposed access ('true'), so the kernel sees B^T;
+        // since B is transposed, the number of RHS columns is B's size1.
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F2::orientation_category, true>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+        detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size1, SOLVERTAG());
+      }
+
+      //solver for transposed system matrices: A^T \ B via transposed element access
+      /** @brief Direct inplace solver for transposed triangular systems with multiple right hand sides, i.e. A^T \ B   (MATLAB notation)
+      *
+      * @param proxy_A  The transposed system matrix proxy
+      * @param B        The matrix holding the load vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                         matrix_base<NumericT, F2> & B,
+                         SOLVERTAG)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(proxy_A.lhs());
+        // B is a non-const reference, so extract_raw_pointer already yields a
+        // mutable pointer; no const_cast needed (unlike the proxy_B overloads).
+        value_type       * data_B = detail::extract_raw_pointer<value_type>(B);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(proxy_A.lhs());
+        vcl_size_t A_start2 = viennacl::traits::start2(proxy_A.lhs());
+        vcl_size_t A_inc1   = viennacl::traits::stride1(proxy_A.lhs());
+        vcl_size_t A_inc2   = viennacl::traits::stride2(proxy_A.lhs());
+        vcl_size_t A_size2  = viennacl::traits::size2(proxy_A.lhs());
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(proxy_A.lhs());
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(proxy_A.lhs());
+
+        vcl_size_t B_start1 = viennacl::traits::start1(B);
+        vcl_size_t B_start2 = viennacl::traits::start2(B);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+        vcl_size_t B_size2  = viennacl::traits::size2(B);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+
+        // wrapper_A uses transposed access ('true'), so the kernel operates on A^T
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F2::orientation_category, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+        detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SOLVERTAG());
+      }
+
+      /** @brief Direct inplace solver for transposed triangular systems with multiple transposed right hand sides, i.e. A^T \ B^T   (MATLAB notation)
+      *
+      * @param proxy_A    The transposed system matrix proxy
+      * @param proxy_B    The transposed matrix holding the load vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                               matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans>   proxy_B,
+                         SOLVERTAG)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(proxy_A.lhs());
+        // const_cast is required: the proxy only exposes its wrapped matrix as
+        // const, but the solve writes the result into it.
+        value_type       * data_B = const_cast<value_type *>(detail::extract_raw_pointer<value_type>(proxy_B.lhs()));
+
+        vcl_size_t A_start1 = viennacl::traits::start1(proxy_A.lhs());
+        vcl_size_t A_start2 = viennacl::traits::start2(proxy_A.lhs());
+        vcl_size_t A_inc1   = viennacl::traits::stride1(proxy_A.lhs());
+        vcl_size_t A_inc2   = viennacl::traits::stride2(proxy_A.lhs());
+        vcl_size_t A_size2  = viennacl::traits::size2(proxy_A.lhs());
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(proxy_A.lhs());
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(proxy_A.lhs());
+
+        vcl_size_t B_start1 = viennacl::traits::start1(proxy_B.lhs());
+        vcl_size_t B_start2 = viennacl::traits::start2(proxy_B.lhs());
+        vcl_size_t B_inc1   = viennacl::traits::stride1(proxy_B.lhs());
+        vcl_size_t B_inc2   = viennacl::traits::stride2(proxy_B.lhs());
+        vcl_size_t B_size1  = viennacl::traits::size1(proxy_B.lhs());
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(proxy_B.lhs());
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(proxy_B.lhs());
+
+
+        // both wrappers use transposed access ('true'): the kernel sees A^T and B^T;
+        // since B is transposed, the number of RHS columns is B's size1.
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, true>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F2::orientation_category, true>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+        detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size1, SOLVERTAG());
+      }
+
+      //
+      //  Solve on vector
+      //
+
+      namespace detail
+      {
+        //
+        // Upper solve:
+        //
+        // Backward substitution for an upper-triangular system with a single
+        // right-hand-side vector: solves A * x = b, overwriting b with x.
+        // If unit_diagonal is true, the diagonal of A is assumed to be 1.
+        template <typename MatrixType, typename VectorType>
+        void upper_inplace_solve_vector(MatrixType & A, VectorType & b, vcl_size_t A_size, bool unit_diagonal)
+        {
+          typedef typename VectorType::value_type   value_type;
+
+          for (vcl_size_t i = 0; i < A_size; ++i)
+          {
+            vcl_size_t current_row = A_size - i - 1;  // iterate rows bottom-up
+
+            // subtract contributions of the already-solved entries below
+            for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+            {
+              value_type A_element = A(current_row, j);
+              b(current_row) -= A_element * b(j);
+            }
+
+            if (!unit_diagonal)
+              b(current_row) /= A(current_row, current_row);
+          }
+        }
+
+        // Tag dispatch: unit upper-triangular solve (implicit unit diagonal).
+        template <typename MatrixType, typename VectorType>
+        void inplace_solve_vector(MatrixType & A, VectorType & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag)
+        {
+          upper_inplace_solve_vector(A, b, A_size, true);
+        }
+
+        // Tag dispatch: general upper-triangular solve (divides by diagonal).
+        template <typename MatrixType, typename VectorType>
+        void inplace_solve_vector(MatrixType & A, VectorType & b, vcl_size_t A_size, viennacl::linalg::upper_tag)
+        {
+          upper_inplace_solve_vector(A, b, A_size, false);
+        }
+
+        //
+        // Lower solve:
+        //
+        // Forward substitution for a lower-triangular system with a single
+        // right-hand-side vector: solves A * x = b, overwriting b with x.
+        // If unit_diagonal is true, the diagonal of A is assumed to be 1.
+        template <typename MatrixType, typename VectorType>
+        void lower_inplace_solve_vector(MatrixType & A, VectorType & b, vcl_size_t A_size, bool unit_diagonal)
+        {
+          typedef typename VectorType::value_type   value_type;
+
+          for (vcl_size_t i = 0; i < A_size; ++i)  // iterate rows top-down
+          {
+            // subtract contributions of the already-solved entries above
+            for (vcl_size_t j = 0; j < i; ++j)
+            {
+              value_type A_element = A(i, j);
+              b(i) -= A_element * b(j);
+            }
+
+            if (!unit_diagonal)
+              b(i) /= A(i, i);
+          }
+        }
+
+        // Tag dispatch: unit lower-triangular solve (implicit unit diagonal).
+        template <typename MatrixType, typename VectorType>
+        void inplace_solve_vector(MatrixType & A, VectorType & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag)
+        {
+          lower_inplace_solve_vector(A, b, A_size, true);
+        }
+
+        // Tag dispatch: general lower-triangular solve (divides by diagonal).
+        template <typename MatrixType, typename VectorType>
+        void inplace_solve_vector(MatrixType & A, VectorType & b, vcl_size_t A_size, viennacl::linalg::lower_tag)
+        {
+          lower_inplace_solve_vector(A, b, A_size, false);
+        }
+
+      }
+
+      /** @brief Direct inplace solver for triangular systems with a single right hand side, i.e. A \ b   (MATLAB notation)
+      *
+      * @param mat    The system matrix
+      * @param vec    The load vector, where the solution is directly written to
+      */
+      template <typename NumericT, typename F, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F> & mat,
+                               vector_base<NumericT> & vec,
+                         SOLVERTAG)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+        value_type       * data_v = detail::extract_raw_pointer<value_type>(vec);
+
+        // gather offsets, strides and padded sizes for raw-buffer addressing
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        vcl_size_t start1 = viennacl::traits::start(vec);
+        vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+        detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SOLVERTAG());
+      }
+
+
+
+      /** @brief Direct inplace solver for transposed triangular systems with a single right hand side, i.e. A^T \ b   (MATLAB notation)
+      *
+      * @param proxy    The transposed system matrix proxy
+      * @param vec    The load vector, where the solution is directly written to
+      */
+      template <typename NumericT, typename F, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & proxy,
+                         vector_base<NumericT> & vec,
+                         SOLVERTAG)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(proxy.lhs());
+        value_type       * data_v = detail::extract_raw_pointer<value_type>(vec);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(proxy.lhs());
+        vcl_size_t A_start2 = viennacl::traits::start2(proxy.lhs());
+        vcl_size_t A_inc1   = viennacl::traits::stride1(proxy.lhs());
+        vcl_size_t A_inc2   = viennacl::traits::stride2(proxy.lhs());
+        vcl_size_t A_size2  = viennacl::traits::size2(proxy.lhs());
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(proxy.lhs());
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(proxy.lhs());
+
+        vcl_size_t start1 = viennacl::traits::start(vec);
+        vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+        // wrapper_A uses transposed access ('true'), so the kernel operates on A^T
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, true>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+        detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SOLVERTAG());
+      }
+
+
+
+    }
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/host_based/matrix_operations.hpp b/viennacl/linalg/host_based/matrix_operations.hpp
new file mode 100644
index 0000000..933c232
--- /dev/null
+++ b/viennacl/linalg/host_based/matrix_operations.hpp
@@ -0,0 +1,1177 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/host_based/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including matrix-vector products, using a plain single-threaded or OpenMP-enabled execution on CPU.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+
+      //
+      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+      //
+
+      /** @brief Assigns a scaled matrix: mat1 = mat2 * alpha (or / alpha, or with flipped sign)
+      *
+      * @param mat1              The destination matrix
+      * @param mat2              The source matrix
+      * @param alpha             The scaling factor
+      * @param reciprocal_alpha  If true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha   If true, negate alpha before use
+      */
+      template <typename NumericT, typename F, typename ScalarType1>
+      void am(matrix_base<NumericT, F> & mat1,
+              matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef NumericT        value_type;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+        vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+        vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+        vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+        detail::matrix_array_wrapper<value_type,       typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+        // Loop nesting matches the memory layout (outer loop over the major
+        // dimension) so the innermost loop accesses memory contiguously.
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+          if (reciprocal_alpha)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+          }
+          else
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+          }
+        }
+        else
+        {
+          if (reciprocal_alpha)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+          }
+          else
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+          }
+        }
+      }
+
+
+      /** @brief Assigns a weighted sum of two scaled matrices: mat1 = mat2 * alpha + mat3 * beta
+      *         (each factor optionally reciprocal and/or sign-flipped).
+      *
+      * @param mat1              The destination matrix
+      * @param mat2, mat3        The source matrices
+      * @param alpha, beta       The scaling factors
+      * @param reciprocal_*      If true, divide by the factor instead of multiplying
+      * @param flip_sign_*       If true, negate the factor before use
+      */
+      template <typename NumericT, typename F,
+                typename ScalarType1, typename ScalarType2>
+      void ambm(matrix_base<NumericT, F> & mat1,
+                matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+                matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef NumericT        value_type;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+        value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+
+        value_type data_beta = beta;
+        if (flip_sign_beta)
+          data_beta = -data_beta;
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+        vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+        vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+        vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+        vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+        vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+        vcl_size_t C_inc1   = viennacl::traits::stride1(mat3);
+        vcl_size_t C_inc2   = viennacl::traits::stride2(mat3);
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(mat3);
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(mat3);
+
+        detail::matrix_array_wrapper<value_type,       typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        // The four reciprocal_alpha/reciprocal_beta combinations are unrolled
+        // into separate loops so the per-element branch is hoisted out of the
+        // (possibly OpenMP-parallel) loop nest; loop order follows the layout.
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+          if (reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+          }
+          else if (!reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (!reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+          }
+        }
+        else
+        {
+          if (reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+          }
+          else if (!reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (!reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+          }
+        }
+
+      }
+
+
+      /** @brief Host implementation of mat1 += alpha * mat2 + beta * mat3 (the accumulating '_m' variant of ambm).
+      *
+      * Each scalar may be sign-flipped and/or applied as a reciprocal (division
+      * instead of multiplication); the four flag combinations are hoisted out of
+      * the loops so the inner loops stay branch-free.  The len_alpha / len_beta
+      * arguments are unused by this backend (they matter only for device kernels).
+      *
+      * @param mat1             Result matrix (or -range, or -slice), accumulated into
+      * @param mat2             First operand matrix
+      * @param alpha            Scaling factor for mat2
+      * @param reciprocal_alpha If true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha  If true, negate alpha
+      * @param mat3             Second operand matrix
+      * @param beta             Scaling factor for mat3
+      * @param reciprocal_beta  If true, divide by beta instead of multiplying
+      * @param flip_sign_beta   If true, negate beta
+      */
+      template <typename NumericT, typename F,
+                typename ScalarType1, typename ScalarType2>
+      void ambm_m(matrix_base<NumericT, F> & mat1,
+                  matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+                  matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef NumericT        value_type;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+        value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+        // Fold the sign flips into local scalar copies up front.
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+
+        value_type data_beta = beta;
+        if (flip_sign_beta)
+          data_beta = -data_beta;
+
+        // Layout metadata (offset, stride, logical and padded sizes) for each operand.
+        vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+        vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+        vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+        vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);
+
+        vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+        vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+        vcl_size_t C_inc1   = viennacl::traits::stride1(mat3);
+        vcl_size_t C_inc2   = viennacl::traits::stride2(mat3);
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(mat3);
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(mat3);
+
+        // 2D views over the raw arrays honoring start/stride/padding of each operand.
+        detail::matrix_array_wrapper<value_type,       typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        //typedef typename detail::majority_struct_for_orientation<typename M1::orientation_category>::type index_generator_A;
+        //typedef typename detail::majority_struct_for_orientation<typename M2::orientation_category>::type index_generator_B;
+        //typedef typename detail::majority_struct_for_orientation<typename M3::orientation_category>::type index_generator_C;
+
+        // The loop nest iterates the major dimension in the outer (parallelized)
+        // loop: rows-outer for row-major, columns-outer for column-major.
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+          if (reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+          }
+          else if (!reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (!reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              for (long col = 0; col < static_cast<long>(A_size2); ++col)
+                wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+          }
+        }
+        else
+        {
+          if (reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+          }
+          else if (!reciprocal_alpha && reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+          }
+          else if (!reciprocal_alpha && !reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for
+#endif
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              for (long row = 0; row < static_cast<long>(A_size1); ++row)
+                wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+          }
+        }
+
+      }
+
+
+
+
+      /** @brief Assigns the scalar s to every entry of mat.
+      *
+      * @param mat    The matrix (or -range, or -slice) to fill
+      * @param s      The value written to each entry
+      * @param clear  If true, the loop bounds are the *internal* (padded) sizes,
+      *               so padding entries are overwritten as well.
+      *               NOTE(review): with clear == true the wrapper still applies
+      *               start/stride offsets — presumably clear is only used on full
+      *               matrices, not ranges/slices; confirm against callers.
+      */
+      template <typename NumericT, typename F>
+      void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
+      {
+        typedef NumericT        value_type;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(mat);
+        value_type alpha = static_cast<value_type>(s);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        // When clearing, sweep the padded extent so stale pad values are wiped too.
+        vcl_size_t A_size1  = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
+        vcl_size_t A_size2  = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        detail::matrix_array_wrapper<value_type,       typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+        // Outer loop runs over the major dimension so writes are contiguous.
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(A_size1); ++row)
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              wrapper_A(row, col) = alpha;
+              //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+              // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+        }
+        else
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(A_size2); ++col)
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              wrapper_A(row, col) = alpha;
+              //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+              // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+        }
+      }
+
+
+
+      /** @brief Assigns the scalar s to the main-diagonal entries (i, i) of mat.
+      *
+      * Off-diagonal entries are left untouched.
+      * NOTE(review): the loop runs over size1 only, so this assumes
+      * size2 >= size1 (e.g. a square matrix) — confirm against callers.
+      *
+      * @param mat  The matrix (or -range, or -slice)
+      * @param s    The value written to each diagonal entry
+      */
+      template <typename NumericT, typename F>
+      void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s)
+      {
+        typedef NumericT        value_type;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(mat);
+        value_type alpha = static_cast<value_type>(s);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        vcl_size_t A_size1  = viennacl::traits::size1(mat);
+        //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        detail::matrix_array_wrapper<value_type, typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+        for (long row = 0; row < static_cast<long>(A_size1); ++row)
+          wrapper_A(row, row) = alpha;
+      }
+
+      /** @brief Writes vec onto the k-th diagonal of mat, zeroing all other entries first.
+      *
+      * k >= 0 selects the k-th superdiagonal (shift in columns),
+      * k <  0 selects the (-k)-th subdiagonal (shift in rows).
+      *
+      * @param vec  Source vector providing the diagonal values
+      * @param k    Diagonal offset (0 = main diagonal)
+      * @param mat  Destination matrix; entries off the selected diagonal become 0
+      */
+      template <typename NumericT, typename F>
+      void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat)
+      {
+        typedef NumericT        value_type;
+
+        value_type       *data_A   = detail::extract_raw_pointer<value_type>(mat);
+        value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+        //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        vcl_size_t v_start = viennacl::traits::start(vec);
+        vcl_size_t v_inc   = viennacl::traits::stride(vec);
+        vcl_size_t v_size  = viennacl::traits::size(vec);
+
+        detail::matrix_array_wrapper<value_type, typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+        // Translate the signed diagonal offset into a row/column starting point.
+        vcl_size_t row_start = 0;
+        vcl_size_t col_start = 0;
+
+        if (k >= 0)
+          col_start = static_cast<vcl_size_t>(k);
+        else
+          row_start = static_cast<vcl_size_t>(-k);
+
+        // Zero the whole matrix before placing the diagonal.
+        matrix_assign(mat, NumericT(0));
+
+        for (vcl_size_t i = 0; i < v_size; ++i)
+          wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+
+      }
+
+      /** @brief Extracts the k-th diagonal of mat into vec.
+      *
+      * k >= 0 selects the k-th superdiagonal, k < 0 the (-k)-th subdiagonal.
+      * The number of entries copied equals the size of vec.
+      *
+      * @param mat  Source matrix
+      * @param k    Diagonal offset (0 = main diagonal)
+      * @param vec  Destination vector receiving the diagonal values
+      */
+      template <typename NumericT, typename F>
+      void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec)
+      {
+        typedef NumericT        value_type;
+
+        value_type const *data_A   = detail::extract_raw_pointer<value_type>(mat);
+        value_type       *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+        //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        vcl_size_t v_start = viennacl::traits::start(vec);
+        vcl_size_t v_inc   = viennacl::traits::stride(vec);
+        vcl_size_t v_size  = viennacl::traits::size(vec);
+
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+        // Translate the signed diagonal offset into a row/column starting point.
+        vcl_size_t row_start = 0;
+        vcl_size_t col_start = 0;
+
+        if (k >= 0)
+          col_start = static_cast<vcl_size_t>(k);
+        else
+          row_start = static_cast<vcl_size_t>(-k);
+
+        for (vcl_size_t i = 0; i < v_size; ++i)
+          data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
+      }
+
+      /** @brief Copies row i of mat into vec.
+      *
+      * The number of entries copied equals the size of vec.
+      *
+      * @param mat  Source matrix
+      * @param i    Zero-based row index
+      * @param vec  Destination vector
+      */
+      template <typename NumericT, typename F>
+      void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec)
+      {
+        typedef NumericT        value_type;
+
+        value_type const *data_A   = detail::extract_raw_pointer<value_type>(mat);
+        value_type       *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+        //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        vcl_size_t v_start = viennacl::traits::start(vec);
+        vcl_size_t v_inc   = viennacl::traits::stride(vec);
+        vcl_size_t v_size  = viennacl::traits::size(vec);
+
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+        for (vcl_size_t j = 0; j < v_size; ++j)
+          data_vec[v_start + j * v_inc] = wrapper_A(i, j);
+      }
+
+      /** @brief Copies column j of mat into vec.
+      *
+      * The number of entries copied equals the size of vec.
+      *
+      * @param mat  Source matrix
+      * @param j    Zero-based column index
+      * @param vec  Destination vector
+      */
+      template <typename NumericT, typename F>
+      void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec)
+      {
+        typedef NumericT        value_type;
+
+        value_type const *data_A   = detail::extract_raw_pointer<value_type>(mat);
+        value_type       *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        //vcl_size_t A_size1  = viennacl::traits::size1(mat);
+        //vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        vcl_size_t v_start = viennacl::traits::start(vec);
+        vcl_size_t v_inc   = viennacl::traits::stride(vec);
+        vcl_size_t v_size  = viennacl::traits::size(vec);
+
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+        for (vcl_size_t i = 0; i < v_size; ++i)
+          data_vec[v_start + i * v_inc] = wrapper_A(i, j);
+      }
+
+      //
+      ///////////////////////// Element-wise operation //////////////////////////////////
+      //
+
+      // Binary operations A = B .* C and A = B ./ C
+
+      /** @brief Implementation of the element-wise operations A = B .* C and A = B ./ C    (using MATLAB syntax)
+      *
+      * The concrete per-entry operation is selected at compile time through
+      * detail::op_applier<op_element_binary<OP> >::apply(dst, lhs, rhs).
+      *
+      * @param A      The result matrix (or -range, or -slice)
+      * @param proxy  The proxy object holding B, C, and the operation
+      */
+      template <typename NumericT, typename F, typename OP>
+      void element_op(matrix_base<NumericT, F> & A,
+                      matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_element_binary<OP> > const & proxy)
+      {
+        typedef NumericT        value_type;
+        typedef viennacl::linalg::detail::op_applier<op_element_binary<OP> >    OpFunctor;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(A);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+        value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+        // Layout metadata (offset, stride, logical and padded sizes) for each operand.
+        vcl_size_t A_start1 = viennacl::traits::start1(A);
+        vcl_size_t A_start2 = viennacl::traits::start2(A);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+        vcl_size_t A_size1  = viennacl::traits::size1(A);
+        vcl_size_t A_size2  = viennacl::traits::size2(A);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+        vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+        vcl_size_t B_inc1   = viennacl::traits::stride1(proxy.lhs());
+        vcl_size_t B_inc2   = viennacl::traits::stride2(proxy.lhs());
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(proxy.lhs());
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(proxy.lhs());
+
+        vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs());
+        vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs());
+        vcl_size_t C_inc1   = viennacl::traits::stride1(proxy.rhs());
+        vcl_size_t C_inc2   = viennacl::traits::stride2(proxy.rhs());
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(proxy.rhs());
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(proxy.rhs());
+
+        detail::matrix_array_wrapper<value_type,       typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        // Outer (parallelized) loop runs over the major dimension of F.
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(A_size1); ++row)
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+              //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+              // =   data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+              //   + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+        }
+        else
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(A_size2); ++col)
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+
+              //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+              // =   data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+              //   + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+        }
+      }
+
+      // Unary operations
+
+      // A = op(B)
+      /** @brief Implementation of the element-wise unary operation A = op(B)
+      *
+      * The concrete per-entry operation is selected at compile time through
+      * detail::op_applier<op_element_unary<OP> >::apply(dst, src).
+      *
+      * @param A      The result matrix (or -range, or -slice)
+      * @param proxy  The proxy object holding B (as lhs) and the operation
+      */
+      template <typename NumericT, typename F, typename OP>
+      void element_op(matrix_base<NumericT, F> & A,
+                      matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_element_unary<OP> > const & proxy)
+      {
+        typedef NumericT        value_type;
+        typedef viennacl::linalg::detail::op_applier<op_element_unary<OP> >    OpFunctor;
+
+        value_type       * data_A = detail::extract_raw_pointer<value_type>(A);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+        vcl_size_t A_start1 = viennacl::traits::start1(A);
+        vcl_size_t A_start2 = viennacl::traits::start2(A);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+        vcl_size_t A_size1  = viennacl::traits::size1(A);
+        vcl_size_t A_size2  = viennacl::traits::size2(A);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+        vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+        vcl_size_t B_inc1   = viennacl::traits::stride1(proxy.lhs());
+        vcl_size_t B_inc2   = viennacl::traits::stride2(proxy.lhs());
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(proxy.lhs());
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(proxy.lhs());
+
+        detail::matrix_array_wrapper<value_type,       typename F::orientation_category, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F::orientation_category, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+        // Outer (parallelized) loop runs over the major dimension of F.
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(A_size1); ++row)
+            for (long col = 0; col < static_cast<long>(A_size2); ++col)
+              OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+        }
+        else
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(A_size2); ++col)
+            for (long row = 0; row < static_cast<long>(A_size1); ++row)
+              OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+        }
+      }
+
+
+
+      //
+      /////////////////////////   matrix-vector products /////////////////////////////////
+      //
+
+      // A * x
+
+      /** @brief Carries out matrix-vector multiplication
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * Row-major: one parallel dot product per result row.
+      * Column-major: sequential column sweep so that memory is accessed with
+      * unit stride; the first column initializes the result (no extra zeroing
+      * pass), subsequent columns accumulate into it.
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template <typename NumericT, typename F>
+      void prod_impl(const matrix_base<NumericT, F> & mat,
+                     const vector_base<NumericT> & vec,
+                           vector_base<NumericT> & result)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+        value_type const * data_x = detail::extract_raw_pointer<value_type>(vec);
+        value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
+        vcl_size_t A_size1  = viennacl::traits::size1(mat);
+        vcl_size_t A_size2  = viennacl::traits::size2(mat);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);
+
+        vcl_size_t start1 = viennacl::traits::start(vec);
+        vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+        vcl_size_t start2 = viennacl::traits::start(result);
+        vcl_size_t inc2   = viennacl::traits::stride(result);
+
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(A_size1); ++row)
+          {
+            value_type temp = 0;
+            for (vcl_size_t col = 0; col < A_size2; ++col)
+              temp += data_A[viennacl::row_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+            data_result[row * inc2 + start2] = temp;
+          }
+        }
+        else
+        {
+          // NOTE(review): this branch unconditionally processes column 0 to
+          // initialize the result, so it assumes A_size2 >= 1 — confirm callers
+          // never pass a zero-column matrix.
+          {
+            value_type temp = data_x[start1];
+            for (vcl_size_t row = 0; row < A_size1; ++row)
+              data_result[row * inc2 + start2] = data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, A_start2, A_internal_size1, A_internal_size2)] * temp;
+          }
+          for (vcl_size_t col = 1; col < A_size2; ++col)  //run through matrix sequentially
+          {
+            value_type temp = data_x[col * inc1 + start1];
+            for (vcl_size_t row = 0; row < A_size1; ++row)
+              data_result[row * inc2 + start2] += data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * temp;
+          }
+        }
+      }
+
+
+      // trans(A) * x
+
+      /** @brief Carries out matrix-vector multiplication with a transposed matrix
+      *
+      * Implementation of the convenience expression result = trans(mat) * vec;
+      *
+      * The branch structure mirrors the non-transposed prod_impl with the roles
+      * swapped: a row-major matrix accessed transposed is traversed column-wise
+      * (sequential sweep, unit-stride reads), while a column-major matrix yields
+      * one parallel dot product per result entry.
+      *
+      * @param mat_trans  The transposed matrix proxy
+      * @param vec        The vector
+      * @param result     The result vector
+      */
+      template <typename NumericT, typename F>
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                     const vector_base<NumericT> & vec,
+                           vector_base<NumericT> & result)
+      {
+        typedef NumericT        value_type;
+
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(mat_trans.lhs());
+        value_type const * data_x = detail::extract_raw_pointer<value_type>(vec);
+        value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
+
+        // Sizes/strides refer to the *untransposed* operand mat_trans.lhs().
+        vcl_size_t A_start1 = viennacl::traits::start1(mat_trans.lhs());
+        vcl_size_t A_start2 = viennacl::traits::start2(mat_trans.lhs());
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat_trans.lhs());
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat_trans.lhs());
+        vcl_size_t A_size1  = viennacl::traits::size1(mat_trans.lhs());
+        vcl_size_t A_size2  = viennacl::traits::size2(mat_trans.lhs());
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat_trans.lhs());
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat_trans.lhs());
+
+        vcl_size_t start1 = viennacl::traits::start(vec);
+        vcl_size_t inc1   = viennacl::traits::stride(vec);
+
+        vcl_size_t start2 = viennacl::traits::start(result);
+        vcl_size_t inc2   = viennacl::traits::stride(result);
+
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+          // NOTE(review): this branch unconditionally processes row 0 of lhs to
+          // initialize the result, so it assumes A_size1 >= 1 — confirm callers
+          // never pass a zero-row matrix.
+          {
+            value_type temp = data_x[start1];
+            for (vcl_size_t row = 0; row < A_size2; ++row)
+              data_result[row * inc2 + start2] = data_A[viennacl::row_major::mem_index(A_start1, row * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * temp;
+          }
+
+          for (vcl_size_t col = 1; col < A_size1; ++col)  //run through matrix sequentially
+          {
+            value_type temp = data_x[col * inc1 + start1];
+            for (vcl_size_t row = 0; row < A_size2; ++row)
+            {
+              data_result[row * inc2 + start2] += data_A[viennacl::row_major::mem_index(col * A_inc1 + A_start1, row * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * temp;
+            }
+          }
+        }
+        else
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(A_size2); ++row)
+          {
+            value_type temp = 0;
+            for (vcl_size_t col = 0; col < A_size1; ++col)
+              temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, row * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+            data_result[row * inc2 + start2] = temp;
+          }
+        }
+      }
+
+
+      //
+      /////////////////////////   matrix-matrix products /////////////////////////////////
+      //
+
+      namespace detail
+      {
+        /** @brief Naive triple-loop GEMM kernel: c = alpha * a * b + beta * c.
+        *
+        * a, b, c are indexable (row, col) accessors (matrix_array_wrapper
+        * instances); the outer loop over rows of c is OpenMP-parallelized.
+        * The beta == 0 check skips reading c, so c may hold arbitrary
+        * (uninitialized) data in that case.
+        */
+        template <typename A, typename B, typename C, typename NumericT>
+        void prod(A & a, B & b, C & c,
+                  vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2,
+                  NumericT alpha, NumericT beta)
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long i=0; i<static_cast<long>(C_size1); ++i)
+          {
+            for (vcl_size_t j=0; j<C_size2; ++j)
+            {
+              NumericT temp = 0;
+              // Dot product of row i of a with column j of b.
+              for (vcl_size_t k=0; k<A_size2; ++k)
+                temp += a(i, k) * b(k, j);
+
+              temp *= alpha;
+              if (beta != 0)
+                temp += beta * c(i,j);
+              c(i,j) = temp;
+            }
+          }
+        }
+
+      }
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = alpha * prod(A, B) + beta * C  (beta == 0 overwrites C);
+      *
+      * @param A      Left dense factor
+      * @param B      Right dense factor
+      * @param C      Result matrix
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor for the previous content of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const matrix_base<NumericT, F1> & A,
+                     const matrix_base<NumericT, F2> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        typedef NumericT        value_type;
+
+        // Raw host pointers into the underlying (padded) buffers:
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
+        value_type       * data_C = detail::extract_raw_pointer<value_type>(C);
+
+        // Offsets, strides, logical and padded sizes describing the submatrix views:
+        vcl_size_t A_start1 = viennacl::traits::start1(A);
+        vcl_size_t A_start2 = viennacl::traits::start2(A);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+        vcl_size_t A_size2  = viennacl::traits::size2(A);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(B);
+        vcl_size_t B_start2 = viennacl::traits::start2(B);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+        vcl_size_t C_start1 = viennacl::traits::start1(C);
+        vcl_size_t C_start2 = viennacl::traits::start2(C);
+        vcl_size_t C_inc1   = viennacl::traits::stride1(C);
+        vcl_size_t C_inc2   = viennacl::traits::stride2(C);
+        vcl_size_t C_size1  = viennacl::traits::size1(C);
+        vcl_size_t C_size2  = viennacl::traits::size2(C);
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(C);
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(C);
+
+        // Layout-aware accessors; third template argument 'false' = no transposition.
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F2::orientation_category, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F3::orientation_category, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        // Inner (summation) dimension: A_size2.
+        detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+      }
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = alpha * prod(trans(A), B) + beta * C;
+      *
+      * @param A      Transposed left factor (expression wrapping the underlying matrix A.lhs())
+      * @param B      Right dense factor
+      * @param C      Result matrix
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor for the previous content of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
+                                                        const matrix_base<NumericT, F1>,
+                                                        op_trans> & A,
+                     const matrix_base<NumericT, F2> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        typedef NumericT        value_type;
+
+        // All A-related queries go through A.lhs(), the non-transposed operand:
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(A.lhs());
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
+        value_type       * data_C = detail::extract_raw_pointer<value_type>(C);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(A.lhs());
+        vcl_size_t A_start2 = viennacl::traits::start2(A.lhs());
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A.lhs());
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A.lhs());
+        vcl_size_t A_size1  = viennacl::traits::size1(A.lhs());
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A.lhs());
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A.lhs());
+
+        vcl_size_t B_start1 = viennacl::traits::start1(B);
+        vcl_size_t B_start2 = viennacl::traits::start2(B);
+        vcl_size_t B_inc1   = viennacl::traits::stride1(B);
+        vcl_size_t B_inc2   = viennacl::traits::stride2(B);
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B);
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B);
+
+        vcl_size_t C_start1 = viennacl::traits::start1(C);
+        vcl_size_t C_start2 = viennacl::traits::start2(C);
+        vcl_size_t C_inc1   = viennacl::traits::stride1(C);
+        vcl_size_t C_inc2   = viennacl::traits::stride2(C);
+        vcl_size_t C_size1  = viennacl::traits::size1(C);
+        vcl_size_t C_size2  = viennacl::traits::size2(C);
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(C);
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(C);
+
+        // 'true' on wrapper_A: the accessor swaps the index pair, realizing trans(A).
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F2::orientation_category, false>   wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F3::orientation_category, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        // Inner (summation) dimension of trans(A) is A_size1 (rows of the stored A).
+        detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+      }
+
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = alpha * prod(A, trans(B)) + beta * C;
+      *
+      * @param A      Left dense factor
+      * @param B      Transposed right factor (expression wrapping the underlying matrix B.lhs())
+      * @param C      Result matrix
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor for the previous content of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const matrix_base<NumericT, F1> & A,
+                     const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        typedef NumericT        value_type;
+
+        // All B-related queries go through B.lhs(), the non-transposed operand:
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(B.lhs());
+        value_type       * data_C = detail::extract_raw_pointer<value_type>(C);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(A);
+        vcl_size_t A_start2 = viennacl::traits::start2(A);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A);
+        vcl_size_t A_size2  = viennacl::traits::size2(A);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A);
+
+        vcl_size_t B_start1 = viennacl::traits::start1(B.lhs());
+        vcl_size_t B_start2 = viennacl::traits::start2(B.lhs());
+        vcl_size_t B_inc1   = viennacl::traits::stride1(B.lhs());
+        vcl_size_t B_inc2   = viennacl::traits::stride2(B.lhs());
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B.lhs());
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B.lhs());
+
+        vcl_size_t C_start1 = viennacl::traits::start1(C);
+        vcl_size_t C_start2 = viennacl::traits::start2(C);
+        vcl_size_t C_inc1   = viennacl::traits::stride1(C);
+        vcl_size_t C_inc2   = viennacl::traits::stride2(C);
+        vcl_size_t C_size1  = viennacl::traits::size1(C);
+        vcl_size_t C_size2  = viennacl::traits::size2(C);
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(C);
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(C);
+
+        // 'true' on wrapper_B: the accessor swaps the index pair, realizing trans(B).
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, false>   wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F2::orientation_category, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F3::orientation_category, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        // Inner (summation) dimension: A_size2.
+        detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+      }
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = alpha * prod(trans(A), trans(B)) + beta * C;
+      *
+      * @param A      Transposed left factor (expression wrapping A.lhs())
+      * @param B      Transposed right factor (expression wrapping B.lhs())
+      * @param C      Result matrix
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor for the previous content of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
+                     const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                     matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        typedef NumericT        value_type;
+
+        // Both operands are queried through .lhs(), the non-transposed matrices:
+        value_type const * data_A = detail::extract_raw_pointer<value_type>(A.lhs());
+        value_type const * data_B = detail::extract_raw_pointer<value_type>(B.lhs());
+        value_type       * data_C = detail::extract_raw_pointer<value_type>(C);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(A.lhs());
+        vcl_size_t A_start2 = viennacl::traits::start2(A.lhs());
+        vcl_size_t A_inc1   = viennacl::traits::stride1(A.lhs());
+        vcl_size_t A_inc2   = viennacl::traits::stride2(A.lhs());
+        vcl_size_t A_size1  = viennacl::traits::size1(A.lhs());
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(A.lhs());
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(A.lhs());
+
+        vcl_size_t B_start1 = viennacl::traits::start1(B.lhs());
+        vcl_size_t B_start2 = viennacl::traits::start2(B.lhs());
+        vcl_size_t B_inc1   = viennacl::traits::stride1(B.lhs());
+        vcl_size_t B_inc2   = viennacl::traits::stride2(B.lhs());
+        vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(B.lhs());
+        vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(B.lhs());
+
+        vcl_size_t C_start1 = viennacl::traits::start1(C);
+        vcl_size_t C_start2 = viennacl::traits::start2(C);
+        vcl_size_t C_inc1   = viennacl::traits::stride1(C);
+        vcl_size_t C_inc2   = viennacl::traits::stride2(C);
+        vcl_size_t C_size1  = viennacl::traits::size1(C);
+        vcl_size_t C_size2  = viennacl::traits::size2(C);
+        vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(C);
+        vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(C);
+
+        // 'true' on wrapper_A and wrapper_B: both accessors swap their index pairs.
+        detail::matrix_array_wrapper<value_type const, typename F1::orientation_category, true>    wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+        detail::matrix_array_wrapper<value_type const, typename F2::orientation_category, true>    wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+        detail::matrix_array_wrapper<value_type,       typename F3::orientation_category, false>   wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+        // Inner (summation) dimension of trans(A) is A_size1 (rows of the stored A).
+        detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+      }
+
+
+
+
+      //
+      /////////////////////////   miscellaneous operations /////////////////////////////////
+      //
+
+
+      /** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+      *
+      * Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+      *
+      * @param mat1    The matrix to be updated
+      * @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+      * @param reciprocal_alpha Use 1/alpha instead of alpha
+      * @param flip_sign_alpha  Use -alpha instead of alpha
+      * @param vec1    The first vector
+      * @param vec2    The second vector
+      */
+      template <typename NumericT, typename F, typename S1>
+      void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
+                                S1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+                                const vector_base<NumericT> & vec1,
+                                const vector_base<NumericT> & vec2)
+      {
+        typedef NumericT        value_type;
+
+        // Raw host pointers into the underlying buffers:
+        value_type       * data_A  = detail::extract_raw_pointer<value_type>(mat1);
+        value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2);
+
+        vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+        vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+        vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
+        vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
+        vcl_size_t A_size1  = viennacl::traits::size1(mat1);
+        vcl_size_t A_size2  = viennacl::traits::size2(mat1);
+        vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
+        vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+        // Apply the flag-encoded modifiers to alpha.
+        // (Order is immaterial: 1/(-a) == -(1/a).)
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+        if (reciprocal_alpha)
+          data_alpha = static_cast<value_type>(1) / data_alpha;
+
+        if (detail::is_row_major(typename F::orientation_category()))
+        {
+          // row-major: fix the row and sweep the columns of that row
+          for (vcl_size_t row = 0; row < A_size1; ++row)
+          {
+            value_type value_v1 = data_alpha * data_v1[row * inc1 + start1];  // alpha folded into the row factor once
+            for (vcl_size_t col = 0; col < A_size2; ++col)
+              data_A[viennacl::row_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+          }
+        }
+        else
+        {
+          for (vcl_size_t col = 0; col < A_size2; ++col)  //run through matrix sequentially
+          {
+            value_type value_v2 = data_alpha * data_v2[col * inc2 + start2];  // alpha folded into the column factor once
+            for (vcl_size_t row = 0; row < A_size1; ++row)
+              data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + start1] * value_v2;
+          }
+        }
+      }
+
+    } // namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/host_based/misc_operations.hpp b/viennacl/linalg/host_based/misc_operations.hpp
new file mode 100644
index 0000000..32af28e
--- /dev/null
+++ b/viennacl/linalg/host_based/misc_operations.hpp
@@ -0,0 +1,80 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/misc_operations.hpp
+    @brief Implementations of miscellaneous operations on the CPU using a single thread or OpenMP.
+*/
+
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+      namespace detail
+      {
+        /** @brief Substitution step used by level-scheduling triangular solves.
+        *
+        * For each listed row r = row_index_array[i] this subtracts the already
+        * known contributions from vec[r]:
+        *   vec[r] -= sum_j element_buffer[j] * vec[col_buffer[j]],
+        * with j running over [row_buffer[i], row_buffer[i+1]).
+        *
+        * NOTE(review): the rows of one call are updated in parallel under OpenMP,
+        * so they are assumed to be mutually independent (no row reads an entry
+        * written by another row of the same level) -- confirm via the caller.
+        *
+        * @param vec              Vector holding the entries being substituted
+        * @param row_index_array  Buffer of target row indices, one per processed row
+        * @param row_buffer       Buffer of begin/end offsets into col/element buffers
+        * @param col_buffer       Buffer of column indices of the eliminated entries
+        * @param element_buffer   Buffer of the matrix entries themselves
+        * @param num_rows         Number of rows processed in this call
+        */
+        template <typename ScalarType>
+        void level_scheduling_substitute(vector<ScalarType> & vec,
+                                     viennacl::backend::mem_handle const & row_index_array,
+                                     viennacl::backend::mem_handle const & row_buffer,
+                                     viennacl::backend::mem_handle const & col_buffer,
+                                     viennacl::backend::mem_handle const & element_buffer,
+                                     vcl_size_t num_rows
+                                    )
+        {
+          // Raw host pointers into the opaque memory handles:
+          ScalarType * vec_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+          unsigned int const * elim_row_index  = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(row_index_array);
+          unsigned int const * elim_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(row_buffer);
+          unsigned int const * elim_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(col_buffer);
+          ScalarType   const * elim_elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(element_buffer);
+
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for
+#endif
+          for (long row=0; row < static_cast<long>(num_rows); ++row)
+          {
+            unsigned int eq_row = elim_row_index[row];      // index of the vector entry updated by this row
+            ScalarType vec_entry = vec_buf[eq_row];
+            unsigned int row_end = elim_row_buffer[row+1];  // one-past-last offset for this row
+
+            for (vcl_size_t j = elim_row_buffer[row]; j < row_end; ++j)
+              vec_entry -= vec_buf[elim_col_buffer[j]] * elim_elements[j];
+
+            vec_buf[eq_row] = vec_entry;
+          }
+
+        }
+      }
+
+    } // namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/host_based/scalar_operations.hpp b/viennacl/linalg/host_based/scalar_operations.hpp
new file mode 100644
index 0000000..00d4f7b
--- /dev/null
+++ b/viennacl/linalg/host_based/scalar_operations.hpp
@@ -0,0 +1,162 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/scalar_operations.hpp
+    @brief Implementations of scalar operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+      /** @brief Implementation of s1 = alpha * s2 for host-based scalars
+      *
+      * @param s1               The scalar receiving the result
+      * @param s2               The source scalar
+      * @param alpha            The scaling factor
+      * @param reciprocal_alpha If true, 1/alpha is used instead of alpha
+      * @param flip_sign_alpha  If true, -alpha is used instead of alpha
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                  >::type
+      as(S1 & s1,
+         S2 const & s2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        value_type       * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+        value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+
+        // Apply the flag-encoded modifiers (order is immaterial: 1/(-a) == -(1/a)):
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+        if (reciprocal_alpha)
+          data_alpha = static_cast<value_type>(1) / data_alpha;
+
+        *data_s1 = *data_s2 * data_alpha;
+      }
+
+
+      /** @brief Implementation of s1 = alpha * s2 + beta * s3 for host-based scalars
+      *
+      * @param s1               The scalar receiving the result
+      * @param s2               The first source scalar
+      * @param alpha            Scaling factor for s2 (modified by the flags below)
+      * @param reciprocal_alpha If true, 1/alpha is used instead of alpha
+      * @param flip_sign_alpha  If true, -alpha is used instead of alpha
+      * @param s3               The second source scalar
+      * @param beta             Scaling factor for s3 (modified analogously)
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1,
+                typename S3, typename ScalarType2>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_scalar<S3>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                    && viennacl::is_any_scalar<ScalarType2>::value
+                                  >::type
+      asbs(S1 & s1,
+           S2 const & s2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+           S3 const & s3, ScalarType2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        value_type       * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+        value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+        value_type const * data_s3 = detail::extract_raw_pointer<value_type>(s3);
+
+        // Apply flag-encoded modifiers to alpha:
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+        if (reciprocal_alpha)
+          data_alpha = static_cast<value_type>(1) / data_alpha;
+
+        // ... and to beta:
+        value_type data_beta = beta;
+        if (flip_sign_beta)
+          data_beta = -data_beta;
+        if (reciprocal_beta)
+          data_beta = static_cast<value_type>(1) / data_beta;
+
+        *data_s1 = *data_s2 * data_alpha + *data_s3 * data_beta;
+      }
+
+
+      /** @brief Implementation of s1 += alpha * s2 + beta * s3 for host-based scalars
+      *
+      * Same as asbs(), but accumulates into s1 instead of overwriting it.
+      *
+      * @param s1               The scalar being accumulated into
+      * @param s2               The first source scalar
+      * @param alpha            Scaling factor for s2 (modified by the flags below)
+      * @param reciprocal_alpha If true, 1/alpha is used instead of alpha
+      * @param flip_sign_alpha  If true, -alpha is used instead of alpha
+      * @param s3               The second source scalar
+      * @param beta             Scaling factor for s3 (modified analogously)
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1,
+                typename S3, typename ScalarType2>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_scalar<S3>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                    && viennacl::is_any_scalar<ScalarType2>::value
+                                  >::type
+      asbs_s(S1 & s1,
+             S2 const & s2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+             S3 const & s3, ScalarType2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        value_type       * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+        value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+        value_type const * data_s3 = detail::extract_raw_pointer<value_type>(s3);
+
+        // Apply flag-encoded modifiers to alpha:
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = -data_alpha;
+        if (reciprocal_alpha)
+          data_alpha = static_cast<value_type>(1) / data_alpha;
+
+        // ... and to beta:
+        value_type data_beta = beta;
+        if (flip_sign_beta)
+          data_beta = -data_beta;
+        if (reciprocal_beta)
+          data_beta = static_cast<value_type>(1) / data_beta;
+
+        // Note the '+=' (accumulation) -- this is the only difference to asbs().
+        *data_s1 += *data_s2 * data_alpha + *data_s3 * data_beta;
+      }
+
+
+      /** @brief Swaps the contents of two scalars, data is copied
+      *
+      * @param s1   The first scalar
+      * @param s2   The second scalar
+      */
+      template <typename S1, typename S2>
+      typename viennacl::enable_if<    viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                  >::type
+      swap(S1 & s1, S2 & s2)
+      {
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+
+        value_type * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+        value_type * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+
+        // Classic three-step swap through a temporary:
+        value_type temp = *data_s2;
+        *data_s2 = *data_s1;
+        *data_s1 = temp;
+      }
+
+
+
+    } //namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/host_based/sparse_matrix_operations.hpp b/viennacl/linalg/host_based/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..78cf17f
--- /dev/null
+++ b/viennacl/linalg/host_based/sparse_matrix_operations.hpp
@@ -0,0 +1,1603 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP.
+*/
+
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+      //
+      // Compressed matrix
+      //
+
+      namespace detail
+      {
+        /** @brief Extracts a per-row quantity of a CSR-stored compressed_matrix into a dense vector.
+        *
+        * Depending on info_selector, vec[row] receives the row's inf-norm, 1-norm,
+        * 2-norm, or diagonal entry.  A row without a stored diagonal entry (and any
+        * unknown selector) yields 0 for that row.
+        *
+        * @param mat            The sparse input matrix (handle1: row offsets, handle2: column indices, handle: entries)
+        * @param vec            Dense output vector receiving one value per row
+        * @param info_selector  Selects which per-row quantity to compute
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
+                      vector_base<ScalarType> & vec,
+                      viennacl::linalg::detail::row_info_types info_selector)
+        {
+          ScalarType         * result_buf = detail::extract_raw_pointer<ScalarType>(vec.handle());
+          ScalarType   const * elements   = detail::extract_raw_pointer<ScalarType>(mat.handle());
+          unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+          unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+          for (vcl_size_t row = 0; row < mat.size1(); ++row)
+          {
+            ScalarType value = 0;
+            unsigned int row_end = row_buffer[row+1];  // one-past-last entry of this row
+
+            switch (info_selector)
+            {
+              case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
+                for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+                  value = std::max<ScalarType>(value, std::fabs(elements[i]));
+                break;
+
+              case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
+                for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+                  value += std::fabs(elements[i]);
+                break;
+
+              case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
+                for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+                  value += elements[i] * elements[i];
+                value = std::sqrt(value);
+                break;
+
+              case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
+                for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+                {
+                  if (col_buffer[i] == row)
+                  {
+                    value = elements[i];
+                    break;
+                  }
+                }
+                break;
+
+              default:
+                break;
+            }
+            result_buf[row] = value;  // stays 0 if no case matched or no diagonal entry was stored
+          }
+        }
+      }
+
+
+      /** @brief Carries out matrix-vector multiplication with a compressed_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * Rows of the CSR matrix are processed independently (OpenMP-parallel), and
+      * vec/result are addressed via their start and stride, so strided sub-vectors
+      * are handled.  NOTE(review): result is written while vec is read, so the two
+      * are assumed not to alias -- confirm via the caller.
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        // Raw host pointers into the CSR buffers and the two vectors:
+        ScalarType         * result_buf = detail::extract_raw_pointer<ScalarType>(result.handle());
+        ScalarType   const * vec_buf    = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements   = detail::extract_raw_pointer<ScalarType>(mat.handle());
+        unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+        unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+        for (long row = 0; row < static_cast<long>(mat.size1()); ++row)
+        {
+          ScalarType dot_prod = 0;
+          vcl_size_t row_end = row_buffer[row+1];  // one-past-last entry of this row
+          for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
+            dot_prod += elements[i] * vec_buf[col_buffer[i] * vec.stride() + vec.start()];
+          result_buf[row * result.stride() + result.start()] = dot_prod;
+        }
+
+      }
+
+      /** @brief Carries out sparse_matrix-matrix multiplication first matrix being compressed
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      *
+      * @param sp_mat     The sparse matrix
+      * @param d_mat      The dense matrix
+      * @param result     The result matrix
+      */
+      template< class ScalarType, typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // Raw CSR arrays of the sparse factor: nonzero values, row offsets, column indices.
+        ScalarType   const * sp_mat_elements   = detail::extract_raw_pointer<ScalarType>(sp_mat.handle());
+        unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
+        unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        // Layout parameters (start offsets, strides, padded internal sizes) of the dense operand,
+        // needed to address sub-matrix views inside the padded storage.
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        // 2D accessors that map (row, col) onto the padded 1D storage according to F1/F2 layout.
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        // The outer (parallel) loop is chosen based on the storage order of d_mat (F1).
+        if ( detail::is_row_major(typename F1::orientation_category()) ) {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // d_mat row-major: sparse rows outermost (signed index for OpenMP's canonical loop form).
+          for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+            vcl_size_t row_start = sp_mat_row_buffer[row];
+            vcl_size_t row_end = sp_mat_row_buffer[row+1];
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+              NumericT temp = 0;
+              for (vcl_size_t k = row_start; k < row_end; ++k) {
+                temp += sp_mat_elements[k] * d_mat_wrapper(sp_mat_col_buffer[k], col);
+              }
+              result_wrapper(row, col) = temp;
+            }
+          }
+        }
+        else {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // d_mat column-major: columns of d_mat outermost instead.
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+            for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+              vcl_size_t row_start = sp_mat_row_buffer[row];
+              vcl_size_t row_end = sp_mat_row_buffer[row+1];
+              NumericT temp = 0;
+              for (vcl_size_t k = row_start; k < row_end; ++k) {
+                temp += sp_mat_elements[k] * d_mat_wrapper(sp_mat_col_buffer[k], col);
+              }
+              result_wrapper(row, col) = temp;
+            }
+          }
+        }
+
+      }
+
+      /** @brief Carries out matrix-trans(matrix) multiplication first matrix being compressed
+      *          and the second transposed
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      *
+      * @param sp_mat             The sparse matrix
+      * @param d_mat              The transposed dense matrix
+      * @param result             The result matrix
+      */
+      template< class ScalarType, typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                   const viennacl::matrix_base<NumericT, F1>,
+                                                   viennacl::op_trans > & d_mat,
+                      viennacl::matrix_base<NumericT, F2> & result) {
+
+        // Raw CSR arrays of the sparse factor: nonzero values, row offsets, column indices.
+        ScalarType   const * sp_mat_elements   = detail::extract_raw_pointer<ScalarType>(sp_mat.handle());
+        unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
+        unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+        // All layout information refers to the untransposed operand d_mat.lhs(); the
+        // transposition itself is realized further down by swapping the index pair
+        // handed to d_mat_wrapper.
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        // Loop order keyed off the storage order (F1) of the untransposed dense matrix.
+        if ( detail::is_row_major(typename F1::orientation_category()) ) {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+            vcl_size_t row_start = sp_mat_row_buffer[row];
+            vcl_size_t row_end = sp_mat_row_buffer[row+1];
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+              NumericT temp = 0;
+              for (vcl_size_t k = row_start; k < row_end; ++k) {
+                // Swapped indices (col, k-index) read trans(d_mat) without materializing it.
+                temp += sp_mat_elements[k] * d_mat_wrapper(col, sp_mat_col_buffer[k]);
+              }
+              result_wrapper(row, col) = temp;
+            }
+          }
+        }
+        else {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+            for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+              vcl_size_t row_start = sp_mat_row_buffer[row];
+              vcl_size_t row_end = sp_mat_row_buffer[row+1];
+              NumericT temp = 0;
+              for (vcl_size_t k = row_start; k < row_end; ++k) {
+                temp += sp_mat_elements[k] * d_mat_wrapper(col, sp_mat_col_buffer[k]);
+              }
+              result_wrapper(row, col) = temp;
+            }
+          }
+        }
+
+      }
+
+
+      //
+      // Triangular solve for compressed_matrix, A \ b
+      //
+      namespace detail
+      {
+        /** @brief In-place forward substitution for a unit lower triangular CSR matrix.
+        *
+        * Overwrites vec_buffer with the solution of L * x = b. The diagonal is implicitly 1,
+        * so vec_buffer[0] stays untouched and each subsequent row only subtracts the
+        * contributions of strictly-lower entries (col_index < row).
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_inplace_solve(SizeTypeArray const & row_buffer,
+                               SizeTypeArray const & col_buffer,
+                               ConstScalarTypeArray const & element_buffer,
+                               ScalarTypeArray & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::unit_lower_tag)
+        {
+          vcl_size_t row_begin = row_buffer[1];  // entries of row 0 are never needed (unit diagonal)
+          for (vcl_size_t row = 1; row < num_cols; ++row)
+          {
+            NumericT vec_entry = vec_buffer[row];
+            vcl_size_t row_end = row_buffer[row+1];
+            for (vcl_size_t i = row_begin; i < row_end; ++i)
+            {
+              vcl_size_t col_index = col_buffer[i];
+              if (col_index < row)  // strictly lower part only
+                vec_entry -= vec_buffer[col_index] * element_buffer[i];
+            }
+            vec_buffer[row] = vec_entry;
+            row_begin = row_end;  // rows are visited consecutively, so reuse the end offset
+          }
+        }
+
+        /** @brief In-place forward substitution for a lower triangular CSR matrix.
+        *
+        * Overwrites vec_buffer with the solution of L * x = b, dividing by the diagonal
+        * entry of each row. No singularity check: if a row has no stored diagonal entry,
+        * diagonal_entry stays 0 and the division below yields Inf/NaN.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_inplace_solve(SizeTypeArray const & row_buffer,
+                               SizeTypeArray const & col_buffer,
+                               ConstScalarTypeArray const & element_buffer,
+                               ScalarTypeArray & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::lower_tag)
+        {
+          vcl_size_t row_begin = row_buffer[0];
+          for (vcl_size_t row = 0; row < num_cols; ++row)
+          {
+            NumericT vec_entry = vec_buffer[row];
+
+            // substitute and remember diagonal entry
+            vcl_size_t row_end = row_buffer[row+1];
+            NumericT diagonal_entry = 0;
+            for (vcl_size_t i = row_begin; i < row_end; ++i)
+            {
+              vcl_size_t col_index = col_buffer[i];
+              if (col_index < row)
+                vec_entry -= vec_buffer[col_index] * element_buffer[i];
+              else if (col_index == row)
+                diagonal_entry = element_buffer[i];
+            }
+
+            vec_buffer[row] = vec_entry / diagonal_entry;
+            row_begin = row_end;
+          }
+        }
+
+
+        /** @brief In-place backward substitution for a unit upper triangular CSR matrix.
+        *
+        * Overwrites vec_buffer with the solution of U * x = b. Rows are processed from the
+        * bottom up (row2 counts forward, row runs backward); the last row is skipped since
+        * the implicit unit diagonal leaves it unchanged.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_inplace_solve(SizeTypeArray const & row_buffer,
+                               SizeTypeArray const & col_buffer,
+                               ConstScalarTypeArray const & element_buffer,
+                               ScalarTypeArray & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::unit_upper_tag)
+        {
+          for (vcl_size_t row2 = 1; row2 < num_cols; ++row2)
+          {
+            vcl_size_t row = (num_cols - row2) - 1;  // reverse traversal with an unsigned index
+            NumericT vec_entry = vec_buffer[row];
+            vcl_size_t row_begin = row_buffer[row];
+            vcl_size_t row_end   = row_buffer[row+1];
+            for (vcl_size_t i = row_begin; i < row_end; ++i)
+            {
+              vcl_size_t col_index = col_buffer[i];
+              if (col_index > row)  // strictly upper part only
+                vec_entry -= vec_buffer[col_index] * element_buffer[i];
+            }
+            vec_buffer[row] = vec_entry;
+          }
+        }
+
+        /** @brief In-place backward substitution for an upper triangular CSR matrix.
+        *
+        * Overwrites vec_buffer with the solution of U * x = b, dividing by the diagonal
+        * entry of each row. No singularity check: a missing diagonal entry leaves
+        * diagonal_entry at 0 and the division yields Inf/NaN.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_inplace_solve(SizeTypeArray const & row_buffer,
+                               SizeTypeArray const & col_buffer,
+                               ConstScalarTypeArray const & element_buffer,
+                               ScalarTypeArray & vec_buffer,
+                               vcl_size_t num_cols,
+                               viennacl::linalg::upper_tag)
+        {
+          for (vcl_size_t row2 = 0; row2 < num_cols; ++row2)
+          {
+            vcl_size_t row = (num_cols - row2) - 1;  // reverse traversal with an unsigned index
+            NumericT vec_entry = vec_buffer[row];
+
+            // substitute and remember diagonal entry
+            vcl_size_t row_begin = row_buffer[row];
+            vcl_size_t row_end   = row_buffer[row+1];
+            NumericT diagonal_entry = 0;
+            for (vcl_size_t i = row_begin; i < row_end; ++i)
+            {
+              vcl_size_t col_index = col_buffer[i];
+              if (col_index > row)
+                vec_entry -= vec_buffer[col_index] * element_buffer[i];
+              else if (col_index == row)
+                diagonal_entry = element_buffer[i];
+            }
+
+            vec_buffer[row] = vec_entry / diagonal_entry;
+          }
+        }
+
+      } //namespace detail
+
+
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * Extracts the raw CSR arrays of L and forwards to the tag-dispatched substitution kernel.
+      *
+      * @param L    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      * @param tag  The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & L,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::unit_lower_tag tag)
+      {
+        unsigned int const * row_offsets = detail::extract_raw_pointer<unsigned int>(L.handle1());
+        unsigned int const * col_indices = detail::extract_raw_pointer<unsigned int>(L.handle2());
+        ScalarType   const * values      = detail::extract_raw_pointer<ScalarType>(L.handle());
+        ScalarType         * rhs         = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+        detail::csr_inplace_solve<ScalarType>(row_offsets, col_indices, values, rhs, L.size2(), tag);
+      }
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * Extracts the raw CSR arrays of L and forwards to the tag-dispatched substitution kernel.
+      *
+      * @param L    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      * @param tag  The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & L,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::lower_tag tag)
+      {
+        unsigned int const * row_offsets = detail::extract_raw_pointer<unsigned int>(L.handle1());
+        unsigned int const * col_indices = detail::extract_raw_pointer<unsigned int>(L.handle2());
+        ScalarType   const * values      = detail::extract_raw_pointer<ScalarType>(L.handle());
+        ScalarType         * rhs         = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+        detail::csr_inplace_solve<ScalarType>(row_offsets, col_indices, values, rhs, L.size2(), tag);
+      }
+
+
+      /** @brief Inplace solution of a upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * Extracts the raw CSR arrays of U and forwards to the tag-dispatched substitution kernel.
+      *
+      * @param U    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      * @param tag  The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & U,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::unit_upper_tag tag)
+      {
+        unsigned int const * row_offsets = detail::extract_raw_pointer<unsigned int>(U.handle1());
+        unsigned int const * col_indices = detail::extract_raw_pointer<unsigned int>(U.handle2());
+        ScalarType   const * values      = detail::extract_raw_pointer<ScalarType>(U.handle());
+        ScalarType         * rhs         = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+        detail::csr_inplace_solve<ScalarType>(row_offsets, col_indices, values, rhs, U.size2(), tag);
+      }
+
+      /** @brief Inplace solution of a upper triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * Extracts the raw CSR arrays of U and forwards to the tag-dispatched substitution kernel.
+      *
+      * @param U    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      * @param tag  The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & U,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::upper_tag tag)
+      {
+        unsigned int const * row_offsets = detail::extract_raw_pointer<unsigned int>(U.handle1());
+        unsigned int const * col_indices = detail::extract_raw_pointer<unsigned int>(U.handle2());
+        ScalarType   const * values      = detail::extract_raw_pointer<ScalarType>(U.handle());
+        ScalarType         * rhs         = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+        detail::csr_inplace_solve<ScalarType>(row_offsets, col_indices, values, rhs, U.size2(), tag);
+      }
+
+
+
+
+
+
+
+      //
+      // Triangular solve for compressed_matrix, A^T \ b
+      //
+
+      namespace detail
+      {
+        /** @brief In-place solve of trans(A) * x = b where trans(A) is unit lower triangular and A is stored as CSR.
+        *
+        * Traverses the stored CSR rows in order; each row acts as a column of the transposed
+        * system, so once x[col] is final its contribution is scattered to all later entries
+        * (row_index > col, i.e. the strictly lower part of the transpose). The implicit unit
+        * diagonal means no division is performed.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_trans_inplace_solve(SizeTypeArray const & row_buffer,
+                                     SizeTypeArray const & col_buffer,
+                                     ConstScalarTypeArray const & element_buffer,
+                                     ScalarTypeArray & vec_buffer,
+                                     vcl_size_t num_cols,
+                                     viennacl::linalg::unit_lower_tag)
+        {
+          vcl_size_t col_begin = row_buffer[0];
+          for (vcl_size_t col = 0; col < num_cols; ++col)
+          {
+            NumericT vec_entry = vec_buffer[col];
+            vcl_size_t col_end = row_buffer[col+1];
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              unsigned int row_index = col_buffer[i];
+              if (row_index > col)
+                vec_buffer[row_index] -= vec_entry * element_buffer[i];
+            }
+            col_begin = col_end;
+          }
+        }
+
+        /** @brief In-place solve of trans(A) * x = b where trans(A) is lower triangular and A is stored as CSR.
+        *
+        * Column-oriented substitution on the transpose: for each stored CSR row (a column of
+        * the transposed system) the diagonal entry is located first, the entry is scaled,
+        * and the result is scattered to all later entries (row_index > col). No singularity
+        * check: a missing diagonal entry leads to division by zero.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_trans_inplace_solve(SizeTypeArray const & row_buffer,
+                                     SizeTypeArray const & col_buffer,
+                                     ConstScalarTypeArray const & element_buffer,
+                                     ScalarTypeArray & vec_buffer,
+                                     vcl_size_t num_cols,
+                                     viennacl::linalg::lower_tag)
+        {
+          vcl_size_t col_begin = row_buffer[0];
+          for (vcl_size_t col = 0; col < num_cols; ++col)
+          {
+            vcl_size_t col_end = row_buffer[col+1];
+
+            // Stage 1: Find diagonal entry:
+            NumericT diagonal_entry = 0;
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index == col)
+              {
+                diagonal_entry = element_buffer[i];
+                break;
+              }
+            }
+
+            // Stage 2: Substitute
+            NumericT vec_entry = vec_buffer[col] / diagonal_entry;
+            vec_buffer[col] = vec_entry;
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index > col)
+                vec_buffer[row_index] -= vec_entry * element_buffer[i];
+            }
+            col_begin = col_end;
+          }
+        }
+
+        /** @brief In-place solve of trans(A) * x = b where trans(A) is unit upper triangular and A is stored as CSR.
+        *
+        * Traverses the stored CSR rows from last to first (col2 counts forward, col runs
+        * backward); each row acts as a column of the transposed system, scattering the
+        * finalized x[col] to earlier entries (row_index < col). The implicit unit diagonal
+        * means no division is performed.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_trans_inplace_solve(SizeTypeArray const & row_buffer,
+                                     SizeTypeArray const & col_buffer,
+                                     ConstScalarTypeArray const & element_buffer,
+                                     ScalarTypeArray & vec_buffer,
+                                     vcl_size_t num_cols,
+                                     viennacl::linalg::unit_upper_tag)
+        {
+          for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
+          {
+            vcl_size_t col = (num_cols - col2) - 1;
+
+            NumericT vec_entry = vec_buffer[col];
+            vcl_size_t col_begin = row_buffer[col];
+            vcl_size_t col_end = row_buffer[col+1];
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index < col)
+                vec_buffer[row_index] -= vec_entry * element_buffer[i];
+            }
+
+          }
+        }
+
+        /** @brief In-place solve of trans(A) * x = b where trans(A) is upper triangular and A is stored as CSR.
+        *
+        * Processes the stored CSR rows from last to first: the diagonal entry is located,
+        * the entry is scaled, and the result is scattered to earlier entries
+        * (row_index < col). No singularity check: a missing diagonal entry leads to
+        * division by zero.
+        */
+        template <typename NumericT, typename ConstScalarTypeArray, typename ScalarTypeArray, typename SizeTypeArray>
+        void csr_trans_inplace_solve(SizeTypeArray const & row_buffer,
+                                     SizeTypeArray const & col_buffer,
+                                     ConstScalarTypeArray const & element_buffer,
+                                     ScalarTypeArray & vec_buffer,
+                                     vcl_size_t num_cols,
+                                     viennacl::linalg::upper_tag)
+        {
+          for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
+          {
+            vcl_size_t col = (num_cols - col2) - 1;  // reverse traversal with an unsigned index
+            vcl_size_t col_begin = row_buffer[col];
+            vcl_size_t col_end = row_buffer[col+1];
+
+            // Stage 1: Find diagonal entry:
+            NumericT diagonal_entry = 0;
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index == col)
+              {
+                diagonal_entry = element_buffer[i];
+                break;
+              }
+            }
+
+            // Stage 2: Substitute
+            NumericT vec_entry = vec_buffer[col] / diagonal_entry;
+            vec_buffer[col] = vec_entry;
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index < col)
+                vec_buffer[row_index] -= vec_entry * element_buffer[i];
+            }
+          }
+        }
+
+
+        //
+        // block solves
+        //
+        /** @brief Solves trans(L) * x = b in-place for unit lower triangular trans(L), ignoring the block decomposition.
+        *
+        * block_indices/num_blocks and L_diagonal are accepted for interface compatibility but
+        * unused: the substitution simply runs over the whole matrix sequentially.
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & L,
+                                 viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
+                                 vector_base<ScalarType> const & /* L_diagonal */,  //ignored
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::unit_lower_tag)
+        {
+          // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L.lhs().handle());
+          ScalarType         * vec_buffer = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+          // Column-oriented scatter on the transpose, same scheme as csr_trans_inplace_solve(unit_lower_tag).
+          vcl_size_t col_begin = row_buffer[0];
+          for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
+          {
+            ScalarType vec_entry = vec_buffer[col];
+            vcl_size_t col_end = row_buffer[col+1];
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              unsigned int row_index = col_buffer[i];
+              if (row_index > col)
+                vec_buffer[row_index] -= vec_entry * elements[i];
+            }
+            col_begin = col_end;
+          }
+        }
+
+        /** @brief Solves trans(L) * x = b in-place for lower triangular trans(L), ignoring the block decomposition.
+        *
+        * block_indices/num_blocks are accepted for interface compatibility but unused.
+        * L_diagonal supplies the precomputed diagonal entries, so no per-column diagonal
+        * search of the CSR data is needed.
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & L,
+                                 viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
+                                 vector_base<ScalarType> const & L_diagonal,
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::lower_tag)
+        {
+          // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L.lhs().handle());
+          ScalarType   const * diagonal_buffer = detail::extract_raw_pointer<ScalarType>(L_diagonal.handle());
+          ScalarType         * vec_buffer = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+          vcl_size_t col_begin = row_buffer[0];
+          for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
+          {
+            vcl_size_t col_end = row_buffer[col+1];
+
+            // Scale by the externally supplied diagonal, then scatter to later entries.
+            ScalarType vec_entry = vec_buffer[col] / diagonal_buffer[col];
+            vec_buffer[col] = vec_entry;
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index > col)
+                vec_buffer[row_index] -= vec_entry * elements[i];
+            }
+            col_begin = col_end;
+          }
+        }
+
+
+
+        /** @brief Solves trans(U) * x = b in-place for unit upper triangular trans(U), ignoring the block decomposition.
+        *
+        * block_indices/num_blocks and U_diagonal are accepted for interface compatibility but
+        * unused: the substitution runs backward over the whole matrix sequentially.
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & U,
+                                 viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
+                                 vector_base<ScalarType> const & /* U_diagonal */, //ignored
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::unit_upper_tag)
+        {
+          // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(U.lhs().handle());
+          ScalarType         * vec_buffer = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+          for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
+          {
+            vcl_size_t col = (U.lhs().size1() - col2) - 1;  // reverse traversal with an unsigned index
+
+            ScalarType vec_entry = vec_buffer[col];
+            vcl_size_t col_begin = row_buffer[col];
+            vcl_size_t col_end = row_buffer[col+1];
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index < col)
+                vec_buffer[row_index] -= vec_entry * elements[i];
+            }
+
+          }
+        }
+
+        /** @brief Solves trans(U) * x = b in-place for upper triangular trans(U), ignoring the block decomposition.
+        *
+        * block_indices/num_blocks are accepted for interface compatibility but unused.
+        * U_diagonal supplies the precomputed diagonal entries, so no per-column diagonal
+        * search of the CSR data is needed.
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & U,
+                                 viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
+                                 vector_base<ScalarType> const & U_diagonal,
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::upper_tag)
+        {
+          // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(U.lhs().handle());
+          ScalarType   const * diagonal_buffer = detail::extract_raw_pointer<ScalarType>(U_diagonal.handle());
+          ScalarType         * vec_buffer = detail::extract_raw_pointer<ScalarType>(vec.handle());
+
+          for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
+          {
+            vcl_size_t col = (U.lhs().size1() - col2) - 1;  // reverse traversal with an unsigned index
+            vcl_size_t col_begin = row_buffer[col];
+            vcl_size_t col_end = row_buffer[col+1];
+
+            // Stage 2: Substitute
+            ScalarType vec_entry = vec_buffer[col] / diagonal_buffer[col];
+            vec_buffer[col] = vec_entry;
+            for (vcl_size_t i = col_begin; i < col_end; ++i)
+            {
+              vcl_size_t row_index = col_buffer[i];
+              if (row_index < col)
+                vec_buffer[row_index] -= vec_entry * elements[i];
+            }
+          }
+        }
+
+
+      } //namespace detail
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * @param proxy  Proxy object for a transposed CSR-matrix
+      * @param vec    The right hand side vector
+      * @param tag    The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::unit_lower_tag tag)
+      {
+        ScalarType         * vec_buf    = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements   = detail::extract_raw_pointer<ScalarType>(proxy.lhs().handle());
+        unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+        unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+        detail::csr_trans_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+      }
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * Host backend: extracts the raw CSR arrays and delegates the transposed
+      * (column-oriented) elimination to detail::csr_trans_inplace_solve().
+      *
+      * @param proxy  Proxy object for a transposed CSR-matrix
+      * @param vec    The right hand side vector (overwritten in place with the solution)
+      * @param tag    The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::lower_tag tag)
+      {
+        // Raw host pointers: handle() -> nonzero values, handle1() -> row offsets, handle2() -> column indices
+        ScalarType         * vec_buf    = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements   = detail::extract_raw_pointer<ScalarType>(proxy.lhs().handle());
+        unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+        unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+        detail::csr_trans_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+      }
+
+
+      /** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * Host backend: extracts the raw CSR arrays and delegates the transposed
+      * (column-oriented) elimination to detail::csr_trans_inplace_solve().
+      *
+      * @param proxy  Proxy object for a transposed CSR-matrix
+      * @param vec    The right hand side vector (overwritten in place with the solution)
+      * @param tag    The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::unit_upper_tag tag)
+      {
+        // Raw host pointers: handle() -> nonzero values, handle1() -> row offsets, handle2() -> column indices
+        ScalarType         * vec_buf    = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements   = detail::extract_raw_pointer<ScalarType>(proxy.lhs().handle());
+        unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+        unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+        detail::csr_trans_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+      }
+
+
+      /** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * Host backend: extracts the raw CSR arrays and delegates the transposed
+      * (column-oriented) elimination to detail::csr_trans_inplace_solve().
+      * (Doc fix: this overload takes viennacl::linalg::upper_tag, i.e. the
+      * non-unit-diagonal case; the previous comment wrongly said "unit diagonal".)
+      *
+      * @param proxy  Proxy object for a transposed CSR-matrix
+      * @param vec    The right hand side vector (overwritten in place with the solution)
+      * @param tag    The solver tag identifying the respective triangular solver
+      */
+      template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy,
+                         vector_base<ScalarType> & vec,
+                         viennacl::linalg::upper_tag tag)
+      {
+        // Raw host pointers: handle() -> nonzero values, handle1() -> row offsets, handle2() -> column indices
+        ScalarType         * vec_buf    = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements   = detail::extract_raw_pointer<ScalarType>(proxy.lhs().handle());
+        unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+        unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+        detail::csr_trans_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+      }
+
+
+
+      //
+      // Compressed Compressed Matrix
+      //
+
+      /** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      * (Doc fix: this overload is for compressed_compressed_matrix, which stores
+      * only nonzero rows; handle3() holds the indices of those rows.)
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType>
+      void prod_impl(const viennacl::compressed_compressed_matrix<ScalarType> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        ScalarType         * result_buf  = detail::extract_raw_pointer<ScalarType>(result.handle());
+        ScalarType   const * vec_buf     = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements    = detail::extract_raw_pointer<ScalarType>(mat.handle());
+        unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+        unsigned int const * row_indices = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+        unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+        // Zero the whole result first: the loop below only writes rows that carry nonzeros.
+        vector_assign(result, ScalarType(0));
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+        // i enumerates the stored (nonzero) rows; row_indices[i] maps back to the actual row index.
+        for (long i = 0; i < static_cast<long>(mat.nnz1()); ++i)
+        {
+          ScalarType dot_prod = 0;
+          vcl_size_t row_end = row_buffer[i+1];
+          for (vcl_size_t j = row_buffer[i]; j < row_end; ++j)
+            dot_prod += elements[j] * vec_buf[col_buffer[j] * vec.stride() + vec.start()];
+          result_buf[row_indices[i] * result.stride() + result.start()] = dot_prod;
+        }
+
+      }
+
+
+
+      //
+      // Coordinate Matrix
+      //
+
+      namespace detail
+      {
+        /** @brief Computes per-row information (inf-/1-/2-norm or diagonal entry) of a
+        *          coordinate_matrix and writes it into @p vec.
+        *
+        * Walks the COO entry list once, accumulating a value for the current row and
+        * flushing it whenever the row index changes.
+        *
+        * NOTE(review): the flush logic relies on entries being grouped by row
+        * (row index only increasing); rows with no entries are never written, so
+        * the caller presumably pre-initializes @p vec — confirm against callers.
+        *
+        * @param mat            The COO matrix (handle12() interleaves row/col indices)
+        * @param vec            Output vector receiving one value per row
+        * @param info_selector  Which quantity to compute per row
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void row_info(coordinate_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
+                      vector_base<ScalarType> & vec,
+                      viennacl::linalg::detail::row_info_types info_selector)
+        {
+          ScalarType         * result_buf   = detail::extract_raw_pointer<ScalarType>(vec.handle());
+          ScalarType   const * elements     = detail::extract_raw_pointer<ScalarType>(mat.handle());
+          unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());
+
+          ScalarType value = 0;       // accumulator for the row currently being processed
+          unsigned int last_row = 0;
+
+          for (vcl_size_t i = 0; i < mat.nnz(); ++i)
+          {
+            unsigned int current_row = coord_buffer[2*i];   // even slots: row index, odd slots: column index
+
+            if (current_row != last_row)
+            {
+              // Row changed: finalize (sqrt for 2-norm) and flush the previous row's value.
+              if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
+                value = std::sqrt(value);
+
+              result_buf[last_row] = value;
+              value = 0;
+              last_row = current_row;
+            }
+
+            switch (info_selector)
+            {
+              case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
+                value = std::max<ScalarType>(value, std::fabs(elements[i]));
+                break;
+
+              case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
+                value += std::fabs(elements[i]);
+                break;
+
+              case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
+                value += elements[i] * elements[i];
+                break;
+
+              case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
+                if (coord_buffer[2*i+1] == current_row)
+                  value = elements[i];
+                break;
+
+              default:
+                break;
+            }
+          }
+
+          // Flush the final row (the loop above only flushes on a row change).
+          if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
+            value = std::sqrt(value);
+
+          result_buf[last_row] = value;
+        }
+      }
+
+      /** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      * Zero-initializes result, then scatter-accumulates one product per stored
+      * nonzero (sequential; no OpenMP here because different entries may target
+      * the same result row).
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        ScalarType         * result_buf   = detail::extract_raw_pointer<ScalarType>(result.handle());
+        ScalarType   const * vec_buf      = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements     = detail::extract_raw_pointer<ScalarType>(mat.handle());
+        unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());
+
+        // Zero result (respecting start/stride of the possibly strided vector view).
+        for (vcl_size_t i = 0; i< result.size(); ++i)
+          result_buf[i * result.stride() + result.start()] = 0;
+
+        // coord_buffer[2*i] is the row, coord_buffer[2*i+1] the column of nonzero i.
+        for (vcl_size_t i = 0; i < mat.nnz(); ++i)
+          result_buf[coord_buffer[2*i] * result.stride() + result.start()]
+            += elements[i] * vec_buf[coord_buffer[2*i+1] * vec.stride() + vec.start()];
+      }
+
+      /** @brief Carries out Compressed Matrix(COO)-Dense Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      * The result is zero-filled first because the accumulation loops are reordered
+      * relative to a textbook triple loop.
+      *
+      * @param sp_mat     The Sparse Matrix (Coordinate format)
+      * @param d_mat      The Dense Matrix
+      * @param result     The Result Matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
+      void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        ScalarType   const * sp_mat_elements     = detail::extract_raw_pointer<ScalarType>(sp_mat.handle());
+        unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());
+
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        // Layout descriptors (offsets, strides, padded sizes) for the dense operands.
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        if ( detail::is_row_major(typename F1::orientation_category()) ) {
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+                for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+                  result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // NOTE(review): several nonzeros i can share the same row r, so with OpenMP
+          // enabled the += on result_wrapper(r, col) below may race — confirm upstream.
+          for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+            NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+            unsigned int r = sp_mat_coords[2*i];
+            unsigned int c = sp_mat_coords[2*i+1];
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+              NumericT y = d_mat_wrapper( c, col);
+              result_wrapper(r, col) += x * y;
+            }
+          }
+        }
+
+        else {
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+            for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+                result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // Parallelizing over result columns: each thread owns a full column, so the
+          // += accumulation below is race-free in this branch.
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+
+            for (vcl_size_t i = 0; i < sp_mat.nnz(); ++i) {
+
+              NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+              unsigned int r = sp_mat_coords[2*i];
+              unsigned int c = sp_mat_coords[2*i+1];
+              NumericT y = d_mat_wrapper( c, col);
+
+              result_wrapper( r, col) += x*y;
+            }
+
+          }
+        }
+
+      }
+
+
+      /** @brief Carries out Compressed Matrix(COO)-Dense Transposed Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      * d_mat is accessed through its lhs() (the untransposed matrix); the transpose is
+      * realized by swapping the index order in d_mat_wrapper(col, c) below.
+      *
+      * @param sp_mat     The Sparse Matrix (Coordinate format)
+      * @param d_mat      The Dense Transposed Matrix
+      * @param result     The Result Matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
+      void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        ScalarType   const * sp_mat_elements     = detail::extract_raw_pointer<ScalarType>(sp_mat.handle());
+        unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());
+
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        // Layout descriptors of the *untransposed* dense operand.
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        // Zero-fill result; iteration order is chosen to match the dense operand's layout.
+        if ( detail::is_row_major(typename F1::orientation_category()) ) {
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+              result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+        }
+        else {
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+            for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+              result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+        }
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+        // NOTE(review): several nonzeros i can share the same row r, so with OpenMP
+        // enabled the += on result_wrapper(r, col) below may race — confirm upstream.
+        for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+          NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+          unsigned int r = sp_mat_coords[2*i];
+          unsigned int c = sp_mat_coords[2*i+1];
+          for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+            NumericT y = d_mat_wrapper( col, c);   // swapped indices realize trans(d_mat)
+            result_wrapper(r, col) += x * y;
+          }
+        }
+
+      }
+      //
+      // ELL Matrix
+      //
+      /** @brief Carries out matrix-vector multiplication with a ell_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      * The ELL format stores internal_maxnnz() entries per row (zero-padded);
+      * padding entries are skipped via the val != 0 check.
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        ScalarType         * result_buf   = detail::extract_raw_pointer<ScalarType>(result.handle());
+        ScalarType   const * vec_buf      = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements     = detail::extract_raw_pointer<ScalarType>(mat.handle());
+        unsigned int const * coords       = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+        for(vcl_size_t row = 0; row < mat.size1(); ++row)
+        {
+          ScalarType sum = 0;
+
+          for(unsigned int item_id = 0; item_id < mat.internal_maxnnz(); ++item_id)
+          {
+            // Column-major ELL storage: entry item_id of this row lives at row + item_id * internal_size1().
+            vcl_size_t offset = row + item_id * mat.internal_size1();
+            ScalarType val = elements[offset];
+
+            // Skip padding (explicit zeros contribute nothing to the sum anyway).
+            if(val != 0)
+            {
+              unsigned int col = coords[offset];
+              sum += (vec_buf[col * vec.stride() + vec.start()] * val);
+            }
+          }
+
+          result_buf[row * result.stride() + result.start()] = sum;
+        }
+      }
+
+      /** @brief Carries out ell_matrix-d_matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      * The result is zero-filled first because the accumulation loops are reordered.
+      *
+      * @param sp_mat     The sparse(ELL) matrix
+      * @param d_mat      The dense matrix
+      * @param result     The result dense matrix
+      */
+      template<class ScalarType, typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        ScalarType   const * sp_mat_elements     = detail::extract_raw_pointer<ScalarType>(sp_mat.handle());
+        unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        // Layout descriptors (offsets, strides, padded sizes) for the dense operands.
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        if ( detail::is_row_major(typename F1::orientation_category()) ) {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+                for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+                  result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // Parallelizing over rows: each thread owns one result row, so the += is race-free.
+          for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+          {
+            for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id)
+            {
+              // Column-major ELL storage: entry item_id of this row lives at row + item_id * internal_size1().
+              vcl_size_t offset = static_cast<vcl_size_t>(row) + item_id * sp_mat.internal_size1();
+              NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+              unsigned int sp_mat_col = sp_mat_coords[offset];
+
+              // Skip zero padding entries of the ELL format.
+              if( sp_mat_val != 0)
+              {
+                for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+                  result_wrapper(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper( sp_mat_col, col);
+              }
+            }
+          }
+        }
+        else {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+            for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+                result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // Parallelizing over result columns: each thread owns one column, so the += is race-free.
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+
+            for(unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
+
+              for(vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+                vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+                NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+                unsigned int sp_mat_col = sp_mat_coords[offset];
+
+                if( sp_mat_val != 0) {
+
+                  result_wrapper( row, col) += sp_mat_val * d_mat_wrapper( sp_mat_col, col);
+                }
+              }
+            }
+          }
+        }
+
+      }
+
+      /** @brief Carries out matrix-trans(matrix) multiplication first matrix being sparse ell
+      *          and the second dense transposed
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      * The dense operand is accessed via d_mat.lhs() with swapped indices to realize
+      * the transpose.
+      *
+      * @param sp_mat             The sparse matrix
+      * @param d_mat              The transposed dense matrix
+      * @param result             The result matrix
+      */
+      template<class ScalarType, typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        ScalarType   const * sp_mat_elements     = detail::extract_raw_pointer<ScalarType>(sp_mat.handle());
+        unsigned int const * sp_mat_coords       = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        // Layout descriptors of the *untransposed* dense operand.
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        if ( detail::is_row_major(typename F1::orientation_category()) ) {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for(long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+              result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+
+          // NOTE(review): unlike the column-major branch below, this product loop carries
+          // no OpenMP pragma and runs sequentially — confirm whether that is intentional.
+          for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+
+            for(unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
+
+              for(vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+                // Column-major ELL storage: entry item_id of this row lives at row + item_id * internal_size1().
+                vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+                NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+                unsigned int sp_mat_col = sp_mat_coords[offset];
+
+                // Skip zero padding entries of the ELL format.
+                if( sp_mat_val != 0) {
+
+                  result_wrapper( row, col) += sp_mat_val * d_mat_wrapper( col, sp_mat_col);  // swapped indices realize trans(d_mat)
+                }
+              }
+            }
+          }
+        }
+        else {
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+            for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+              result_wrapper( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for
+#endif
+          // NOTE(review): parallelizing over item_id means different iterations can hit
+          // the same (row, col) of result via += — potential race with OpenMP; confirm upstream.
+          for(long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id) {
+
+            for(vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+              vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+              NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+              unsigned int sp_mat_col = sp_mat_coords[offset];
+
+              if( sp_mat_val != 0) {
+
+                for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+
+                  result_wrapper( row, col) += sp_mat_val * d_mat_wrapper( col, sp_mat_col);
+                }
+              }
+            }
+          }
+        }
+
+      }
+
+      //
+      // Hybrid Matrix
+      //
+      /** @brief Carries out matrix-vector multiplication with a hyb_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      * The HYB format keeps a regular ELL part (handle()/handle2()) plus a CSR
+      * remainder (handle3()/handle4()/handle5()) for rows exceeding the ELL width;
+      * both contributions are summed per row.
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class ScalarType, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::hyb_matrix<ScalarType, ALIGNMENT> & mat,
+                     const viennacl::vector_base<ScalarType> & vec,
+                           viennacl::vector_base<ScalarType> & result)
+      {
+        ScalarType         * result_buf     = detail::extract_raw_pointer<ScalarType>(result.handle());
+        ScalarType   const * vec_buf        = detail::extract_raw_pointer<ScalarType>(vec.handle());
+        ScalarType   const * elements       = detail::extract_raw_pointer<ScalarType>(mat.handle());
+        unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+        ScalarType   const * csr_elements   = detail::extract_raw_pointer<ScalarType>(mat.handle5());
+        unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+        unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+        for(vcl_size_t row = 0; row < mat.size1(); ++row)
+        {
+          ScalarType sum = 0;
+
+          //
+          // Part 1: Process ELL part
+          //
+          for(unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+          {
+            // Column-major ELL storage: entry item_id of this row lives at row + item_id * internal_size1().
+            vcl_size_t offset = row + item_id * mat.internal_size1();
+            ScalarType val = elements[offset];
+
+            // Skip zero padding entries of the ELL part.
+            if(val != 0)
+            {
+              unsigned int col = coords[offset];
+              sum += (vec_buf[col * vec.stride() + vec.start()] * val);
+            }
+          }
+
+          //
+          // Part 2: Process HYB part (the CSR-format remainder of this row)
+          //
+          vcl_size_t col_begin = csr_row_buffer[row];
+          vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+          for(vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+          {
+              sum += (vec_buf[csr_col_buffer[item_id] * vec.stride() + vec.start()] * csr_elements[item_id]);
+          }
+
+          result_buf[row * result.stride() + result.start()] = sum;
+        }
+
+      }
+
+      //
+      // Hybrid Matrix
+      //
+      /** @brief Carries out sparse-matrix-dense-matrix multiplication with a hyb_matrix
+      *
+      * Implementation of the convenience expression C = prod(A, B);
+      *
+      * @param mat    The sparse matrix A
+      * @param d_mat  The dense matrix B
+      * @param result The dense result matrix C
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat);
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat);
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
+        unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+        NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
+        unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+        unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+        for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
+        {
+          for(vcl_size_t row = 0; row < mat.size1(); ++row)
+          {
+            NumericT sum = 0;
+
+            //
+            // Part 1: Process ELL part
+            //
+            for(unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+            {
+              vcl_size_t offset = row + item_id * mat.internal_size1();
+              NumericT val = elements[offset];
+
+              if(val != 0)
+              {
+                unsigned int col = coords[offset];
+                sum += d_mat_wrapper(col, result_col) * val;
+              }
+            }
+
+            //
+            // Part 2: Process HYB/CSR part
+            //
+            vcl_size_t col_begin = csr_row_buffer[row];
+            vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+            for(vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+              sum += d_mat_wrapper(csr_col_buffer[item_id], result_col) * csr_elements[item_id];
+
+            result_wrapper(row, result_col) = sum;
+          }
+        } // for result_col
+      }
+
+
+      /** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication with a hyb_matrix
+      *
+      * Implementation of the convenience expression C = prod(A, trans(B));
+      *
+      * @param mat    The sparse matrix A
+      * @param d_mat  The dense matrix B
+      * @param result The dense result matrix C
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+        NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+        vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+        vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+        vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
+        vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
+        vcl_size_t d_mat_internal_size1  = viennacl::traits::internal_size1(d_mat.lhs());
+        vcl_size_t d_mat_internal_size2  = viennacl::traits::internal_size2(d_mat.lhs());
+
+        vcl_size_t result_start1 = viennacl::traits::start1(result);
+        vcl_size_t result_start2 = viennacl::traits::start2(result);
+        vcl_size_t result_inc1   = viennacl::traits::stride1(result);
+        vcl_size_t result_inc2   = viennacl::traits::stride2(result);
+        vcl_size_t result_internal_size1  = viennacl::traits::internal_size1(result);
+        vcl_size_t result_internal_size2  = viennacl::traits::internal_size2(result);
+
+        detail::matrix_array_wrapper<NumericT const, typename F1::orientation_category, false>
+            d_mat_wrapper(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+        detail::matrix_array_wrapper<NumericT,       typename F2::orientation_category, false>
+            result_wrapper(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+        NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
+        unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+        NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
+        unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+        unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+        for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
+        {
+          for(vcl_size_t row = 0; row < mat.size1(); ++row)
+          {
+            NumericT sum = 0;
+
+            //
+            // Part 1: Process ELL part
+            //
+            for(unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+            {
+              vcl_size_t offset = row + item_id * mat.internal_size1();
+              NumericT val = elements[offset];
+
+              if(val != 0)
+              {
+                unsigned int col = coords[offset];
+                sum += d_mat_wrapper(result_col, col) * val;
+              }
+            }
+
+            //
+            // Part 2: Process HYB/CSR part
+            //
+            vcl_size_t col_begin = csr_row_buffer[row];
+            vcl_size_t col_end   = csr_row_buffer[row + 1];
+
+            for(vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+              sum += d_mat_wrapper(result_col, csr_col_buffer[item_id]) * csr_elements[item_id];
+
+            result_wrapper(row, result_col) = sum;
+          }
+        } // for result_col
+      }
+
+
+    } // namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/host_based/sse_blas.hpp b/viennacl/linalg/host_based/sse_blas.hpp
new file mode 100644
index 0000000..f3953cb
--- /dev/null
+++ b/viennacl/linalg/host_based/sse_blas.hpp
@@ -0,0 +1,1013 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/sse_blas.hpp
+*   @brief optimized BLAS functions using SSE2 and SSE3 intrinsic functions
+*
+*   Contributed by Alex Christensen.
+*/
+
+//complex BLAS functions are included but unused in this version of ViennaCL
+#if defined VIENNACL_WITH_COMPLEX
+#include <complex>
+#endif
+
+//defining VIENNACL_SSE3 adds a slight optimization for complex multiplication using SSE3
+#if defined VIENNACL_WITH_SSE3
+#include <pmmintrin.h>
+#elif defined VIENNACL_WITH_SSE2
+#include <emmintrin.h>
+#endif
+
+#include <cstddef>
+#include <cmath>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    namespace host_based
+    {
+      //saxpy, daxpy, caxpy, zaxpy
+      template <class T> inline void _axpy(const T*, T*, vcl_size_t, T);
+
+      //sdot, ddot, cdotu, zdotu
+      template <class T> inline T    _dot (vcl_size_t, const T*, const T*);
+
+      //sdot, ddot, cdotc, zdotc
+      template <class T> inline T    _dotc(vcl_size_t, const T*, const T*);
+
+      //sswap, dswap, cswap, zswap
+      template <class T> inline void _swap(vcl_size_t, T*, T*);
+
+      //scopy, dcopy, ccopy, zcopy
+      template <class T> inline void _copy(vcl_size_t, T*, T*);
+
+      //snrm2, dnrm2, Euclidean norm of complex vectors
+      template <class T> inline T    _nrm2(const T*, vcl_size_t);
+
+      namespace detail
+      {
+        template <class T> inline T conjIfComplex(T x){return x;}
+      }
+
+      template <class T>
+      inline void _axpy(const T* x, T* y, vcl_size_t n, T a)
+      {
+        for(vcl_size_t i=0;i<n;i++)
+          y[i]+=a*x[i];
+      }
+
+      template <class T>
+      inline T _dot(vcl_size_t n, const T* x, const T* y)
+      {
+        T sum(0);
+        for(vcl_size_t i=0;i<n;i++)
+          sum+=x[i]*y[i];
+        return sum;
+      }
+
+      template <class T>
+      inline T _dotc(vcl_size_t n, const T* x, const T* y)
+      {
+        T sum(0);
+        for(vcl_size_t i=0;i<n;i++)
+          sum+=detail::conjIfComplex(x[i])*y[i];
+        return sum;
+      }
+
+      template <class T>
+      inline void _swap(vcl_size_t n, T* sx, T* sy)
+      {
+        T t;
+        for(vcl_size_t i=0;i<n;i++)
+        {
+          t=sx[i];
+          sx[i]=sy[i];
+          sy[i]=t;
+        }
+      }
+
+      template <class T>
+      inline void _copy(vcl_size_t n, T* cx, T* cy)
+      {
+        for(vcl_size_t i=0;i<n;i++)
+          cx[i]=cy[i];
+      }
+
+      template <class T>
+      inline T _nrm2(const T* x, vcl_size_t n)
+      {
+        //based on http://www.netlib.org/blas/snrm2.f, but works with std::complex
+
+        if(n<1)
+          return T(0);
+        if(n==1)
+          return std::abs(x[0]);
+        T scale(0);
+        T scaledSquareSum(1);
+        for(vcl_size_t i=0;i<n;i++){
+          if(x[i]!=T(0)){
+            T absXi=std::abs(x[i]);
+            if(std::abs(x[i])>std::abs(scale)){
+              T temp=scale/absXi;
+              scaledSquareSum=T(1)+scaledSquareSum*temp*temp;
+              scale=absXi;
+            }
+            else{
+              T temp=absXi/scale;
+              scaledSquareSum+=temp*temp;
+            }
+          }
+        }
+        return scale*sqrt(scaledSquareSum);
+      }
+
+  #if defined VIENNACL_WITH_COMPLEX
+
+      namespace detail
+      {
+        template <> inline std::complex<double> conjIfComplex(std::complex<double> x){return conj(x);}
+        template <> inline std::complex<float > conjIfComplex(std::complex<float > x){return conj(x);}
+      }
+
+      template <>
+      inline std::complex<double> _nrm2(const std::complex<double>* x, vcl_size_t n)
+      {
+        //based on http://www.netlib.org/blas/snrm2.f
+
+        if(n<1)
+          return std::complex<double>(0);
+        if(n==1)
+          return std::abs(x[0]);
+        double scale=0.0;
+        double scaledSquareSum=1.0;
+        for(vcl_size_t i=0;i<n;i++){
+          if(x[i].real()!=0.0){
+            double absXi=std::abs(x[i].real());
+            if(absXi>scale){
+              double temp=scale/absXi;
+              scaledSquareSum=1.0+scaledSquareSum*temp*temp;
+              scale=absXi;
+            }
+            else{
+              double temp=absXi/scale;
+              scaledSquareSum+=temp*temp;
+            }
+          }
+          if(x[i].imag()!=0.0){
+            double absXi=std::abs(x[i].imag());
+            if(absXi>scale){
+              double temp=scale/absXi;
+              scaledSquareSum=1.0+scaledSquareSum*temp*temp;
+              scale=absXi;
+            }
+            else{
+              double temp=absXi/scale;
+              scaledSquareSum+=temp*temp;
+            }
+          }
+        }
+        return std::complex<double>(scale*sqrt(scaledSquareSum));
+      }
+
+      template <>
+      inline std::complex<float> _nrm2(const std::complex<float>* x, vcl_size_t n)
+      {
+        //based on http://www.netlib.org/blas/snrm2.f
+
+        if(n<1)
+          return std::complex<float>(0);
+        if(n==1)
+          return std::abs(x[0]);
+        float scale=0.0;
+        float scaledSquareSum=1.0;
+        for(vcl_size_t i=0;i<n;i++){
+          if(x[i].real()!=0.0){
+            float absXi=std::abs(x[i].real());
+            if(absXi>scale){
+              float temp=scale/absXi;
+              scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
+              scale=absXi;
+            }
+            else{
+              float temp=absXi/scale;
+              scaledSquareSum+=temp*temp;
+            }
+          }
+          if(x[i].imag()!=0.0){
+            float absXi=std::abs(x[i].imag());
+            if(absXi>scale){
+              float temp=scale/absXi;
+              scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
+              scale=absXi;
+            }
+            else{
+              float temp=absXi/scale;
+              scaledSquareSum+=temp*temp;
+            }
+          }
+        }
+        return std::complex<float>(scale*sqrt(scaledSquareSum));
+      }
+
+  #endif //defined VIENNACL_WITH_COMPLEX
+
+  #if defined VIENNACL_WITH_SSE2
+
+      //saxpy
+      template <>
+      inline void _axpy<float>(const float* x, float* y, vcl_size_t n, float a)
+      {
+
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(float)!=0)
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+        else
+        {
+          //process unaligned section of arrays
+          while(((vcl_size_t)x)%16)
+          {
+            if(n<=0)
+              return;
+            y[0]+=a*x[0];
+            x++;
+            y++;
+            n--;
+          }
+
+          __m128 sum0;
+          __m128 sum1;
+          __m128 reg0,reg1,reg2,reg3;
+          __m128 areg=_mm_set1_ps(a);
+          __m128 prod;
+
+          //add floats 8 at a time
+          while(n>=8){
+
+            //read floats into XMM registers (8 from each array)
+            reg0=_mm_load_ps(x+0);
+            reg1=_mm_load_ps(x+4);
+            reg2=_mm_load_ps(y+0);
+            reg3=_mm_load_ps(y+4);
+
+            //multiply floats by a and add to y
+            prod=_mm_mul_ps(reg0,areg);
+            sum0=_mm_add_ps(prod,reg2);
+            prod=_mm_mul_ps(reg1,areg);
+            sum1=_mm_add_ps(prod,reg3);
+
+            //put float sums into y
+            _mm_store_ps(y+0,sum0);
+            _mm_store_ps(y+4,sum1);
+
+            x+=8;
+            y+=8;
+            n-=8;
+          }
+
+          //add beyond the last multiple of 8
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+        }
+      }
+
+      //daxpy
+      template <>
+      inline void _axpy<double>(const double* x, double* y, vcl_size_t n, double a)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(double)!=0)
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+
+        else
+        {
+          //process unaligned section of arrays
+          while(((vcl_size_t)x)%16)
+          {
+            if(n<=0)
+              return;
+            y[0]+=a*x[0];
+            x++;
+            y++;
+            n--;
+          }
+
+          __m128d sum0;
+          __m128d sum1;
+          __m128d reg0,reg1,reg2,reg3;
+          __m128d areg=_mm_set1_pd(a);
+          __m128d prod;
+
+          //add doubles 4 at a time (NOTE(review): the guard below checks n>=8, so 4-7 remaining doubles fall through to the scalar tail loop)
+          while(n>=8){
+
+            //read doubles into XMM registers (4 from each array)
+            reg0=_mm_load_pd(x+0);
+            reg1=_mm_load_pd(x+2);
+            reg2=_mm_load_pd(y+0);
+            reg3=_mm_load_pd(y+2);
+
+            //multiply doubles by a and add to y
+            prod=_mm_mul_pd(reg0,areg);
+            sum0=_mm_add_pd(prod,reg2);
+            prod=_mm_mul_pd(reg1,areg);
+            sum1=_mm_add_pd(prod,reg3);
+
+            //store the double sums into y
+            _mm_store_pd(y+0,sum0);
+            _mm_store_pd(y+2,sum1);
+
+            x+=4;
+            y+=4;
+            n-=4;
+          }
+
+          //add beyond the last multiple of 4
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+        }
+      }
+
+      //sdot
+      template <>
+      inline float _dot<float>(vcl_size_t n, const float* x, const float* y)
+      {
+
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(float)!=0)
+        {
+          float sum=0;
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+          return sum;
+        }
+        else
+        {
+
+          //process unaligned section of array
+          float sum=0;
+          while(((vcl_size_t)x)%16)
+          {
+            if(n<=0)
+              return sum;
+            sum+=x[0]*y[0];
+            y++;
+            x++;
+            n--;
+          }
+
+          __m128 sumReg=_mm_setzero_ps();
+          __m128 reg0,reg1,reg2,reg3;
+
+          //add floats 8 at a time
+          while(n>=8)
+          {
+            //read floats into XMM registers (8 from each array)
+            reg0=_mm_load_ps(x+0);
+            reg1=_mm_load_ps(x+4);
+            reg2=_mm_load_ps(y+0);
+            reg3=_mm_load_ps(y+4);
+
+            //multiply floats together
+            reg0=_mm_mul_ps(reg0,reg2);
+            reg1=_mm_mul_ps(reg1,reg3);
+
+            //add to sums
+            sumReg=_mm_add_ps(sumReg,reg0);
+            sumReg=_mm_add_ps(sumReg,reg1);
+
+            x+=8;
+            y+=8;
+            n-=8;
+          }
+
+          //add beyond where the inner loop stopped
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+
+          //move the sums from the xmm registers to aligned memory on the stack
+          float sums[8];
+          float* pSums=(float*)((((vcl_size_t)sums)&(~15))+16);
+          _mm_store_ps(pSums,sumReg);
+
+          return sum+pSums[0]+pSums[1]+pSums[2]+pSums[3];
+        }
+      }
+
+      //ddot
+      template <>
+      inline double _dot(vcl_size_t n, const double* x, const double* y)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(double)!=0)
+        {
+          double sum=0;
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+          return sum;
+        }
+        else
+        {
+          //process unaligned section of array
+          double sum=0;
+          while(((vcl_size_t)x)%16)
+          {
+            if(n<=0)
+              return sum;
+            sum+=x[0]*y[0];
+            y++;
+            x++;
+            n--;
+          }
+
+          __m128d sum0=_mm_setzero_pd();
+          __m128d sum1=_mm_setzero_pd();
+          __m128d reg0,reg1,reg2,reg3;
+
+          //add doubles 4 at a time
+          while(n>=4)
+          {
+            //read doubles into XMM registers (4 from each array)
+            reg0=_mm_load_pd(x+0);
+            reg1=_mm_load_pd(x+2);
+            reg2=_mm_load_pd(y+0);
+            reg3=_mm_load_pd(y+2);
+
+            //multiply doubles together
+            reg0=_mm_mul_pd(reg0,reg2);
+            reg1=_mm_mul_pd(reg1,reg3);
+
+            //add to sums
+            sum0=_mm_add_pd(sum0,reg0);
+            sum1=_mm_add_pd(sum1,reg1);
+
+            x+=4;
+            y+=4;
+            n-=4;
+          }
+
+          //add beyond where the inner loop stopped
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+
+          //move the sums from the xmm registers to aligned memory on the stack
+          double sums[4];
+          double* pSums=(double*)((((vcl_size_t)sums)&(~15))+16);
+          sum0=_mm_add_pd(sum0,sum1);
+          _mm_store_pd(pSums,sum0);
+
+          return sum+pSums[0]+pSums[1];
+        }
+      }
+
+      //conjugated dot products are the same as non-conjugated dot products for real numbers
+      template <> inline float  _dotc<float >(vcl_size_t n, const float  *x, const float  *y){return _dot(n,x,y);}
+      template <> inline double _dotc<double>(vcl_size_t n, const double *x, const double *y){return _dot(n,x,y);}
+
+  #if defined VIENNACL_WITH_COMPLEX
+
+      //caxpy
+      template <>
+      inline void _axpy<std::complex<float> >(const std::complex<float>* x, std::complex<float>* y, vcl_size_t n, std::complex<float> a)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+
+        else
+        {
+          //process unaligned section of arrays
+          while(((vcl_size_t)x)%16)
+          {
+            if(n<=0)
+              return;
+            y[0]+=a*x[0];
+            x++;
+            y++;
+            n--;
+          }
+
+          __m128 reg0,reg1,reg2,reg3,reg4;
+          __m128 areg0=_mm_set_ps(a.imag(),a.real(),a.imag(),a.real());
+          __m128 areg1=_mm_set_ps(a.real(),a.imag(),a.real(),a.imag());
+  #ifndef VIENNACL_WITH_SSE3
+          __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
+  #endif
+
+          //add complex floats 4 at a time
+          while(n>=4)
+          {
+            //read floats into XMM registers (8 from each array)
+            reg0=_mm_load_ps((float*)(x+0));
+            reg1=_mm_load_ps((float*)(x+2));
+            reg2=_mm_load_ps((float*)(y+0));
+            reg3=_mm_load_ps((float*)(y+2));
+
+            //do complex multiplication and addition
+  #ifndef VIENNACL_WITH_SSE3
+            reg4=_mm_shuffle_ps(reg0,reg0,0xA0);
+            reg0=_mm_shuffle_ps(reg0,reg0,0xF5);
+            reg4=_mm_mul_ps(reg4,areg0);
+            reg0=_mm_mul_ps(reg0,areg1);
+            reg0=_mm_mul_ps(reg0,nreg);
+            reg0=_mm_add_ps(reg4,reg0);
+            reg0=_mm_add_ps(reg0,reg2);
+            reg4=_mm_shuffle_ps(reg1,reg1,0xA0);
+            reg1=_mm_shuffle_ps(reg1,reg1,0xF5);
+            reg4=_mm_mul_ps(reg4,areg0);
+            reg1=_mm_mul_ps(reg1,areg1);
+            reg1=_mm_mul_ps(reg1,nreg);
+            reg1=_mm_add_ps(reg4,reg1);
+            reg1=_mm_add_ps(reg1,reg3);
+  #else
+            reg4=_mm_moveldup_ps(reg0);
+            reg0=_mm_movehdup_ps(reg0);
+            reg4=_mm_mul_ps(reg4,areg0);
+            reg0=_mm_mul_ps(reg0,areg1);
+            reg0=_mm_addsub_ps(reg4,reg0);
+            reg0=_mm_add_ps(reg0,reg2);
+            reg4=_mm_moveldup_ps(reg1);
+            reg1=_mm_movehdup_ps(reg1);
+            reg4=_mm_mul_ps(reg4,areg0);
+            reg1=_mm_mul_ps(reg1,areg1);
+            reg1=_mm_addsub_ps(reg4,reg1);
+            reg1=_mm_add_ps(reg1,reg3);
+  #endif
+            //put results into y
+            _mm_store_ps((float*)(y+0),reg0);
+            _mm_store_ps((float*)(y+2),reg1);
+
+            x+=4;
+            y+=4;
+            n-=4;
+          }
+
+          //add beyond the last multiple of 4
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+        }
+      }
+
+      //zaxpy
+      template <>
+      inline void _axpy<std::complex<double> >(const std::complex<double>* x, std::complex<double>* y, vcl_size_t n, std::complex<double> a)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
+          for(vcl_size_t i=0;i<n;i++)
+            y[i]+=a*x[i];
+
+        else
+        {
+          __m128d reg0,reg1,reg2,reg3,reg4;
+          __m128d areg0=_mm_set_pd(a.imag(),a.real());
+          __m128d areg1=_mm_set_pd(a.real(),a.imag());
+  #ifndef VIENNACL_WITH_SSE3
+          __m128d nreg=_mm_set_pd(1.0,-1.0);
+  #endif
+
+          //add complex doubles 2 at a time
+          while(n>=2)
+          {
+            //read doubles into XMM registers (4 from each array)
+            reg0=_mm_load_pd((double*)(x+0));
+            reg1=_mm_load_pd((double*)(x+1));
+            reg2=_mm_load_pd((double*)(y+0));
+            reg3=_mm_load_pd((double*)(y+1));
+
+            //do complex multiplication and addition
+  #ifndef VIENNACL_WITH_SSE3
+            reg4=_mm_shuffle_pd(reg0,reg0,0x0);
+            reg0=_mm_shuffle_pd(reg0,reg0,0x3);
+            reg4=_mm_mul_pd(reg4,areg0);
+            reg0=_mm_mul_pd(reg0,areg1);
+            reg0=_mm_mul_pd(reg0,nreg);
+            reg0=_mm_add_pd(reg4,reg0);
+            reg0=_mm_add_pd(reg0,reg2);
+            reg4=_mm_shuffle_pd(reg1,reg1,0x0);
+            reg1=_mm_shuffle_pd(reg1,reg1,0x3);
+            reg4=_mm_mul_pd(reg4,areg0);
+            reg1=_mm_mul_pd(reg1,areg1);
+            reg1=_mm_mul_pd(reg1,nreg);
+            reg1=_mm_add_pd(reg4,reg1);
+            reg1=_mm_add_pd(reg1,reg3);
+  #else
+            reg4=_mm_shuffle_pd(reg0,reg0,0x0);
+            reg0=_mm_shuffle_pd(reg0,reg0,0x3);
+            reg4=_mm_mul_pd(reg4,areg0);
+            reg0=_mm_mul_pd(reg0,areg1);
+            reg0=_mm_addsub_pd(reg4,reg0);
+            reg0=_mm_add_pd(reg0,reg2);
+            reg4=_mm_shuffle_pd(reg1,reg1,0x0);
+            reg1=_mm_shuffle_pd(reg1,reg1,0x3);
+            reg4=_mm_mul_pd(reg4,areg0);
+            reg1=_mm_mul_pd(reg1,areg1);
+            reg1=_mm_addsub_pd(reg4,reg1);
+            reg1=_mm_add_pd(reg1,reg3);
+  #endif
+            //put results into y
+            _mm_store_pd((double*)(y+0),reg0);
+            _mm_store_pd((double*)(y+1),reg1);
+
+            x+=2;
+            y+=2;
+            n-=2;
+          }
+
+          //add beyond the last multiple of 2
+          if(n)
+            y[0]+=a*x[0];
+        }
+      }
+
+      //cdotu
+      template <>
+      inline std::complex<float> _dot<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
+        {
+          std::complex<float> sum(0);
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+          return sum;
+        }
+        else
+        {
+          //process unaligned section of arrays
+          std::complex<float> sum(0);
+          while(((vcl_size_t)x)%16)
+          {
+            if(n<=0)
+              return sum;
+            sum+=x[0]*y[0];
+            y++;
+            x++;
+            n--;
+          }
+
+          __m128 sumReg=_mm_setzero_ps();
+          __m128 reg0,reg1,reg2,reg3,reg4;
+  #ifndef VIENNACL_WITH_SSE3
+          __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
+  #endif
+
+          //add complex floats 4 at a time
+          while(n>=4)
+          {
+            //read floats into XMM registers (8 from each array)
+            reg0=_mm_load_ps((float*)(x+0));
+            reg1=_mm_load_ps((float*)(x+2));
+            reg2=_mm_load_ps((float*)(y+0));
+            reg3=_mm_load_ps((float*)(y+2));
+
+            //multiply complex floats together
+  #ifndef VIENNACL_WITH_SSE3
+            reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
+            reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
+            reg4=_mm_mul_ps(reg4,reg0);
+            reg2=_mm_mul_ps(reg2,reg0);
+            reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
+            reg2=_mm_mul_ps(reg2,nreg);
+            reg0=_mm_add_ps(reg4,reg2);
+            reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
+            reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
+            reg4=_mm_mul_ps(reg4,reg1);
+            reg3=_mm_mul_ps(reg3,reg1);
+            reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
+            reg3=_mm_mul_ps(reg3,nreg);
+            reg1=_mm_add_ps(reg4,reg3);
+  #else
+            reg4=_mm_moveldup_ps(reg2);
+            reg2=_mm_movehdup_ps(reg2);
+            reg4=_mm_mul_ps(reg4,reg0);
+            reg2=_mm_mul_ps(reg2,reg0);
+            reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
+            reg0=_mm_addsub_ps(reg4,reg2);
+            reg4=_mm_moveldup_ps(reg3);
+            reg3=_mm_movehdup_ps(reg3);
+            reg4=_mm_mul_ps(reg4,reg1);
+            reg3=_mm_mul_ps(reg3,reg1);
+            reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
+            reg1=_mm_addsub_ps(reg4,reg3);
+  #endif
+
+            //add to sum
+            sumReg=_mm_add_ps(sumReg,reg0);
+            sumReg=_mm_add_ps(sumReg,reg1);
+
+            x+=4;
+            y+=4;
+            n-=4;
+          }
+
+          //add beyond where the inner loop stopped
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+
+          //move the sums from the xmm registers to aligned memory on the stack
+          std::complex<float> sums[4];
+          std::complex<float>* pSums=(std::complex<float>*)((((vcl_size_t)sums)&(~15))+16);
+          pSums[0]=std::complex<float>(0);
+          pSums[1]=std::complex<float>(0);
+          _mm_store_ps((float*)pSums,sumReg);
+
+          return sum+pSums[0]+pSums[1];
+        }
+      }
+
+      //zdotu
+      template <>
+      inline std::complex<double> _dot<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
+        {
+          std::complex<double> sum(0);
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=x[i]*y[i];
+          return sum;
+        }
+        else
+        {
+          __m128d sumReg=_mm_setzero_pd();
+          __m128d reg0,reg1,reg2,reg3,reg4;
+  #ifndef VIENNACL_WITH_SSE3
+          __m128d nreg=_mm_set_pd(1.0,-1.0);
+  #endif
+
+          //add complex doubles 2 at a time
+          while(n>=2)
+          {
+            //read doubles into XMM registers (4 from each array)
+            reg0=_mm_load_pd((double*)(x+0));
+            reg1=_mm_load_pd((double*)(x+1));
+            reg2=_mm_load_pd((double*)(y+0));
+            reg3=_mm_load_pd((double*)(y+1));
+
+            //multiply complex doubles together
+  #ifndef VIENNACL_WITH_SSE3
+            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
+            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
+            reg4=_mm_mul_pd(reg4,reg0);
+            reg2=_mm_mul_pd(reg2,reg0);
+            reg2=_mm_shuffle_pd(reg2,reg2,0x1);
+            reg2=_mm_mul_pd(reg2,nreg);
+            reg0=_mm_add_pd(reg4,reg2);
+            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
+            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
+            reg4=_mm_mul_pd(reg4,reg1);
+            reg3=_mm_mul_pd(reg3,reg1);
+            reg3=_mm_shuffle_pd(reg3,reg3,0x1);
+            reg3=_mm_mul_pd(reg3,nreg);
+            reg1=_mm_add_pd(reg4,reg3);
+  #else
+            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
+            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
+            reg4=_mm_mul_pd(reg4,reg0);
+            reg2=_mm_mul_pd(reg2,reg0);
+            reg2=_mm_shuffle_pd(reg2,reg2,0x1);
+            reg0=_mm_addsub_pd(reg4,reg2);
+            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
+            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
+            reg4=_mm_mul_pd(reg4,reg1);
+            reg3=_mm_mul_pd(reg3,reg1);
+            reg3=_mm_shuffle_pd(reg3,reg3,0x1);
+            reg1=_mm_addsub_pd(reg4,reg3);
+  #endif
+
+            //add to sum
+            sumReg=_mm_add_pd(sumReg,reg0);
+            sumReg=_mm_add_pd(sumReg,reg1);
+
+            x+=2;
+            y+=2;
+            n-=2;
+          }
+
+          //add beyond where the inner loop stopped
+          std::complex<double> sum(0);
+          if(n)
+            sum=x[0]*y[0];
+
+          //move the sums from the xmm registers to aligned memory on the stack
+          std::complex<double> sums[2];
+          std::complex<double>* pSums=(std::complex<double>*)((((vcl_size_t)sums)&(~15))+16);
+          pSums[0]=std::complex<double>(0);
+          _mm_store_pd((double*)pSums,sumReg);
+
+          return sum+pSums[0];
+        }
+      }
+
+      //cdotc: conjugated dot product sum(conj(x[i])*y[i]) for std::complex<float>,
+      //vectorized with SSE2 intrinsics (SSE3 variant when VIENNACL_WITH_SSE3 is set)
+      template <>
+      inline std::complex<float> _dotc<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        //(unlike _dot, arrays with the SAME misalignment are accepted: the
+        //unaligned prefix is peeled off below until both are 16-byte aligned)
+        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
+        {
+          std::complex<float> sum(0);
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=conj(x[i])*y[i];
+          return sum;
+        }
+        else
+        {
+          //process unaligned section of arrays
+          std::complex<float> sum(0);
+          while(((vcl_size_t)x)%16)
+          {
+            //n is unsigned, so this fires exactly when all elements are consumed
+            if(n<=0)
+              return sum;
+            sum+=conj(x[0])*y[0];
+            y++;
+            x++;
+            n--;
+          }
+
+          //accumulates partial sums with lanes in (imag,real) order; they are
+          //swapped back just before the final store below
+          __m128 sumReg=_mm_setzero_ps();
+          __m128 reg0,reg1,reg2,reg3,reg4;
+  #ifndef VIENNACL_WITH_SSE3
+          //alternating sign mask used to emulate _mm_addsub_ps on plain SSE2
+          __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
+  #endif
+
+          //add complex floats 4 at a time
+          while(n>=4)
+          {
+            //read floats into XMM registers (8 from each array; two complex
+            //numbers per register)
+            reg0=_mm_load_ps((float*)(x+0));
+            reg1=_mm_load_ps((float*)(x+2));
+            reg2=_mm_load_ps((float*)(y+0));
+            reg3=_mm_load_ps((float*)(y+2));
+
+            //multiply conj(x) by y: broadcast real(y) and imag(y), form the
+            //cross products, and combine into (xr*yi-xi*yr, xr*yr+xi*yi),
+            //i.e. the conjugated product with imag and real lanes swapped
+  #ifndef VIENNACL_WITH_SSE3
+            reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
+            reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
+            reg4=_mm_mul_ps(reg4,reg0);
+            reg2=_mm_mul_ps(reg2,reg0);
+            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
+            reg4=_mm_mul_ps(reg4,nreg);
+            reg0=_mm_add_ps(reg4,reg2);
+            reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
+            reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
+            reg4=_mm_mul_ps(reg4,reg1);
+            reg3=_mm_mul_ps(reg3,reg1);
+            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
+            reg4=_mm_mul_ps(reg4,nreg);
+            reg1=_mm_add_ps(reg4,reg3);
+  #else
+            reg4=_mm_moveldup_ps(reg2);
+            reg2=_mm_movehdup_ps(reg2);
+            reg4=_mm_mul_ps(reg4,reg0);
+            reg2=_mm_mul_ps(reg2,reg0);
+            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
+            reg0=_mm_addsub_ps(reg2,reg4);
+            reg4=_mm_moveldup_ps(reg3);
+            reg3=_mm_movehdup_ps(reg3);
+            reg4=_mm_mul_ps(reg4,reg1);
+            reg3=_mm_mul_ps(reg3,reg1);
+            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
+            reg1=_mm_addsub_ps(reg3,reg4);
+  #endif
+
+            //add to sum
+            sumReg=_mm_add_ps(sumReg,reg0);
+            sumReg=_mm_add_ps(sumReg,reg1);
+
+            x+=4;
+            y+=4;
+            n-=4;
+          }
+
+          //add beyond where the inner loop stopped
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=conj(x[i])*y[i];
+
+          //move the sums from the xmm registers to aligned memory on the stack
+          //(pSums is sums rounded up to the next 16-byte boundary, which still
+          //lies inside the 32-byte sums array)
+          std::complex<float> sums[4];
+          std::complex<float>* pSums=(std::complex<float>*)((((vcl_size_t)sums)&(~15))+16);
+          sumReg=_mm_shuffle_ps(sumReg,sumReg,0xB1);//swap real and imag
+          _mm_store_ps((float*)pSums,sumReg);
+
+          return sum+pSums[0]+pSums[1];
+        }
+      }
+
+      //zdotc: conjugated dot product sum(conj(x[i])*y[i]) for std::complex<double>,
+      //vectorized with SSE2 intrinsics (SSE3 variant when VIENNACL_WITH_SSE3 is set)
+      template <>
+      inline std::complex<double> _dotc<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
+      {
+        //if the array is short or if either array is unaligned, perform the non-SSE code
+        if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
+        {
+          std::complex<double> sum(0);
+          for(vcl_size_t i=0;i<n;i++)
+            sum+=conj(x[i])*y[i];
+          return sum;
+        }
+        else
+        {
+          //accumulates partial sums with lanes in (imag,real) order; they are
+          //swapped back just before the final store below
+          __m128d sumReg=_mm_setzero_pd();
+          __m128d reg0,reg1,reg2,reg3,reg4;
+  #ifndef VIENNACL_WITH_SSE3
+          //sign mask (-1 in the low lane, +1 in the high lane) used to
+          //emulate _mm_addsub_pd on plain SSE2
+          __m128d nreg=_mm_set_pd(1.0,-1.0);
+  #endif
+
+          //add complex doubles 2 at a time
+          while(n>=2)
+          {
+            //read doubles into XMM registers (4 from each array); each
+            //register holds one complex number as (real,imag)
+            reg0=_mm_load_pd((double*)(x+0));
+            reg1=_mm_load_pd((double*)(x+1));
+            reg2=_mm_load_pd((double*)(y+0));
+            reg3=_mm_load_pd((double*)(y+1));
+
+            //multiply conj(x) by y: broadcast real(y) and imag(y), form the
+            //cross products, and combine into (xr*yi-xi*yr, xr*yr+xi*yi),
+            //i.e. the conjugated product with imag and real lanes swapped
+  #ifndef VIENNACL_WITH_SSE3
+            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
+            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
+            reg4=_mm_mul_pd(reg4,reg0);
+            reg2=_mm_mul_pd(reg2,reg0);
+            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
+            reg4=_mm_mul_pd(reg4,nreg);
+            reg0=_mm_add_pd(reg4,reg2);
+            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
+            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
+            reg4=_mm_mul_pd(reg4,reg1);
+            reg3=_mm_mul_pd(reg3,reg1);
+            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
+            reg4=_mm_mul_pd(reg4,nreg);
+            reg1=_mm_add_pd(reg4,reg3);
+  #else
+            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
+            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
+            reg4=_mm_mul_pd(reg4,reg0);
+            reg2=_mm_mul_pd(reg2,reg0);
+            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
+            reg0=_mm_addsub_pd(reg2,reg4);
+            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
+            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
+            reg4=_mm_mul_pd(reg4,reg1);
+            reg3=_mm_mul_pd(reg3,reg1);
+            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
+            reg1=_mm_addsub_pd(reg3,reg4);
+
+  #endif
+
+            //add to sum
+            sumReg=_mm_add_pd(sumReg,reg0);
+            sumReg=_mm_add_pd(sumReg,reg1);
+
+            x+=2;
+            y+=2;
+            n-=2;
+          }
+
+          //add beyond where the inner loop stopped (at most one element left)
+          std::complex<double> sum(0);
+          if(n)
+            sum=conj(x[0])*y[0];
+
+          //move the sums from the xmm registers to aligned memory on the stack
+          //(pSums is sums rounded up to the next 16-byte boundary, which still
+          //lies inside the 32-byte sums array)
+          std::complex<double> sums[2];
+          std::complex<double>* pSums=(std::complex<double>*)((((vcl_size_t)sums)&(~15))+16);
+          sumReg=_mm_shuffle_pd(sumReg,sumReg,0x1);//swap real and imag
+          _mm_store_pd((double*)pSums,sumReg);
+
+          return sum+pSums[0];
+        }
+      }
+
+  #endif //defined VIENNACL_WITH_COMPLEX
+
+  #endif //defined VIENNACL_WITH_SSE2
+
+    } //namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/linalg/host_based/sse_kernels.hpp b/viennacl/linalg/host_based/sse_kernels.hpp
new file mode 100644
index 0000000..00d014f
--- /dev/null
+++ b/viennacl/linalg/host_based/sse_kernels.hpp
@@ -0,0 +1,590 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/sse_kernels.hpp
+*   @brief optimized linear algebra operations for the CPU
+*
+*   Contributed by Alex Christensen.
+*/
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+#include <iostream>
+#include <vector>
+
+//for std::min
+#include <algorithm>
+
+#include "viennacl/linalg/host_based/sse_blas.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+      namespace detail
+      {
+
+        // Returns true if the n x n matrix A equals its own conjugate
+        // transpose (hermitian; for real scalars, symmetric), false otherwise.
+        // Only the upper triangle is compared against the lower one.
+        template <typename ScalarType>
+        bool isHermitian(ScalarType ** const A, vcl_size_t n)
+        {
+          for(vcl_size_t row=0;row<n;row++)
+          {
+            for(vcl_size_t col=row;col<n;col++)
+            {
+              if(A[row][col] != conjIfComplex(A[col][row]))
+                return false;
+            }
+          }
+          return true;
+        }
+
+        // returns the bandwidth of a hermitian (or real symmetric) matrix
+        //
+        // Scans sub-diagonals from the outermost inward; the first nonzero
+        // entry found on sub-diagonal i means the bandwidth is 2*i+1 (i
+        // entries below the diagonal, i above, plus the diagonal itself).
+        // Returns 0 for an all-zero (or empty) matrix.
+        template <typename ScalarType>
+        vcl_size_t getHermitianBandwidth(ScalarType ** const A, vcl_size_t n)
+        {
+          // Count i down from n-1 to 0 with an unsigned-safe loop: the
+          // previous "for(i=n-1;i>=0;i--)" never terminated, because i>=0 is
+          // always true for the unsigned vcl_size_t, and i wrapped past zero
+          // into an out-of-bounds access (it also indexed A[n-1+j] for n==0).
+          for(vcl_size_t i=n;i-->0;)
+            for(vcl_size_t j=0;j<n-i;j++)
+              if(A[i+j][j]!=ScalarType(0))
+                return 2*i+1;
+          return 0;
+        }
+
+        // helper for tridiagonalizeBandedMatrix
+        // does a householder similarity transform to eliminate a range of nonzeros in a row of a hermitian matrix
+        //
+        //   A      hermitian matrix (full storage)
+        //   row    index of the row whose entries are eliminated
+        //   from   first column offset (relative to the diagonal) to eliminate
+        //   to     one past the last column offset to eliminate
+        //   width  extent of rows/columns touched by the similarity transform
+        //   ss     caller-provided scratch array, assumed to hold at least
+        //          width entries
+        template <typename ScalarType>
+        void eliminateHermitian(ScalarType ** A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType * ss)
+        {
+          //empty range: nothing to eliminate
+          if(from>=to)
+            return;
+
+          //norm of the entries to eliminate; the reflector is built by
+          //scaling them by it below
+          ScalarType norm=_nrm2(&A[row][row+from],to-from);
+
+          if(norm != ScalarType(0))
+          {
+
+            //pick the better of two reflectors, to 1 or -1
+            //this is weird syntax that also works with std::complex
+            if(std::abs(A[row][row+from]-ScalarType(1))>std::abs(A[row][row+from]+ScalarType(1)))
+              norm=-norm;
+            for(vcl_size_t i=row+from;i<row+to;i++)
+              A[row][i]/=norm;
+            A[row][row+from]+=ScalarType(1);
+
+            //apply the similarity transformation
+
+            //left transformation: reflect each following row against the
+            //householder vector stored in A[row][row+from..row+to)
+            for(vcl_size_t j=row+1;j<row+width;j++)
+            {
+              ScalarType s=_dotc(to-from,&A[row][row+from],&A[j][row+from]);
+              s=-s/A[row][row+from];
+              _axpy(&A[row][row+from],&A[j][row+from],to-from,s);
+            }
+
+            //conjugate householder reflector for right transformation
+            for(vcl_size_t i=row+from;i<row+to;i++)
+              A[row][i]=conjIfComplex(A[row][i]);
+
+            //right transformation (cache aligned: accumulate row-wise _axpy
+            //updates into the scratch vector ss instead of walking columns)
+            for(vcl_size_t i=0;i<width;i++)
+              ss[i]=ScalarType(0);
+            for(vcl_size_t i=from;i<to;i++)
+              _axpy(&A[row+i][row],ss,width,conjIfComplex(A[row][row+i]));
+            for(vcl_size_t i=0;i<width;i++)
+              ss[i]=-ss[i]/A[row][row+from];
+            for(vcl_size_t i=from;i<to;i++)
+              _axpy(ss,&A[row+i][row],width,A[row][row+i]);
+
+            //clean up the householder reflector: restore hermitian symmetry in
+            //the eliminated row from the (now updated) column entries
+            for(vcl_size_t col=row+from;col<row+to;col++)
+              A[row][col]=conjIfComplex(A[col][row]);
+
+          }
+        }
+
+        // reduces a hermitian (or symmetric real) banded matrix to a hermitian (or symmetric real) tridiagonal matrix,
+        // using householder similarity transforms, so eigenvalues are preserved.
+        // bandwidth should be an odd integer, such as 3 for an already tridiagonal matrix
+        // based on http://www.netlib.org/lapack/lawnspdf/lawn208.pdf
+        template<typename ScalarType>
+        void tridiagonalizeHermitianBandedMatrix(ScalarType ** A, vcl_size_t n, vcl_size_t bandwidth)
+        {
+          //already tridiagonal (or diagonal): nothing to do
+          if(bandwidth<=3)
+            return;
+
+          //number of nonzero entries below the diagonal in a column
+          vcl_size_t belowDiagonal=(bandwidth-1)/2;
+          //scratch buffer shared by all eliminateHermitian calls
+          ScalarType *ss=new ScalarType[bandwidth+belowDiagonal];
+
+          //eliminate and chase bulges where the elimination makes a bulge
+          //(each elimination of a column's sub-band creates a "bulge" of
+          //fill-in further down the band, which is chased off the matrix)
+          vcl_size_t k=0;
+          for(;k<n-belowDiagonal;k++)
+          {
+
+              //eliminate below the diagonal
+              eliminateHermitian(A,k,1,1+belowDiagonal,std::min(n-k,2*belowDiagonal+1),ss);
+
+              //chase the bulge
+              for(vcl_size_t bulgeStart=k+1;bulgeStart<n-belowDiagonal;bulgeStart+=belowDiagonal)
+                  for(vcl_size_t i=0;i<belowDiagonal-1;i++)
+                      eliminateHermitian(A,bulgeStart+i,belowDiagonal,std::min(n-bulgeStart-i,belowDiagonal*2-i),std::min(n-bulgeStart-i,bandwidth+belowDiagonal),ss);
+          }
+
+          //eliminate beyond where elimination makes bulges
+          //(the trailing corner of the matrix, where transforms stay local)
+          for(;k<n-2;k++)
+              eliminateHermitian(A,k,1,n-k,n-k,ss);
+
+          delete [] ss;
+        }
+
+        // reduces a hermitian (or symmetric real) matrix to a hermitian (or symmetric real) banded matrix with bandwidth 2*block_size+1
+        // using householder similarity transformations, so eigenvalues are preserved. block_size 1 reduces the matrix to tridiagonal
+        template<typename ScalarType>
+        void reduceHermitianToBandedMatrix(ScalarType ** A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads)
+        {
+          //per-reflector norms for the current block, and a length-n scratch
+          //vector for the cache-aligned right transformations
+          ScalarType* norms=new ScalarType[block_size];
+          ScalarType* ss=new ScalarType[n];
+
+          for (vcl_size_t k=0;k<n-block_size;k+=block_size)
+          {
+            //build block_size householder reflectors, one per column of the block
+            for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
+            {
+
+              //this is the same as the norm of the column, since it's hermitian
+              norms[bi]=_nrm2(&A[k+bi][k+bi+block_size],n-k-bi-block_size);
+
+              if(norms[bi]!=ScalarType(0))
+              {
+
+                //pick the better of two reflectors, to 1 or -1
+                //this is weird syntax that also works with std::complex
+                if(std::abs(A[k+bi][k+bi+block_size]-ScalarType(1))>std::abs(A[k+bi][k+bi+block_size]+ScalarType(1)))
+                    norms[bi]=-norms[bi];
+                for(vcl_size_t i=k+bi+block_size;i<n;i++)
+                    A[k+bi][i]/=norms[bi];
+                A[k+bi][k+bi+block_size]+=ScalarType(1);
+
+                // Apply transformation to remaining rows within the block
+                for(vcl_size_t j=k+bi+1;j<k+block_size;j++)
+                {
+                    ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
+                    s=-s/A[k+bi][k+bi+block_size];
+                    _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
+                }
+              }
+            }
+
+            //apply transformations from the block to the remaining rows and
+            //columns below/right of the block, in parallel
+
+            //left transformations
+  #ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+            for(int j=k+block_size;j<(int)n;j++)
+  #else
+            for(vcl_size_t j=k+block_size;j<n;j++)
+  #endif
+            {
+              for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
+              {
+                if(norms[bi]!=ScalarType(0))
+                {
+                  ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
+                  s=-s/A[k+bi][k+bi+block_size];
+                  _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
+                }
+              }
+            }
+
+            //conjugate householder reflectors for right transformations
+            for(vcl_size_t bi=0;bi<block_size;bi++)
+              for(vcl_size_t i=k+bi+block_size;i<n;i++)
+                A[k+bi][i]=conjIfComplex(A[k+bi][i]);
+
+            //right transformations (cache aligned); each thread owns a
+            //contiguous slice [start,end) of ss/columns so no locking is needed
+  #ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+            for(int section=0;section<(int)num_threads;section++)
+  #else
+            for(vcl_size_t section=0;section<num_threads;section++)
+  #endif
+            {
+              vcl_size_t start=((n-k)*(section+0))/num_threads+k;
+              vcl_size_t end  =((n-k)*(section+1))/num_threads+k;
+              vcl_size_t length=end-start;
+              for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
+              {
+                if(norms[bi]!=ScalarType(0))
+                {
+                  for(vcl_size_t i=start;i<end;i++)
+                    ss[i]=ScalarType(0);
+                  for(vcl_size_t i=k+bi+block_size;i<n;i++)
+                    _axpy(&A[i][start],ss+start,length,conjIfComplex(A[k+bi][i]));
+                  for(vcl_size_t i=start;i<end;i++)
+                    ss[i]=-ss[i]/A[k+bi][k+bi+block_size];
+                  for(vcl_size_t i=k+bi+block_size;i<n;i++)
+                    _axpy(ss+start,&A[i][start],length,A[k+bi][i]);
+                }
+              }
+            }
+
+            //clean up householder reflectors: restore hermitian symmetry in
+            //the block rows from the (now updated) column entries
+            for(vcl_size_t row=k;row<k+block_size;row++)
+              for(vcl_size_t col=row+block_size;col<n;col++)
+                A[row][col]=conjIfComplex(A[col][row]);
+          }
+          delete [] norms;
+          delete [] ss;
+        }
+
+      } //namespace detail
+
+      /** @brief Inplace reduction of a dense n x n row-major or column-major hermitian (or real symmetric) matrix
+      *         to tridiagonal form using householder similarity transforms (preserving eigenvalues)
+      *
+      * @param A            A dense hermitian matrix to be tridiagonalized
+      * @param n            The height and width of the hermitian matrix
+      * @param block_size   The block size to be used
+      * @param num_threads  The number of threads to be used with OpenMP
+      */
+      template<typename ScalarType>
+      void inplace_tred2(ScalarType ** A, vcl_size_t n, vcl_size_t block_size = 1, vcl_size_t num_threads = 1)
+      {
+        if(!detail::isHermitian(A,n))
+          std::cerr << "ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl;
+
+        // If the matrix already has a narrow band, tridiagonalize the banded
+        // form directly instead of touching the whole matrix; otherwise
+        // reduce to banded form first, then tridiagonalize the result.  The
+        // n*4 threshold is an empirical crossover point, not numerically
+        // significant.
+        vcl_size_t band=detail::getHermitianBandwidth(A,n);
+        bool narrow = (band*band*num_threads<n*4) || (2*block_size+1>band);
+        if(!narrow)
+        {
+          detail::reduceHermitianToBandedMatrix(A,n,block_size,num_threads);
+          detail::tridiagonalizeHermitianBandedMatrix(A,n,2*block_size+1);
+        }
+        else
+          detail::tridiagonalizeHermitianBandedMatrix(A,n,band);
+      }
+
+      /** @brief Inplace lu factorization of an m x n dense row-major matrix with optional partial pivoting,
+      *         returning true for an even number of pivots, false for an odd number of pivots.  Factorization
+      *         is successful if there are no zero values on the diagonal.
+      *
+      * @param A            A dense row-major matrix to be factorized
+      * @param m            The height of the matrix
+      * @param n            The width of the matrix
+      * @param piv          The optional pivot vector to store the pivot indices.  If piv is NULL, no partial pivoting will be performed.
+      * @param block_size   The block size to be used
+      */
+      template <typename ScalarType>
+      bool lu_factorize_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t * piv = NULL, vcl_size_t block_size = 8)
+      {
+        // Use a parallel "left-looking", row-operation-based, block Crout/Doolittle algorithm.
+        if(piv)
+          for(vcl_size_t i=0; i<m; i++)
+            piv[i]=i;
+        bool pivsign=true;
+
+        // Outer loop over column blocks.
+        for(vcl_size_t j=0; j<std::min(m,n); j+=block_size)
+        {
+          // Clamp the block to the remaining rows/columns.
+          block_size=std::min(std::min(m-j,n-j),block_size);
+
+          //do Gaussian elimination with partial pivoting in the block
+          //(in the first few columns of the matrix)
+          for(vcl_size_t bi=0;bi<block_size;bi++)
+          {
+            // Find pivot and exchange if necessary.
+            vcl_size_t p=j+bi;
+            if(piv)
+            {
+              for(vcl_size_t i=j+bi+1; i<m; i++)
+                if(std::abs(A[i][j+bi])>std::abs(A[p][j+bi]))
+                  p=i;
+
+              if (p!=j+bi)
+              {
+                // Swap the full rows p and j+bi.
+                for(vcl_size_t k=0; k<n; k++)
+                {
+                  ScalarType t=A[p][k];
+                  A[p][k]=A[j+bi][k];
+                  A[j+bi][k]=t;
+                }
+
+                //swap pivot vector
+                vcl_size_t k = piv[p];
+                piv[p] = piv[j+bi];
+                piv[j+bi] = k;
+                pivsign = !pivsign;
+              }
+            }
+
+            //eliminate below the diagonal in the block
+            ScalarType elimVal=A[j+bi][j+bi];
+            if(elimVal==ScalarType(0))
+            {
+              // Zero pivot: factorization cannot proceed.  Before bailing
+              // out, apply the transformations accumulated so far in this
+              // block to the top of the submatrix, so the matrix is left in a
+              // consistent partially-factorized state.
+              for(vcl_size_t row=j+1;row<j+bi;row++)
+                for(vcl_size_t bi_=0;bi_<row-j;bi_++)
+                  if(A[row][j+bi_]!=ScalarType(0))
+                    _axpy(&(A[j+bi_][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi_]);
+              return pivsign;
+            }
+            for(vcl_size_t row=j+bi+1;row<m;row++)
+            {
+              ScalarType multiplier=A[row][j+bi]/elimVal;
+              for(vcl_size_t col=j+bi;col<j+block_size;col++)
+                A[row][col]-=multiplier*A[j+bi][col];
+              // Store the multiplier in place as the L factor entry.
+              A[row][j+bi]=multiplier;
+            }
+          }
+
+          //at this point, the matrix looks something like this (if block size were 4)
+          //
+          //U U U U * * * *
+          //L U U U * * * *
+          //L L U U * * * *
+          //L L L U * * * *
+          //L L L L * * * *
+          //L L L L * * * *
+          //L L L L * * * *
+          //L L L L * * * *
+
+          //apply previous transformations from the block to the top of the submatrix
+          for(vcl_size_t row=j+1;row<j+block_size;row++)
+            for(vcl_size_t bi=0;bi<row-j;bi++)
+              if(A[row][j+bi]!=ScalarType(0))
+                _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]);
+
+          //at this point, the matrix looks something like this (if block size were 4)
+          //
+          //U U U U U U U U
+          //L U U U U U U U
+          //L L U U U U U U
+          //L L L U U U U U
+          //L L L L * * * *
+          //L L L L * * * *
+          //L L L L * * * *
+          //L L L L * * * *
+
+          //apply previous transformations from the block in parallel to the rest of the submatrix
+          //(fixed: this guard previously tested VIENNACL_OPENMP, which is
+          //never defined -- the rest of this file uses VIENNACL_WITH_OPENMP --
+          //so the parallel path was silently disabled)
+  #ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+          for(int row=(int)(j+block_size);row<(int)m;row++)
+  #else
+          for(vcl_size_t row=j+block_size;row<m;row++)
+  #endif
+          for(vcl_size_t bi=0;bi<block_size;bi++)
+            if(A[row][j+bi]!=ScalarType(0))
+              _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]);
+        }
+        return pivsign;
+      }
+
+      /** @brief Inplace qr factorization of an m x n dense column-major matrix, returning the householder normalization coefficients
+      *
+      * A[j] is the j-th column (A[col][row] indexing).  On return the upper
+      * triangle holds R and the entries below each diagonal hold the
+      * normalized householder reflectors.
+      *
+      * @param A            A dense column-major matrix to be factorized
+      * @param m            The height of the matrix
+      * @param n            The width of the matrix
+      * @param block_size   The block size to be used
+      */
+      template <typename ScalarType>
+      std::vector<ScalarType> inplace_qr_col_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8)
+      {
+        std::vector<ScalarType> betas(std::min(m,n));
+        ScalarType* norms=new ScalarType[block_size];
+
+        for(vcl_size_t k=0; k<std::min(m,n); k+=block_size)
+        {
+          // Clamp the block to the remaining rows/columns.
+          block_size=std::min(std::min(m-k,n-k),block_size);
+
+          for(vcl_size_t bi=0;bi<block_size;bi++)
+          {
+
+            // Compute 2-norm of k+bi-th column below the diagonal
+            norms[bi]=_nrm2(&A[k+bi][k+bi],m-k-bi);
+
+            if(norms[bi]!=ScalarType(0))
+            {
+              //pick the better of two reflectors, to 1 or -1,
+              //this is weird syntax that also works with std::complex
+              if(std::abs(A[k+bi][k+bi]-ScalarType(1))>std::abs(A[k+bi][k+bi]+ScalarType(1)))
+                norms[bi]*=-1;
+              for(vcl_size_t i=k+bi;i<m;i++)
+                A[k+bi][i]/=norms[bi];
+              A[k+bi][k+bi]+=ScalarType(1);
+
+              // Apply transformation to columns within the block
+              for(vcl_size_t j=k+bi+1; j<k+block_size; j++)
+              {
+                ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
+                s = -s/A[k+bi][k+bi];
+                _axpy(&A[k+bi][k+bi],&A[j][k+bi],m-k-bi,s);
+              }
+            }
+            //temporarily store the diagonal value of R in betas
+            betas[k+bi]=-norms[bi];
+          }
+
+          //apply transformations from block to remaining columns to the right of the block in parallel
+          //(fixed: this guard previously tested VIENNACL_OPENMP, which is
+          //never defined -- the rest of this file uses VIENNACL_WITH_OPENMP --
+          //so the parallel path was silently disabled)
+  #ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+          for(int j=(int)(k+block_size); j<(int)n; j++)
+  #else
+          for(vcl_size_t j=k+block_size; j<n; j++)
+  #endif
+          {
+            for(vcl_size_t bi=0;bi<block_size;bi++)
+            {
+              if(norms[bi]!=ScalarType(0))
+              {
+                ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
+                s = -s/A[k+bi][k+bi];
+                _axpy(&A[k+bi][k+bi],A[j]+k+bi,m-k-bi,s);
+              }
+            }
+          }
+        }
+
+        //normalize the householder reflectors and store the betas
+        for(vcl_size_t j=0;j<std::min(m,n);j++)
+        {
+          ScalarType beta=A[j][j];
+          for(vcl_size_t i=j+1;i<m;i++)
+            A[j][i]/=beta;
+          A[j][j]=betas[j];//R diagonal values were stored temporarily in betas
+          betas[j]=beta;
+        }
+
+        delete [] norms;
+        return betas;
+      }
+
+      /** @brief Inplace qr factorization of an m x n dense row-major matrix, returning the householder normalization coefficients
+      *
+      * On return the upper triangle holds R and the entries below each
+      * diagonal hold the normalized householder reflectors.
+      *
+      * @param A            A dense row-major matrix to be factorized
+      * @param m            The height of the matrix
+      * @param n            The width of the matrix
+      * @param block_size   The block size to be used
+      * @param num_threads  Number of threads to be used
+      */
+      template <typename ScalarType>
+      std::vector<ScalarType> inplace_qr_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8, vcl_size_t num_threads = 1)
+      {
+        std::vector<ScalarType> betas(std::min(m,n));
+        ScalarType* norms=new ScalarType[block_size];
+        ScalarType* ss=new ScalarType[n];
+
+        //allocate O(m) memory for temporary column-major storage of the block for blas functions.
+        //Remember how many columns were allocated: block_size is clamped
+        //smaller inside the loop below, and the cleanup loop previously used
+        //the clamped value, leaking the trailing block_cols entries.
+        const vcl_size_t allocated_cols=block_size;
+        ScalarType** block_cols=new ScalarType*[allocated_cols];
+        for(vcl_size_t i=0;i<allocated_cols;i++)
+          block_cols[i]=new ScalarType[m];
+
+        for(vcl_size_t k=0; k<std::min(m,n); k+=block_size)
+        {
+          // Clamp the block to the remaining rows/columns.
+          block_size=std::min(std::min(m-k,n-k),block_size);
+
+          //copy the block to column-major storage for cache alignment (necessary for _nrm2)
+          for(vcl_size_t i=0;i<m-k;i++)
+            for(vcl_size_t bi=0;bi<block_size;bi++)
+              block_cols[bi][i]=A[k+i][k+bi];
+
+          for(vcl_size_t bi=0;bi<block_size;bi++)
+          {
+
+            // Compute 2-norm of k+bi-th column below the diagonal
+            norms[bi]=_nrm2(&block_cols[bi][bi],m-k-bi);
+
+            if(norms[bi]!=ScalarType(0))
+            {
+              //pick the better of two reflectors, to 1 or -1,
+              //this is weird syntax that also works with std::complex
+              if(std::abs(block_cols[bi][bi]-ScalarType(1))>std::abs(block_cols[bi][bi]+ScalarType(1)))
+                norms[bi]*=-1;
+              for(vcl_size_t i=bi;i<m-k;i++)
+                block_cols[bi][i]/=norms[bi];
+              block_cols[bi][bi]+=ScalarType(1);
+
+              // Apply transformation to columns within the block
+              for(vcl_size_t j=bi+1; j<block_size; j++)
+              {
+                ScalarType s=_dotc(m-k-bi,&block_cols[bi][bi],&block_cols[j][bi]);
+                s = -s/block_cols[bi][bi];
+                _axpy(&block_cols[bi][bi],&block_cols[j][bi],m-k-bi,s);
+              }
+            }
+            //temporarily store the diagonal value of R in betas
+            betas[k+bi]=-norms[bi];
+          }
+
+          //copy the block back to row-major storage
+          for(vcl_size_t i=0;i<m-k;i++)
+            for(vcl_size_t bi=0;bi<block_size;bi++)
+              A[k+i][k+bi]=block_cols[bi][i];
+
+          //apply transformations from block to remaining rows to the right of the block in parallel;
+          //each thread owns a contiguous slice [start,end) of columns/ss
+          //(fixed: this guard previously tested VIENNACL_OPENMP, which is
+          //never defined -- the rest of this file uses VIENNACL_WITH_OPENMP --
+          //so the parallel path was silently disabled)
+  #ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+          for(int section=0;section<(int)num_threads;section++)
+  #else
+          for(vcl_size_t section=0;section<num_threads;section++)
+  #endif
+          {
+            vcl_size_t start=((n-k-block_size)*(section+0))/num_threads+k+block_size;
+            vcl_size_t end  =((n-k-block_size)*(section+1))/num_threads+k+block_size;
+            vcl_size_t length=end-start;
+            for(vcl_size_t bi=0;bi<block_size;bi++)
+            {
+              if(norms[bi]!=ScalarType(0))
+              {
+                for(vcl_size_t i=start;i<end;i++)
+                  ss[i]=ScalarType(0);
+                for(vcl_size_t i=k+bi;i<m;i++)
+                  _axpy(&A[i][start],ss+start,length,A[i][k+bi]);
+                for(vcl_size_t i=start;i<end;i++)
+                  ss[i]=-ss[i]/A[k+bi][k+bi];
+                for(vcl_size_t i=k+bi;i<m;i++)
+                  _axpy(ss+start,&A[i][start],length,A[i][k+bi]);
+              }
+            }
+          }
+        }
+
+        //normalize the householder reflectors and store the betas
+        for(vcl_size_t j=0;j<std::min(m,n);j++)
+        {
+          ScalarType beta=A[j][j];
+          for(vcl_size_t i=j+1;i<m;i++)
+            A[i][j]/=beta;
+          A[j][j]=betas[j];//R diagonal values were stored temporarily in betas
+          betas[j]=beta;
+        }
+
+        delete [] norms;
+        for(vcl_size_t i=0;i<allocated_cols;i++)
+          delete [] block_cols[i];
+        delete [] block_cols;
+        delete [] ss;
+
+        return betas;
+      }
+
+    } //namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+#endif
diff --git a/viennacl/linalg/host_based/vector_operations.hpp b/viennacl/linalg/host_based/vector_operations.hpp
new file mode 100644
index 0000000..3ecc6a8
--- /dev/null
+++ b/viennacl/linalg/host_based/vector_operations.hpp
@@ -0,0 +1,621 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/vector_operations.hpp
+    @brief Implementations of vector operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace host_based
+    {
+      namespace detail
+      {
+        template <typename NumericT>
+        NumericT flip_sign(NumericT val) { return -val; }
+        inline unsigned long  flip_sign(unsigned long  val) { return val; }
+        inline unsigned int   flip_sign(unsigned int   val) { return val; }
+        inline unsigned short flip_sign(unsigned short val) { return val; }
+        inline unsigned char  flip_sign(unsigned char  val) { return val; }
+      }
+
+      //
+      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+      //
+
+      /** @brief Implementation of vec1 = alpha * vec2 (or vec2 / alpha), where alpha may be sign-flipped.
+      *
+      * @param vec1              The result vector (or -range, or -slice)
+      * @param vec2              The source vector (or -range, or -slice)
+      * @param alpha             The scaling factor (host or device scalar, converted to T)
+      * @param reciprocal_alpha  If true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha   If true, negate alpha first (identity for unsigned T, cf. detail::flip_sign)
+      */
+      template <typename T, typename ScalarType1>
+      void av(vector_base<T> & vec1,
+              vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef T        value_type;
+
+        value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = detail::flip_sign(data_alpha);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+        // The reciprocal test is hoisted out of the loop so each loop body is branch-free.
+        if (reciprocal_alpha)
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+          // signed loop index: the OpenMP 'parallel for' requires a signed induction variable on older implementations
+          for (long i = 0; i < static_cast<long>(size1); ++i)
+            data_vec1[i*inc1+start1] = data_vec2[i*inc2+start2] / data_alpha;
+        }
+        else
+        {
+#ifdef VIENNACL_WITH_OPENMP
+          #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+          for (long i = 0; i < static_cast<long>(size1); ++i)
+            data_vec1[i*inc1+start1] = data_vec2[i*inc2+start2] * data_alpha;
+        }
+      }
+
+
+      /** @brief Implementation of vec1 = alpha * vec2 + beta * vec3 (with optional reciprocal/sign-flip on each coefficient).
+      *
+      * @param vec1              The result vector (or -range, or -slice)
+      * @param vec2              The first source vector
+      * @param alpha             Coefficient for vec2
+      * @param reciprocal_alpha  If true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha   If true, negate alpha first (identity for unsigned T)
+      * @param vec3              The second source vector
+      * @param beta              Coefficient for vec3
+      * @param reciprocal_beta   If true, divide by beta instead of multiplying
+      * @param flip_sign_beta    If true, negate beta first (identity for unsigned T)
+      */
+      template <typename T, typename ScalarType1, typename ScalarType2>
+      void avbv(vector_base<T> & vec1,
+                vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha,
+                vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t /* len_beta */,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef T        value_type;
+
+        value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+        value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = detail::flip_sign(data_alpha);
+
+        value_type data_beta = beta;
+        if (flip_sign_beta)
+          data_beta = detail::flip_sign(data_beta);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+        vcl_size_t start3 = viennacl::traits::start(vec3);
+        vcl_size_t inc3   = viennacl::traits::stride(vec3);
+
+        // All four multiply/divide combinations are expanded into separate branch-free loops.
+        if (reciprocal_alpha)
+        {
+          if (reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] = data_vec2[i*inc2+start2] / data_alpha + data_vec3[i*inc3+start3] / data_beta;
+          }
+          else
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] = data_vec2[i*inc2+start2] / data_alpha + data_vec3[i*inc3+start3] * data_beta;
+          }
+        }
+        else
+        {
+          if (reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] = data_vec2[i*inc2+start2] * data_alpha + data_vec3[i*inc3+start3] / data_beta;
+          }
+          else
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] = data_vec2[i*inc2+start2] * data_alpha + data_vec3[i*inc3+start3] * data_beta;
+          }
+        }
+      }
+
+
+      /** @brief Implementation of vec1 += alpha * vec2 + beta * vec3 (accumulating variant of avbv()).
+      *
+      * Identical to avbv() except that the result is added to vec1 rather than overwriting it.
+      *
+      * @param vec1              The vector to accumulate into (or -range, or -slice)
+      * @param vec2              The first source vector
+      * @param alpha             Coefficient for vec2
+      * @param reciprocal_alpha  If true, divide by alpha instead of multiplying
+      * @param flip_sign_alpha   If true, negate alpha first (identity for unsigned T)
+      * @param vec3              The second source vector
+      * @param beta              Coefficient for vec3
+      * @param reciprocal_beta   If true, divide by beta instead of multiplying
+      * @param flip_sign_beta    If true, negate beta first (identity for unsigned T)
+      */
+      template <typename T, typename ScalarType1, typename ScalarType2>
+      void avbv_v(vector_base<T> & vec1,
+                  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+                  vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef T        value_type;
+
+        value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+        value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+        value_type data_alpha = alpha;
+        if (flip_sign_alpha)
+          data_alpha = detail::flip_sign(data_alpha);
+
+        value_type data_beta = beta;
+        if (flip_sign_beta)
+          data_beta = detail::flip_sign(data_beta);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+        vcl_size_t start3 = viennacl::traits::start(vec3);
+        vcl_size_t inc3   = viennacl::traits::stride(vec3);
+
+        // Note the '+=' in each loop body: this is the only difference to avbv().
+        if (reciprocal_alpha)
+        {
+          if (reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] += data_vec2[i*inc2+start2] / data_alpha + data_vec3[i*inc3+start3] / data_beta;
+          }
+          else
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] += data_vec2[i*inc2+start2] / data_alpha + data_vec3[i*inc3+start3] * data_beta;
+          }
+        }
+        else
+        {
+          if (reciprocal_beta)
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] += data_vec2[i*inc2+start2] * data_alpha + data_vec3[i*inc3+start3] / data_beta;
+          }
+          else
+          {
+#ifdef VIENNACL_WITH_OPENMP
+            #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+            for (long i = 0; i < static_cast<long>(size1); ++i)
+              data_vec1[i*inc1+start1] += data_vec2[i*inc2+start2] * data_alpha + data_vec3[i*inc3+start3] * data_beta;
+          }
+        }
+      }
+
+
+
+
+      /** @brief Assign a constant value to a vector (-range/-slice)
+      *
+      * @param vec1   The vector to which the value should be assigned
+      * @param alpha  The value to be assigned
+      * @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+      */
+      template <typename T>
+      void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
+      {
+        typedef T        value_type;
+
+        value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+        // With up_to_internal_size the loop also covers the padding region (start1/inc1 still apply below).
+        vcl_size_t loop_bound  = up_to_internal_size ? vec1.internal_size() : size1;  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+        value_type data_alpha = static_cast<value_type>(alpha);
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(loop_bound); ++i)
+          data_vec1[i*inc1+start1] = data_alpha;
+      }
+
+
+      /** @brief Swaps the contents of two vectors, data is copied
+      *
+      * Element-wise exchange; both vectors must have the same logical size (checked by the dispatcher).
+      *
+      * @param vec1   The first vector (or -range, or -slice)
+      * @param vec2   The second vector (or -range, or -slice)
+      */
+      template <typename T>
+      void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
+      {
+        typedef T        value_type;
+
+        value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+        {
+          // classic three-step swap per element; 'temp' is loop-local, so each OpenMP thread has its own copy
+          value_type temp = data_vec2[i*inc2+start2];
+          data_vec2[i*inc2+start2] = data_vec1[i*inc1+start1];
+          data_vec1[i*inc1+start1] = temp;
+        }
+      }
+
+
+      ///////////////////////// Elementwise operations /////////////
+
+      /** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+      *
+      * The concrete operation (multiply, divide, ...) is selected at compile time via the OP tag and
+      * applied per element through viennacl::linalg::detail::op_applier.
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2, v3 and the operation
+      */
+      template <typename T, typename OP>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+      {
+        typedef T                                              value_type;
+        typedef viennacl::linalg::detail::op_applier<op_element_binary<OP> >    OpFunctor;
+
+        value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+        value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+        vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
+
+        vcl_size_t start3 = viennacl::traits::start(proxy.rhs());
+        vcl_size_t inc3   = viennacl::traits::stride(proxy.rhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+          OpFunctor::apply(data_vec1[i*inc1+start1], data_vec2[i*inc2+start2], data_vec3[i*inc3+start3]);
+      }
+
+      /** @brief Implementation of unary element-wise operations v1 = OP(v2), e.g. v1 = abs(v2), v1 = sqrt(v2), ...
+      *
+      * The concrete unary operation is selected at compile time via the OP tag and applied per element
+      * through viennacl::linalg::detail::op_applier.
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2 and the operation
+      */
+      template <typename T, typename OP>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+      {
+        typedef T        value_type;
+        typedef viennacl::linalg::detail::op_applier<op_element_unary<OP> >    OpFunctor;
+
+        value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+        vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+          OpFunctor::apply(data_vec1[i*inc1+start1], data_vec2[i*inc2+start2]);
+      }
+
+
+      ///////////////////////// Norms and inner product ///////////////////
+
+
+      //implementation of inner product:
+      //namespace {
+      /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the gpu)
+      */
+      template <typename T, typename S3>
+      void inner_prod_impl(vector_base<T> const & vec1,
+                           vector_base<T> const & vec2,
+                           S3 & result)
+      {
+        typedef T        value_type;
+
+        value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+        value_type temp = 0;
+
+        // Accumulate locally; OpenMP combines the per-thread partial sums via the reduction clause.
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+          temp += data_vec1[i*inc1+start1] * data_vec2[i*inc2+start2];
+
+        result = temp;  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+      }
+
+      /** @brief Computes the inner products (x, y1), (x, y2), ..., writing one scalar per tuple entry into 'result'.
+      *
+      * x is traversed only once, so the memory bandwidth cost is shared across all right-hand-side vectors.
+      *
+      * @param x         The common left-hand-side vector
+      * @param vec_tuple Tuple of right-hand-side vectors y1, y2, ...
+      * @param result    Vector receiving one inner product per tuple entry
+      */
+      template <typename T>
+      void inner_prod_impl(vector_base<T> const & x,
+                           vector_tuple<T> const & vec_tuple,
+                           vector_base<T> & result)
+      {
+        typedef T        value_type;
+
+        value_type const * data_x = detail::extract_raw_pointer<value_type>(x);
+
+        vcl_size_t start_x = viennacl::traits::start(x);
+        vcl_size_t inc_x   = viennacl::traits::stride(x);
+        vcl_size_t size_x  = viennacl::traits::size(x);
+
+        // Per-tuple-entry accumulators and raw-buffer descriptors, gathered up front so the
+        // main loop below touches no ViennaCL objects. temp is value-initialized to zero.
+        std::vector<value_type> temp(vec_tuple.const_size());
+        std::vector<value_type const *> data_y(vec_tuple.const_size());
+        std::vector<vcl_size_t> start_y(vec_tuple.const_size());
+        std::vector<vcl_size_t> stride_y(vec_tuple.const_size());
+
+        for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j)
+        {
+          data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j));
+          start_y[j] = viennacl::traits::start(vec_tuple.const_at(j));
+          stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j));
+        }
+
+        // Note: No OpenMP here because it cannot perform a reduction on temp-array. Savings in memory bandwidth are expected to still justify this approach...
+        for (vcl_size_t i = 0; i < size_x; ++i)
+        {
+          value_type entry_x = data_x[i*inc_x+start_x];
+          for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+            temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]];
+        }
+
+        for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+          result[j] = temp[j];  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+      }
+
+
+      /** @brief Computes the l^1-norm of a vector
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T, typename S2>
+      void norm_1_impl(vector_base<T> const & vec1,
+                       S2 & result)
+      {
+        typedef T        value_type;
+
+        value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        value_type temp = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+          temp += static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])));  //casting to double in order to avoid problems if T is an integer type
+
+        result = temp;  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+      }
+
+      /** @brief Computes the l^2-norm of a vector - implementation
+      *
+      * Accumulates the sum of squares and takes the square root at the end.
+      * NOTE(review): no rescaling is performed, so very large entries may overflow
+      * (and tiny entries underflow) the intermediate sum of squares.
+      *
+      * @param vec1 The vector
+      * @param result The result scalar
+      */
+      template <typename T, typename S2>
+      void norm_2_impl(vector_base<T> const & vec1,
+                       S2 & result)
+      {
+        typedef T        value_type;
+
+        value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        value_type temp = 0;
+        value_type data = 0;  // scratch value; declared here and made thread-private in the OpenMP clause below
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for reduction(+: temp) private(data) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+        {
+          data = data_vec1[i*inc1+start1];
+          temp += data * data;
+        }
+
+        result = std::sqrt(temp);  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+      }
+
+      /** @brief Computes the supremum-norm of a vector
+      *
+      * @param vec1 The vector
+      * @param result The result scalar (maximum of the absolute values of the entries)
+      */
+      template <typename T, typename S2>
+      void norm_inf_impl(vector_base<T> const & vec1,
+                         S2 & result)
+      {
+        typedef T        value_type;
+
+        value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        value_type temp = 0;
+
+        // Note: No max() reduction in OpenMP yet
+        for (vcl_size_t i = 0; i < size1; ++i)
+          temp = std::max<value_type>(temp, static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))));  //casting to double in order to avoid problems if T is an integer type
+
+        result = temp;  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+      }
+
+      //This function should return a CPU scalar, otherwise statements like
+      // vcl_rhs[index_norm_inf(vcl_rhs)]
+      // are ambiguous
+      /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+      *
+      * @param vec1 The vector
+      * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+      */
+      template <typename T>
+      vcl_size_t index_norm_inf(vector_base<T> const & vec1)
+      {
+        typedef T        value_type;
+
+        value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        value_type temp = 0;
+        value_type data;
+        // NOTE(review): 'index' is seeded with start1 (a buffer offset), but the loop assigns logical
+        // indices i. If all entries are zero and start1 != 0 (vector range/slice), the returned value
+        // is an offset rather than a logical index -- confirm whether 0 was intended here.
+        vcl_size_t index = start1;
+
+        // Note: No suitable reduction in OpenMP yet
+        for (vcl_size_t i = 0; i < size1; ++i)
+        {
+          data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])));  //casting to double in order to avoid problems if T is an integer type
+          if (data > temp)
+          {
+            index = i;
+            temp = data;
+          }
+        }
+
+        return index;
+      }
+
+
+      /** @brief Computes a plane rotation of two vectors.
+      *
+      * Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+      *
+      * @param vec1   The first vector
+      * @param vec2   The second vector
+      * @param alpha  The first transformation coefficient
+      * @param beta   The second transformation coefficient
+      */
+      template <typename T>
+      void plane_rotation(vector_base<T> & vec1,
+                          vector_base<T> & vec2,
+                          T alpha, T beta)
+      {
+        typedef T   value_type;
+
+        value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+        value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+        vcl_size_t start1 = viennacl::traits::start(vec1);
+        vcl_size_t inc1   = viennacl::traits::stride(vec1);
+        vcl_size_t size1  = viennacl::traits::size(vec1);
+
+        vcl_size_t start2 = viennacl::traits::start(vec2);
+        vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+        // Scratch values for the old entries; made thread-private below so each OpenMP
+        // thread updates its own pair.
+        value_type temp1 = 0;
+        value_type temp2 = 0;
+        value_type data_alpha = alpha;
+        value_type data_beta  = beta;
+
+#ifdef VIENNACL_WITH_OPENMP
+        #pragma omp parallel for private(temp1, temp2) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+        for (long i = 0; i < static_cast<long>(size1); ++i)
+        {
+          // Both old values are read before either vector is written, so the update is in-place safe.
+          temp1 = data_vec1[i*inc1+start1];
+          temp2 = data_vec2[i*inc2+start2];
+
+          data_vec1[i*inc1+start1] = data_alpha * temp1 + data_beta * temp2;
+          data_vec2[i*inc2+start2] = data_alpha * temp2 - data_beta * temp1;
+        }
+      }
+
+    } //namespace host_based
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/ichol.hpp b/viennacl/linalg/ichol.hpp
new file mode 100644
index 0000000..c54e60c
--- /dev/null
+++ b/viennacl/linalg/ichol.hpp
@@ -0,0 +1,228 @@
+#ifndef VIENNACL_LINALG_ICHOL_HPP_
+#define VIENNACL_LINALG_ICHOL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ichol.hpp
+  @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern.
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief A tag for incomplete Cholesky factorization with static pattern (ICHOL0)
+    */
+    class ichol0_tag {};
+
+
+    /** @brief Implementation of an incomplete Cholesky (ICHOL0) factorization with static pattern. Optimized version for CSR matrices.
+      *
+      *  Factors A in place: after the call, the (row-major) upper triangular part of A holds the
+      *  transposed Cholesky factor, cf. the solver calls in ichol0_precond::apply().
+      *
+      *  Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24–45
+      *  for one of many descriptions of incomplete Cholesky Factorizations
+      *
+      *  @param A       The input matrix in CSR format
+      *  // param tag     An ichol0_tag in order to dispatch among several other preconditioners.
+      */
+    template<typename ScalarType>
+    void precondition(viennacl::compressed_matrix<ScalarType> & A, ichol0_tag const & /* tag */)
+    {
+      assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") );
+
+      ScalarType         * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(A.handle());
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+      //std::cout << A.size1() << std::endl;
+      for (vcl_size_t i=0; i<A.size1(); ++i)
+      {
+        unsigned int row_i_begin = row_buffer[i];
+        unsigned int row_i_end   = row_buffer[i+1];
+
+        // get a_ii:
+        // NOTE(review): if row i has no diagonal entry, a_ii stays 0 and the divisions below
+        // produce inf/NaN; likewise sqrt() of a non-positive diagonal yields 0 or NaN.
+        // The code assumes a symmetric positive definite matrix with full diagonal -- confirm.
+        ScalarType a_ii = 0;
+        for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+        {
+          if (col_buffer[buf_index_aii] == i)
+          {
+            a_ii = std::sqrt(elements[buf_index_aii]);
+            elements[buf_index_aii] = a_ii;
+            break;
+          }
+        }
+
+        // Now scale column/row i, i.e. A(k, i) /= A(i, i)
+        for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+        {
+          if (col_buffer[buf_index_aii] > i)
+            elements[buf_index_aii] /= a_ii;
+        }
+
+        // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i:
+        for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j)
+        {
+          unsigned int j = col_buffer[buf_index_j];
+          if (j <= i)
+            continue;  // only entries strictly right of the diagonal participate
+
+          ScalarType a_ji = elements[buf_index_j];
+
+          for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k)
+          {
+            unsigned int k = col_buffer[buf_index_k];
+            if (k < j)
+              continue;  // update only the upper-triangular pattern (k >= j)
+
+            ScalarType a_ki = elements[buf_index_k];
+
+            //Now check whether A(k, j) is in nonzero pattern:
+            // (static-pattern ICHOL: fill-in outside the existing sparsity pattern is dropped)
+            unsigned int row_j_begin = row_buffer[j];
+            unsigned int row_j_end   = row_buffer[j+1];
+            for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj)
+            {
+              if (col_buffer[buf_index_kj] == k)
+              {
+                elements[buf_index_kj] -= a_ki * a_ji;
+                break;
+              }
+            }
+          }
+        }
+
+      }
+
+    }
+
+
+    /** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines
+    */
+    template <typename MatrixType>
+    class ichol0_precond
+    {
+        typedef typename MatrixType::value_type      ScalarType;
+
+      public:
+        // Copies 'mat' into a host-resident CSR matrix and factorizes it in place.
+        // NOTE(review): tag_ below stores a *reference* to 'tag' -- the tag object must outlive
+        // this preconditioner (a temporary passed here would dangle); confirm callers comply.
+        ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY))
+        {
+            //initialize preconditioner:
+            //std::cout << "Start CPU precond" << std::endl;
+            init(mat);
+            //std::cout << "End CPU precond" << std::endl;
+        }
+
+        /** @brief Applies the preconditioner in place: solves L L^T x = vec via two triangular solves. */
+        template <typename VectorType>
+        void apply(VectorType & vec) const
+        {
+          unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1());
+          unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2());
+          ScalarType   const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(LLT.handle());
+
+          // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+          viennacl::linalg::host_based::detail::csr_trans_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag());
+          viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag());
+        }
+
+      private:
+        // Copies the system matrix to main memory and runs the in-place ICHOL0 factorization.
+        void init(MatrixType const & mat)
+        {
+          viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+          viennacl::switch_memory_context(LLT, host_ctx);
+
+          viennacl::copy(mat, LLT);
+          viennacl::linalg::precondition(LLT, tag_);
+        }
+
+        ichol0_tag const & tag_;                       // reference to the user-supplied tag (see lifetime note above)
+        viennacl::compressed_matrix<ScalarType> LLT;   // host-resident factorized matrix (L in the upper triangle)
+    };
+
+
+    /** @brief ICHOL0 preconditioner class, can be supplied to solve()-routines.
+      *
+      *  Specialization for compressed_matrix
+      */
+    template <typename ScalarType, unsigned int MAT_ALIGNMENT>
+    class ichol0_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
+    {
+        typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
+
+      public:
+        // NOTE(review): tag_ below stores a *reference* to 'tag' -- the tag object must outlive
+        // this preconditioner (a temporary passed here would dangle); confirm callers comply.
+        ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+        {
+          //initialize preconditioner:
+          //std::cout << "Start GPU precond" << std::endl;
+          init(mat);
+          //std::cout << "End GPU precond" << std::endl;
+        }
+
+        /** @brief Applies the preconditioner in place; moves 'vec' to main memory (and back) if it resides elsewhere, since the factor is host-resident. */
+        void apply(vector<ScalarType> & vec) const
+        {
+          if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY)
+          {
+            viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+            viennacl::context old_ctx = viennacl::traits::context(vec);
+
+            // temporarily migrate the vector to the host, solve, and restore the original context
+            viennacl::switch_memory_context(vec, host_ctx);
+            viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+            viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
+            viennacl::switch_memory_context(vec, old_ctx);
+          }
+          else //apply ILU0 directly:
+          {
+            // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+            viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+            viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
+          }
+        }
+
+      private:
+        // Copies the system matrix to main memory and runs the in-place ICHOL0 factorization.
+        void init(MatrixType const & mat)
+        {
+          viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+          viennacl::switch_memory_context(LLT, host_ctx);
+          LLT = mat;
+
+          viennacl::linalg::precondition(LLT, tag_);
+        }
+
+        ichol0_tag const & tag_;                       // reference to the user-supplied tag (see lifetime note above)
+        viennacl::compressed_matrix<ScalarType> LLT;   // host-resident factorized matrix (L in the upper triangle)
+    };
+
+  }
+}
+
+
+
+
+#endif
+
+
+
diff --git a/viennacl/linalg/ilu.hpp b/viennacl/linalg/ilu.hpp
index 13681b7..f913649 100644
--- a/viennacl/linalg/ilu.hpp
+++ b/viennacl/linalg/ilu.hpp
@@ -2,393 +2,29 @@
 #define VIENNACL_LINALG_ILU_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file ilu.hpp
-    @brief Implementations of incomplete factorization preconditioners
+/** @file viennacl/linalg/ilu.hpp
+    @brief Implementations of incomplete factorization preconditioners. Convenience header file.
 */
 
-#include <vector>
-#include <cmath>
-#include "viennacl/forwards.h"
-#include "viennacl/tools/tools.hpp"
-
-#include <map>
-
-namespace viennacl
-{
-  namespace linalg
-  {
-    
-    /** @brief A tag for incomplete LU factorization with threshold (ILUT)
-    */
-    class ilut_tag
-    {
-      public:
-        /** @brief The constructor.
-        *
-        * @param entries_per_row  Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
-        * @param drop_tolerance   The drop tolerance for ILUT
-        */
-        ilut_tag(unsigned int entries_per_row = 20,
-                 double drop_tolerance = 1e-4) : _entries_per_row(entries_per_row), _drop_tolerance(drop_tolerance) {}; 
-
-        void set_drop_tolerance(double tol)
-        {
-          if (tol > 0)
-            _drop_tolerance = tol;
-        }
-        double get_drop_tolerance() const { return _drop_tolerance; }
-        
-        void set_entries_per_row(unsigned int e)
-        {
-          if (e > 0)
-            _entries_per_row = e;
-        }
-
-        unsigned int get_entries_per_row() const { return _entries_per_row; }
-
-      private:
-        unsigned int _entries_per_row;
-        double _drop_tolerance;
-    };
-    
-    
-    /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
-     * 
-     * Generic implementation using the iterator concept from boost::numeric::ublas. Could not find a better way for sparse matrices...
-     *
-     * @param row_iter   The row iterator
-     * @param k      The final row index
-     */
-    template <typename T>
-    void ilut_inc_row_iterator_to_row_index(T & row_iter, unsigned int k)
-    {
-      while (row_iter.index1() < k)
-        ++row_iter;
-    }
-    
-    /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
-     * 
-     * Specialization for the sparse matrix adapter shipped with ViennaCL
-     *
-     * @param row_iter   The row iterator
-     * @param k      The final row index
-     */
-    template <typename ScalarType>
-    void ilut_inc_row_iterator_to_row_index(viennacl::tools::sparse_matrix_adapter<ScalarType> & row_iter, unsigned int k)
-    {
-      row_iter += k - row_iter.index1();
-    }
-    
-    /** @brief Increments a row iterator (iteration along increasing row indices) up to a certain row index k.
-     * 
-     * Specialization for the const sparse matrix adapter shipped with ViennaCL
-     *
-     * @param row_iter   The row iterator
-     * @param k      The final row index
-     */
-    template <typename ScalarType>
-    void ilut_inc_row_iterator_to_row_index(viennacl::tools::const_sparse_matrix_adapter<ScalarType> & row_iter, unsigned int k)
-    {
-      row_iter += k - row_iter.index1();
-    }
-    
-    
-    /** @brief Implementation of a ILU-preconditioner with threshold
-    *
-    * refer to Algorithm 10.6 by Saad's book (1996 edition)
-    *
-    *  @param input   The input matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns
-    *  @param output  The output matrix. Type requirements: const_iterator1 for iteration along rows, const_iterator2 for iteration along columns and write access via operator()
-    *  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
-    */
-    template<typename MatrixType, typename LUType>
-    void precondition(MatrixType const & input, LUType & output, ilut_tag const & tag)
-    {
-      typedef std::map<unsigned int, double>          SparseVector;
-      typedef typename SparseVector::iterator         SparseVectorIterator;
-      typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
-      typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
-      typedef typename LUType::iterator1              OutputRowIterator;  //iterate along increasing row index
-      typedef typename LUType::iterator2              OutputColIterator;  //iterate along increasing column index
-
-      output.clear();
-      assert(input.size1() == output.size1());
-      assert(input.size2() == output.size2());
-      output.resize(static_cast<unsigned int>(input.size1()), static_cast<unsigned int>(input.size2()), false);
-      SparseVector w;
-      
-      std::map<double, unsigned int> temp_map;
-      
-      for (InputRowIterator row_iter = input.begin1(); row_iter != input.end1(); ++row_iter)
-      {
-    /*    if (i%10 == 0)
-      std::cout << i << std::endl;*/
-        
-        //line 2:
-        w.clear();
-        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-          w[static_cast<unsigned int>(col_iter.index2())] = *col_iter;
-
-        //line 3:
-        OutputRowIterator row_iter_out = output.begin1();
-        for (SparseVectorIterator k = w.begin(); k != w.end();)
-        {
-          unsigned int index_k = k->first;
-          if (index_k >= static_cast<unsigned int>(row_iter.index1()))
-            break;
-          
-          
-          //while (row_iter_out.index1() < index_k)
-          //  ++row_iter_out;
-          //if (row_iter_out.index1() < index_k)
-          //  row_iter_out += index_k - row_iter_out.index1();
-          ilut_inc_row_iterator_to_row_index(row_iter_out, index_k);
-          
-          //line 4:
-          double temp = k->second / output(index_k, index_k);
-          if (output(index_k, index_k) == 0.0)
-          {
-            std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry is zero in row " << index_k << "!" << std::endl;
-          }
-          
-          //line 5: (dropping rule to w_k)
-          if ( fabs(temp) > tag.get_drop_tolerance())
-          {
-            //line 7:
-            for (OutputColIterator j = row_iter_out.begin(); j != row_iter_out.end(); ++j)
-            {
-              if (j.index2() > index_k) //attention: manipulation of w(k->first) would invalidate iterator!
-              {
-                w[j.index2()] -= temp * *j;
-              }
-            }
-            ++k;  //attention: manipulation of w(k->first) would invalidate iterator!
-            w[index_k] = temp;// - temp * A(index_k, index_k);
-          }
-          else
-            ++k;
-        } //for k
-        
-        //Line 10: Apply a dropping rule to w
-        //Step 1: Sort all entries:
-        temp_map.clear();
-        for (SparseVectorIterator k = w.begin(); k != w.end(); )
-        {
-          if (fabs(k->second) < tag.get_drop_tolerance())
-          { 
-            long index = k->first;
-            ++k;
-            w.erase(index);
-          }
-          else
-          {
-            double temp = fabs(k->second);
-            while (temp_map.find(temp) != temp_map.end())
-              temp *= 1.00000001; //make entry slightly larger to maintain uniqueness of the entry
-            temp_map[temp] = k->first;
-            ++k;
-          }
-        }
-
-        //Lines 10-12: write the largest p values to L and U
-        unsigned int written_L = 0;
-        unsigned int written_U = 0;
-        for (typename std::map<double, unsigned int>::reverse_iterator iter = temp_map.rbegin(); iter != temp_map.rend(); ++iter)
-        {
-          if (iter->second > static_cast<unsigned int>(row_iter.index1())) //entry for U
-          {
-            if (written_U < tag.get_entries_per_row())
-            {
-              output(static_cast<unsigned int>(row_iter.index1()), iter->second) = static_cast<typename LUType::value_type>(w[iter->second]);
-              ++written_U;
-            }
-          }
-          else if (iter->second == static_cast<unsigned int>(row_iter.index1()))
-          {
-            output(iter->second, iter->second) = static_cast<typename LUType::value_type>(w[static_cast<unsigned int>(row_iter.index1())]);
-          }
-          else //entry for L
-          {
-            if (written_L < tag.get_entries_per_row())
-            {
-              output(static_cast<unsigned int>(row_iter.index1()), iter->second) = static_cast<typename LUType::value_type>(w[iter->second]);
-              ++written_L;
-            }
-          }
-        }
-      } //for i
-    }
-
-
-    /** @brief Generic inplace solution of a unit lower triangular system
-    *   
-    * @param mat  The system matrix
-    * @param vec  The right hand side vector
-    */
-    template<typename MatrixType, typename VectorType>
-    void ilu_inplace_solve(MatrixType const & mat, VectorType & vec, viennacl::linalg::unit_lower_tag)
-    {
-      typedef typename MatrixType::const_iterator1    InputRowIterator;  //iterate along increasing row index
-      typedef typename MatrixType::const_iterator2    InputColIterator;  //iterate along increasing column index
-      
-      for (InputRowIterator row_iter = mat.begin1(); row_iter != mat.end1(); ++row_iter)
-      {
-        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-        {
-          if (col_iter.index2() < col_iter.index1())
-            vec[col_iter.index1()] -= *col_iter * vec[col_iter.index2()];
-        }
-      }
-    }
-
-    /** @brief Generic inplace solution of a upper triangular system
-    *   
-    * @param mat  The system matrix
-    * @param vec  The right hand side vector
-    */
-    template<typename MatrixType, typename VectorType>
-    void ilu_inplace_solve(MatrixType const & mat, VectorType & vec, viennacl::linalg::upper_tag)
-    {
-      typedef typename MatrixType::const_reverse_iterator1    InputRowIterator;  //iterate along increasing row index
-      typedef typename MatrixType::const_iterator2            InputColIterator;  //iterate along increasing column index
-      typedef typename VectorType::value_type                 ScalarType;
-      
-      ScalarType diagonal_entry = 1.0;
-      
-      for (InputRowIterator row_iter = mat.rbegin1(); row_iter != mat.rend1(); ++row_iter)
-      {
-        for (InputColIterator col_iter = row_iter.begin(); col_iter != row_iter.end(); ++col_iter)
-        {
-          if (col_iter.index2() > col_iter.index1())
-            vec[col_iter.index1()] -= *col_iter * vec[col_iter.index2()];
-          if (col_iter.index2() == col_iter.index1())
-            diagonal_entry = *col_iter;
-        }
-        vec[row_iter.index1()] /= diagonal_entry;
-      }
-    }
-
-    /** @brief Generic LU substitution
-    *   
-    * @param mat  The system matrix
-    * @param vec  The right hand side vector
-    */
-    template<typename MatrixType, typename VectorType>
-    void ilu_lu_substitute(MatrixType const & mat, VectorType & vec)
-    {
-      ilu_inplace_solve(mat, vec, unit_lower_tag());
-      ilu_inplace_solve(mat, vec, upper_tag());
-    }
-
-
-    /** @brief ILUT preconditioner class, can be supplied to solve()-routines
-    */
-    template <typename MatrixType>
-    class ilut_precond
-    {
-      typedef typename MatrixType::value_type      ScalarType;
-      
-      public:
-        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : _tag(tag), LU(mat.size1())
-        {
-          //initialize preconditioner:
-          //std::cout << "Start CPU precond" << std::endl;
-          init(mat);          
-          //std::cout << "End CPU precond" << std::endl;
-        }
-        
-        template <typename VectorType>
-        void apply(VectorType & vec) const
-        {
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::ilu_lu_substitute(LU_const_adapter, vec);
-        }
-        
-      private:
-        void init(MatrixType const & mat)
-        {
-          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::precondition(mat, LU_adapter, _tag);
-        }
-        
-        ilut_tag const & _tag;
-        std::vector< std::map<unsigned int, ScalarType> > LU;
-    };
-
-    
-    /** @brief ILUT preconditioner class, can be supplied to solve()-routines.
-    *
-    *  Specialization for compressed_matrix
-    */
-    template <typename ScalarType, unsigned int MAT_ALIGNMENT>
-    class ilut_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
-    {
-      typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
-      
-      public:
-        ilut_precond(MatrixType const & mat, ilut_tag const & tag) : _tag(tag), LU(mat.size1())
-        {
-          //initialize preconditioner:
-          //std::cout << "Start GPU precond" << std::endl;
-          init(mat);          
-          //std::cout << "End GPU precond" << std::endl;
-        }
-        
-        void apply(vector<ScalarType> & vec) const
-        {
-          copy(vec, temp_vec);
-          //lu_substitute(LU, vec);
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::ilu_lu_substitute(LU_const_adapter, temp_vec);
-          
-          copy(temp_vec, vec);
-        }
-        
-      private:
-        void init(MatrixType const & mat)
-        {
-          std::vector< std::map<unsigned int, ScalarType> > temp(mat.size1());
-          //std::vector< std::map<unsigned int, ScalarType> > LU_cpu(mat.size1());
-
-          //copy to cpu:
-          copy(mat, temp);
-          
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp, temp.size(), temp.size());
-          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
-          viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
-          
-          temp_vec.resize(mat.size1());
-          
-          //copy resulting preconditioner back to gpu:
-          //copy(LU_cpu, LU);
-        }
-        
-        ilut_tag const & _tag;
-        //MatrixType LU;
-        std::vector< std::map<unsigned int, ScalarType> > LU;
-        mutable std::vector<ScalarType> temp_vec;
-    };
-
-  }
-}
-
-
-
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
 
 #endif
 
diff --git a/viennacl/linalg/inner_prod.hpp b/viennacl/linalg/inner_prod.hpp
index 69b374d..ed810db 100644
--- a/viennacl/linalg/inner_prod.hpp
+++ b/viennacl/linalg/inner_prod.hpp
@@ -2,22 +2,23 @@
 #define VIENNACL_LINALG_INNER_PROD_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file inner_prod.hpp
+/** @file viennacl/linalg/inner_prod.hpp
     @brief Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.
 */
 
@@ -25,131 +26,148 @@
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/meta/enable_if.hpp"
 #include "viennacl/meta/tag_of.hpp"
+#include "viennacl/meta/result_of.hpp"
 
 namespace viennacl
 {
   //
   // generic inner_prod function
   //   uses tag dispatch to identify which algorithm
-  //   should be called 
+  //   should be called
   //
-  namespace linalg 
+  namespace linalg
   {
-    
-    #ifdef VIENNACL_HAVE_EIGEN
+
+    #ifdef VIENNACL_WITH_EIGEN
     // ----------------------------------------------------
     // EIGEN
     //
-      #if defined(_MSC_VER) && _MSC_VER < 1500        //Visual Studio 2005 needs special treatment
-      float
-      inner_prod(Eigen::VectorXf const & v1,
-                 Eigen::VectorXf const & v2)
-      {
-        return v1 * v2;
-      }
-      
-      double
-      inner_prod(Eigen::VectorXd const & v1,
-                 Eigen::VectorXd const & v2)
-      {
-        return v1 * v2;
-      }
-      
-      #else    
-      template< typename VectorT1, typename VectorT2 >
-      typename VectorT1::RealScalar
-      inner_prod(VectorT1 const& v1, VectorT2 const& v2, 
-          typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT1 >::type >::value
-                                              >::type* dummy = 0)
-      {
-        //std::cout << "eigen .. " << std::endl;
-        return v1.dot(v2);
-      }
-      #endif
+    template< typename VectorT1, typename VectorT2 >
+    typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                                  typename VectorT1::RealScalar>::type
+    inner_prod(VectorT1 const& v1, VectorT2 const& v2)
+    {
+      //std::cout << "eigen .. " << std::endl;
+      return v1.dot(v2);
+    }
     #endif
-    
-    #ifdef VIENNACL_HAVE_MTL4
+
+    #ifdef VIENNACL_WITH_MTL4
     // ----------------------------------------------------
     // MTL4
     //
-      #if defined(_MSC_VER) && _MSC_VER < 1500        //Visual Studio 2005 needs special treatment
-      template <typename ScalarType>
-      ScalarType inner_prod(mtl::dense_vector<ScalarType> const & v1,
-                            mtl::dense_vector<ScalarType> const & v2)
-      {
-        return mtl::dot(v1, v2);
-      }
-      #else    
-      template< typename VectorT1, typename VectorT2 >
-      typename VectorT1::value_type
-      inner_prod(VectorT1 const& v1, VectorT2 const& v2, 
-          typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT1 >::type >::value
-                                              >::type* dummy = 0)
-      {
-        //std::cout << "mtl4 .. " << std::endl;
-        return mtl::dot(v1, v2);
-      }
-      #endif
+    template< typename VectorT1, typename VectorT2 >
+    typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                                  typename VectorT1::value_type>::type
+    inner_prod(VectorT1 const& v1, VectorT2 const& v2)
+    {
+      //std::cout << "mtl4 .. " << std::endl;
+      return mtl::dot(v1, v2);
+    }
     #endif
-    
-    #ifdef VIENNACL_HAVE_UBLAS
+
+    #ifdef VIENNACL_WITH_UBLAS
     // ----------------------------------------------------
     // UBLAS
     //
-      #if defined(_MSC_VER) && _MSC_VER < 1500        //Visual Studio 2005 needs special treatment
-      template< typename ScalarType >
-      ScalarType
-      inner_prod(boost::numeric::ublas::vector<ScalarType> const & v1,
-                 boost::numeric::ublas::vector<ScalarType> const & v2)
-      {
-        // std::cout << "ublas .. " << std::endl;
-        return boost::numeric::ublas::inner_prod(v1, v2);
-      }
-      #else    
-      template< typename VectorT1, typename VectorT2 >
-      typename VectorT1::value_type
-      inner_prod(VectorT1 const& v1, VectorT2 const& v2, 
-          typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT1 >::type >::value
-                                              >::type* dummy = 0)
-      {
-        //std::cout << "ublas .. " << std::endl;
-        return boost::numeric::ublas::inner_prod(v1, v2);
-      }
-      #endif
+    template< typename VectorT1, typename VectorT2 >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                                  typename VectorT1::value_type>::type
+    inner_prod(VectorT1 const& v1, VectorT2 const& v2)
+    {
+      //std::cout << "ublas .. " << std::endl;
+      return boost::numeric::ublas::inner_prod(v1, v2);
+    }
     #endif
 
     // ----------------------------------------------------
     // STL
     //
     template< typename VectorT1, typename VectorT2 >
-    typename VectorT1::value_type
-    inner_prod(VectorT1 const& v1, VectorT2 const& v2, 
-         typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value
-                                     >::type* dummy = 0)
+    typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                                  typename VectorT1::value_type>::type
+    inner_prod(VectorT1 const& v1, VectorT2 const& v2)
     {
-      assert(v1.size() == v2.size());
+      assert(v1.size() == v2.size() && bool("Vector sizes mismatch"));
       //std::cout << "stl .. " << std::endl;
       typename VectorT1::value_type result = 0;
       for (typename VectorT1::size_type i=0; i<v1.size(); ++i)
         result += v1[i] * v2[i];
-      
+
       return result;
     }
 
     // ----------------------------------------------------
     // VIENNACL
     //
-    template< typename ScalarType, unsigned int alignment1, unsigned int alignment2 >
-    viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment1>, 
-                                 const viennacl::vector<ScalarType, alignment2>,
+    template< typename NumericT>
+    viennacl::scalar_expression< const vector_base<NumericT>, const vector_base<NumericT>, viennacl::op_inner_prod >
+    inner_prod(vector_base<NumericT> const & vector1,
+               vector_base<NumericT> const & vector2)
+    {
+      //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const vector_base<NumericT>,
+                                          const vector_base<NumericT>,
+                                          viennacl::op_inner_prod >(vector1, vector2);
+    }
+
+
+    // expression on lhs:
+    template< typename LHS, typename RHS, typename OP, typename NumericT>
+    viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                                 const vector_base<NumericT>,
                                  viennacl::op_inner_prod >
-    inner_prod(viennacl::vector<ScalarType, alignment1> const & vector1, viennacl::vector<ScalarType, alignment2> const & vector2, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< viennacl::vector<ScalarType, alignment1> >::type >::value
-                                            >::type* dummy = 0)
+    inner_prod(viennacl::vector_expression<LHS, RHS, OP> const & vector1,
+               vector_base<NumericT> const & vector2)
     {
       //std::cout << "viennacl .. " << std::endl;
-      return viennacl::linalg::inner_prod_impl(vector1, vector2);
+      return viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                                          const vector_base<NumericT>,
+                                          viennacl::op_inner_prod >(vector1, vector2);
     }
+
+    // expression on rhs:
+    template <typename NumericT, typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression< const vector_base<NumericT>,
+                                 const viennacl::vector_expression<LHS, RHS, OP>,
+                                 viennacl::op_inner_prod >
+    inner_prod(vector_base<NumericT> const & vector1,
+               viennacl::vector_expression<LHS, RHS, OP> const & vector2)
+    {
+      //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const vector_base<NumericT>,
+                                          const viennacl::vector_expression<LHS, RHS, OP>,
+                                          viennacl::op_inner_prod >(vector1, vector2);
+    }
+
+    // expression on lhs and rhs:
+    template <typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2>
+    viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                                 const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                                 viennacl::op_inner_prod >
+    inner_prod(viennacl::vector_expression<LHS1, RHS1, OP1> const & vector1,
+               viennacl::vector_expression<LHS2, RHS2, OP2> const & vector2)
+    {
+      //std::cout << "viennacl .. " << std::endl;
+      return viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                                          const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                                          viennacl::op_inner_prod >(vector1, vector2);
+    }
+
+
+    // Multiple inner products:
+    template< typename NumericT>
+    viennacl::vector_expression< const vector_base<NumericT>, const vector_tuple<NumericT>, viennacl::op_inner_prod >
+    inner_prod(vector_base<NumericT> const & x,
+               vector_tuple<NumericT> const & y_tuple)
+    {
+      return viennacl::vector_expression< const vector_base<NumericT>,
+                                          const vector_tuple<NumericT>,
+                                          viennacl::op_inner_prod >(x, y_tuple);
+    }
+
+
   } // end namespace linalg
 } // end namespace viennacl
 #endif
diff --git a/viennacl/linalg/jacobi_precond.hpp b/viennacl/linalg/jacobi_precond.hpp
index 4ceac0a..bc268d9 100644
--- a/viennacl/linalg/jacobi_precond.hpp
+++ b/viennacl/linalg/jacobi_precond.hpp
@@ -1,23 +1,24 @@
-#ifndef VIENNACL_JACOBI_PRECOND_HPP_
-#define VIENNACL_JACOBI_PRECOND_HPP_
+#ifndef VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+#define VIENNACL_LINALG_JACOBI_PRECOND_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file jacobi_precond.hpp
+/** @file viennacl/linalg/jacobi_precond.hpp
     @brief Implementation of a simple Jacobi preconditioner
 */
 
@@ -27,6 +28,8 @@
 #include "viennacl/vector.hpp"
 #include "viennacl/compressed_matrix.hpp"
 #include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/row_scaling.hpp"
 
 #include <map>
 
@@ -34,27 +37,32 @@ namespace viennacl
 {
   namespace linalg
   {
-    
+
     /** @brief A tag for a jacobi preconditioner
     */
     class jacobi_tag {};
-    
 
-    /** @brief Jacobi preconditioner class, can be supplied to solve()-routines
+
+    /** @brief Jacobi preconditioner class, can be supplied to solve()-routines. Generic version for non-ViennaCL matrices.
     */
-    template <typename MatrixType>
+    template <typename MatrixType,
+              bool is_viennacl = detail::row_scaling_for_viennacl<MatrixType>::value >
     class jacobi_precond
     {
       typedef typename MatrixType::value_type      ScalarType;
-      
+
       public:
-        jacobi_precond(MatrixType const & mat, jacobi_tag const & tag) : system_matrix(mat)
+        jacobi_precond(MatrixType const & mat, jacobi_tag const &) : diag_A(viennacl::traits::size1(mat))
         {
-          assert(mat.size1() == mat.size2());
-          diag_A_inv.resize(mat.size1());  //resize without preserving values
-          
-          for (typename MatrixType::const_iterator1 row_it = system_matrix.begin1();
-                row_it != system_matrix.end1();
+          init(mat);
+        }
+
+        void init(MatrixType const & mat)
+        {
+          diag_A.resize(viennacl::traits::size1(mat));  //resize without preserving values
+
+          for (typename MatrixType::const_iterator1 row_it = mat.begin1();
+                row_it != mat.end1();
                 ++row_it)
           {
             bool diag_found = false;
@@ -64,7 +72,7 @@ namespace viennacl
             {
               if (col_it.index1() == col_it.index2())
               {
-                diag_A_inv[col_it.index1()] = static_cast<ScalarType>(1.0) / *col_it;
+                diag_A[col_it.index1()] = *col_it;
                 diag_found = true;
               }
             }
@@ -72,103 +80,53 @@ namespace viennacl
               throw "ViennaCL: Zero in diagonal encountered while setting up Jacobi preconditioner!";
           }
         }
-        
-        
+
+
         /** @brief Apply to res = b - Ax, i.e. jacobi applied vec (right hand side),  */
         template <typename VectorType>
         void apply(VectorType & vec) const
         {
-          assert(vec.size() == diag_A_inv.size());
-          for (size_t i=0; i<vec.size(); ++i)
-          {
-            vec[i] *= diag_A_inv[i];
-          }
+          assert(viennacl::traits::size(diag_A) == viennacl::traits::size(vec) && bool("Size mismatch"));
+          for (vcl_size_t i=0; i<diag_A.size(); ++i)
+            vec[i] /= diag_A[i];
         }
-        
+
       private:
-        MatrixType const & system_matrix;
-        std::vector<ScalarType> diag_A_inv;
+        std::vector<ScalarType> diag_A;
     };
 
-    
+
     /** @brief Jacobi preconditioner class, can be supplied to solve()-routines.
     *
     *  Specialization for compressed_matrix
     */
-    template <typename ScalarType, unsigned int MAT_ALIGNMENT>
-    class jacobi_precond< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
+    template <typename MatrixType>
+    class jacobi_precond< MatrixType, true>
     {
-      typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
-      
+        typedef typename viennacl::result_of::cpu_value_type<typename MatrixType::value_type>::type  ScalarType;
+
       public:
-        jacobi_precond(MatrixType const & mat, jacobi_tag const & tag) : system_matrix(mat), diag_A_inv(mat.size1())
+        jacobi_precond(MatrixType const & mat, jacobi_tag const &) : diag_A(mat.size1(), viennacl::traits::context(mat))
         {
-          assert(system_matrix.size1() == system_matrix.size2());
-
-          init_gpu();
+          init(mat);
         }
-          
-        /*void init_cpu()
-        {
-          
-          std::vector< std::map<unsigned int, ScalarType> > cpu_check;
-          std::vector<ScalarType> diag_A_inv_cpu(system_matrix.size1());
-          
-          copy(system_matrix, cpu_check);
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> cpu_check_adapter(cpu_check);
-          
-          for (typename viennacl::tools::const_sparse_matrix_adapter<ScalarType>::const_iterator1 row_it = cpu_check_adapter.begin1();
-                row_it != cpu_check_adapter.end1();
-                ++row_it)
-          {
-            bool diag_found = false;
-            for (typename viennacl::tools::const_sparse_matrix_adapter<ScalarType>::const_iterator2 col_it = row_it.begin();
-                  col_it != row_it.end();
-                  ++col_it)
-            {
-              if (col_it.index1() == col_it.index2())
-              {
-                diag_found = true;
-                diag_A_inv_cpu[col_it.index1()] = static_cast<ScalarType>(1.0) / *col_it;
-              }
-            }
-            if (!diag_found)
-              throw "ViennaCL: Zero in diagonal encountered while setting up Jacobi preconditioner!";
-          }
-          
-          diag_A_inv.resize(system_matrix.size1(), false);
-          viennacl::fast_copy(diag_A_inv_cpu, diag_A_inv);
-        }*/
-        
-        void init_gpu()
-        {
-          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(
-                                              viennacl::linalg::kernels::compressed_matrix<ScalarType, MAT_ALIGNMENT>::program_name(),
-                                              "jacobi_precond");
 
-          viennacl::ocl::enqueue( k(system_matrix.handle1(), system_matrix.handle2(), system_matrix.handle(), 
-                                    diag_A_inv, static_cast<cl_uint>(diag_A_inv.size())) );        
+
+        void init(MatrixType const & mat)
+        {
+          detail::row_info(mat, diag_A, detail::SPARSE_ROW_DIAGONAL);
         }
-        
-        
+
+
         template <unsigned int ALIGNMENT>
         void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
         {
-          assert(viennacl::traits::size1(system_matrix) == viennacl::traits::size(vec));
-          
-          //run kernel:
-          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
-                                                                "diag_precond");
-
-          viennacl::ocl::enqueue(
-             k(viennacl::traits::handle(diag_A_inv), cl_uint(viennacl::traits::start(diag_A_inv)), cl_uint(viennacl::traits::size(diag_A_inv)),
-               viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)) )
-                                );        
+          assert(viennacl::traits::size(diag_A) == viennacl::traits::size(vec) && bool("Size mismatch"));
+          vec = element_div(vec, diag_A);
         }
-        
+
       private:
-        MatrixType const & system_matrix;
-        viennacl::vector<ScalarType> diag_A_inv;
+        viennacl::vector<ScalarType> diag_A;
     };
 
   }
diff --git a/viennacl/linalg/lanczos.hpp b/viennacl/linalg/lanczos.hpp
new file mode 100644
index 0000000..2785435
--- /dev/null
+++ b/viennacl/linalg/lanczos.hpp
@@ -0,0 +1,490 @@
+#ifndef VIENNACL_LINALG_LANCZOS_HPP_
+#define VIENNACL_LINALG_LANCZOS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lanczos.hpp
+*   @brief Generic interface for the Lanczos algorithm.
+*
+*   Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/linalg/bisect.hpp"
+#include <boost/random.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/matrix_expression.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/vector_expression.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief A tag for the lanczos algorithm.
+    */
+    class lanczos_tag
+    {
+      public:
+
+        enum
+        {
+          partial_reorthogonalization = 0,
+          full_reorthogonalization,
+          no_reorthogonalization
+        };
+
+        /** @brief The constructor
+        *
+        * @param factor                 Exponent of epsilon - tolerance for batches of Reorthogonalization
+        * @param numeig                 Number of eigenvalues to be returned
+        * @param met                    Method for Lanczos-Algorithm: 0 for partial Reorthogonalization, 1 for full Reorthogonalization and 2 for Lanczos without Reorthogonalization
+        * @param krylov                 Maximum krylov-space size
+        */
+
+        lanczos_tag(double factor = 0.75,
+                    vcl_size_t numeig = 10,
+                    int met = 0,
+                    vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {}
+
+        /** @brief Sets the number of eigenvalues */
+        void num_eigenvalues(int numeig){ num_eigenvalues_ = numeig; }
+
+          /** @brief Returns the number of eigenvalues */
+        vcl_size_t num_eigenvalues() const { return num_eigenvalues_; }
+
+          /** @brief Sets the exponent of epsilon */
+        void factor(double fct) { factor_ = fct; }
+
+        /** @brief Returns the exponent */
+        double factor() const { return factor_; }
+
+        /** @brief Sets the size of the krylov space */
+        void krylov_size(int max) { krylov_size_ = max; }
+
+        /** @brief Returns the size of the krylov space */
+        vcl_size_t  krylov_size() const { return krylov_size_; }
+
+        /** @brief Sets the reorthogonalization method */
+        void method(int met){ method_ = met; }
+
+        /** @brief Returns the reorthogonalization method */
+        int method() const { return method_; }
+
+
+      private:
+        double factor_;
+        vcl_size_t num_eigenvalues_;
+        int method_; // see enum defined above for possible values
+        vcl_size_t krylov_size_;
+
+    };
+
+
+    namespace detail
+    {
+      /**
+      *   @brief Implementation of the Lanczos PRO algorithm
+      *
+      *   @param A            The system matrix
+      *   @param r            Random start vector
+      *   @param size         Size of krylov-space
+      *   @param tag          Lanczos_tag with several options for the algorithm
+      *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
+      */
+
+      template< typename MatrixT, typename VectorT >
+      std::vector<
+              typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+              >
+      lanczosPRO (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag const & tag)
+      {
+
+        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+
+        // generation of some random numbers, used for lanczos PRO algorithm
+        boost::mt11213b mt;
+        boost::normal_distribution<CPU_ScalarType> N(0, 1);
+        boost::bernoulli_distribution<CPU_ScalarType> B(0.5);
+        boost::triangle_distribution<CPU_ScalarType> T(-1, 0, 1);
+
+        boost::variate_generator<boost::mt11213b&, boost::normal_distribution<CPU_ScalarType> >     get_N(mt, N);
+        boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<CPU_ScalarType> >  get_B(mt, B);
+        boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<CPU_ScalarType> >   get_T(mt, T);
+
+
+        long i, j, k, index, retry, reorths;
+        std::vector<long> l_bound(size/2), u_bound(size/2);
+        bool second_step;
+        CPU_ScalarType squ_eps, eta, temp, eps, retry_th;
+        vcl_size_t n = r.size();
+        std::vector< std::vector<CPU_ScalarType> > w(2, std::vector<CPU_ScalarType>(size));
+        CPU_ScalarType cpu_beta;
+
+        boost::numeric::ublas::vector<CPU_ScalarType> s(n);
+
+        VectorT t(n);
+        CPU_ScalarType inner_rt;
+        ScalarType vcl_beta;
+        ScalarType vcl_alpha;
+        std::vector<CPU_ScalarType> alphas, betas;
+        boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
+
+        second_step = false;
+        eps = std::numeric_limits<CPU_ScalarType>::epsilon();
+        squ_eps = std::sqrt(eps);
+        retry_th = 1e-2;
+        eta = std::exp(std::log(eps) * tag.factor());
+        reorths = 0;
+        retry = 0;
+
+        vcl_beta = viennacl::linalg::norm_2(r);
+
+        r /= vcl_beta;
+
+        detail::copy_vec_to_vec(r,s);
+        boost::numeric::ublas::column(Q, 0) = s;
+
+        VectorT u = viennacl::linalg::prod(A, r);
+        vcl_alpha = viennacl::linalg::inner_prod(u, r);
+        alphas.push_back(vcl_alpha);
+        w[0][0] = 1;
+        betas.push_back(vcl_beta);
+
+        long batches = 0;
+        for(i = 1;i < static_cast<long>(size); i++)
+        {
+          r = u - vcl_alpha * r;
+          vcl_beta = viennacl::linalg::norm_2(r);
+
+          betas.push_back(vcl_beta);
+          r = r / vcl_beta;
+
+          index = i % 2;
+          w[index][i] = 1;
+          k = (i + 1) % 2;
+          w[index][0] = (betas[1] * w[k][1] + (alphas[0] - vcl_alpha) * w[k][0] - betas[i - 1] * w[index][0]) / vcl_beta + eps * 0.3 * get_N() * (betas[1] + vcl_beta);
+
+          for(j = 1;j < i - 1;j++)
+          {
+                  w[index][j] = (betas[j + 1] * w[k][j + 1] + (alphas[j] - vcl_alpha) * w[k][j] + betas[j] * w[k][j - 1] - betas[i - 1] * w[index][j]) / vcl_beta + eps * 0.3 * get_N() * (betas[j + 1] + vcl_beta);
+          }
+          w[index][i - 1] = 0.6 * eps * n * get_N() * betas[1] / vcl_beta;
+
+          if(second_step)
+          {
+            for(j = 0;j < batches;j++)
+            {
+              l_bound[j]++;
+              u_bound[j]--;
+
+              for(k = l_bound[j];k < u_bound[j];k++)
+              {
+                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
+                inner_rt = viennacl::linalg::inner_prod(r,t);
+                r = r - inner_rt * t;
+                w[index][k] = 1.5 * eps * get_N();
+                reorths++;
+              }
+            }
+            temp = viennacl::linalg::norm_2(r);
+            r = r / temp;
+            vcl_beta = vcl_beta * temp;
+            second_step = false;
+          }
+          batches = 0;
+
+          for(j = 0;j < i;j++)
+          {
+            if(std::fabs(w[index][j]) >= squ_eps)
+            {
+              detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, j), t);
+              inner_rt = viennacl::linalg::inner_prod(r,t);
+              r = r - inner_rt * t;
+              w[index][j] = 1.5 * eps * get_N();
+              k = j - 1;
+              reorths++;
+              while(k >= 0 && std::fabs(w[index][k]) > eta)
+              {
+                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
+                inner_rt = viennacl::linalg::inner_prod(r,t);
+                r = r - inner_rt * t;
+                w[index][k] = 1.5 * eps * get_N();
+                k--;
+                reorths++;
+              }
+              l_bound[batches] = k + 1;
+              k = j + 1;
+
+              while(k < i && std::fabs(w[index][k]) > eta)
+              {
+                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
+                inner_rt = viennacl::linalg::inner_prod(r,t);
+                r = r - inner_rt * t;
+                w[index][k] = 1.5 * eps * get_N();
+                k++;
+                reorths++;
+              }
+              u_bound[batches] = k - 1;
+              batches++;
+              j = k;
+            }
+          }
+
+          if(batches > 0)
+          {
+            temp = viennacl::linalg::norm_2(r);
+            r = r / temp;
+            vcl_beta = vcl_beta * temp;
+            second_step = true;
+
+            while(temp < retry_th)
+            {
+              for(j = 0;j < i;j++)
+              {
+                detail::copy_vec_to_vec(boost::numeric::ublas::column(Q, k), t);
+                inner_rt = viennacl::linalg::inner_prod(r,t);
+                r = r - inner_rt * t;
+                reorths++;
+              }
+              retry++;
+              temp = viennacl::linalg::norm_2(r);
+              r = r / temp;
+              vcl_beta = vcl_beta * temp;
+            }
+          }
+
+          detail::copy_vec_to_vec(r,s);
+          boost::numeric::ublas::column(Q, i) = s;
+
+          cpu_beta = vcl_beta;
+          s = - cpu_beta * boost::numeric::ublas::column(Q, i - 1);
+          detail::copy_vec_to_vec(s, u);
+          u += viennacl::linalg::prod(A, r);
+          vcl_alpha = viennacl::linalg::inner_prod(u, r);
+          alphas.push_back(vcl_alpha);
+        }
+
+        return bisect(alphas, betas);
+
+      }
+
+
+      /**
+      *   @brief Implementation of the lanczos algorithm without reorthogonalization
+      *
+      *   @param A            The system matrix
+      *   @param r            Random start vector
+      *   @param size         Size of krylov-space
+      *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
+      */
+      template< typename MatrixT, typename VectorT >
+      std::vector<
+              typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+              >
+      lanczos (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag)
+      {
+
+        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+        ScalarType vcl_beta;
+        ScalarType vcl_alpha;
+        std::vector<CPU_ScalarType> alphas, betas;
+        CPU_ScalarType norm;
+        vcl_size_t n = r.size();
+        VectorT u(n), t(n);
+        boost::numeric::ublas::vector<CPU_ScalarType> s(r.size()), u_zero(n), q(n);
+        boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
+
+        u_zero = boost::numeric::ublas::zero_vector<CPU_ScalarType>(n);
+        detail::copy_vec_to_vec(u_zero, u);
+        norm = norm_2(r);
+
+        for(vcl_size_t i = 0;i < size; i++)
+        {
+          r /= norm;
+          vcl_beta = norm;
+
+          detail::copy_vec_to_vec(r,s);
+          boost::numeric::ublas::column(Q, i) = s;
+
+          u += prod(A, r);
+          vcl_alpha = inner_prod(u, r);
+          r = u - vcl_alpha * r;
+          norm = norm_2(r);
+
+          q = boost::numeric::ublas::column(Q, i);
+          detail::copy_vec_to_vec(q, t);
+
+          u = - norm * t;
+          alphas.push_back(vcl_alpha);
+          betas.push_back(vcl_beta);
+          s.clear();
+        }
+
+        return bisect(alphas, betas);
+      }
+
+      /**
+      *   @brief Implementation of the Lanczos FRO algorithm
+      *
+      *   @param A            The system matrix
+      *   @param r            Random start vector
+      *   @param size         Size of krylov-space
+      *   @return             Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
+      */
+      template< typename MatrixT, typename VectorT >
+      std::vector<
+              typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+              >
+      lanczosFRO (MatrixT const& A, VectorT & r, vcl_size_t size, lanczos_tag)
+      {
+
+        typedef typename viennacl::result_of::value_type<MatrixT>::type        ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+          CPU_ScalarType temp;
+          CPU_ScalarType norm;
+          ScalarType vcl_beta;
+          ScalarType vcl_alpha;
+          std::vector<CPU_ScalarType> alphas, betas;
+          vcl_size_t n = r.size();
+          VectorT u(n), t(n);
+          ScalarType inner_rt;
+          boost::numeric::ublas::vector<CPU_ScalarType> u_zero(n), s(r.size()), q(n);
+          boost::numeric::ublas::matrix<CPU_ScalarType> Q(n, size);
+
+          long reorths = 0;
+          norm = norm_2(r);
+
+
+          for(vcl_size_t i = 0; i < size; i++)
+          {
+            r /= norm;
+
+            for(vcl_size_t j = 0; j < i; j++)
+            {
+              q = boost::numeric::ublas::column(Q, j);
+              detail::copy_vec_to_vec(q, t);
+              inner_rt = viennacl::linalg::inner_prod(r,t);
+              r = r - inner_rt * t;
+              reorths++;
+            }
+            temp = viennacl::linalg::norm_2(r);
+            r = r / temp;
+            vcl_beta = temp * norm;
+            detail::copy_vec_to_vec(r,s);
+            boost::numeric::ublas::column(Q, i) = s;
+
+            u += viennacl::linalg::prod(A, r);
+            vcl_alpha = viennacl::linalg::inner_prod(u, r);
+            r = u - vcl_alpha * r;
+            norm = viennacl::linalg::norm_2(r);
+            q = boost::numeric::ublas::column(Q, i);
+            detail::copy_vec_to_vec(q, t);
+            u = - norm * t;
+            alphas.push_back(vcl_alpha);
+            betas.push_back(vcl_beta);
+          }
+
+          return bisect(alphas, betas);
+      }
+
+    } // end namespace detail
+
+    /**
+    *   @brief Implementation of the calculation of eigenvalues using lanczos
+    *
+    *   @param matrix        The system matrix
+    *   @param tag           Tag with several options for the lanczos algorithm
+    *   @return              Returns the n largest eigenvalues (n defined in the lanczos_tag)
+    */
+    template< typename MatrixT >
+    std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+    eig(MatrixT const & matrix, lanczos_tag const & tag)
+    {
+      typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
+      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+      typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
+
+      boost::mt11213b mt;
+      boost::normal_distribution<CPU_ScalarType> N(0, 1);
+      boost::bernoulli_distribution<CPU_ScalarType> B(0.5);
+      boost::triangle_distribution<CPU_ScalarType> T(-1, 0, 1);
+
+      boost::variate_generator<boost::mt11213b&, boost::normal_distribution<CPU_ScalarType> >     get_N(mt, N);
+      boost::variate_generator<boost::mt11213b&, boost::bernoulli_distribution<CPU_ScalarType> >  get_B(mt, B);
+      boost::variate_generator<boost::mt11213b&, boost::triangle_distribution<CPU_ScalarType> >   get_T(mt, T);
+
+      std::vector<CPU_ScalarType> eigenvalues;
+      vcl_size_t matrix_size = matrix.size1();
+      VectorT r(matrix_size);
+      std::vector<CPU_ScalarType> s(matrix_size);
+
+      for(vcl_size_t i=0; i<s.size(); ++i)
+        s[i] = 3.0 * get_B() + get_T() - 1.5;
+
+      detail::copy_vec_to_vec(s,r);
+
+      vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
+                                                                  : tag.krylov_size();
+
+      switch(tag.method())
+      {
+        case lanczos_tag::partial_reorthogonalization:
+          eigenvalues = detail::lanczosPRO(matrix, r, size_krylov, tag);
+          break;
+        case lanczos_tag::full_reorthogonalization:
+          eigenvalues = detail::lanczosFRO(matrix, r, size_krylov, tag);
+          break;
+        case lanczos_tag::no_reorthogonalization:
+          eigenvalues = detail::lanczos(matrix, r, size_krylov, tag);
+          break;
+      }
+
+      std::vector<CPU_ScalarType> largest_eigenvalues;
+
+      for(vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++)
+        largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
+
+
+      return largest_eigenvalues;
+    }
+
+
+
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
diff --git a/viennacl/linalg/lu.hpp b/viennacl/linalg/lu.hpp
new file mode 100644
index 0000000..29bf304
--- /dev/null
+++ b/viennacl/linalg/lu.hpp
@@ -0,0 +1,227 @@
+#ifndef VIENNACL_LINALG_LU_HPP
+#define VIENNACL_LINALG_LU_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lu.hpp
+    @brief Implementations of LU factorization for row-major and column-major dense matrices.
+*/
+
+#include <algorithm>    //for std::min
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    /** @brief LU factorization of a row-major dense matrix.
+    *
+    * @param A    The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
+    */
+    template<typename SCALARTYPE>
+    void lu_factorize(matrix<SCALARTYPE, viennacl::row_major> & A)
+    {
+      typedef matrix<SCALARTYPE, viennacl::row_major>  MatrixType;
+
+      vcl_size_t max_block_size = 32;
+      vcl_size_t num_blocks = (A.size2() - 1) / max_block_size + 1;
+      std::vector<SCALARTYPE> temp_buffer(A.internal_size2() * max_block_size);
+
+      // Iterate over panels
+      for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
+      {
+        vcl_size_t row_start = panel_id * max_block_size;
+        vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - row_start, max_block_size);
+
+        viennacl::range     block_range(row_start, row_start + current_block_size);
+        viennacl::range remainder_range(row_start + current_block_size, A.size1());
+
+        //
+        // Perform LU factorization on panel:
+        //
+
+
+        // Read from matrix to buffer:
+        viennacl::backend::memory_read(A.handle(),
+                                       sizeof(SCALARTYPE) * row_start          * A.internal_size2(),
+                                       sizeof(SCALARTYPE) * current_block_size * A.internal_size2(),
+                                       &(temp_buffer[0]));
+
+        // Factorize (kij-version):
+        for (vcl_size_t k=0; k < current_block_size - 1; ++k)
+        {
+          for (vcl_size_t i=k+1; i < current_block_size; ++i)
+          {
+            temp_buffer[row_start + i * A.internal_size2() + k] /= temp_buffer[row_start + k * A.internal_size2() + k];  // write l_ik
+
+            SCALARTYPE l_ik = temp_buffer[row_start + i * A.internal_size2() + k];
+
+            for (vcl_size_t j = row_start + k + 1; j < A.size1(); ++j)
+              temp_buffer[i * A.internal_size2() + j] -= l_ik * temp_buffer[k * A.internal_size2() + j];  // l_ik * a_kj
+          }
+        }
+
+        // Write back:
+        viennacl::backend::memory_write(A.handle(),
+                                        sizeof(SCALARTYPE) * row_start          * A.internal_size2(),
+                                        sizeof(SCALARTYPE) * current_block_size * A.internal_size2(),
+                                        &(temp_buffer[0]));
+
+        if (remainder_range.size() > 0)
+        {
+          //
+          // Compute L_21 = [ (U_11)^{-T} A_{21}^T ]^T
+          //
+          viennacl::matrix_range<MatrixType> U_11(A, block_range,     block_range);
+          viennacl::matrix_range<MatrixType> A_21(A, remainder_range, block_range);
+          viennacl::linalg::inplace_solve(trans(U_11), trans(A_21), viennacl::linalg::lower_tag());
+
+          //
+          // Update remainder of A
+          //
+          viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
+          viennacl::matrix_range<MatrixType> U_12(A, block_range,     remainder_range);
+          viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);
+
+          A_22 -= viennacl::linalg::prod(L_21, U_12);
+        }
+      }
+
+    }
+
+
+    /** @brief LU factorization of a column-major dense matrix.
+    *
+    * @param A    The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
+    */
+    template<typename SCALARTYPE>
+    void lu_factorize(matrix<SCALARTYPE, viennacl::column_major> & A)
+    {
+      typedef matrix<SCALARTYPE, viennacl::column_major>  MatrixType;
+
+      vcl_size_t max_block_size = 32;
+      vcl_size_t num_blocks = (A.size1() - 1) / max_block_size + 1;
+      std::vector<SCALARTYPE> temp_buffer(A.internal_size1() * max_block_size);
+
+      // Iterate over panels
+      for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
+      {
+        vcl_size_t col_start = panel_id * max_block_size;
+        vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - col_start, max_block_size);
+
+        viennacl::range     block_range(col_start, col_start + current_block_size);
+        viennacl::range remainder_range(col_start + current_block_size, A.size1());
+
+        //
+        // Perform LU factorization on panel:
+        //
+
+
+        // Read from matrix to buffer:
+        viennacl::backend::memory_read(A.handle(),
+                                       sizeof(SCALARTYPE) * col_start          * A.internal_size1(),
+                                       sizeof(SCALARTYPE) * current_block_size * A.internal_size1(),
+                                       &(temp_buffer[0]));
+
+        // Factorize (kji-version):
+        for (vcl_size_t k=0; k < current_block_size; ++k)
+        {
+          SCALARTYPE a_kk = temp_buffer[col_start + k + k * A.internal_size1()];
+          for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
+            temp_buffer[i + k * A.internal_size1()] /= a_kk;  // write l_ik
+
+          for (vcl_size_t j=k+1; j < current_block_size; ++j)
+          {
+            SCALARTYPE a_kj = temp_buffer[col_start + k + j * A.internal_size1()];
+            for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
+              temp_buffer[i + j * A.internal_size1()] -= temp_buffer[i + k * A.internal_size1()] * a_kj;  // l_ik * a_kj
+          }
+        }
+
+        // Write back:
+        viennacl::backend::memory_write(A.handle(),
+                                        sizeof(SCALARTYPE) * col_start          * A.internal_size1(),
+                                        sizeof(SCALARTYPE) * current_block_size * A.internal_size1(),
+                                        &(temp_buffer[0]));
+
+        if (remainder_range.size() > 0)
+        {
+          //
+          // Compute U_12:
+          //
+          viennacl::matrix_range<MatrixType> L_11(A, block_range,     block_range);
+          viennacl::matrix_range<MatrixType> A_12(A, block_range, remainder_range);
+          viennacl::linalg::inplace_solve(L_11, A_12, viennacl::linalg::unit_lower_tag());
+
+          //
+          // Update remainder of A
+          //
+          viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
+          viennacl::matrix_range<MatrixType> U_12(A, block_range,     remainder_range);
+          viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);
+
+          A_22 -= viennacl::linalg::prod(L_21, U_12);
+        }
+
+      }
+
+    }
+
+
+    //
+    // Convenience layer:
+    //
+
+    /** @brief LU substitution for the system LU = rhs.
+    *
+    * @param A    The matrix holding the LU factors (as produced by lu_factorize); the implicit unit diagonal of L is not stored.
+    * @param B    The matrix of right-hand side vectors; overwritten with the solution.
+    */
+    template<typename SCALARTYPE, typename F1, typename F2, unsigned int ALIGNMENT_A, unsigned int ALIGNMENT_B>
+    void lu_substitute(matrix<SCALARTYPE, F1, ALIGNMENT_A> const & A,
+                       matrix<SCALARTYPE, F2, ALIGNMENT_B> & B)
+    {
+      assert(A.size1() == A.size2() && bool("Matrix must be square"));
+      assert(A.size1() == B.size1() && bool("Matrix must be square"));
+      inplace_solve(A, B, unit_lower_tag());
+      inplace_solve(A, B, upper_tag());
+    }
+
+    /** @brief LU substitution for the system LU = rhs.
+    *
+    * @param A      The matrix holding the LU factors (as produced by lu_factorize); the implicit unit diagonal of L is not stored.
+    * @param vec    The right-hand side vector; overwritten with the solution.
+    */
+    template<typename SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VEC_ALIGNMENT>
+    void lu_substitute(matrix<SCALARTYPE, F, ALIGNMENT> const & A,
+                       vector<SCALARTYPE, VEC_ALIGNMENT> & vec)
+    {
+      assert(A.size1() == A.size2() && bool("Matrix must be square"));
+      inplace_solve(A, vec, unit_lower_tag());
+      inplace_solve(A, vec, upper_tag());
+    }
+
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/matrix_operations.hpp b/viennacl/linalg/matrix_operations.hpp
index 0b2f7bc..5ca490e 100644
--- a/viennacl/linalg/matrix_operations.hpp
+++ b/viennacl/linalg/matrix_operations.hpp
@@ -1,32 +1,31 @@
-#ifndef VIENNACL_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_MATRIX_OPERATIONS_HPP_
+#ifndef VIENNACL_LINALG_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file matrix_operations.hpp
-    @brief Implementations of dense matrix related operations. also matrix-vector products.
+/** @file viennacl/linalg/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations including matrix-vector products.
 */
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/meta/enable_if.hpp"
 #include "viennacl/meta/predicate.hpp"
@@ -34,265 +33,303 @@
 #include "viennacl/traits/size.hpp"
 #include "viennacl/traits/start.hpp"
 #include "viennacl/traits/handle.hpp"
-#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
-#include "viennacl/tools/matrix_prod_kernel_class_deducer.hpp"
-#include "viennacl/linalg/kernels/vector_kernels.h"
-#include "viennacl/linalg/kernels/matrix_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_col_kernels.h"
-
-#include "viennacl/linalg/kernels/matrix_prod_col_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_row_kernels.h"
-
-#include "viennacl/linalg/kernels/matrix_prod_row_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_row_kernels.h"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/host_based/matrix_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/matrix_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/matrix_operations.hpp"
+#endif
 
 namespace viennacl
 {
   namespace linalg
   {
-    //
-    ///////////////////////////////////// addition and subtraction///////////////////////////////////////////////
-    //
-    
-    namespace detail
+
+    template <typename NumericT, typename F,
+              typename ScalarType1>
+    void am(matrix_base<NumericT, F> & mat1,
+            matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
     {
-      template<class T1, class T2, class T3>
-      typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
-                                   && viennacl::is_matrix<T2>::value 
-                                   && viennacl::is_matrix<T3>::value >::type
-      add_sub_impl(const T1 & mat1, 
-                   const T2 & mat2,
-                         T3 & result,
-                   std::string kernel_name
-                  )
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
       {
-        assert(result.size1() == mat1.size1());
-        assert(result.size2() == mat1.size2());
-        assert(result.size1() == mat2.size1());
-        assert(result.size2() == mat2.size2());
-
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(mat1), 
-                                        cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)), 
-                                        cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
-                                        cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
-                                viennacl::traits::handle(mat2), 
-                                        cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)), 
-                                        cl_uint(viennacl::traits::size1(mat2)),            cl_uint(viennacl::traits::size2(mat2)),
-                                        cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
-                                viennacl::traits::handle(result), 
-                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
-                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
-                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
-                                )
-                              );        
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
       }
-      
+    }
 
 
-      template <typename T1, typename T2>
-      typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                    && viennacl::is_matrix<T2>::value
-                                  >::type
-      inplace_add_sub_impl(T1 & result, T2 const & mat2, std::string kernel_name)
+    template <typename NumericT, typename F,
+              typename ScalarType1, typename ScalarType2>
+    void ambm(matrix_base<NumericT, F> & mat1,
+              matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+              matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+    {
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
       {
-        assert(viennacl::traits::size1(result) == viennacl::traits::size1(mat2));
-        assert(viennacl::traits::size2(result) == viennacl::traits::size2(mat2));
-
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
-                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
-                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
-                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
-                                viennacl::traits::handle(mat2), 
-                                        cl_uint(viennacl::traits::start1(mat2)),            cl_uint(viennacl::traits::start2(mat2)), 
-                                        cl_uint(viennacl::traits::size1(mat2)),             cl_uint(viennacl::traits::size2(mat2)),
-                                        cl_uint(viennacl::traits::internal_size1(mat2)),    cl_uint(viennacl::traits::internal_size2(mat2))
-                                )
-                              );
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::ambm(mat1,
+                                             mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                             mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::ambm(mat1,
+                                         mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::ambm(mat1,
+                                       mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                       mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
       }
-      
-    }
-    
-    /** @brief Adds two dense matrices or submatrices and writes the result to a third matrix or submatrix
-    *
-    * This is the implementation of the convenience expression result = mat1 + mat2;
-    *
-    * @param mat1   The left hand side operand
-    * @param mat2   The right hand side operand
-    * @param result The resulting matrix
-    */
-    template<class T1, class T2, class T3>
-    typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
-                                 && viennacl::is_matrix<T2>::value 
-                                 && viennacl::is_matrix<T3>::value >::type
-    add(const T1 & mat1, 
-        const T2 & mat2,
-              T3 & result)
-    {
-      detail::add_sub_impl(mat1, mat2, result, "add");
     }
 
-    /** @brief Adds a dense matrix or submatrix to another
-    *
-    * This is the implementation of the convenience expression result += mat1;
-    *
-    * @param mat2   The addend (either a matrix or a matrix_range)
-    * @param result The resulting matrix  (either a matrix or a matrix_range)
-    */
-    template <typename T1, typename T2>
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                >::type
-    inplace_add(T1 & result, T2 const & mat2)
+
+    template <typename NumericT, typename F,
+              typename ScalarType1, typename ScalarType2>
+    void ambm_m(matrix_base<NumericT, F> & mat1,
+                matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
     {
-      detail::inplace_add_sub_impl(result, mat2, "inplace_add");
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::ambm_m(mat1,
+                                               mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                               mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::ambm_m(mat1,
+                                           mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                           mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::ambm_m(mat1,
+                                         mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         mat3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
-
-    /** @brief Subtracts two dense matrices or submatrices and writes the result to a third matrix or submatrix
-    *
-    * This is the implementation of the convenience expression result = mat1 - mat2;
-    *
-    * @param mat1   The left hand side operand
-    * @param mat2   The right hand side operand
-    * @param result The resulting matrix
-    */
-    template<class T1, class T2, class T3>
-    typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
-                                 && viennacl::is_matrix<T2>::value 
-                                 && viennacl::is_matrix<T3>::value >::type
-    sub(const T1 & mat1, 
-        const T2 & mat2,
-              T3 & result)
+    template <typename NumericT, typename F>
+    void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
     {
-      detail::add_sub_impl(mat1, mat2, result, "sub");
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_assign(mat, s, clear);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_assign(mat, s, clear);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_assign(mat, s, clear);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Subtracts a dense matrix or submatrix from another
-    *
-    * This is the implementation of the convenience expression result -= mat1;
-    *
-    * @param mat2   The addend (either a matrix or a matrix_range)
-    * @param result The resulting matrix  (either a matrix or a matrix_range)
-    */
-    template <typename T1, typename T2>
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                >::type
-    inplace_sub(T1 & result, T2 const & mat2)
+
+    template <typename NumericT, typename F>
+    void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s)
     {
-      detail::inplace_add_sub_impl(result, mat2, "inplace_sub");
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_diagonal_assign(mat, s);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_diagonal_assign(mat, s);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_diagonal_assign(mat, s);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
-
-
-    //
-    /////////////////////////   inplace multiplication and division /////////////////////////////////
-    //
-
-    namespace detail
+    /** @brief Dispatcher interface for A = diag(v, k) */
+    template <typename NumericT, typename F>
+    void matrix_diag_from_vector(const vector_base<NumericT> & v, int k, matrix_base<NumericT, F> & A)
     {
-      template <typename  T1, typename ScalarType>
-      typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
-      inplace_mult_div_impl(T1 & result, 
-                            ScalarType val,
-                            std::string kernel_name)
+      switch (viennacl::traits::handle(v).get_active_handle_id())
       {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-          
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
-                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
-                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
-                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
-                                val)
-                              );
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_diag_from_vector(v, k, A);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_diag_from_vector(v, k, A);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_diag_from_vector(v, k, A);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
       }
     }
 
+    /** @brief Dispatcher interface for v = diag(A, k) */
+    template <typename NumericT, typename F>
+    void matrix_diag_to_vector(const matrix_base<NumericT, F> & A, int k, vector_base<NumericT> & v)
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_diag_to_vector(A, k, v);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_diag_to_vector(A, k, v);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_diag_to_vector(A, k, v);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
 
-    /** @brief Multiplies a dense matrix or submatrix by a scalar
-    *
-    * This is the implementation of the convenience expression matrix *= val;
-    *
-    * @param result The matrix to be manipulated
-    * @param val    The CPU scalar by which all entries of the matrix are multiplied
-    */
-    template <typename  T1>
-    typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
-    inplace_mult(T1 & result, 
-                 typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type val)
+    template <typename NumericT, typename F>
+    void matrix_row(const matrix_base<NumericT, F> & A, unsigned int i, vector_base<NumericT> & v)
     {
-      detail::inplace_mult_div_impl(result, val, "cpu_inplace_mult");
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_row(A, i, v);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_row(A, i, v);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_row(A, i, v);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
+    template <typename NumericT, typename F>
+    void matrix_column(const matrix_base<NumericT, F> & A, unsigned int j, vector_base<NumericT> & v)
+    {
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::matrix_column(A, j, v);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::matrix_column(A, j, v);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::matrix_column(A, j, v);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
 
-    /** @brief Multiplies a dense matrix or submatrix by a scalar
+    /** @brief Computes the Frobenius norm of a matrix - dispatcher interface
     *
-    * This is the implementation of the convenience expression matrix *= val;
-    *
-    * @param result The matrix to be manipulated
-    * @param val    The scalar by which all entries of the matrix are multiplied
+    * @param A      The matrix
+    * @param result The result scalar
     */
-    template <typename  T1>
-    typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
-    inplace_mult(T1 & result, 
-                 typename T1::value_type val)
+    template <typename T, typename F>
+    void norm_frobenius_impl(matrix_base<T, F> const & A,
+                             scalar<T> & result)
     {
-      detail::inplace_mult_div_impl(result, val, "inplace_mult");
+      typedef typename matrix_base<T, F>::handle_type  HandleType;
+      viennacl::vector_base<T> temp(const_cast<HandleType &>(A.handle()), A.internal_size(), 0, 1);
+      norm_2_impl(temp, result);
     }
 
-
-
-    /** @brief Divides a dense matrix or submatrix by a scalar
+    /** @brief Computes the Frobenius norm of a vector with final reduction on the CPU
     *
-    * This is the implementation of the convenience expression matrix /= val;
-    *
-    * @param result The matrix to be manipulated
-    * @param val    The scalar by which all entries of the matrix are divided
+    * @param A      The matrix
+    * @param result The result scalar
     */
-    template <typename  T1>
-    typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
-    inplace_divide(T1 & result, 
-                   typename T1::value_type val)
+    template <typename T, typename F>
+    void norm_frobenius_cpu(matrix_base<T, F> const & A,
+                             T & result)
     {
-      detail::inplace_mult_div_impl(result, val, "inplace_divide");
+      typedef typename matrix_base<T, F>::handle_type  HandleType;
+      viennacl::vector_base<T> temp(const_cast<HandleType &>(A.handle()), A.internal_size(), 0, 1);
+      norm_2_cpu(temp, result);
     }
 
-
-
     //
     /////////////////////////   matrix-vector products /////////////////////////////////
     //
@@ -300,24 +337,7 @@ namespace viennacl
 
 
     // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
-                                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                         op_prod >(mat, vec);
-    }
-    
+
     /** @brief Carries out matrix-vector multiplication
     *
     * Implementation of the convenience expression result = prod(mat, vec);
@@ -326,164 +346,119 @@ namespace viennacl
     * @param vec    The vector
     * @param result The result vector
     */
-    template<class TYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    void prod_impl(const viennacl::matrix<TYPE, F, ALIGNMENT> & mat, 
-                    const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec, 
-                          viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result)
+    template <typename NumericT, typename F>
+    void prod_impl(const matrix_base<NumericT, F> & mat,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result)
     {
-      assert(mat.size2() == vec.size());
-      // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
-      assert(vec.handle().get() != result.handle().get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
-      result.resize(mat.size1());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "vec_mul");
-      viennacl::ocl::enqueue(
-                             k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
-                                    cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()), vec, result));    
-    }
+      assert( (viennacl::traits::size1(mat) == viennacl::traits::size(result)) && bool("Size check failed at v1 = prod(A, v2): size1(A) != size(v1)"));
+      assert( (viennacl::traits::size2(mat) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = prod(A, v2): size2(A) != size(v2)"));
 
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(mat, vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(mat, vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(mat, vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
 
 
     // trans(A) * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a transposed matrix
-    *
-    * This is used for the convenience expression result = trans(mat) * vec;
-    *
-    * @param proxy  The transposed matrix proxy
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                   const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                   op_trans>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                op_prod > prod_impl(const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                                       const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                                       op_trans> & proxy, 
-                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return viennacl::vector_expression<const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                            const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                            op_trans>,
-                                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                         op_prod >(proxy, vec);
-    }
 
-    /** @brief Unwraps the transposed matrix proxy and forwards to trans_prod_impl()
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    void prod_impl(const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                      const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                      op_trans> & mat,
-                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                          viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
-    {
-      trans_prod_impl(mat.lhs(), vec, result);
-    }
-    
     /** @brief Carries out matrix-vector multiplication with a transposed matrix
     *
     * Implementation of the convenience expression result = trans(mat) * vec;
     *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param result The result vector
+    * @param mat_trans  The transposed matrix proxy
+    * @param vec        The vector
+    * @param result     The result vector
     */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    void trans_prod_impl(const matrix<SCALARTYPE, F, ALIGNMENT> & mat,
-                          const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
+    template <typename NumericT, typename F>
+    void prod_impl(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                   const vector_base<NumericT> & vec,
+                         vector_base<NumericT> & result)
     {
-      assert(mat.size1() == vec.size());  //remember: mat is transposed!
-      // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
-      assert(vec.handle().get() != result.handle().get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
-      result.resize(mat.size2());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "trans_vec_mul");
-      
-      viennacl::ocl::enqueue(k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
-                                    cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()), vec, result));        
-    }
-
-
+      assert( (viennacl::traits::size1(mat_trans.lhs()) == viennacl::traits::size(vec))    && bool("Size check failed at v1 = trans(A) * v2: size1(A) != size(v2)"));
+      assert( (viennacl::traits::size2(mat_trans.lhs()) == viennacl::traits::size(result)) && bool("Size check failed at v1 = trans(A) * v2: size2(A) != size(v1)"));
 
+      switch (viennacl::traits::handle(mat_trans.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(mat_trans, vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(mat_trans, vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(mat_trans, vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
 
 
     //
     /////////////////////////   matrix-matrix products /////////////////////////////////
     //
-    
-    namespace detail
-    {
-      // C = A * B and possibly transposed variants
-      template <typename T1, typename T2, typename T3 >
-      void prod(const T1 & A, 
-                const T2 & B, 
-                T3 & C,
-                std::string kernel_name,
-                int block_size = 16) // [JW] added ability to set block size from outside ..
-      {
-        typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
-        
-        typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
-        KernelClass::init();
-        
-        //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
-        
-        k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(C), block_size));
-        k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(C), block_size));
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(A), 
-                                        cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)), 
-                                        cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
-                                        cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
-                                 viennacl::traits::handle(B), 
-                                        cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)), 
-                                        cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
-                                        cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
-                                 viennacl::traits::handle(C), 
-                                        cl_uint(viennacl::traits::start1(C)),         cl_uint(viennacl::traits::start2(C)), 
-                                        cl_uint(viennacl::traits::size1(C)),          cl_uint(viennacl::traits::size2(C)),
-                                        cl_uint(viennacl::traits::internal_size1(C)), cl_uint(viennacl::traits::internal_size2(C)),
-                                 viennacl::ocl::local_mem(sizeof(cpu_value_type) * (block_size+1) * block_size),
-                                 viennacl::ocl::local_mem(sizeof(cpu_value_type) * (block_size+1) * block_size)
-                                )
-                              );        
-      }
-    }
-
 
     /** @brief Carries out matrix-matrix multiplication
     *
     * Implementation of C = prod(A, B);
     *
     */
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const T1 & A, 
-              const T2 & B, 
-                    T3 & C, 
-              int block_size = 16) // [JW] added ability to set block size from outside ..
+    template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+    void prod_impl(const matrix_base<NumericT, F1> & A,
+                   const matrix_base<NumericT, F2> & B,
+                         matrix_base<NumericT, F3> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
     {
-      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size2(A) == viennacl::traits::size1(B));
-      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-        
-      detail::prod(A, B, C, "prod_AA", block_size);
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size check failed at C = prod(A, B): size1(A) != size1(C)"));
+      assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size check failed at C = prod(A, B): size2(A) != size1(B)"));
+      assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size check failed at C = prod(A, B): size2(B) != size2(C)"));
+
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
@@ -493,29 +468,39 @@ namespace viennacl
     * Implementation of C = prod(trans(A), B);
     *
     */
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const viennacl::matrix_expression< const T1,
-                                                 const T1,
-                                                 op_trans> & A, 
-              const T2 & B, 
-                    T3 & C, 
-              int block_size = 16)
+    template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+    void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
+                                                      const matrix_base<NumericT, F1>,
+                                                      op_trans> & A,
+                   const matrix_base<NumericT, F2> & B,
+                         matrix_base<NumericT, F3> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
     {
-      //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
-      //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
-      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B));
-      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      detail::prod(A.lhs(), B, C, "prod_TA", block_size);
+      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size check failed at C = prod(trans(A), B): size2(A) != size1(C)"));
+      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B) && bool("Size check failed at C = prod(trans(A), B): size1(A) != size1(B)"));
+      assert(viennacl::traits::size2(B)       == viennacl::traits::size2(C) && bool("Size check failed at C = prod(trans(A), B): size2(B) != size2(C)"));
+
+      switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
@@ -526,27 +511,37 @@ namespace viennacl
     * Implementation of C = prod(A, trans(B));
     *
     */
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const T1 & A, 
-              const viennacl::matrix_expression< const T2,
-                                                 const T2,
-                                                 op_trans> & B,
-              T3 & C, 
-              int block_size = 16)
+    template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+    void prod_impl(const matrix_base<NumericT, F1> & A,
+                   const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                         matrix_base<NumericT, F3> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
     {
-      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs()));
-      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      detail::prod(A, B.lhs(), C, "prod_AT", block_size);
+      assert(viennacl::traits::size1(A)       == viennacl::traits::size1(C)       && bool("Size check failed at C = prod(A, trans(B)): size1(A) != size1(C)"));
+      assert(viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(A, trans(B)): size2(A) != size2(B)"));
+      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size check failed at C = prod(A, trans(B)): size1(B) != size2(C)"));
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
@@ -556,113 +551,233 @@ namespace viennacl
     * Implementation of C = prod(trans(A), trans(B));
     *
     */
-    template <typename T1, typename T2, typename T3 >
-    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
-                                  && viennacl::is_matrix<T2>::value
-                                  && viennacl::is_matrix<T3>::value
-                                >::type
-    prod_impl(const viennacl::matrix_expression< const T1,
-                                                 const T1,
-                                                 op_trans> & A,
-              const viennacl::matrix_expression< const T2,
-                                                 const T2,
-                                                 op_trans> & B,
-              T3 & C, 
-              int block_size = 16)
+    template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+    void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
+                   const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                   matrix_base<NumericT, F3> & C,
+                   ScalarType alpha,
+                   ScalarType beta)
     {
-      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
-      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()));
-      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
-            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
-            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      detail::prod(A.lhs(), B.lhs(), C, "prod_TT", block_size);
+      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size check failed at C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
+      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
+      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size check failed at C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
+
+      switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(A, B, C, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(A, B, C, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
+    ///////////////////////// Elementwise operations /////////////
 
 
-    //
-    /////////////////////////   miscellaneous operations /////////////////////////////////
-    //
+
+    /** @brief Implementation of the element-wise operation A = B .* C and A = B ./ C for matrices (using MATLAB syntax). Don't use this function directly, use element_prod() and element_div().
+    *
+    * @param A      The result matrix (or -range, or -slice)
+    * @param proxy  The proxy object holding B, C, and the operation
+    */
+    template <typename T, typename F, typename OP>
+    void element_op(matrix_base<T, F> & A,
+                    matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, OP> const & proxy)
+    {
+      assert( (viennacl::traits::size1(A) == viennacl::traits::size1(proxy)) && bool("Size check failed at A = element_op(B): size1(A) != size1(B)"));
+      assert( (viennacl::traits::size2(A) == viennacl::traits::size2(proxy)) && bool("Size check failed at A = element_op(B): size2(A) != size2(B)"));
+
+      switch (viennacl::traits::handle(A).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::element_op(A, proxy);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::element_op(A, proxy);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::element_op(A, proxy);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
 
 
+#define VIENNACL_MAKE_BINARY_OP(OPNAME)\
+    template <typename T, typename F>\
+    viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_base<T, F> const & A, matrix_base<T, F> const & B)\
+    {\
+      return viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<op_##OPNAME> >(A, B);\
+    }\
+\
+    template <typename M1, typename M2, typename OP, typename T, typename F>\
+    viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP>,\
+                                const matrix_base<T, F>,\
+                                op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_expression<const M1, const M2, OP> const & proxy, matrix_base<T, F> const & B)\
+    {\
+      return viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP>,\
+                                         const matrix_base<T, F>,\
+                                         op_element_binary<op_##OPNAME> >(proxy, B);\
+    }\
+\
+    template <typename T, typename F, typename M2, typename M3, typename OP>\
+    viennacl::matrix_expression<const matrix_base<T, F>,\
+                                const matrix_expression<const M2, const M3, OP>,\
+                                op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_base<T, F> const & A, matrix_expression<const M2, const M3, OP> const & proxy)\
+    {\
+      return viennacl::matrix_expression<const matrix_base<T, F>,\
+                                         const matrix_expression<const M2, const M3, OP>,\
+                                         op_element_binary<op_##OPNAME> >(A, proxy);\
+    }\
+\
+    template <typename M1, typename M2, typename OP1,\
+              typename M3, typename M4, typename OP2>\
+    viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP1>,\
+                                const matrix_expression<const M3, const M4, OP2>,\
+                                op_element_binary<op_##OPNAME> >\
+    element_##OPNAME(matrix_expression<const M1, const M2, OP1> const & proxy1,\
+                 matrix_expression<const M3, const M4, OP2> const & proxy2)\
+    {\
+      return viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP1>,\
+                                         const matrix_expression<const M3, const M4, OP2>,\
+                                         op_element_binary<op_##OPNAME> >(proxy1, proxy2);\
+    }
 
+    VIENNACL_MAKE_BINARY_OP(prod)
+    VIENNACL_MAKE_BINARY_OP(div)
+    VIENNACL_MAKE_BINARY_OP(pow)
+
+#undef VIENNACL_GENERATE_BINARY_OP_OVERLOADS
+
+
+
+#define VIENNACL_MAKE_UNARY_ELEMENT_OP(funcname) \
+    template <typename T, typename F> \
+    viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_##funcname> > \
+    element_##funcname(matrix_base<T, F> const & A) \
+    { \
+      return viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_##funcname> >(A, A); \
+    } \
+    template <typename LHS, typename RHS, typename OP> \
+    viennacl::matrix_expression<const matrix_expression<const LHS, const RHS, OP>, \
+                                const matrix_expression<const LHS, const RHS, OP>, \
+                                op_element_unary<op_##funcname> > \
+    element_##funcname(matrix_expression<const LHS, const RHS, OP> const & proxy) \
+    { \
+      return viennacl::matrix_expression<const matrix_expression<const LHS, const RHS, OP>, \
+                                         const matrix_expression<const LHS, const RHS, OP>, \
+                                         op_element_unary<op_##funcname> >(proxy, proxy); \
+    } \
+
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(abs)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(acos)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(asin)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(atan)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(ceil)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(cos)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(cosh)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(exp)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(fabs)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(floor)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(log)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(log10)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sin)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sinh)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sqrt)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(tan)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(tanh)
+
+#undef VIENNACL_MAKE_UNARY_ELEMENT_OP
 
 
+    //
+    /////////////////////////   miscellaneous operations /////////////////////////////////
+    //
+
 
     /** @brief Returns a proxy class for the operation mat += vec1 * vec2^T, i.e. a rank 1 update
     *
     * @param vec1    The first vector
     * @param vec2    The second vector
     */
-    template<class SCALARTYPE, unsigned int VA1, unsigned int VA2>
-    viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                 const viennacl::vector<SCALARTYPE, VA2>,
-                                 op_prod> outer_prod(const viennacl::vector<SCALARTYPE, VA1> & vec1, 
-                                                     const viennacl::vector<SCALARTYPE, VA2> & vec2)
+    template <typename NumericT>
+    viennacl::matrix_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_prod>
+    outer_prod(const vector_base<NumericT> & vec1, const vector_base<NumericT> & vec2)
     {
-      return viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                          const viennacl::vector<SCALARTYPE, VA2>,
-                                          op_prod>(vec1, vec2);
+      return viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>(vec1, vec2);
     }
-    
-    
 
-    /** @brief The implementation of the operation mat += vec1 * vec2^T, i.e. a rank 1 update
-    *
-    * Implementation of the convenience expression result += outer_prod(vec1, vec2);
-    *
-    * @param mat1    The matrix to be updated
-    * @param vec1    The first vector
-    * @param vec2    The second vector
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void rank_1_update(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat1, 
-                       const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 
-                       const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
-    {
-      assert(mat1.size1() == vec1.size());
-      assert(mat1.size2() == vec2.size());
 
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "rank1_update");
-
-      viennacl::ocl::enqueue(k(mat1, cl_uint(mat1.size1()), cl_uint(mat1.size2()),
-                                     cl_uint(mat1.internal_size1()), cl_uint(mat1.internal_size2()), vec1, vec2));        
-    }
-    
-    
     /** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
     *
     * Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
     *
-    * @param mat1    The matrix to be updated
-    * @param val     The scaling factor
-    * @param vec1    The first vector
-    * @param vec2    The second vector
+    * @param mat1             The matrix to be updated
+    * @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+    * @param len_alpha        Length of the buffer for an eventual final reduction step (currently always '1')
+    * @param reciprocal_alpha Use 1/alpha instead of alpha
+    * @param flip_sign_alpha  Use -alpha instead of alpha
+    * @param vec1             The first vector
+    * @param vec2             The second vector
     */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void scaled_rank_1_update(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat1,
-                              SCALARTYPE val,
-                              const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 
-                              const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
+    template <typename NumericT, typename F, typename S1>
+    void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
+                              S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                              const vector_base<NumericT> & vec1,
+                              const vector_base<NumericT> & vec2)
     {
-      assert(mat1.size1() == vec1.size());
-      assert(mat1.size2() == vec2.size());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "scaled_rank1_update");
-
-      viennacl::ocl::enqueue(k(mat1, cl_uint(mat1.size1()), cl_uint(mat1.size2()),
-                                     cl_uint(mat1.internal_size1()), cl_uint(mat1.internal_size2()), 
-                                                           val, vec1, vec2));        
+      switch (viennacl::traits::handle(mat1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::scaled_rank_1_update(mat1,
+                                                             alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                             vec1, vec2);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::scaled_rank_1_update(mat1,
+                                                         alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                         vec1, vec2);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::scaled_rank_1_update(mat1,
+                                                       alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                       vec1, vec2);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
-    
+
   } //namespace linalg
 
 
@@ -673,107 +788,81 @@ namespace viennacl
   //
 
 
-
-
-
-  //v = A * x
-  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-  *
-  * @param proxy  An expression template proxy class.
-  */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                        const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                        viennacl::op_prod> & proxy) 
-  {
-    // check for the special case x = A * x
-    if (proxy.rhs().handle().get() == this->handle().get())
-    {
-      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this = result;
-      return *this;
-    }
-    else
-    {
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-      return *this;
-    }
-    return *this;
-  }
-
   //v += A * x
   /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
   *
+  * @param v1     The result vector v1 where A * v2 is added to
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  vector<NumericT>
+  operator+=(vector_base<NumericT> & v1,
+             const viennacl::vector_expression< const matrix_base<NumericT, F>, const vector_base<NumericT>, viennacl::op_prod> & proxy)
   {
-    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    assert(viennacl::traits::size1(proxy.lhs()) == v1.size() && bool("Size check failed for v1 += A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size1(proxy.lhs()));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    *this += result;
-    return *this;
+    v1 += result;
+    return v1;
   }
 
   /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
   *
+  * @param v1     The result vector v1 where A * v2 is subtracted from
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  vector<NumericT>
+  operator-=(vector_base<NumericT> & v1,
+             const viennacl::vector_expression< const matrix_base<NumericT, F>, const vector_base<NumericT>, viennacl::op_prod> & proxy)
   {
-    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    assert(viennacl::traits::size1(proxy.lhs()) == v1.size() && bool("Size check failed for v1 -= A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size1(proxy.lhs()));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    *this -= result;
-    return *this;
+    v1 -= result;
+    return v1;
   }
-  
-  
+
+
+
+
+
   //free functions:
   /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
   *
+  * @param v1     The addend vector.
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                              const vector<SCALARTYPE, ALIGNMENT>,
-                                                                              op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  viennacl::vector<NumericT>
+  operator+(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_base<NumericT, F>, const vector_base<NumericT>, op_prod> & proxy)
   {
-    assert(proxy.lhs().size1() == size());
-    vector<SCALARTYPE, ALIGNMENT> result(size());
+    assert(viennacl::traits::size1(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed for v1 + A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    result += *this;
+    result += v1;
     return result;
   }
 
   /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
   *
+  * @param v1     The minuend vector.
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                              const vector<SCALARTYPE, ALIGNMENT>,
-                                                                              op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  viennacl::vector<NumericT>
+  operator-(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_base<NumericT, F>, const vector_base<NumericT>, op_prod> & proxy)
   {
-    assert(proxy.lhs().size1() == size());
-    vector<SCALARTYPE, ALIGNMENT> result(size());
+    assert(viennacl::traits::size1(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed for v1 - A * v2: size1(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    result = *this - result;
+    result = v1 - result;
     return result;
   }
 
@@ -781,114 +870,87 @@ namespace viennacl
   ////////// transposed_matrix_proxy
 
 
-  //v = trans(A) * x
-  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-  *
-  * @param proxy  An expression template proxy class.
-  */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                                  const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                                  op_trans>,
-                                                                                        const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                        viennacl::op_prod> & proxy) 
-  {
-    // check for the special case x = trans(A) * x
-    if (proxy.rhs().handle().get() == this->handle().get())
-    {
-      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this = result;
-      return *this;
-    }
-    else
-    {
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-      return *this;
-    }
-    return *this;
-  }
-
-  //v += A * x
+  //v += A^T * x
   /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
   *
+  * @param v1     The addend vector where the result is written to.
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        op_trans>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  vector<NumericT>
+  operator+=(vector_base<NumericT> & v1,
+             const vector_expression< const matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>,
+                                                              const vector_base<NumericT>,
+                                                              op_prod> & proxy)
   {
-    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    assert(viennacl::traits::size2(proxy.lhs()) == v1.size() && bool("Size check failed in v1 += trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size2(proxy.lhs()));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    *this += result;
-    return *this;
+    v1 += result;
+    return v1;
   }
 
+  //v -= A^T * x
   /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
   *
+  * @param v1     The vector from which trans(A) * v2 is subtracted.
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        op_trans>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  vector<NumericT>
+  operator-=(vector_base<NumericT> & v1,
+             const vector_expression< const matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>,
+                                                              const vector_base<NumericT>,
+                                                              op_prod> & proxy)
   {
-    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    assert(viennacl::traits::size2(proxy.lhs()) == v1.size() && bool("Size check failed in v1 -= trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size2(proxy.lhs()));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    *this -= result;
-    return *this;
+    v1 -= result;
+    return v1;
   }
-  
-  
+
+
   //free functions:
   /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
   *
+  * @param v1     The addend vector.
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        op_trans>,
-                                                                              const vector<SCALARTYPE, ALIGNMENT>,
-                                                                              op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  vector<NumericT>
+  operator+(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>,
+                                     const vector_base<NumericT>,
+                                     op_prod> & proxy)
   {
-    assert(proxy.lhs().size1() == size());
-    vector<SCALARTYPE, ALIGNMENT> result(size());
+    assert(viennacl::traits::size2(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed in v1 + trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    result += *this;
+    result += v1;
     return result;
   }
 
   /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
   *
+  * @param v1     The minuend vector.
   * @param proxy  An expression template proxy class.
   */
-  template <typename SCALARTYPE, unsigned int ALIGNMENT>
-  template <typename F, unsigned int MAT_ALIGNMENT>
-  viennacl::vector<SCALARTYPE, ALIGNMENT> 
-  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                        op_trans>,
-                                                                              const vector<SCALARTYPE, ALIGNMENT>,
-                                                                              op_prod> & proxy) 
+  template <typename NumericT, typename F>
+  vector<NumericT>
+  operator-(const vector_base<NumericT> & v1,
+            const vector_expression< const matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>,
+                                     const vector_base<NumericT>,
+                                     op_prod> & proxy)
   {
-    assert(proxy.lhs().size1() == size());
-    vector<SCALARTYPE, ALIGNMENT> result(size());
+    assert(viennacl::traits::size2(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed in v1 - trans(A) * v2: size2(A) != size(v1)"));
+
+    vector<NumericT> result(viennacl::traits::size(v1));
     viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-    result = *this - result;
+    result = v1 - result;
     return result;
   }
 
diff --git a/viennacl/linalg/misc_operations.hpp b/viennacl/linalg/misc_operations.hpp
new file mode 100644
index 0000000..0bd1b58
--- /dev/null
+++ b/viennacl/linalg/misc_operations.hpp
@@ -0,0 +1,94 @@
+#ifndef VIENNACL_LINALG_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/misc_operations.hpp
+    @brief Implementations of miscellaneous operations
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/misc_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/misc_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/misc_operations.hpp"
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    namespace detail
+    {
+
+      template <typename ScalarType>
+      void level_scheduling_substitute(vector<ScalarType> & vec,
+                                  viennacl::backend::mem_handle const & row_index_array,
+                                  viennacl::backend::mem_handle const & row_buffer,
+                                  viennacl::backend::mem_handle const & col_buffer,
+                                  viennacl::backend::mem_handle const & element_buffer,
+                                  vcl_size_t num_rows
+                                  )
+      {
+        assert( viennacl::traits::handle(vec).get_active_handle_id() == row_index_array.get_active_handle_id() && bool("Incompatible memory domains"));
+        assert( viennacl::traits::handle(vec).get_active_handle_id() ==      row_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+        assert( viennacl::traits::handle(vec).get_active_handle_id() ==      col_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+        assert( viennacl::traits::handle(vec).get_active_handle_id() ==  element_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+
+        switch (viennacl::traits::handle(vec).get_active_handle_id())
+        {
+          case viennacl::MAIN_MEMORY:
+            viennacl::linalg::host_based::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case viennacl::OPENCL_MEMORY:
+            viennacl::linalg::opencl::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case viennacl::CUDA_MEMORY:
+            viennacl::linalg::cuda::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+            break;
+#endif
+          case viennacl::MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("not implemented");
+        }
+      }
+
+
+
+
+    } //namespace detail
+
+
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/mixed_precision_cg.hpp b/viennacl/linalg/mixed_precision_cg.hpp
new file mode 100644
index 0000000..eae0713
--- /dev/null
+++ b/viennacl/linalg/mixed_precision_cg.hpp
@@ -0,0 +1,254 @@
+#ifndef VIENNACL_LINALG_MIXED_PRECISION_CG_HPP_
+#define VIENNACL_LINALG_MIXED_PRECISION_CG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/mixed_precision_cg.hpp
+    @brief The conjugate gradient method using mixed precision is implemented here. Experimental.
+*/
+
+#include <vector>
+#include <map>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/vector_proxy.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief A tag for the conjugate gradient method. Used for supplying solver parameters and for dispatching the solve() function
+    */
+    class mixed_precision_cg_tag
+    {
+      public:
+        /** @brief The constructor
+        *
+        * @param tol              Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+        * @param max_iterations   The maximum number of iterations
+        * @param inner_tol        Inner tolerance for the low-precision iterations
+        */
+        mixed_precision_cg_tag(double tol = 1e-8, unsigned int max_iterations = 300, float inner_tol = 1e-2f) : tol_(tol), iterations_(max_iterations), inner_tol_(inner_tol) {}
+
+        /** @brief Returns the relative tolerance */
+        double tolerance() const { return tol_; }
+        /** @brief Returns the relative tolerance for the inner low-precision iterations */
+        float inner_tolerance() const { return inner_tol_; }
+        /** @brief Returns the maximum number of iterations */
+        unsigned int max_iterations() const { return iterations_; }
+
+        /** @brief Return the number of solver iterations: */
+        unsigned int iters() const { return iters_taken_; }
+        void iters(unsigned int i) const { iters_taken_ = i; }
+
+        /** @brief Returns the estimated relative error at the end of the solver run */
+        double error() const { return last_error_; }
+        /** @brief Sets the estimated relative error at the end of the solver run */
+        void error(double e) const { last_error_ = e; }
+
+
+      private:
+        double tol_;
+        unsigned int iterations_;
+        float inner_tol_;
+
+        //return values from solver
+        mutable unsigned int iters_taken_;
+        mutable double last_error_;
+    };
+
+
+    const char * double_float_conversion_program =
+    "#if defined(cl_khr_fp64)\n"
+    "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
+    "#elif defined(cl_amd_fp64)\n"
+    "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
+    "#endif\n"
+    "__kernel void assign_double_to_float(\n"
+    "          __global float * vec1,\n"
+    "          __global const double * vec2, \n"
+    "          unsigned int size) \n"
+    "{ \n"
+    "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
+    "    vec1[i] = (float)(vec2[i]);\n"
+    "};\n\n"
+    "__kernel void inplace_add_float_to_double(\n"
+    "          __global double * vec1,\n"
+    "          __global const float * vec2, \n"
+    "          unsigned int size) \n"
+    "{ \n"
+    "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
+    "    vec1[i] += (double)(vec2[i]);\n"
+    "};\n";
+
+
+    /** @brief Implementation of the conjugate gradient solver without preconditioner
+    *
+    * Following the algorithm in the book by Y. Saad "Iterative Methods for sparse linear systems"
+    *
+    * @param matrix     The system matrix
+    * @param rhs        The load vector
+    * @param tag        Solver configuration tag
+    * @return The result vector
+    */
+    template <typename MatrixType, typename VectorType>
+    VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag)
+    {
+      //typedef typename VectorType::value_type      ScalarType;
+      typedef typename viennacl::result_of::value_type<VectorType>::type        ScalarType;
+      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+      //TODO: Assert CPU_ScalarType == double
+
+      //std::cout << "Starting CG" << std::endl;
+      vcl_size_t problem_size = viennacl::traits::size(rhs);
+      VectorType result(rhs);
+      viennacl::traits::clear(result);
+
+      VectorType residual = rhs;
+
+      CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(rhs, rhs);
+      CPU_ScalarType new_ip_rr = 0;
+      CPU_ScalarType norm_rhs_squared = ip_rr;
+
+      if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero
+        return result;
+
+      static bool first = true;
+
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+      if (first)
+      {
+        ctx.add_program(double_float_conversion_program, "double_float_conversion_program");
+      }
+
+      viennacl::vector<float> residual_low_precision(problem_size, viennacl::traits::context(rhs));
+      viennacl::vector<float> result_low_precision(problem_size, viennacl::traits::context(rhs));
+      viennacl::vector<float> p_low_precision(problem_size, viennacl::traits::context(rhs));
+      viennacl::vector<float> tmp_low_precision(problem_size, viennacl::traits::context(rhs));
+      float inner_ip_rr = static_cast<float>(ip_rr);
+      float new_inner_ip_rr = 0;
+      float initial_inner_rhs_norm_squared = static_cast<float>(ip_rr);
+      float alpha;
+      float beta;
+
+      viennacl::ocl::kernel & assign_double_to_float      = ctx.get_kernel("double_float_conversion_program", "assign_double_to_float");
+      viennacl::ocl::kernel & inplace_add_float_to_double = ctx.get_kernel("double_float_conversion_program", "inplace_add_float_to_double");
+
+      // transfer rhs to single precision:
+      viennacl::ocl::enqueue( assign_double_to_float(p_low_precision.handle().opencl_handle(),
+                                                     rhs.handle().opencl_handle(),
+                                                     cl_uint(rhs.size())
+                                                    ) );
+      //std::cout << "copying p_low_precision..." << std::endl;
+      //assign_double_to_float(p_low_precision.handle(), residual.handle(), residual.size());
+      residual_low_precision = p_low_precision;
+
+      // transfer matrix to single precision:
+      viennacl::compressed_matrix<float> matrix_low_precision(matrix.size1(), matrix.size2(), matrix.nnz(), viennacl::traits::context(rhs));
+      viennacl::backend::memory_copy(matrix.handle1(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle1()), 0, 0, sizeof(cl_uint) * (matrix.size1() + 1) );
+      viennacl::backend::memory_copy(matrix.handle2(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle2()), 0, 0, sizeof(cl_uint) * (matrix.nnz()) );
+
+      viennacl::ocl::enqueue( assign_double_to_float(matrix_low_precision.handle().opencl_handle(),
+                                                     matrix.handle().opencl_handle(),
+                                                     cl_uint(matrix.nnz())
+                                                    ) );
+      //std::cout << "copying matrix_low_precision..." << std::endl;
+      //assign_double_to_float(const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle()), matrix.handle(), matrix.nnz());
+
+      //std::cout << "Starting CG solver iterations... " << std::endl;
+
+
+      for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+      {
+        tag.iters(i+1);
+
+        // lower precision 'inner iteration'
+        tmp_low_precision = viennacl::linalg::prod(matrix_low_precision, p_low_precision);
+
+        alpha = inner_ip_rr / viennacl::linalg::inner_prod(tmp_low_precision, p_low_precision);
+        result_low_precision += alpha * p_low_precision;
+        residual_low_precision -= alpha * tmp_low_precision;
+
+        new_inner_ip_rr = viennacl::linalg::inner_prod(residual_low_precision, residual_low_precision);
+
+        beta = new_inner_ip_rr / inner_ip_rr;
+        inner_ip_rr = new_inner_ip_rr;
+
+        p_low_precision = residual_low_precision + beta * p_low_precision;
+
+
+
+        if (new_inner_ip_rr < tag.inner_tolerance() * initial_inner_rhs_norm_squared || i == tag.max_iterations()-1)
+        {
+          //std::cout << "outer correction at i=" << i << std::endl;
+          //result += result_low_precision;
+          viennacl::ocl::enqueue( inplace_add_float_to_double(result.handle().opencl_handle(),
+                                                              result_low_precision.handle().opencl_handle(),
+                                                              cl_uint(result.size())
+                                                             ) );
+
+          // residual = b - Ax  (without introducing a temporary)
+          residual = viennacl::linalg::prod(matrix, result);
+          residual = rhs - residual;
+
+          new_ip_rr = viennacl::linalg::inner_prod(residual, residual);
+          if (new_ip_rr / norm_rhs_squared < tag.tolerance() *  tag.tolerance())//squared norms involved here
+            break;
+
+          // p_low_precision = residual;
+          viennacl::ocl::enqueue( assign_double_to_float(p_low_precision.handle().opencl_handle(),
+                                                         residual.handle().opencl_handle(),
+                                                         cl_uint(residual.size())
+                                                        ) );
+          result_low_precision.clear();
+          residual_low_precision = p_low_precision;
+          initial_inner_rhs_norm_squared = static_cast<float>(new_ip_rr);
+          inner_ip_rr = static_cast<float>(new_ip_rr);
+        }
+      }
+
+      //store last error estimate:
+      tag.error(std::sqrt(new_ip_rr / norm_rhs_squared));
+
+      return result;
+    }
+
+    template <typename MatrixType, typename VectorType>
+    VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag, viennacl::linalg::no_precond)
+    {
+      return solve(matrix, rhs, tag);
+    }
+
+
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/nmf.hpp b/viennacl/linalg/nmf.hpp
new file mode 100644
index 0000000..e47712d
--- /dev/null
+++ b/viennacl/linalg/nmf.hpp
@@ -0,0 +1,200 @@
+#ifndef VIENNACL_LINALG_NMF_HPP
+#define VIENNACL_LINALG_NMF_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/nmf.hpp
+    @brief Provides a nonnegative matrix factorization implementation.  Experimental.
+
+    Contributed by Volodymyr Kysenko.
+*/
+
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+#include "viennacl/linalg/opencl/kernels/nmf.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    /** @brief Configuration class for the nonnegative-matrix-factorization algorithm. Specify tolerances, maximum iteration counts, etc., here. */
+    class nmf_config
+    {
+      public:
+        nmf_config(double val_epsilon = 1e-4,
+                   double val_epsilon_stagnation = 1e-5,
+                   vcl_size_t num_max_iters = 10000,
+                   vcl_size_t num_check_iters = 100)
+         : eps_(val_epsilon), stagnation_eps_(val_epsilon_stagnation),
+           max_iters_(num_max_iters),
+           check_after_steps_( (num_check_iters > 0) ? num_check_iters : 1),
+           print_relative_error_(false),
+           iters_(0) {}
+
+        /** @brief Returns the relative tolerance for convergence */
+        double tolerance() const { return eps_; }
+
+        /** @brief Sets the relative tolerance for convergence, i.e. norm(V - W * H) / norm(V - W_init * H_init) */
+        void tolerance(double e) { eps_ = e; }
+
+        /** @brief Relative tolerance for the stagnation check */
+        double stagnation_tolerance() const { return stagnation_eps_; }
+
+        /** @brief Sets the tolerance for the stagnation check (i.e. the minimum required relative change of the residual between two iterations) */
+        void stagnation_tolerance(double e) { stagnation_eps_ = e; }
+
+        /** @brief Returns the maximum number of iterations for the NMF algorithm */
+        vcl_size_t max_iterations() const { return max_iters_; }
+        /** @brief Sets the maximum number of iterations for the NMF algorithm */
+        void max_iterations(vcl_size_t m) { max_iters_ = m; }
+
+        /** @brief Returns the number of iterations of the last NMF run using this configuration object */
+        vcl_size_t iters() const { return iters_; }
+
+
+        /** @brief Number of steps after which the convergence of NMF should be checked (again) */
+        vcl_size_t check_after_steps() const { return check_after_steps_; }
+        /** @brief Set the number of steps after which the convergence of NMF should be checked (again) */
+        void check_after_steps(vcl_size_t c) { if (c > 0) check_after_steps_ = c; }
+
+        /** @brief Returns the flag specifying whether the relative tolerance should be printed in each iteration */
+        bool print_relative_error() const { return print_relative_error_; }
+        /** @brief Specify whether the relative error should be printed at each convergence check after 'num_check_iters' steps */
+        void print_relative_error(bool b) { print_relative_error_ = b; }
+
+        template <typename ScalarType>
+        friend void nmf(viennacl::matrix<ScalarType> const & V,
+                        viennacl::matrix<ScalarType> & W,
+                        viennacl::matrix<ScalarType> & H,
+                        nmf_config const & conf);
+
+      private:
+        double eps_;
+        double stagnation_eps_;
+        vcl_size_t max_iters_;
+        vcl_size_t check_after_steps_;
+        bool print_relative_error_;
+        mutable vcl_size_t iters_;
+    };
+
+
+    /** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+     *
+     * @param V     Input matrix
+     * @param W     First factor
+     * @param H     Second factor
+     * @param conf  A configuration object holding tolerances and the like
+     */
+    template <typename ScalarType>
+    void nmf(viennacl::matrix<ScalarType> const & V,
+             viennacl::matrix<ScalarType> & W,
+             viennacl::matrix<ScalarType> & H,
+             nmf_config const & conf)
+    {
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(V).context());
+
+      const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
+
+      viennacl::linalg::opencl::kernels::nmf<ScalarType>::init(ctx);
+
+      assert(V.size1() == W.size1() && V.size2() == H.size2() && bool("Dimensions of W and H don't allow for V = W * H"));
+      assert(W.size2() == H.size1() && bool("Dimensions of W and H don't match, prod(W, H) impossible"));
+
+      vcl_size_t k = W.size2();
+      conf.iters_ = 0;
+
+      viennacl::matrix<ScalarType> wn(V.size1(), k);
+      viennacl::matrix<ScalarType> wd(V.size1(), k);
+      viennacl::matrix<ScalarType> wtmp(V.size1(), V.size2());
+
+      viennacl::matrix<ScalarType> hn(k, V.size2());
+      viennacl::matrix<ScalarType> hd(k, V.size2());
+      viennacl::matrix<ScalarType> htmp(k, k);
+
+      viennacl::matrix<ScalarType> appr(V.size1(), V.size2());
+      viennacl::vector<ScalarType> diff(V.size1() * V.size2());
+
+      ScalarType last_diff = 0;
+      ScalarType diff_init = 0;
+      bool stagnation_flag = false;
+
+
+      for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+      {
+        conf.iters_ = i + 1;
+        {
+          hn   = viennacl::linalg::prod(trans(W), V);
+          htmp = viennacl::linalg::prod(trans(W), W);
+          hd   = viennacl::linalg::prod(htmp, H);
+
+          viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<ScalarType>::program_name(), NMF_MUL_DIV_KERNEL);
+          viennacl::ocl::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2())));
+        }
+        {
+          wn   = viennacl::linalg::prod(V, trans(H));
+          wtmp = viennacl::linalg::prod(W, H);
+          wd   = viennacl::linalg::prod(wtmp, trans(H));
+
+          viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<ScalarType>::program_name(), NMF_MUL_DIV_KERNEL);
+
+          viennacl::ocl::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2())));
+        }
+
+        if (i % conf.check_after_steps() == 0)  //check for convergence
+        {
+          appr = viennacl::linalg::prod(W, H);
+
+          appr -= V;
+          ScalarType diff_val = viennacl::linalg::norm_frobenius(appr);
+
+          if (i == 0)
+            diff_init = diff_val;
+
+          if (conf.print_relative_error())
+            std::cout << diff_val / diff_init << std::endl;
+
+          // Approximation check
+          if (diff_val / diff_init < conf.tolerance())
+            break;
+
+          // Stagnation check
+          if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+          {
+            if (stagnation_flag)       // iteration stagnates (two iterates with no notable progress)
+              break;
+            else                       // record stagnation in this iteration
+              stagnation_flag = true;
+          }
+          else                         // good progress in this iteration, so unset stagnation flag
+            stagnation_flag = false;
+
+          // prepare for next iterate:
+          last_diff = diff_val;
+        }
+      }
+
+
+    }
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/norm_1.hpp b/viennacl/linalg/norm_1.hpp
index 2428a99..42c6e02 100644
--- a/viennacl/linalg/norm_1.hpp
+++ b/viennacl/linalg/norm_1.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_NORM_1_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -21,7 +22,7 @@
     @brief Generic interface for the l^1-norm. See viennacl/linalg/vector_operations.hpp for implementations.
 */
 
-#include <math.h>    //for sqrt()
+#include <cmath>
 #include "viennacl/forwards.h"
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/meta/enable_if.hpp"
@@ -32,60 +33,75 @@ namespace viennacl
   //
   // generic norm_1 function
   //   uses tag dispatch to identify which algorithm
-  //   should be called 
+  //   should be called
   //
-  namespace linalg 
+  namespace linalg
   {
-    
-    #ifdef VIENNACL_HAVE_UBLAS
+
+    #ifdef VIENNACL_WITH_UBLAS
     // ----------------------------------------------------
     // UBLAS
     //
     template< typename VectorT >
     typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
-                                  typename VectorT::value_type      
-                                >::type    
+                                  typename VectorT::value_type
+                                >::type
     norm_1(VectorT const& vector)
     {
       // std::cout << "ublas .. " << std::endl;
       return boost::numeric::ublas::norm_1(vector);
     }
     #endif
-    
-    
+
+
     // ----------------------------------------------------
     // STL
     //
-    template< typename VectorT>
-    typename VectorT::value_type
-    norm_1(VectorT const& v1,
-         typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                     >::type* dummy = 0)
+    template< typename T, typename A >
+    T norm_1(std::vector<T, A> const & v1)
     {
       //std::cout << "stl .. " << std::endl;
-      typename VectorT::value_type result = 0;
-      for (typename VectorT::size_type i=0; i<v1.size(); ++i)
-        result += fabs(v1[i]);
-      
+      T result = 0;
+      for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+        result += std::fabs(v1[i]);
+
       return result;
     }
-    
+
     // ----------------------------------------------------
     // VIENNACL
     //
-    template< typename ScalarType, unsigned int alignment >
-    viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment>, 
-                                 const viennacl::vector<ScalarType, alignment>,
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
                                  viennacl::op_norm_1 >
-    norm_1(viennacl::vector<ScalarType, alignment> const & vector, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< viennacl::vector<ScalarType, alignment> >::type >::value
-                                     >::type* dummy = 0)
+    norm_1(viennacl::vector_base<ScalarType> const & vector)
     {
-      return viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment>, 
-                                          const viennacl::vector<ScalarType, alignment>,
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
                                           viennacl::op_norm_1 >(vector, vector);
     }
 
+    // with vector expression:
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_norm_1>
+    norm_1(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_norm_1 >(vector, vector);
+    }
+
+    // with matrix
+    /*template<typename NumericT, typename F>
+    scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_1>
+    norm_1(const matrix<NumericT, F> & A)
+    {
+      return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_1>(A, A);
+    }*/
+
   } // end namespace linalg
 } // end namespace viennacl
 #endif
diff --git a/viennacl/linalg/norm_2.hpp b/viennacl/linalg/norm_2.hpp
index 3c38c28..e716ce3 100644
--- a/viennacl/linalg/norm_2.hpp
+++ b/viennacl/linalg/norm_2.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_NORM_2_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -21,7 +22,7 @@
     @brief Generic interface for the l^2-norm. See viennacl/linalg/vector_operations.hpp for implementations.
 */
 
-#include <math.h>    //for sqrt()
+#include <cmath>
 #include "viennacl/forwards.h"
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/meta/enable_if.hpp"
@@ -32,127 +33,93 @@ namespace viennacl
   //
   // generic norm_2 function
   //   uses tag dispatch to identify which algorithm
-  //   should be called 
+  //   should be called
   //
-  namespace linalg 
+  namespace linalg
   {
-    #ifdef VIENNACL_HAVE_MTL4
+    #ifdef VIENNACL_WITH_MTL4
     // ----------------------------------------------------
     // MTL4
     //
-      #if defined(_MSC_VER) && _MSC_VER < 1500        //Visual Studio 2005 needs special treatment
-      template <typename ScalarType>
-      ScalarType norm_2(mtl::dense_vector<ScalarType> const & v)
-      {
-        // std::cout << "mtl4 .. " << std::endl;
-        return mtl::two_norm(v);
-      }
-      
-      #else
-      template< typename VectorT >
-      typename VectorT::value_type
-      norm_2(VectorT const& v, 
-          typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                      >::type* dummy = 0)
-      {
-        // std::cout << "mtl4 .. " << std::endl;
-        return mtl::two_norm(v);
-      }
-      #endif
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type>::type
+    norm_2(VectorT const & v)
+    {
+      return mtl::two_norm(v);
+    }
     #endif
-    
-    
-    #ifdef VIENNACL_HAVE_EIGEN
+
+
+    #ifdef VIENNACL_WITH_EIGEN
     // ----------------------------------------------------
     // EIGEN
     //
-      #if defined(_MSC_VER) && _MSC_VER < 1500        //Visual Studio 2005 needs special treatment
-      float norm_2(Eigen::VectorXf const & v)
-      {
-        // std::cout << "eigen .. " << std::endl;
-        return v.norm();
-      }
-      
-      double norm_2(Eigen::VectorXd const & v)
-      {
-        // std::cout << "eigen .. " << std::endl;
-        return v.norm();
-      }
-      
-      #else
-      template< typename VectorT >
-      typename VectorT::RealScalar
-      norm_2(VectorT const& v, 
-          typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                      >::type* dummy = 0)
-      {
-        // std::cout << "ublas .. " << std::endl;
-        return v.norm();
-      }
-      #endif
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::RealScalar>::type
+    norm_2(VectorT const & v)
+    {
+      return v.norm();
+    }
     #endif
-    
-    
-    #ifdef VIENNACL_HAVE_UBLAS
+
+
+    #ifdef VIENNACL_WITH_UBLAS
     // ----------------------------------------------------
     // UBLAS
     //
-      #if defined(_MSC_VER) && _MSC_VER < 1500        //Visual Studio 2005 needs special treatment
-      template< typename ScalarType >
-      ScalarType
-      norm_2(boost::numeric::ublas::vector<ScalarType> const & v)
-      {
-        // std::cout << "ublas .. " << std::endl;
-        return boost::numeric::ublas::norm_2(v);
-      }
-      #else
-      template< typename VectorT >
-      typename VectorT::value_type
-      norm_2(VectorT const& v, 
-          typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                      >::type* dummy = 0)
-      {
-        // std::cout << "ublas .. " << std::endl;
-        return boost::numeric::ublas::norm_2(v);
-      }
-      #endif
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type>::type
+    norm_2(VectorT const & v)
+    {
+      return boost::numeric::ublas::norm_2(v);
+    }
     #endif
-    
-    
+
+
     // ----------------------------------------------------
     // STL
     //
-    template< typename VectorT>
-    typename VectorT::value_type
-    norm_2(VectorT const& v1,
-         typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                     >::type* dummy = 0)
+    template< typename T, typename A >
+    T norm_2(std::vector<T, A> const & v1)
     {
-      //std::cout << "stl .. " << std::endl;
-      typename VectorT::value_type result = 0;
-      for (typename VectorT::size_type i=0; i<v1.size(); ++i)
+      T result = 0;
+      for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
         result += v1[i] * v1[i];
-      
-      return sqrt(result);
+
+      return std::sqrt(result);
     }
-    
+
     // ----------------------------------------------------
     // VIENNACL
     //
-    template< typename ScalarType, unsigned int alignment >
-    viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment>, 
-                                 const viennacl::vector<ScalarType, alignment>,
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
                                  viennacl::op_norm_2 >
-    norm_2(viennacl::vector<ScalarType, alignment> const & v, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< viennacl::vector<ScalarType, alignment> >::type >::value
-                                     >::type* dummy = 0)
+    norm_2(viennacl::vector_base<ScalarType> const & v)
     {
        //std::cout << "viennacl .. " << std::endl;
-      return viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment>, 
-                                          const viennacl::vector<ScalarType, alignment>,
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
                                           viennacl::op_norm_2 >(v, v);
     }
 
+    // with vector expression:
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_norm_2>
+    norm_2(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_norm_2>(vector, vector);
+    }
+
+
   } // end namespace linalg
 } // end namespace viennacl
 #endif
diff --git a/viennacl/linalg/norm_frobenius.hpp b/viennacl/linalg/norm_frobenius.hpp
new file mode 100644
index 0000000..310fb10
--- /dev/null
+++ b/viennacl/linalg/norm_frobenius.hpp
@@ -0,0 +1,73 @@
+#ifndef VIENNACL_LINALG_NORM_FROBENIUS_HPP_
+#define VIENNACL_LINALG_NORM_FROBENIUS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/norm_frobenius.hpp
+    @brief Generic interface for the Frobenius norm.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+  //
+  // generic norm_frobenius function
+  //   uses tag dispatch to identify which algorithm
+  //   should be called
+  //
+  namespace linalg
+  {
+
+    #ifdef VIENNACL_WITH_UBLAS
+    // ----------------------------------------------------
+    // UBLAS
+    //
+    template< typename VectorT >
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type
+                                >::type
+    norm_frobenius(VectorT const& v1)
+    {
+      return boost::numeric::ublas::norm_frobenius(v1);
+    }
+    #endif
+
+
+    // ----------------------------------------------------
+    // VIENNACL
+    //
+    template<typename NumericT, typename F>
+    scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_frobenius>
+    norm_frobenius(const matrix<NumericT, F> & A)
+    {
+      return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_frobenius>(A, A);
+    }
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
diff --git a/viennacl/linalg/norm_inf.hpp b/viennacl/linalg/norm_inf.hpp
index 8ddcd20..b8d15eb 100644
--- a/viennacl/linalg/norm_inf.hpp
+++ b/viennacl/linalg/norm_inf.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_LINALG_NORM_INF_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -21,7 +22,7 @@
     @brief Generic interface for the l^infty-norm. See viennacl/linalg/vector_operations.hpp for implementations.
 */
 
-#include <math.h>    //for sqrt()
+#include <cmath>
 #include "viennacl/forwards.h"
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/meta/enable_if.hpp"
@@ -32,64 +33,80 @@ namespace viennacl
   //
   // generic norm_inf function
   //   uses tag dispatch to identify which algorithm
-  //   should be called 
+  //   should be called
   //
-  namespace linalg 
+  namespace linalg
   {
-    
-    #ifdef VIENNACL_HAVE_UBLAS
+
+    #ifdef VIENNACL_WITH_UBLAS
     // ----------------------------------------------------
     // UBLAS
     //
     template< typename VectorT >
     typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
-                                  typename VectorT::value_type      
-                                >::type    
+                                  typename VectorT::value_type
+                                >::type
     norm_inf(VectorT const& v1)
     {
-      // std::cout << "ublas .. " << std::endl;
       return boost::numeric::ublas::norm_inf(v1);
     }
     #endif
-    
-    
+
+
     // ----------------------------------------------------
     // STL
     //
-    template< typename VectorT>
-    typename VectorT::value_type
-    norm_inf(VectorT const& v1,
-         typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                            >::type* dummy = 0)
+    template< typename T, typename A >
+    T norm_inf(std::vector<T, A> const & v1)
     {
       //std::cout << "stl .. " << std::endl;
-      typename VectorT::value_type result = 0;
-      for (typename VectorT::size_type i=0; i<v1.size(); ++i)
+      T result = 0;
+      for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
       {
-        if (fabs(v1[i]) > result)
-          result = fabs(v1[i]);
+        if (std::fabs(v1[i]) > result)
+          result = std::fabs(v1[i]);
       }
-      
+
       return result;
     }
-    
+
     // ----------------------------------------------------
     // VIENNACL
     //
-    template< typename ScalarType, unsigned int alignment >
-    viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment>, 
-                                 const viennacl::vector<ScalarType, alignment>,
+    template< typename ScalarType>
+    viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                 const viennacl::vector_base<ScalarType>,
                                  viennacl::op_norm_inf >
-    norm_inf(viennacl::vector<ScalarType, alignment> const & v1, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< viennacl::vector<ScalarType, alignment> >::type >::value
-                                            >::type* dummy = 0)
+    norm_inf(viennacl::vector_base<ScalarType> const & v1)
     {
        //std::cout << "viennacl .. " << std::endl;
-      return viennacl::scalar_expression< const viennacl::vector<ScalarType, alignment>, 
-                                          const viennacl::vector<ScalarType, alignment>,
+      return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+                                          const viennacl::vector_base<ScalarType>,
                                           viennacl::op_norm_inf >(v1, v1);
     }
 
+    // with vector expression:
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                viennacl::op_norm_inf>
+    norm_inf(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+    {
+      return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          const viennacl::vector_expression<const LHS, const RHS, OP>,
+                                          viennacl::op_norm_inf >(vector, vector);
+    }
+
+    // with matrix:
+    /*
+    template<typename NumericT, typename F>
+    scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_inf>
+    norm_inf(const matrix<NumericT, F> & A)
+    {
+      return scalar_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_norm_inf>(A, A);
+    }*/
+
+
   } // end namespace linalg
 } // end namespace viennacl
 #endif
diff --git a/viennacl/linalg/opencl/common.hpp b/viennacl/linalg/opencl/common.hpp
new file mode 100644
index 0000000..2228eb2
--- /dev/null
+++ b/viennacl/linalg/opencl/common.hpp
@@ -0,0 +1,95 @@
+#ifndef VIENNACL_LINALG_OPENCL_COMMON_HPP_
+#define VIENNACL_LINALG_OPENCL_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/common.hpp
+    @brief Common implementations shared by OpenCL-based operations
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/platform.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+
+      namespace detail
+      {
+        inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+        {
+          return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+        }
+
+
+        /** @brief Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices. */
+        inline std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
+        {
+          if (B_transposed)
+          {
+            if (B_row_major && C_row_major)
+              return "trans_mat_mult_row_row";
+            if (B_row_major && !C_row_major)
+              return "trans_mat_mult_row_col";
+            if (!B_row_major && C_row_major)
+              return "trans_mat_mult_col_row";
+
+            return "trans_mat_mult_col_col";
+          }
+
+          if (B_row_major && C_row_major)
+            return "mat_mult_row_row";
+          if (B_row_major && !C_row_major)
+            return "mat_mult_row_col";
+          if (!B_row_major && C_row_major)
+            return "mat_mult_col_row";
+
+          return "mat_mult_col_col";
+        }
+
+
+        inline std::string op_to_string(op_abs)   { return "abs";   }
+        inline std::string op_to_string(op_acos)  { return "acos";  }
+        inline std::string op_to_string(op_asin)  { return "asin";  }
+        inline std::string op_to_string(op_atan)  { return "atan";  }
+        inline std::string op_to_string(op_ceil)  { return "ceil";  }
+        inline std::string op_to_string(op_cos)   { return "cos";   }
+        inline std::string op_to_string(op_cosh)  { return "cosh";  }
+        inline std::string op_to_string(op_exp)   { return "exp";   }
+        inline std::string op_to_string(op_fabs)  { return "fabs";  }
+        inline std::string op_to_string(op_floor) { return "floor"; }
+        inline std::string op_to_string(op_log)   { return "log";   }
+        inline std::string op_to_string(op_log10) { return "log10"; }
+        inline std::string op_to_string(op_sin)   { return "sin";   }
+        inline std::string op_to_string(op_sinh)  { return "sinh";  }
+        inline std::string op_to_string(op_sqrt)  { return "sqrt";  }
+        inline std::string op_to_string(op_tan)   { return "tan";   }
+        inline std::string op_to_string(op_tanh)  { return "tanh";  }
+      }
+
+    } //namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/opencl/direct_solve.hpp b/viennacl/linalg/opencl/direct_solve.hpp
new file mode 100644
index 0000000..82de036
--- /dev/null
+++ b/viennacl/linalg/opencl/direct_solve.hpp
@@ -0,0 +1,232 @@
+#ifndef VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/direct_solve.hpp
+    @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_solve.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace detail
+      {
+        inline cl_uint get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
+        inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
+        inline cl_uint get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
+        inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
+
+        template <typename M1, typename M2, typename KernelType>
+        void inplace_solve_impl(M1 const & A, M2 & B, KernelType & k)
+        {
+          viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                                   cl_uint(viennacl::traits::start1(A)),         cl_uint(viennacl::traits::start2(A)),
+                                   cl_uint(viennacl::traits::stride1(A)),        cl_uint(viennacl::traits::stride2(A)),
+                                   cl_uint(viennacl::traits::size1(A)),          cl_uint(viennacl::traits::size2(A)),
+                                   cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+                                   viennacl::traits::opencl_handle(B),
+                                   cl_uint(viennacl::traits::start1(B)),         cl_uint(viennacl::traits::start2(B)),
+                                   cl_uint(viennacl::traits::stride1(B)),        cl_uint(viennacl::traits::stride2(B)),
+                                   cl_uint(viennacl::traits::size1(B)),          cl_uint(viennacl::traits::size2(B)),
+                                   cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B))
+                                  )
+                                );
+        }
+      }
+
+
+      //
+      // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+      //
+
+      ////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+      /** @brief Direct inplace solver for dense triangular systems. Matlab notation: A \ B
+      *
+      * @param A    The system matrix
+      * @param B    The matrix of row vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F1> & A, matrix_base<NumericT, F2> & B, SOLVERTAG)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+        typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, F1, F2>    KernelClass;
+        KernelClass::init(ctx);
+
+        std::stringstream ss;
+        ss << SOLVERTAG::name() << "_solve";
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), ss.str());
+
+        k.global_work_size(0, B.size2() * k.local_work_size());
+        detail::inplace_solve_impl(A, B, k);
+      }
+
+      /** @brief Direct inplace solver for dense triangular systems with transposed right hand side
+      *
+      * @param A       The system matrix
+      * @param proxy_B The transposed matrix of row vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F1> & A,
+                         matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B,
+                         SOLVERTAG)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+        typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, F1, F2>    KernelClass;
+        KernelClass::init(ctx);
+
+        std::stringstream ss;
+        ss << SOLVERTAG::name() << "_trans_solve";
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), ss.str());
+
+        k.global_work_size(0, proxy_B.lhs().size1() * k.local_work_size());
+        detail::inplace_solve_impl(A, proxy_B.lhs(), k);
+      }
+
+      //upper triangular solver for transposed lower triangular matrices
+      /** @brief Direct inplace solver for dense triangular systems that stem from transposed triangular systems
+      *
+      * @param proxy_A  The system matrix proxy
+      * @param B        The matrix holding the load vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                         matrix_base<NumericT, F2> & B,
+                         SOLVERTAG)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(B).context());
+
+        typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, F1, F2>    KernelClass;
+        KernelClass::init(ctx);
+
+        std::stringstream ss;
+        ss << "trans_" << SOLVERTAG::name() << "_solve";
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), ss.str());
+
+        k.global_work_size(0, B.size2() * k.local_work_size());
+        detail::inplace_solve_impl(proxy_A.lhs(), B, k);
+      }
+
+      /** @brief Direct inplace solver for dense transposed triangular systems with transposed right hand side. Matlab notation: A' \ B'
+      *
+      * @param proxy_A  The system matrix proxy
+      * @param proxy_B  The matrix holding the load vectors, where the solution is directly written to
+      */
+      template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
+                               matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans>   proxy_B,
+                         SOLVERTAG)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_A.lhs()).context());
+
+        typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, F1, F2>    KernelClass;
+        KernelClass::init(ctx);
+
+        std::stringstream ss;
+        ss << "trans_" << SOLVERTAG::name() << "_trans_solve";
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), ss.str());
+
+        k.global_work_size(0, proxy_B.lhs().size1() * k.local_work_size());
+        detail::inplace_solve_impl(proxy_A.lhs(), proxy_B.lhs(), k);
+      }
+
+
+
+      //
+      //  Solve on vector
+      //
+
+      template <typename NumericT, typename F, typename SOLVERTAG>
+      void inplace_solve(const matrix_base<NumericT, F> & mat,
+                               vector_base<NumericT> & vec,
+                         SOLVERTAG)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options = detail::get_option_for_solver_tag(SOLVERTAG());
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "triangular_substitute_inplace");
+
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                                 cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)),
+                                 cl_uint(viennacl::traits::stride1(mat)),        cl_uint(viennacl::traits::stride2(mat)),
+                                 cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
+                                 cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(viennacl::traits::start(vec)),
+                                 cl_uint(viennacl::traits::stride(vec)),
+                                 cl_uint(viennacl::traits::size(vec)),
+                                 options
+                                )
+                              );
+      }
+
+      /** @brief Direct inplace solver for dense upper triangular systems that stem from transposed lower triangular systems
+      *
+      * @param proxy    The system matrix proxy
+      * @param vec    The load vector, where the solution is directly written to
+      */
+      template <typename NumericT, typename F, typename SOLVERTAG>
+      void inplace_solve(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & proxy,
+                         vector_base<NumericT> & vec,
+                         SOLVERTAG)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options = detail::get_option_for_solver_tag(SOLVERTAG()) | 0x02;  //add transpose-flag
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "triangular_substitute_inplace");
+
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(proxy.lhs()),
+                                 cl_uint(viennacl::traits::start1(proxy.lhs())),         cl_uint(viennacl::traits::start2(proxy.lhs())),
+                                 cl_uint(viennacl::traits::stride1(proxy.lhs())),        cl_uint(viennacl::traits::stride2(proxy.lhs())),
+                                 cl_uint(viennacl::traits::size1(proxy.lhs())),          cl_uint(viennacl::traits::size2(proxy.lhs())),
+                                 cl_uint(viennacl::traits::internal_size1(proxy.lhs())), cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(viennacl::traits::start(vec)),
+                                 cl_uint(viennacl::traits::stride(vec)),
+                                 cl_uint(viennacl::traits::size(vec)),
+                                 options
+                                )
+                              );
+      }
+
+
+    }
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp b/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
new file mode 100644
index 0000000..2432ea7
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
@@ -0,0 +1,89 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_COMPRESSED_MATRIX_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
+ *  @brief OpenCL kernel file for vector operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        template <typename StringType>
+        void generate_vec_mul(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vec_mul( \n");
+          source.append("          __global const unsigned int * row_jumper, \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          uint nonzero_rows, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("          uint4 layout_x, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          uint4 layout_result) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < nonzero_rows; i += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+          source.append("    unsigned int row_end = row_jumper[i+1]; \n");
+          source.append("    for (unsigned int j = row_jumper[i]; j < row_end; ++j) \n");
+          source.append("      dot_prod += elements[j] * x[column_indices[j] * layout_x.y + layout_x.x]; \n");
+          source.append("    result[row_indices[i] * layout_result.y + layout_result.x] = dot_prod; \n");
+          source.append("  } \n");
+          source.append(" } \n");
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        /** @brief Main kernel class for generating OpenCL kernels for compressed_compressed_matrix. */
+        template <typename NumericT>
+        struct compressed_compressed_matrix
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_compressed_matrix";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // fully parametrized kernels:
+              generate_vec_mul(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/compressed_matrix.hpp b/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
new file mode 100644
index 0000000..d861978
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
@@ -0,0 +1,1096 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_matrix.hpp
+ *  @brief OpenCL kernel file for compressed_matrix operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        template <typename StringType>
+        void generate_compressed_matrix_block_trans_lu_backward(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void block_trans_lu_backward( \n");
+          source.append("          __global const unsigned int * row_jumper_U,  \n");     //U part (note that U is transposed in memory)
+          source.append("          __global const unsigned int * column_indices_U, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
+          source.append("          __global const unsigned int * block_offsets, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+          source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
+          source.append("  unsigned int row_start; \n");
+          source.append("  unsigned int row_stop; \n");
+          source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+          source.append("  if (col_start >= col_stop) \n");
+          source.append("    return; \n");
+
+            //backward elimination, using U and diagonal_U
+          source.append("  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
+          source.append("  { \n");
+          source.append("    unsigned int col = (col_stop - iter) - 1; \n");
+          source.append("    result_entry = result[col] / diagonal_U[col]; \n");
+          source.append("    row_start = row_jumper_U[col]; \n");
+          source.append("    row_stop  = row_jumper_U[col + 1]; \n");
+          source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+          source.append("      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
+          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+          source.append("  } \n");
+
+            //divide result vector by diagonal:
+          source.append("  for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
+          source.append("    result[col] /= diagonal_U[col]; \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_block_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void block_trans_unit_lu_forward( \n");
+          source.append("          __global const unsigned int * row_jumper_L,  \n");     //L part (note that L is transposed in memory)
+          source.append("          __global const unsigned int * column_indices_L, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
+          source.append("          __global const unsigned int * block_offsets, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+          source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
+          source.append("  unsigned int row_start = row_jumper_L[col_start]; \n");
+          source.append("  unsigned int row_stop; \n");
+          source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+          source.append("  if (col_start >= col_stop) \n");
+          source.append("    return; \n");
+
+            //forward elimination, using L:
+          source.append("  for (unsigned int col = col_start; col < col_stop; ++col) \n");
+          source.append("  { \n");
+          source.append("    result_entry = result[col]; \n");
+          source.append("    row_stop = row_jumper_L[col + 1]; \n");
+          source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+          source.append("      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
+          source.append("    row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
+          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+          source.append("  } \n");
+
+          source.append("}; \n");
+        }
+
+        namespace detail
+        {
+          /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
+          template <typename StringType>
+          void generate_compressed_matrix_dense_matrix_mult(StringType & source, std::string const & numeric_string,
+                                                            bool B_transposed, bool B_row_major, bool C_row_major)
+          {
+            source.append("__kernel void ");
+            source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+            source.append("( \n");
+            source.append("          __global const unsigned int * sp_mat_row_indices, \n");
+            source.append("          __global const unsigned int * sp_mat_col_indices, \n");
+            source.append("          __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
+            source.append("          __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+            source.append("          unsigned int d_mat_row_start, \n");
+            source.append("          unsigned int d_mat_col_start, \n");
+            source.append("          unsigned int d_mat_row_inc, \n");
+            source.append("          unsigned int d_mat_col_inc, \n");
+            source.append("          unsigned int d_mat_row_size, \n");
+            source.append("          unsigned int d_mat_col_size, \n");
+            source.append("          unsigned int d_mat_internal_rows, \n");
+            source.append("          unsigned int d_mat_internal_cols, \n");
+            source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+            source.append("          unsigned int result_row_start, \n");
+            source.append("          unsigned int result_col_start, \n");
+            source.append("          unsigned int result_row_inc, \n");
+            source.append("          unsigned int result_col_inc, \n");
+            source.append("          unsigned int result_row_size, \n");
+            source.append("          unsigned int result_col_size, \n");
+            source.append("          unsigned int result_internal_rows, \n");
+            source.append("          unsigned int result_internal_cols) { \n");
+
+              // split work rows (sparse matrix rows) to thread groups
+            source.append("  for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
+
+            source.append("    unsigned int row_start = sp_mat_row_indices[row]; \n");
+            source.append("    unsigned int row_end = sp_mat_row_indices[row+1]; \n");
+
+                // split result cols between threads in a thread group
+            source.append("    for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
+
+            source.append("      "); source.append(numeric_string); source.append(" r = 0; \n");
+
+            source.append("      for (unsigned int k = row_start; k < row_end; k ++) { \n");
+
+            source.append("        unsigned int j = sp_mat_col_indices[k]; \n");
+            source.append("        "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");
+
+            source.append("        "); source.append(numeric_string);
+            if (B_transposed && B_row_major)
+              source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +   j * d_mat_col_inc ]; \n");
+            else if (B_transposed && !B_row_major)
+              source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start +  j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+            else if (!B_transposed && B_row_major)
+              source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+            else
+              source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+            source.append("        r += x * y; \n");
+            source.append("      } \n");
+
+            if (C_row_major)
+              source.append("      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+            else
+              source.append("      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+            source.append("    } \n");
+            source.append("  } \n");
+
+            source.append("} \n");
+
+          }
+        }
+        template <typename StringType>
+        void generate_compressed_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string)
+        {
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false,  true);
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true, false);
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true,  true);
+
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false,  true);
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true, false);
+          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true,  true);
+        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_jacobi(StringType & source, std::string const & numeric_string)
+        {
+
+         source.append(" __kernel void jacobi( \n");
+         source.append("  __global const unsigned int * row_indices, \n");
+         source.append("  __global const unsigned int * column_indices, \n");
+         source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+         source.append("  "); source.append(numeric_string); source.append(" weight, \n");
+         source.append("  __global const "); source.append(numeric_string); source.append(" * old_result, \n");
+         source.append("  __global "); source.append(numeric_string); source.append(" * new_result, \n");
+         source.append("  __global const "); source.append(numeric_string); source.append(" * rhs, \n");
+         source.append("  unsigned int size) \n");
+         source.append("  { \n");
+         source.append("   "); source.append(numeric_string); source.append(" sum, diag=1; \n");
+         source.append("   int col; \n");
+         source.append("   for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+         source.append("   { \n");
+         source.append("     sum = 0; \n");
+         source.append("     for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
+         source.append("     { \n");
+         source.append("       col = column_indices[j]; \n");
+         source.append("       if (i == col) \n");
+         source.append("   diag = elements[j]; \n");
+         source.append("       else \n");
+         source.append("   sum += elements[j] * old_result[col]; \n");
+         source.append("     } \n");
+         source.append("       new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
+         source.append("    } \n");
+         source.append("  } \n");
+
+        }
+
        template <typename StringType>
        void generate_compressed_matrix_lu_backward(StringType & source, std::string const & numeric_string)
        {
          // Appends the OpenCL kernel 'lu_backward' to 'source': in-place backward
          // substitution (solve Ux = y, result overwrites 'vector') for a sparse matrix
          // in CSR format (row_indices / column_indices / elements).
          // 'numeric_string' is the scalar type name (e.g. "float") spliced into the text.
          //
          // Kernel strategy: the work-group streams the nonzeros from back to front in
          // chunks of get_local_size(0); all threads stage one chunk into __local memory,
          // then thread 0 alone performs the substitution on the staged data.
          //
          // NOTE(review): barrier() synchronizes only within one work-group, so the
          // kernel presumably relies on being launched with a single work-group --
          // confirm at the enqueue site.

          // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void lu_backward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
          // __local staging buffers; indexing by get_local_id(0) requires local size <= 128.
          source.append("  __local unsigned int col_index_buffer[128]; \n");
          source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int current_row = size-1; \n");
          source.append("  unsigned int row_at_window_start = size-1; \n");
          source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
          source.append("  "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
          // Start at the last full chunk boundary and walk chunks toward the front.
          source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
          source.append("  unsigned int next_row = row_indices[size-1]; \n");

          source.append("  unsigned int i = loop_end + get_local_id(0); \n");
          source.append("  while (1) \n");
          source.append("  { \n");
              //load into shared memory (coalesced access):
          source.append("    if (i < nnz) \n");
          source.append("    { \n");
          source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append("      unsigned int tmp = column_indices[i]; \n");
          source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

              //now a single thread does the remaining work in shared memory:
          source.append("    if (get_local_id(0) == 0) \n");
          source.append("    { \n");
                // traverse through all the loaded data from back to front:
          source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
          source.append("      { \n");
          source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");

          source.append("        if (i+k >= nnz) \n");
          source.append("          continue; \n");

          source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //row solved in an earlier window: staged copy in vector_buffer is still valid
          source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append("        else if (col_index_buffer[k] > current_row) \n"); //row solved within this window: re-read the freshly written global value
          source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
          source.append("        else if (col_index_buffer[k] == current_row) \n");
          source.append("          diagonal_entry = element_buffer[k]; \n");

          source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
          source.append("        { \n");
          source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
          source.append("          if (current_row > 0) //load next row's data \n");
          source.append("          { \n");
          source.append("            --current_row; \n");
          source.append("            next_row = row_indices[current_row]; \n");
          source.append("            current_vector_entry = vector[current_row]; \n");
          source.append("          } \n");
          source.append("        } \n");


          source.append("      } \n"); // for k

          source.append("      row_at_window_start = current_row; \n");
          source.append("    } \n"); // if (get_local_id(0) == 0)

          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");

          // Terminate once the front-most chunk (i < local size) has been processed.
          source.append("    if (i < get_local_size(0)) \n");
          source.append("      break; \n");

          source.append("    i -= get_local_size(0); \n");
          source.append("  } \n"); //for i
          source.append("} \n");

        }
+
        template <typename StringType>
        void generate_compressed_matrix_lu_forward(StringType & source, std::string const & numeric_string)
        {
          // Appends the OpenCL kernel 'lu_forward' to 'source': in-place forward
          // substitution (solve Ly = z, result overwrites 'vector') for a CSR matrix.
          // 'numeric_string' is the scalar type name spliced into the kernel text.
          //
          // Same staging scheme as lu_backward, but chunks are walked front to back:
          // all threads stage one chunk of nonzeros into __local memory, then thread 0
          // performs the substitution.
          //
          // NOTE(review): barrier() synchronizes only within one work-group -- this
          // kernel presumably expects a single-work-group launch; confirm at call site.
          // NOTE(review): 'diagonal_entry' is emitted uninitialized here (lu_backward
          // initializes it to 0); it is assigned before first use only if every row has
          // a diagonal entry -- verify that precondition holds for the ILU factors.

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void lu_forward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
          // __local staging buffers; indexing by get_local_id(0) requires local size <= 128.
          source.append("  __local unsigned int col_index_buffer[128]; \n");
          source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int current_row = 0; \n");
          source.append("  unsigned int row_at_window_start = 0; \n");
          source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
          source.append("  "); source.append(numeric_string); source.append(" diagonal_entry; \n");
          source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
          source.append("  unsigned int next_row = row_indices[1]; \n");

          source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append("  { \n");
              //load into shared memory (coalesced access):
          source.append("    if (i < nnz) \n");
          source.append("    { \n");
          source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append("      unsigned int tmp = column_indices[i]; \n");
          source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

              //now a single thread does the remaining work in shared memory:
          source.append("    if (get_local_id(0) == 0) \n");
          source.append("    { \n");
                // traverse through all the loaded data:
          source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
          source.append("      { \n");
          source.append("        if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
          source.append("        { \n");
          source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
          source.append("          ++current_row; \n");
          source.append("          if (current_row < size) \n"); //load next row's data
          source.append("          { \n");
          source.append("            next_row = row_indices[current_row+1]; \n");
          source.append("            current_vector_entry = vector[current_row]; \n");
          source.append("          } \n");
          source.append("        } \n");

          source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
          source.append("        { \n");
          source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //row solved before this window: staged copy in vector_buffer is still valid
          source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append("          else if (col_index_buffer[k] < current_row) \n"); //row solved within this window: re-read the freshly written global value
          source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
          source.append("        } \n");
          source.append("        else if (col_index_buffer[k] == current_row) \n");
          source.append("          diagonal_entry = element_buffer[k]; \n");

          source.append("      } \n"); // for k

          source.append("      row_at_window_start = current_row; \n");
          source.append("    } \n"); // if (get_local_id(0) == 0)

          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append("  } \n"); //for i
          source.append("} \n");

        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_row_info_extractor(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void row_info_extractor( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int size, \n");
+          source.append("          unsigned int option \n");
+          source.append("          ) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" value = 0; \n");
+          source.append("    unsigned int row_end = row_indices[row+1]; \n");
+
+          source.append("    switch (option) \n");
+          source.append("    { \n");
+          source.append("      case 0: \n"); //inf-norm
+          source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+          source.append("          value = max(value, fabs(elements[i])); \n");
+          source.append("        break; \n");
+
+          source.append("      case 1: \n"); //1-norm
+          source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+          source.append("          value += fabs(elements[i]); \n");
+          source.append("        break; \n");
+
+          source.append("      case 2: \n"); //2-norm
+          source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+          source.append("          value += elements[i] * elements[i]; \n");
+          source.append("        value = sqrt(value); \n");
+          source.append("        break; \n");
+
+          source.append("      case 3: \n"); //diagonal entry
+          source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+          source.append("        { \n");
+          source.append("          if (column_indices[i] == row) \n");
+          source.append("          { \n");
+          source.append("            value = elements[i]; \n");
+          source.append("            break; \n");
+          source.append("          } \n");
+          source.append("        } \n");
+          source.append("        break; \n");
+
+          source.append("      default: \n");
+          source.append("        break; \n");
+          source.append("    } \n");
+          source.append("    result[row] = value; \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+        }
+
        template <typename StringType>
        void generate_compressed_matrix_trans_lu_backward(StringType & source, std::string const & numeric_string)
        {
          // Appends the OpenCL kernel 'trans_lu_backward' to 'source': in-place
          // backward substitution with a transposed factor for a CSR matrix, using an
          // explicit array of diagonal entries. The solve is scatter-based: whenever a
          // row finishes, each of its off-diagonal entries updates vector[col_index].
          // A final pass divides the whole vector by the diagonal entries.
          //
          // NOTE(review): barrier() synchronizes only within one work-group -- the
          // kernel presumably expects a single-work-group launch; confirm at call site.

          // backward substitution with the transposed factor: solve in-place on 'vector',
          // diagonal taken from 'diagonal_entries' (CSR storage as above)
          source.append("__kernel void trans_lu_backward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
          // __local scratch; indexing by get_local_id(0) requires local size <= 256.
          source.append("  __local unsigned int row_index_lookahead[256]; \n");
          source.append("  __local unsigned int row_index_buffer[256]; \n");

          source.append("  unsigned int row_index; \n");
          source.append("  unsigned int col_index; \n");
          source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int row_at_window_start = size; \n");
          source.append("  unsigned int row_at_window_end; \n");
          source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          // i2 counts chunks front to back; i = nnz-1-i2 streams the nonzeros backwards.
          source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
          source.append("  { \n");
          source.append("    unsigned int i = (nnz - i2) - 1; \n");
          source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
          source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
          source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          // Each thread finds the row its nonzero belongs to by scanning the lookahead
          // of row start offsets downward from row_at_window_start.
          source.append("    if (i2 < nnz) \n");
          source.append("    { \n");
          source.append("      unsigned int row_index_dec = 0; \n");
          source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
          source.append("        ++row_index_dec; \n");
          source.append("      row_index = row_at_window_start - row_index_dec; \n");
          source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append("    } \n");
          source.append("    else \n");
          source.append("    { \n");
          source.append("      row_index = size+1; \n"); // sentinel: padding lanes never match any row
          source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append("    row_at_window_start = row_index_buffer[0]; \n");
          source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

              //backward elimination
          source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
          source.append("    { \n");
          source.append("      unsigned int row = row_at_window_start - row2; \n");
          source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

          // Only threads holding an entry of the current row scatter; distinct threads
          // touch distinct col_index values within a row, so writes do not collide.
          source.append("      if ( (row_index == row) && (col_index < row) ) \n");
          source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

          source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append("    } \n");

          source.append("    row_at_window_start = row_at_window_end; \n");
          source.append("  } \n");

            // final step: Divide vector by diagonal entries:
          source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
          source.append("    vector[i] /= diagonal_entries[i]; \n");
          source.append("} \n");

        }
+
        template <typename StringType>
        void generate_compressed_matrix_trans_lu_forward(StringType & source, std::string const & numeric_string)
        {
          // Appends the OpenCL kernel 'trans_lu_forward' to 'source': in-place forward
          // substitution with a transposed factor for a CSR matrix, using an explicit
          // array of diagonal entries. Scatter-based, mirroring trans_lu_backward but
          // streaming the nonzeros front to back and updating col_index > row.
          //
          // NOTE(review): barrier() synchronizes only within one work-group -- the
          // kernel presumably expects a single-work-group launch; confirm at call site.

          // forward substitution with the transposed factor: solve in-place on 'vector',
          // diagonal taken from 'diagonal_entries' (CSR storage as above)
          source.append("__kernel void trans_lu_forward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
          // __local scratch; indexing by get_local_id(0) requires local size <= 256.
          source.append("  __local unsigned int row_index_lookahead[256]; \n");
          source.append("  __local unsigned int row_index_buffer[256]; \n");

          source.append("  unsigned int row_index; \n");
          source.append("  unsigned int col_index; \n");
          source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int row_at_window_start = 0; \n");
          source.append("  unsigned int row_at_window_end = 0; \n");
          source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append("  { \n");
          source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
          source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
          source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          // Each thread finds the row its nonzero belongs to by scanning the lookahead
          // of row start offsets upward from row_at_window_start.
          source.append("    if (i < nnz) \n");
          source.append("    { \n");
          source.append("      unsigned int row_index_inc = 0; \n");
          source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
          source.append("        ++row_index_inc; \n");
          source.append("      row_index = row_at_window_start + row_index_inc; \n");
          source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append("    } \n");
          source.append("    else \n");
          source.append("    { \n");
          source.append("      row_index = size+1; \n"); // sentinel: padding lanes never match any row
          source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append("    row_at_window_start = row_index_buffer[0]; \n");
          source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

              //forward elimination
          source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
          source.append("    { \n");
          source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

          // Only threads holding an entry of the current row scatter; distinct threads
          // touch distinct col_index values within a row, so writes do not collide.
          source.append("      if ( (row_index == row) && (col_index > row) ) \n");
          source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

          source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append("    } \n");

          source.append("    row_at_window_start = row_at_window_end; \n");
          source.append("  } \n");

            // final step: Divide vector by diagonal entries:
          source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
          source.append("    vector[i] /= diagonal_entries[i]; \n");
          source.append("} \n");

        }
+
        template <typename StringType>
        void generate_compressed_matrix_trans_unit_lu_backward(StringType & source, std::string const & numeric_string)
        {
          // Appends the OpenCL kernel 'trans_unit_lu_backward' to 'source': in-place
          // backward substitution with a transposed, unit-diagonal factor for a CSR
          // matrix. Identical scheme to trans_lu_backward, but no diagonal_entries
          // argument, no division inside the sweep, and no final scaling pass.
          //
          // NOTE(review): barrier() synchronizes only within one work-group -- the
          // kernel presumably expects a single-work-group launch; confirm at call site.

          // backward substitution with the transposed unit-diagonal factor,
          // solved in-place on 'vector' (CSR storage as above)
          source.append("__kernel void trans_unit_lu_backward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
          // __local scratch; indexing by get_local_id(0) requires local size <= 256.
          source.append("  __local unsigned int row_index_lookahead[256]; \n");
          source.append("  __local unsigned int row_index_buffer[256]; \n");

          source.append("  unsigned int row_index; \n");
          source.append("  unsigned int col_index; \n");
          source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int row_at_window_start = size; \n");
          source.append("  unsigned int row_at_window_end; \n");
          source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          // i2 counts chunks front to back; i = nnz-1-i2 streams the nonzeros backwards.
          source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
          source.append("  { \n");
          source.append("    unsigned int i = (nnz - i2) - 1; \n");
          source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
          source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
          source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          // Each thread locates the row owning its nonzero via the staged lookahead.
          source.append("    if (i2 < nnz) \n");
          source.append("    { \n");
          source.append("      unsigned int row_index_dec = 0; \n");
          source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
          source.append("        ++row_index_dec; \n");
          source.append("      row_index = row_at_window_start - row_index_dec; \n");
          source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append("    } \n");
          source.append("    else \n");
          source.append("    { \n");
          source.append("      row_index = size+1; \n"); // sentinel: padding lanes never match any row
          source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append("    row_at_window_start = row_index_buffer[0]; \n");
          source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

              //backward elimination
          source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
          source.append("    { \n");
          source.append("      unsigned int row = row_at_window_start - row2; \n");
          source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n"); // unit diagonal: no division

          source.append("      if ( (row_index == row) && (col_index < row) ) \n");
          source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

          source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append("    } \n");

          source.append("    row_at_window_start = row_at_window_end; \n");
          source.append("  } \n");
          source.append("} \n");

        }
+
+
        template <typename StringType>
        void generate_compressed_matrix_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
        {
          // Appends the OpenCL kernel 'trans_unit_lu_forward' to 'source': in-place
          // forward substitution with a transposed, unit-diagonal factor for a CSR
          // matrix. Identical scheme to trans_lu_forward, but no diagonal_entries
          // argument, no division inside the sweep, and no final scaling pass.
          //
          // NOTE(review): barrier() synchronizes only within one work-group -- the
          // kernel presumably expects a single-work-group launch; confirm at call site.

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void trans_unit_lu_forward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
          // __local scratch; indexing by get_local_id(0) requires local size <= 256.
          source.append("  __local unsigned int row_index_lookahead[256]; \n");
          source.append("  __local unsigned int row_index_buffer[256]; \n");

          source.append("  unsigned int row_index; \n");
          source.append("  unsigned int col_index; \n");
          source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int row_at_window_start = 0; \n");
          source.append("  unsigned int row_at_window_end = 0; \n");
          source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append("  { \n");
          source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
          source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
          source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          // Each thread locates the row owning its nonzero via the staged lookahead.
          source.append("    if (i < nnz) \n");
          source.append("    { \n");
          source.append("      unsigned int row_index_inc = 0; \n");
          source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
          source.append("        ++row_index_inc; \n");
          source.append("      row_index = row_at_window_start + row_index_inc; \n");
          source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append("    } \n");
          source.append("    else \n");
          source.append("    { \n");
          source.append("      row_index = size+1; \n"); // sentinel: padding lanes never match any row
          source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append("    row_at_window_start = row_index_buffer[0]; \n");
          source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

              //forward elimination
          source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
          source.append("    { \n");
          source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n"); // unit diagonal: no division

          source.append("      if ( (row_index == row) && (col_index > row) ) \n");
          source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

          source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append("    } \n");

          source.append("    row_at_window_start = row_at_window_end; \n");
          source.append("  } \n");
          source.append("} \n");

        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_trans_unit_lu_forward_slow(StringType & source, std::string const & numeric_string)
+        {
+
+          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+          source.append("__kernel void trans_unit_lu_forward_slow( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
+          source.append("          unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int row = 0; row < size; ++row) \n");
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+          source.append("    unsigned int row_start = row_indices[row]; \n");
+          source.append("    unsigned int row_stop  = row_indices[row + 1]; \n");
+          source.append("    for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
+          source.append("    { \n");
+          source.append("      unsigned int col_index = column_indices[entry_index]; \n");
+          source.append("      if (col_index > row) \n");
+          source.append("        vector[col_index] -= result_entry * elements[entry_index]; \n");
+          source.append("    } \n");
+
+          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+        }
+
        /** Appends the OpenCL kernel "unit_lu_backward" to 'source'.
         *
         *  Computes x in U x = y in-place on 'vector' for an incomplete LU
         *  factorization stored in CSR format.  "unit" refers to a unit diagonal:
         *  no division by diagonal entries occurs anywhere in the kernel.
         *
         *  Strategy: the nonzeros are traversed from the back of the matrix towards
         *  the front in windows of get_local_size(0) entries.  All work items
         *  cooperatively stage one window (entries, column indices and the matching
         *  vector values) into __local memory; work item 0 then performs the actual
         *  substitution sequentially within that window.
         *
         *  NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes work items
         *  within a single work-group, so the kernel appears to rely on being
         *  launched with exactly one work-group of at most 128 work items (the size
         *  of the __local buffers) -- confirm against the host-side launch code.
         *
         *  @param source          String object the kernel text is appended to.
         *  @param numeric_string  Scalar type as a string (e.g. "float", "double").
         */
        template <typename StringType>
        void generate_compressed_matrix_unit_lu_backward(StringType & source, std::string const & numeric_string)
        {

          // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void unit_lu_backward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
              // staging buffers in local memory; their size of 128 bounds the usable work-group size:
          source.append("  __local  unsigned int col_index_buffer[128]; \n");
          source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int current_row = size-1; \n");
          source.append("  unsigned int row_at_window_start = size-1; \n");
          source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
              // start index of the last window of nonzeros (windows advance backwards from here):
          source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
          source.append("  unsigned int next_row = row_indices[size-1]; \n");

          source.append("  unsigned int i = loop_end + get_local_id(0); \n");
          source.append("  while (1) \n");
          source.append("  { \n");
              //load into shared memory (coalesced access):
          source.append("    if (i < nnz) \n");
          source.append("    { \n");
          source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append("      unsigned int tmp = column_indices[i]; \n");
          source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

              //now a single thread does the remaining work in shared memory:
          source.append("    if (get_local_id(0) == 0) \n");
          source.append("    { \n");
              // traverse through all the loaded data from back to front:
          source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
          source.append("      { \n");
          source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");

              // skip padding entries of a partial window:
          source.append("        if (i+k >= nnz) \n");
          source.append("          continue; \n");

          source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
          source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append("        else if (col_index_buffer[k] > current_row) \n"); //use buffered data
          source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");

          source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
          source.append("        { \n");
          source.append("          vector[current_row] = current_vector_entry; \n");
          source.append("          if (current_row > 0) \n"); //load next row's data
          source.append("          { \n");
          source.append("            --current_row; \n");
          source.append("            next_row = row_indices[current_row]; \n");
          source.append("            current_vector_entry = vector[current_row]; \n");
          source.append("          } \n");
          source.append("        } \n");


          source.append("      } \n"); // for k

          source.append("      row_at_window_start = current_row; \n");
          source.append("    } \n"); // if (get_local_id(0) == 0)

          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");

              // the window starting at index 0 has just been processed -> done:
          source.append("    if (i < get_local_size(0)) \n");
          source.append("      break; \n");

          source.append("    i -= get_local_size(0); \n");
          source.append("  } \n"); //for i
          source.append("} \n");

        }
+
        /** Appends the OpenCL kernel "unit_lu_forward" to 'source'.
         *
         *  Computes y in L y = z in-place on 'vector' for an incomplete LU
         *  factorization stored in CSR format.  "unit" refers to a unit diagonal:
         *  no division by diagonal entries occurs anywhere in the kernel.
         *
         *  Strategy: the nonzeros are traversed from the front of the matrix in
         *  windows of get_local_size(0) entries.  All work items cooperatively
         *  stage one window (entries, column indices and the matching vector
         *  values) into __local memory; work item 0 then performs the actual
         *  substitution sequentially within that window.
         *
         *  NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes work items
         *  within a single work-group, so the kernel appears to rely on being
         *  launched with exactly one work-group of at most 128 work items (the size
         *  of the __local buffers) -- confirm against the host-side launch code.
         *
         *  @param source          String object the kernel text is appended to.
         *  @param numeric_string  Scalar type as a string (e.g. "float", "double").
         */
        template <typename StringType>
        void generate_compressed_matrix_unit_lu_forward(StringType & source, std::string const & numeric_string)
        {

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void unit_lu_forward( \n");
          source.append("          __global const unsigned int * row_indices, \n");
          source.append("          __global const unsigned int * column_indices, \n");
          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append("          unsigned int size) \n");
          source.append("{ \n");
              // staging buffers in local memory; their size of 128 bounds the usable work-group size:
          source.append("  __local  unsigned int col_index_buffer[128]; \n");
          source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append("  unsigned int nnz = row_indices[size]; \n");
          source.append("  unsigned int current_row = 0; \n");
          source.append("  unsigned int row_at_window_start = 0; \n");
          source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
              // round the iteration count up so every nonzero falls into some window:
          source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
          source.append("  unsigned int next_row = row_indices[1]; \n");

          source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append("  { \n");
              //load into shared memory (coalesced access):
          source.append("    if (i < nnz) \n");
          source.append("    { \n");
          source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append("      unsigned int tmp = column_indices[i]; \n");
          source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

              //now a single thread does the remaining work in shared memory:
          source.append("    if (get_local_id(0) == 0) \n");
          source.append("    { \n");
                // traverse through all the loaded data:
          source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
          source.append("      { \n");
          source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
          source.append("        { \n");
          source.append("          vector[current_row] = current_vector_entry; \n");
          source.append("          ++current_row; \n");
          source.append("          if (current_row < size) //load next row's data \n");
          source.append("          { \n");
          source.append("            next_row = row_indices[current_row+1]; \n");
          source.append("            current_vector_entry = vector[current_row]; \n");
          source.append("          } \n");
          source.append("        } \n");

              // strictly-lower entries only (col < current_row); unit diagonal assumed:
          source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
          source.append("        { \n");
          source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
          source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append("          else if (col_index_buffer[k] < current_row) \n"); //use buffered data
          source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
          source.append("        } \n");

          source.append("      } \n"); // for k

          source.append("      row_at_window_start = current_row; \n");
          source.append("    } \n"); // if (get_local_id(0) == 0)

          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append("  } //for i \n");
          source.append("} \n");

        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_vec_mul(StringType & source, std::string const & numeric_string)
+        {
+
+          source.append("__kernel void vec_mul( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("          uint4 layout_x, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          uint4 layout_result) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+          source.append("    unsigned int row_end = row_indices[row+1]; \n");
+          source.append("    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+          source.append("      dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+          source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_vec_mul4(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vec_mul4( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const uint4 * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append("4 * elements, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("          uint4 layout_x, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          uint4 layout_result) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
+          source.append("  unsigned int start, next_stop; \n");
+          source.append("  uint4 col_idx; \n");
+          source.append("  "); source.append(numeric_string); source.append("4 tmp_vec; \n");
+          source.append("  "); source.append(numeric_string); source.append("4 tmp_entries; \n");
+
+          source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    dot_prod = 0; \n");
+          source.append("    start = row_indices[row] / 4; \n");
+          source.append("    next_stop = row_indices[row+1] / 4; \n");
+
+          source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
+          source.append("    { \n");
+          source.append("      col_idx = column_indices[i]; \n");
+
+          source.append("      tmp_entries = elements[i]; \n");
+          source.append("      tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
+
+          source.append("      dot_prod += dot(tmp_entries, tmp_vec); \n");
+          source.append("    } \n");
+          source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_vec_mul8(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vec_mul8( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const uint8 * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append("8 * elements, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("          uint4 layout_x, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          uint4 layout_result) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
+          source.append("  unsigned int start, next_stop; \n");
+          source.append("  uint8 col_idx; \n");
+          source.append("  "); source.append(numeric_string); source.append("8 tmp_vec; \n");
+          source.append("  "); source.append(numeric_string); source.append("8 tmp_entries; \n");
+
+          source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    dot_prod = 0; \n");
+          source.append("    start = row_indices[row] / 8; \n");
+          source.append("    next_stop = row_indices[row+1] / 8; \n");
+
+          source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
+          source.append("    { \n");
+          source.append("      col_idx = column_indices[i]; \n");
+
+          source.append("      tmp_entries = elements[i]; \n");
+          source.append("      tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
+          source.append("      tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
+
+          source.append("      dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
+          source.append("      dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
+          source.append("    } \n");
+          source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_compressed_matrix_vec_mul_cpu(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vec_mul_cpu( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * vector, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
+          source.append("  unsigned int row_start = get_global_id(0) * work_per_item; \n");
+          source.append("  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
+          source.append("  for (unsigned int row = row_start; row < row_stop; ++row) \n");
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
+          source.append("    unsigned int row_end = row_indices[row+1]; \n");
+          source.append("    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+          source.append("      dot_prod += elements[i] * vector[column_indices[i]]; \n");
+          source.append("    result[row] = dot_prod; \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+        }
+
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for compressed_matrix. */
+        template <typename NumericT>
+        struct compressed_matrix
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_compressed_matrix_block_trans_lu_backward(source, numeric_string);
+                generate_compressed_matrix_block_trans_unit_lu_forward(source, numeric_string);
+                generate_compressed_matrix_jacobi(source, numeric_string);
+                generate_compressed_matrix_lu_backward(source, numeric_string);
+                generate_compressed_matrix_lu_forward(source, numeric_string);
+                generate_compressed_matrix_trans_lu_backward(source, numeric_string);
+                generate_compressed_matrix_trans_lu_forward(source, numeric_string);
+                generate_compressed_matrix_trans_unit_lu_backward(source, numeric_string);
+                generate_compressed_matrix_trans_unit_lu_forward(source, numeric_string);
+                generate_compressed_matrix_trans_unit_lu_forward_slow(source, numeric_string);
+                generate_compressed_matrix_unit_lu_backward(source, numeric_string);
+                generate_compressed_matrix_unit_lu_forward(source, numeric_string);
+              }
+              generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
+              generate_compressed_matrix_row_info_extractor(source, numeric_string);
+              generate_compressed_matrix_vec_mul(source, numeric_string);
+              generate_compressed_matrix_vec_mul4(source, numeric_string);
+              generate_compressed_matrix_vec_mul8(source, numeric_string);
+              generate_compressed_matrix_vec_mul_cpu(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp b/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
new file mode 100644
index 0000000..7609587
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
@@ -0,0 +1,382 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
+ *  @brief OpenCL kernel file for coordinate_matrix operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
        /** Appends the OpenCL kernel "vec_mul" for coordinate_matrix to 'source':
         *  sparse matrix-vector product on COO storage (one (row, column) uint2
         *  per nonzero).  Each work-group processes the nonzero range
         *  [group_boundaries[g], group_boundaries[g+1]) in windows of
         *  get_local_size(0) entries, combining the per-entry products with a
         *  segmented parallel reduction in __local memory (rows form the segments).
         *  A row that spans two windows is handled by the carry logic at the top of
         *  the k-loop; the final partial row of the group is written out after the
         *  loop.
         *
         *  @param source          String object the kernel text is appended to.
         *  @param numeric_string  Scalar type as a string (e.g. "float", "double").
         */
        template <typename StringType>
        void generate_coordinate_matrix_vec_mul(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void vec_mul( \n");
          source.append("  __global const uint2 * coords,  \n");//(row_index, column_index)
          source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append("  __global const uint  * group_boundaries, \n");
          source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
          source.append("  uint4 layout_x, \n");
          source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append("  uint4 layout_result, \n");
          source.append("  __local unsigned int * shared_rows, \n");
          source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
          source.append("{ \n");
          source.append("  uint2 tmp; \n");
          source.append("  "); source.append(numeric_string); source.append(" val; \n");
          source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
          source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
          source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

          source.append("  uint local_index = 0; \n");

          source.append("  for (uint k = 0; k < k_end; ++k) { \n");
          source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

              // out-of-range work items contribute a neutral (row 0, value 0) entry:
          source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
          source.append("    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");

          //check for carry from previous loop run:
          source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
          source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
          source.append("        val += inter_results[get_local_size(0)-1]; \n");
          source.append("      else \n");
          source.append("        result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] = inter_results[get_local_size(0)-1]; \n");
          source.append("    } \n");

          //segmented parallel reduction begin
          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
          source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
          source.append("    inter_results[get_local_id(0)] = val; \n");
          source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

              // doubling-stride scan: only accumulate neighbors belonging to the same row
          source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
          source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
          source.append("      inter_results[get_local_id(0)] += left; \n");
          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
          source.append("    } \n");
          //segmented parallel reduction end

              // the last work item of each row segment writes that row's total:
          source.append("    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
          source.append("      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
          source.append("      result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
          source.append("    } \n");

          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
          source.append("  }  \n"); //for k

          source.append("  if (local_index + 1 == group_end) \n");  //write results of last active entry (this may not necessarily be the case already)
          source.append("    result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
          source.append("} \n");

        }
+
+        namespace detail
+        {
+          /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
+          template <typename StringType>
+          void generate_coordinate_matrix_dense_matrix_mul(StringType & source, std::string const & numeric_string,
+                                                           bool B_transposed, bool B_row_major, bool C_row_major)
+          {
+            source.append("__kernel void ");
+            source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+            source.append("( \n");
+            source.append("  __global const uint2 * coords,  \n");//(row_index, column_index)
+            source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+            source.append("  __global const uint  * group_boundaries, \n");
+            source.append("  __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+            source.append("  unsigned int d_mat_row_start, \n");
+            source.append("  unsigned int d_mat_col_start, \n");
+            source.append("  unsigned int d_mat_row_inc, \n");
+            source.append("  unsigned int d_mat_col_inc, \n");
+            source.append("  unsigned int d_mat_row_size, \n");
+            source.append("  unsigned int d_mat_col_size, \n");
+            source.append("  unsigned int d_mat_internal_rows, \n");
+            source.append("  unsigned int d_mat_internal_cols, \n");
+            source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+            source.append("  unsigned int result_row_start, \n");
+            source.append("  unsigned int result_col_start, \n");
+            source.append("  unsigned int result_row_inc, \n");
+            source.append("  unsigned int result_col_inc, \n");
+            source.append("  unsigned int result_row_size, \n");
+            source.append("  unsigned int result_col_size, \n");
+            source.append("  unsigned int result_internal_rows, \n");
+            source.append("  unsigned int result_internal_cols, \n");
+            source.append("  __local unsigned int * shared_rows, \n");
+            source.append("  __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+            source.append("{ \n");
+            source.append("  uint2 tmp; \n");
+            source.append("  "); source.append(numeric_string); source.append(" val; \n");
+            source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+            source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+            source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+            source.append("  uint local_index = 0; \n");
+
+            source.append("  for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
+            source.append("   for (uint k = 0; k < k_end; ++k) { \n");
+            source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+            source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+            if (B_transposed && B_row_major)
+              source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +      tmp.y * d_mat_col_inc ] : 0; \n");
+            if (B_transposed && !B_row_major)
+              source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc)                       + (d_mat_col_start +      tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
+            else if (!B_transposed && B_row_major)
+              source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start +      tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
+            else
+              source.append("    val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start +      tmp.y * d_mat_row_inc)                       + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
+
+            //check for carry from previous loop run:
+            source.append("    if (get_local_id(0) == 0 && k > 0) { \n");
+            source.append("      if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+            source.append("        val += inter_results[get_local_size(0)-1]; \n");
+            source.append("      else \n");
+            if (C_row_major)
+              source.append("        result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
+            else
+              source.append("        result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
+            source.append("    } \n");
+
+            //segmented parallel reduction begin
+            source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+            source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+            source.append("    inter_results[get_local_id(0)] = val; \n");
+            source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
+            source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+            source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+            source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+            source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+            source.append("      inter_results[get_local_id(0)] += left; \n");
+            source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+            source.append("    } \n");
+            //segmented parallel reduction end
+
+            source.append("    if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+            source.append("      shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+            if (C_row_major)
+              source.append("      result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
+            else
+              source.append("      result[(tmp.x * result_row_inc + result_row_start)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
+            source.append("    } \n");
+
+            source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+            source.append("   }  \n"); //for k
+
+            source.append("   if (local_index + 1 == group_end) \n");  //write results of last active entry (this may not necessarily be the case already)
+            if (C_row_major)
+              source.append("    result[(tmp.x  * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
+            else
+              source.append("    result[(tmp.x  * result_row_inc + result_row_start)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
+            source.append("  } \n"); //for result_col
+            source.append("} \n");
+
+          }
+        }
+
+        template <typename StringType>
+        void generate_coordinate_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string)
+        {
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false,  true);
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false,  true, false);
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false,  true,  true);
+
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false,  true);
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true,  true, false);
+          detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true,  true,  true);
+        }
+
+        template <typename StringType>
+        void generate_coordinate_matrix_row_info_extractor(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void row_info_extractor( \n");
+          source.append("          __global const uint2 * coords,  \n");//(row_index, column_index)
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global const uint  * group_boundaries, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int option, \n");
+          source.append("          __local unsigned int * shared_rows, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+          source.append("{ \n");
+          source.append("  uint2 tmp; \n");
+          source.append("  "); source.append(numeric_string); source.append(" val; \n");
+          source.append("  uint last_index  = get_local_size(0) - 1; \n");
+          source.append("  uint group_start = group_boundaries[get_group_id(0)]; \n");
+          source.append("  uint group_end   = group_boundaries[get_group_id(0) + 1]; \n");
+          source.append("  uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+          source.append("  uint local_index = 0; \n");
+
+          source.append("  for (uint k = 0; k < k_end; ++k) \n");
+          source.append("  { \n");
+          source.append("    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+          source.append("    tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+          source.append("    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");
+
+              //check for carry from previous loop run:
+          source.append("    if (get_local_id(0) == 0 && k > 0) \n");
+          source.append("    { \n");
+          source.append("      if (tmp.x == shared_rows[last_index]) \n");
+          source.append("      { \n");
+          source.append("        switch (option) \n");
+          source.append("        { \n");
+          source.append("          case 0: \n"); //inf-norm
+          source.append("          case 3: \n"); //diagonal entry
+          source.append("            val = max(val, fabs(inter_results[last_index])); \n");
+          source.append("            break; \n");
+
+          source.append("          case 1: \n"); //1-norm
+          source.append("            val = fabs(val) + inter_results[last_index]; \n");
+          source.append("            break; \n");
+
+          source.append("          case 2: \n"); //2-norm
+          source.append("            val = sqrt(val * val + inter_results[last_index]); \n");
+          source.append("            break; \n");
+
+          source.append("          default: \n");
+          source.append("            break; \n");
+          source.append("        } \n");
+          source.append("      } \n");
+          source.append("      else \n");
+          source.append("      { \n");
+          source.append("        switch (option) \n");
+          source.append("        { \n");
+          source.append("          case 0: \n"); //inf-norm
+          source.append("          case 1: \n"); //1-norm
+          source.append("          case 3: \n"); //diagonal entry
+          source.append("            result[shared_rows[last_index]] = inter_results[last_index]; \n");
+          source.append("            break; \n");
+
+          source.append("          case 2: \n"); //2-norm
+          source.append("            result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
+          source.append("          default: \n");
+          source.append("            break; \n");
+          source.append("        } \n");
+          source.append("      } \n");
+          source.append("    } \n");
+
+              //segmented parallel reduction begin
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    shared_rows[get_local_id(0)] = tmp.x; \n");
+          source.append("    switch (option) \n");
+          source.append("    { \n");
+          source.append("      case 0: \n");
+          source.append("      case 3: \n");
+          source.append("        inter_results[get_local_id(0)] = val; \n");
+          source.append("        break; \n");
+          source.append("      case 1: \n");
+          source.append("        inter_results[get_local_id(0)] = fabs(val); \n");
+          source.append("        break; \n");
+          source.append("      case 2: \n");
+          source.append("        inter_results[get_local_id(0)] = val * val; \n");
+          source.append("      default: \n");
+          source.append("        break; \n");
+          source.append("    } \n");
+          source.append("    "); source.append(numeric_string); source.append(" left = 0; \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
+          source.append("    { \n");
+          source.append("      left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(")0; \n");
+          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("      switch (option) \n");
+          source.append("      { \n");
+          source.append("        case 0: \n"); //inf-norm
+          source.append("        case 3: \n"); //diagonal entry
+          source.append("          inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
+          source.append("          break; \n");
+
+          source.append("        case 1: \n"); //1-norm
+          source.append("          inter_results[get_local_id(0)] += left; \n");
+          source.append("          break; \n");
+
+          source.append("        case 2: \n"); //2-norm
+          source.append("          inter_results[get_local_id(0)] += left; \n");
+          source.append("          break; \n");
+
+          source.append("        default: \n");
+          source.append("          break; \n");
+          source.append("      } \n");
+          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    } \n");
+              //segmented parallel reduction end
+
+          source.append("    if (get_local_id(0) != last_index && \n");
+          source.append("        shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
+          source.append("        inter_results[get_local_id(0)] != 0) \n");
+          source.append("    { \n");
+          source.append("      result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
+          source.append("    } \n");
+
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("  } \n"); //for k
+
+          source.append("  if (get_local_id(0) == last_index && inter_results[last_index] != 0) \n");
+          source.append("    result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index]; \n");
+          source.append("} \n");
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for coordinate_matrix. */
+        template <typename NumericT>
+        struct coordinate_matrix
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_coordinate_matrix";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              generate_coordinate_matrix_vec_mul(source, numeric_string);
+              generate_coordinate_matrix_dense_matrix_multiplication(source, numeric_string);
+              generate_coordinate_matrix_row_info_extractor(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/ell_matrix.hpp b/viennacl/linalg/opencl/kernels/ell_matrix.hpp
new file mode 100644
index 0000000..ef23e5e
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/ell_matrix.hpp
@@ -0,0 +1,195 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ELL_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ELL_MATRIX_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/ell_matrix.hpp
+ *  @brief OpenCL kernel file for ell_matrix operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        template <typename StringType>
+        void generate_ell_vec_mul(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vec_mul( \n");
+          source.append("  __global const unsigned int * coords, \n");
+          source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("  uint4 layout_x, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("  uint4 layout_result, \n");
+          source.append("  unsigned int row_num, \n");
+          source.append("  unsigned int col_num, \n");
+          source.append("  unsigned int internal_row_num, \n");
+          source.append("  unsigned int items_per_row, \n");
+          source.append("  unsigned int aligned_items_per_row) \n");
+          source.append("{ \n");
+          source.append("  uint glb_id = get_global_id(0); \n");
+          source.append("  uint glb_sz = get_global_size(0); \n");
+
+          source.append("  for(uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+          source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+          source.append("    uint offset = row_id; \n");
+          source.append("    for(uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+          source.append("      "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+
+          source.append("       if(val != 0.0f) { \n");
+          source.append("          int col = coords[offset]; \n");
+          source.append("          sum += (x[col * layout_x.y + layout_x.x] * val); \n");
+          source.append("       } \n");
+
+          source.append("    } \n");
+
+          source.append("    result[row_id * layout_result.y + layout_result.x] = sum; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        namespace detail
+        {
+          template <typename StringType>
+          void generate_ell_matrix_dense_matrix_mul(StringType & source, std::string const & numeric_string,
+                                                    bool B_transposed, bool B_row_major, bool C_row_major)
+          {
+            source.append("__kernel void ");
+            source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+            source.append("( \n");
+            source.append("    __global const unsigned int * sp_mat_coords, \n");
+            source.append("    __global const "); source.append(numeric_string); source.append(" * sp_mat_elems, \n");
+            source.append("    unsigned int sp_mat_row_num, \n");
+            source.append("    unsigned int sp_mat_col_num, \n");
+            source.append("    unsigned int sp_mat_internal_row_num, \n");
+            source.append("    unsigned int sp_mat_items_per_row, \n");
+            source.append("    unsigned int sp_mat_aligned_items_per_row, \n");
+            source.append("    __global const "); source.append(numeric_string); source.append("* d_mat, \n");
+            source.append("    unsigned int d_mat_row_start, \n");
+            source.append("    unsigned int d_mat_col_start, \n");
+            source.append("    unsigned int d_mat_row_inc, \n");
+            source.append("    unsigned int d_mat_col_inc, \n");
+            source.append("    unsigned int d_mat_row_size, \n");
+            source.append("    unsigned int d_mat_col_size, \n");
+            source.append("    unsigned int d_mat_internal_rows, \n");
+            source.append("    unsigned int d_mat_internal_cols, \n");
+            source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
+            source.append("    unsigned int result_row_start, \n");
+            source.append("    unsigned int result_col_start, \n");
+            source.append("    unsigned int result_row_inc, \n");
+            source.append("    unsigned int result_col_inc, \n");
+            source.append("    unsigned int result_row_size, \n");
+            source.append("    unsigned int result_col_size, \n");
+            source.append("    unsigned int result_internal_rows, \n");
+            source.append("    unsigned int result_internal_cols) { \n");
+
+            source.append("    uint glb_id = get_global_id(0); \n");
+            source.append("    uint glb_sz = get_global_size(0); \n");
+
+            source.append("    for( uint rc = glb_id; rc < (sp_mat_row_num * result_col_size); rc += glb_sz) { \n");
+            source.append("      uint row = rc % sp_mat_row_num; \n");
+            source.append("      uint col = rc / sp_mat_row_num; \n");
+
+            source.append("      uint offset = row; \n");
+            source.append("      "); source.append(numeric_string); source.append(" r = ("); source.append(numeric_string); source.append(")0; \n");
+
+            source.append("      for( uint k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) { \n");
+
+            source.append("        uint j = sp_mat_coords[offset]; \n");
+            source.append("        "); source.append(numeric_string); source.append(" x = sp_mat_elems[offset]; \n");
+
+            source.append("        if(x != ("); source.append(numeric_string); source.append(")0) { \n");
+            source.append("          "); source.append(numeric_string);
+            if (B_transposed && B_row_major)
+              source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
+            else if (B_transposed && !B_row_major)
+              source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+            else if (!B_transposed && B_row_major)
+              source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+            else
+              source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+
+            source.append("          r += x*y; \n");
+            source.append("        } \n");
+            source.append("      } \n");
+
+            if (C_row_major)
+              source.append("      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+            else
+              source.append("      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+            source.append("    } \n");
+            source.append("} \n");
+
+          }
+        }
+
+        template <typename StringType>
+        void generate_ell_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string)
+        {
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, false,  true);
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false,  true, false);
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false,  true,  true);
+
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, false,  true);
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true,  true, false);
+          detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true,  true,  true);
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for ell_matrix. */
+        template <typename NumericT>
+        struct ell_matrix
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_ell_matrix";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // fully parametrized kernels:
+              generate_ell_vec_mul(source, numeric_string);
+              generate_ell_matrix_dense_matrix_multiplication(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/fft.hpp b/viennacl/linalg/opencl/kernels/fft.hpp
new file mode 100644
index 0000000..78b0b79
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/fft.hpp
@@ -0,0 +1,294 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/fft.hpp
+ *  @brief OpenCL kernel file for FFT operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+        // Postprocessing phase of Bluestein algorithm
+        /** @brief Appends the OpenCL kernel 'bluestein_post' to 'source'.
+          *
+          * For each i in [0, size) the emitted kernel multiplies the complex entry Z[i]
+          * (stored as a two-component vector) with the chirp factor
+          * b_i = (cos(a), sin(a)), a = -pi * ((i*i) mod (2*size)) / size,
+          * and writes the complex product to out[i]. Work items stride over the index range.
+          *
+          * @param source          String object the kernel source code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double"); the complex type is numeric_string + "2"
+          */
+        template <typename StringType>
+        void generate_fft_bluestein_post(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
+          source.append("                             __global "); source.append(numeric_string); source.append("2 *out, \n");
+          source.append("                             unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  unsigned int glb_id = get_global_id(0); \n");
+          source.append("  unsigned int glb_sz = get_global_size(0); \n");
+
+          source.append("  unsigned int double_size = size << 1; \n");
+          source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
+          source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+          source.append("  for(unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+          // (i*i) mod (2*size) keeps the chirp exponent in range. NOTE(review): i*i can overflow 32 bits for large sizes -- confirm intended size limits
+          source.append("    unsigned int rm = i * i % (double_size); \n");
+          source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");
+
+          // OpenCL sincos returns the sine and stores the cosine through the pointer argument
+          source.append("    sn_a = sincos(angle, &cs_a); \n");
+
+          source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
+          // complex multiplication Z[i] * b_i:
+          source.append("    out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        // Preprocessing phase of Bluestein algorithm
+        /** @brief Appends the OpenCL kernel 'bluestein_pre' to 'source'.
+          *
+          * For each i in [0, size) the emitted kernel computes
+          * angle = pi * ((i*i) mod (2*size)) / size and forms the chirp a_i = (cos(-angle), sin(-angle))
+          * and its conjugate b_i = (cos(-angle), -sin(-angle)). It stores the complex product
+          * A[i] = input[i] * a_i and B[i] = b_i; for i > 0, b_i is additionally mirrored to
+          * B[ext_size - i] to fill the tail of the extended buffer of length ext_size.
+          *
+          * @param source          String object the kernel source code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double"); the complex type is numeric_string + "2"
+          */
+        template <typename StringType>
+        void generate_fft_bluestein_pre(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("2 *A, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("2 *B, \n");
+          source.append("  unsigned int size, \n");
+          source.append("  unsigned int ext_size \n");
+          source.append("  ) { \n");
+          source.append("  unsigned int glb_id = get_global_id(0); \n");
+          source.append("  unsigned int glb_sz = get_global_size(0); \n");
+
+          source.append("  unsigned int double_size = size << 1; \n");
+
+          source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
+          source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+          source.append("  for(unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+          // (i*i) mod (2*size) keeps the chirp exponent in range. NOTE(review): i*i can overflow 32 bits for large sizes -- confirm intended size limits
+          source.append("    unsigned int rm = i * i % (double_size); \n");
+          source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");
+
+          // OpenCL sincos returns the sine and stores the cosine through the pointer argument
+          source.append("    sn_a = sincos(-angle, &cs_a); \n");
+
+          source.append("    "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
+          source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");
+
+          source.append("    A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
+          source.append("    B[i] = b_i; \n");
+
+          // mirror the chirp into the tail of B, skipping i == 0 (scatter write;
+          // the original author flagged this as "very bad instruction, to be fixed")
+          source.append("    if(i) \n");
+          source.append("      B[ext_size - i] = b_i; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Extract real part of a complex number array */
+        template <typename StringType>
+        void generate_fft_complex_to_real(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("  *out, \n");
+          source.append("  unsigned int size) { \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
+          source.append("    out[i] = in[i].x; \n");
+          source.append("} \n");
+        }
+
+        /** @brief OpenCL kernel generation code for dividing a complex number by a real number */
+        template <typename StringType>
+        void generate_fft_div_vec_scalar(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
+          source.append("  unsigned int size, \n");
+          source.append("  "); source.append(numeric_string); source.append(" factor) { \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
+          source.append("    input1[i] /= factor; \n");
+          source.append("} \n");
+        }
+
+        /** @brief Elementwise product of two complex vectors */
+        template <typename StringType>
+        void generate_fft_mult_vec(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
+          source.append("  __global const "); source.append(numeric_string); source.append("2 *input2, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
+          source.append("  unsigned int size) { \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+          source.append("    "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
+          source.append("    "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");
+
+          source.append("    output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Embedds a real-valued vector into a complex one */
+        template <typename StringType>
+        void generate_fft_real_to_complex(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("2 *out, \n");
+          source.append("  unsigned int size) { \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+          source.append("    "); source.append(numeric_string); source.append("2 val = 0; \n");
+          source.append("    val.x = in[i]; \n");
+          source.append("    out[i] = val; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Reverses the entries in a vector */
+        template <typename StringType>
+        void generate_fft_reverse_inplace(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
+          source.append("  for(uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
+          source.append("    "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
+          source.append("    "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");
+
+          source.append("    vec[i] = val2; \n");
+          source.append("    vec[size - i - 1] = val1; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Simplistic matrix transpose function */
+        template <typename StringType>
+        void generate_fft_transpose(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void transpose(__global "); source.append(numeric_string); source.append("2 *input, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
+          source.append("  unsigned int row_num, \n");
+          source.append("  unsigned int col_num) { \n");
+          source.append("  unsigned int size = row_num * col_num; \n");
+          source.append("  for(unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+          source.append("    unsigned int row = i / col_num; \n");
+          source.append("    unsigned int col = i - row*col_num; \n");
+
+          source.append("    unsigned int new_pos = col * row_num + row; \n");
+
+          source.append("    output[new_pos] = input[i]; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Simplistic inplace matrix transpose function */
+        template <typename StringType>
+        void generate_fft_transpose_inplace(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
+          source.append("  unsigned int row_num, \n");
+          source.append("  unsigned int col_num) { \n");
+          source.append("  unsigned int size = row_num * col_num; \n");
+          source.append("  for(unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+          source.append("    unsigned int row = i / col_num; \n");
+          source.append("    unsigned int col = i - row*col_num; \n");
+
+          source.append("    unsigned int new_pos = col * row_num + row; \n");
+
+          source.append("    if(i < new_pos) { \n");
+          source.append("      "); source.append(numeric_string); source.append("2 val = input[i]; \n");
+          source.append("      input[i] = input[new_pos]; \n");
+          source.append("      input[new_pos] = val; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Computes the matrix vector product with a Vandermonde matrix */
+        template <typename StringType>
+        void generate_fft_vandermonde_prod(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" *vector, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" *result, \n");
+          source.append("  uint size) { \n");
+          source.append("  for(uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+          source.append("    "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
+          source.append("    "); source.append(numeric_string); source.append(" pwr = 1; \n");
+          source.append("    "); source.append(numeric_string); source.append(" val = 0; \n");
+
+          source.append("    for(uint j = 0; j < size; j++) { \n");
+          source.append("      val = val + pwr * vector[j]; \n");
+          source.append("      pwr *= mul; \n");
+          source.append("    } \n");
+
+          source.append("    result[i] = val; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Zero two complex vectors (to avoid kernel launch overhead) */
+        template <typename StringType>
+        void generate_fft_zero2(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("2 *input2, \n");
+          source.append("  unsigned int size) { \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+          source.append("    input1[i] = 0; \n");
+          source.append("    input2[i] = 0; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for the fast Fourier transform. */
+        template <class NumericT>
+        struct fft
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_fft";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // unary operations
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_fft_bluestein_post(source, numeric_string);
+                generate_fft_bluestein_pre(source, numeric_string);
+                generate_fft_complex_to_real(source, numeric_string);
+                generate_fft_div_vec_scalar(source, numeric_string);
+                generate_fft_mult_vec(source, numeric_string);
+                generate_fft_real_to_complex(source, numeric_string);
+                generate_fft_reverse_inplace(source, numeric_string);
+                generate_fft_transpose(source, numeric_string);
+                generate_fft_transpose_inplace(source, numeric_string);
+                generate_fft_vandermonde_prod(source, numeric_string);
+                generate_fft_zero2(source, numeric_string);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/hyb_matrix.hpp b/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
new file mode 100644
index 0000000..3282aba
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
@@ -0,0 +1,214 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/hyb_matrix.hpp
+ *  @brief OpenCL kernel file for hyb_matrix operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        template <typename StringType>
+        void generate_hyb_vec_mul(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void vec_mul( \n");
+          source.append("  const __global int* ell_coords, \n");
+          source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+          source.append("  const __global uint* csr_rows, \n");
+          source.append("  const __global uint* csr_cols, \n");
+          source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+          source.append("  const __global "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("  uint4 layout_x, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("  uint4 layout_result, \n");
+          source.append("  unsigned int row_num, \n");
+          source.append("  unsigned int internal_row_num, \n");
+          source.append("  unsigned int items_per_row, \n");
+          source.append("  unsigned int aligned_items_per_row) \n");
+          source.append("{ \n");
+          source.append("  uint glb_id = get_global_id(0); \n");
+          source.append("  uint glb_sz = get_global_size(0); \n");
+
+          source.append("  for(uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+          source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+          source.append("    uint offset = row_id; \n");
+          source.append("    for(uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+          source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+
+          source.append("      if(val != ("); source.append(numeric_string); source.append(")0) { \n");
+          source.append("        int col = ell_coords[offset]; \n");
+          source.append("        sum += (x[col * layout_x.y + layout_x.x] * val); \n");
+          source.append("      } \n");
+
+          source.append("    } \n");
+
+          source.append("    uint col_begin = csr_rows[row_id]; \n");
+          source.append("    uint col_end   = csr_rows[row_id + 1]; \n");
+
+          source.append("    for(uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
+          source.append("      sum += (x[csr_cols[item_id] * layout_x.y + layout_x.x] * csr_elements[item_id]); \n");
+          source.append("    } \n");
+
+          source.append("    result[row_id * layout_result.y + layout_result.x] = sum; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        namespace detail
+        {
+          /** @brief Appends one layout variant of the (hyb_matrix x dense matrix) product kernel to 'source'.
+            *
+            * The kernel name is produced by sparse_dense_matmult_kernel_name(), so each
+            * (B_transposed, B_row_major, C_row_major) combination yields a distinctly named kernel.
+            * Each row of the sparse factor is accumulated from the ELL part (zero entries skipped)
+            * and the CSR remainder part; the flags only select the index arithmetic used to
+            * address the dense factor d_mat and the result matrix.
+            *
+            * @param source          String object the kernel source code is appended to
+            * @param numeric_string  Scalar type as a string ("float" or "double")
+            * @param B_transposed    Selects the index arithmetic for a transposed dense factor
+            * @param B_row_major     Memory layout of the dense factor d_mat
+            * @param C_row_major     Memory layout of the result matrix
+            */
+          template <typename StringType>
+          void generate_hyb_matrix_dense_matrix_mul(StringType & source, std::string const & numeric_string,
+                                                    bool B_transposed, bool B_row_major, bool C_row_major)
+          {
+            source.append("__kernel void ");
+            source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+            source.append("( \n");
+            // ELL part of the hyb matrix:
+            source.append("  const __global int* ell_coords, \n");
+            source.append("  const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+            // CSR remainder part:
+            source.append("  const __global uint* csr_rows, \n");
+            source.append("  const __global uint* csr_cols, \n");
+            source.append("  const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+            source.append("  unsigned int row_num, \n");
+            source.append("  unsigned int internal_row_num, \n");
+            source.append("  unsigned int items_per_row, \n");
+            source.append("  unsigned int aligned_items_per_row, \n");
+            // dense factor with start/inc/size/internal-size descriptors:
+            source.append("    __global const "); source.append(numeric_string); source.append("* d_mat, \n");
+            source.append("    unsigned int d_mat_row_start, \n");
+            source.append("    unsigned int d_mat_col_start, \n");
+            source.append("    unsigned int d_mat_row_inc, \n");
+            source.append("    unsigned int d_mat_col_inc, \n");
+            source.append("    unsigned int d_mat_row_size, \n");
+            source.append("    unsigned int d_mat_col_size, \n");
+            source.append("    unsigned int d_mat_internal_rows, \n");
+            source.append("    unsigned int d_mat_internal_cols, \n");
+            // result matrix with the same kind of descriptors:
+            source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
+            source.append("    unsigned int result_row_start, \n");
+            source.append("    unsigned int result_col_start, \n");
+            source.append("    unsigned int result_row_inc, \n");
+            source.append("    unsigned int result_col_inc, \n");
+            source.append("    unsigned int result_row_size, \n");
+            source.append("    unsigned int result_col_size, \n");
+            source.append("    unsigned int result_internal_rows, \n");
+            source.append("    unsigned int result_internal_cols) { \n");
+
+            source.append("  uint glb_id = get_global_id(0); \n");
+            source.append("  uint glb_sz = get_global_size(0); \n");
+
+            source.append("  for(uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
+            source.append("   for(uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+            source.append("    "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+            // ELL part: items_per_row entries per row, column-wise storage with stride internal_row_num
+            source.append("    uint offset = row_id; \n");
+            source.append("    for(uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+            source.append("      "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+
+            source.append("      if(val != ("); source.append(numeric_string); source.append(")0) { \n");
+            source.append("        int col = ell_coords[offset]; \n");
+            // address d_mat according to transposition and storage order of the dense factor:
+            if (B_transposed && B_row_major)
+              source.append("      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        col * d_mat_col_inc                        ] * val; \n");
+            else if (B_transposed && !B_row_major)
+              source.append("      sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc)                       + (d_mat_col_start +        col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
+            else if (!B_transposed && B_row_major)
+              source.append("      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + result_col * d_mat_col_inc                        ] * val; \n");
+            else
+              source.append("      sum += d_mat[ (d_mat_row_start +        col * d_mat_row_inc)                       + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
+            source.append("      } \n");
+
+            source.append("    } \n");
+
+            // CSR remainder part:
+            source.append("    uint col_begin = csr_rows[row_id]; \n");
+            source.append("    uint col_end   = csr_rows[row_id + 1]; \n");
+
+            source.append("    for(uint item_id = col_begin; item_id < col_end; item_id++) {  \n");
+            if (B_transposed && B_row_major)
+              source.append("      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start + csr_cols[item_id] * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
+            else if (B_transposed && !B_row_major)
+              source.append("      sum += d_mat[ (d_mat_row_start +        result_col * d_mat_row_inc)                       + (d_mat_col_start + csr_cols[item_id] * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
+            else if (!B_transposed && B_row_major)
+              source.append("      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) * d_mat_internal_cols +  d_mat_col_start +        result_col * d_mat_col_inc                        ] * csr_elements[item_id]; \n");
+            else
+              source.append("      sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc)                       + (d_mat_col_start +        result_col * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
+            source.append("    } \n");
+
+            // write the accumulated entry according to the layout of the result matrix:
+            if (C_row_major)
+              source.append("      result[ (result_row_start + row_id * result_row_inc) * result_internal_cols + result_col_start + result_col * result_col_inc ] = sum; \n");
+            else
+              source.append("      result[ (result_row_start + row_id * result_row_inc)                        + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = sum; \n");
+            source.append("   } \n");
+            source.append("  } \n");
+            source.append("} \n");
+          }
+        }
+
+        template <typename StringType>
+        void generate_hyb_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string)
+        {
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, false,  true);
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false,  true, false);
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false,  true,  true);
+
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, false,  true);
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true,  true, false);
+          detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true,  true,  true);
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for hyb_matrix. */
+        template <typename NumericT>
+        struct hyb_matrix
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_hyb_matrix";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              generate_hyb_vec_mul(source, numeric_string);
+              generate_hyb_matrix_dense_matrix_multiplication(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/ilu.hpp b/viennacl/linalg/opencl/kernels/ilu.hpp
new file mode 100644
index 0000000..bb561f7
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/ilu.hpp
@@ -0,0 +1,90 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/ilu.hpp
+ *  @brief OpenCL kernel file for incomplete LU factorization preconditioners */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+        template <typename StringType>
+        void generate_ilu_level_scheduling_substitute(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void level_scheduling_substitute( \n");
+          source.append("          __global const unsigned int * row_index_array, \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec, \n");
+          source.append("          unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int row  = get_global_id(0); \n");
+          source.append("                    row  < size; \n");
+          source.append("                    row += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    unsigned int eq_row = row_index_array[row]; \n");
+          source.append("    "); source.append(numeric_string); source.append(" vec_entry = vec[eq_row]; \n");
+          source.append("    unsigned int row_end = row_indices[row+1]; \n");
+
+          source.append("    for (unsigned int j = row_indices[row]; j < row_end; ++j) \n");
+          source.append("      vec_entry -= vec[column_indices[j]] * elements[j]; \n");
+
+          source.append("    vec[eq_row] = vec_entry; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for incomplete LU factorization preconditioners. */
+        template <class NumericT>
+        struct ilu
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_ilu";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // only generate for floating points (forces error for integers)
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_ilu_level_scheduling_substitute(source, numeric_string);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/matrix.hpp b/viennacl/linalg/opencl/kernels/matrix.hpp
new file mode 100644
index 0000000..1e22615
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/matrix.hpp
@@ -0,0 +1,932 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix.hpp
+ *  @brief Runtime generation of OpenCL kernels for matrix operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        /** @brief Enumeration for the scalar type in ambm-like operations */
+        enum ambm_scalar_type
+        {
+          VIENNACL_AMBM_NONE = 0, // vector does not exist/contribute
+          VIENNACL_AMBM_CPU,
+          VIENNACL_AMBM_GPU
+        };
+
+        /** @brief Configuration struct for generating OpenCL kernels for linear combinations of matrices */
+        struct ambm_config
+        {
+          // Defaults: ranged/strided addressing, row-major layout, host alpha, no C term.
+          ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {}
+
+          bool with_stride_and_range;   // if true, emitted indices honor the start/inc offsets of matrix ranges/slices
+          bool is_row_major;            // storage layout assumed by the emitted index expressions
+          std::string      assign_op;   // operator emitted between LHS and RHS ("=" or "+=")
+          ambm_scalar_type a;           // kind of scalar 'alpha' applied to operand B
+          ambm_scalar_type b;           // kind of scalar 'beta' applied to operand C; VIENNACL_AMBM_NONE drops the C term
+        };
+
+        // just returns the for-loop
+        /** @brief Emits the element-wise double loop of an am/ambm kernel body: A (=|+=) B times-or-over alpha [+ C times-or-over beta].
+         *
+         * @param cfg         Kernel configuration (layout, operand kinds, ranged vs. plain addressing)
+         * @param mult_alpha  true: multiply B by alpha; false: divide by alpha (inverse requested via options2 at runtime)
+         * @param mult_beta   true: multiply C by beta;  false: divide by beta  (inverse requested via options3 at runtime)
+         */
+        template <typename StringType>
+        void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta)
+        {
+          // Work distribution: each work-group sweeps one row (row-major) or one column (column-major),
+          // while the work-items of the group stride over the other index.
+          if (cfg.is_row_major)
+          {
+            source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+            source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+            source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+            source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+          }
+          else
+          {
+            source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+            source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+            source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+            source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+          }
+
+          if (cfg.with_stride_and_range)
+          {
+            // Ranged/strided addressing: every index is transformed via (idx * inc + start).
+            if (cfg.is_row_major)
+              source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
+            else
+              source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] ");
+            source.append(cfg.assign_op);
+            if (cfg.is_row_major)
+              source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
+            else
+              source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
+
+            if (mult_alpha)
+              source.append("* alpha ");
+            else
+              source.append("/ alpha ");
+            if (cfg.b != VIENNACL_AMBM_NONE)
+            {
+              if (cfg.is_row_major)
+                source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
+              else
+                source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
+              if (mult_beta)
+                source.append("* beta");
+              else
+                source.append("/ beta");
+            }
+          }
+          else
+          {
+            // Plain addressing: matrices start at the origin with unit increments.
+            if (cfg.is_row_major)
+              source.append("    A[row * A_internal_size2 + col] ");
+            else
+              source.append("    A[row + col * A_internal_size1] ");
+            source.append(cfg.assign_op);
+            if (cfg.is_row_major)
+              source.append(" B[row * B_internal_size2 + col] ");
+            else
+              source.append(" B[row + col * B_internal_size1] ");
+
+            if (mult_alpha)
+              source.append("* alpha ");
+            else
+              source.append("/ alpha ");
+            if (cfg.b != VIENNACL_AMBM_NONE)
+            {
+              if (cfg.is_row_major)
+                source.append("+ C[row * C_internal_size2 + col] ");
+              else
+                source.append("+ C[row + col * C_internal_size1] ");  // BUG FIX: column-major index must use C_internal_size1 (was C_internal_size2), consistent with A/B above
+              if (mult_beta)
+                source.append("* beta");
+              else
+                source.append("/ beta");
+            }
+          }
+          source.append("; \n");
+        }
+
+        /** @brief Generates one complete am/ambm kernel (name, parameter list, and body) for the given configuration.
+         *
+         * The kernel name is assembled from cfg: "am"/"ambm" (one/two operands), optional "_m" for "+=",
+         * plus "_cpu"/"_gpu" suffixes per scalar kind (e.g. ambm_m_gpu_cpu).
+         * Runtime flags: bit 0 of options2/options3 flips the sign of alpha/beta; bit 1 requests division
+         * instead of multiplication, handled by emitting all needed body variants and branching at runtime.
+         */
+        template <typename StringType>
+        void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg)
+        {
+          // ---- assemble the kernel name ----
+          source.append("__kernel void am");
+          if (cfg.b != VIENNACL_AMBM_NONE)
+            source.append("bm");
+          if (cfg.assign_op != "=")
+            source.append("_m");
+
+          if (cfg.a == VIENNACL_AMBM_CPU)
+            source.append("_cpu");
+          else if (cfg.a == VIENNACL_AMBM_GPU)
+            source.append("_gpu");
+
+          if (cfg.b == VIENNACL_AMBM_CPU)
+            source.append("_cpu");
+          else if (cfg.b == VIENNACL_AMBM_GPU)
+            source.append("_gpu");
+          // ---- parameter list: destination A (with full range info), scalar alpha, operand B ----
+          source.append("( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+          if (cfg.a == VIENNACL_AMBM_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
+          }
+          else if (cfg.a == VIENNACL_AMBM_GPU)
+          {
+            source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
+          }
+          source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+          source.append("  __global const "); source.append(numeric_string); source.append(" * B, \n");
+          source.append("  unsigned int B_start1, unsigned int B_start2, \n");
+          source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
+          source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2");
+
+          // ---- optional second operand: scalar beta and matrix C ----
+          if (cfg.b != VIENNACL_AMBM_NONE)
+          {
+            source.append(", \n\n");
+            if (cfg.b == VIENNACL_AMBM_CPU)
+            {
+              source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
+            }
+            else if (cfg.b == VIENNACL_AMBM_GPU)
+            {
+              source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
+            }
+            source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+            source.append("  __global const "); source.append(numeric_string); source.append(" * C, \n");
+            source.append("  unsigned int C_start1, unsigned int C_start2, \n");
+            source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
+            source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2 \n");
+          }
+          source.append(") { \n");
+
+          // ---- kernel body: load alpha (by value or from buffer) and apply the sign-flip bit ----
+          if (cfg.a == VIENNACL_AMBM_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+          }
+          else if (cfg.a == VIENNACL_AMBM_GPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+          }
+          source.append("  if (options2 & (1 << 0)) \n");
+          source.append("    alpha = -alpha; \n");
+          source.append(" \n");
+
+          // ---- same for beta, if a C term is present ----
+          if (cfg.b == VIENNACL_AMBM_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
+          }
+          else if (cfg.b == VIENNACL_AMBM_GPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+          }
+          if (cfg.b != VIENNACL_AMBM_NONE)
+          {
+            source.append("  if (options3 & (1 << 0)) \n");
+            source.append("    beta = -beta; \n");
+            source.append(" \n");
+          }
+          // ---- dispatch on the take-inverse bits: emit one loop variant per multiply/divide combination ----
+          // (when there is no C term, mult_beta is irrelevant and passed as true)
+          source.append("  if (options2 & (1 << 1)) { \n");
+          if (cfg.b != VIENNACL_AMBM_NONE)
+          {
+            source.append("    if (options3 & (1 << 1)) {\n");
+            generate_ambm_impl2(source, cfg, false, false);
+            source.append("    } else {\n");
+            generate_ambm_impl2(source, cfg, false, true);
+            source.append("    } \n");
+          }
+          else
+            generate_ambm_impl2(source, cfg, false, true);
+          source.append("  } else { \n");
+          if (cfg.b != VIENNACL_AMBM_NONE)
+          {
+            source.append("    if (options3 & (1 << 1)) {\n");
+            generate_ambm_impl2(source, cfg, true, false);
+            source.append("    } else {\n");
+            generate_ambm_impl2(source, cfg, true, true);
+            source.append("    } \n");
+          }
+          else
+            generate_ambm_impl2(source, cfg, true, true);
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Generates the complete family of am/ambm/ambm_m kernels for one storage layout.
+         *
+         *  Emission order is fixed: the two 'am' kernels (host/device alpha), then the four 'ambm'
+         *  kernels ("="), then the four 'ambm_m' kernels ("+=") over all host/device scalar combinations.
+         */
+        template <typename StringType>
+        void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          ambm_config cfg;
+          cfg.with_stride_and_range = true;
+          cfg.is_row_major = is_row_major;
+
+          ambm_scalar_type const scalar_kinds[2] = { VIENNACL_AMBM_CPU, VIENNACL_AMBM_GPU };
+
+          // am: single operand B, alpha either passed from the host or read from a GPU buffer
+          cfg.assign_op = "=";
+          cfg.b = VIENNACL_AMBM_NONE;
+          for (int i = 0; i < 2; ++i)
+          {
+            cfg.a = scalar_kinds[i];
+            generate_ambm_impl(source, numeric_string, cfg);
+          }
+
+          // ambm ("=") and ambm_m ("+="): both operands, all four host/device scalar combinations
+          char const * assign_ops[2] = { "=", "+=" };
+          for (int op = 0; op < 2; ++op)
+          {
+            cfg.assign_op = assign_ops[op];
+            for (int i = 0; i < 2; ++i)
+            {
+              for (int j = 0; j < 2; ++j)
+              {
+                cfg.a = scalar_kinds[i];
+                cfg.b = scalar_kinds[j];
+                generate_ambm_impl(source, numeric_string, cfg);
+              }
+            }
+          }
+        }
+
+        /** @brief Generates the 'assign_cpu' kernel, which sets every entry of (a sub-range of) matrix A
+         *  to a scalar 'alpha' passed from the host by value. */
+        template <typename StringType>
+        void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void assign_cpu( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+          source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
+          source.append("{ \n");
+          // The loop order matches the storage layout so that consecutive work-items of a group
+          // write to consecutive memory addresses.
+          if (is_row_major)
+          {
+            source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+            source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+            source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+            source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+            source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
+          }
+          else
+          {
+            source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+            source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+            source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+            source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+            source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
+          }
+          source.append("} \n");
+        }
+
+        /** @brief Generates the 'diagonal_assign_cpu' kernel, which writes a host-provided scalar 'alpha'
+         *  to the min(A_size1, A_size2) entries on the main diagonal of (a sub-range of) matrix A. */
+        template <typename StringType>
+        void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void diagonal_assign_cpu( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+          source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
+          source.append("{ \n");
+          // One-dimensional sweep: each work-item strides over the diagonal by the global size.
+          source.append("  for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
+          if (is_row_major)
+            source.append("    A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
+          else
+            source.append("    A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
+          source.append("} \n");
+        }
+
+        /** @brief Generates the 'element_op' kernel computing A = B op C element-wise on matrix ranges.
+         *
+         *  The runtime argument op_type selects the operation: 0 -> product, 1 -> division, 2 -> pow.
+         *  The pow branch is only emitted for float/double; for other numeric types op_type == 2
+         *  leaves A untouched (empty branch).
+         */
+        template <typename StringType>
+        void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void element_op( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
+          source.append("  unsigned int B_start1, unsigned int B_start2, \n");
+          source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
+          source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * C, \n");
+          source.append("  unsigned int C_start1, unsigned int C_start2, \n");
+          source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
+          source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2, \n");
+          source.append("  unsigned int op_type) \n"); //0: product, 1: division, 2: pow
+          source.append("{ \n");
+          if (is_row_major)
+          {
+            // Row-major: one row per work-group, work-items stride over the columns.
+            source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+            source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+            source.append("  if (op_type == 2) {");
+            if (numeric_string == "float" || numeric_string == "double")
+            {
+              source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+              source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+              source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+              source.append("        pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
+              source.append("            C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
+            }
+            source.append("  } else if (op_type == 1) {");
+            source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+            source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+            source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+            source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
+            source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
+            source.append("  } else if (op_type == 0) {");
+            source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+            source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+            source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+            source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
+            source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
+            source.append("  }");
+          }
+          else
+          {
+            // Column-major: one column per work-group, work-items stride over the rows.
+            source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+            source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+            source.append("  if (op_type == 2) {");
+            if (numeric_string == "float" || numeric_string == "double")
+            {
+              source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+              source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+              source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
+              source.append("          pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1], \n");
+              source.append("              C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]); \n");
+            }
+            source.append("  } else if (op_type == 1) {");
+            source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+            source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+            source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
+            source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] / \n");
+            source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
+            source.append("  } else if (op_type == 0) {");
+            source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+            source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+            source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = \n");
+            source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] * \n");
+            source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
+            source.append("  }");
+          }
+          source.append("} \n");
+        }
+
+
+        template <typename StringType>
+        void generate_fft(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          // naive fourier transform (quadratic complexity, use for reference only)
+          source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
+          source.append("                         __global "); source.append(numeric_string); source.append("2 *output, \n");
+          source.append("                         unsigned int size, \n");
+          source.append("                         unsigned int stride, \n");
+          source.append("                         unsigned int batch_num, \n");
+          source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
+          source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+          source.append(" \n");
+          source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+          source.append("        for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
+          source.append("            "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
+          source.append(" \n");
+          source.append("            for(unsigned int n = 0; n < size; n++) { \n");
+          source.append("                "); source.append(numeric_string); source.append("2 in = ");
+          if (is_row_major)
+            source.append("input[batch_id * stride + n]; \n"); //input index here
+          else
+            source.append("input[n * stride + batch_id]; \n"); //input index here
+          source.append(" \n");
+          source.append("                "); source.append(numeric_string); source.append(" sn, cs; \n");
+          source.append("                "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
+          source.append("                sn = sincos(arg, &cs); \n");
+          source.append(" \n");
+          source.append("                "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+          source.append("                f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
+          source.append("            } \n");
+          source.append(" \n");
+          if (is_row_major)
+            source.append("            output[batch_id * stride + k] = f; \n"); // output index here
+          else
+            source.append("            output[k * stride + batch_id] = f; \n"); // output index here
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+          source.append(" \n"); //////////////////////////////
+
+          source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
+          source.append("                         unsigned int s, \n");
+          source.append("                         unsigned int bit_size, \n");
+          source.append("                         unsigned int size, \n");
+          source.append("                         unsigned int stride, \n");
+          source.append("                         unsigned int batch_num, \n");
+          source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
+          source.append(" \n");
+          source.append("    unsigned int ss = 1 << s; \n");
+          source.append("    unsigned int half_size = size >> 1; \n");
+          source.append(" \n");
+          source.append("    "); source.append(numeric_string); source.append(" cs, sn; \n");
+          source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+          source.append(" \n");
+          source.append("    unsigned int glb_id = get_global_id(0); \n");
+          source.append("    unsigned int glb_sz = get_global_size(0); \n");
+
+          source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+          source.append("        for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
+          source.append("            unsigned int group = (tid & (ss - 1)); \n");
+          source.append("            unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
+
+          if (is_row_major)
+          {
+            source.append("            unsigned int offset = batch_id * stride + pos; \n");
+            source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
+            source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
+          }
+          else
+          {
+            source.append("            unsigned int offset = pos * stride + batch_id; \n");
+            source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
+            source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
+          }
+
+          source.append("            "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
+
+          source.append("            sn = sincos(arg, &cs); \n");
+
+          source.append("            "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+
+          source.append("            "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
+
+          if (is_row_major)
+            source.append("            input[offset + ss] = in1 - tmp; \n");//index
+          else
+            source.append("            input[offset + ss * stride] = in1 - tmp; \n");//index
+          source.append("            input[offset] = in1 + tmp; \n");//index
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+          source.append(" \n"); //////////////////////////////
+
+          source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
+          source.append("     v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
+          source.append("     v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
+          source.append("     v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
+          source.append("     v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
+          source.append("     v = (v >> 16) | (v << 16); \n");
+          source.append("  \n");
+          source.append("     v = v >> (32 - bit_size); \n");
+          source.append("  \n");
+          source.append("     return v; \n");
+          source.append(" } \n");
+
+          source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
+          source.append("                                 __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
+          source.append("                                 unsigned int bit_size, \n");
+          source.append("                                 unsigned int size, \n");
+          source.append("                                 unsigned int stride, \n");
+          source.append("                                 unsigned int batch_num, \n");
+          source.append("                                 "); source.append(numeric_string); source.append(" sign) { \n");
+
+          source.append("     unsigned int grp_id = get_group_id(0); \n");
+          source.append("     unsigned int grp_num = get_num_groups(0); \n");
+
+          source.append("     unsigned int lcl_sz = get_local_size(0); \n");
+          source.append("     unsigned int lcl_id = get_local_id(0); \n");
+          source.append("     const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+          source.append("     for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
+                  //unsigned int base_offset = stride * batch_id; \n");
+                  //copy chunk of global memory to local \n");
+          source.append("         for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
+          source.append("             unsigned int v = get_reorder_num(p, bit_size); \n");
+          if (is_row_major)
+            source.append("             lcl_input[v] = input[batch_id * stride + p]; \n"); //index
+          else
+            source.append("             lcl_input[v] = input[p * stride + batch_id]; \n"); //index
+          source.append("         } \n");
+
+          source.append("         barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+                  //performs Cooley-Tukey FFT on local array
+          source.append("         for(unsigned int s = 0; s < bit_size; s++) { \n");
+          source.append("             unsigned int ss = 1 << s; \n");
+
+          source.append("             "); source.append(numeric_string); source.append(" cs, sn; \n");
+
+          source.append("             for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
+          source.append("                 unsigned int group = (tid & (ss - 1)); \n");
+          source.append("                 unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
+
+          source.append("                 "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
+          source.append("                 "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
+
+          source.append("                 "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
+
+          source.append("                 sn = sincos(arg, &cs); \n");
+          source.append("                 "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+
+          source.append("                 "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
+
+          source.append("                 lcl_input[pos + ss] = in1 - tmp; \n");
+          source.append("                 lcl_input[pos] = in1 + tmp; \n");
+          source.append("             } \n");
+
+          source.append("             barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("         } \n");
+
+                  //copy local array back to global memory
+          source.append("         for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
+          if (is_row_major)
+            source.append("             input[batch_id * stride + p] = lcl_input[p]; \n");//index
+          else
+            source.append("             input[p * stride + batch_id] = lcl_input[p]; \n");//index
+          source.append("         } \n");
+          source.append("     } \n");
+          source.append(" } \n");
+
+          source.append(" \n"); //////////////////////////////
+
+          //
+          // Performs reordering of input data in bit-reversal order
+          // Probably it's better to do in host side,
+          //
+          source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
+          source.append("    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
+          source.append("    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
+          source.append("    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
+          source.append("    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
+          source.append("    v = (v >> 16) | (v << 16); \n");
+
+          source.append("    v = v >> (32 - bit_size); \n");
+
+          source.append("    return v; \n");
+          source.append("} \n");
+
+          source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
+          source.append("                          unsigned int bit_size, \n");
+          source.append("                          unsigned int size, \n");
+          source.append("                          unsigned int stride, \n");
+          source.append("                          int batch_num) { \n");
+
+          source.append("    unsigned int glb_id = get_global_id(0); \n");
+          source.append("    unsigned int glb_sz = get_global_size(0); \n");
+
+          source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+          source.append("        for(unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+          source.append("            unsigned int v = get_reorder_num_2(i, bit_size); \n");
+
+          source.append("            if(i < v) {\n");
+          if (is_row_major)
+          {
+            source.append("                "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
+            source.append("                input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
+            source.append("                input[batch_id * stride + v] = tmp; \n"); //index
+          }
+          else
+          {
+            source.append("                "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
+            source.append("                input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
+            source.append("                input[v * stride + batch_id] = tmp; \n"); //index
+          }
+          source.append("            } \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends the OpenCL source of the in-place LU factorization kernel "lu_factorize" to 'source'.
+          *
+          * @param source          Target string the kernel code is appended to
+          * @param numeric_string  Scalar type of the matrix as a string ("float" or "double")
+          * @param is_row_major    Selects row-major vs. column-major index arithmetic in the generated kernel
+          *
+          * NOTE(review): the generated kernel synchronizes via barrier(CLK_GLOBAL_MEM_FENCE), which only
+          * synchronizes work-items within a single work-group - presumably the kernel must be launched
+          * with exactly one work-group; confirm at the call site.
+          */
+        template <typename StringType>
+        void generate_lu(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void lu_factorize( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * matrix, \n");
+          source.append("          unsigned int matrix_rows, \n");
+          source.append("          unsigned int matrix_cols, \n");
+          source.append("          unsigned int matrix_internal_rows, \n");
+          source.append("          unsigned int matrix_internal_cols) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" temp; \n");
+
+          if (is_row_major)
+          {
+            source.append("  unsigned rowi; \n");
+            source.append("  unsigned rowk; \n");
+            source.append("  for (unsigned int i=1; i<matrix_rows; ++i) \n");
+            source.append("  { \n");
+            source.append("    rowi = i * matrix_internal_cols; \n");
+            source.append("    for (unsigned int k=0; k<i; ++k) \n");
+            source.append("    { \n");
+            source.append("      rowk = k * matrix_internal_cols; \n");
+            // only work-item 0 computes the multiplier L(i,k) = A(i,k) / A(k,k), stored in place:
+            source.append("      if (get_global_id(0) == 0) \n");
+            source.append("        matrix[rowi + k] /= matrix[rowk + k]; \n");
+
+            source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+            source.append("      temp = matrix[rowi + k]; \n");
+
+            //parallel subtraction:
+            // NOTE(review): j iterates over columns of row i but is bounded by matrix_rows, not
+            // matrix_cols (the column-major branch below uses matrix_cols) - only correct for
+            // square matrices; LU factorization presumably guarantees this, but confirm.
+            source.append("      for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
+            source.append("        matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
+          }
+          else
+          {
+            source.append("      for (unsigned int i=1; i<matrix_rows; ++i) \n");
+            source.append("      { \n");
+            source.append("        for (unsigned int k=0; k<i; ++k) \n");
+            source.append("        { \n");
+
+            source.append("          if (get_global_id(0) == 0) \n");
+            source.append("            matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
+
+            source.append("          barrier(CLK_GLOBAL_MEM_FENCE); \n");
+            source.append("          temp = matrix[i + k*matrix_internal_rows]; \n");
+
+            //parallel subtraction:
+            source.append("          for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
+            source.append("            matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
+          }
+          // closing braces of the generated kernel (inner k-loop, outer i-loop, kernel body);
+          // shared by both the row-major and column-major branches above:
+          source.append("   }");
+          source.append("  }");
+          source.append("}");
+        }
+
+
+        /** @brief Appends the OpenCL source of the scaled rank-1 update kernel A += alpha * vec1 * vec2^T to 'source'.
+          *
+          * The kernel is emitted as "scaled_rank1_update_cpu" (alpha passed by value) or
+          * "scaled_rank1_update_gpu" (alpha read from a device buffer), depending on 'alpha_on_cpu'.
+          * The generated kernel interprets 'options2' bit-wise: bit 0 flips the sign of alpha,
+          * bit 1 divides by alpha instead of multiplying.
+          *
+          * @param source          Target string the kernel code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double")
+          * @param is_row_major    Selects row-major vs. column-major indexing into A
+          * @param alpha_on_cpu    If true, alpha is a kernel value argument; otherwise a __global pointer
+          */
+        template <typename StringType>
+        void generate_scaled_rank1_update(StringType & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
+        {
+          source.append("__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append("cpu") : source.append("gpu"); source.append("( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("  unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+
+          if (alpha_on_cpu) {
+            source.append("  "); source.append(numeric_string); source.append(" val, \n");
+          } else {
+            source.append("  __global const "); source.append(numeric_string); source.append(" *val, \n");
+          }
+          source.append("  unsigned int options2, \n");
+
+          source.append("  __global const "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("  unsigned int start1, \n");
+          source.append("  unsigned int inc1, \n");
+          source.append("  unsigned int size1, \n");
+
+          source.append("  __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+          source.append("  unsigned int start2, \n");
+          source.append("  unsigned int inc2, \n");
+          source.append("  unsigned int size2) \n");
+          source.append("{ \n");
+
+          if (alpha_on_cpu) {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = val; \n");
+          } else {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = val[0]; \n");
+          }
+          // options2 bit 0: negate alpha
+          source.append("  if (options2 & (1 << 0)) \n");
+          source.append("    alpha = -alpha; \n");
+
+          // one work-group per set of rows, work-items within a group spread over columns:
+          source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+          source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+
+          source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
+          source.append("  { \n");
+          // the two lines below lack the usual trailing " \n"; harmless for compilation since
+          // each generated statement is terminated by ';', but inconsistent with the rest.
+          source.append("    "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];");
+          // options2 bit 1: divide by alpha instead of multiplying
+          source.append("    tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
+          source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
+          if (is_row_major)
+            source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
+          else
+            source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends the OpenCL source of the kernel "trans_vec_mul" computing result = A^T * v to 'source'.
+          *
+          * Row-major layout: each work-item computes one entry of 'result' with a sequential dot product
+          * (transposed access over a row-major A is column-contiguous, hence no reduction needed).
+          * Column-major layout: one work-group per result entry, partial sums in __local memory 'work'
+          * followed by a tree reduction.
+          *
+          * @param source          Target string the kernel code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double")
+          * @param is_row_major    Storage layout of A
+          */
+        template <typename StringType>
+        void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void trans_vec_mul( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("          unsigned int A_row_start, unsigned int A_col_start, \n");
+          source.append("          unsigned int A_row_inc, unsigned int A_col_inc, \n");
+          source.append("          unsigned int A_row_size, unsigned int A_col_size, \n");
+          source.append("          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * v, \n");
+          source.append("          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * work) \n");
+          source.append("{ \n");
+          if (is_row_major)
+          {
+            source.append("  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
+            source.append("  { \n");
+            source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+            source.append("    for (unsigned int col = 0; col < A_row_size; ++col) \n");
+            source.append("      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
+            source.append("    result[row * result_inc + result_start] = dot_prod; \n");
+          }
+          else
+          {
+            source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+            source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+            source.append("  unsigned int lid = get_local_id(0); \n");
+
+            source.append("  for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
+            source.append("  { \n");
+            source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+            source.append("    for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
+            source.append("      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
+            source.append("    work[lid] = dot_prod; \n");
+
+            // tree reduction in local memory.
+            // NOTE(review): the stride halving assumes a power-of-two work-group size, and there is
+            // no barrier between reading work[0] and the next row iteration's write to work[lid] -
+            // confirm the intended launch configuration makes both safe.
+            source.append("    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
+            source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+            source.append("      if(lid < stride) \n");
+            source.append("        work[lid] += work[lid+stride]; \n");
+            source.append("    } \n");
+
+            source.append("    if(lid == 0) \n");
+            source.append("      result[row * result_inc + result_start] = work[0]; \n");
+          }
+          // closes the generated row-loop; the kernel's closing brace follows (shared by both branches):
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends the OpenCL source of the in-place triangular solver kernel "triangular_substitute_inplace" to 'source'.
+          *
+          * The generated kernel solves A x = v in place (v is overwritten with the solution),
+          * where A is required to be square. 'options' is interpreted bit-wise:
+          * bit 0 - A has a unit diagonal (skip the division step),
+          * bit 1 - access A transposed,
+          * bit 2 - lower-triangular (forward) solve; otherwise upper-triangular (backward).
+          *
+          * @param source          Target string the kernel code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double")
+          * @param is_row_major    Storage layout of A
+          *
+          * NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes within a work-group -
+          * presumably a single work-group is launched; confirm at the call site.
+          */
+        template <typename StringType>
+        void generate_triangular_substitute_inplace(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void triangular_substitute_inplace( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("          unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("          unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("          unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * v, \n");
+          source.append("          unsigned int v_start, \n");
+          source.append("          unsigned int v_inc, \n");
+          source.append("          unsigned int v_size, \n");
+          source.append("          unsigned int options) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" temp; \n");
+          source.append("  unsigned int unit_diagonal_flag  = (options & (1 << 0)); \n");
+          source.append("  unsigned int transposed_access_A = (options & (1 << 1)); \n");
+          source.append("  unsigned int is_lower_solve      = (options & (1 << 2)); \n");
+          source.append("  unsigned int row; \n");
+          source.append("  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  \n");   //Note: A required to be square
+          source.append("  { \n");
+          // forward solve walks rows top-down, backward solve bottom-up:
+          source.append("    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
+          source.append("    if (!unit_diagonal_flag) \n");
+          source.append("    { \n");
+          source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+          // only work-item 0 divides by the diagonal entry:
+          source.append("      if (get_global_id(0) == 0) \n");
+          if (is_row_major)
+            source.append("        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+          else
+            source.append("        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
+          source.append("   } \n");
+
+          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+          source.append("    temp = v[row * v_inc + v_start]; \n");
+
+          // eliminate the solved entry from the remaining rows in parallel:
+          source.append("    for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
+          source.append("             elim < (is_lower_solve ? A_size1 : row); \n");
+          source.append("             elim += get_global_size(0)) \n");
+          if (is_row_major)
+          {
+            source.append("      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
+            source.append("                                                                : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))]; \n");
+          }
+          else
+          {
+            source.append("      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
+            source.append("                                                                : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)]; \n");
+          }
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends the OpenCL source of the kernel "vec_mul" computing result = A * v to 'source'.
+          *
+          * Mirror image of generate_trans_vec_mul(): for row-major A the reduction variant
+          * (work-group per row, __local tree reduction) is used, for column-major A the
+          * one-work-item-per-row sequential dot product.
+          *
+          * @param source          Target string the kernel code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double")
+          * @param is_row_major    Storage layout of A
+          */
+        template <typename StringType>
+        void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
+        {
+          source.append("__kernel void vec_mul( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("          unsigned int A_row_start, unsigned int A_col_start, \n");
+          source.append("          unsigned int A_row_inc, unsigned int A_col_inc, \n");
+          source.append("          unsigned int A_row_size, unsigned int A_col_size, \n");
+          source.append("          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * v, \n");
+          source.append("          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * work) \n");
+          source.append("{ \n");
+          if (is_row_major)
+          {
+            source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+            source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+            source.append("  unsigned int lid = get_local_id(0); \n");
+
+            source.append("  for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
+            source.append("  { \n");
+            source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+            source.append("    for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
+            source.append("      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
+            source.append("    work[lid] = dot_prod; \n");
+
+            // tree reduction in local memory.
+            // NOTE(review): assumes a power-of-two work-group size; see the matching note in
+            // generate_trans_vec_mul() regarding a barrier between row iterations.
+            source.append("    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
+            source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+            source.append("      if(lid < stride) \n");
+            source.append("        work[lid] += work[lid+stride]; \n");
+            source.append("    } \n");
+
+            source.append("    if(lid == 0) \n");
+            source.append("      result[row * result_inc + result_start] = work[0]; \n");
+
+          }
+          else
+          {
+            source.append("    for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
+            source.append("    { \n");
+            source.append("      "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+            source.append("      for (unsigned int col = 0; col < A_col_size; ++col) \n");
+            source.append("        dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
+            source.append("      result[row * result_inc + result_start] = dot_prod; \n");
+          }
+          // closes the generated row-loop; the kernel's closing brace follows (shared by both branches):
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        namespace detail
+        {
+          // Maps the storage-layout tag type to the suffix used in OpenCL program names ("row"/"col"):
+          inline std::string type_to_string(viennacl::row_major)    { return "row"; }
+          inline std::string type_to_string(viennacl::column_major) { return "col"; }
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
+        template <typename NumericT, typename F>
+        struct matrix
+        {
+          // Name of the OpenCL program holding all kernels generated below,
+          // e.g. "float_matrix_row" for matrix<float, row_major>:
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F());
+          }
+
+          // Generates and compiles all matrix kernels for 'ctx'. Idempotent per context:
+          // the program is built only on the first call for a given cl_context.
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+            bool is_row_major = viennacl::is_row_major<F>::value;
+
+            // NOTE(review): this function-local static map is not synchronized - presumably
+            // kernel initialization is single-threaded; confirm before using from multiple threads.
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // fully parametrized kernels:
+              generate_ambm(source, numeric_string, is_row_major);
+
+              // kernels with mostly predetermined skeleton:
+              generate_assign_cpu(source, numeric_string, is_row_major);
+              generate_diagonal_assign_cpu(source, numeric_string, is_row_major);
+              generate_element_op(source, numeric_string, is_row_major);
+              generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
+              generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
+              generate_trans_vec_mul(source, numeric_string, is_row_major);
+              generate_vec_mul(source, numeric_string, is_row_major);
+
+              // kernels available for floating-point types only:
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_fft(source, numeric_string, is_row_major);
+                generate_lu(source, numeric_string, is_row_major);
+                generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/matrix_element.hpp b/viennacl/linalg/opencl/kernels/matrix_element.hpp
new file mode 100644
index 0000000..d299118
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/matrix_element.hpp
@@ -0,0 +1,138 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_ELEMENT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_ELEMENT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_element.hpp
+ *  @brief OpenCL kernel file for element-wise matrix operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+        // Generates code for the element-wise operation A op funcname(B), e.g. A = sin(B),
+        // where A and B share the storage layout selected by is_row_major.
+        // 'op' is the assignment operator ("=", "+=", ...) and 'op_name' its suffix in the kernel name.
+        template <typename StringType>
+        void generate_matrix_unary_element_ops(StringType & source, std::string const & numeric_string,
+                                               std::string const & funcname, std::string const & op, std::string const & op_name, bool is_row_major)
+        {
+          source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("          unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("          unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("          unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
+
+          source.append("          __global const "); source.append(numeric_string); source.append(" * B, \n");
+          source.append("          unsigned int B_start1, unsigned int B_start2, \n");
+          source.append("          unsigned int B_inc1,   unsigned int B_inc2, \n");
+          source.append("          unsigned int B_internal_size1,  unsigned int B_internal_size2) { \n");
+
+          if (is_row_major)
+          {
+            // work-groups stride over rows, work-items within a group over columns:
+            source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+            source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+
+            source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
+            source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
+            source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] \n");
+            source.append("        "); source.append(op); source.append(" "); source.append(funcname); source.append("(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); \n");
+          }
+          else
+          {
+            // column-major: roles swapped so that consecutive work-items touch consecutive rows:
+            source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0); \n");
+            source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0); \n");
+
+            source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0)) \n");
+            source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0)) \n");
+            source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] \n");
+            source.append("        "); source.append(op); source.append(" "); source.append(funcname); source.append("(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); \n");
+          }
+          source.append("} \n");
+        }
+
+        // Convenience overload: emits only the plain-assignment variant ("funcname_assign");
+        // the "+="/"-=" variants are currently disabled.
+        template <typename StringType>
+        void generate_matrix_unary_element_ops(StringType & source, std::string const & numeric_string, std::string const & funcname, bool is_row_major)
+        {
+          generate_matrix_unary_element_ops(source, numeric_string, funcname, "=", "assign", is_row_major);
+          //generate_matrix_unary_element_ops(source, numeric_string, funcname, "+=", "plus", is_row_major);
+          //generate_matrix_unary_element_ops(source, numeric_string, funcname, "-=", "minus", is_row_major);
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for elementwise-operations such as element_sin() on/with dense matrix objects of type viennacl::matrix<>. */
+        template <typename NumericT, typename F>
+        struct matrix_element
+        {
+          // Name of the OpenCL program, e.g. "float_matrix_element_row":
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_element_" + detail::type_to_string(F());
+          }
+
+          // Generates and compiles all element-wise kernels for 'ctx'; built once per cl_context.
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            // NOTE(review): unsynchronized static map - presumably init happens single-threaded; confirm.
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+              bool is_row_major = viennacl::is_row_major<F>::value;
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // unary operations
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_matrix_unary_element_ops(source, numeric_string, "acos",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "asin",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "atan",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "ceil",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "cos",   is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "cosh",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "exp",   is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "fabs",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "floor", is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "log",   is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "log10", is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "sin",   is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "sinh",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "sqrt",  is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "tan",   is_row_major);
+                generate_matrix_unary_element_ops(source, numeric_string, "tanh",  is_row_major);
+              }
+              else
+              {
+                // integer types: only abs() is available
+                generate_matrix_unary_element_ops(source, numeric_string, "abs", is_row_major);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/matrix_prod.hpp b/viennacl/linalg/opencl/kernels/matrix_prod.hpp
new file mode 100644
index 0000000..e722b63
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/matrix_prod.hpp
@@ -0,0 +1,485 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_prod.hpp
+ *  @brief Runtime generation of OpenCL kernels for dense matrix-matrix products */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        /** @brief Appends an OpenCL kernel computing C = alpha * op(A) * op(B) + beta * C for dense matrices.
+          *
+          * The emitted kernel is named prod_XY, where X (for A) and Y (for B) is
+          * 'T' for a transposed operand and 'A' otherwise, yielding prod_AA,
+          * prod_AT, prod_TA and prod_TT.  The kernel tiles the product into
+          * 16x16 blocks held in __local memory; the local buffers are addressed
+          * with a row stride of block_size + 1 (i.e. 17, hence 272 entries) —
+          * presumably to avoid local-memory bank conflicts; confirm against the
+          * original tuning notes.
+          *
+          * @param source          String object the OpenCL source code is appended to
+          * @param numeric_string  Scalar type as a string (only "float" or "double" are generated by the caller)
+          * @param row_major_A     True if A uses row-major storage
+          * @param row_major_B     True if B uses row-major storage
+          * @param row_major_C     True if C uses row-major storage
+          * @param transpose_A     True if A enters the product transposed
+          * @param transpose_B     True if B enters the product transposed
+          */
+        template <typename StringType>
+        void generate_matrix_prod_blas3(StringType & source, std::string const & numeric_string,
+                                        bool row_major_A, bool row_major_B, bool row_major_C,
+                                        bool transpose_A, bool transpose_B)
+        {
+          //start OpenCL code:
+          source.append("__kernel void prod_");
+          if (transpose_A)
+            source.append("T");
+          else
+            source.append("A");
+          if (transpose_B)
+            source.append("T");
+          else
+            source.append("A");
+
+          source.append("( \n");
+          source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+          source.append("  __global const "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("  unsigned int A_row_start, \n");
+          source.append("  unsigned int A_col_start, \n");
+          source.append("  unsigned int A_row_inc, \n");
+          source.append("  unsigned int A_col_inc, \n");
+          source.append("  unsigned int A_row_size, \n");   //number of elements starting from row_start!
+          source.append("  unsigned int A_col_size, \n");
+          source.append("  unsigned int A_internal_rows, \n");
+          source.append("  unsigned int A_internal_cols, \n");
+
+          source.append("  __global const "); source.append(numeric_string); source.append(" * B,   \n");
+          source.append("  unsigned int B_row_start, \n");
+          source.append("  unsigned int B_col_start, \n");
+          source.append("  unsigned int B_row_inc, \n");
+          source.append("  unsigned int B_col_inc, \n");
+          source.append("  unsigned int B_row_size, \n");
+          source.append("  unsigned int B_col_size, \n");
+          source.append("  unsigned int B_internal_rows, \n");
+          source.append("  unsigned int B_internal_cols, \n");
+
+          source.append("  "); source.append(numeric_string); source.append(" beta, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * C, \n");
+          source.append("  unsigned int C_row_start, \n");
+          source.append("  unsigned int C_col_start, \n");
+          source.append("  unsigned int C_row_inc, \n");
+          source.append("  unsigned int C_col_inc, \n");
+          source.append("  unsigned int C_row_size, \n");
+          source.append("  unsigned int C_col_size, \n");
+          source.append("  unsigned int C_internal_rows, \n");
+          source.append("  unsigned int C_internal_cols)  \n");
+          source.append("{  \n");
+
+          // local tiles of op(A) and op(B); padded row stride of 17 (block_size + 1)
+          source.append("  __local "); source.append(numeric_string); source.append(" bufA[272]; \n"); // 16 * 17
+          source.append("  __local "); source.append(numeric_string); source.append(" bufB[272]; \n"); // 16 * 17
+
+          source.append("  size_t block_size = 16; \n"); //get_local_size(0);
+
+          source.append("  size_t row_block_id = get_group_id(0); \n");
+          source.append("  size_t col_block_id = get_group_id(1); \n");
+          source.append("  size_t row_thread_id = get_local_id(0); \n");
+          source.append("  size_t col_thread_id = get_local_id(1); \n");
+
+          //traverse block row of A (taking mem layout and transpose operation into account)
+          // aBegin: linear index of the first element of this work-group's block row of op(A);
+          // aStep: linear distance between consecutive 16-wide blocks along the reduction dimension.
+          if (row_major_A && transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n");
+            source.append("  size_t aStep = block_size * A_row_inc * A_internal_cols; \n");
+          }
+          else if (row_major_A && !transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n");
+            source.append("  size_t aStep = block_size * A_col_inc; \n");
+          }
+          else if (!row_major_A && transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n");
+            source.append("  size_t aStep = block_size * A_row_inc; \n");
+          }
+          else if (!row_major_A && !transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n");
+            source.append("  size_t aStep = block_size * A_col_inc * A_internal_rows; \n");
+          }
+
+
+          // same for the block column of op(B):
+          if (row_major_B && transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n");
+            source.append("  size_t bStep = block_size * B_col_inc; \n");
+          }
+          else if (row_major_B && !transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n");
+            source.append("  size_t bStep = block_size * B_internal_cols * B_row_inc; \n");
+          }
+          else if (!row_major_B && transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n");
+            source.append("  size_t bStep = block_size * B_internal_rows * B_col_inc; \n");
+          }
+          else if (!row_major_B && !transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n");
+            source.append("  size_t bStep = block_size * B_row_inc; \n");
+          }
+
+
+          // number of 16-wide blocks along the reduction dimension (rounded up)
+          if (transpose_A)
+            source.append("  size_t block_num = (A_row_size + block_size - 1) / block_size; \n");
+          else
+            source.append("  size_t block_num = (A_col_size + block_size - 1) / block_size; \n");
+
+          source.append("  "); source.append(numeric_string); source.append(" Csub = 0; \n");
+
+          //offset of the the memory access by the thread relative to the beginning of the block:
+          if (row_major_A)
+            source.append("  size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; \n");
+          else
+            source.append("  size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; \n");
+
+          if (row_major_B)
+            source.append("  size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; \n");
+          else
+            source.append("  size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows; \n");
+
+          // padded local-row stride (block_size + 1 = 17)
+          source.append("  size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); \n");
+          source.append("  size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); \n");
+
+          source.append("  for (size_t block = 0; \n");
+          source.append("           block < block_num; \n");
+          source.append("           ++block) \n");
+          source.append("  { \n");
+
+          //read block from A and check for access within matrix:
+          // out-of-range lanes load 0, so the partial dot products of edge blocks stay correct
+
+          if (transpose_A && row_major_A)
+            source.append("    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n");
+          else if (transpose_A && !row_major_A)
+            source.append("    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n");
+          else if (!transpose_A && row_major_A)
+            source.append("    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n");
+          else if (!transpose_A && !row_major_A)
+            source.append("    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n");
+
+
+          if (transpose_B && row_major_B)
+            source.append("    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n");
+          else if (transpose_B && !row_major_B)
+            source.append("    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n");
+          else if (!transpose_B && row_major_B)
+            source.append("    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n");
+          else if (!transpose_B && !row_major_B)
+            source.append("    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n");
+
+          //computation of block-matrix-matrix product is the same for all cases:
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          //loop unrolling:
+          source.append("    __local "); source.append(numeric_string); source.append(" * bufAptr = bufA + row_thread_id_times_block_size; \n");
+          source.append("    __local "); source.append(numeric_string); source.append(" * bufBptr = bufB + col_thread_id_times_block_size; \n");
+
+          // host-side unroll: emits the 16-term inner product as straight-line code
+          for (size_t unroll = 0; unroll < 16; ++unroll) {
+            source.append("      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; \n");
+          }
+
+          // second barrier: the tiles must not be overwritten before all threads finished reading
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    aBegin += aStep; \n");
+          source.append("    bBegin += bStep; \n");
+          source.append("  } \n");
+
+
+          // write-back guard: only threads mapping to a valid (row, col) entry of C store a result
+          if (transpose_A)
+          {
+            source.append("  if (get_global_id(0) < A_col_size && ");
+          }
+          else
+          {
+            source.append("  if (get_global_id(0) < A_row_size && ");
+          }
+
+          if (transpose_B)
+          {
+            source.append("get_global_id(1) < B_row_size) \n");
+          }
+          else
+          {
+            source.append("get_global_id(1) < B_col_size) \n");
+          }
+
+          // beta == 0 is special-cased so that a possibly uninitialized C is never read
+          if (row_major_C)
+          {
+            source.append("    C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start]; \n");
+          }
+          else
+          {
+            source.append("    C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows]; \n");
+          }
+          source.append("} \n");
+        }
+
+        /** @brief Appends an alternative OpenCL kernel (named prod16_XY) computing C = alpha * op(A) * op(B) + beta * C.
+          *
+          * Variant of generate_matrix_prod_blas3() in which each thread
+          * accumulates a 16-entry column vector cv[] of results; only blocks of
+          * op(A) are staged in __local memory (As, 256 entries), while op(B) is
+          * streamed directly from __global memory.  The kernel name suffix uses
+          * 'T' for a transposed operand and 'A' otherwise, as in the prod_ kernels.
+          *
+          * NOTE(review): the indexing below (i*4 + row_thread_id, 16*row_thread_id
+          * + col_thread_id, and the 64-column group offsets) appears to assume a
+          * local work size of (16, 4) at enqueue time — confirm at the call site.
+          *
+          * @param source          String object the OpenCL source code is appended to
+          * @param numeric_string  Scalar type as a string (only "float" or "double" are generated by the caller)
+          * @param row_major_A     True if A uses row-major storage
+          * @param row_major_B     True if B uses row-major storage
+          * @param row_major_C     True if C uses row-major storage
+          * @param transpose_A     True if A enters the product transposed
+          * @param transpose_B     True if B enters the product transposed
+          */
+        template <typename StringType>
+        void generate_matrix_prod16_blas3(StringType & source, std::string const & numeric_string,
+                                        bool row_major_A, bool row_major_B, bool row_major_C,
+                                        bool transpose_A, bool transpose_B)
+        {
+          //vcl_size_t vector_size =  4;
+          vcl_size_t block_size  = 16;
+
+          //start OpenCL code:
+          source.append("__kernel void prod16_");
+          if (transpose_A)
+            source.append("T");
+          else
+            source.append("A");
+          if (transpose_B)
+            source.append("T");
+          else
+            source.append("A");
+
+          source.append("( "); source.append(numeric_string); source.append(" alpha, \n");
+          source.append("   __global const "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("   unsigned int A_row_start, \n");
+          source.append("   unsigned int A_col_start, \n");
+          source.append("   unsigned int A_row_inc, \n");
+          source.append("   unsigned int A_col_inc, \n");
+          source.append("   unsigned int A_row_size, \n");   //number of elements starting from row_start, using an increment of A_row_inc
+          source.append("   unsigned int A_col_size, \n");
+          source.append("   unsigned int A_internal_rows, \n");
+          source.append("   unsigned int A_internal_cols, \n");
+          source.append("   __global const "); source.append(numeric_string); source.append(" * B,   \n");
+          source.append("   unsigned int B_row_start, \n");
+          source.append("   unsigned int B_col_start, \n");
+          source.append("   unsigned int B_row_inc, \n");
+          source.append("   unsigned int B_col_inc, \n");
+          source.append("   unsigned int B_row_size, \n");
+          source.append("   unsigned int B_col_size, \n");
+          source.append("   unsigned int B_internal_rows, \n");
+          source.append("   unsigned int B_internal_cols, \n");
+          source.append("   "); source.append(numeric_string); source.append(" beta, \n");
+          source.append("   __global "); source.append(numeric_string); source.append(" * C, \n");
+          source.append("   unsigned int C_row_start, \n");
+          source.append("   unsigned int C_col_start, \n");
+          source.append("   unsigned int C_row_inc, \n");
+          source.append("   unsigned int C_col_inc, \n");
+          source.append("   unsigned int C_row_size, \n");
+          source.append("   unsigned int C_col_size, \n");
+          source.append("   unsigned int C_internal_rows, \n");
+          source.append("   unsigned int C_internal_cols)  \n");
+          source.append("{  \n");
+          // note: the literal 16s (and 64s) emitted below must stay in sync with block_size
+          source.append("  size_t row_block_id = get_group_id(1); \n");    //refers to the row index in op(A), op(B)
+          source.append("  size_t col_block_id = get_group_id(0); \n");    //refers to the col index in op(A), op(B)
+          source.append("  size_t row_thread_id = get_local_id(1); \n");
+          source.append("  size_t col_thread_id = get_local_id(0); \n");
+
+          source.append("  __local "); source.append(numeric_string); source.append(" As[256]; \n");
+
+          // per-thread result accumulator, emitted as "cv[16] = {0,0,...,0};"
+          source.append("  "); source.append(numeric_string); source.append(" cv[16] = {");
+          for (vcl_size_t i=0; i<block_size-1; ++i)
+            source.append("0,");
+          source.append("0}; \n");
+
+          //traverse block row of A (taking mem layout and transpose operation into account)
+          // aBegin/aStep as in generate_matrix_prod_blas3; aEnd additionally bounds the main loop
+          if (row_major_A && transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n");
+            source.append("  size_t aStep = 16 * A_internal_cols * A_row_inc; \n");
+            source.append("  size_t aEnd = aBegin + A_internal_cols * A_row_inc * A_row_size; \n");
+          }
+          else if (row_major_A && !transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n");
+            source.append("  size_t aStep = 16 * A_col_inc; \n");
+            source.append("  size_t aEnd = aBegin + A_col_inc * A_col_size; \n");
+          }
+          else if (!row_major_A && transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n");
+            source.append("  size_t aStep = 16 * A_row_inc; \n");
+            source.append("  size_t aEnd = aBegin + A_row_inc * A_row_size; \n");
+          }
+          else if (!row_major_A && !transpose_A)
+          {
+            source.append("  size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n");
+            source.append("  size_t aStep = 16 * A_internal_rows * A_col_inc; \n");
+            source.append("  size_t aEnd = aBegin + A_internal_rows * A_col_inc * A_col_size; \n");
+          }
+
+
+          // each work-group covers 64 columns of op(B) (hence the factor 64 here vs. 16 above)
+          if (row_major_B && transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n");
+            source.append("  size_t bStep = 16 * B_col_inc; \n");
+          }
+          else if (row_major_B && !transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n");
+            source.append("  size_t bStep = 16 * B_row_inc * B_internal_cols; \n");
+          }
+          else if (!row_major_B && transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n");
+            source.append("  size_t bStep = 16 * B_col_inc * B_internal_rows; \n");
+          }
+          else if (!row_major_B && !transpose_B)
+          {
+            source.append("  size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n");
+            source.append("  size_t bStep = 16 * B_row_inc; \n");
+          }
+
+          // NOTE(review): unlike the prod_ kernels there is no out-of-range guard here,
+          // so this kernel presumably requires matrix sizes that are multiples of the
+          // block dimensions — confirm at the dispatch site.
+          source.append("  for(size_t a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep) {  \n");
+
+          // copy blocks of op(A) to shared memory (op(A) is column-major in shared memory then)
+          source.append("    for(size_t i = 0; i < 4; i++)   \n");
+          if (row_major_A && transpose_A)
+            source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_col_inc * (i * 4 + row_thread_id) + A_internal_cols * A_row_inc * col_thread_id]);");
+          else if (row_major_A && !transpose_A)
+            source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_cols * A_row_inc * (i * 4 + row_thread_id) + A_col_inc * col_thread_id]);");
+          else if (!row_major_A && transpose_A)
+            source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_rows * A_col_inc * (i * 4 + row_thread_id) + A_row_inc * col_thread_id]);");
+          else if (!row_major_A && !transpose_A)
+            source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_row_inc * (i * 4 + row_thread_id) + A_internal_rows * A_col_inc * col_thread_id]);");
+
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE);  \n");
+
+          // initialize memory pointers
+          // each thread reads its own column of op(B), selected by 16*row_thread_id + col_thread_id (0..63)
+          source.append("    __local  "); source.append(numeric_string); source.append(" *ap = As;  \n");
+          if (row_major_B && transpose_B)
+          {
+            source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc * B_internal_cols);  \n");
+          }
+          else if (row_major_B && !transpose_B)
+          {
+            source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc);  \n");
+          }
+          else if (!row_major_B && transpose_B)
+          {
+            source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc);  \n");
+          }
+          else if (!row_major_B && !transpose_B)
+          {
+            source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc * B_internal_rows);  \n");
+          }
+
+          // run computations
+          // rank-1 updates: each of the 16 entries of op(B)'s column contributes to all of cv[]
+          source.append("    for(size_t i = 0; i < 16; i++) {  \n");
+          if (row_major_B && transpose_B)
+          {
+            source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc];  \n");
+          }
+          else if (row_major_B && !transpose_B)
+          {
+            source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc * B_internal_cols];  \n");
+          }
+          else if (!row_major_B && transpose_B)
+          {
+            source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc * B_internal_rows];  \n");
+          }
+          else if (!row_major_B && !transpose_B)
+          {
+            source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc];  \n");
+          }
+
+          source.append("      for(size_t k = 0; k < 16; k++)   \n");
+          source.append("	    cv[k] += ap[k] * bv;  \n");
+
+          source.append("      ap += 16;  \n");
+          source.append("    }  \n");
+
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE);  \n");
+          source.append("  }  \n");
+
+          // write to C
+          // beta == 0 is special-cased below so that a possibly uninitialized C is never read
+          if (row_major_C)
+          {
+              source.append("  int c = C_internal_cols * (C_row_inc * 16 * row_block_id + C_row_start) + 64 * C_col_inc * col_block_id + C_col_start  \n");  //block column index
+              source.append("          + C_col_inc * (16 * row_thread_id + col_thread_id);  \n");
+          }
+          else
+          {
+              source.append("  int c = C_row_inc * 16 * row_block_id + C_row_start + (64 * C_col_inc * col_block_id + C_col_start) * C_internal_rows  \n");   // block column index
+              source.append("          + C_internal_rows * C_col_inc * (16 * row_thread_id + col_thread_id);  \n");
+          }
+
+          source.append("  for(size_t i = 0; i < 16; i++) {  \n");
+
+          if (row_major_C)
+          {
+            source.append("    C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c];  \n");
+            source.append("      c += C_internal_cols * C_row_inc;  \n");
+          }
+          else
+          {
+            source.append("    C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c];  \n");
+            source.append("      c += C_row_inc;  \n");
+          }
+
+          source.append("  }  \n");
+          source.append("}  \n");
+
+        }
+
+
+        // main kernel class
+        /** @brief Main kernel class for the generation of matrix-matrix product kernels C = A * B
+          *
+          * Compiles eight kernels per (NumericT, F_A, F_B, F_C) combination: the
+          * four prod_ and four prod16_ transpose variants.
+          *
+          * @tparam NumericT  Underlying scalar type (float or double)
+          * @tparam F_A  Row/Column majority tag for A
+          * @tparam F_B  Row/Column majority tag for B
+          * @tparam F_C  Row/Column majority tag for C
+          */
+        template <class NumericT, typename F_A, typename F_B, typename F_C>
+        struct matrix_prod
+        {
+          /** @brief Unique OpenCL program name encoding the scalar type and the three layout tags. */
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_prod_" + detail::type_to_string(F_A()) + detail::type_to_string(F_B()) + detail::type_to_string(F_C());
+          }
+
+          /** @brief Generates and compiles the program in ctx, once per OpenCL context. */
+          static void init(viennacl::ocl::context & ctx)
+          {
+            // rejects NumericT == double on devices without fp64 support (see ocl/utils.hpp)
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+            bool row_major_A = viennacl::is_row_major<F_A>::value;
+            bool row_major_B = viennacl::is_row_major<F_B>::value;
+            bool row_major_C = viennacl::is_row_major<F_C>::value;
+
+
+            // one-time guard, keyed by the raw cl_context handle
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              // emits "#pragma OPENCL EXTENSION ..." when NumericT is double
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // only generate for floating points (forces error for integers)
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false);
+                generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true);
+                generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false);
+                generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true);
+
+                generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false);
+                generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true);
+                generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false);
+                generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true);
+
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/matrix_solve.hpp b/viennacl/linalg/opencl/kernels/matrix_solve.hpp
new file mode 100644
index 0000000..9ab1c12
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/matrix_solve.hpp
@@ -0,0 +1,212 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_SOLVE_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_solve.hpp
+ *  @brief OpenCL kernel file for dense matrix solves with multiple right hand side (BLAS level 3) */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        /** @brief Appends an OpenCL triangular-solve kernel (multiple right hand sides, BLAS level 3) to 'source'.
+          *
+          * The kernel name is assembled from the flags, e.g. 'trans_unit_upper_trans_solve'.
+          * Generated-kernel work decomposition (from the indexing below): one work-group handles one
+          * column of the right hand side B (get_group_id(0) selects it), and the work-items of the
+          * group eliminate the remaining entries of that column in parallel.
+          *
+          * @param source          String object the OpenCL code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double")
+          * @param row_major_A     True if the system matrix A is stored row-major
+          * @param row_major_B     True if the right hand side matrix B is stored row-major
+          * @param transpose_A     True if the solve uses op(A) = A^T
+          * @param transpose_B     True if the solve uses op(B) = B^T
+          * @param upper_solve     True for an upper-triangular solve (rows are processed back to front)
+          * @param unit_diagonal   True if the diagonal of A is implicitly 1 (division step omitted)
+          */
+        template <typename StringType>
+        void generate_matrix_solve_blas3(StringType & source, std::string const & numeric_string,
+                                         bool row_major_A, bool row_major_B,
+                                         bool transpose_A, bool transpose_B,
+                                         bool upper_solve, bool unit_diagonal)
+        {
+          //start OpenCL code:
+          // kernel name encodes the variant: [trans_][unit_](upper_|lower_)[trans_]solve
+          source.append("__kernel void ");
+          if (transpose_A)
+            source.append("trans_");
+          if (unit_diagonal)
+            source.append("unit_");
+          if (upper_solve)
+            source.append("upper_");
+          else
+            source.append("lower_");
+          if (transpose_B)
+            source.append("trans_");
+          source.append("solve");
+
+          // kernel signature: both matrices are passed with full start/increment/size/padding metadata
+          source.append("( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
+          source.append("          unsigned int A_start1, unsigned int A_start2, \n");
+          source.append("          unsigned int A_inc1,   unsigned int A_inc2, \n");
+          source.append("          unsigned int A_size1,  unsigned int A_size2, \n");
+          source.append("          unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * B, \n");
+          source.append("          unsigned int B_start1, unsigned int B_start2, \n");
+          source.append("          unsigned int B_inc1,   unsigned int B_inc2, \n");
+          source.append("          unsigned int B_size1,  unsigned int B_size2, \n");
+          source.append("          unsigned int B_internal_size1, unsigned int B_internal_size2) { \n");
+          source.append("  "); source.append(numeric_string); source.append(" temp;  \n");
+          // row loop: upper solves run bottom-up (back substitution), lower solves run top-down
+          if (upper_solve)
+          {
+            //Note: A is square, thus A_rows == A_cols and no dispatch for transposedness needed
+            source.append("  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)  \n");
+            source.append("  {  \n");
+            source.append("    unsigned int row = A_size1 - 1 - row_cnt; \n");
+          }
+          else //lower triangular solve
+          {
+            source.append("  for (unsigned int row = 0; row < A_size1; ++row) \n");
+            source.append("  { \n");
+          }
+
+          // step 1 (skipped for unit diagonal): divide the pivot entry of B by the diagonal of A;
+          // done by work-item 0 only, fenced so all work-items see the updated value afterwards
+          if (!unit_diagonal)
+          {
+            source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+            source.append("    if (get_local_id(0) == 0)  \n");
+            //Note: A is square, thus A_internal_rows == A_internal_cols and no dispatch for transposedness needed
+            if (row_major_B && transpose_B)
+              source.append("      B[(get_group_id(0) * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= ");
+            else if (row_major_B && !transpose_B)
+              source.append("      B[(row * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)] /= ");
+            else if (!row_major_B && transpose_B)
+              source.append("      B[(get_group_id(0) * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= ");
+            else if (!row_major_B && !transpose_B)
+              source.append("      B[(row * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1] /= ");
+
+            if (row_major_A)
+              source.append("A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+            else
+              source.append("A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; \n");
+          }
+
+          source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+          // step 2: each work-item loads the (now final) pivot value of its column of B
+          if (row_major_B && transpose_B)
+            source.append("    temp = B[(get_group_id(0) * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)]; \n");
+          else if (row_major_B && !transpose_B)
+            source.append("    temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)]; \n");
+          else if (!row_major_B && transpose_B)
+            source.append("    temp = B[(get_group_id(0) * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1]; \n");
+          else if (!row_major_B && !transpose_B)
+            source.append("    temp = B[(row * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1]; \n");
+
+          // step 3: rank-1 update — subtract temp * column entries of op(A) from the remaining
+          // rows of B; the elimination range depends on upper vs. lower solve
+          source.append("    //eliminate column of op(A) with index 'row' in parallel: \n");
+          if (upper_solve)
+            source.append("    for  (unsigned int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n");
+          else
+            source.append("    for  (unsigned int elim = row + get_local_id(0) + 1; elim < A_size1; elim += get_local_size(0)) \n");
+
+          if (row_major_B && transpose_B)
+            source.append("      B[(get_group_id(0) * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * ");
+          else if (row_major_B && !transpose_B)
+            source.append("      B[(elim * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)] -= temp * ");
+          else if (!row_major_B && transpose_B)
+            source.append("      B[(get_group_id(0) * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * ");
+          else if (!row_major_B && !transpose_B)
+            source.append("      B[(elim * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1] -= temp * ");
+
+          if (row_major_A && transpose_A)
+            source.append("A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)]; \n");
+          else if (row_major_A && !transpose_A)
+            source.append("A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+          else if (!row_major_A && transpose_A)
+            source.append("A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1]; \n");
+          else if (!row_major_A && !transpose_A)
+            source.append("A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
+
+          source.append("   } \n");
+          source.append("} \n");
+        }
+
+
+        // main kernel class
+        /** @brief Main kernel class for the generation of matrix solve kernels.
+          *
+          * Generates all 16 variants of generate_matrix_solve_blas3
+          * (transpose_A x transpose_B x upper/lower x unit/non-unit diagonal).
+          *
+          * @tparam NumericT  Scalar type (float or double)
+          * @tparam F1  Row/Column majority tag for the system matrix
+          * @tparam F2  Row/Column majority tag for the right hand side matrix
+          */
+        template <class NumericT, typename F1, typename F2>
+        struct matrix_solve
+        {
+          /** @brief Program name, unique per scalar type and layout-tag combination. */
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_solve_" + detail::type_to_string(F1()) + detail::type_to_string(F2());
+          }
+
+          /** @brief Compiles the kernel program for 'ctx' on first call; subsequent calls for the same context are no-ops. */
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+            bool matrix_row_major = viennacl::is_row_major<F1>::value;
+            bool rhs_row_major    = viennacl::is_row_major<F2>::value;
+
+
+            // NOTE(review): unsynchronized function-local static — assumes init() is not called
+            // concurrently from multiple threads for the same NumericT/F1/F2; confirm upstream.
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // only generate for floating points (forces error for integers)
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                // all combinations of (transpose_A, transpose_B, upper_solve, unit_diagonal):
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, false, false, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, false, false, true);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, false, true, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, false, true, true);
+
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, true, false, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, true, false, true);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, true, true, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            false, true, true, true);
+
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, false, false, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, false, false, true);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, false, true, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, false, true, true);
+
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, true, false, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, true, false, true);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, true, true, false);
+                generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+                                            true, true, true, true);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/nmf.hpp b/viennacl/linalg/opencl/kernels/nmf.hpp
new file mode 100644
index 0000000..a585668
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/nmf.hpp
@@ -0,0 +1,82 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_NMF_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_NMF_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/nmf.hpp
+ *  @brief OpenCL kernel file for nonnegative matrix factorization */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        /** @brief Appends the NMF update kernel: matrix1[i] <- matrix1[i] * matrix2[i] / matrix3[i] (element-wise).
+          *
+          * Uses a global-size-strided loop so any global work size covers all 'size' elements.
+          * The division is guarded: if the divisor is <= 1e-5 the result is set to 0 instead,
+          * avoiding division by (near-)zero in the multiplicative update.
+          *
+          * @param source          String object the OpenCL code is appended to
+          * @param numeric_string  Scalar type as a string ("float" or "double")
+          */
+        template <typename StringType>
+        void generate_nmf_el_wise_mul_div(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void el_wise_mul_div( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * matrix1, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * matrix2, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * matrix3, \n");
+          source.append("          unsigned int size) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" val = matrix1[i] * matrix2[i]; \n");
+          source.append("    "); source.append(numeric_string); source.append(" divisor = matrix3[i]; \n");
+          // threshold 0.00001 guards against division by tiny/zero denominators
+          source.append("    matrix1[i] = (divisor > ("); source.append(numeric_string); source.append(")0.00001) ? (val / divisor) : ("); source.append(numeric_string); source.append(")0; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for nonnegative matrix factorization of a dense matrices.
+          *
+          * @tparam NumericT  Scalar type (float or double)
+          */
+        template <class NumericT>
+        struct nmf
+        {
+          /** @brief Program name, unique per scalar type. */
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_nmf";
+          }
+
+          /** @brief Compiles the NMF kernel program for 'ctx' on first call; later calls for the same context are no-ops. */
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            // NOTE(review): unsynchronized function-local static — assumes single-threaded init; confirm upstream.
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // only generate for floating points (forces error for integers)
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_nmf_el_wise_mul_div(source, numeric_string);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/scalar.hpp b/viennacl/linalg/opencl/kernels/scalar.hpp
new file mode 100644
index 0000000..a83cc71
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/scalar.hpp
@@ -0,0 +1,266 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/scalar.hpp
+ *  @brief OpenCL kernel file for scalar operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        /** @brief Enumeration for the scalar type in avbv-like operations */
+        enum asbs_scalar_type
+        {
+          VIENNACL_ASBS_NONE = 0, // scalar does not exist/contribute
+          VIENNACL_ASBS_CPU,      // scalar passed by value from the host
+          VIENNACL_ASBS_GPU       // scalar resides on the device (passed as a pointer)
+        };
+
+        /** @brief Configuration struct for generating OpenCL kernels for linear combinations of viennacl::scalar<> objects
+          *
+          * Describes a kernel of the form  s1 (assign_op) a * s2 [ +/- b * s3 ],
+          * where each of a and b may be a host scalar, a device scalar, or absent.
+          */
+        struct asbs_config
+        {
+          // defaults: host scalar 'a', no second operand 'b'
+          asbs_config() : with_stride_and_range(true), a(VIENNACL_ASBS_CPU), b(VIENNACL_ASBS_NONE) {}
+
+          bool with_stride_and_range;   // NOTE(review): set by callers but not read by the generators visible here — confirm whether used elsewhere
+          std::string      assign_op;   // assignment operator emitted into the kernel ("=" or "+=")
+          asbs_scalar_type a;           // kind of the first scalar factor (alpha)
+          asbs_scalar_type b;           // kind of the second scalar factor (beta), or NONE
+        };
+
+        // Appends the innermost assignment statement of the generated kernel:
+        //   *s1 <assign_op> <sign_a> *s2 (*|/) alpha [ <sign_b> *s3 (*|/) beta ];
+        // mult_alpha / mult_beta select multiplication vs. division by the scalar
+        // (division implements the 'take inverse' option bit handled by the caller).
+        template <typename StringType>
+        void generate_asbs_impl3(StringType & source, char sign_a, char sign_b, asbs_config const & cfg, bool mult_alpha, bool mult_beta)
+        {
+          source.append("      *s1 "); source.append(cfg.assign_op); source.append(1, sign_a); source.append(" *s2 ");
+          if (mult_alpha)
+            source.append("* alpha ");
+          else
+            source.append("/ alpha ");
+          if (cfg.b != VIENNACL_ASBS_NONE)
+          {
+            // second operand only present for asbs-style kernels
+            source.append(1, sign_b); source.append(" *s3 ");
+            if (mult_beta)
+              source.append("* beta");
+            else
+              source.append("/ beta");
+          }
+          source.append("; \n");
+        }
+
+        // Emits the runtime dispatch on the 'take inverse' flag (bit 1 of options2/options3):
+        // if set, the corresponding scalar divides instead of multiplies. Signs are fixed by the caller.
+        template <typename StringType>
+        void generate_asbs_impl2(StringType & source, char sign_a, char sign_b, asbs_config const & cfg)
+        {
+          source.append("    if (options2 & (1 << 1)) { \n");
+          if (cfg.b != VIENNACL_ASBS_NONE)
+          {
+            source.append("     if (options3 & (1 << 1)) \n");
+            generate_asbs_impl3(source, sign_a, sign_b, cfg, false, false);
+            source.append("     else \n");
+            generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
+          }
+          else
+            generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
+          source.append("    } else { \n");
+          if (cfg.b != VIENNACL_ASBS_NONE)
+          {
+            source.append("     if (options3 & (1 << 1)) \n");
+            generate_asbs_impl3(source, sign_a, sign_b, cfg, true, false);
+            source.append("     else \n");
+            generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
+          }
+          else
+            generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
+          source.append("    } \n");
+
+        }
+
+        /** @brief Appends one complete as/asbs scalar kernel for the given configuration.
+          *
+          * Kernel naming: "as"[+"bs" if b present][+"_s" for compound assignment]
+          * followed by "_cpu"/"_gpu" suffixes per scalar kind, e.g. "asbs_s_cpu_gpu".
+          * The options2/options3 bitfields are decoded at kernel runtime:
+          * bit 0 = flip sign, bit 1 = take inverse (divide instead of multiply).
+          */
+        template <typename StringType>
+        void generate_asbs_impl(StringType & source, std::string const & numeric_string, asbs_config const & cfg)
+        {
+          // --- kernel name ---
+          source.append("__kernel void as");
+          if (cfg.b != VIENNACL_ASBS_NONE)
+            source.append("bs");
+          if (cfg.assign_op != "=")
+            source.append("_s");
+
+          if (cfg.a == VIENNACL_ASBS_CPU)
+            source.append("_cpu");
+          else if (cfg.a == VIENNACL_ASBS_GPU)
+            source.append("_gpu");
+
+          if (cfg.b == VIENNACL_ASBS_CPU)
+            source.append("_cpu");
+          else if (cfg.b == VIENNACL_ASBS_GPU)
+            source.append("_gpu");
+          // --- kernel signature ---
+          source.append("( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * s1, \n");
+          source.append(" \n");
+          if (cfg.a == VIENNACL_ASBS_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
+          }
+          else if (cfg.a == VIENNACL_ASBS_GPU)
+          {
+            source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
+          }
+          source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+          source.append("  __global const "); source.append(numeric_string); source.append(" * s2");
+
+          if (cfg.b != VIENNACL_ASBS_NONE)
+          {
+            source.append(", \n\n");
+            if (cfg.b == VIENNACL_ASBS_CPU)
+            {
+              source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
+            }
+            else if (cfg.b == VIENNACL_ASBS_GPU)
+            {
+              source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
+            }
+            source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+            source.append("  __global const "); source.append(numeric_string); source.append(" * s3");
+          }
+          source.append(") \n{ \n");
+
+          // --- kernel body: load alpha (and beta) from the host argument or device pointer ---
+          if (cfg.a == VIENNACL_ASBS_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+          }
+          else if (cfg.a == VIENNACL_ASBS_GPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+          }
+          source.append(" \n");
+
+          if (cfg.b == VIENNACL_ASBS_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
+          }
+          else if (cfg.b == VIENNACL_ASBS_GPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+          }
+
+          // dispatch on the 'flip sign' bit (bit 0) for each scalar; the inner
+          // 'take inverse' dispatch (bit 1) is emitted by generate_asbs_impl2()
+          source.append("  if (options2 & (1 << 0)) { \n");
+          if (cfg.b != VIENNACL_ASBS_NONE)
+          {
+            source.append("   if (options3 & (1 << 0)) { \n");
+            generate_asbs_impl2(source, '-', '-', cfg);
+            source.append("   } else { \n");
+            generate_asbs_impl2(source, '-', '+', cfg);
+            source.append("   } \n");
+          }
+          else
+            generate_asbs_impl2(source, '-', '+', cfg);
+          source.append("  } else { \n");
+          if (cfg.b != VIENNACL_ASBS_NONE)
+          {
+            source.append("   if (options3 & (1 << 0)) { \n");
+            generate_asbs_impl2(source, '+', '-', cfg);
+            source.append("   } else { \n");
+            generate_asbs_impl2(source, '+', '+', cfg);
+            source.append("   } \n");
+          }
+          else
+            generate_asbs_impl2(source, '+', '+', cfg);
+
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends all as/asbs kernel variants (CPU/GPU scalar combinations, '=' and '+=' assignment) to 'source'. */
+        template <typename StringType>
+        void generate_asbs(StringType & source, std::string const & numeric_string)
+        {
+          asbs_config cfg;
+          cfg.assign_op = "=";
+          cfg.with_stride_and_range = true;
+
+          // as (single scalar factor, plain assignment)
+          cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+
+          // asbs (two scalar factors, plain assignment)
+          cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+
+          // asbs_s (two scalar factors, in-place add via '+=')
+          cfg.assign_op = "+=";
+
+          cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+        }
+
+        /** @brief Appends a kernel exchanging the values of two device scalars s1 and s2.
+          *
+          * NOTE(review): the generated kernel performs an unguarded three-step swap; it is
+          * presumably launched with a single work-item — confirm at the call site.
+          */
+        template <typename StringType>
+        void generate_scalar_swap(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void swap( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * s1, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * s2) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp = *s2; \n");
+          source.append("  *s2 = *s1; \n");
+          source.append("  *s1 = tmp; \n");
+          source.append("} \n");
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for operations involving viennacl::scalar<>, but not viennacl::vector<> or viennacl::matrix<>.
+          *
+          * @tparam TYPE  Scalar type (e.g. float or double)
+          */
+        template <class TYPE>
+        struct scalar
+        {
+          /** @brief Program name, unique per scalar type. */
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<TYPE>::apply() + "_scalar";
+          }
+
+          /** @brief Compiles the scalar kernel program for 'ctx' on first call; later calls for the same context are no-ops. */
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
+
+            // NOTE(review): unsynchronized function-local static — assumes single-threaded init; confirm upstream.
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
+
+              // fully parametrized kernels:
+              // (no float/double guard here, unlike matrix_solve/nmf — these kernels are generated for all types)
+              generate_asbs(source, numeric_string);
+              generate_scalar_swap(source, numeric_string);
+
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/spai.hpp b/viennacl/linalg/opencl/kernels/spai.hpp
new file mode 100644
index 0000000..ef6b986
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/spai.hpp
@@ -0,0 +1,614 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/spai.hpp
+ *  @brief OpenCL kernel file for sparse approximate inverse operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        /** @brief Appends OpenCL source for the 'assemble_blocks' kernel: gathers the
+         *  CSR sub-matrix A(I,J) for each SPAI block into the dense array com_A_I_J
+         *  (column-major, one dense block per index pair). numeric_string is spliced
+         *  in as the element type. */
+        template <typename StringType>
+        void generate_spai_assemble_blocks(StringType & source, std::string const & numeric_string)
+        {
+          // CSR lookup of entry (row, col); relies on sorted column indices per row
+          // for the early-exit 'column_indices[i] > col' branch.
+          // NOTE(review): the return type is hard-coded 'float' even when
+          // numeric_string is "double" — possible precision loss; confirm upstream.
+          source.append("float get_element(__global const unsigned int * row_indices, \n");
+          source.append("           __global const unsigned int * column_indices, \n");
+          source.append("           __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("           unsigned int row, \n");
+          source.append("           unsigned int col) \n");
+          source.append("{ \n");
+          source.append("  unsigned int row_end = row_indices[row+1]; \n");
+          source.append("  for(unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
+          source.append("    if(column_indices[i] == col) \n");
+          source.append("      return elements[i]; \n");
+          source.append("    if(column_indices[i] > col) \n");
+          source.append("      return 0; \n");
+          source.append("  } \n");
+          source.append("  return 0; \n");
+          source.append("} \n");
+
+          // Fills one row_n x col_n dense block from the index sets I and J.
+          source.append("void block_assembly(__global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global const unsigned int * matrix_dimensions, \n");
+          source.append("          __global const unsigned int * set_I, \n");
+          source.append("          __global const unsigned int * set_J, \n");
+          source.append("          unsigned int matrix_ind, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
+          source.append("{ \n");
+          source.append("  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
+          source.append("  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
+
+          source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
+                  //start row index
+          source.append("        for(unsigned int j = 0; j < row_n; j++){ \n");
+          source.append("          com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
+          source.append("        } \n");
+          source.append("      } \n");
+          source.append("} \n");
+
+          // Kernel entry point: one work-item per block (strided over get_global_size),
+          // skipping empty blocks and blocks whose g_is_update flag is 0.
+          source.append("__kernel void assemble_blocks( \n");
+          source.append("          __global const unsigned int * row_indices, \n");
+          source.append("          __global const unsigned int * column_indices, \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
+          source.append("          __global const unsigned int * set_I, \n");
+          source.append("        __global const unsigned int * set_J, \n");
+          source.append("      __global const unsigned int * i_ind, \n");
+          source.append("      __global const unsigned int * j_ind, \n");
+          source.append("        __global const unsigned int * block_ind, \n");
+          source.append("        __global const unsigned int * matrix_dimensions, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
+          source.append("      __global unsigned int * g_is_update, \n");
+          source.append("                   unsigned int  block_elems_num) \n");
+          source.append("{ \n");
+          source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+          source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+          source.append("            block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("  } \n");
+        }
+
+        /** @brief Appends OpenCL source for the 'block_bv_assembly' kernel: per block,
+         *  concatenates the beta vector g_bv (length matrix_dimensions[2i+1]) and its
+         *  update part g_bv_u (length matrix_dimensions_u[2i+1]) into g_bv_r. */
+        template <typename StringType>
+        void generate_spai_block_bv_assembly(StringType & source, std::string const & numeric_string)
+        {
+          // Plain element-wise copy of col_n entries.
+          source.append("  void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n");
+          source.append("    for(unsigned int i = 0; i < col_n; ++i){ \n");
+          source.append("      g_bv_r[i] = g_bv[ i]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+
+          // Copies g_bv then appends g_bv_u right after it (offset col_n).
+          source.append("  void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n");
+          source.append("               __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n");
+          source.append("  { \n");
+          source.append("    assemble_bv(g_bv_r, g_bv, col_n); \n");
+          source.append("    assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
+          source.append("  } \n");
+
+          // Kernel entry point: work-items stride over blocks; empty or non-updated
+          // blocks (g_is_update[i] == 0) are skipped.
+          source.append("  __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n");
+          source.append("              __global unsigned int * start_bv_ind, \n");
+          source.append("              __global unsigned int * matrix_dimensions, \n");
+          source.append("              __global "); source.append(numeric_string); source.append(" * g_bv_u, \n");
+          source.append("              __global unsigned int * start_bv_u_ind, \n");
+          source.append("              __global unsigned int * matrix_dimensions_u, \n");
+          source.append("              __global "); source.append(numeric_string); source.append(" * g_bv_r, \n");
+          source.append("              __global unsigned int * start_bv_r_ind, \n");
+          source.append("              __global unsigned int * matrix_dimensions_r, \n");
+          source.append("              __global unsigned int * g_is_update, \n");
+          source.append("              //__local  "); source.append(numeric_string); source.append(" * local_gb, \n");
+          source.append("              unsigned int  block_elems_num) \n");
+          source.append("  { \n");
+          source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+          source.append("      if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+          source.append("        assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
+          source.append("      } \n");
+          source.append("    } \n");
+          source.append("  } \n");
+        }
+
+        /** @brief Appends OpenCL source for the 'block_least_squares' kernel: per block,
+         *  solves the least-squares problem from a packed Householder-QR factorization
+         *  (global_R holds R plus the stored Householder vectors below the diagonal;
+         *  b_v holds the beta coefficients) — applies Q^T to the rhs, then back-solves R. */
+        template <typename StringType>
+        void generate_spai_least_squares(StringType & source, std::string const & numeric_string);
+        template <typename StringType>
+        void generate_spai_block_least_squares(StringType & source, std::string const & numeric_string)
+        {
+          // Dot product of stored Householder vector 'ind' with v; the unit diagonal
+          // entry of the Householder vector is implicit (the j == ind branch).
+          source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
+          source.append("  *res = 0.0; \n");
+          source.append("  for(unsigned int j = ind; j < row_n; ++j){ \n");
+          source.append("    if(j == ind){ \n");
+          source.append("      *res += v[ j]; \n");
+          source.append("    }else{ \n");
+          source.append("      *res += A[ j + ind*row_n]*v[ j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Back substitution R x = y on the col_n x col_n upper triangle of R
+          // (stored column-major with leading dimension row_n).
+          source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R,  unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n");
+          source.append("  for (int i = col_n-1; i >= 0 ; i--) { \n");
+          source.append("    x[ i] = y[ i]; \n");
+          source.append("    for (int j = i+1; j < col_n; ++j) { \n");
+          source.append("      x[ i] -= R[ i + j*row_n]*x[ j]; \n");
+          source.append("    } \n");
+          source.append("    x[i] /= R[ i + i*row_n]; \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+
+          // Applies Q^T (as a sequence of Householder reflections, betas in b_v) to y.
+          source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v,  __global "); source.append(numeric_string); source.append(" * y){ \n");
+          source.append("            "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
+          source.append("            for(unsigned int i = 0; i < col_n; ++i){ \n");
+          source.append("                custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
+          source.append("                for(unsigned int j = i; j < row_n; ++j){ \n");
+          source.append("                    if(i == j){ \n");
+          source.append("                        y[ j] -= b_v[ i]*inn_prod; \n");
+          source.append("                    } \n");
+          source.append("                    else{ \n");
+          source.append("                        y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
+          source.append("                    } \n");
+          source.append("                } \n");
+          source.append("            } \n");
+          source.append("        } \n");
+
+          // Full least-squares solve: y_v := Q^T * b_v (in-place on y_v), then m_v := R \ y_v.
+          source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n");
+          source.append("  apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
+          source.append("  //m_new - is m_v now \n");
+          source.append("  backwardSolve(R, row_n, col_n, y_v, m_v); \n");
+          source.append("} \n");
+
+          // Kernel entry point: strided over blocks, skipping empty/non-updated blocks.
+          source.append("__kernel void block_least_squares( \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * global_R, \n");
+          source.append("      __global unsigned int * block_ind, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * b_v, \n");
+          source.append("      __global unsigned int * start_bv_inds, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * m_v, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * y_v, \n");
+          source.append("      __global unsigned int * start_y_inds, \n");
+          source.append("      __global unsigned int * matrix_dimensions, \n");
+          source.append("      __global unsigned int * g_is_update, \n");
+          source.append("      unsigned int  block_elems_num) \n");
+          source.append("{ \n");
+          source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+          source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+          source.append("            ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends OpenCL source for the 'block_q_mult' kernel: per work-group,
+         *  stages the update block R_u in local memory, applies Q^T (Householder form,
+         *  betas in b_v) to each of its columns, and writes the result back to global
+         *  memory. Barriers separate the load / compute / store phases. */
+        template <typename StringType>
+        void generate_spai_block_q_mult(StringType & source, std::string const & numeric_string)
+        {
+          // Same Householder dot product as in the least-squares kernels, but with
+          // the vector v in __local memory.
+          source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
+          source.append("  *res = 0.0; \n");
+          source.append("  for(unsigned int j = ind; j < row_n; ++j){ \n");
+          source.append("    if(j == ind){ \n");
+          source.append("      *res += v[j]; \n");
+          source.append("    }else{ \n");
+          source.append("      *res += A[j + ind*row_n]*v[j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Applies Q^T to a single __local column y.
+          source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n");
+          source.append("  "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
+          source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
+          source.append("    custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
+          source.append("    for(unsigned int j = i; j < row_n; ++j){ \n");
+          source.append("      if(i == j){ \n");
+          source.append("        y[j] -= b_v[ i]*inn_prod; \n");
+          source.append("      } \n");
+          source.append("      else{ \n");
+          source.append("        y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
+          source.append("      } \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Columns of R_u are distributed over the work-items of one group.
+          source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n");
+          source.append("        for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
+          source.append("          apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
+          source.append("        } \n");
+          source.append("} \n");
+
+          // Cooperative global -> local copy of one column-major block.
+          source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+          source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+          source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
+          source.append("      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Cooperative local -> global copy (inverse of the above).
+          source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+          source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+          source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
+          source.append("      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Kernel entry point: one work-group per block, strided over get_num_groups.
+          source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n");
+          source.append("  __global unsigned int * block_ind, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * global_R_u, \n");
+          source.append("  __global unsigned int *block_ind_u, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * b_v, \n");
+          source.append("  __global unsigned int * start_bv_inds, \n");
+          source.append("  __global unsigned int * matrix_dimensions, \n");
+          source.append("  __global unsigned int * matrix_dimensions_u, \n");
+          source.append("  __global unsigned int * g_is_update, \n");
+          source.append("  __local  "); source.append(numeric_string); source.append(" * local_R_u, \n");
+          source.append("    unsigned int  block_elems_num){ \n");
+          source.append("    for(unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
+          source.append("          if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
+                  //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+          source.append("        matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("              q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
+          source.append("             matrix_dimensions_u[2*i + 1]); \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("              matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
+          source.append("          } \n");
+          source.append("      } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends OpenCL source for the 'block_qr' kernel: per work-group,
+         *  copies one dense block into local memory, runs an in-place Householder QR
+         *  (R on and above the diagonal, Householder vectors stored below it, betas
+         *  in b_v), and writes the factorized block back to global memory. */
+        template <typename StringType>
+        void generate_spai_block_qr(StringType & source, std::string const & numeric_string)
+        {
+          // Sum of squares of column (beg_ind-1) from row beg_ind downward
+          // (the sub-diagonal part used for the Householder norm).
+          source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n");
+          source.append("    *res = 0; \n");
+          source.append("    for(unsigned int i = beg_ind; i < n; ++i){ \n");
+          source.append("        *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+          // Scales v[beg_ind..n) by 1/b (normalizes the Householder vector).
+          source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n");
+          source.append("    for(unsigned int i = beg_ind; i < n; ++i){ \n");
+          source.append("        v[i] /= b; \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+          // Copies the sub-diagonal part of column (beg_ind-1) into v.
+          source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n");
+          source.append("    for(unsigned int i = beg_ind; i < n; ++i){ \n");
+          source.append("        v[i] = A[(beg_ind-1)*n + i]; \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+
+          // Builds the Householder vector v and coefficient *b for column j;
+          // sign choice on the diagonal avoids cancellation.
+          source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n");
+          source.append("    "); source.append(numeric_string); source.append(" sg; \n");
+          source.append("    dot_prod(A, n, j+1, &sg); \n");
+          source.append("    copy_vector(A, v, j+1, n); \n");
+          source.append("    "); source.append(numeric_string); source.append(" mu; \n");
+          source.append("    v[j] = 1.0; \n");
+              //print_contigious_vector(v, v_start_ind, n);
+          source.append("    if(sg == 0){ \n");
+          source.append("        *b = 0; \n");
+          source.append("    } \n");
+          source.append("    else{ \n");
+          source.append("        mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
+          source.append("        if(A[ j*n + j] <= 0){ \n");
+          source.append("            v[j] = A[ j*n + j] - mu; \n");
+          source.append("        }else{ \n");
+          source.append("            v[j] = -sg/(A[ j*n + j] + mu); \n");
+          source.append("        } \n");
+          source.append("    *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
+                  //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));
+          source.append("        vector_div(v, j, v[j], n); \n");
+                  //print_contigious_vector(v, v_start_ind, n);
+          source.append("    } \n");
+          source.append("} \n");
+
+          // Inner product of column col_ind (rows start_ind..row_num) with v.
+          source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n");
+          source.append("    for(unsigned int i = start_ind; i < row_num; ++i){ \n");
+          source.append("        *res += A[col_ind*row_num + i]*v[i]; \n");
+          source.append("    } \n");
+          source.append("} \n");
+          //
+          // Applies the reflection (I - b*v*v^T) to columns iter_cnt..col_n,
+          // columns distributed over the work-items of the group.
+          source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A,  unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n");
+          source.append("    "); source.append(numeric_string); source.append(" in_prod_res; \n");
+          source.append("    for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
+          source.append("        in_prod_res = 0.0; \n");
+          source.append("        custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
+          source.append("        for(unsigned int j = iter_cnt; j < row_n; ++j){ \n");
+          source.append("            A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+          // Stores v below the diagonal of column (ind-1) for later Q^T application.
+          source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A,  unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n");
+          source.append("    for(unsigned int i = ind; i < n; ++i){ \n");
+          source.append("        A[ (ind-1)*n + i] = v[i]; \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+          // In-place QR of one block; work-item 0 builds each Householder vector,
+          // the whole group applies it, barriers order the phases.
+          // NOTE(review): the guard 'i < matrix_dimensions[2*matrix_ind]' compares the
+          // column index against the ROW count — presumably always true for row_n >= col_n;
+          // confirm against upstream intent.
+          source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n");
+                      //matrix_dimensions[0] - number of rows
+                        //matrix_dimensions[1] - number of columns
+          source.append("  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
+          source.append("  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
+
+          source.append("  if((col_n == row_n)&&(row_n == 1)){ \n");
+          source.append("    b_v[0] = 0.0; \n");
+          source.append("      return; \n");
+          source.append("  } \n");
+          source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
+          source.append("    if(get_local_id(0) == 0){ \n");
+          source.append("      householder_vector(R, i, row_n, v, b_v + i); \n");
+          source.append("    } \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    if(get_local_id(0) == 0){ \n");
+          source.append("      if(i < matrix_dimensions[2*matrix_ind]){ \n");
+          source.append("        store_householder_vector(R, i+1, row_n, v); \n");
+          source.append("      } \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Cooperative global <-> local block copies (QR-local duplicates of the
+          // helpers emitted by generate_spai_block_q_mult).
+          source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+          source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+          source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
+          source.append("      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+          source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+          source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+          source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
+          source.append("      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+
+          // Kernel entry point: one work-group per block, load -> factorize -> store.
+          source.append("__kernel void block_qr( \n");
+          source.append("      __global "); source.append(numeric_string); source.append("* R, \n");
+          source.append("      __global unsigned int* matrix_dimensions, \n");
+          source.append("      __global "); source.append(numeric_string); source.append("* b_v, \n");
+          source.append("      __global "); source.append(numeric_string); source.append("* v, \n");
+          source.append("      __global unsigned int* start_matrix_inds, \n");
+          source.append("      __global unsigned int* start_bv_inds, \n");
+          source.append("      __global unsigned int* start_v_inds, \n");
+          source.append("      __global unsigned int * g_is_update, \n");
+          source.append("      __local "); source.append(numeric_string); source.append("* local_buff_R, \n");
+          source.append("      unsigned int block_elems_num){ \n");
+          source.append("    for(unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
+          source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+          source.append("      matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("            single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
+          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("            matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Appends OpenCL source for the 'block_qr_assembly' kernel: per block,
+         *  stitches the enlarged R block R_q together from the rows of the update
+         *  block R_u beyond col_n (upper part) and the freshly factorized block
+         *  R_u_u (lower part, offset by diff = row_n_u - col_n). */
+        template <typename StringType>
+        void generate_spai_block_qr_assembly(StringType & source, std::string const & numeric_string)
+        {
+          // Copies rows col_n..col_n+diff of each R_u column into the top of R_q.
+          source.append("void assemble_upper_part(__global "); source.append(numeric_string); source.append(" * R_q, \n");
+          source.append("            unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
+          source.append("            unsigned int row_n_u, unsigned int col_n_u, \n");
+          source.append("            unsigned int col_n, unsigned int diff){ \n");
+          source.append("            for(unsigned int i = 0; i < col_n_q; ++i){ \n");
+          source.append("                for(unsigned int j = 0; j < diff; ++j){ \n");
+          source.append("          R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n");
+          source.append("                } \n");
+          source.append("            } \n");
+          source.append("        } \n");
+
+          // Copies R_u_u below the upper part (row offset diff) column by column.
+          source.append("void assemble_lower_part(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+          source.append("             unsigned int row_n_u_u, unsigned int col_n_u_u, \n");
+          source.append("             unsigned int diff){ \n");
+          source.append("  for(unsigned int i = 0; i < col_n_u_u; ++i){ \n");
+          source.append("    for(unsigned int j = 0; j < row_n_u_u; ++j){ \n");
+          source.append("      R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          // Combines both copies; the lower part exists only when R_u has more rows
+          // than col_n (diff > 0).
+          source.append("void assemble_qr_block(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
+          source.append("            unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n");
+          source.append("            unsigned int diff = row_n_u - col_n; \n");
+          source.append("            assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
+          source.append("            if(diff > 0){ \n");
+          source.append("              assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n");
+          source.append("            } \n");
+          source.append("} \n");
+
+          // Kernel entry point: work-items stride over blocks, skipping empty or
+          // non-updated blocks.
+          source.append("__kernel void block_qr_assembly( \n");
+          source.append("      __global unsigned int * matrix_dimensions, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * R_u, \n");
+          source.append("      __global unsigned int * block_ind_u, \n");
+          source.append("      __global unsigned int * matrix_dimensions_u, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+          source.append("      __global unsigned int * block_ind_u_u, \n");
+          source.append("      __global unsigned int * matrix_dimensions_u_u, \n");
+          source.append("      __global "); source.append(numeric_string); source.append(" * R_q, \n");
+          source.append("      __global unsigned int * block_ind_q, \n");
+          source.append("      __global unsigned int * matrix_dimensions_q, \n");
+          source.append("      __global unsigned int * g_is_update, \n");
+          source.append("          //__local  "); source.append(numeric_string); source.append(" * local_R_q, \n");
+          source.append("      unsigned int  block_elems_num) \n");
+          source.append("{ \n");
+          source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+          source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+          source.append("           assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
+          source.append("             matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
+          source.append("       } \n");
+          source.append("   } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_spai_block_qr_assembly_1(StringType & source, std::string const & numeric_string)
+        {
+          source.append("void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
+          source.append("             unsigned int row_n_u, unsigned int col_n_u, \n");
+          source.append("             unsigned int col_n, unsigned int diff){ \n");
+          source.append("            for(unsigned int i = 0; i < col_n_q; ++i){ \n");
+          source.append("                for(unsigned int j = 0; j < diff; ++j){ \n");
+          source.append("          R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n");
+          source.append("                } \n");
+          source.append("            } \n");
+          source.append("        } \n");
+
+
+          source.append("void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(" * R_q,  unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
+          source.append("            unsigned int col_n_u, unsigned int col_n){ \n");
+          source.append("            unsigned int diff = row_n_u - col_n; \n");
+          source.append("            assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
+          source.append("} \n");
+
+          source.append("__kernel void block_qr_assembly_1( \n");
+          source.append("  __global unsigned int * matrix_dimensions, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * R_u, \n");
+          source.append("  __global unsigned int * block_ind_u, \n");
+          source.append("  __global unsigned int * matrix_dimensions_u, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * R_q, \n");
+          source.append("  __global unsigned int * block_ind_q, \n");
+          source.append("  __global unsigned int * matrix_dimensions_q, \n");
+          source.append("  __global unsigned int * g_is_update, \n");
+          source.append("  unsigned int  block_elems_num) \n");
+          source.append("{ \n");
+          source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+          source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+          source.append("            assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
+          source.append("              matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_spai_block_r_assembly(StringType & source, std::string const & numeric_string)
+        {
+          source.append("void assemble_r(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, \n");
+          source.append("        unsigned int row_n, unsigned int col_n) \n");
+          source.append("{ \n");
+          source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
+          source.append("     for(unsigned int j = 0; j < row_n; ++j){ \n");
+          source.append("    gR[i*row_n_r + j] = R[i*row_n + j ]; \n");
+          source.append("     } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          source.append("void assemble_r_u(__global "); source.append(numeric_string); source.append(" * gR, \n");
+          source.append("          unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n");
+          source.append("          unsigned int col_n) \n");
+          source.append("{ \n");
+          source.append("  for(unsigned int i = 0; i < col_n_u; ++i){ \n");
+          source.append("    for(unsigned int j = 0; j < col_n; ++j){ \n");
+          source.append("      gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+
+          source.append("void assemble_r_u_u(__global "); source.append(numeric_string); source.append(" * gR,  unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, \n");
+          source.append("          unsigned int col_n_u_u, unsigned int col_n) \n");
+          source.append("{ \n");
+          source.append("  for(unsigned int i = 0; i < col_n_u_u; ++i){ \n");
+          source.append("    for(unsigned int j = 0; j < row_n_u_u; ++j){ \n");
+          source.append("      gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+          source.append("void assemble_r_block(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, \n");
+          source.append("        unsigned int col_n, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+          source.append("        unsigned int row_n_u_u, unsigned int col_n_u_u){ \n");
+          source.append("        assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n");
+          source.append("        assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n");
+          source.append("        assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n");
+          source.append("} \n");
+
+
+          source.append("__kernel void block_r_assembly( \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * R, \n");
+          source.append("  __global unsigned int * block_ind, \n");
+          source.append("  __global unsigned int * matrix_dimensions, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * R_u, \n");
+          source.append("  __global unsigned int * block_ind_u, \n");
+          source.append("  __global unsigned int * matrix_dimensions_u, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+          source.append("  __global unsigned int * block_ind_u_u, \n");
+          source.append("  __global unsigned int * matrix_dimensions_u_u, \n");
+          source.append("  __global "); source.append(numeric_string); source.append(" * g_R, \n");
+          source.append("  __global unsigned int * block_ind_r, \n");
+          source.append("  __global unsigned int * matrix_dimensions_r, \n");
+          source.append("  __global unsigned int * g_is_update, \n");
+          source.append("  unsigned int  block_elems_num) \n");
+          source.append("{ \n");
+          source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+          source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+
+          source.append("            assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n");
+          source.append("              matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n");
+          source.append("              R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n");
+
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners. */
+        template <typename NumericT>
+        struct spai
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_spai";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              generate_spai_assemble_blocks(source, numeric_string);
+              generate_spai_block_bv_assembly(source, numeric_string);
+              generate_spai_block_least_squares(source, numeric_string);
+              generate_spai_block_q_mult(source, numeric_string);
+              generate_spai_block_qr(source, numeric_string);
+              generate_spai_block_qr_assembly(source, numeric_string);
+              generate_spai_block_qr_assembly_1(source, numeric_string);
+              generate_spai_block_r_assembly(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/svd.hpp b/viennacl/linalg/opencl/kernels/svd.hpp
new file mode 100644
index 0000000..b0daa02
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/svd.hpp
@@ -0,0 +1,560 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/svd.hpp
+ *  @brief OpenCL kernel file for singular value decomposition */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+        template <typename StringType>
+        void generate_svd_bidiag_pack(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void bidiag_pack(__global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("* D, \n");
+          source.append("  __global "); source.append(numeric_string); source.append("* S, \n");
+          source.append("  uint size1, \n");
+          source.append("  uint size2, \n");
+          source.append("  uint stride \n");
+          source.append(") { \n");
+          source.append("  uint size = min(size1, size2); \n");
+
+          source.append("  if(get_global_id(0) == 0) \n");
+          source.append("    S[0] = 0; \n");
+
+          source.append("  for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
+          source.append("    D[i] = A[i*stride + i]; \n");
+          source.append("    S[i + 1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_col_reduce_lcl_array(StringType & source, std::string const & numeric_string)
+        {
+          // calculates a sum of local array elements
+          source.append("void col_reduce_lcl_array(__local "); source.append(numeric_string); source.append("* sums, uint lcl_id, uint lcl_sz) { \n");
+          source.append("    uint step = lcl_sz >> 1; \n");
+
+          source.append("    while(step > 0) { \n");
+          source.append("        if(lcl_id < step) { \n");
+          source.append("            sums[lcl_id] += sums[lcl_id + step]; \n");
+          source.append("        } \n");
+          source.append("        step >>= 1; \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_copy_col(StringType & source, std::string const & numeric_string)
+        {
+          // probably, this is a ugly way
+          source.append("__kernel void copy_col(__global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("                       __global "); source.append(numeric_string); source.append("* V, \n");
+          source.append("                       uint row_start, \n");
+          source.append("                       uint col_start, \n");
+          source.append("                       uint size, \n");
+          source.append("                       uint stride \n");
+          source.append("                       ) { \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
+          source.append("        V[i - row_start] = A[i * stride + col_start]; \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_copy_row(StringType & source, std::string const & numeric_string)
+        {
+          // probably, this is too
+          source.append("__kernel void copy_row(__global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("                       __global "); source.append(numeric_string); source.append("* V, \n");
+          source.append("                       uint row_start, \n");
+          source.append("                       uint col_start, \n");
+          source.append("                       uint size, \n");
+          source.append("                       uint stride \n");
+          source.append("                       ) { \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
+          source.append("        V[i - col_start] = A[row_start * stride + i]; \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_final_iter_update(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void final_iter_update(__global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("                                uint stride, \n");
+          source.append("                                uint n, \n");
+          source.append("                                uint last_n, \n");
+          source.append("                                "); source.append(numeric_string); source.append(" q, \n");
+          source.append("                                "); source.append(numeric_string); source.append(" p \n");
+          source.append("                                ) \n");
+          source.append("{ \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    for (uint px = glb_id; px < last_n; px += glb_sz) \n");
+          source.append("    { \n");
+          source.append("        "); source.append(numeric_string); source.append(" v_in = A[n * stride + px]; \n");
+          source.append("        "); source.append(numeric_string); source.append(" z = A[(n - 1) * stride + px]; \n");
+          source.append("        A[(n - 1) * stride + px] = q * z + p * v_in; \n");
+          source.append("        A[n * stride + px] = q * v_in - p * z; \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_givens_next(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void givens_next(__global "); source.append(numeric_string); source.append("* matr, \n");
+          source.append("                            __global "); source.append(numeric_string); source.append("* cs, \n");
+          source.append("                            __global "); source.append(numeric_string); source.append("* ss, \n");
+          source.append("                            uint size, \n");
+          source.append("                            uint stride, \n");
+          source.append("                            uint start_i, \n");
+          source.append("                            uint end_i \n");
+          source.append("                            ) \n");
+          source.append("{ \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    uint lcl_id = get_local_id(0); \n");
+          source.append("    uint lcl_sz = get_local_size(0); \n");
+
+          source.append("    uint j = glb_id; \n");
+
+          source.append("    __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
+          source.append("    __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
+
+          source.append("    "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) * stride + j] : 0; \n");
+
+          source.append("    uint elems_num = end_i - start_i + 1; \n");
+          source.append("    uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+          source.append("    for(uint block_id = 0; block_id < block_num; block_id++) \n");
+          source.append("    { \n");
+          source.append("        uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+          source.append("        if(lcl_id < to) \n");
+          source.append("        { \n");
+          source.append("            cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+          source.append("            ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+          source.append("        } \n");
+
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("        if(j < size) \n");
+          source.append("        { \n");
+          source.append("            for(uint ind = 0; ind < to; ind++) \n");
+          source.append("            { \n");
+          source.append("                uint i = end_i - (ind + block_id * lcl_sz); \n");
+
+          source.append("                "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
+
+          source.append("                "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
+          source.append("                "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
+
+          source.append("                matr[(i + 1) * stride + j] = x * cs_val + z * ss_val; \n");
+          source.append("                x = -x * ss_val + z * cs_val; \n");
+          source.append("            } \n");
+          source.append("        } \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    } \n");
+          source.append("    if(j < size) \n");
+          source.append("        matr[(start_i) * stride + j] = x; \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_givens_prev(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void givens_prev(__global "); source.append(numeric_string); source.append("* matr, \n");
+          source.append("                            __global "); source.append(numeric_string); source.append("* cs, \n");
+          source.append("                            __global "); source.append(numeric_string); source.append("* ss, \n");
+          source.append("                            uint size, \n");
+          source.append("                            uint stride, \n");
+          source.append("                            uint start_i, \n");
+          source.append("                            uint end_i \n");
+          source.append("                            ) \n");
+          source.append("{ \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    uint lcl_id = get_local_id(0); \n");
+          source.append("    uint lcl_sz = get_local_size(0); \n");
+
+          source.append("    uint j = glb_id; \n");
+
+          source.append("    __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
+          source.append("    __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
+
+          source.append("    "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(start_i - 1) * stride + j] : 0; \n");
+
+          source.append("    uint elems_num = end_i - start_i; \n");
+          source.append("    uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+          source.append("    for(uint block_id = 0; block_id < block_num; block_id++) \n");
+          source.append("    { \n");
+          source.append("        uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+          source.append("        if(lcl_id < to) \n");
+          source.append("        { \n");
+          source.append("            cs_lcl[lcl_id] = cs[lcl_id + start_i + block_id * lcl_sz]; \n");
+          source.append("            ss_lcl[lcl_id] = ss[lcl_id + start_i + block_id * lcl_sz]; \n");
+          source.append("        } \n");
+
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("        if(j < size) \n");
+          source.append("        { \n");
+          source.append("            for(uint ind = 0; ind < to; ind++) \n");
+          source.append("            { \n");
+          source.append("                uint i = ind + start_i + block_id * lcl_sz; \n");
+
+          source.append("                "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
+
+          source.append("                "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind];//cs[i]; \n");
+          source.append("                "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind];//ss[i]; \n");
+
+          source.append("                matr[(i - 1) * stride + j] = x * cs_val + z * ss_val; \n");
+          source.append("                x = -x * ss_val + z * cs_val; \n");
+          source.append("            } \n");
+          source.append("        } \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    } \n");
+          source.append("    if(j < size) \n");
+          source.append("        matr[(end_i - 1) * stride + j] = x; \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_house_update_A_left(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void house_update_A_left( \n");
+          source.append("                        __global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("                        __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
+          source.append("                        uint row_start, \n");
+          source.append("                        uint col_start, \n");
+          source.append("                        uint size1, \n");
+          source.append("                        uint size2, \n");
+          source.append("                        uint stride, \n");
+          source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+          source.append("                        ) { \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    uint grp_id = get_group_id(0); \n");
+          source.append("    uint grp_nm = get_num_groups(0); \n");
+
+          source.append("    uint lcl_id = get_local_id(0); \n");
+          source.append("    uint lcl_sz = get_local_size(0); \n");
+
+          source.append("    "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+              // doing it in slightly different way to avoid cache misses
+          source.append("    for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
+          source.append("        ss = 0; \n");
+          source.append("        for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j * stride + i]); \n");
+
+          source.append("        for(uint j = row_start; j < size1; j++) \n");
+          source.append("            A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss); \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_svd_house_update_A_right(StringType & source, std::string const & numeric_string)
+        {
+
+          source.append("__kernel void house_update_A_right( \n");
+          source.append("                        __global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("                        __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
+          source.append("                        uint row_start, \n");
+          source.append("                        uint col_start, \n");
+          source.append("                        uint size1, \n");
+          source.append("                        uint size2, \n");
+          source.append("                        uint stride, \n");
+          source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+          source.append("                        ) { \n");
+
+          source.append("    uint glb_id = get_global_id(0); \n");
+
+          source.append("    uint grp_id = get_group_id(0); \n");
+          source.append("    uint grp_nm = get_num_groups(0); \n");
+
+          source.append("    uint lcl_id = get_local_id(0); \n");
+          source.append("    uint lcl_sz = get_local_size(0); \n");
+
+          source.append("    "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+              // update of A matrix
+          source.append("    for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
+          source.append("        ss = 0; \n");
+
+          source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i * stride + j]); \n");
+          source.append("        sums[lcl_id] = ss; \n");
+
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("        "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
+
+          source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
+          source.append("            A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av); \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+        }
+
+        /** @brief Generates the OpenCL kernel 'house_update_QL': applies a Householder reflection (I - 2*V*V^T) from the right to the accumulated left factor QL.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string ("float" or "double")
+          *
+          * Kernel layout: one work-group per row i of QL; work-items cooperatively
+          * compute the dot product <V, QL-row-i> via the helper col_reduce_lcl_array
+          * (generated elsewhere in this file), then update the row in place.
+          * NOTE(review): V is declared __constant here, while the QR sibling kernel
+          * declares it __global — presumably intentional (V fits the constant buffer),
+          * but worth confirming against the host-side argument setup.
+          */
+        template <typename StringType>
+        void generate_svd_house_update_QL(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void house_update_QL( \n");
+          source.append("                        __global "); source.append(numeric_string); source.append("* QL, \n");
+          source.append("                        __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
+          source.append("                        uint size1, \n");
+          source.append("                        uint size2, \n");
+          source.append("                        uint strideQ, \n");
+          source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+          source.append("                        ) { \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    uint grp_id = get_group_id(0); \n");
+          source.append("    uint grp_nm = get_num_groups(0); \n");
+
+          source.append("    uint lcl_id = get_local_id(0); \n");
+          source.append("    uint lcl_sz = get_local_size(0); \n");
+
+          source.append("    "); source.append(numeric_string); source.append(" ss = 0; \n");
+              // update of left matrix
+          source.append("    for(uint i = grp_id; i < size1; i += grp_nm) { \n");
+          source.append("        ss = 0; \n");
+              // partial dot product <V, row i of QL>, one partial sum per work-item
+          source.append("        for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i * strideQ + j]); \n");
+          source.append("        sums[lcl_id] = ss; \n");
+
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("        "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+
+              // row update: QL(i,:) -= 2 * <V, QL(i,:)> * V
+          source.append("        for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
+          source.append("            QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+        }
+
+        /** @brief Generates the OpenCL kernel 'house_update_QR': applies a Householder reflection to the (transposed) right factor QR.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string ("float" or "double")
+          *
+          * Same work-group-per-row scheme as house_update_QL, but iterating over
+          * size2 because QR is stored transposed (see the comment in the kernel body
+          * about avoiding cache misses).
+          */
+        template <typename StringType>
+        void generate_svd_house_update_QR(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void house_update_QR( \n");
+          source.append("                        __global "); source.append(numeric_string); source.append("* QR, \n");
+          source.append("                        __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
+          source.append("                        uint size1, \n");
+          source.append("                        uint size2, \n");
+          source.append("                        uint strideQ, \n");
+          source.append("                        __local "); source.append(numeric_string); source.append("* sums \n");
+          source.append("                        ) { \n");
+
+          source.append("    uint glb_id = get_global_id(0); \n");
+
+          source.append("    uint grp_id = get_group_id(0); \n");
+          source.append("    uint grp_nm = get_num_groups(0); \n");
+
+          source.append("    uint lcl_id = get_local_id(0); \n");
+          source.append("    uint lcl_sz = get_local_size(0); \n");
+
+          source.append("   "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+              // update of QR matrix
+              // Actually, we are calculating a transpose of right matrix. This allows to avoid cache
+              // misses.
+          source.append("    for(uint i = grp_id; i < size2; i += grp_nm) { \n");
+          source.append("        ss = 0; \n");
+              // partial dot product <V, row i of QR>, reduced via col_reduce_lcl_array below
+          source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * QR[i * strideQ + j]); \n");
+          source.append("        sums[lcl_id] = ss; \n");
+
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("        col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+          source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("        "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+              // row update: QR(i,:) -= 2 * <V, QR(i,:)> * V
+          source.append("        for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
+          source.append("            QR[i * strideQ + j] = QR[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
+          source.append("    } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Generates the OpenCL kernel 'inverse_signs': scales each row of the matrix v by the per-row sign stored in 'signs'.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string ("float" or "double")
+          *
+          * Launched over a 2D NDRange; element (x,y) is multiplied by signs[x].
+          */
+        template <typename StringType>
+        void generate_svd_inverse_signs(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void inverse_signs(__global "); source.append(numeric_string); source.append("* v, \n");
+          source.append("                            __global "); source.append(numeric_string); source.append("* signs, \n");
+          source.append("                            uint size, \n");
+          source.append("                            uint stride \n");
+          source.append("                            ) \n");
+          source.append("{ \n");
+          source.append("    uint glb_id_x = get_global_id(0); \n");
+          source.append("    uint glb_id_y = get_global_id(1); \n");
+
+              // guard against out-of-range work-items on both axes
+          source.append("    if((glb_id_x < size) && (glb_id_y < size)) \n");
+          source.append("        v[glb_id_x * stride + glb_id_y] *= signs[glb_id_x]; \n");
+          source.append("} \n");
+
+        }
+
+        /** @brief Generates the OpenCL kernel 'transpose_inplace': transposes a row_num x col_num matrix in place by swapping mirrored elements.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string ("float" or "double")
+          *
+          * Each work-item walks the flat index range with a global-size stride and
+          * swaps input[i] with its transposed position; the 'i < new_pos' guard
+          * ensures each pair is swapped exactly once.
+          */
+        template <typename StringType>
+        void generate_svd_transpose_inplace(StringType & source, std::string const & numeric_string)
+        {
+
+          source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("* input, \n");
+          source.append("                        unsigned int row_num, \n");
+          source.append("                        unsigned int col_num) { \n");
+          source.append("    unsigned int size = row_num * col_num; \n");
+          source.append("    for(unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+          source.append("        unsigned int row = i / col_num; \n");
+          source.append("        unsigned int col = i - row*col_num; \n");
+
+          source.append("        unsigned int new_pos = col * row_num + row; \n");
+
+                  // (left over from debugging upstream:)
+                  //new_pos = (col < row) ? 0 : 1;
+                  //input[i] = new_pos;
+
+          source.append("        if(i < new_pos) { \n");
+          source.append("            "); source.append(numeric_string); source.append(" val = input[i]; \n");
+          source.append("            input[i] = input[new_pos]; \n");
+          source.append("            input[new_pos] = val; \n");
+          source.append("        } \n");
+          source.append("    } \n");
+          source.append("} \n");
+
+        }
+
+        /** @brief Generates the OpenCL kernel 'update_qr_column': applies a sequence of stored Householder/QR step coefficients to columns of A.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string ("float" or "double")
+          *
+          * Each work-item owns column i and sweeps rows k = m..n-1, reading the five
+          * per-step coefficients from buf[5*k .. 5*k+4]. The three running values
+          * a_ik, a_ik_1, a_ik_2 cache A(k,i), A(k+1,i), A(k+2,i) so each element is
+          * loaded from global memory only once.
+          */
+        template <typename StringType>
+        void generate_svd_update_qr_column(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void update_qr_column(__global "); source.append(numeric_string); source.append("* A, \n");
+          source.append("                               uint stride, \n");
+          source.append("                               __global "); source.append(numeric_string); source.append("* buf, \n");
+          source.append("                               int m, \n");
+          source.append("                               int n, \n");
+          source.append("                               int last_n) \n");
+          source.append("{ \n");
+          source.append("    uint glb_id = get_global_id(0); \n");
+          source.append("    uint glb_sz = get_global_size(0); \n");
+
+          source.append("    for (int i = glb_id; i < last_n; i += glb_sz) \n");
+          source.append("    { \n");
+          source.append("        "); source.append(numeric_string); source.append(" a_ik = A[m * stride + i], a_ik_1, a_ik_2; \n");
+
+          source.append("        a_ik_1 = A[(m + 1) * stride + i]; \n");
+
+          source.append("        for(int k = m; k < n; k++) \n");
+          source.append("        { \n");
+          source.append("            bool notlast = (k != n - 1); \n");
+
+          source.append("            "); source.append(numeric_string); source.append(" p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1; \n");
+
+              // all but the last step also touch the row two below (3-term recurrence)
+          source.append("            if (notlast) \n");
+          source.append("            { \n");
+          source.append("                a_ik_2 = A[(k + 2) * stride + i]; \n");
+          source.append("                p = p + buf[5 * k + 2] * a_ik_2; \n");
+          source.append("                a_ik_2 = a_ik_2 - p * buf[5 * k + 4]; \n");
+          source.append("            } \n");
+
+          source.append("            A[k * stride + i] = a_ik - p; \n");
+          source.append("            a_ik_1 = a_ik_1 - p * buf[5 * k + 3]; \n");
+
+              // shift the register window down one row for the next k
+          source.append("            a_ik = a_ik_1; \n");
+          source.append("            a_ik_1 = a_ik_2; \n");
+          source.append("        } \n");
+
+          source.append("        A[n * stride + i] = a_ik; \n");
+          source.append("    } \n");
+
+          source.append("} \n");
+        }
+
+
+
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices.
+          *
+          * Usage: svd<NumericT>::init(ctx) compiles the program once per context;
+          * kernels are then retrieved by name from the program returned by
+          * program_name().
+          */
+        template <class NumericT>
+        struct svd
+        {
+          // Program identifier, e.g. "float_svd" / "double_svd".
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<NumericT>::apply() + "_svd";
+          }
+
+          // Generates and compiles all SVD kernels for 'ctx' (idempotent per context).
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+            // one-time guard, keyed by the raw cl_context handle
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(1024);
+
+              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+              // only generate for floating points (forces error for integers)
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                //helper function used by multiple kernels:
+                generate_svd_col_reduce_lcl_array(source, numeric_string);
+
+                //kernels:
+                generate_svd_bidiag_pack(source, numeric_string);
+                generate_svd_copy_col(source, numeric_string);
+                generate_svd_copy_row(source, numeric_string);
+                generate_svd_final_iter_update(source, numeric_string);
+                generate_svd_givens_next(source, numeric_string);
+                generate_svd_givens_prev(source, numeric_string);
+                generate_svd_house_update_A_left(source, numeric_string);
+                generate_svd_house_update_A_right(source, numeric_string);
+                generate_svd_house_update_QL(source, numeric_string);
+                generate_svd_house_update_QR(source, numeric_string);
+                generate_svd_inverse_signs(source, numeric_string);
+                generate_svd_transpose_inplace(source, numeric_string);
+                generate_svd_update_qr_column(source, numeric_string);
+              }
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/vector.hpp b/viennacl/linalg/opencl/kernels/vector.hpp
new file mode 100644
index 0000000..ea25df7
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/vector.hpp
@@ -0,0 +1,688 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/vector.hpp
+ *  @brief OpenCL kernel file for vector operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+        /** @brief Enumeration for the scalar type in avbv-like operations */
+        enum avbv_scalar_type
+        {
+          VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute
+          VIENNACL_AVBV_CPU,      // scalar coefficient passed by value (host scalar)
+          VIENNACL_AVBV_GPU       // scalar coefficient read from a device buffer
+        };
+
+        /** @brief Configuration struct for generating OpenCL kernels for linear combinations of vectors
+          *
+          * Describes one variant of 'vec1 (assign_op) a*vec2 [+ b*vec3]':
+          * whether indexing honors start/stride, which assignment operator is
+          * emitted, and where each scalar coefficient lives (see avbv_scalar_type).
+          */
+        struct avbv_config
+        {
+          avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_CPU), b(VIENNACL_AVBV_NONE) {}
+
+          bool with_stride_and_range;  // emit strided 'i*size.y + size.x' indexing vs. plain 'i'
+          std::string      assign_op;  // "=" or "+="
+          avbv_scalar_type a;          // coefficient of vec2
+          avbv_scalar_type b;          // coefficient of vec3 (NONE => no vec3 term)
+        };
+
+        // just returns the for-loop
+        /** @brief Emits the inner for-loop of an av/avbv kernel body.
+          *
+          * @param source     String to append the generated OpenCL source to
+          * @param cfg        Variant configuration (stride handling, assign op, vec3 presence)
+          * @param mult_alpha true => '* alpha', false => '/ alpha' (inverse option)
+          * @param mult_beta  true => '* beta',  false => '/ beta'
+          *
+          * The uint4 'sizeN' parameters pack (start, stride, size) as
+          * (.x, .y, .z) — see the index expressions below.
+          */
+        template <typename StringType>
+        void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta)
+        {
+          source.append("    for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
+          if (cfg.with_stride_and_range)
+          {
+            source.append("      vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] ");
+            if (mult_alpha)
+              source.append("* alpha ");
+            else
+              source.append("/ alpha ");
+            if (cfg.b != VIENNACL_AVBV_NONE)
+            {
+              source.append("+ vec3[i*size3.y+size3.x] ");
+              if (mult_beta)
+                source.append("* beta");
+              else
+                source.append("/ beta");
+            }
+          }
+          else
+          {
+            // contiguous fast path: no start offset, unit stride
+            source.append("    vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] ");
+            if (mult_alpha)
+              source.append("* alpha ");
+            else
+              source.append("/ alpha ");
+            if (cfg.b != VIENNACL_AVBV_NONE)
+            {
+              source.append("+ vec3[i] ");
+              if (mult_beta)
+                source.append("* beta");
+              else
+                source.append("/ beta");
+            }
+          }
+          source.append("; \n");
+        }
+
+        /** @brief Emits one complete av/avbv kernel for the given configuration.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string
+          * @param cfg            Variant configuration; also determines the kernel name
+          *                       (av / avbv, '_v' suffix for '+=', '_cpu'/'_gpu' per scalar)
+          *
+          * The optionsN flags encode sign/inverse handling of each coefficient:
+          * bit 0 => flip sign, bit 1 => take inverse (divide instead of multiply).
+          * The four multiply/divide combinations are dispatched at runtime and the
+          * corresponding loop bodies are emitted via generate_avbv_impl2().
+          */
+        template <typename StringType>
+        void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg)
+        {
+          // ---- kernel name: av | avbv, optional _v (+=), _cpu/_gpu per scalar ----
+          source.append("__kernel void av");
+          if (cfg.b != VIENNACL_AVBV_NONE)
+            source.append("bv");
+          if (cfg.assign_op != "=")
+            source.append("_v");
+
+          if (cfg.a == VIENNACL_AVBV_CPU)
+            source.append("_cpu");
+          else if (cfg.a == VIENNACL_AVBV_GPU)
+            source.append("_gpu");
+
+          if (cfg.b == VIENNACL_AVBV_CPU)
+            source.append("_cpu");
+          else if (cfg.b == VIENNACL_AVBV_GPU)
+            source.append("_gpu");
+          source.append("( \n");
+          // ---- parameter list ----
+          source.append("  __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("  uint4 size1, \n");
+          source.append(" \n");
+          if (cfg.a == VIENNACL_AVBV_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
+          }
+          else if (cfg.a == VIENNACL_AVBV_GPU)
+          {
+            source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
+          }
+          source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+          source.append("  __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+          source.append("  uint4 size2");
+
+          if (cfg.b != VIENNACL_AVBV_NONE)
+          {
+            source.append(", \n\n");
+            if (cfg.b == VIENNACL_AVBV_CPU)
+            {
+              source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
+            }
+            else if (cfg.b == VIENNACL_AVBV_GPU)
+            {
+              source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
+            }
+            source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+            source.append("  __global const "); source.append(numeric_string); source.append(" * vec3, \n");
+            source.append("  uint4 size3 \n");
+          }
+          source.append(") { \n");
+
+          // ---- load alpha (by value or from buffer) and apply sign flip ----
+          if (cfg.a == VIENNACL_AVBV_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+          }
+          else if (cfg.a == VIENNACL_AVBV_GPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+          }
+          source.append("  if (options2 & (1 << 0)) \n");
+          source.append("    alpha = -alpha; \n");
+          source.append(" \n");
+
+          // ---- load beta likewise (only when vec3 participates) ----
+          if (cfg.b == VIENNACL_AVBV_CPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
+          }
+          else if (cfg.b == VIENNACL_AVBV_GPU)
+          {
+            source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+          }
+          if (cfg.b != VIENNACL_AVBV_NONE)
+          {
+            source.append("  if (options3 & (1 << 0)) \n");
+            source.append("    beta = -beta; \n");
+            source.append(" \n");
+          }
+          // ---- dispatch on the 'take inverse' bits: multiply vs. divide ----
+          source.append("  if (options2 & (1 << 1)) { \n");
+          if (cfg.b != VIENNACL_AVBV_NONE)
+          {
+            source.append("    if (options3 & (1 << 1)) {\n");
+            generate_avbv_impl2(source, numeric_string, cfg, false, false);
+            source.append("    } else {\n");
+            generate_avbv_impl2(source, numeric_string, cfg, false, true);
+            source.append("    } \n");
+          }
+          else
+            generate_avbv_impl2(source, numeric_string, cfg, false, true);
+          source.append("  } else { \n");
+          if (cfg.b != VIENNACL_AVBV_NONE)
+          {
+            source.append("    if (options3 & (1 << 1)) {\n");
+            generate_avbv_impl2(source, numeric_string, cfg, true, false);
+            source.append("    } else {\n");
+            generate_avbv_impl2(source, numeric_string, cfg, true, true);
+            source.append("    } \n");
+          }
+          else
+            generate_avbv_impl2(source, numeric_string, cfg, true, true);
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Emits all av/avbv kernel variants: av with cpu/gpu alpha,
+          *        avbv with all four cpu/gpu coefficient combinations, and the
+          *        same four again as in-place-add ('+=' => '_v' suffix) variants.
+          */
+        template <typename StringType>
+        void generate_avbv(StringType & source, std::string const & numeric_string)
+        {
+          avbv_config cfg;
+          cfg.assign_op = "=";
+          cfg.with_stride_and_range = true;
+
+          // av
+          cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+
+          // avbv
+          cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+
+          // avbv_v: same combinations with in-place add ('+=')
+          cfg.assign_op = "+=";
+
+          cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+          cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+        }
+
+        /** @brief Generates the OpenCL kernel 'plane_rotation': Givens rotation applied to two vectors.
+          *
+          * For each i: (v1, v2) <- (alpha*v1 + beta*v2, alpha*v2 - beta*v1),
+          * honoring the (start, inc) layout of each vector. size2 is accepted but
+          * the loop bound is size1 (both vectors are expected to have equal length).
+          */
+        template <typename StringType>
+        void generate_plane_rotation(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void plane_rotation( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec2, \n");
+          source.append("          unsigned int start2, \n");
+          source.append("          unsigned int inc2, \n");
+          source.append("          unsigned int size2, \n");
+          source.append("          "); source.append(numeric_string); source.append(" alpha, \n");
+          source.append("          "); source.append(numeric_string); source.append(" beta) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp1 = 0; \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp2 = 0; \n");
+          source.append(" \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+          source.append(" { \n");
+              // read both entries first so the update uses the pre-rotation values
+          source.append("    tmp1 = vec1[i*inc1+start1]; \n");
+          source.append("    tmp2 = vec2[i*inc2+start2]; \n");
+          source.append(" \n");
+          source.append("    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
+          source.append("    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
+          source.append("  } \n");
+          source.append(" \n");
+          source.append("} \n");
+        }
+
+        /** @brief Generates the OpenCL kernel 'swap': exchanges the contents of two (possibly strided) vectors element-wise.
+          *
+          * Loop bound is size1; size2 is accepted for interface symmetry only.
+          */
+        template <typename StringType>
+        void generate_vector_swap(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void swap( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec2, \n");
+          source.append("          unsigned int start2, \n");
+          source.append("          unsigned int inc2, \n");
+          source.append("          unsigned int size2 \n");
+          source.append("          ) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp; \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+          source.append("  { \n");
+          source.append("    tmp = vec2[i*inc2+start2]; \n");
+          source.append("    vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
+          source.append("    vec1[i*inc1+start1] = tmp; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        /** @brief Generates the OpenCL kernel 'assign_cpu': fills a vector with a host-provided scalar.
+          *
+          * Iterates over internal_size1 (the padded buffer length) and writes alpha
+          * inside [0, size1) but 0 into the padding region, keeping padding clean.
+          */
+        template <typename StringType>
+        void generate_assign_cpu(StringType & source, std::string const & numeric_string)
+        {
+          source.append("__kernel void assign_cpu( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          unsigned int internal_size1, \n");
+          source.append("          "); source.append(numeric_string); source.append(" alpha) \n");
+          source.append("{ \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
+          source.append("    vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
+          source.append("} \n");
+
+        }
+
+        /** @brief Generates the OpenCL kernel 'inner_prod<N>': partial inner products of one vector x against N vectors y0..y(N-1).
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string
+          * @param vector_num     Number of right-hand-side vectors N (becomes part of the kernel name)
+          *
+          * Each work-group processes a contiguous chunk of x, accumulates N partial
+          * sums per work-item, reduces them in local memory, and writes one partial
+          * result per (group, vector) into group_buffer; the final sum over groups
+          * is done by a separate kernel. The uint4 'params_*' pack (start, stride,
+          * size) as (.x, .y, .z). tmp_buffer must hold N * local_size entries.
+          */
+        template <typename StringType>
+        void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num)
+        {
+          std::stringstream ss;
+          ss << vector_num;
+          std::string vector_num_string = ss.str();
+
+          source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
+          source.append("          uint4 params_x, \n");
+          for (vcl_size_t i=0; i<vector_num; ++i)
+          {
+            ss.str("");
+            ss << i;
+            source.append("          __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n");
+            source.append("          uint4 params_y"); source.append(ss.str()); source.append(", \n");
+          }
+          source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
+          source.append("{ \n");
+          source.append("  unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
+          source.append("  unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
+          source.append("  unsigned int vec_stop_index  = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
+
+          // compute partial results within group:
+          for (vcl_size_t i=0; i<vector_num; ++i)
+          {
+            ss.str("");
+            ss << i;
+            source.append("  "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n");
+          }
+          source.append("  for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
+              // load x once per i and reuse it for all N accumulators
+          source.append("    ");  source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n");
+          for (vcl_size_t i=0; i<vector_num; ++i)
+          {
+            ss.str("");
+            ss << i;
+            source.append("    tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n");
+          }
+          source.append("  } \n");
+          // stage the N per-work-item sums into local memory, one slab per vector
+          for (vcl_size_t i=0; i<vector_num; ++i)
+          {
+            ss.str("");
+            ss << i;
+            source.append("  tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n");
+          }
+
+          // now run reduction:
+          source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+          source.append("  { \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    if (get_local_id(0) < stride) { \n");
+          for (vcl_size_t i=0; i<vector_num; ++i)
+          {
+            ss.str("");
+            ss << i;
+            source.append("      tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n");
+          }
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          // first work-item publishes the N group results to global memory
+          source.append("  if (get_local_id(0) == 0) { \n");
+          for (vcl_size_t i=0; i<vector_num; ++i)
+          {
+            ss.str("");
+            ss << i;
+            source.append("    group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n");
+          }
+          source.append("  } \n");
+          source.append("} \n");
+
+        }
+
+        /** @brief Generates the device function 'impl_norm' and the kernel 'norm': partial 1-, 2-, and inf-norm computation.
+          *
+          * @param source         String to append the generated OpenCL source to
+          * @param numeric_string Scalar type as a string
+          *
+          * norm_selector: 1 => norm_1 (sum of |x|), 2 => norm_2 (sum of squares;
+          * the square root is taken on the host side of this partial result),
+          * 0 => norm_inf (max of |x|). For integer types, abs()/max() replace
+          * fabs()/fmax(). Each work-group handles an equal slice of the vector and
+          * writes one partial result to group_buffer.
+          *
+          * NOTE(review): the generated code emits "};" after impl_norm — the extra
+          * ';' at file scope is tolerated by OpenCL compilers but is sloppy.
+          */
+        template <typename StringType>
+        void generate_norm(StringType & source, std::string const & numeric_string)
+        {
+          bool is_float_or_double = (numeric_string == "float" || numeric_string == "double");
+
+          source.append(numeric_string); source.append(" impl_norm( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          unsigned int norm_selector, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp = 0; \n");
+          source.append("  if (norm_selector == 1) \n"); //norm_1
+          source.append("  { \n");
+          source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+          if (is_float_or_double)
+            source.append("      tmp += fabs(vec[i*inc1 + start1]); \n");
+          else
+            source.append("      tmp += abs(vec[i*inc1 + start1]); \n");
+          source.append("  } \n");
+          source.append("  else if (norm_selector == 2) \n"); //norm_2
+          source.append("  { \n");
+          source.append("    "); source.append(numeric_string); source.append(" vec_entry = 0; \n");
+          source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+          source.append("    { \n");
+          source.append("      vec_entry = vec[i*inc1 + start1]; \n");
+          source.append("      tmp += vec_entry * vec_entry; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("  else if (norm_selector == 0) \n"); //norm_inf
+          source.append("  { \n");
+          source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+          if (is_float_or_double)
+            source.append("      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
+          else
+          {
+            source.append("      tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), tmp); \n");
+          }
+          source.append("  } \n");
+
+          source.append("  tmp_buffer[get_local_id(0)] = tmp; \n");
+
+          // local reduction: sum for norm_1/norm_2, max for norm_inf
+          source.append("  if (norm_selector > 0) \n"); //norm_1 or norm_2:
+          source.append("  { \n");
+          source.append("    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+          source.append("    { \n");
+          source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("      if (get_local_id(0) < stride) \n");
+          source.append("        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
+          source.append("    } \n");
+          source.append("    return tmp_buffer[0]; \n");
+          source.append("  } \n");
+
+          //norm_inf:
+          source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+          source.append("  { \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    if (get_local_id(0) < stride) \n");
+          if (is_float_or_double)
+            source.append("      tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
+          else
+            source.append("      tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
+          source.append("  } \n");
+
+          // NOTE(review): tmp_buffer[0] is read here without a trailing barrier;
+          // only work-item 0 consumes the value downstream — presumably safe, confirm.
+          source.append("  return tmp_buffer[0]; \n");
+          source.append("}; \n");
+
+          // kernel wrapper: each group reduces its slice of [0, size1) and stores
+          // one partial result per group in group_buffer
+          source.append("__kernel void norm( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          unsigned int norm_selector, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n");
+          source.append("                        (        get_group_id(0)  * size1) / get_num_groups(0) * inc1 + start1, \n");
+          source.append("                        inc1, \n");
+          source.append("                        (   (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
+          source.append("                      - (        get_group_id(0)  * size1) / get_num_groups(0), \n");
+          source.append("                        norm_selector, \n");
+          source.append("                        tmp_buffer); \n");
+
+          source.append("  if (get_local_id(0) == 0) \n");
+          source.append("    group_buffer[get_group_id(0)] = tmp; \n");
+          source.append("} \n");
+
+        }
+
+        template <typename StringType>
+        void generate_inner_prod_sum(StringType & source, std::string const & numeric_string)
+        {
+          // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
+          source.append("__kernel void sum_inner_prod( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
+          source.append("          unsigned int start_result, \n");
+          source.append("          unsigned int inc_result) \n");
+          source.append("{ \n");
+          source.append("  tmp_buffer[get_local_id(0)] = vec1[get_global_id(0)]; \n");
+
+          source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+          source.append("  { \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    if (get_local_id(0) < stride) \n");
+          source.append("      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
+          source.append("  } \n");
+          source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("  if (get_local_id(0) == 0) \n");
+          source.append("    result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
+          source.append("} \n");
+
+        }
+
+        template <typename StringType>
+        void generate_sum(StringType & source, std::string const & numeric_string)
+        {
+          // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
+          source.append("__kernel void sum( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          unsigned int option,  \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+          source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * result) \n");
+          source.append("{ \n");
+          source.append("  "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp = 0; \n");
+          source.append("  for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
+          source.append("  { \n");
+          source.append("    if (option > 0) \n");
+          source.append("      thread_sum += vec1[i*inc1+start1]; \n");
+          source.append("    else \n");
+          source.append("    { \n");
+          source.append("      tmp = vec1[i*inc1+start1]; \n");
+          source.append("      tmp = (tmp < 0) ? -tmp : tmp; \n");
+          source.append("      thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+
+          source.append("  tmp_buffer[get_local_id(0)] = thread_sum; \n");
+
+          source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+          source.append("  { \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    if (get_local_id(0) < stride) \n");
+          source.append("    { \n");
+          source.append("      if (option > 0) \n");
+          source.append("        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
+          source.append("      else \n");
+          source.append("        tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+          source.append("  if (get_global_id(0) == 0) \n");
+          source.append("  { \n");
+          if (numeric_string == "float" || numeric_string == "double")
+          {
+            source.append("    if (option == 2) \n");
+            source.append("      *result = sqrt(tmp_buffer[0]); \n");
+            source.append("    else \n");
+          }
+          source.append("      *result = tmp_buffer[0]; \n");
+          source.append("  } \n");
+          source.append("} \n");
+
+        }
+
+        template <typename StringType>
+        void generate_index_norm_inf(StringType & source, std::string const & numeric_string)
+        {
+          //index_norm_inf:
+          source.append("unsigned int index_norm_inf_impl( \n");
+          source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
+          source.append("          __local unsigned int * index_buffer) \n");
+          source.append("{ \n");
+          //step 1: fill buffer:
+          source.append("  "); source.append(numeric_string); source.append(" cur_max = 0; \n");
+          source.append("  "); source.append(numeric_string); source.append(" tmp; \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+          source.append("  { \n");
+          if (numeric_string == "float" || numeric_string == "double")
+            source.append("    tmp = fabs(vec[i*inc1+start1]); \n");
+          else
+            source.append("    tmp = abs(vec[i*inc1+start1]); \n");
+          source.append("    if (cur_max < tmp) \n");
+          source.append("    { \n");
+          source.append("      entry_buffer[get_global_id(0)] = tmp; \n");
+          source.append("      index_buffer[get_global_id(0)] = i; \n");
+          source.append("      cur_max = tmp; \n");
+          source.append("    } \n");
+          source.append("  } \n");
+
+          //step 2: parallel reduction:
+          source.append("  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
+          source.append("  { \n");
+          source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+          source.append("    if (get_global_id(0) < stride) \n");
+          source.append("   { \n");
+          //find the first occurring index
+          source.append("      if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
+          source.append("      { \n");
+          source.append("        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
+          source.append("        entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
+          source.append("      } \n");
+          source.append("    } \n");
+          source.append("  } \n");
+          source.append(" \n");
+          source.append("  return index_buffer[0]; \n");
+          source.append("} \n");
+
+          source.append("__kernel void index_norm_inf( \n");
+          source.append("          __global "); source.append(numeric_string); source.append(" * vec, \n");
+          source.append("          unsigned int start1, \n");
+          source.append("          unsigned int inc1, \n");
+          source.append("          unsigned int size1, \n");
+          source.append("          __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
+          source.append("          __local unsigned int * index_buffer, \n");
+          source.append("          __global unsigned int * result) \n");
+          source.append("{ \n");
+          source.append("  entry_buffer[get_global_id(0)] = 0; \n");
+          source.append("  index_buffer[get_global_id(0)] = 0; \n");
+          source.append("  unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
+          source.append("  if (get_global_id(0) == 0) *result = tmp; \n");
+          source.append("} \n");
+
+        }
+
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without involving matrices, multiple inner products, or element-wise operations other than addition or subtraction. */
+        template <class TYPE>
+        struct vector
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
+
+              // fully parametrized kernels:
+              generate_avbv(source, numeric_string);
+
+              // kernels with mostly predetermined skeleton:
+              generate_plane_rotation(source, numeric_string);
+              generate_vector_swap(source, numeric_string);
+              generate_assign_cpu(source, numeric_string);
+
+              generate_inner_prod(source, numeric_string, 1);
+              generate_norm(source, numeric_string);
+              generate_sum(source, numeric_string);
+              generate_index_norm_inf(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+        // class with kernels for multiple inner products.
+        /** @brief Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<>. */
+        template <class TYPE>
+        struct vector_multi_inner_prod
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector_multi";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
+
+              generate_inner_prod(source, numeric_string, 2);
+              generate_inner_prod(source, numeric_string, 3);
+              generate_inner_prod(source, numeric_string, 4);
+              generate_inner_prod(source, numeric_string, 8);
+
+              generate_inner_prod_sum(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/kernels/vector_element.hpp b/viennacl/linalg/opencl/kernels/vector_element.hpp
new file mode 100644
index 0000000..ac9a062
--- /dev/null
+++ b/viennacl/linalg/opencl/kernels/vector_element.hpp
@@ -0,0 +1,155 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/vector_element.hpp
+ *  @brief OpenCL kernel file for element-wise vector operations */
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      namespace kernels
+      {
+
+        //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+        //generate code for C = op1(A) * op2(B), where A, B, C can have different storage layouts and opX(D) = D or trans(D)
+        template <typename StringType>
+        void generate_vector_unary_element_ops(StringType & source, std::string const & numeric_string,
+                                               std::string const & funcname, std::string const & op, std::string const & op_name)
+        {
+          source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n");
+          source.append("    __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("    uint4 size1, \n");
+          source.append("    __global "); source.append(numeric_string); source.append(" * vec2, \n");
+          source.append("    uint4 size2) { \n");
+          source.append("  for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
+          source.append("    vec1[i*size1.y+size1.x] "); source.append(op); source.append(" "); source.append(funcname); source.append("(vec2[i*size2.y+size2.x]); \n");
+          source.append("} \n");
+        }
+
+        template <typename StringType>
+        void generate_vector_unary_element_ops(StringType & source, std::string const & numeric_string, std::string const & funcname)
+        {
+          generate_vector_unary_element_ops(source, numeric_string, funcname, "=", "assign");
+          //generate_vector_unary_element_ops(source, numeric_string, funcname, "+=", "plus");
+          //generate_vector_unary_element_ops(source, numeric_string, funcname, "-=", "minus");
+        }
+
+        template <typename StringType>
+        void generate_vector_binary_element_ops(StringType & source, std::string const & numeric_string)
+        {
+          // generic kernel for the vector operation v1 = alpha * v2 + beta * v3, where v1, v2, v3 are not necessarily distinct vectors
+          source.append("__kernel void element_op( \n");
+          source.append("    __global "); source.append(numeric_string); source.append(" * vec1, \n");
+          source.append("    unsigned int start1, \n");
+          source.append("    unsigned int inc1, \n");
+          source.append("    unsigned int size1, \n");
+
+          source.append("    __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+          source.append("    unsigned int start2, \n");
+          source.append("    unsigned int inc2, \n");
+
+          source.append("    __global const "); source.append(numeric_string); source.append(" * vec3, \n");
+          source.append("   unsigned int start3, \n");
+          source.append("   unsigned int inc3, \n");
+
+          source.append("   unsigned int op_type) \n"); //0: product, 1: division, 2: power
+          source.append("{ \n");
+          if (numeric_string == "float" || numeric_string == "double")
+          {
+            source.append("  if (op_type == 2) \n");
+            source.append("  { \n");
+            source.append("    for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+            source.append("      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]); \n");
+            source.append("  } else ");
+          }
+          source.append("  if (op_type == 1) \n");
+          source.append("  { \n");
+          source.append("    for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+          source.append("      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3]; \n");
+          source.append("  } \n");
+          source.append("  else if (op_type == 0)\n");
+          source.append("  { \n");
+          source.append("    for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+          source.append("      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3]; \n");
+          source.append("  } \n");
+          source.append("} \n");
+        }
+
+        //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+        // main kernel class
+        /** @brief Main kernel class for generating OpenCL kernels for elementwise operations other than addition and subtraction on/with viennacl::vector<>. */
+        template <class TYPE>
+        struct vector_element
+        {
+          static std::string program_name()
+          {
+            return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector_element";
+          }
+
+          static void init(viennacl::ocl::context & ctx)
+          {
+            viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx);
+            std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
+
+            static std::map<cl_context, bool> init_done;
+            if (!init_done[ctx.handle().get()])
+            {
+              std::string source;
+              source.reserve(8192);
+
+              viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
+
+              // unary operations
+              if (numeric_string == "float" || numeric_string == "double")
+              {
+                generate_vector_unary_element_ops(source, numeric_string, "acos");
+                generate_vector_unary_element_ops(source, numeric_string, "asin");
+                generate_vector_unary_element_ops(source, numeric_string, "atan");
+                generate_vector_unary_element_ops(source, numeric_string, "ceil");
+                generate_vector_unary_element_ops(source, numeric_string, "cos");
+                generate_vector_unary_element_ops(source, numeric_string, "cosh");
+                generate_vector_unary_element_ops(source, numeric_string, "exp");
+                generate_vector_unary_element_ops(source, numeric_string, "fabs");
+                generate_vector_unary_element_ops(source, numeric_string, "floor");
+                generate_vector_unary_element_ops(source, numeric_string, "log");
+                generate_vector_unary_element_ops(source, numeric_string, "log10");
+                generate_vector_unary_element_ops(source, numeric_string, "sin");
+                generate_vector_unary_element_ops(source, numeric_string, "sinh");
+                generate_vector_unary_element_ops(source, numeric_string, "sqrt");
+                generate_vector_unary_element_ops(source, numeric_string, "tan");
+                generate_vector_unary_element_ops(source, numeric_string, "tanh");
+              }
+              else
+              {
+                generate_vector_unary_element_ops(source, numeric_string, "abs");
+              }
+
+              // binary operations
+              generate_vector_binary_element_ops(source, numeric_string);
+
+              std::string prog_name = program_name();
+              #ifdef VIENNACL_BUILD_INFO
+              std::cout << "Creating program " << prog_name << std::endl;
+              #endif
+              ctx.add_program(source, prog_name);
+              init_done[ctx.handle().get()] = true;
+            } //if
+          } //init
+        };
+
+      }  // namespace kernels
+    }  // namespace opencl
+  }  // namespace linalg
+}  // namespace viennacl
+#endif
+
diff --git a/viennacl/linalg/opencl/matrix_operations.hpp b/viennacl/linalg/opencl/matrix_operations.hpp
new file mode 100644
index 0000000..0eb0e6b
--- /dev/null
+++ b/viennacl/linalg/opencl/matrix_operations.hpp
@@ -0,0 +1,998 @@
+#ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/opencl/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including matrix-vector products, using OpenCL.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/generator/generate.hpp"
+
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_element.hpp"
+
+#include "viennacl/linalg/opencl/kernels/matrix_prod.hpp"
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      //
+      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+      //
+
+      template <typename NumericT, typename F,
+                typename ScalarType1>
+      void am(matrix_base<NumericT, F> & mat1,
+              matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        typedef NumericT        value_type;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(),
+                                                   (viennacl::is_cpu_scalar<ScalarType1>::value ? "am_cpu" : "am_gpu"));
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                                cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                                cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                                cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                                cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                                options_alpha,
+                                viennacl::traits::opencl_handle(mat2),
+                                cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
+                                cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
+                                cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2))
+                                )
+                              );
+      }
+
+
+      template <typename NumericT, typename F,
+                typename ScalarType1, typename ScalarType2>
+      void ambm(matrix_base<NumericT, F> & mat1,
+                matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef NumericT        value_type;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        std::string kernel_name;
+        if      ( viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "ambm_cpu_cpu";
+        else if ( viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "ambm_cpu_gpu";
+        else if (!viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "ambm_gpu_cpu";
+        else
+          kernel_name = "ambm_gpu_gpu";
+
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                                cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                                cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                                cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                                cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                                options_alpha,
+                                viennacl::traits::opencl_handle(mat2),
+                                cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
+                                cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
+                                cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
+
+                                viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                                options_beta,
+                                viennacl::traits::opencl_handle(mat3),
+                                cl_uint(viennacl::traits::start1(mat3)),           cl_uint(viennacl::traits::start2(mat3)),
+                                cl_uint(viennacl::traits::stride1(mat3)),          cl_uint(viennacl::traits::stride2(mat3)),
+                                cl_uint(viennacl::traits::internal_size1(mat3)),   cl_uint(viennacl::traits::internal_size2(mat3))
+                                )
+                              );
+      }
+
+
+      /** @brief Dispatches the "ambm_m" OpenCL kernel for a three-matrix linear combination
+      *         (presumably mat1 op= alpha * mat2 + beta * mat3 -- exact semantics live in the
+      *         generated kernel source, not visible here; confirm against kernels::matrix).
+      *
+      * One of four kernel flavors is selected depending on whether alpha and beta are host
+      * (cpu) scalars or device (gpu) scalars, since the kernel signature differs.
+      *
+      * @param mat1             Result/accumulator matrix (also -range or -slice)
+      * @param mat2             First operand matrix
+      * @param alpha            Scaling factor for mat2 (host or device scalar)
+      * @param len_alpha        Length field packed into the kernel option word for alpha
+      * @param reciprocal_alpha If true, the kernel divides by alpha instead of multiplying
+      * @param flip_sign_alpha  If true, the kernel negates alpha
+      * @param mat3             Second operand matrix
+      * @param beta             Scaling factor for mat3 (host or device scalar)
+      * @param len_beta         Length field packed into the kernel option word for beta
+      * @param reciprocal_beta  If true, the kernel divides by beta instead of multiplying
+      * @param flip_sign_beta   If true, the kernel negates beta
+      */
+      template <typename NumericT, typename F,
+                typename ScalarType1, typename ScalarType2>
+      void ambm_m(matrix_base<NumericT, F> & mat1,
+                  matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        typedef NumericT        value_type;
+
+        // All three matrices are assumed to share mat1's OpenCL context; build the kernels there.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        // Pick the kernel variant matching the host/device placement of the two scalars.
+        std::string kernel_name;
+        if      ( viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "ambm_m_cpu_cpu";
+        else if ( viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "ambm_m_cpu_gpu";
+        else if (!viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "ambm_m_gpu_cpu";
+        else
+          kernel_name = "ambm_m_gpu_gpu";
+
+        // Pack (length, reciprocal, flip-sign) into one cl_uint per scalar for the kernel.
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
+        // Argument layout: mat1 handle + full layout (start/stride/size/internal_size per dimension),
+        // then (alpha, options, mat2 layout), then (beta, options, mat3 layout).
+        // Host scalars are promoted to the matrix value_type before being passed.
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                                cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                                cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                                cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                                cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                                options_alpha,
+                                viennacl::traits::opencl_handle(mat2),
+                                cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
+                                cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
+                                cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
+
+                                viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                                options_beta,
+                                viennacl::traits::opencl_handle(mat3),
+                                cl_uint(viennacl::traits::start1(mat3)),           cl_uint(viennacl::traits::start2(mat3)),
+                                cl_uint(viennacl::traits::stride1(mat3)),          cl_uint(viennacl::traits::stride2(mat3)),
+                                cl_uint(viennacl::traits::internal_size1(mat3)),   cl_uint(viennacl::traits::internal_size2(mat3))
+                                )
+                              );
+      }
+
+
+
+      /** @brief Assigns the scalar value s to every entry of the matrix via the "assign_cpu" kernel.
+      *
+      * @param mat    The matrix (or -range, or -slice) to fill
+      * @param s      The value written to each entry
+      * @param clear  If true, the *internal* (padded) sizes are passed to the kernel, so the
+      *               padding region is overwritten as well; otherwise only the logical
+      *               size1 x size2 region is touched.
+      */
+      template <typename NumericT, typename F>
+      void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
+      {
+        typedef NumericT        value_type;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        value_type alpha = static_cast<value_type>(s);
+
+        // Extent handed to the kernel: padded buffer extent when clearing, logical extent otherwise.
+        cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : cl_uint(viennacl::traits::size1(mat));
+        cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : cl_uint(viennacl::traits::size2(mat));
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "assign_cpu");
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                                 cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
+                                 cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
+                                 s1,                                               s2,
+                                 cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha))
+                                )
+                              );
+      }
+
+      /** @brief Assigns the scalar value s to the diagonal entries of the matrix
+      *         via the "diagonal_assign_cpu" kernel (off-diagonal entries are not passed
+      *         any replacement value here; the kernel body defines the exact behavior).
+      *
+      * @param mat  The matrix (or -range, or -slice)
+      * @param s    The value written to each diagonal entry
+      */
+      template <typename NumericT, typename F>
+      void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s)
+      {
+        typedef NumericT        value_type;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        value_type alpha = static_cast<value_type>(s);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "diagonal_assign_cpu");
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                                 cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
+                                 cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
+                                 cl_uint(viennacl::traits::size1(mat)),            cl_uint(viennacl::traits::size2(mat)),
+                                 cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha))
+                                )
+                              );
+      }
+
+      /** @brief Builds a (shifted-)diagonal matrix from a vector: mat is zeroed, then the
+      *         k-th diagonal is filled with the entries of vec.
+      *
+      * The diagonal is written by viewing it as a strided 1D sequence inside the matrix
+      * buffer and reusing the vector "av_cpu" kernel (i.e. diag = 1 * vec).
+      *
+      * @param vec  Source vector supplying the diagonal entries
+      * @param k    Diagonal offset: k == 0 main diagonal, k > 0 superdiagonal starting at
+      *             column k, k < 0 subdiagonal starting at row -k
+      * @param mat  Destination matrix (or -range, or -slice)
+      */
+      template <typename NumericT, typename F>
+      void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat)
+      {
+        // Step 1: set everything to zero
+        matrix_assign(mat, NumericT(0));
+
+        // Step 2: set the diagonal:
+
+        // reuse vector ambm kernel for assigning the elements:
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options_alpha = 0;
+        // Describe the diagonal of mat as a packed (start, stride, size, internal_size) vector.
+        viennacl::ocl::packed_cl_uint size_mat;
+        if (viennacl::is_row_major<F>::value)
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          // Row-major: linear index = row * internal_size2 + col; stepping one row and one
+          // column down the diagonal advances by stride1*internal_size2 + stride2.
+          size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                                    + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
+          size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+        else
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          // Column-major: linear index = row + col * internal_size1; the diagonal step is
+          // stride2*internal_size1 + stride1.
+          size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                                    + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+          size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+
+        viennacl::ocl::packed_cl_uint size_vec;
+        size_vec.start  = cl_uint(viennacl::traits::start(vec));
+        size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        size_vec.size   = cl_uint(viennacl::traits::size(vec));
+        size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+        // av_cpu computes dest = alpha * src with alpha = 1, i.e. a strided copy vec -> diagonal.
+        viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+        viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat),
+                                    size_mat,
+
+                                    viennacl::traits::opencl_handle(NumericT(1)),
+                                    options_alpha,
+                                    viennacl::traits::opencl_handle(vec),
+                                    size_vec)
+                              );
+      }
+
+      /** @brief Extracts the k-th diagonal of a matrix into a vector (inverse of
+      *         matrix_diag_from_vector): vec = diag_k(mat).
+      *
+      * The diagonal is read by viewing it as a strided 1D sequence inside the matrix
+      * buffer and reusing the vector "av_cpu" kernel (vec = 1 * diag).
+      *
+      * @param mat  Source matrix (or -range, or -slice)
+      * @param k    Diagonal offset: k == 0 main diagonal, k > 0 superdiagonal starting at
+      *             column k, k < 0 subdiagonal starting at row -k
+      * @param vec  Destination vector; its size determines how many entries are copied
+      */
+      template <typename NumericT, typename F>
+      void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec)
+      {
+        // reuse vector ambm kernel for assigning the elements:
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options_alpha = 0;
+        // Same diagonal-as-strided-vector layout computation as in matrix_diag_from_vector.
+        viennacl::ocl::packed_cl_uint size_mat;
+        if (viennacl::is_row_major<F>::value)
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                                    + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
+          size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+        else
+        {
+          vcl_size_t first_row_index = 0;
+          vcl_size_t first_col_index = 0;
+          if (k < 0)
+            first_row_index = vcl_size_t(-k);
+          else
+            first_col_index = vcl_size_t(k);
+          size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                                    + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+          size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+
+        viennacl::ocl::packed_cl_uint size_vec;
+        size_vec.start  = cl_uint(viennacl::traits::start(vec));
+        size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        size_vec.size   = cl_uint(viennacl::traits::size(vec));
+        size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+        // Note the argument order: vec is the destination here, mat (the diagonal view) the source.
+        viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+        viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                                    size_vec,
+
+                                    viennacl::traits::opencl_handle(NumericT(1)),
+                                    options_alpha,
+                                    viennacl::traits::opencl_handle(mat),
+                                    size_mat)
+                              );
+      }
+
+      /** @brief Copies row i of a matrix into a vector: vec = mat(i, :).
+      *
+      * The row is described as a strided 1D view of the matrix buffer and copied with the
+      * vector "av_cpu" kernel (vec = 1 * row).
+      *
+      * @param mat  Source matrix (or -range, or -slice)
+      * @param i    Zero-based row index (relative to the range/slice, scaled by stride1)
+      * @param vec  Destination vector; its size determines how many entries are copied
+      */
+      template <typename NumericT, typename F>
+      void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec)
+      {
+        // reuse vector ambm kernel for assigning the elements:
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options_alpha = 0;
+        viennacl::ocl::packed_cl_uint size_mat;
+        if (viennacl::is_row_major<F>::value)
+        {
+          // Row-major: a row is contiguous up to the column stride.
+          size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat));
+          size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+        else
+        {
+          // Column-major: consecutive row entries are internal_size1 apart (times column stride).
+          size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat));
+          size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+
+        viennacl::ocl::packed_cl_uint size_vec;
+        size_vec.start  = cl_uint(viennacl::traits::start(vec));
+        size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        size_vec.size   = cl_uint(viennacl::traits::size(vec));
+        size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+        viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+        viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                                    size_vec,
+
+                                    viennacl::traits::opencl_handle(NumericT(1)),
+                                    options_alpha,
+                                    viennacl::traits::opencl_handle(mat),
+                                    size_mat)
+                              );
+      }
+
+      /** @brief Copies column j of a matrix into a vector: vec = mat(:, j).
+      *
+      * The column is described as a strided 1D view of the matrix buffer and copied with
+      * the vector "av_cpu" kernel (vec = 1 * column).
+      *
+      * @param mat  Source matrix (or -range, or -slice)
+      * @param j    Zero-based column index (relative to the range/slice, scaled by stride2)
+      * @param vec  Destination vector; its size determines how many entries are copied
+      */
+      template <typename NumericT, typename F>
+      void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec)
+      {
+        // reuse vector ambm kernel for assigning the elements:
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+        KernelClass::init(ctx);
+
+        cl_uint options_alpha = 0;
+        viennacl::ocl::packed_cl_uint size_mat;
+        if (viennacl::is_row_major<F>::value)
+        {
+          size_mat.start  = cl_uint(viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat));
+          // NOTE(review): stepping down a column in row-major storage should advance by one
+          // *row*, i.e. stride1(mat) * internal_size2(mat); this uses stride2(mat) instead.
+          // Looks wrong for stride1 != stride2 -- verify against upstream ViennaCL fixes.
+          size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+        else
+        {
+          size_mat.start  = cl_uint(viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+          // NOTE(review): in column-major storage consecutive column entries are one row
+          // apart, i.e. stride1(mat); this uses stride2(mat). Same suspicion as above.
+          size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+          size_mat.size   = cl_uint(viennacl::traits::size(vec));
+          size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+        }
+
+        viennacl::ocl::packed_cl_uint size_vec;
+        size_vec.start  = cl_uint(viennacl::traits::start(vec));
+        size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        size_vec.size   = cl_uint(viennacl::traits::size(vec));
+        size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+        viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+        viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                                    size_vec,
+
+                                    viennacl::traits::opencl_handle(NumericT(1)),
+                                    options_alpha,
+                                    viennacl::traits::opencl_handle(mat),
+                                    size_mat)
+                              );
+      }
+
+
+      //
+      ///////////////////////// Element-wise operation //////////////////////////////////
+      //
+
+      // Binary operations A = B .* C and A = B ./ C
+      /** @brief Implementation of binary element-wise operations A = OP(B,C)
+      *
+      * The operation (product / division / power) is encoded as a cl_uint selector and
+      * dispatched to the single "element_op" kernel.
+      *
+      * @param A      The result matrix (or -range, or -slice)
+      * @param proxy  The proxy object holding B, C, and the operation
+      */
+      template <typename T, typename F, typename OP>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+      {
+        // All operands must live in the same OpenCL context; cross-context migration is unsupported.
+        assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<T, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "element_op");
+
+        // Default is power; product/division override it below.
+        cl_uint op_type = 2; //0: product, 1: division, 2: power
+        if (viennacl::is_division<OP>::value)
+          op_type = 1;
+        else if (viennacl::is_product<OP>::value)
+          op_type = 0;
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                                cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                                cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                                cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                                cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                                viennacl::traits::opencl_handle(proxy.lhs()),
+                                cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
+                                cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
+                                cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                viennacl::traits::opencl_handle(proxy.rhs()),
+                                cl_uint(viennacl::traits::start1(proxy.rhs())),           cl_uint(viennacl::traits::start2(proxy.rhs())),
+                                cl_uint(viennacl::traits::stride1(proxy.rhs())),          cl_uint(viennacl::traits::stride2(proxy.rhs())),
+                                cl_uint(viennacl::traits::internal_size1(proxy.rhs())),   cl_uint(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                op_type)
+                              );
+      }
+
+
+      // Unary operations
+
+      /** @brief Implementation of unary element-wise operations A = OP(B)
+      *
+      * The kernel name is derived from the operation tag via detail::op_to_string(OP())
+      * with an "_assign" suffix (e.g. element-wise sqrt, exp, ...), taken from the
+      * matrix_element kernel program.
+      *
+      * @param A      The result matrix (or -range, or -slice)
+      * @param proxy  The proxy object holding B and the operation
+      */
+      template <typename T, typename F, typename OP>
+      void element_op(matrix_base<T, F> & A,
+                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+      {
+        // NOTE(review): the assert messages say "Vectors" although these are matrices --
+        // message text copied from the vector implementation, presumably.
+        assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+        viennacl::linalg::opencl::kernels::matrix_element<T, F>::init(ctx);
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_element<T, F>::program_name(), detail::op_to_string(OP()) + "_assign");
+
+        // Only proxy.lhs() (the operand B) is forwarded to the kernel; for a unary op the
+        // expression's lhs and rhs refer to the same matrix.
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                                 cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                                 cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                                 cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                                 cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                                 viennacl::traits::opencl_handle(proxy.lhs()),
+                                 cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
+                                 cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
+                                 cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())))
+                              );
+      }
+
+
+      //
+      /////////////////////////   matrix-vector products /////////////////////////////////
+      //
+
+      // A * x
+
+      /** @brief Carries out matrix-vector multiplication
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      * Dispatches the "vec_mul" kernel with one local-memory scratch buffer of one
+      * value per work item.
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector (must not alias vec; see assert below)
+      */
+      template <typename NumericT, typename F>
+      void prod_impl(const matrix_base<NumericT, F> & mat,
+                     const vector_base<NumericT> & vec,
+                           vector_base<NumericT> & result)
+      {
+        typedef NumericT        value_type;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        // Dimension check: columns of mat must match length of vec.
+        assert(mat.size2() == vec.size());
+        // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
+        assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));
+        //result.resize(mat.size1());
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "vec_mul");
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                                cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)),
+                                cl_uint(viennacl::traits::stride1(mat)),        cl_uint(viennacl::traits::stride2(mat)),
+                                cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
+                                cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
+
+                                viennacl::traits::opencl_handle(vec),
+                                cl_uint(viennacl::traits::start(vec)),
+                                cl_uint(viennacl::traits::stride(vec)),
+                                cl_uint(viennacl::traits::size(vec)),
+
+                                viennacl::traits::opencl_handle(result),
+                                cl_uint(viennacl::traits::start(result)),
+                                cl_uint(viennacl::traits::stride(result)),
+                                cl_uint(viennacl::traits::size(result)),
+
+                                // Scratch buffer for the kernel's workgroup reduction.
+                                viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size())
+                              ) );
+      }
+
+
+      // trans(A) * x
+
+      /** @brief Carries out matrix-vector multiplication with a transposed matrix
+      *
+      * Implementation of the convenience expression result = trans(mat) * vec;
+      * The transpose is never materialized: the untransposed matrix (mat_trans.lhs())
+      * is handed to the dedicated "trans_vec_mul" kernel.
+      *
+      * @param mat_trans  The transposed matrix proxy
+      * @param vec        The vector
+      * @param result     The result vector (must not alias vec; see assert below)
+      */
+      template <typename NumericT, typename F>
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
+                     const vector_base<NumericT> & vec,
+                           vector_base<NumericT> & result)
+      {
+        assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)"));
+        assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)"));  //remember: mat is transposed!
+
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+
+        // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
+        assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "trans_vec_mul");
+
+        // All layout parameters refer to the *untransposed* matrix mat_trans.lhs();
+        // the kernel performs the transposition implicitly.
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat_trans.lhs()),
+                                cl_uint(viennacl::traits::start1(mat_trans.lhs())),         cl_uint(viennacl::traits::start2(mat_trans.lhs())),
+                                cl_uint(viennacl::traits::stride1(mat_trans.lhs())),        cl_uint(viennacl::traits::stride2(mat_trans.lhs())),
+                                cl_uint(viennacl::traits::size1(mat_trans.lhs())),          cl_uint(viennacl::traits::size2(mat_trans.lhs())),
+                                cl_uint(viennacl::traits::internal_size1(mat_trans.lhs())), cl_uint(viennacl::traits::internal_size2(mat_trans.lhs())),
+
+                                viennacl::traits::opencl_handle(vec),
+                                cl_uint(viennacl::traits::start(vec)),
+                                cl_uint(viennacl::traits::stride(vec)),
+                                cl_uint(viennacl::traits::size(vec)),
+
+                                viennacl::traits::opencl_handle(result),
+                                cl_uint(viennacl::traits::start(result)),
+                                cl_uint(viennacl::traits::stride(result)),
+                                cl_uint(viennacl::traits::size(result)),
+
+                                // Scratch buffer for the kernel's workgroup reduction.
+                                viennacl::ocl::local_mem(sizeof(NumericT) * k.local_work_size())
+                              ) );
+      }
+
+
+      //
+      /////////////////////////   matrix-matrix products /////////////////////////////////
+      //
+
+      namespace detail
+      {
+        // C = A * B and possibly transposed variants
+        template <typename T1, typename T2, typename T3, typename ScalarType >
+        void prod_slow_kernel(const T1 & A,
+                              const T2 & B,
+                              T3 & C,
+                              ScalarType alpha,
+                              ScalarType beta,
+                              std::string kernel_name)
+        {
+          typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
+          typedef typename viennacl::result_of::orientation_functor<T1>::type   orientation_A;
+          typedef typename viennacl::result_of::orientation_functor<T2>::type   orientation_B;
+          typedef typename viennacl::result_of::orientation_functor<T3>::type   orientation_C;
+
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+          typedef viennacl::linalg::opencl::kernels::matrix_prod<cpu_value_type, orientation_A, orientation_B, orientation_C>    KernelClass;
+          KernelClass::init(ctx);
+
+          //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
+          viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
+
+          k.global_work_size(0, viennacl::tools::align_to_multiple<unsigned int>(static_cast<unsigned int>(viennacl::traits::size1(C)), 16));
+          k.global_work_size(1, viennacl::tools::align_to_multiple<unsigned int>(static_cast<unsigned int>(viennacl::traits::size2(C)), 16));
+          k.local_work_size(0, 16);
+          k.local_work_size(1, 16);
+
+          cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
+          cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);
+
+          viennacl::ocl::enqueue(k(cl_alpha,
+                                  viennacl::traits::opencl_handle(A),
+                                  cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                                  cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                                  cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                                  cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                                  viennacl::traits::opencl_handle(B),
+                                  cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
+                                  cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
+                                  cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
+                                  cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
+
+                                  cl_beta,
+                                  viennacl::traits::opencl_handle(C),
+                                  cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
+                                  cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
+                                  cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
+                                  cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
+                                  )
+                                );
+        }
+
+        // C = A * B, using fast kernel for NVIDIA
+        //
+        // Enqueues the "fast" matrix-matrix product kernel named by kernel_name.
+        // The dispatcher below (detail::prod) only calls this when all matrix
+        // dimensions are multiples of 64, so the /4 divisions of the global
+        // work sizes below are exact.
+        template <typename T1, typename T2, typename T3, typename ScalarType >
+        void prod_fast_kernel(const T1 & A,
+                              const T2 & B,
+                              T3 & C,
+                              ScalarType alpha,
+                              ScalarType beta,
+                              std::string kernel_name)
+        {
+          typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
+          typedef typename viennacl::result_of::orientation_functor<T1>::type   orientation_A;
+          typedef typename viennacl::result_of::orientation_functor<T2>::type   orientation_B;
+          typedef typename viennacl::result_of::orientation_functor<T3>::type   orientation_C;
+
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+          // The kernel program is specialized on the value type and the row/column
+          // orientation of all three operands.
+          typedef viennacl::linalg::opencl::kernels::matrix_prod<cpu_value_type, orientation_A, orientation_B, orientation_C>    KernelClass;
+          KernelClass::init(ctx);
+
+          //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
+          viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
+
+          k.global_work_size(0, viennacl::traits::size2(C) / 4); //column blocks
+          k.global_work_size(1, viennacl::traits::size1(C) / 4); //row blocks
+          k.local_work_size(0, 16);  //columns
+          k.local_work_size(1, 4);   //rows
+
+          // Host-side scaling factors are converted to the kernel's scalar type.
+          cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
+          cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);
+
+          // Argument layout: alpha, A + layout (start/stride/size/internal_size
+          // per dimension), B + layout, beta, C + layout.
+          viennacl::ocl::enqueue(k(cl_alpha,
+                                  viennacl::traits::opencl_handle(A),
+                                  cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
+                                  cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
+                                  cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                                  cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+
+                                  viennacl::traits::opencl_handle(B),
+                                  cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
+                                  cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
+                                  cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
+                                  cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
+
+                                  cl_beta,
+                                  viennacl::traits::opencl_handle(C),
+                                  cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
+                                  cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
+                                  cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
+                                  cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
+                                  )
+                                );
+        }
+
+        // Dispatcher for C = alpha * prod(A, B) + beta * C: picks either the
+        // fast (NVIDIA-tuned) kernel or the generic "slow" kernel based on
+        // the operand dimensions.
+        template <typename T1, typename T2, typename T3, typename ScalarType >
+        void prod(const T1 & A,
+                  const T2 & B,
+                  T3 & C,
+                  ScalarType alpha,
+                  ScalarType beta,
+                  std::string fast_kernel_name,
+                  std::string slow_kernel_name)
+        {
+          // Small problems: kernel-launch overhead dominates, use the generic kernel.
+          if (   (viennacl::traits::size1(A) < 64)
+              || (viennacl::traits::size2(A) < 64)
+              || (viennacl::traits::size1(B) < 64)
+              || (viennacl::traits::size2(B) < 64) )   //there is most likely not enough to compute, rendering kernel launch overhead considerable
+          {
+            prod_slow_kernel(A, B, C, alpha, beta, slow_kernel_name);
+          }
+          // All dimensions divisible by 64: prod_fast_kernel's work-size
+          // computation (division by 4, 16x4 work groups) is valid.
+          else if (   (viennacl::traits::size1(A) % 64 == 0)
+                   && (viennacl::traits::size2(A) % 64 == 0)
+                   && (viennacl::traits::size1(B) % 64 == 0)
+                   && (viennacl::traits::size2(B) % 64 == 0) )   // allows the use of the fast NVIDIA kernel
+          {
+            prod_fast_kernel(A, B, C, alpha, beta, fast_kernel_name);
+            //prod_slow_kernel(A, B, C, slow_kernel_name);
+          }
+          else //TODO: use four kernels
+          {
+            prod_slow_kernel(A, B, C, alpha, beta, slow_kernel_name);
+          }
+
+        }
+      } // namespace detail
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(A, B); more precisely,
+      * C = alpha * prod(A, B) + beta * C (see the enqueued statement below).
+      *
+      * @param A      Left-hand side matrix operand
+      * @param B      Right-hand side matrix operand
+      * @param C      Result matrix (must not alias A or B, see note below)
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor applied to the previous contents of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const matrix_base<NumericT, F1> & A,
+                     const matrix_base<NumericT, F2> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)"));
+        assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(A, B): size2(A) != size1(B)"));
+        assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)"));
+
+        // "Not aligned": internal (padded) size is not a multiple of the matrix
+        // type's alignment; such operands cannot use the generator path below.
+        bool A_not_aligned = (A.internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.internal_size2()%matrix_base<NumericT, F1>::alignment>0);
+        bool B_not_aligned = (B.internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.internal_size2()%matrix_base<NumericT, F2>::alignment>0);
+        bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
+
+        // Non-default layout (misaligned padding, nonzero offsets, or strides > 1)
+        // falls back to the hand-written kernels; otherwise the kernel generator
+        // emits and enqueues a statement for C = alpha*A*B + beta*C.
+        if(A_not_aligned || A.start1() > 0 || A.start2() > 0 || A.stride1() > 1 || A.stride2() > 1
+         ||B_not_aligned || B.start1() > 0 || B.start2() > 0 || B.stride1() > 1 || B.stride2() > 1
+         ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
+          detail::prod(A, B, C, alpha, beta, "prod16_AA", "prod_AA");
+        else{
+          typedef matrix_expression<const matrix_base<NumericT, F1>, const matrix_base<NumericT, F2>, op_mat_mat_prod> ProdType;
+          viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
+        }
+      }
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(trans(A), B); more precisely,
+      * C = alpha * prod(trans(A), B) + beta * C.
+      *
+      * @param A      Transposed left-hand side operand (wrapped in an op_trans expression)
+      * @param B      Right-hand side matrix operand
+      * @param C      Result matrix (must not alias A or B, see note below)
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor applied to the previous contents of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
+                                                        const matrix_base<NumericT, F1>,
+                                                        op_trans> & A,
+                     const matrix_base<NumericT, F2> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
+        //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
+        // Size checks use A.lhs() (the untransposed matrix), hence size2/size1 swap.
+        assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)"));
+        assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)"));
+        assert( (viennacl::traits::size2(B)       == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)"));
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
+
+        // "Not aligned": padded size not a multiple of the type's alignment.
+        bool A_not_aligned = (A.lhs().internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.lhs().internal_size2()%matrix_base<NumericT, F1>::alignment>0);
+        bool B_not_aligned = (B.internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.internal_size2()%matrix_base<NumericT, F2>::alignment>0);
+        bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
+
+
+        // Non-default layout falls back to the hand-written "TA" kernels;
+        // otherwise the kernel generator handles the statement.
+        if(A_not_aligned || A.lhs().start1() > 0 || A.lhs().start2() > 0 || A.lhs().stride1() > 1 || A.lhs().stride2() > 1
+         ||B_not_aligned || B.start1() > 0 || B.start2() > 0 || B.stride1() > 1 || B.stride2() > 1
+         ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
+          detail::prod(A.lhs(), B, C, alpha, beta, "prod16_TA", "prod_TA");
+        else{
+          typedef const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> LhsType;
+          typedef matrix_expression<LhsType, const matrix_base<NumericT, F2>, op_mat_mat_prod> ProdType;
+          viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
+        }
+      }
+
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(A, trans(B)); more precisely,
+      * C = alpha * prod(A, trans(B)) + beta * C.
+      *
+      * @param A      Left-hand side matrix operand
+      * @param B      Transposed right-hand side operand (wrapped in an op_trans expression)
+      * @param C      Result matrix (must not alias A or B, see note below)
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor applied to the previous contents of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const matrix_base<NumericT, F1> & A,
+                     const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                           matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        // Size checks use B.lhs() (the untransposed matrix), hence size1/size2 swap.
+        assert( (viennacl::traits::size1(A)       == viennacl::traits::size1(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)"));
+        assert( (viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)"));
+        assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)"));
+
+        // "Not aligned": padded size not a multiple of the type's alignment.
+        bool A_not_aligned = (A.internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.internal_size2()%matrix_base<NumericT, F1>::alignment>0);
+        bool B_not_aligned = (B.lhs().internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.lhs().internal_size2()%matrix_base<NumericT, F2>::alignment>0);
+        bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
+
+        // Non-default layout falls back to the hand-written "AT" kernels;
+        // otherwise the kernel generator handles the statement.
+        if(A_not_aligned || A.start1() > 0 || A.start2() > 0 || A.stride1() > 1 || A.stride2() > 1
+         ||B_not_aligned || B.lhs().start1() > 0 || B.lhs().start2() > 0 || B.lhs().stride1() > 1 || B.lhs().stride2() > 1
+         ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
+          detail::prod(A, B.lhs(), C, alpha, beta, "prod16_AT", "prod_AT");
+        else{
+          typedef const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> RhsType;
+          typedef matrix_expression<const matrix_base<NumericT, F1>, RhsType, op_mat_mat_prod> ProdType;
+          viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
+        }
+      }
+
+
+
+      /** @brief Carries out matrix-matrix multiplication
+      *
+      * Implementation of C = prod(trans(A), trans(B)); more precisely,
+      * C = alpha * prod(trans(A), trans(B)) + beta * C.
+      *
+      * @param A      Transposed left-hand side operand (wrapped in an op_trans expression)
+      * @param B      Transposed right-hand side operand (wrapped in an op_trans expression)
+      * @param C      Result matrix (must not alias A or B, see note below)
+      * @param alpha  Scaling factor applied to the product
+      * @param beta   Scaling factor applied to the previous contents of C
+      */
+      template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
+      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
+                     const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
+                     matrix_base<NumericT, F3> & C,
+                     ScalarType alpha,
+                     ScalarType beta)
+      {
+        // Size checks use the untransposed matrices A.lhs() / B.lhs().
+        assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
+        assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
+        assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
+
+        // "Not aligned": padded size not a multiple of the type's alignment.
+        bool A_not_aligned = (A.lhs().internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.lhs().internal_size2()%matrix_base<NumericT, F1>::alignment>0);
+        bool B_not_aligned = (B.lhs().internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.lhs().internal_size2()%matrix_base<NumericT, F2>::alignment>0);
+        bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
+
+        // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
+        /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
+              && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
+              && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
+
+        // Non-default layout falls back to the hand-written "TT" kernels;
+        // otherwise the kernel generator handles the statement.
+        if(A_not_aligned || A.lhs().start1() > 0 || A.lhs().start2() > 0 || A.lhs().stride1() > 1 || A.lhs().stride2() > 1
+         ||B_not_aligned || B.lhs().start1() > 0 || B.lhs().start2() > 0 || B.lhs().stride1() > 1 || B.lhs().stride2() > 1
+         ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
+          detail::prod(A.lhs(), B.lhs(), C, alpha, beta, "prod16_TT", "prod_TT");
+        else{
+          typedef const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> LhsType;
+          typedef const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> RhsType;
+          typedef matrix_expression<LhsType, RhsType, op_mat_mat_prod> ProdType;
+          viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
+        }
+      }
+
+
+
+
+      //
+      /////////////////////////   miscellaneous operations /////////////////////////////////
+      //
+
+
+      /** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+      *
+      * Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+      *
+      * @param mat1    The matrix to be updated
+      * @param alpha            The scaling factor (either a viennacl::scalar<>, float, or double)
+      * @param len_alpha        Length of the buffer for an eventual final reduction step (currently always '1')
+      * @param reciprocal_alpha Use 1/alpha instead of alpha
+      * @param flip_sign_alpha  Use -alpha instead of alpha
+      * @param vec1    The first vector
+      * @param vec2    The second vector
+      */
+      template <typename NumericT, typename F, typename S1>
+      void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
+                                S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                                const vector_base<NumericT> & vec1,
+                                const vector_base<NumericT> & vec2)
+      {
+        assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
+        assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
+        typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
+        KernelClass::init(ctx);
+
+        // Encode len_alpha / reciprocal / flip-sign into the kernel's option bitfield.
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // Select the kernel variant matching alpha's location (host scalar vs. device scalar).
+        viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), viennacl::is_cpu_scalar<S1>::value ? "scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");
+
+        // Argument layout: mat1 + layout (start/stride/size/internal_size per
+        // dimension), alpha + options, then vec1 and vec2 each with
+        // start/stride/size.
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                                 cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
+                                 cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
+                                 cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                                 cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                                 options_alpha,
+
+                                 viennacl::traits::opencl_handle(vec1),
+                                 cl_uint(viennacl::traits::start(vec1)),
+                                 cl_uint(viennacl::traits::stride(vec1)),
+                                 cl_uint(viennacl::traits::size(vec1)),
+
+                                 viennacl::traits::opencl_handle(vec2),
+                                 cl_uint(viennacl::traits::start(vec2)),
+                                 cl_uint(viennacl::traits::stride(vec2)),
+                                 cl_uint(viennacl::traits::size(vec2))
+                                )
+                              );
+      }
+
+    } // namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/opencl/misc_operations.hpp b/viennacl/linalg/opencl/misc_operations.hpp
new file mode 100644
index 0000000..3b87e51
--- /dev/null
+++ b/viennacl/linalg/opencl/misc_operations.hpp
@@ -0,0 +1,72 @@
+#ifndef VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/misc_operations.hpp
+    @brief Implementations of operations using compressed_matrix and OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+
+      namespace detail
+      {
+
+        /** @brief Enqueues the "level_scheduling_substitute" kernel for one level of a level-scheduled triangular substitution (used by the ILU preconditioners).
+        *
+        * @param vec             The vector being updated in place
+        * @param row_index_array Buffer of row indices processed in this level
+        * @param row_buffer      CSR row pointer buffer of the factor matrix
+        * @param col_buffer      CSR column index buffer of the factor matrix
+        * @param element_buffer  CSR value buffer of the factor matrix
+        * @param num_rows        Number of rows handled in this launch
+        *
+        * NOTE(review): the actual update formula lives in kernels/ilu.hpp and is
+        * not visible here; buffer roles above follow CSR naming conventions and
+        * should be confirmed against that kernel source.
+        */
+        template <typename ScalarType>
+        void level_scheduling_substitute(vector<ScalarType> & vec,
+                                     viennacl::backend::mem_handle const & row_index_array,
+                                     viennacl::backend::mem_handle const & row_buffer,
+                                     viennacl::backend::mem_handle const & col_buffer,
+                                     viennacl::backend::mem_handle const & element_buffer,
+                                     vcl_size_t num_rows
+                                    )
+        {
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+          viennacl::linalg::opencl::kernels::ilu<ScalarType>::init(ctx);
+          viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<ScalarType>::program_name(), "level_scheduling_substitute");
+
+          viennacl::ocl::enqueue(k(row_index_array.opencl_handle(), row_buffer.opencl_handle(), col_buffer.opencl_handle(), element_buffer.opencl_handle(),
+                                   vec,
+                                   static_cast<cl_uint>(num_rows)));
+        }
+
+      } //namespace detail
+
+
+    } // namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/opencl/scalar_operations.hpp b/viennacl/linalg/opencl/scalar_operations.hpp
new file mode 100644
index 0000000..2408feb
--- /dev/null
+++ b/viennacl/linalg/opencl/scalar_operations.hpp
@@ -0,0 +1,201 @@
+#ifndef VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/scalar_operations.hpp
+    @brief Implementations of scalar operations using OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/scalar.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      /** @brief Scalar operation s1 = alpha * s2 (kernel "as_cpu"/"as_gpu"; the kernel body lives in kernels/scalar.hpp).
+      *
+      * @param s1               The GPU scalar being assigned
+      * @param s2               The GPU scalar operand
+      * @param alpha            Scaling factor (host scalar or viennacl::scalar<>)
+      * @param len_alpha        Length of an eventual reduction buffer for alpha (encoded into the kernel options)
+      * @param reciprocal_alpha Use 1/alpha instead of alpha
+      * @param flip_sign_alpha  Use -alpha instead of alpha
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                  >::type
+      as(S1 & s1,
+         S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+        viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // Kernel variant depends on whether alpha is a host or a device scalar.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
+                                                   (viennacl::is_cpu_scalar<ScalarType1>::value ? "as_cpu" : "as_gpu"));
+        // Single work item: the operation touches one scalar only.
+        k.local_work_size(0, 1);
+        k.global_work_size(0, 1);
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                                 options_alpha,
+                                 viennacl::traits::opencl_handle(s2) )
+                              );
+      }
+
+
+      /** @brief Scalar operation s1 = alpha * s2 + beta * s3 (kernel family "asbs_*"; the kernel bodies live in kernels/scalar.hpp).
+      *
+      * The kernel variant is chosen by whether alpha and beta are host (cpu)
+      * or device (gpu) scalars; reciprocal/flip-sign handling is encoded into
+      * the option bitfields.
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1,
+                typename S3, typename ScalarType2>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_scalar<S3>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                    && viennacl::is_any_scalar<ScalarType2>::value
+                                  >::type
+      asbs(S1 & s1,
+           S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+           S3 const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+        assert( &viennacl::traits::opencl_handle(s2).context() == &viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the same OpenCL context!"));
+
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+        viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+        // Pick the kernel matching (alpha location) x (beta location).
+        std::string kernel_name;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "asbs_cpu_cpu";
+        else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "asbs_cpu_gpu";
+        else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "asbs_gpu_cpu";
+        else
+          kernel_name = "asbs_gpu_gpu";
+
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), kernel_name);
+        // Single work item: the operation touches one scalar only.
+        k.local_work_size(0, 1);
+        k.global_work_size(0, 1);
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                                 options_alpha,
+                                 viennacl::traits::opencl_handle(s2),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                                 options_beta,
+                                 viennacl::traits::opencl_handle(s3) )
+                              );
+      }
+
+
+      /** @brief Accumulating variant of asbs, using the "asbs_s_*" kernels (presumably s1 += alpha * s2 + beta * s3 — the kernel body lives in kernels/scalar.hpp; confirm there).
+      *
+      * Structure is identical to asbs() above except for the kernel names.
+      */
+      template <typename S1,
+                typename S2, typename ScalarType1,
+                typename S3, typename ScalarType2>
+      typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                    && viennacl::is_scalar<S3>::value
+                                    && viennacl::is_any_scalar<ScalarType1>::value
+                                    && viennacl::is_any_scalar<ScalarType2>::value
+                                  >::type
+      asbs_s(S1 & s1,
+             S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+             S3 const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+        assert( &viennacl::traits::opencl_handle(s2).context() == &viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the same OpenCL context!"));
+
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+        viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+        // Pick the kernel matching (alpha location) x (beta location).
+        std::string kernel_name;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "asbs_s_cpu_cpu";
+        else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "asbs_s_cpu_gpu";
+        else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "asbs_s_gpu_cpu";
+        else
+          kernel_name = "asbs_s_gpu_gpu";
+
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), kernel_name);
+        // Single work item: the operation touches one scalar only.
+        k.local_work_size(0, 1);
+        k.global_work_size(0, 1);
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                                 options_alpha,
+                                 viennacl::traits::opencl_handle(s2),
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                                 options_beta,
+                                 viennacl::traits::opencl_handle(s3) )
+                              );
+      }
+
+
+      /** @brief Swaps the contents of two scalars, data is copied
+      *
+      * Both scalars must live in the same OpenCL context (asserted below).
+      *
+      * @param s1   The first scalar
+      * @param s2   The second scalar
+      */
+      template <typename S1, typename S2>
+      typename viennacl::enable_if<    viennacl::is_scalar<S1>::value
+                                    && viennacl::is_scalar<S2>::value
+                                  >::type
+      swap(S1 & s1, S2 & s2)
+      {
+        assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+
+        typedef typename viennacl::result_of::cpu_value_type<S1>::type        value_type;
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+        viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), "swap");
+        // Single work item: only one pair of scalars is exchanged.
+        k.local_work_size(0, 1);
+        k.global_work_size(0, 1);
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                                 viennacl::traits::opencl_handle(s2))
+                              );
+      }
+
+
+
+    } //namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/opencl/sparse_matrix_operations.hpp b/viennacl/linalg/opencl/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..13efd33
--- /dev/null
+++ b/viennacl/linalg/opencl/sparse_matrix_operations.hpp
@@ -0,0 +1,940 @@
+#ifndef VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices and OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/compressed_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/coordinate_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/ell_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/hyb_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+
+      //
+      // Compressed matrix
+      //
+
+      namespace detail
+      {
+        /** @brief Extracts per-row information (e.g. the diagonal entry, cf. SPARSE_ROW_DIAGONAL usage below) from a CSR matrix into a vector.
+        *
+        * @param mat            The compressed (CSR) sparse matrix to inspect
+        * @param vec            Output vector; entry i receives the requested quantity for row i
+        * @param info_selector  Selects which per-row quantity the kernel computes (forwarded as cl_uint)
+        */
+        template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+        void row_info(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & mat,
+                      vector_base<SCALARTYPE> & vec,
+                      viennacl::linalg::detail::row_info_types info_selector)
+        {
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+          viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+          viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "row_info_extractor");
+
+          // handle1()/handle2()/handle() are the three CSR buffers of the matrix
+          viennacl::ocl::enqueue(row_info_kernel(mat.handle1().opencl_handle(), mat.handle2().opencl_handle(), mat.handle().opencl_handle(),
+                                                 viennacl::traits::opencl_handle(vec),
+                                                 cl_uint(mat.size1()),
+                                                 cl_uint(info_selector)
+                                                )
+                                );
+        }
+      }
+
+      /** @brief Carries out matrix-vector multiplication with a compressed_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class TYPE, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & mat,
+                     const viennacl::vector_base<TYPE> & vec,
+                           viennacl::vector_base<TYPE> & result)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::init(ctx);
+        // select the kernel variant matching the memory alignment: "vec_mul", "vec_mul4" or "vec_mul8"
+        std::stringstream ss;
+        ss << "vec_mul";
+        if (ALIGNMENT == 4)
+          ss << "4";
+        if (ALIGNMENT == 8)
+          ss << "8";
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::program_name(), ss.str());
+
+        // start/stride/size/internal_size of each vector, packed into a single kernel argument
+        viennacl::ocl::packed_cl_uint layout_vec;
+        layout_vec.start  = cl_uint(viennacl::traits::start(vec));
+        layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        layout_vec.size   = cl_uint(viennacl::traits::size(vec));
+        layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+        viennacl::ocl::packed_cl_uint layout_result;
+        layout_result.start  = cl_uint(viennacl::traits::start(result));
+        layout_result.stride = cl_uint(viennacl::traits::stride(result));
+        layout_result.size   = cl_uint(viennacl::traits::size(result));
+        layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
+
+        viennacl::ocl::enqueue(k(mat.handle1().opencl_handle(), mat.handle2().opencl_handle(), mat.handle().opencl_handle(),
+                                vec, layout_vec,
+                                result, layout_result
+                                ));
+      }
+
+
+      /** @brief Carries out sparse_matrix-matrix multiplication first matrix being compressed
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      *
+      * @param sp_mat     The sparse matrix
+      * @param d_mat      The dense matrix
+      * @param result     The result matrix
+      */
+      template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<TYPE, F1> & d_mat,
+                           viennacl::matrix_base<TYPE, F2> & result) {
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::init(ctx);
+        // kernel name encodes: no transposition (false) and the row/column-major layouts of d_mat and result
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        // dense operands are passed as buffer + full (start, stride, size, internal_size) description per dimension
+        viennacl::ocl::enqueue(k(sp_mat.handle1().opencl_handle(), sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(d_mat),
+                                 cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
+                                 cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
+                                 cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)) ));
+      }
+
+      /** @brief Carries out matrix-trans(matrix) multiplication first matrix being compressed
+      *          and the second transposed
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      *
+      * @param sp_mat             The sparse matrix
+      * @param d_mat              The transposed dense matrix
+      * @param result             The result matrix
+      */
+      template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>,
+                                                        const viennacl::matrix_base<TYPE, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                      viennacl::matrix_base<TYPE, F2> & result) {
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::init(ctx);
+        // first flag 'true' selects the kernel that performs the transposition of the dense factor itself,
+        // hence only the untransposed operand d_mat.lhs() is passed below
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        viennacl::ocl::enqueue(k(sp_mat.handle1().opencl_handle(), sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(d_mat.lhs()),
+                                 cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)) ) );
+      }
+
+
+
+      // triangular solvers
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * @param L    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::unit_lower_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "unit_lu_forward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size())
+        viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(L.size1())
+                                )
+                              );
+      }
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * @param L    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::lower_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+
+        // non-unit variant: kernel "lu_forward" (reads diagonal entries from the matrix itself)
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "lu_forward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(L.size1())
+                                )
+                              );
+      }
+
+
+      /** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * @param U    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & U,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::unit_upper_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "unit_lu_backward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(U.size1())
+                                )
+                              );
+      }
+
+      /** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * @param U    The matrix
+      * @param vec  The vector holding the right hand side. Is overwritten by the solution.
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & U,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::upper_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+
+        // non-unit variant: kernel "lu_backward" (reads diagonal entries from the matrix itself)
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "lu_backward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(U.size1())
+                                )
+                              );
+      }
+
+
+
+
+
+      // transposed triangular solvers
+
+      namespace detail
+      {
+        //
+        // block solves
+        //
+        /** @brief Block-wise transposed unit-lower triangular solve: each work group processes one block.
+        *
+        * @param L              Transposed matrix proxy; the untransposed CSR operand is L.lhs()
+        * @param block_indices  Buffer describing the per-block index ranges
+        * @param num_blocks     Number of blocks (one work group is launched per block)
+        * @param vec            Right hand side, overwritten by the solution
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & L,
+                                 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                                 vector_base<ScalarType> const & /* L_diagonal */,  //ignored
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::unit_lower_tag)
+        {
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L.lhs()).context());
+          viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::init(ctx);
+          viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::program_name(), "block_trans_unit_lu_forward");
+          // one work group per block
+          block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
+
+          viennacl::ocl::enqueue(block_solve_kernel(L.lhs().handle1().opencl_handle(),
+                                                    L.lhs().handle2().opencl_handle(),
+                                                    L.lhs().handle().opencl_handle(),
+                                                    block_indices.opencl_handle(),
+                                                    vec,
+                                                    static_cast<cl_uint>(vec.size())));
+        }
+
+
+        /** @brief Block-wise transposed upper triangular solve; unlike the unit-lower variant,
+        *         the precomputed diagonal U_diagonal is passed to the kernel (non-unit diagonal).
+        */
+        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
+        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
+                                                         op_trans> & U,
+                                 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+                                 vector_base<ScalarType> const & U_diagonal,
+                                 vector_base<ScalarType> & vec,
+                                 viennacl::linalg::upper_tag)
+        {
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U.lhs()).context());
+          viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::init(ctx);
+          viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::program_name(), "block_trans_lu_backward");
+          // one work group per block
+          block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
+
+          viennacl::ocl::enqueue(block_solve_kernel(U.lhs().handle1().opencl_handle(),
+                                                    U.lhs().handle2().opencl_handle(),
+                                                    U.lhs().handle().opencl_handle(),
+                                                    U_diagonal,
+                                                    block_indices.opencl_handle(),
+                                                    vec,
+                                                    static_cast<cl_uint>(vec.size())));
+        }
+
+
+      }
+
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * @param proxy_L  The transposed matrix proxy
+      * @param vec      The vector
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy_L,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::unit_lower_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+        // "trans_" kernel variant: operates on the untransposed CSR data proxy_L.lhs()
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_unit_lu_forward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(proxy_L.lhs().size1())
+                                )
+                              );
+      }
+
+
+      /** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * @param proxy_L  The transposed matrix proxy
+      * @param vec      The vector
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy_L,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::lower_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+
+        // non-unit solve: precompute the diagonal entries once and hand them to the kernel
+        viennacl::vector<SCALARTYPE> diagonal(vec.size());
+        detail::row_info(proxy_L.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_lu_forward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(diagonal),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(proxy_L.lhs().size1())
+                                )
+                              );
+      }
+
+      /** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+      *
+      * @param proxy_U  The transposed matrix proxy
+      * @param vec      The vector
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy_U,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::unit_upper_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+        // "trans_" kernel variant: operates on the untransposed CSR data proxy_U.lhs()
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_unit_lu_backward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(proxy_U.lhs().size1())
+                                )
+                              );
+      }
+
+
+      /** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+      *
+      * @param proxy_U  The transposed matrix proxy
+      * @param vec      The vector
+      */
+      template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+      void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
+                                            op_trans> const & proxy_U,
+                         vector_base<SCALARTYPE> & vec,
+                         viennacl::linalg::upper_tag)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
+        viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
+
+        // non-unit solve: precompute the diagonal entries once and hand them to the kernel
+        viennacl::vector<SCALARTYPE> diagonal(vec.size());
+        detail::row_info(proxy_U.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_lu_backward");
+
+        // global size == local size: the substitution kernel runs as a single work group of 128 threads
+        k.local_work_size(0, 128);
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(diagonal),
+                                 viennacl::traits::opencl_handle(vec),
+                                 cl_uint(proxy_U.lhs().size1())
+                                )
+                              );
+      }
+
+
+      //
+      // Compressed Compressed matrix
+      //
+
+      /** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class TYPE>
+      void prod_impl(const viennacl::compressed_compressed_matrix<TYPE> & mat,
+                     const viennacl::vector_base<TYPE> & vec,
+                           viennacl::vector_base<TYPE> & result)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::compressed_compressed_matrix<TYPE>::init(ctx);
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_compressed_matrix<TYPE>::program_name(), "vec_mul");
+
+        // clear up-front: the kernel presumably writes only the nnz1() rows present in the
+        // doubly-compressed storage, leaving the remaining rows untouched — hence they must be zeroed here
+        result.clear();
+
+        // start/stride/size/internal_size of each vector, packed into a single kernel argument
+        viennacl::ocl::packed_cl_uint layout_vec;
+        layout_vec.start  = cl_uint(viennacl::traits::start(vec));
+        layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        layout_vec.size   = cl_uint(viennacl::traits::size(vec));
+        layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+        viennacl::ocl::packed_cl_uint layout_result;
+        layout_result.start  = cl_uint(viennacl::traits::start(result));
+        layout_result.stride = cl_uint(viennacl::traits::stride(result));
+        layout_result.size   = cl_uint(viennacl::traits::size(result));
+        layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
+
+        viennacl::ocl::enqueue(k(mat.handle1().opencl_handle(), mat.handle3().opencl_handle(), mat.handle2().opencl_handle(), mat.handle().opencl_handle(), cl_uint(mat.nnz1()),
+                                 vec, layout_vec,
+                                 result, layout_result
+                                ));
+      }
+
+
+      //
+      // Coordinate matrix
+      //
+
+      namespace detail
+      {
+        /** @brief Extracts per-row information (selected by info_selector) from a COO matrix into a vector.
+        *
+        * @param mat            The coordinate-format sparse matrix
+        * @param vec            Output vector receiving the per-row quantity
+        * @param info_selector  Selects which quantity the kernel computes (forwarded as cl_uint)
+        */
+        template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
+        void row_info(coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT> const & mat,
+                      vector_base<SCALARTYPE> & vec,
+                      viennacl::linalg::detail::row_info_types info_selector)
+        {
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+          viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::init(ctx);
+          viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::program_name(), "row_info_extractor");
+          unsigned int thread_num = 256; //k.local_work_size(0);
+
+          row_info_kernel.local_work_size(0, thread_num);
+
+          row_info_kernel.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+          // last two arguments: per-work-group scratch space (indices and partial values) in local memory
+          viennacl::ocl::enqueue(row_info_kernel(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
+                                                 viennacl::traits::opencl_handle(vec),
+                                                 cl_uint(info_selector),
+                                                 viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                                                 viennacl::ocl::local_mem(sizeof(SCALARTYPE)*thread_num)) );
+        }
+      }
+
+      /** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The matrix
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class SCALARTYPE, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & mat,
+                     const viennacl::vector_base<SCALARTYPE> & vec,
+                           viennacl::vector_base<SCALARTYPE> & result)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::init(ctx);
+
+        // COO kernel accumulates into result, so it must start from zero
+        result.clear();
+
+        // start/stride/size/internal_size of each vector, packed into a single kernel argument
+        viennacl::ocl::packed_cl_uint layout_vec;
+        layout_vec.start  = cl_uint(viennacl::traits::start(vec));
+        layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        layout_vec.size   = cl_uint(viennacl::traits::size(vec));
+        layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+        viennacl::ocl::packed_cl_uint layout_result;
+        layout_result.start  = cl_uint(viennacl::traits::start(result));
+        layout_result.stride = cl_uint(viennacl::traits::stride(result));
+        layout_result.size   = cl_uint(viennacl::traits::size(result));
+        layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
+
+        //std::cout << "prod(coordinate_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::program_name(), "vec_mul");
+        unsigned int thread_num = 256; //k.local_work_size(0);
+
+        k.local_work_size(0, thread_num)
+
+        k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+        //k.global_work_size(0, thread_num);  //Only one work group
+        // last two arguments: per-work-group scratch space (indices and partial values) in local memory
+        viennacl::ocl::enqueue(k(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 layout_vec,
+                                 viennacl::traits::opencl_handle(result),
+                                 layout_result,
+                                 viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+                                 viennacl::ocl::local_mem(sizeof(SCALARTYPE)*thread_num)) );
+
+      }
+
+
+      /** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
+      *
+      * Implementation of the convenience expression result = prod(A, B); with A being sparse (COO) and B being dense
+      *
+      * @param mat    The sparse matrix (COO format)
+      * @param d_mat  The dense matrix
+      * @param result The result matrix (dense)
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        // Make sure the COO kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+        // Kernel name encodes (transposed B?, row-major B?, row-major result?); 'false' = B is not transposed here.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        result.clear();  // zero the output first; presumably the kernel accumulates partial sums into it — the explicit clear implies this
+
+        unsigned int thread_num = 256; //k.local_work_size(0);
+        k.local_work_size(0, thread_num);
+        k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+        // Kernel arguments: the three COO buffers (packed row/col indices, values, group boundaries),
+        // then B with its full start/stride/size/internal-size layout, then the result with its layout,
+        // then two scratch buffers in local memory (one cl_uint and one NumericT slot per work item).
+        viennacl::ocl::enqueue(k(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
+                                 viennacl::traits::opencl_handle(d_mat),
+                                 cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
+                                 cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
+                                 cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
+                                 viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),   // same value as thread_num set above
+                                 viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
+
+      }
+
+      /** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
+      *
+      * Implementation of the convenience expression result = prod(A, trans(B)); with A being sparse (COO) and B being dense
+      *
+      * @param mat    The sparse matrix (COO format)
+      * @param d_mat  The transposed dense matrix proxy trans(B)
+      * @param result The result matrix (dense)
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result)
+      {
+        // Make sure the COO kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+        // 'true' selects the kernel variant that reads B in transposed fashion.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        result.clear();  // zero the output first; presumably the kernel accumulates partial sums into it
+
+        unsigned int thread_num = 256; //k.local_work_size(0);
+        k.local_work_size(0, thread_num);
+        k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+        // Pass the underlying (non-transposed) matrix B = d_mat.lhs(): the layout arguments below
+        // are all taken from d_mat.lhs(), and the ELL/HYB transposed variants pass the lhs buffer
+        // the same way; the kernel itself performs the transposed access.
+        viennacl::ocl::enqueue(k(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
+                                 viennacl::traits::opencl_handle(d_mat.lhs()),
+                                 cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
+                                 viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
+                                 viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
+
+      }
+
+
+      //
+      // ELL Matrix
+      //
+
+      /** @brief Carries out matrix-vector multiplication with an ell_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The sparse matrix (ELL format)
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class TYPE, unsigned int ALIGNMENT>
+      void prod_impl( const viennacl::ell_matrix<TYPE, ALIGNMENT> & mat,
+                      const viennacl::vector_base<TYPE> & vec,
+                      viennacl::vector_base<TYPE> & result)
+      {
+        assert(mat.size1() == result.size());
+        assert(mat.size2() == vec.size());
+
+        // Make sure the ELL kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::ell_matrix<TYPE>::init(ctx);
+        result.clear();
+
+        // Pack start/stride/size/internal_size so the kernel can address sub-vectors/slices.
+        viennacl::ocl::packed_cl_uint layout_vec;
+        layout_vec.start  = cl_uint(viennacl::traits::start(vec));
+        layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        layout_vec.size   = cl_uint(viennacl::traits::size(vec));
+        layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+        viennacl::ocl::packed_cl_uint layout_result;
+        layout_result.start  = cl_uint(viennacl::traits::start(result));
+        layout_result.stride = cl_uint(viennacl::traits::stride(result));
+        layout_result.size   = cl_uint(viennacl::traits::size(result));
+        layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
+
+        // Kernel name is fixed. (A stringstream formerly assembled "vec_mul_1" here but was
+        // never used — dead code, removed.)
+        viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<TYPE>::program_name(), "vec_mul");
+
+        // Hard-coded launch configuration: 256 groups of 128 work items.
+        unsigned int thread_num = 128;
+        unsigned int group_num = 256;
+
+        k.local_work_size(0, thread_num);
+        k.global_work_size(0, thread_num * group_num);
+
+        // Arguments: column-index buffer, value buffer, input/output vectors with layouts,
+        // then the logical and padded ELL dimensions.
+        viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
+                                 mat.handle().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 layout_vec,
+                                 viennacl::traits::opencl_handle(result),
+                                 layout_result,
+                                 cl_uint(mat.size1()),
+                                 cl_uint(mat.size2()),
+                                 cl_uint(mat.internal_size1()),
+                                 cl_uint(mat.maxnnz()),
+                                 cl_uint(mat.internal_maxnnz())
+                                )
+        );
+      }
+
+      /** @brief Carries out Sparse Matrix(ELL)-Dense Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+      * sp_mat being in ELL format
+      *
+      * @param sp_mat     The sparse matrix (ELL)
+      * @param d_mat      The dense matrix
+      * @param result     The result matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_base<NumericT, F1> & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // Make sure the ELL kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
+        viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::init(ctx);
+        // Kernel name encodes (transposed B?, row-major B?, row-major result?); 'false' = B is not transposed here.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        // NOTE(review): no explicit work sizes are set (the block below is commented out), so the
+        // kernel runs with whatever sizes 'k' currently carries — presumably the defaults; confirm.
+        //unsigned int thread_num = 128;
+        //unsigned int group_num = 256;
+        //
+        //k.local_work_size(0, thread_num);
+        //k.global_work_size(0, thread_num * group_num);
+
+        // Arguments: ELL column-index and value buffers, the logical/padded ELL dimensions,
+        // then B and the result, each with full start/stride/size/internal-size layout.
+        viennacl::ocl::enqueue(k(sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
+                                 cl_uint(sp_mat.size1()),
+                                 cl_uint(sp_mat.size2()),
+                                 cl_uint(sp_mat.internal_size1()),
+                                 cl_uint(sp_mat.maxnnz()),
+                                 cl_uint(sp_mat.internal_maxnnz()),
+                                 viennacl::traits::opencl_handle(d_mat),
+                                 cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
+                                 cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
+                                 cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
+                                )
+                              );
+      }
+
+      /** @brief Carries out Sparse Matrix(ELL)-Dense Transposed Matrix multiplication
+      *
+      * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+      * sp_mat being in ELL format
+      *
+      * @param sp_mat     The sparse matrix (ELL)
+      * @param d_mat      The dense transposed matrix
+      * @param result     The result matrix
+      */
+      template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
+      void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
+                     const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                        const viennacl::matrix_base<NumericT, F1>,
+                                                        viennacl::op_trans > & d_mat,
+                           viennacl::matrix_base<NumericT, F2> & result) {
+
+        // Make sure the ELL kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
+        viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::init(ctx);
+        // 'true' selects the kernel variant that reads B in transposed fashion.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        // NOTE(review): work sizes deliberately left at the kernel's current values (block commented out).
+        //unsigned int thread_num = 128;
+        //unsigned int group_num = 256;
+        //
+        //k.local_work_size(0, thread_num);
+        //k.global_work_size(0, thread_num * group_num);
+
+        // The underlying (non-transposed) matrix B = d_mat.lhs() is passed together with its layout;
+        // the kernel performs the transposed access itself.
+        viennacl::ocl::enqueue(k(sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
+                                 cl_uint(sp_mat.size1()),
+                                 cl_uint(sp_mat.size2()),
+                                 cl_uint(sp_mat.internal_size1()),
+                                 cl_uint(sp_mat.maxnnz()),
+                                 cl_uint(sp_mat.internal_maxnnz()),
+                                 viennacl::traits::opencl_handle(d_mat.lhs()),
+                                 cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
+                                )
+                              );
+      }
+
+      //
+      // Hybrid Matrix
+      //
+
+      /** @brief Carries out matrix-vector multiplication with a hyb_matrix
+      *
+      * Implementation of the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The sparse matrix (HYB format)
+      * @param vec    The vector
+      * @param result The result vector
+      */
+      template<class TYPE, unsigned int ALIGNMENT>
+      void prod_impl( const viennacl::hyb_matrix<TYPE, ALIGNMENT>& mat,
+                      const viennacl::vector_base<TYPE>& vec,
+                      viennacl::vector_base<TYPE>& result)
+      {
+        assert(mat.size1() == result.size());
+        assert(mat.size2() == vec.size());
+
+        // Make sure the HYB kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::hyb_matrix<TYPE>::init(ctx);
+
+        // NOTE(review): unlike the ELL/COO variants, 'result' is not cleared here — presumably the
+        // HYB kernel overwrites every entry; confirm against the kernel source.
+
+        // Pack start/stride/size/internal_size so the kernel can address sub-vectors/slices.
+        viennacl::ocl::packed_cl_uint layout_vec;
+        layout_vec.start  = cl_uint(viennacl::traits::start(vec));
+        layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
+        layout_vec.size   = cl_uint(viennacl::traits::size(vec));
+        layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+        viennacl::ocl::packed_cl_uint layout_result;
+        layout_result.start  = cl_uint(viennacl::traits::start(result));
+        layout_result.stride = cl_uint(viennacl::traits::stride(result));
+        layout_result.size   = cl_uint(viennacl::traits::size(result));
+        layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
+
+        viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<TYPE>::program_name(), "vec_mul");
+
+        // Hard-coded launch configuration: 32 groups of 256 work items.
+        unsigned int thread_num = 256;
+        unsigned int group_num = 32;
+
+        k.local_work_size(0, thread_num);
+        k.global_work_size(0, thread_num * group_num);
+
+        // The five buffers (handle2..handle5 plus the value handle) hold the two sub-formats of the
+        // hybrid storage; the trailing cl_uints describe the ELL part (ell_nnz / internal_ellnnz).
+        viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
+                                 mat.handle().opencl_handle(),
+                                 mat.handle3().opencl_handle(),
+                                 mat.handle4().opencl_handle(),
+                                 mat.handle5().opencl_handle(),
+                                 viennacl::traits::opencl_handle(vec),
+                                 layout_vec,
+                                 viennacl::traits::opencl_handle(result),
+                                 layout_result,
+                                 cl_uint(mat.size1()),
+                                 cl_uint(mat.internal_size1()),
+                                 cl_uint(mat.ell_nnz()),
+                                 cl_uint(mat.internal_ellnnz())
+                                )
+        );
+      }
+
+      /** @brief Carries out Sparse Matrix(HYB)-Dense Matrix multiplication: result = prod(mat, d_mat);
+      *
+      * @param mat    The sparse matrix (HYB format)
+      * @param d_mat  The dense matrix
+      * @param result The result matrix
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl( const viennacl::hyb_matrix<NumericT, ALIGNMENT>& mat,
+                      const viennacl::matrix_base<NumericT, F1> & d_mat,
+                            viennacl::matrix_base<NumericT, F2> & result)
+      {
+        // Make sure the HYB kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+        // Kernel name encodes (transposed B?, row-major B?, row-major result?); 'false' = B is not transposed here.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        // Hard-coded launch configuration: 32 groups of 256 work items.
+        unsigned int thread_num = 256;
+        unsigned int group_num = 32;
+
+        k.local_work_size(0, thread_num);
+        k.global_work_size(0, thread_num * group_num);
+
+        // Arguments: the five HYB buffers, the ELL-part dimensions, then B and the result,
+        // each with full start/stride/size/internal-size layout.
+        viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
+                                 mat.handle().opencl_handle(),
+                                 mat.handle3().opencl_handle(),
+                                 mat.handle4().opencl_handle(),
+                                 mat.handle5().opencl_handle(),
+                                 cl_uint(mat.size1()),
+                                 cl_uint(mat.internal_size1()),
+                                 cl_uint(mat.ell_nnz()),
+                                 cl_uint(mat.internal_ellnnz()),
+                                 viennacl::traits::opencl_handle(d_mat),
+                                 cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
+                                 cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
+                                 cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
+                                )
+        );
+      }
+
+      /** @brief Carries out Sparse Matrix(HYB)-Dense Transposed Matrix multiplication: result = prod(mat, trans(d_mat));
+      *
+      * @param mat    The sparse matrix (HYB format)
+      * @param d_mat  The dense transposed matrix proxy trans(B)
+      * @param result The result matrix
+      */
+      template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
+      void prod_impl( const viennacl::hyb_matrix<NumericT, ALIGNMENT>& mat,
+                      const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                                         const viennacl::matrix_base<NumericT, F1>,
+                                                         viennacl::op_trans > & d_mat,
+                            viennacl::matrix_base<NumericT, F2> & result)
+      {
+        // Make sure the HYB kernel collection is compiled for the OpenCL context the matrix lives in.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+        // 'true' selects the kernel variant that reads B in transposed fashion.
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
+                                                   detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
+
+        // Hard-coded launch configuration: 32 groups of 256 work items.
+        unsigned int thread_num = 256;
+        unsigned int group_num = 32;
+
+        k.local_work_size(0, thread_num);
+        k.global_work_size(0, thread_num * group_num);
+
+        // The underlying (non-transposed) matrix B = d_mat.lhs() is passed together with its layout;
+        // the kernel performs the transposed access itself.
+        viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
+                                 mat.handle().opencl_handle(),
+                                 mat.handle3().opencl_handle(),
+                                 mat.handle4().opencl_handle(),
+                                 mat.handle5().opencl_handle(),
+                                 cl_uint(mat.size1()),
+                                 cl_uint(mat.internal_size1()),
+                                 cl_uint(mat.ell_nnz()),
+                                 cl_uint(mat.internal_ellnnz()),
+                                 viennacl::traits::opencl_handle(d_mat.lhs()),
+                                 cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
+                                 cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
+                                 cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
+                                 cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                 cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
+                                )
+        );
+      }
+
+
+    } // namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp b/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
new file mode 100644
index 0000000..9925d5a
--- /dev/null
+++ b/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
@@ -0,0 +1,68 @@
+#ifndef VIENNACL_LINALG_OPENCL_VANDERMONDE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_VANDERMONDE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
+    @brief Implementations of operations using vandermonde_matrix
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+//#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+
+      /** @brief Matrix-vector product with a vandermonde_matrix
+      *
+      * Realizes the convenience expression result = prod(mat, vec);
+      *
+      * @param mat    The Vandermonde matrix
+      * @param vec    The input vector
+      * @param result The result vector
+      */
+      template<class SCALARTYPE, unsigned int ALIGNMENT>
+      void prod_impl(const viennacl::vandermonde_matrix<SCALARTYPE, ALIGNMENT> & mat,
+                     const viennacl::vector_base<SCALARTYPE> & vec,
+                           viennacl::vector_base<SCALARTYPE> & result)
+      {
+        // The "vandermonde_prod" kernel ships with the FFT kernel collection; compile it for
+        // the context the matrix lives in before fetching it.
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+        viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::init(ctx);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<SCALARTYPE>::program_name(), "vandermonde_prod");
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                                 viennacl::traits::opencl_handle(vec),
+                                 viennacl::traits::opencl_handle(result),
+                                 cl_uint(mat.size1())));
+      }
+
+    } //namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/opencl/vector_operations.hpp b/viennacl/linalg/opencl/vector_operations.hpp
new file mode 100644
index 0000000..de491b3
--- /dev/null
+++ b/viennacl/linalg/opencl/vector_operations.hpp
@@ -0,0 +1,975 @@
+#ifndef VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/vector_operations.hpp
+    @brief Implementations of vector operations using OpenCL
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/vector.hpp"
+#include "viennacl/linalg/opencl/kernels/vector_element.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace opencl
+    {
+      //
+      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+      //
+
+
+      /** @brief OpenCL implementation of vec1 = alpha * vec2 (scaling with a host or device scalar).
+      *
+      * @param vec1             The result vector (or -range, or -slice)
+      * @param vec2             The vector to be scaled
+      * @param alpha            The scaling factor (host scalar or viennacl::scalar)
+      * @param len_alpha        Scalar length, encoded into the kernel option bitfield via detail::make_options()
+      * @param reciprocal_alpha If true, 1/alpha is used instead of alpha (handled inside the kernel)
+      * @param flip_sign_alpha  If true, -alpha is used instead of alpha (handled inside the kernel)
+      */
+      template <typename T, typename ScalarType1>
+      void av(vector_base<T> & vec1,
+              vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);  // compile the vector kernel program on first use in this context
+
+        // Pack length/reciprocal/sign-flip flags for alpha into a single cl_uint kernel argument:
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+        // Pick the kernel variant depending on whether alpha lives on the host (cpu) or device (gpu):
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(),
+                                                   (viennacl::is_cpu_scalar<ScalarType1>::value ? "av_cpu" : "av_gpu"));
+        // Cap the launch at 128 work groups, rounded so the global size is a multiple of the local size:
+        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                                    viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+        // Layout descriptors (start/stride/size/internal_size) passed as one packed struct per vector:
+        viennacl::ocl::packed_cl_uint size_vec1;
+        size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+        size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+        size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+        size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+        viennacl::ocl::packed_cl_uint size_vec2;
+        size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+        size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+        size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+        size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+
+        // promote_if_host_scalar wraps a plain host value so it can be passed as a kernel argument of type T:
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 size_vec1,
+
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+                                 options_alpha,
+                                 viennacl::traits::opencl_handle(vec2),
+                                 size_vec2 )
+                              );
+      }
+
+
+      /** @brief OpenCL implementation of vec1 = alpha * vec2 + beta * vec3.
+      *
+      * @param vec1             The result vector
+      * @param vec2             First operand vector
+      * @param alpha            Scaling factor for vec2 (host scalar or viennacl::scalar)
+      * @param len_alpha        Scalar length for alpha, encoded via detail::make_options()
+      * @param reciprocal_alpha If true, 1/alpha is used (handled inside the kernel)
+      * @param flip_sign_alpha  If true, -alpha is used (handled inside the kernel)
+      * @param vec3             Second operand vector
+      * @param beta             Scaling factor for vec3
+      * @param len_beta         Scalar length for beta
+      * @param reciprocal_beta  If true, 1/beta is used
+      * @param flip_sign_beta   If true, -beta is used
+      */
+      template <typename T, typename ScalarType1, typename ScalarType2>
+      void avbv(vector_base<T> & vec1,
+                vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);  // compile vector kernels on first use
+
+        // One kernel per (alpha, beta) host/device combination:
+        std::string kernel_name;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "avbv_cpu_cpu";
+        else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "avbv_cpu_gpu";
+        else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "avbv_gpu_cpu";
+        else
+          kernel_name = "avbv_gpu_gpu";
+
+        // Pack per-scalar flags (length, reciprocal, sign flip) into cl_uint bitfields:
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
+        // Cap the launch at 128 work groups while keeping the global size a multiple of the local size:
+        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                                    viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+        // Layout descriptor (start/stride/size/internal_size) per vector argument:
+        viennacl::ocl::packed_cl_uint size_vec1;
+        size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+        size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+        size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+        size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+        viennacl::ocl::packed_cl_uint size_vec2;
+        size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+        size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+        size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+        size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+        viennacl::ocl::packed_cl_uint size_vec3;
+        size_vec3.start  = cl_uint(viennacl::traits::start(vec3));
+        size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
+        size_vec3.size   = cl_uint(viennacl::traits::size(vec3));
+        size_vec3.internal_size   = cl_uint(viennacl::traits::internal_size(vec3));
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 size_vec1,
+
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+                                 options_alpha,
+                                 viennacl::traits::opencl_handle(vec2),
+                                 size_vec2,
+
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
+                                 options_beta,
+                                 viennacl::traits::opencl_handle(vec3),
+                                 size_vec3 )
+                              );
+      }
+
+
+      /** @brief OpenCL implementation of the in-place variant of avbv.
+      *
+      * NOTE(review): by ViennaCL kernel naming convention the "avbv_v" kernels compute
+      * vec1 += alpha * vec2 + beta * vec3 (vec1 participates in the update) — confirm
+      * against the generated kernel source in kernels/vector.hpp.
+      *
+      * Parameters are identical to avbv(); the scalar flag parameters are packed into
+      * cl_uint bitfields via detail::make_options().
+      */
+      template <typename T, typename ScalarType1, typename ScalarType2>
+      void avbv_v(vector_base<T> & vec1,
+                  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);  // compile vector kernels on first use
+
+        // One kernel per (alpha, beta) host/device combination:
+        std::string kernel_name;
+        if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "avbv_v_cpu_cpu";
+        else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "avbv_v_cpu_gpu";
+        else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+          kernel_name = "avbv_v_gpu_cpu";
+        else
+          kernel_name = "avbv_v_gpu_gpu";
+
+        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
+        // Cap the launch at 128 work groups while keeping the global size a multiple of the local size:
+        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                                    viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+        // Layout descriptor (start/stride/size/internal_size) per vector argument:
+        viennacl::ocl::packed_cl_uint size_vec1;
+        size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+        size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+        size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+        size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+        viennacl::ocl::packed_cl_uint size_vec2;
+        size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+        size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+        size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+        size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+        viennacl::ocl::packed_cl_uint size_vec3;
+        size_vec3.start  = cl_uint(viennacl::traits::start(vec3));
+        size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
+        size_vec3.size   = cl_uint(viennacl::traits::size(vec3));
+        size_vec3.internal_size   = cl_uint(viennacl::traits::internal_size(vec3));
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 size_vec1,
+
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+                                 options_alpha,
+                                 viennacl::traits::opencl_handle(vec2),
+                                 size_vec2,
+
+                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
+                                 options_beta,
+                                 viennacl::traits::opencl_handle(vec3),
+                                 size_vec3 )
+                              );
+      }
+
+
+      /** @brief Assign a constant value to a vector (-range/-slice)
+      *
+      * @param vec1   The vector to which the value should be assigned
+      * @param alpha  The value to be assigned
+      * @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+      */
+      template <typename T>
+      void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);  // compile vector kernels on first use
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "assign_cpu");
+        // Cap the launch at 128 work groups while keeping the global size a multiple of the local size:
+        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+                                                    viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+        // Optionally also fill the padding region (internal_size > size), e.g. to clear the whole buffer:
+        cl_uint size = up_to_internal_size ? cl_uint(vec1.internal_size()) : cl_uint(viennacl::traits::size(vec1));
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 cl_uint(viennacl::traits::start(vec1)),
+                                 cl_uint(viennacl::traits::stride(vec1)),
+                                 size,
+                                 cl_uint(vec1.internal_size()),     //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+                                 viennacl::traits::opencl_handle(T(alpha)) )   // alpha converted to T and passed by value as a host scalar
+                              );
+      }
+
+
+      /** @brief Swaps the contents of two vectors, data is copied
+      *
+      * @param vec1   The first vector (or -range, or -slice)
+      * @param vec2   The second vector (or -range, or -slice)
+      */
+      template <typename T>
+      void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);  // compile vector kernels on first use
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "swap");
+
+        // Note: no explicit global_work_size() here — the kernel's default launch configuration is used.
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 cl_uint(viennacl::traits::start(vec1)),
+                                 cl_uint(viennacl::traits::stride(vec1)),
+                                 cl_uint(viennacl::traits::size(vec1)),
+                                 viennacl::traits::opencl_handle(vec2),
+                                 cl_uint(viennacl::traits::start(vec2)),
+                                 cl_uint(viennacl::traits::stride(vec2)),
+                                 cl_uint(viennacl::traits::size(vec2)))
+                              );
+      }
+
+      ///////////////////////// Binary Elementwise operations /////////////
+
+      /** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2, v3 and the operation
+      */
+      template <typename T, typename OP>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);  // compile element-wise kernels on first use
+
+        // A single kernel handles all binary element-wise ops; the op is selected at runtime:
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), "element_op");
+
+        cl_uint op_type = 2; //0: product, 1: division, 2: power
+        if (viennacl::is_division<OP>::value)
+          op_type = 1;
+        else if (viennacl::is_product<OP>::value)
+          op_type = 0;
+        // (anything that is neither product nor division falls through as power, op_type == 2)
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 cl_uint(viennacl::traits::start(vec1)),
+                                 cl_uint(viennacl::traits::stride(vec1)),
+                                 cl_uint(viennacl::traits::size(vec1)),
+
+                                 viennacl::traits::opencl_handle(proxy.lhs()),
+                                 cl_uint(viennacl::traits::start(proxy.lhs())),
+                                 cl_uint(viennacl::traits::stride(proxy.lhs())),
+
+                                 viennacl::traits::opencl_handle(proxy.rhs()),
+                                 cl_uint(viennacl::traits::start(proxy.rhs())),
+                                 cl_uint(viennacl::traits::stride(proxy.rhs())),
+
+                                 op_type)
+                              );
+      }
+
+      ///////////////////////// Unary Elementwise operations /////////////
+
+      /** @brief Implementation of unary element-wise operations v1 = OP(v2)
+      *
+      * @param vec1   The result vector (or -range, or -slice)
+      * @param proxy  The proxy object holding v2 and the operation
+      */
+      template <typename T, typename OP>
+      void element_op(vector_base<T> & vec1,
+                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+      {
+        // NOTE(review): for a unary expression, lhs and rhs of the proxy presumably refer to the same operand — confirm in forwards.h.
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);  // compile element-wise kernels on first use
+
+        // One dedicated kernel per unary op; op_to_string(OP()) yields the op name, e.g. "sqrt" -> "sqrt_assign":
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), detail::op_to_string(OP()) + "_assign");
+
+        // Layout descriptor (start/stride/size/internal_size) for destination and source:
+        viennacl::ocl::packed_cl_uint size_vec1;
+        size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+        size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+        size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+        size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+        viennacl::ocl::packed_cl_uint size_vec2;
+        size_vec2.start  = cl_uint(viennacl::traits::start(proxy.lhs()));
+        size_vec2.stride = cl_uint(viennacl::traits::stride(proxy.lhs()));
+        size_vec2.size   = cl_uint(viennacl::traits::size(proxy.lhs()));
+        size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(proxy.lhs()));
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 size_vec1,
+                                 viennacl::traits::opencl_handle(proxy.lhs()),
+                                 size_vec2)
+                              );
+      }
+
+      ///////////////////////// Norms and inner product ///////////////////
+
+      /** @brief Computes the partial inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param partial_result The results of each group
+      */
+      template <typename T>
+      void inner_prod_impl(vector_base<T> const & vec1,
+                           vector_base<T> const & vec2,
+                           vector_base<T> & partial_result)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);  // compile vector kernels on first use
+
+        assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
+              && bool("Incompatible vector sizes in inner_prod_impl()!"));
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
+
+        // Each work group writes one partial sum; partial_result must be large enough to hold them all:
+        assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in inner_prod_impl()") );
+
+        // Layout descriptors (start/stride/size/internal_size) for both operands:
+        viennacl::ocl::packed_cl_uint size_vec1;
+        size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
+        size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+        size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
+        size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
+
+        viennacl::ocl::packed_cl_uint size_vec2;
+        size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
+        size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+        size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
+        size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 size_vec1,
+                                 viennacl::traits::opencl_handle(vec2),
+                                 size_vec2,
+                                 viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),  // scratch: one entry per work item for the in-group reduction
+                                 viennacl::traits::opencl_handle(partial_result)
+                                )
+                              );
+      }
+
+
+      //implementation of inner product:
+      //namespace {
+      /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the gpu)
+      */
+      template <typename T>
+      void inner_prod_impl(vector_base<T> const & vec1,
+                           vector_base<T> const & vec2,
+                           scalar<T> & result)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+
+        // Fixed number of work groups; must match the single-group launch of the "sum" kernel below.
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
+        temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+        // Step 1: Compute partial inner products for each work group:
+        // (this nested call also performs kernels::vector<T>::init(ctx), so the program exists before get_kernel("sum") below)
+        inner_prod_impl(vec1, vec2, temp);
+
+        // Step 2: Sum partial results:
+        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+        // One work group of 'work_groups' items reduces all partials in a single pass:
+        ksum.local_work_size(0, work_groups);
+        ksum.global_work_size(0, work_groups);
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(viennacl::traits::start(temp)),
+                                    cl_uint(viennacl::traits::stride(temp)),
+                                    cl_uint(viennacl::traits::size(temp)),
+                                    cl_uint(1),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                                    viennacl::traits::opencl_handle(result) )
+                              );
+      }
+
+      namespace detail
+      {
+        /** @brief Packs start/stride/size/internal_size of a vector into a single packed_cl_uint, so the layout can be passed as one kernel argument. */
+        template <typename ScalarT>
+        viennacl::ocl::packed_cl_uint make_layout(vector_base<ScalarT> const & vec)
+        {
+          viennacl::ocl::packed_cl_uint ret;
+          ret.start           = cl_uint(viennacl::traits::start(vec));
+          ret.stride          = cl_uint(viennacl::traits::stride(vec));
+          ret.size            = cl_uint(viennacl::traits::size(vec));
+          ret.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+          return ret;
+        }
+      }
+
+      /** @brief Computes multiple inner products where one argument is common to all inner products. <x, y1>, <x, y2>, ..., <x, yN>
+      *
+      * @param x          The common vector
+      * @param vec_tuple  The tuple of vectors y1, y2, ..., yN
+      * @param result     The result vector
+      */
+      template <typename T>
+      void inner_prod_impl(vector_base<T> const & x,
+                           vector_tuple<T> const & vec_tuple,
+                           vector_base<T> & result)
+      {
+        assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+        viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::init(ctx);
+
+        vcl_size_t work_groups = 128;
+
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(x));
+        temp.resize(8 * work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+        viennacl::ocl::packed_cl_uint layout_x = detail::make_layout(x);
+
+        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "sum_inner_prod");
+        viennacl::ocl::kernel & inner_prod_kernel_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
+        viennacl::ocl::kernel & inner_prod_kernel_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod2");
+        viennacl::ocl::kernel & inner_prod_kernel_3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod3");
+        viennacl::ocl::kernel & inner_prod_kernel_4 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod4");
+        viennacl::ocl::kernel & inner_prod_kernel_8 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod8");
+
+        vcl_size_t current_index = 0;
+        while (current_index < vec_tuple.const_size())
+        {
+          switch (vec_tuple.const_size() - current_index)
+          {
+            case 7:
+            case 6:
+            case 5:
+            case 4:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
+              vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
+              viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x,
+                                                         viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                         viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                         viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+                                                         viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
+                                                         viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 4 * inner_prod_kernel_4.local_work_size()),
+                                                         viennacl::traits::opencl_handle(temp)
+                                                        ) );
+
+              ksum.local_work_size(0, work_groups);
+              ksum.global_work_size(0, 4 * work_groups);
+              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 4 * ksum.local_work_size()),
+                                          viennacl::traits::opencl_handle(result),
+                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                          cl_uint(viennacl::traits::stride(result))
+                                          )
+                                    );
+            }
+              current_index += 4;
+              break;
+
+            case 3:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
+              viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x,
+                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                          viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 3 * inner_prod_kernel_3.local_work_size()),
+                                                          viennacl::traits::opencl_handle(temp)
+                                                         ) );
+
+              ksum.local_work_size(0, work_groups);
+              ksum.global_work_size(0, 3 * work_groups);
+              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 3 * ksum.local_work_size()),
+                                          viennacl::traits::opencl_handle(result),
+                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                          cl_uint(viennacl::traits::stride(result))
+                                          )
+                                    );
+            }
+              current_index += 3;
+              break;
+
+            case 2:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x,
+                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 2 * inner_prod_kernel_2.local_work_size()),
+                                                          viennacl::traits::opencl_handle(temp)
+                                                        ) );
+
+              ksum.local_work_size(0, work_groups);
+              ksum.global_work_size(0, 2 * work_groups);
+              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 2 * ksum.local_work_size()),
+                                          viennacl::traits::opencl_handle(result),
+                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                          cl_uint(viennacl::traits::stride(result))
+                                          )
+                                    );
+            }
+              current_index += 2;
+              break;
+
+            case 1:
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
+              viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x,
+                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 1 * inner_prod_kernel_1.local_work_size()),
+                                                          viennacl::traits::opencl_handle(temp)
+                                                        ) );
+
+              ksum.local_work_size(0, work_groups);
+              ksum.global_work_size(0, 1 * work_groups);
+              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 1 * ksum.local_work_size()),
+                                          viennacl::traits::opencl_handle(result),
+                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                          cl_uint(viennacl::traits::stride(result))
+                                          )
+                                    );
+            }
+              current_index += 1;
+              break;
+
+            default: //8 or more vectors
+            {
+              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
+              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
+              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
+              vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
+              vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
+              vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
+              vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
+              vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
+              viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x,
+                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+                                                          viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+                                                          viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
+                                                          viennacl::traits::opencl_handle(y4), detail::make_layout(y4),
+                                                          viennacl::traits::opencl_handle(y5), detail::make_layout(y5),
+                                                          viennacl::traits::opencl_handle(y6), detail::make_layout(y6),
+                                                          viennacl::traits::opencl_handle(y7), detail::make_layout(y7),
+                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 8 * inner_prod_kernel_8.local_work_size()),
+                                                          viennacl::traits::opencl_handle(temp)
+                                                        ) );
+
+              ksum.local_work_size(0, work_groups);
+              ksum.global_work_size(0, 8 * work_groups);
+              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 8 * ksum.local_work_size()),
+                                          viennacl::traits::opencl_handle(result),
+                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+                                          cl_uint(viennacl::traits::stride(result))
+                                          )
+                                    );
+            }
+              current_index += 8;
+              break;
+          }
+        }
+
+      }
+
+
+      //implementation of inner product:
+      //namespace {
+      /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+      *
+      * @param vec1 The first vector
+      * @param vec2 The second vector
+      * @param result The result scalar (on the gpu)
+      */
+      template <typename T>
+      void inner_prod_cpu(vector_base<T> const & vec1,
+                          vector_base<T> const & vec2,
+                          T & result)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
+        temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+        // Step 1: Compute partial inner products for each work group:
+        inner_prod_impl(vec1, vec2, temp);
+
+        // Step 2: Sum partial results:
+
+        // Now copy partial results from GPU back to CPU and run reduction there:
+        std::vector<T> temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename std::vector<T>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result += *it;
+      }
+
+
+      //////////// Helper for norms
+
+      /** @brief Computes the partial work group results for vector norms
+      *
+      * @param vec The vector
+      * @param partial_result The result scalar
+      * @param norm_id        Norm selector. 0: norm_inf, 1: norm_1, 2: norm_2
+      */
+      template <typename T>
+      void norm_reduction_impl(vector_base<T> const & vec,
+                               vector_base<T> & partial_result,
+                                cl_uint norm_id)
+      {
+        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "norm");
+
+        assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in norm_reduction_impl()") );
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
+                                 cl_uint(viennacl::traits::start(vec)),
+                                 cl_uint(viennacl::traits::stride(vec)),
+                                 cl_uint(viennacl::traits::size(vec)),
+                                 cl_uint(norm_id),
+                                 viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+                                 viennacl::traits::opencl_handle(partial_result) )
+                              );
+      }
+
+
+      //////////// Norm 1
+
+      /** @brief Computes the l^1-norm of a vector
+      *
+      * @param vec The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_1_impl(vector_base<T> const & vec,
+                       scalar<T> & result)
+      {
+        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+        // Step 1: Compute the partial work group results
+        norm_reduction_impl(vec, temp, 1);
+
+        // Step 2: Compute the partial reduction using OpenCL
+        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+        ksum.local_work_size(0, work_groups);
+        ksum.global_work_size(0, work_groups);
+        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+                                    cl_uint(viennacl::traits::start(temp)),
+                                    cl_uint(viennacl::traits::stride(temp)),
+                                    cl_uint(viennacl::traits::size(temp)),
+                                    cl_uint(1),
+                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                                    result)
+                              );
+      }
+
+      /** @brief Computes the l^1-norm of a vector with final reduction on CPU
+      *
+      * @param vec The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_1_cpu(vector_base<T> const & vec,
+                      T & result)
+      {
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+        // Step 1: Compute the partial work group results
+        norm_reduction_impl(vec, temp, 1);
+
+        // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+        typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
+
+        CPUVectorType temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result += static_cast<T>(*it);
+      }
+
+
+
+      //////// Norm 2
+
+
+      /** @brief Computes the l^2-norm of a vector - implementation using OpenCL summation at second step
+      *
+      * @param vec The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_2_impl(vector_base<T> const & vec,
+                       scalar<T> & result)
+      {
+        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+        // Step 1: Compute the partial work group results
+        norm_reduction_impl(vec, temp, 2);
+
+        // Step 2: Reduction via OpenCL
+        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+        ksum.local_work_size(0, work_groups);
+        ksum.global_work_size(0, work_groups);
+        viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
+                                      cl_uint(viennacl::traits::start(temp)),
+                                      cl_uint(viennacl::traits::stride(temp)),
+                                      cl_uint(viennacl::traits::size(temp)),
+                                      cl_uint(2),
+                                      viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                                      result)
+                              );
+      }
+
+      /** @brief Computes the l^2-norm of a vector with final reduction on CPU
+      *
+      * @param vec The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_2_cpu(vector_base<T> const & vec,
+                      T & result)
+      {
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+        // Step 1: Compute the partial work group results
+        norm_reduction_impl(vec, temp, 2);
+
+        // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+        typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
+
+        CPUVectorType temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result += static_cast<T>(*it);
+        result = std::sqrt(result);
+      }
+
+
+
+      ////////// Norm inf
+
+      /** @brief Computes the supremum-norm of a vector
+      *
+      * @param vec The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_inf_impl(vector_base<T> const & vec,
+                         scalar<T> & result)
+      {
+        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+        // Step 1: Compute the partial work group results
+        norm_reduction_impl(vec, temp, 0);
+
+        //part 2: parallel reduction of reduced kernel:
+        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+        ksum.local_work_size(0, work_groups);
+        ksum.global_work_size(0, work_groups);
+
+        viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
+                                     cl_uint(viennacl::traits::start(temp)),
+                                     cl_uint(viennacl::traits::stride(temp)),
+                                     cl_uint(viennacl::traits::size(temp)),
+                                     cl_uint(0),
+                                     viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+                                     result)
+                              );
+      }
+
+      /** @brief Computes the supremum-norm of a vector
+      *
+      * @param vec The vector
+      * @param result The result scalar
+      */
+      template <typename T>
+      void norm_inf_cpu(vector_base<T> const & vec,
+                        T & result)
+      {
+        vcl_size_t work_groups = 128;
+        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+        // Step 1: Compute the partial work group results
+        norm_reduction_impl(vec, temp, 0);
+
+        // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+        typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
+
+        CPUVectorType temp_cpu(work_groups);
+        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+        result = 0;
+        for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+          result = std::max(result, static_cast<T>(*it));
+      }
+
+
+      /////////// index norm_inf
+
+      //This function should return a CPU scalar, otherwise statements like
+      // vcl_rhs[index_norm_inf(vcl_rhs)]
+      // are ambiguous
+      /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+      *
+      * @param vec The vector
+      * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+      */
+      template <typename T>
+      cl_uint index_norm_inf(vector_base<T> const & vec)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+        viennacl::ocl::handle<cl_mem> h = ctx.create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
+
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "index_norm_inf");
+        //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
+
+        //TODO: Use multi-group kernel for large vector sizes
+
+        k.global_work_size(0, k.local_work_size());
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
+                                 cl_uint(viennacl::traits::start(vec)),
+                                 cl_uint(viennacl::traits::stride(vec)),
+                                 cl_uint(viennacl::traits::size(vec)),
+                                 viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+                                 viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
+
+        //read value:
+        cl_uint result;
+        cl_int err = clEnqueueReadBuffer(ctx.get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+        return result;
+      }
+
+      //TODO: Special case vec1 == vec2 allows improvement!!
+      /** @brief Computes a plane rotation of two vectors.
+      *
+      * Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+      *
+      * @param vec1   The first vector
+      * @param vec2   The second vector
+      * @param alpha  The first transformation coefficient
+      * @param beta   The second transformation coefficient
+      */
+      template <typename T>
+      void plane_rotation(vector_base<T> & vec1,
+                          vector_base<T> & vec2,
+                          T alpha, T beta)
+      {
+        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+        assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
+        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "plane_rotation");
+
+        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+                                 cl_uint(viennacl::traits::start(vec1)),
+                                 cl_uint(viennacl::traits::stride(vec1)),
+                                 cl_uint(viennacl::traits::size(vec1)),
+                                 viennacl::traits::opencl_handle(vec2),
+                                 cl_uint(viennacl::traits::start(vec2)),
+                                 cl_uint(viennacl::traits::stride(vec2)),
+                                 cl_uint(viennacl::traits::size(vec2)),
+                                 viennacl::traits::opencl_handle(alpha),
+                                 viennacl::traits::opencl_handle(beta))
+                              );
+      }
+
+    } //namespace opencl
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/power_iter.hpp b/viennacl/linalg/power_iter.hpp
new file mode 100644
index 0000000..75ee20d
--- /dev/null
+++ b/viennacl/linalg/power_iter.hpp
@@ -0,0 +1,118 @@
+#ifndef VIENNACL_LINALG_POWER_ITER_HPP_
+#define VIENNACL_LINALG_POWER_ITER_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/power_iter.hpp
+    @brief Defines a tag for the configuration of the power iteration method.
+
+    Contributed by Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    /** @brief A tag for the power iteration algorithm. */
+    class power_iter_tag
+    {
+      public:
+
+        /** @brief The constructor
+        *
+        * @param tfac      If the eigenvalue does not change more than this termination factor, the algorithm stops
+        * @param max_iters Maximum number of iterations for the power iteration
+        */
+        power_iter_tag(double tfac = 1e-8, vcl_size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
+
+        /** @brief Sets the factor for termination */
+        void factor(double fct){ termination_factor_ = fct; }
+
+          /** @brief Returns the factor for termination */
+        double factor() const { return termination_factor_; }
+
+        vcl_size_t max_iterations() const { return max_iterations_; }
+        void max_iterations(vcl_size_t new_max) { max_iterations_ = new_max; }
+
+      private:
+        double termination_factor_;
+        vcl_size_t max_iterations_;
+
+    };
+
+   /**
+    *   @brief Implementation of the calculation of eigenvalues using power iteration
+    *
+    *   @param matrix        The system matrix
+    *   @param tag           Tag with termination factor
+    *   @return              Returns the largest eigenvalue computed by the power iteration method
+    */
+    template< typename MatrixT >
+    typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+    eig(MatrixT const& matrix, power_iter_tag const & tag)
+    {
+
+      typedef typename viennacl::result_of::value_type<MatrixT>::type           ScalarType;
+      typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+      typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type    VectorT;
+
+      CPU_ScalarType eigenvalue;
+      vcl_size_t matrix_size = matrix.size1();
+      VectorT r(matrix_size);
+      VectorT r2(matrix_size);
+      std::vector<CPU_ScalarType> s(matrix_size);
+
+      for(vcl_size_t i=0; i<s.size(); ++i)
+        s[i] = (i % 3) * CPU_ScalarType(0.1234) - CPU_ScalarType(0.5);   //'random' starting vector
+
+      detail::copy_vec_to_vec(s,r);
+
+      //std::cout << s << std::endl;
+
+      double epsilon = tag.factor();
+      CPU_ScalarType norm = norm_2(r);
+      CPU_ScalarType norm_prev = 0;
+      long numiter = 0;
+
+      for (vcl_size_t i=0; i<tag.max_iterations(); ++i)
+      {
+        if (std::fabs(norm - norm_prev) / std::fabs(norm) < epsilon)
+          break;
+
+        r /= norm;
+        r2 = viennacl::linalg::prod(matrix, r);  //using helper vector r2 for the computation of r <- A * r in order to avoid the repeated creation of temporaries
+        r = r2;
+        norm_prev = norm;
+        norm = norm_2(r);
+        numiter++;
+      }
+
+      eigenvalue = norm;
+      return eigenvalue;
+    }
+
+
+  } // end namespace linalg
+} // end namespace viennacl
+#endif
diff --git a/viennacl/linalg/prod.hpp b/viennacl/linalg/prod.hpp
index 3e147f7..702fc8f 100644
--- a/viennacl/linalg/prod.hpp
+++ b/viennacl/linalg/prod.hpp
@@ -2,25 +2,26 @@
 #define VIENNACL_LINALG_PROD_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file prod.hpp
-    @brief Generic interface for matrix-vector and matrix-matrix products. 
-           See viennacl/linalg/vector_operations.hpp, viennacl/linalg/matrix_operations.hpp,
-           viennacl/linalg/compressed_matrix_operations.hpp and viennacl/linalg/coordinate_matrix_operations.hpp for implementations.
+/** @file viennacl/linalg/prod.hpp
+    @brief Generic interface for matrix-vector and matrix-matrix products.
+           See viennacl/linalg/vector_operations.hpp, viennacl/linalg/matrix_operations.hpp, and
+           viennacl/linalg/sparse_matrix_operations.hpp for implementations.
 */
 
 #include "viennacl/forwards.h"
@@ -35,49 +36,44 @@ namespace viennacl
   //
   // generic prod function
   //   uses tag dispatch to identify which algorithm
-  //   should be called 
+  //   should be called
   //
-  namespace linalg 
+  namespace linalg
   {
-    #ifdef VIENNACL_HAVE_MTL4
+    #ifdef VIENNACL_WITH_MTL4
     // ----------------------------------------------------
     // mtl4
     //
     template< typename MatrixT, typename VectorT >
-    VectorT 
-    prod(MatrixT const& matrix, VectorT const& vector, 
-         typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< MatrixT >::type >::value
-                                     >::type* dummy = 0)
+    typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+                                  VectorT>::type
+    prod(MatrixT const& matrix, VectorT const& vector)
     {
-      // std::cout << "mtl4 .. " << std::endl;
       return VectorT(matrix * vector);
     }
     #endif
-    
-    #ifdef VIENNACL_HAVE_EIGEN
+
+    #ifdef VIENNACL_WITH_EIGEN
     // ----------------------------------------------------
     // Eigen
     //
     template< typename MatrixT, typename VectorT >
-    VectorT 
-    prod(MatrixT const& matrix, VectorT const& vector, 
-         typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< MatrixT >::type >::value
-                                     >::type* dummy = 0)
+    typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+                                  VectorT>::type
+    prod(MatrixT const& matrix, VectorT const& vector)
     {
-      // std::cout << "ublas .. " << std::endl;
       return matrix * vector;
     }
     #endif
-    
-    #ifdef VIENNACL_HAVE_UBLAS
+
+    #ifdef VIENNACL_WITH_UBLAS
     // ----------------------------------------------------
     // UBLAS
     //
     template< typename MatrixT, typename VectorT >
-    VectorT 
-    prod(MatrixT const& matrix, VectorT const& vector, 
-         typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< MatrixT >::type >::value
-                                     >::type* dummy = 0)
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+                                  VectorT>::type
+    prod(MatrixT const& matrix, VectorT const& vector)
     {
       // std::cout << "ublas .. " << std::endl;
       return boost::numeric::ublas::prod(matrix, vector);
@@ -88,11 +84,11 @@ namespace viennacl
     // ----------------------------------------------------
     // STL type
     //
-    
+
     // dense matrix-vector product:
     template< typename T, typename A1, typename A2, typename VectorT >
-    VectorT 
-    prod_impl(std::vector< std::vector<T, A1>, A2 > const& matrix, VectorT const& vector)
+    VectorT
+    prod(std::vector< std::vector<T, A1>, A2 > const & matrix, VectorT const& vector)
     {
       VectorT result(matrix.size());
       for (typename std::vector<T, A1>::size_type i=0; i<matrix.size(); ++i)
@@ -103,17 +99,17 @@ namespace viennacl
       }
       return result;
     }
-    
+
     // sparse matrix-vector product:
     template< typename KEY, typename DATA, typename COMPARE, typename AMAP, typename AVEC, typename VectorT >
-    VectorT 
-    prod_impl(std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > const& matrix, VectorT const& vector)
+    VectorT
+    prod(std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > const& matrix, VectorT const& vector)
     {
       typedef std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > MatrixType;
-      
+
       VectorT result(matrix.size());
       for (typename MatrixType::size_type i=0; i<matrix.size(); ++i)
-      { 
+      {
         result[i] = 0; //we will not assume that VectorT is initialized to zero
         for (typename std::map<KEY, DATA, COMPARE, AMAP>::const_iterator row_entries = matrix[i].begin();
              row_entries != matrix[i].end();
@@ -122,102 +118,201 @@ namespace viennacl
       }
       return result;
     }
-    
-    
-    template< typename MatrixT, typename VectorT >
-    VectorT 
-    prod(MatrixT const& matrix, VectorT const& vector, 
+
+
+    /*template< typename MatrixT, typename VectorT >
+    VectorT
+    prod(MatrixT const& matrix, VectorT const& vector,
          typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< MatrixT >::type >::value
                                      >::type* dummy = 0)
     {
       // std::cout << "std .. " << std::endl;
       return prod_impl(matrix, vector);
-    }
+    }*/
 
     // ----------------------------------------------------
     // VIENNACL
     //
-    template< typename MatrixT1, typename MatrixT2 >
-    viennacl::matrix_expression< const MatrixT1, 
-                                 const viennacl::matrix_range<MatrixT2>,
-                                 viennacl::op_prod >
-    prod(MatrixT1 const& A,
-         viennacl::matrix_range<MatrixT2> const& B)
+
+    // standard product:
+    template< typename NumericT, typename F1, typename F2>
+    viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                 const viennacl::matrix_base<NumericT, F2>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_base<NumericT, F1> const & A,
+         viennacl::matrix_base<NumericT, F2> const & B)
     {
       // std::cout << "viennacl .. " << std::endl;
-      return viennacl::matrix_expression< const MatrixT1, 
-                                          const viennacl::matrix_range<MatrixT2>,
-                                          viennacl::op_prod >(A, B);
+      return viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                          const viennacl::matrix_base<NumericT, F2>,
+                                          viennacl::op_mat_mat_prod >(A, B);
     }
 
-
-    template< typename MatrixT1, typename MatrixT2 >
-    viennacl::matrix_expression< const MatrixT1, 
-                                 const viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
-                                                                   const viennacl::matrix_range<MatrixT2>,
+    // right factor is transposed:
+    template< typename NumericT, typename F1, typename F2>
+    viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                 const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                                                   const viennacl::matrix_base<NumericT, F2>,
                                                                    op_trans>,
-                                 viennacl::op_prod >
-    prod(MatrixT1 const & A,
-         viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
-                                     const viennacl::matrix_range<MatrixT2>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_base<NumericT, F1> const & A,
+         viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                     const viennacl::matrix_base<NumericT, F2>,
                                      op_trans> const & B)
     {
       // std::cout << "viennacl .. " << std::endl;
-      return viennacl::matrix_expression< const MatrixT1, 
-                                          const viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
-                                                                            const viennacl::matrix_range<MatrixT2>,
+      return viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
+                                          const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                                                            const viennacl::matrix_base<NumericT, F2>,
                                                                             op_trans>,
-                                          viennacl::op_prod >(A, B);
+                                          viennacl::op_mat_mat_prod >(A, B);
     }
 
+    // left factor transposed:
+    template< typename NumericT, typename F1, typename F2>
+    viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                                                   const viennacl::matrix_base<NumericT, F1>,
+                                                                   op_trans>,
+                                 const viennacl::matrix_base<NumericT, F2>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                     const viennacl::matrix_base<NumericT, F1>,
+                                     op_trans> const & A,
+         viennacl::matrix_base<NumericT, F2> const & B)
+    {
+      // std::cout << "viennacl .. " << std::endl;
+      return viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                                                            const viennacl::matrix_base<NumericT, F1>,
+                                                                            op_trans>,
+                                          const viennacl::matrix_base<NumericT, F2>,
+                                          viennacl::op_mat_mat_prod >(A, B);
+    }
 
 
+    // both factors transposed:
+    template< typename NumericT, typename F1, typename F2>
+    viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                                                   const viennacl::matrix_base<NumericT, F1>,
+                                                                   op_trans>,
+                                 const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                                                   const viennacl::matrix_base<NumericT, F2>,
+                                                                   op_trans>,
+                                 viennacl::op_mat_mat_prod >
+    prod(viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                     const viennacl::matrix_base<NumericT, F1>,
+                                     op_trans> const & A,
+         viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                     const viennacl::matrix_base<NumericT, F2>,
+                                     op_trans> const & B)
+    {
+      // std::cout << "viennacl .. " << std::endl;
+      return viennacl::matrix_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
+                                                                            const viennacl::matrix_base<NumericT, F1>,
+                                                                            op_trans>,
+                                          const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F2>,
+                                                                            const viennacl::matrix_base<NumericT, F2>,
+                                                                            op_trans>,
+                                          viennacl::op_mat_mat_prod >(A, B);
+    }
 
 
 
-
-    template< typename MatrixT, typename NumericT, unsigned int ALIGNMENT >
-    viennacl::vector_expression< const MatrixT, 
-                                 const viennacl::vector<NumericT, ALIGNMENT>,
+    // matrix-vector product
+    template< typename NumericT, typename F>
+    viennacl::vector_expression< const viennacl::matrix_base<NumericT, F>,
+                                 const viennacl::vector_base<NumericT>,
                                  viennacl::op_prod >
-    prod(MatrixT const& matrix,
-         viennacl::vector<NumericT, ALIGNMENT> const& vector)
+    prod(viennacl::matrix_base<NumericT, F> const & matrix,
+         viennacl::vector_base<NumericT> const & vector)
     {
       // std::cout << "viennacl .. " << std::endl;
-      return viennacl::linalg::prod_impl(matrix, vector);
+      return viennacl::vector_expression< const viennacl::matrix_base<NumericT, F>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(matrix, vector);
     }
 
-    template< typename MatrixT, typename NumericT, typename F, unsigned int ALIGNMENT >
-    viennacl::matrix_expression< const MatrixT, 
-                                 const viennacl::matrix<NumericT, F, ALIGNMENT>,
+    // transposed matrix-vector product
+    template< typename NumericT, typename F>
+    viennacl::vector_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
+                                                                   const viennacl::matrix_base<NumericT, F>,
+                                                                   op_trans>,
+                                 const viennacl::vector_base<NumericT>,
                                  viennacl::op_prod >
-    prod(MatrixT const& matrix_A,
-         viennacl::matrix<NumericT, F, ALIGNMENT> const& matrix_B)
+    prod(viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
+                                     const viennacl::matrix_base<NumericT, F>,
+                                     op_trans> const & matrix,
+         viennacl::vector_base<NumericT> const & vector)
     {
       // std::cout << "viennacl .. " << std::endl;
-      return viennacl::matrix_expression< const MatrixT, 
-                                          const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                          viennacl::op_prod >(matrix_A, matrix_B);
+      return viennacl::vector_expression< const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F>,
+                                                                            const viennacl::matrix_base<NumericT, F>,
+                                                                            op_trans>,
+                                          const viennacl::vector_base<NumericT>,
+                                          viennacl::op_prod >(matrix, vector);
     }
 
-    template< typename MatrixT, typename NumericT, typename F, unsigned int ALIGNMENT >
-    viennacl::matrix_expression< const MatrixT, 
-                                 const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
-                                                                    const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                                                    viennacl::op_trans >,
-                                 viennacl::op_prod >
-    prod(MatrixT const& matrix_A,
-         const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
-                                            const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                            viennacl::op_trans > & matrix_B)
+
+    template<typename SparseMatrixType, class SCALARTYPE>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  vector_expression<const SparseMatrixType,
+                                                    const vector_base<SCALARTYPE>,
+                                                    op_prod >
+                                 >::type
+    prod(const SparseMatrixType & mat,
+         const vector_base<SCALARTYPE> & vec)
     {
-      // std::cout << "viennacl .. " << std::endl;
-      return viennacl::matrix_expression< const MatrixT, 
-                                          const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
-                                                                             const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                                                             viennacl::op_trans >,
-                                          viennacl::op_prod >(matrix_A, matrix_B);
-      //return viennacl::linalg::prod_impl(matrix_A, matrix_B);
+      return vector_expression<const SparseMatrixType,
+                               const vector_base<SCALARTYPE>,
+                               op_prod >(mat, vec);
+    }
+
+    template< typename SparseMatrixType, typename SCALARTYPE, typename F1>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  viennacl::matrix_expression<const SparseMatrixType,
+                                                              const matrix_base < SCALARTYPE, F1 >,
+                                                              op_prod >
+                                 >::type
+    prod(const SparseMatrixType & sp_mat,
+         const viennacl::matrix_base<SCALARTYPE, F1> & d_mat)
+    {
+      return viennacl::matrix_expression<const SparseMatrixType,
+                                         const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                         op_prod >(sp_mat, d_mat);
+    }
+
+    // right factor is transposed
+    template< typename SparseMatrixType, typename SCALARTYPE, typename F1 >
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  viennacl::matrix_expression< const SparseMatrixType,
+                                                               const viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE, F1>,
+                                                                                                 const viennacl::matrix_base<SCALARTYPE, F1>,
+                                                                                                 op_trans>,
+                                                               viennacl::op_prod >
+                                  >::type
+    prod(const SparseMatrixType & A,
+         viennacl::matrix_expression<const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                     const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                     op_trans> const & B)
+    {
+      return viennacl::matrix_expression< const SparseMatrixType,
+                                          const viennacl::matrix_expression<const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                                                            const viennacl::matrix_base < SCALARTYPE, F1 >,
+                                                                            op_trans>,
+                                          viennacl::op_prod >(A, B);
+    }
+
+    template<typename StructuredMatrixType, class SCALARTYPE>
+    typename viennacl::enable_if< viennacl::is_any_dense_structured_matrix<StructuredMatrixType>::value,
+                                  vector_expression<const StructuredMatrixType,
+                                                    const vector_base<SCALARTYPE>,
+                                                    op_prod >
+                                 >::type
+    prod(const StructuredMatrixType & mat,
+         const vector_base<SCALARTYPE> & vec)
+    {
+      return vector_expression<const StructuredMatrixType,
+                               const vector_base<SCALARTYPE>,
+                               op_prod >(mat, vec);
     }
 
   } // end namespace linalg
diff --git a/viennacl/linalg/qr-method-common.hpp b/viennacl/linalg/qr-method-common.hpp
new file mode 100644
index 0000000..1ff1e93
--- /dev/null
+++ b/viennacl/linalg/qr-method-common.hpp
@@ -0,0 +1,225 @@
+#ifndef VIENNACL_LINALG_QR_METHOD_COMMON_HPP
+#define VIENNACL_LINALG_QR_METHOD_COMMON_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <cmath>
+
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+/** @file viennacl/linalg/qr-method-common.hpp
+    @brief Common routines used for the QR method and SVD. Experimental.
+*/
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
+    const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
+    const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = "house_update_A_right";
+    const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
+    const std::string SVD_HOUSEHOLDER_UPDATE_QR_KERNEL = "house_update_QR";
+    const std::string SVD_COPY_COL_KERNEL = "copy_col";
+    const std::string SVD_COPY_ROW_KERNEL = "copy_row";
+    const std::string SVD_MATRIX_TRANSPOSE_KERNEL = "transpose_inplace";
+    const std::string SVD_INVERSE_SIGNS_KERNEL = "inverse_signs";
+    const std::string SVD_GIVENS_PREV_KERNEL = "givens_prev";
+    const std::string SVD_GIVENS_NEXT_KERNEL = "givens_next";
+    const std::string SVD_FINAL_ITER_UPDATE_KERNEL = "final_iter_update";
+    const std::string SVD_UPDATE_QR_COLUMN_KERNEL = "update_qr_column";
+
+    namespace detail
+    {
+      //static const float EPS = 0.00001f;
+      //static const vcl_size_t ITER_MAX = 50;
+
+      static const double EPS = 1e-10;
+      static const vcl_size_t ITER_MAX = 50;
+
+      template <typename SCALARTYPE>
+      SCALARTYPE pythag(SCALARTYPE a, SCALARTYPE b)
+      {
+        return std::sqrt(a*a + b*b);
+      }
+
+      template <typename SCALARTYPE>
+      SCALARTYPE sign(SCALARTYPE val)
+      {
+          return (val >= 0) ? SCALARTYPE(1) : SCALARTYPE(-1);
+      }
+
+      // DEPRECATED: Replace with viennacl::linalg::norm_2
+      template <typename VectorType>
+      typename VectorType::value_type norm_lcl(VectorType const & x, vcl_size_t size)
+      {
+        typename VectorType::value_type x_norm = 0.0;
+        for(vcl_size_t i = 0; i < size; i++)
+          x_norm += std::pow(x[i], 2);
+        return std::sqrt(x_norm);
+      }
+
+      template <typename VectorType>
+      void normalize(VectorType & x, vcl_size_t size)
+      {
+        typename VectorType::value_type x_norm = norm_lcl(x, size);
+        for(vcl_size_t i = 0; i < size; i++)
+            x[i] /= x_norm;
+      }
+
+
+
+      template <typename VectorType>
+      void householder_vector(VectorType & v, vcl_size_t start)
+      {
+        typedef typename VectorType::value_type    ScalarType;
+        ScalarType x_norm = norm_lcl(v, v.size());
+        ScalarType alpha = -sign(v[start]) * x_norm;
+        v[start] += alpha;
+        normalize(v, v.size());
+      }
+
+      template <typename MatrixType>
+      void transpose(MatrixType & A)
+      {
+        typedef typename MatrixType::value_type                                   ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+        viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(A,
+                                      static_cast<cl_uint>(A.internal_size1()),
+                                      static_cast<cl_uint>(A.internal_size2())
+                                     )
+                              );
+      }
+
+
+
+      template <typename T>
+      void cdiv(T xr, T xi, T yr, T yi, T& cdivr, T& cdivi)
+      {
+          // Complex scalar division.
+          T r;
+          T d;
+          if (std::fabs(yr) > std::fabs(yi))
+          {
+              r = yi / yr;
+              d = yr + r * yi;
+              cdivr = (xr + r * xi) / d;
+              cdivi = (xi - r * xr) / d;
+          }
+          else
+          {
+              r = yr / yi;
+              d = yi + r * yr;
+              cdivr = (r * xr + xi) / d;
+              cdivi = (r * xi - xr) / d;
+          }
+      }
+
+
+      template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      void copy_vec(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                    viennacl::vector<SCALARTYPE, ALIGNMENT>& V,
+                    vcl_size_t row_start,
+                    vcl_size_t col_start,
+                    bool copy_col
+      )
+      {
+
+        std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : SVD_COPY_ROW_KERNEL;
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), kernel_name);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      V,
+                                      static_cast<cl_uint>(row_start),
+                                      static_cast<cl_uint>(col_start),
+                                      copy_col ? static_cast<cl_uint>(A.size1())
+                                               : static_cast<cl_uint>(A.size2()),
+                                      static_cast<cl_uint>(A.internal_size2())
+                              ));
+
+      }
+
+
+      template<typename SCALARTYPE, unsigned int ALIGNMENT>
+      void prepare_householder_vector(
+                                    viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                                    viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
+                                    vcl_size_t size,
+                                    vcl_size_t row_start,
+                                    vcl_size_t col_start,
+                                    vcl_size_t start,
+                                    bool is_column
+                                    )
+      {
+        boost::numeric::ublas::vector<SCALARTYPE> tmp = boost::numeric::ublas::scalar_vector<SCALARTYPE>(size, 0);
+
+        copy_vec(A, D, row_start, col_start, is_column);
+        fast_copy(D.begin(), D.begin() + vcl_ptrdiff_t(size - start), tmp.begin() + start);
+
+        //std::cout << "1: " << tmp << "\n";
+
+        detail::householder_vector(tmp, start);
+        fast_copy(tmp, D);
+
+        //std::cout << "2: "  << D << "\n";
+      }
+
+      template <typename SCALARTYPE, unsigned int ALIGNMENT, typename VectorType>
+      void bidiag_pack(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                       VectorType & dh,
+                       VectorType & sh
+                      )
+      {
+        viennacl::vector<SCALARTYPE, ALIGNMENT> D(dh.size());
+        viennacl::vector<SCALARTYPE, ALIGNMENT> S(sh.size());
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      S,
+                                      static_cast<cl_uint>(A.size1()),
+                                      static_cast<cl_uint>(A.size2()),
+                                      static_cast<cl_uint>(A.internal_size2())
+                                    ));
+
+        fast_copy(D, dh);
+        fast_copy(S, sh);
+      }
+
+    }
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/qr-method.hpp b/viennacl/linalg/qr-method.hpp
new file mode 100644
index 0000000..f787f3b
--- /dev/null
+++ b/viennacl/linalg/qr-method.hpp
@@ -0,0 +1,952 @@
+#ifndef VIENNACL_LINALG_QR_METHOD_HPP_
+#define VIENNACL_LINALG_QR_METHOD_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/qr-method-common.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+
+/** @file viennacl/linalg/qr-method.hpp
+    @brief Implementation of the QR method for eigenvalue computations. Experimental.
+*/
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    namespace detail
+    {
+        template<typename MatrixType, typename VectorType>
+        void givens_next(MatrixType& matrix,
+                        VectorType& tmp1,
+                        VectorType& tmp2,
+                        int l,
+                        int m
+                      )
+        {
+          viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+
+          typedef typename MatrixType::value_type                                   ScalarType;
+          typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_GIVENS_NEXT_KERNEL);
+
+          kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
+          kernel.local_work_size(0, 256);
+
+          viennacl::ocl::enqueue(kernel(
+                                        matrix,
+                                        tmp1,
+                                        tmp2,
+                                        static_cast<cl_uint>(matrix.size1()),
+                                        static_cast<cl_uint>(matrix.internal_size2()),
+                                        static_cast<cl_uint>(l),
+                                        static_cast<cl_uint>(m - 1)
+                                ));
+        }
+
+
+        // Symmetric tridiagonal QL algorithm.
+        // This is derived from the Algol procedures tql2, by Bowdler, Martin, Reinsch, and Wilkinson,
+        // Handbook for Auto. Comp., Vol.ii-Linear Algebra, and the corresponding Fortran subroutine in EISPACK.
+        template <typename SCALARTYPE, unsigned int ALIGNMENT>
+        void tql2(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q,
+                  boost::numeric::ublas::vector<SCALARTYPE> & d,
+                  boost::numeric::ublas::vector<SCALARTYPE> & e)
+        {
+            int n = static_cast<int>(Q.size1());
+
+            boost::numeric::ublas::vector<SCALARTYPE> cs(n), ss(n);
+            viennacl::vector<SCALARTYPE> tmp1(n), tmp2(n);
+
+            for (int i = 1; i < n; i++)
+                e(i - 1) = e(i);
+
+            e(n - 1) = 0;
+
+            SCALARTYPE f = 0;
+            SCALARTYPE tst1 = 0;
+            SCALARTYPE eps = 2 * static_cast<SCALARTYPE>(EPS);
+
+            for (int l = 0; l < n; l++)
+            {
+                // Find small subdiagonal element.
+                tst1 = std::max<SCALARTYPE>(tst1, std::fabs(d(l)) + std::fabs(e(l)));
+                int m = l;
+                while (m < n)
+                {
+                    if (std::fabs(e(m)) <= eps * tst1)
+                        break;
+                    m++;
+                }
+
+                // If m == l, d(l) is an eigenvalue, otherwise, iterate.
+                if (m > l)
+                {
+                    int iter = 0;
+                    do
+                    {
+                        iter = iter + 1;  // (Could check iteration count here.)
+
+                        // Compute implicit shift
+                        SCALARTYPE g = d(l);
+                        SCALARTYPE p = (d(l + 1) - g) / (2 * e(l));
+                        SCALARTYPE r = pythag<SCALARTYPE>(p, 1);
+                        if (p < 0)
+                        {
+                            r = -r;
+                        }
+
+                        d(l) = e(l) / (p + r);
+                        d(l + 1) = e(l) * (p + r);
+                        SCALARTYPE dl1 = d(l + 1);
+                        SCALARTYPE h = g - d(l);
+                        for (int i = l + 2; i < n; i++)
+                        {
+                            d(i) -= h;
+                        }
+
+                        f = f + h;
+
+                        // Implicit QL transformation.
+                        p = d(m);
+                        SCALARTYPE c = 1;
+                        SCALARTYPE c2 = c;
+                        SCALARTYPE c3 = c;
+                        SCALARTYPE el1 = e(l + 1);
+                        SCALARTYPE s = 0;
+                        SCALARTYPE s2 = 0;
+                        for (int i = m - 1; i >= l; i--)
+                        {
+                            c3 = c2;
+                            c2 = c;
+                            s2 = s;
+                            g = c * e(i);
+                            h = c * p;
+                            r = pythag(p, e(i));
+                            e(i + 1) = s * r;
+                            s = e(i) / r;
+                            c = p / r;
+                            p = c * d(i) - s * g;
+                            d(i + 1) = h + s * (c * g + s * d(i));
+
+                            cs[i] = c;
+                            ss[i] = s;
+                        }
+
+                        p = -s * s2 * c3 * el1 * e(l) / dl1;
+                        e(l) = s * p;
+                        d(l) = c * p;
+
+                        {
+                            viennacl::copy(cs, tmp1);
+                            viennacl::copy(ss, tmp2);
+
+                            givens_next(Q, tmp1, tmp2, l, m);
+                        }
+
+                        // Check for convergence.
+                    }
+                    while (std::fabs(e(l)) > eps * tst1);
+                }
+                d(l) = d(l) + f;
+                e(l) = 0;
+            }
+        }
+
+        template <typename SCALARTYPE, typename MatrixT>
+        void final_iter_update_gpu(MatrixT& A,
+                                int n,
+                                int last_n,
+                                SCALARTYPE q,
+                                SCALARTYPE p
+                                )
+        {
+            viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+            viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_FINAL_ITER_UPDATE_KERNEL);
+
+            viennacl::ocl::enqueue(kernel(
+                                          A,
+                                          static_cast<cl_uint>(A.internal_size1()),
+                                          static_cast<cl_uint>(n),
+                                          static_cast<cl_uint>(last_n),
+                                          q,
+                                          p
+                                  ));
+        }
+
+        template <typename SCALARTYPE, typename MatrixT>
+        void update_float_QR_column_gpu(MatrixT& A,
+                                const std::vector<SCALARTYPE>& buf,
+                                viennacl::vector<SCALARTYPE>& buf_vcl,
+                                int m,
+                                int n,
+                                int last_n,
+                                bool //is_triangular
+                                )
+        {
+            viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+            viennacl::fast_copy(buf, buf_vcl);
+
+            viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_UPDATE_QR_COLUMN_KERNEL);
+
+            viennacl::ocl::enqueue(kernel(
+                                          A,
+                                          static_cast<cl_uint>(A.internal_size1()),
+                                          buf_vcl,
+                                          static_cast<cl_uint>(m),
+                                          static_cast<cl_uint>(n),
+                                          static_cast<cl_uint>(last_n)
+                                  ));
+        }
+
+        template <typename SCALARTYPE, typename MatrixT>
+        void final_iter_update(MatrixT& A,
+                                int n,
+                                int last_n,
+                                SCALARTYPE q,
+                                SCALARTYPE p
+                                )
+        {
+            for (int i = 0; i < last_n; i++)
+            {
+                SCALARTYPE v_in = A(i, n);
+                SCALARTYPE z = A(i, n - 1);
+                A(i, n - 1) = q * z + p * v_in;
+                A(i, n) = q * v_in - p * z;
+            }
+        }
+
        /** @brief CPU path of the QR-column update: applies the stored Householder reflector coefficients to rows of A.
        *
        * For each row i, entries a(i, k) for k in [start_k, n] are transformed
        * using the per-column coefficients packed 5-wide in buf
        * (x, y, z, q, r as written by the QR sweep in hqr2()).
        *
        * NOTE(review): the loop keeps a three-element sliding window
        * (a_ik, a_ik_1, a_ik_2) over the row so each entry is read from memory
        * only once; the shift order at the bottom of the loop is essential.
        *
        * @param A             Matrix accessed through A.row(i) (raw row pointers)
        * @param buf           Packed reflector coefficients, 5 values per column index
        * @param m             First column each row update may start from
        * @param n             One past the last column updated
        * @param last_i        Number of rows to process
        * @param is_triangular If true, row i starts at column max(i + 1, m) (skips the zero part below the diagonal)
        */
        template <typename SCALARTYPE, typename MatrixT>
        void update_float_QR_column(MatrixT& A,
                                const std::vector<SCALARTYPE>& buf,
                                int m,
                                int n,
                                int last_i,
                                bool is_triangular
                                )
        {
            for (int i = 0; i < last_i; i++)
            {
                int start_k = is_triangular?std::max(i + 1, m):m;

                SCALARTYPE* a_row = A.row(i);

                // Sliding window over a_row[k], a_row[k+1], a_row[k+2].
                SCALARTYPE a_ik   = a_row[start_k];
                SCALARTYPE a_ik_1 = 0;
                SCALARTYPE a_ik_2 = 0;

                if(start_k < n)
                    a_ik_1 = a_row[start_k + 1];

                for(int k = start_k; k < n; k++)
                {
                    bool notlast = (k != n - 1);

                    SCALARTYPE p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1;

                    if (notlast)
                    {
                        a_ik_2 = a_row[k + 2];
                        p = p + buf[5 * k + 2] * a_ik_2;
                        a_ik_2 = a_ik_2 - p * buf[5 * k + 4];
                    }

                    a_row[k] = a_ik - p;
                    a_ik_1 = a_ik_1 - p * buf[5 * k + 3];

                    // Shift the window one column to the right.
                    a_ik = a_ik_1;
                    a_ik_1 = a_ik_2;
                }

                // Flush the last pending window entry back to the row.
                if(start_k < n)
                    a_row[n] = a_ik;
            }
        }
+
+        /** @brief Internal helper class representing a row-major dense matrix used for the QR method for the purpose of computing eigenvalues. */
+        template <typename SCALARTYPE>
+        class FastMatrix
+        {
+        public:
+            FastMatrix()
+            {
+                size_ = 0;
+            }
+
+            FastMatrix(vcl_size_t sz, vcl_size_t internal_size) : size_(sz), internal_size_(internal_size)
+            {
+                data.resize(internal_size * internal_size);
+            }
+
+            SCALARTYPE& operator()(int i, int j)
+            {
+                return data[i * internal_size_ + j];
+            }
+
+            SCALARTYPE* row(int i)
+            {
+                return &data[i * internal_size_];
+            }
+
+            SCALARTYPE* begin()
+            {
+                return &data[0];
+            }
+
+            SCALARTYPE* end()
+            {
+                return &data[0] + data.size();
+            }
+
+            std::vector<SCALARTYPE> data;
+        private:
+            vcl_size_t size_;
+            vcl_size_t internal_size_;
+        };
+
        // Nonsymmetric reduction from Hessenberg to real Schur form.
        // This is derived from the Algol procedure hqr2, by Martin and Wilkinson, Handbook for Auto. Comp.,
        // Vol.ii-Linear Algebra, and the corresponding  Fortran subroutine in EISPACK.
        /** @brief Reduces an upper Hessenberg matrix to real Schur form with the shifted QR algorithm, accumulating the transformations.
        *
        * The Hessenberg matrix is pulled to the host, iterated on the CPU
        * (with the accumulation into V offloaded to the GPU via
        * final_iter_update_gpu() / update_float_QR_column_gpu()), and the
        * result is written back at the end.
        *
        * @param vcl_H  Upper Hessenberg input matrix; overwritten with the back-substituted Schur/eigenvector data
        * @param V      Accumulated transformation matrix; overwritten with prod(trans(V), vcl_H) at the end
        * @param d      Receives the real parts of the eigenvalues
        * @param e      Receives the imaginary parts of the eigenvalues (0 for real eigenvalues)
        */
        template <typename SCALARTYPE, unsigned int ALIGNMENT>
        void hqr2(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& vcl_H,
                    viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& V,
                    boost::numeric::ublas::vector<SCALARTYPE>& d,
                    boost::numeric::ublas::vector<SCALARTYPE>& e)
        {
            transpose(V);

            int nn = static_cast<int>(vcl_H.size1());

            // Host-side working copy of H with the same padded row stride as vcl_H.
            FastMatrix<SCALARTYPE> H(nn, vcl_H.internal_size2());//, V(nn);

            // Per-column reflector coefficients, packed 5 values per column,
            // shared between the CPU update and the GPU kernel.
            std::vector<SCALARTYPE> buf(5 * nn);
            viennacl::vector<SCALARTYPE> buf_vcl(5 * nn);

            viennacl::fast_copy(vcl_H, H.begin());


            int n = nn - 1;

            SCALARTYPE eps = 2 * static_cast<SCALARTYPE>(EPS);
            SCALARTYPE exshift = 0;
            SCALARTYPE p = 0;
            SCALARTYPE q = 0;
            SCALARTYPE r = 0;
            SCALARTYPE s = 0;
            SCALARTYPE z = 0;
            SCALARTYPE t;
            SCALARTYPE w;
            SCALARTYPE x;
            SCALARTYPE y;

            SCALARTYPE out1, out2;

            // compute matrix norm
            SCALARTYPE norm = 0;
            for (int i = 0; i < nn; i++)
            {
                for (int j = std::max(i - 1, 0); j < nn; j++)
                    norm = norm + std::fabs(H(i, j));
            }

            // Outer loop over eigenvalue index
            int iter = 0;
            while (n >= 0)
            {
                // Look for single small sub-diagonal element
                int l = n;
                while (l > 0)
                {
                    s = std::fabs(H(l - 1, l - 1)) + std::fabs(H(l, l));
                    if (s == 0) s = norm;
                    if (std::fabs(H(l, l - 1)) < eps * s)
                        break;

                    l--;
                }

                // Check for convergence
                if (l == n)
                {
                    // One root found
                    H(n, n) = H(n, n) + exshift;
                    d(n) = H(n, n);
                    e(n) = 0;
                    n--;
                    iter = 0;
                }
                else if (l == n - 1)
                {
                    // Two roots found
                    w = H(n, n - 1) * H(n - 1, n);
                    p = (H(n - 1, n - 1) - H(n, n)) / 2;
                    q = p * p + w;
                    z = static_cast<SCALARTYPE>(std::sqrt(std::fabs(q)));
                    H(n, n) = H(n, n) + exshift;
                    H(n - 1, n - 1) = H(n - 1, n - 1) + exshift;
                    x = H(n, n);

                    if (q >= 0)
                    {
                        // Real pair
                        z = (p >= 0) ? (p + z) : (p - z);
                        d(n - 1) = x + z;
                        d(n) = d(n - 1);
                        if (z != 0)
                            d(n) = x - w / z;
                        e(n - 1) = 0;
                        e(n) = 0;
                        x = H(n, n - 1);
                        s = std::fabs(x) + std::fabs(z);
                        p = x / s;
                        q = z / s;
                        r = static_cast<SCALARTYPE>(std::sqrt(p * p + q * q));
                        p = p / r;
                        q = q / r;

                        // Row modification
                        for (int j = n - 1; j < nn; j++)
                        {
                            SCALARTYPE h_nj = H(n, j);
                            z = H(n - 1, j);
                            H(n - 1, j) = q * z + p * h_nj;
                            H(n, j) = q * h_nj - p * z;
                        }

                        // Column modification on the host copy, accumulation into V on the GPU.
                        final_iter_update(H, n, n + 1, q, p);
                        final_iter_update_gpu(V, n, nn, q, p);
                    }
                    else
                    {
                        // Complex pair
                        d(n - 1) = x + p;
                        d(n) = x + p;
                        e(n - 1) = z;
                        e(n) = -z;
                    }

                    n = n - 2;
                    iter = 0;
                }
                else
                {
                    // No convergence yet

                    // Form shift
                    x = H(n, n);
                    y = 0;
                    w = 0;
                    if (l < n)
                    {
                        y = H(n - 1, n - 1);
                        w = H(n, n - 1) * H(n - 1, n);
                    }

                    // Wilkinson's original ad hoc shift
                    if (iter == 10)
                    {
                        exshift += x;
                        for (int i = 0; i <= n; i++)
                            H(i, i) -= x;

                        s = std::fabs(H(n, n - 1)) + std::fabs(H(n - 1, n - 2));
                        x = y = SCALARTYPE(0.75) * s;
                        w = SCALARTYPE(-0.4375) * s * s;
                    }

                    // MATLAB's new ad hoc shift
                    if (iter == 30)
                    {
                        s = (y - x) / 2;
                        s = s * s + w;
                        if (s > 0)
                        {
                            s = static_cast<SCALARTYPE>(std::sqrt(s));
                            if (y < x) s = -s;
                            s = x - w / ((y - x) / 2 + s);
                            for (int i = 0; i <= n; i++)
                                H(i, i) -= s;
                            exshift += s;
                            x = y = w = SCALARTYPE(0.964);
                        }
                    }

                    // NOTE(review): no cap on the iteration count here - a
                    // non-converging matrix would loop forever. EISPACK's
                    // original aborts after 30 * nn iterations; confirm intent.
                    iter = iter + 1;

                    // Look for two consecutive small sub-diagonal elements
                    int m = n - 2;
                    while (m >= l)
                    {
                        SCALARTYPE h_m1_m1 = H(m + 1, m + 1);
                        z = H(m, m);
                        r = x - z;
                        s = y - z;
                        p = (r * s - w) / H(m + 1, m) + H(m, m + 1);
                        q = h_m1_m1 - z - r - s;
                        r = H(m + 2, m + 1);
                        s = std::fabs(p) + std::fabs(q) + std::fabs(r);
                        p = p / s;
                        q = q / s;
                        r = r / s;
                        if (m == l)
                            break;
                        if (std::fabs(H(m, m - 1)) * (std::fabs(q) + std::fabs(r)) < eps * (std::fabs(p) * (std::fabs(H(m - 1, m - 1)) + std::fabs(z) + std::fabs(h_m1_m1))))
                            break;
                        m--;
                    }

                    for (int i = m + 2; i <= n; i++)
                    {
                        H(i, i - 2) = 0;
                        if (i > m + 2)
                            H(i, i - 3) = 0;
                    }

                    // float QR step involving rows l:n and columns m:n
                    for (int k = m; k < n; k++)
                    {
                        bool notlast = (k != n - 1);
                        if (k != m)
                        {
                            p = H(k, k - 1);
                            q = H(k + 1, k - 1);
                            r = (notlast ? H(k + 2, k - 1) : 0);
                            x = std::fabs(p) + std::fabs(q) + std::fabs(r);
                            if (x != 0)
                            {
                                p = p / x;
                                q = q / x;
                                r = r / x;
                            }
                        }

                        if (x == 0) break;

                        s = static_cast<SCALARTYPE>(std::sqrt(p * p + q * q + r * r));
                        if (p < 0) s = -s;

                        if (s != 0)
                        {
                            if (k != m)
                                H(k, k - 1) = -s * x;
                            else
                                if (l != m)
                                    H(k, k - 1) = -H(k, k - 1);

                            p = p + s;
                            y = q / s;
                            z = r / s;
                            x = p / s;
                            q = q / p;
                            r = r / p;

                            // Stash the reflector coefficients for this column so the
                            // deferred column updates (CPU and GPU) can replay them.
                            buf[5 * k] = x;
                            buf[5 * k + 1] = y;
                            buf[5 * k + 2] = z;
                            buf[5 * k + 3] = q;
                            buf[5 * k + 4] = r;


                            SCALARTYPE* a_row_k = H.row(k);
                            SCALARTYPE* a_row_k_1 = H.row(k + 1);
                            SCALARTYPE* a_row_k_2 = H.row(k + 2);
                            // Row modification
                            for (int j = k; j < nn; j++)
                            {
                                SCALARTYPE h_kj = a_row_k[j];
                                SCALARTYPE h_k1_j = a_row_k_1[j];

                                p = h_kj + q * h_k1_j;
                                if (notlast)
                                {
                                    SCALARTYPE h_k2_j = a_row_k_2[j];
                                    p = p + r * h_k2_j;
                                    a_row_k_2[j] = h_k2_j - p * z;
                                }

                                a_row_k[j] = h_kj - p * x;
                                a_row_k_1[j] = h_k1_j - p * y;
                            }

                            //H(k + 1, nn - 1) = h_kj;


                            // Column modification
                            for (int i = k; i < std::min(nn, k + 4); i++)
                            {
                                p = x * H(i, k) + y * H(i, k + 1);
                                if (notlast)
                                {
                                    p = p + z * H(i, k + 2);
                                    H(i, k + 2) = H(i, k + 2) - p * r;
                                }

                                H(i, k) = H(i, k) - p;
                                H(i, k + 1) = H(i, k + 1) - p * q;
                            }
                        }
                        else
                        {
                            // Degenerate column: record an identity (no-op) reflector.
                            buf[5 * k] = 0;
                            buf[5 * k + 1] = 0;
                            buf[5 * k + 2] = 0;
                            buf[5 * k + 3] = 0;
                            buf[5 * k + 4] = 0;
                        }
                    }

                    // Timer timer;
                    // timer.start();

                    // Replay the recorded reflectors on the remaining columns of H (host)
                    // and accumulate them into V (device).
                    update_float_QR_column(H, buf, m, n, n, true);
                    update_float_QR_column_gpu(V, buf, buf_vcl, m, n, nn, false);

                    // std::cout << timer.get() << "\n";
                }
            }

            // Backsubstitute to find vectors of upper triangular form
            if (norm == 0)
            {
                return;
            }

            for (n = nn - 1; n >= 0; n--)
            {
                p = d(n);
                q = e(n);

                // Real vector
                if (q == 0)
                {
                    int l = n;
                    H(n, n) = 1;
                    for (int i = n - 1; i >= 0; i--)
                    {
                        w = H(i, i) - p;
                        r = 0;
                        for (int j = l; j <= n; j++)
                            r = r + H(i, j) * H(j, n);

                        if (e(i) < 0)
                        {
                            z = w;
                            s = r;
                        }
                        else
                        {
                            l = i;
                            if (e(i) == 0)
                            {
                                H(i, n) = (w != 0) ? (-r / w) : (-r / (eps * norm));
                            }
                            else
                            {
                                // Solve real equations
                                x = H(i, i + 1);
                                y = H(i + 1, i);
                                q = (d(i) - p) * (d(i) - p) + e(i) * e(i);
                                t = (x * s - z * r) / q;
                                H(i, n) = t;
                                H(i + 1, n) = (std::fabs(x) > std::fabs(z)) ? ((-r - w * t) / x) : ((-s - y * t) / z);
                            }

                            // Overflow control
                            t = std::fabs(H(i, n));
                            if ((eps * t) * t > 1)
                                for (int j = i; j <= n; j++)
                                    H(j, n) /= t;
                        }
                    }
                }
                else if (q < 0)
                {
                    // Complex vector
                    int l = n - 1;

                    // Last vector component imaginary so matrix is triangular
                    if (std::fabs(H(n, n - 1)) > std::fabs(H(n - 1, n)))
                    {
                        H(n - 1, n - 1) = q / H(n, n - 1);
                        H(n - 1, n) = -(H(n, n) - p) / H(n, n - 1);
                    }
                    else
                    {
                        cdiv<SCALARTYPE>(0, -H(n - 1, n), H(n - 1, n - 1) - p, q, out1, out2);

                        H(n - 1, n - 1) = out1;
                        H(n - 1, n) = out2;
                    }

                    H(n, n - 1) = 0;
                    H(n, n) = 1;
                    for (int i = n - 2; i >= 0; i--)
                    {
                        SCALARTYPE ra, sa, vr, vi;
                        ra = 0;
                        sa = 0;
                        for (int j = l; j <= n; j++)
                        {
                            SCALARTYPE h_ij = H(i, j);
                            ra = ra + h_ij * H(j, n - 1);
                            sa = sa + h_ij * H(j, n);
                        }

                        w = H(i, i) - p;

                        if (e(i) < 0)
                        {
                            z = w;
                            r = ra;
                            s = sa;
                        }
                        else
                        {
                            l = i;
                            if (e(i) == 0)
                            {
                                cdiv<SCALARTYPE>(-ra, -sa, w, q, out1, out2);
                                H(i, n - 1) = out1;
                                H(i, n) = out2;
                            }
                            else
                            {
                                // Solve complex equations
                                x = H(i, i + 1);
                                y = H(i + 1, i);
                                vr = (d(i) - p) * (d(i) - p) + e(i) * e(i) - q * q;
                                vi = (d(i) - p) * 2 * q;
                                if ( (vr == 0) && (vi == 0) )
                                    vr = eps * norm * (std::fabs(w) + std::fabs(q) + std::fabs(x) + std::fabs(y) + std::fabs(z));

                                cdiv<SCALARTYPE>(x * r - z * ra + q * sa, x * s - z * sa - q * ra, vr, vi, out1, out2);

                                H(i, n - 1) = out1;
                                H(i, n) = out2;


                                if (std::fabs(x) > (std::fabs(z) + std::fabs(q)))
                                {
                                    H(i + 1, n - 1) = (-ra - w * H(i, n - 1) + q * H(i, n)) / x;
                                    H(i + 1, n) = (-sa - w * H(i, n) - q * H(i, n - 1)) / x;
                                }
                                else
                                {
                                    cdiv<SCALARTYPE>(-r - y * H(i, n - 1), -s - y * H(i, n), z, q, out1, out2);

                                    H(i + 1, n - 1) = out1;
                                    H(i + 1, n) = out2;
                                }
                            }

                            // Overflow control
                            t = std::max(std::fabs(H(i, n - 1)), std::fabs(H(i, n)));
                            if ((eps * t) * t > 1)
                            {
                                for (int j = i; j <= n; j++)
                                {
                                    H(j, n - 1) /= t;
                                    H(j, n) /= t;
                                }
                            }
                        }
                    }
                }
            }

            // Write the host working copy back and form the final eigenvector matrix.
            viennacl::fast_copy(H.begin(), H.end(),  vcl_H);
            // viennacl::fast_copy(V.begin(), V.end(),  vcl_V);

            viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> tmp = V;

            V = viennacl::linalg::prod(trans(tmp), vcl_H);
        }
+
+        template <typename SCALARTYPE, unsigned int ALIGNMENT>
+        bool householder_twoside(
+                            viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                            viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
+                            viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
+                            vcl_size_t start)
+        {
+            viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+            if(start + 2 >= A.size1())
+                return false;
+
+            prepare_householder_vector(A, D, A.size1(), start + 1, start, start + 1, true);
+
+            {
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+
+                viennacl::ocl::enqueue(kernel(
+                                              A,
+                                              D,
+                                              static_cast<cl_uint>(start + 1),
+                                              static_cast<cl_uint>(start),
+                                              static_cast<cl_uint>(A.size1()),
+                                              static_cast<cl_uint>(A.size2()),
+                                              static_cast<cl_uint>(A.internal_size2()),
+                                              viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
+                                      ));
+            }
+
+            {
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+
+                viennacl::ocl::enqueue(kernel(
+                                              A,
+                                              D,
+                                              static_cast<cl_uint>(0),
+                                              static_cast<cl_uint>(0),
+                                              static_cast<cl_uint>(A.size1()),
+                                              static_cast<cl_uint>(A.size2()),
+                                              static_cast<cl_uint>(A.internal_size2()),
+                                              viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                      ));
+            }
+
+            {
+                viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+
+                viennacl::ocl::enqueue(kernel(
+                                                Q,
+                                                D,
+                                                static_cast<cl_uint>(A.size1()),
+                                                static_cast<cl_uint>(A.size2()),
+                                                static_cast<cl_uint>(Q.internal_size2()),
+                                                viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                            ));
+            }
+
+            return true;
+        }
+
+        template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+        void tridiagonal_reduction(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A,
+                                    viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q)
+        {
+            vcl_size_t sz = A.size1();
+
+            viennacl::vector<SCALARTYPE> hh_vector(sz);
+
+            for(vcl_size_t i = 0; i < sz; i++)
+            {
+                householder_twoside(A, Q, hh_vector, i);
+            }
+
+        }
+
+        template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+        void qr_method(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & A,
+                       viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & Q,
+                       boost::numeric::ublas::vector<SCALARTYPE> & D,
+                       boost::numeric::ublas::vector<SCALARTYPE> & E,
+                       bool is_symmetric = true)
+        {
+            viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+            assert(A.size1() == A.size2() && bool("Input matrix must be square for QR method!"));
+
+            D.resize(A.size1());
+            E.resize(A.size1());
+
+            viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::init(ctx);
+
+            Q = viennacl::identity_matrix<SCALARTYPE>(Q.size1(), ctx);
+
+            // reduce to tridiagonal form
+            detail::tridiagonal_reduction(A, Q);
+
+            // pack diagonal and super-diagonal
+            // ublas::vector<SCALARTYPE> D(A.size1()), E(A.size1());
+
+            bidiag_pack(A, D, E);
+
+            // find eigenvalues
+            if(is_symmetric)
+            {
+
+                detail::tql2(Q, D, E);
+                transpose(Q);
+            }
+            else
+            {
+                detail::hqr2(A, Q, D, E);
+            }
+
+            // std::cout << A << "\n";
+
+            boost::numeric::ublas::matrix<float> eigen_values(A.size1(), A.size1());
+            eigen_values.clear();
+
+            for (vcl_size_t i = 0; i < A.size1(); i++)
+            {
+                if(std::fabs(E(i)) < EPS)
+                {
+                    eigen_values(i, i) = D(i);
+                }
+                else
+                {
+                    eigen_values(i, i) = D(i);
+                    eigen_values(i, i + 1) = E(i);
+                    eigen_values(i + 1, i) = -E(i);
+                    eigen_values(i + 1, i + 1) = D(i);
+                    i++;
+                }
+            }
+
+            copy(eigen_values, A);
+        }
+    }
+
+
+    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+    void qr_method_nsm(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A,
+                       viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q,
+                       boost::numeric::ublas::vector<SCALARTYPE>& D,
+                       boost::numeric::ublas::vector<SCALARTYPE>& E
+                      )
+    {
+        detail::qr_method(A, Q, D, E, false);
+    }
+
+    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+    void qr_method_sym(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A,
+                       viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q,
+                       boost::numeric::ublas::vector<SCALARTYPE>& D
+                      )
+    {
+        boost::numeric::ublas::vector<SCALARTYPE> E(A.size1());
+
+        detail::qr_method(A, Q, D, E, true);
+    }
+
+  }
+}
+
+#endif
diff --git a/viennacl/linalg/qr.hpp b/viennacl/linalg/qr.hpp
index e0f28fd..34b63ca 100644
--- a/viennacl/linalg/qr.hpp
+++ b/viennacl/linalg/qr.hpp
@@ -2,23 +2,24 @@
 #define VIENNACL_LINALG_QR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/qr.hpp
-    @brief Proivdes a QR factorization using a block-based approach.  Experimental in 1.2.x.
+    @brief Provides a QR factorization using a block-based approach.
 */
 
 #include <utility>
@@ -47,129 +48,56 @@ namespace viennacl
   {
     namespace detail
     {
-      
-      // orthogonalises j-th column of A
-      template <typename MatrixType, typename VectorType>
-      typename MatrixType::value_type setup_householder_vector(MatrixType const & A, VectorType & v, std::size_t j)
-      {
-        typedef typename MatrixType::value_type   ScalarType;
-        
-        //compute norm of column below diagonal:
-        ScalarType sigma = 0;
-        ScalarType beta = 0;
-        for (std::size_t k = j+1; k<A.size1(); ++k)
-          sigma += A(k, j) * A(k, j);
-
-        //get v from A:
-        v[j] = 1;
-        //ScalarType scaling = sqrt(sigma + A(j,j)*A(j,j));
-        //ScalarType scaling = sqrt(sigma);
-        ScalarType scaling = 1.0;
-        for (std::size_t k = j+1; k<A.size1(); ++k)
-          v[k] = A(k, j) / scaling;
-        sigma = sigma / (scaling * scaling);
-        ScalarType A_jj = A(j,j) / scaling;
-        
-        std::cout << "sigma: " << sigma << std::endl;
-        assert( sigma >= 0.0  && "sigma must be non-negative!");
-
-        
-        if (sigma == 0)
-          return 0;
-        else
-        {
-          ScalarType mu = sqrt(sigma + A_jj*A_jj);
-          std::cout << "mu: " << mu << std::endl;
-          std::cout << "sigma: " << sigma << std::endl;
-          
-          ScalarType v1;
-          if (A_jj <= 0)
-            v1 = A_jj - mu;
-          else
-            v1 = -sigma / (A_jj + mu);
-          
-          beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
-          
-          //divide v by its diagonal element v[j]
-          v[j] = 1;
-          std::cout << "v1: " << v1 << std::endl;
-          for (std::size_t k = j+1; k<A.size1(); ++k)
-            v[k] /= v1;
-        }
-          
-        return beta;
-      }
-      
 
       template <typename MatrixType, typename VectorType>
-      typename MatrixType::value_type setup_householder_vector_ublas(MatrixType const & A, VectorType & v, MatrixType & matrix_1x1, std::size_t j)
+      typename MatrixType::value_type setup_householder_vector_ublas(MatrixType const & A, VectorType & v, MatrixType & matrix_1x1, vcl_size_t j)
       {
         using boost::numeric::ublas::range;
         using boost::numeric::ublas::project;
-        
+
         typedef typename MatrixType::value_type   ScalarType;
-        
+
         //compute norm of column below diagonal:
-        //ScalarType sigma = 0;
-        //for (std::size_t k = j+1; k<A.size1(); ++k)
-        //  sigma += A(k, j) * A(k, j);
-        matrix_1x1 = prod( trans(project(A, range(j+1, A.size1()), range(j, j+1))),
-                                 project(A, range(j+1, A.size1()), range(j, j+1))
-                         );
+        matrix_1x1 = boost::numeric::ublas::prod( trans(project(A, range(j+1, A.size1()), range(j, j+1))),
+                                                        project(A, range(j+1, A.size1()), range(j, j+1))
+                                                );
         ScalarType sigma = matrix_1x1(0,0);
         ScalarType beta = 0;
         ScalarType A_jj = A(j,j);
-        
-        assert( sigma >= 0.0  && "sigma must be non-negative!");
+
+        assert( sigma >= 0.0  && bool("sigma must be non-negative!"));
 
         //get v from A:
-        //for (std::size_t k = j+1; k<A.size1(); ++k)
-        //  v[k] = A(k, j);
         v(j,0) = 1.0;
         project(v, range(j+1, A.size1()), range(0,1)) = project(A, range(j+1, A.size1()), range(j,j+1));
-        
+
         if (sigma == 0)
           return 0;
         else
         {
-          ScalarType mu = sqrt(sigma + A_jj*A_jj);
-          //std::cout << "mu: " << mu << std::endl;
-          //std::cout << "sigma: " << sigma << std::endl;
-          
-          ScalarType v1;
-          if (A_jj <= 0)
-            v1 = A_jj - mu;
-          else
-            v1 = -sigma / (A_jj + mu);
-          
-          beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
-          
+          ScalarType mu = std::sqrt(sigma + A_jj*A_jj);
+
+          ScalarType v1 = (A_jj <= 0) ? (A_jj - mu) : (-sigma / (A_jj + mu));
+          beta = static_cast<ScalarType>(2.0) * v1 * v1 / (sigma + v1 * v1);
+
           //divide v by its diagonal element v[j]
-          //v[j] = 1;
-          //for (std::size_t k = j+1; k<A.size1(); ++k)
-          //  v[k] /= v1;
           project(v, range(j+1, A.size1()), range(0,1)) /= v1;
         }
-          
+
         return beta;
       }
 
 
       template <typename MatrixType, typename VectorType>
-      typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type 
-      setup_householder_vector_viennacl(MatrixType const & A, VectorType & v, MatrixType & matrix_1x1, std::size_t j)
+      typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type
+      setup_householder_vector_viennacl(MatrixType const & A, VectorType & v, MatrixType & matrix_1x1, vcl_size_t j)
       {
-        //using boost::numeric::ublas::range;
-        //using boost::numeric::ublas::project;
         using viennacl::range;
         using viennacl::project;
-        
+
         typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
-        
+
         //compute norm of column below diagonal:
-        //ScalarType sigma = 0;
-        //for (std::size_t k = j+1; k<A.size1(); ++k)
-        //  sigma += A(k, j) * A(k, j);
         matrix_1x1 = viennacl::linalg::prod( trans(project(A, range(j+1, A.size1()), range(j, j+1))),
                                                    project(A, range(j+1, A.size1()), range(j, j+1))
                                            );
@@ -177,99 +105,70 @@ namespace viennacl
         ScalarType beta = 0;
         ScalarType A_jj = A(j,j);
 
-        //std::cout << "sigma: " << sigma << std::endl;
-        assert( sigma >= 0.0  && "sigma must be non-negative!");
-
+        assert( sigma >= 0.0  && bool("sigma must be non-negative!"));
 
         //get v from A:
-        //for (std::size_t k = j+1; k<A.size1(); ++k)
-        //  v[k] = A(k, j);
         v(j,0) = 1.0;
         project(v, range(j+1, A.size1()), range(0,1)) = project(A, range(j+1, A.size1()), range(j,j+1));
-        
+
         if (sigma == 0)
           return 0;
         else
         {
-          ScalarType mu = sqrt(sigma + A_jj*A_jj);
-          //std::cout << "mu: " << mu << std::endl;
-          //std::cout << "sigma: " << sigma << std::endl;
-          
-          ScalarType v1;
-          if (A_jj <= 0)
-            v1 = A_jj - mu;
-          else
-            v1 = -sigma / (A_jj + mu);
-          
+          ScalarType mu = std::sqrt(sigma + A_jj*A_jj);
+
+          ScalarType v1 = (A_jj <= 0) ? (A_jj - mu) : (-sigma / (A_jj + mu));
+
           beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
-          
+
           //divide v by its diagonal element v[j]
-          //v[j] = 1;
-          //for (std::size_t k = j+1; k<A.size1(); ++k)
-          //  v[k] /= v1;
-          //v(j,0) = 1.0;
           project(v, range(j+1, A.size1()), range(0,1)) /= v1;
         }
-          
+
         return beta;
       }
 
 
       // Apply (I - beta v v^T) to the k-th column of A, where v is the reflector starting at j-th row/column
       template <typename MatrixType, typename VectorType, typename ScalarType>
-      void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, std::size_t j, std::size_t k)
+      void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, vcl_size_t j, vcl_size_t k)
       {
         ScalarType v_in_col = A(j,k);
-        for (std::size_t i=j+1; i<A.size1(); ++i)
+        for (vcl_size_t i=j+1; i<A.size1(); ++i)
           v_in_col += v[i] * A(i,k);
 
-        assert(v[j] == 1.0);
-        //std::cout << "v[]: " << v[0] << ", " << v[1] << ", " << v[2] << std::endl;
-        //std::cout << "v_in_col: " << v_in_col << std::endl;
-        
-        for (std::size_t i=j; i<A.size1(); ++i)
+        //assert(v[j] == 1.0);
+
+        for (vcl_size_t i=j; i<A.size1(); ++i)
           A(i,k) -= beta * v_in_col * v[i];
       }
 
       template <typename MatrixType, typename VectorType, typename ScalarType>
-      void householder_reflect_ublas(MatrixType & A, VectorType & v, MatrixType & matrix_1x1, ScalarType beta, std::size_t j, std::size_t k)
+      void householder_reflect_ublas(MatrixType & A, VectorType & v, MatrixType & matrix_1x1, ScalarType beta, vcl_size_t j, vcl_size_t k)
       {
         using boost::numeric::ublas::range;
         using boost::numeric::ublas::project;
-        
-        ScalarType v_in_col = A(j,k);
-        //for (std::size_t i=j+1; i<A.size1(); ++i)
-        //  v_in_col += v[i] * A(i,k);
 
-        matrix_1x1 = prod(trans(project(v, range(j+1, A.size1()), range(0, 1))),
-                         project(A, range(j+1, A.size1()), range(k,k+1)));
+        ScalarType v_in_col = A(j,k);
+        matrix_1x1 = boost::numeric::ublas::prod(trans(project(v, range(j+1, A.size1()), range(0, 1))),
+                                                       project(A, range(j+1, A.size1()), range(k,k+1)));
         v_in_col += matrix_1x1(0,0);
-                         
-        //for (std::size_t i=j; i<A.size1(); ++i)
-        //  A(i,k) -= beta * v_in_col * v[i];
-        
+
         project(A, range(j, A.size1()), range(k, k+1)) -= (beta * v_in_col) * project(v, range(j, A.size1()), range(0, 1));
       }
 
       template <typename MatrixType, typename VectorType, typename ScalarType>
-      void householder_reflect_viennacl(MatrixType & A, VectorType & v, MatrixType & matrix_1x1, ScalarType beta, std::size_t j, std::size_t k)
+      void householder_reflect_viennacl(MatrixType & A, VectorType & v, MatrixType & matrix_1x1, ScalarType beta, vcl_size_t j, vcl_size_t k)
       {
-        //using boost::numeric::ublas::range;
-        //using boost::numeric::ublas::project;
         using viennacl::range;
         using viennacl::project;
-        
+
         ScalarType v_in_col = A(j,k);
-        //for (std::size_t i=j+1; i<A.size1(); ++i)
-        //  v_in_col += v[i] * A(i,k);
 
         matrix_1x1 = viennacl::linalg::prod(trans(project(v, range(j+1, A.size1()), range(0, 1))),
                                                   project(A, range(j+1, A.size1()), range(k,k+1)));
         v_in_col += matrix_1x1(0,0);
-                         
-        //for (std::size_t i=j; i<A.size1(); ++i)
-        //  A(i,k) -= beta * v_in_col * v[i];
-        
+
         if ( beta * v_in_col != 0.0)
         {
           VectorType temp = project(v, range(j, A.size1()), range(0, 1));
@@ -282,158 +181,382 @@ namespace viennacl
 
       // Apply (I - beta v v^T) to A, where v is the reflector starting at j-th row/column
       template <typename MatrixType, typename VectorType, typename ScalarType>
-      void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, std::size_t j)
+      void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, vcl_size_t j)
       {
-        std::size_t column_end = A.size2();
-        
-        for (std::size_t k=j; k<column_end; ++k) //over columns
+        vcl_size_t column_end = A.size2();
+
+        for (vcl_size_t k=j; k<column_end; ++k) //over columns
           householder_reflect(A, v, beta, j, k);
       }
-      
-      
+
+
       template <typename MatrixType, typename VectorType>
-      void write_householder_to_A(MatrixType & A, VectorType const & v, std::size_t j)
+      void write_householder_to_A(MatrixType & A, VectorType const & v, vcl_size_t j)
       {
-        for (std::size_t i=j+1; i<A.size1(); ++i)
+        for (vcl_size_t i=j+1; i<A.size1(); ++i)
           A(i,j) = v[i];
       }
-      
+
       template <typename MatrixType, typename VectorType>
-      void write_householder_to_A_ublas(MatrixType & A, VectorType const & v, std::size_t j)
+      void write_householder_to_A_ublas(MatrixType & A, VectorType const & v, vcl_size_t j)
       {
-        //for (std::size_t i=j+1; i<A.size1(); ++i)
-        //  A(i,j) = v[i];
         using boost::numeric::ublas::range;
         using boost::numeric::ublas::project;
-        
+
         //VectorType temp = project(v, range(j+1, A.size1()));
         project( A, range(j+1, A.size1()), range(j, j+1) ) = project(v, range(j+1, A.size1()), range(0, 1) );;
       }
 
       template <typename MatrixType, typename VectorType>
-      void write_householder_to_A_viennacl(MatrixType & A, VectorType const & v, std::size_t j)
+      void write_householder_to_A_viennacl(MatrixType & A, VectorType const & v, vcl_size_t j)
       {
-        //for (std::size_t i=j+1; i<A.size1(); ++i)
-        //  A(i,j) = v[i];
-        //using boost::numeric::ublas::range;
-        //using boost::numeric::ublas::project;
         using viennacl::range;
         using viennacl::project;
-        
+
         //VectorType temp = project(v, range(j+1, A.size1()));
         project( A, range(j+1, A.size1()), range(j, j+1) ) = project(v, range(j+1, A.size1()), range(0, 1) );;
       }
 
-      
-      /*template<typename MatrixType>
-      std::vector<typename MatrixType::value_type> qr(MatrixType & A)
+
+
+      /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A
+      *
+      * @param A            A dense compatible to Boost.uBLAS
+      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+      */
+      template<typename MatrixType>
+      std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, vcl_size_t block_size = 32)
       {
         typedef typename MatrixType::value_type   ScalarType;
-        
+        typedef boost::numeric::ublas::matrix_range<MatrixType>  MatrixRange;
+
+        using boost::numeric::ublas::range;
+        using boost::numeric::ublas::project;
+
         std::vector<ScalarType> betas(A.size2());
-        std::vector<ScalarType> v(A.size1());
+        MatrixType v(A.size1(), 1);
+        MatrixType matrix_1x1(1,1);
+
+        MatrixType Y(A.size1(), block_size); Y.clear(); Y.resize(A.size1(), block_size);
+        MatrixType W(A.size1(), block_size); W.clear(); W.resize(A.size1(), block_size);
 
-        //copy A to Q:
-        for (size_t j=0; j<A.size2(); ++j)
+        //run over A in a block-wise manner:
+        for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
         {
-            betas[j] = setup_householder_vector(A, v, j);
-            householder_reflect(A, v, betas[j], j);
-            write_householder_to_A(A, v, j);
+          vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
+
+          //determine Householder vectors:
+          for (vcl_size_t k = 0; k < effective_block_size; ++k)
+          {
+            betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
+
+            for (vcl_size_t l = k; l < effective_block_size; ++l)
+              detail::householder_reflect_ublas(A, v, matrix_1x1, betas[j+k], j+k, j+l);
+
+            detail::write_householder_to_A_ublas(A, v, j+k);
+          }
+
+          //
+          // Setup Y:
+          //
+          Y.clear();  Y.resize(A.size1(), block_size);
+          for (vcl_size_t k = 0; k < effective_block_size; ++k)
+          {
+            //write Householder to Y:
+            Y(j+k,k) = 1.0;
+            project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
+          }
+
+          //
+          // Setup W:
+          //
+
+          //first vector:
+          W.clear();  W.resize(A.size1(), block_size);
+          W(j, 0) = -betas[j];
+          project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
+
+
+          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+          for (vcl_size_t k = 1; k < effective_block_size; ++k)
+          {
+            MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
+            MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
+            MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
+            MatrixRange z     = project(W, range(j, A.size1()), range(k, k+1));
+
+            MatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
+            z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
+          }
+
+          //
+          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+          //
+
+          if (A.size2() - j - effective_block_size > 0)
+          {
+
+            MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
+            MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
+            MatrixType temp = boost::numeric::ublas::prod(trans(W_part), A_part);
+
+            A_part += prod(project(Y, range(j, A.size1()), range(0, effective_block_size)),
+                          temp);
+          }
         }
-        
+
         return betas;
-      }*/
-      
-      
-      
-      
-      class range
-      {
-        public:
-          range(std::size_t start, std::size_t end) : start_(start), end_(end) {}
-          
-          std::size_t lower() const { return start_; }
-          std::size_t upper() const { return end_; }
-          
-        private:
-          std::size_t start_;
-          std::size_t end_;
-      };
-
-      template <typename MatrixType>
-      class sub_matrix
+      }
+
+
+      /** @brief Implementation of a OpenCL-only QR factorization for GPUs (or multi-core CPU). DEPRECATED! Use only if you're curious and interested in playing a bit with a GPU-only implementation.
+      *
+      * Performance is rather poor at small matrix sizes.
+      * Prefer the use of the hybrid version, which is automatically chosen using the interface function inplace_qr()
+      *
+      * @param A            A dense ViennaCL matrix to be factored
+      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+      */
+      template<typename MatrixType>
+      std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type >
+      inplace_qr_viennacl(MatrixType & A, vcl_size_t block_size = 16)
       {
-        public:
-          typedef typename MatrixType::value_type value_type;
-          
-          sub_matrix(MatrixType & mat,
-                      range row_range,
-                      range col_range) : mat_(mat), row_range_(row_range), col_range_(col_range) {}
-                      
-          value_type operator()(size_t row, size_t col) const
+        typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
+        typedef viennacl::matrix_range<MatrixType>  MatrixRange;
+
+        using viennacl::range;
+        using viennacl::project;
+
+        std::vector<ScalarType> betas(A.size2());
+        MatrixType v(A.size1(), 1);
+        MatrixType matrix_1x1(1,1);
+
+        MatrixType Y(A.size1(), block_size); Y.clear();
+        MatrixType W(A.size1(), block_size); W.clear();
+
+        MatrixType YT_prod_v(block_size, 1);
+        MatrixType z(A.size1(), 1);
+
+        //run over A in a block-wise manner:
+        for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+        {
+          vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
+
+          //determine Householder vectors:
+          for (vcl_size_t k = 0; k < effective_block_size; ++k)
           {
-            assert(row < size1());
-            assert(col < size2());
-            return mat_(row + row_range_.lower(), col + col_range_.lower()); 
+            betas[j+k] = detail::setup_householder_vector_viennacl(A, v, matrix_1x1, j+k);
+            for (vcl_size_t l = k; l < effective_block_size; ++l)
+              detail::householder_reflect_viennacl(A, v, matrix_1x1, betas[j+k], j+k, j+l);
+
+            detail::write_householder_to_A_viennacl(A, v, j+k);
           }
-                      
-          std::size_t size1() const { return row_range_.upper() - row_range_.lower(); }
-          std::size_t size2() const { return col_range_.upper() - col_range_.lower(); }
-          
-        private:
-          MatrixType & mat_;
-          range row_range_;
-          range col_range_;
-      };
-
-
-      //computes C = prod(A, B)
-      template <typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
-      void prod_AA(MatrixTypeA const & A, MatrixTypeB const & B, MatrixTypeC & C)
-      {
-        assert(C.size1() == A.size1());
-        assert(A.size2() == B.size1());
-        assert(B.size2() == C.size2());
-        
-        typedef typename MatrixTypeC::value_type   ScalarType;
-        
-        for (std::size_t i=0; i<C.size1(); ++i)
-        {
-          for (std::size_t j=0; j<C.size2(); ++j)
+
+          //
+          // Setup Y:
+          //
+          Y.clear();
+          for (vcl_size_t k = 0; k < effective_block_size; ++k)
+          {
+            //write Householder to Y:
+            Y(j+k,k) = 1.0;
+            project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
+          }
+
+          //
+          // Setup W:
+          //
+
+          //first vector:
+          W.clear();
+          W(j, 0) = -betas[j];
+          //project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
+          project(W, range(j+1, A.size1()), range(0, 1)) = project(A, range(j+1, A.size1()), range(j, j+1));
+          project(W, range(j+1, A.size1()), range(0, 1)) *= -betas[j];
+
+
+          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+          for (vcl_size_t k = 1; k < effective_block_size; ++k)
           {
-            ScalarType val = 0;
-            for (std::size_t k=0; k<A.size2(); ++k)
-              val += A(i, k) * B(k, j);
-            C(i, j) = val;
+            MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
+            MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
+            MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
+
+            project(YT_prod_v, range(0, k), range(0,1)) = prod(trans(Y_old), v_k);
+            project(z, range(j, A.size1()), range(0,1)) = prod(W_old, project(YT_prod_v, range(0, k), range(0,1)));
+            project(W, range(j, A.size1()), range(k, k+1)) = project(z, range(j, A.size1()), range(0,1));
+            project(W, range(j, A.size1()), range(k, k+1)) += v_k;
+            project(W, range(j, A.size1()), range(k, k+1)) *= - betas[j+k];
+          }
+
+          //
+          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+          //
+
+          if (A.size2() > j + effective_block_size)
+          {
+
+            MatrixRange A_part(A, range(j, A.size1()), range(j+effective_block_size, A.size2()));
+            MatrixRange W_part(W, range(j, A.size1()), range(0, effective_block_size));
+            MatrixType temp = prod(trans(W_part), A_part);
+
+            A_part += prod(project(Y, range(j, A.size1()), range(0, effective_block_size)),
+                          temp);
           }
         }
+
+        return betas;
       }
-      
-      //computes C = prod(A^T, B)
-      template <typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
-      void prod_TA(MatrixTypeA const & A, MatrixTypeB const & B, MatrixTypeC & C)
+
+
+
+
+
+
+      //MatrixType is ViennaCL-matrix
+      /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU)
+      *
+      * Prefer the use of the convenience interface inplace_qr()
+      *
+      * @param A            A dense ViennaCL matrix to be factored
+      * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+      */
+      template<typename MatrixType>
+      std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type >
+      inplace_qr_hybrid(MatrixType & A, vcl_size_t block_size = 16)
       {
-        assert(C.size1() == A.size2());
-        assert(A.size1() == B.size1());
-        assert(B.size2() == C.size2());
-        
-        typedef typename MatrixTypeC::value_type   ScalarType;
-        
-        for (std::size_t i=0; i<C.size1(); ++i)
+        typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
+
+        typedef viennacl::matrix_range<MatrixType>                    VCLMatrixRange;
+        typedef boost::numeric::ublas::matrix<ScalarType>             UblasMatrixType;
+        typedef boost::numeric::ublas::matrix_range<UblasMatrixType>  UblasMatrixRange;
+
+        std::vector<ScalarType> betas(A.size2());
+        UblasMatrixType v(A.size1(), 1);
+        UblasMatrixType matrix_1x1(1,1);
+
+        UblasMatrixType ublasW(A.size1(), block_size); ublasW.clear(); ublasW.resize(A.size1(), block_size);
+        UblasMatrixType ublasY(A.size1(), block_size); ublasY.clear(); ublasY.resize(A.size1(), block_size);
+
+        UblasMatrixType ublasA(A.size1(), A.size1());
+
+        MatrixType vclW(ublasW.size1(), ublasW.size2());
+        MatrixType vclY(ublasY.size1(), ublasY.size2());
+
+
+        //run over A in a block-wise manner:
+        for (vcl_size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
         {
-          for (std::size_t j=0; j<C.size2(); ++j)
+          vcl_size_t effective_block_size = std::min(std::min(A.size1(), A.size2()), j+block_size) - j;
+          UblasMatrixRange ublasA_part = boost::numeric::ublas::project(ublasA,
+                                                                        boost::numeric::ublas::range(0, A.size1()),
+                                                                        boost::numeric::ublas::range(j, j + effective_block_size));
+          viennacl::copy(viennacl::project(A,
+                                          viennacl::range(0, A.size1()),
+                                          viennacl::range(j, j+effective_block_size)),
+                         ublasA_part
+                        );
+
+          //determine Householder vectors:
+          for (vcl_size_t k = 0; k < effective_block_size; ++k)
+          {
+            betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
+
+            for (vcl_size_t l = k; l < effective_block_size; ++l)
+              detail::householder_reflect_ublas(ublasA, v, matrix_1x1, betas[j+k], j+k, j+l);
+
+            detail::write_householder_to_A_ublas(ublasA, v, j+k);
+          }
+
+          //
+          // Setup Y:
+          //
+          ublasY.clear();  ublasY.resize(A.size1(), block_size);
+          for (vcl_size_t k = 0; k < effective_block_size; ++k)
+          {
+            //write Householder to Y:
+            ublasY(j+k,k) = 1.0;
+            boost::numeric::ublas::project(ublasY,
+                                           boost::numeric::ublas::range(j+k+1, A.size1()),
+                                           boost::numeric::ublas::range(k, k+1))
+              = boost::numeric::ublas::project(ublasA,
+                                               boost::numeric::ublas::range(j+k+1, A.size1()),
+                                               boost::numeric::ublas::range(j+k, j+k+1));
+          }
+
+          //
+          // Setup W:
+          //
+
+          //first vector:
+          ublasW.clear();  ublasW.resize(A.size1(), block_size);
+          ublasW(j, 0) = -betas[j];
+          boost::numeric::ublas::project(ublasW,
+                                        boost::numeric::ublas::range(j+1, A.size1()),
+                                        boost::numeric::ublas::range(0, 1))
+            = -betas[j] * boost::numeric::ublas::project(ublasA,
+                                                          boost::numeric::ublas::range(j+1, A.size1()),
+                                                          boost::numeric::ublas::range(j, j+1));
+
+
+          //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+          for (vcl_size_t k = 1; k < effective_block_size; ++k)
           {
-            ScalarType val = 0;
-            for (std::size_t k=0; k<A.size1(); ++k)
-              val += A(k, i) * B(k, j);
-            C(i, j) = val;
+            UblasMatrixRange Y_old = boost::numeric::ublas::project(ublasY,
+                                                                    boost::numeric::ublas::range(j, A.size1()),
+                                                                    boost::numeric::ublas::range(0, k));
+            UblasMatrixRange v_k   = boost::numeric::ublas::project(ublasY,
+                                                                    boost::numeric::ublas::range(j, A.size1()),
+                                                                    boost::numeric::ublas::range(k, k+1));
+            UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW,
+                                                                    boost::numeric::ublas::range(j, A.size1()),
+                                                                    boost::numeric::ublas::range(0, k));
+            UblasMatrixRange z     = boost::numeric::ublas::project(ublasW,
+                                                                    boost::numeric::ublas::range(j, A.size1()),
+                                                                    boost::numeric::ublas::range(k, k+1));
+
+            UblasMatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
+            z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
+          }
+
+
+
+          //
+          //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+          //
+
+          VCLMatrixRange A_part = viennacl::project(A,
+                                                    viennacl::range(0, A.size1()),
+                                                    viennacl::range(j, j+effective_block_size));
+
+          viennacl::copy(boost::numeric::ublas::project(ublasA,
+                                                        boost::numeric::ublas::range(0, A.size1()),
+                                                        boost::numeric::ublas::range(j, j+effective_block_size)),
+                        A_part);
+
+          viennacl::copy(ublasW, vclW);
+          viennacl::copy(ublasY, vclY);
+
+          if (A.size2() > j + effective_block_size)
+          {
+
+            VCLMatrixRange A_part(A, viennacl::range(j, A.size1()), viennacl::range(j+effective_block_size, A.size2()));
+            VCLMatrixRange W_part(vclW, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size));
+            MatrixType temp = viennacl::linalg::prod(trans(W_part), A_part);
+
+            A_part += viennacl::linalg::prod(viennacl::project(vclY, viennacl::range(j, A.size1()), viennacl::range(0, effective_block_size)),
+                                             temp);
           }
         }
+
+        return betas;
       }
-      
+
+
 
     } //namespace detail
-        
+
+
 
 
     //takes an inplace QR matrix A and generates Q and R explicitly
@@ -441,7 +564,7 @@ namespace viennacl
     void recoverQ(MatrixType const & A, VectorType const & betas, MatrixType & Q, MatrixType & R)
     {
       typedef typename MatrixType::value_type   ScalarType;
-      
+
       std::vector<ScalarType> v(A.size1());
 
       Q.clear();
@@ -450,30 +573,24 @@ namespace viennacl
       //
       // Recover R from upper-triangular part of A:
       //
-      std::size_t i_max = std::min(R.size1(), R.size2());
-      for (std::size_t i=0; i<i_max; ++i)
-        for (std::size_t j=i; j<R.size2(); ++j)
+      vcl_size_t i_max = std::min(R.size1(), R.size2());
+      for (vcl_size_t i=0; i<i_max; ++i)
+        for (vcl_size_t j=i; j<R.size2(); ++j)
           R(i,j) = A(i,j);
-      
+
       //
       // Recover Q by applying all the Householder reflectors to the identity matrix:
       //
-      for (std::size_t i=0; i<Q.size1(); ++i)
+      for (vcl_size_t i=0; i<Q.size1(); ++i)
         Q(i,i) = 1.0;
 
-      std::size_t j_max = std::min(A.size1(), A.size2());
-      for (std::size_t j=0; j<j_max; ++j)
+      vcl_size_t j_max = std::min(A.size1(), A.size2());
+      for (vcl_size_t j=0; j<j_max; ++j)
       {
-        std::size_t col_index = j_max - j - 1;
+        vcl_size_t col_index = j_max - j - 1;
         v[col_index] = 1.0;
-        for (std::size_t i=col_index+1; i<A.size1(); ++i)
+        for (vcl_size_t i=col_index+1; i<A.size1(); ++i)
           v[i] = A(i, col_index);
-        
-        /*std::cout << "Recovery with beta = " << betas[col_index] << ", j=" << col_index << std::endl;
-        std::cout << "v: ";
-        for (size_t i=0; i<v.size(); ++i)
-          std::cout << v[i] << ", ";
-        std::cout << std::endl;*/
 
         if (betas[col_index] != 0)
           detail::householder_reflect(Q, v, betas[col_index], col_index);
@@ -481,379 +598,70 @@ namespace viennacl
     }
 
 
-    /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
-     * 
-     * @param A            A dense compatible to Boost.uBLAS
-     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+    /** @brief Computes Q^T b, where Q is an implicit orthogonal matrix defined via its Householder reflectors stored in A.
+     *
+     *  @param A      A matrix holding the Householder reflectors in the lower triangular part. Typically obtained from calling inplace_qr() on the original matrix
+     *  @param betas  The scalars beta_i for each Householder reflector (I - beta_i v_i v_i^T)
+     *  @param b      The vector b to which the result Q^T b is directly written to
      */
-    template<typename MatrixType>
-    std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, std::size_t block_size = 32)
+    template <typename MatrixType, typename VectorType1, typename VectorType2>
+    void inplace_qr_apply_trans_Q(MatrixType const & A, VectorType1 const & betas, VectorType2 & b)
     {
-      typedef typename MatrixType::value_type   ScalarType;
-      typedef boost::numeric::ublas::matrix_range<MatrixType>  MatrixRange;
-      
-      using boost::numeric::ublas::range;
-      using boost::numeric::ublas::project;
-      
-      std::vector<ScalarType> betas(A.size2());
-      //boost::numeric::ublas::vector<ScalarType> v(A.size1());
-      MatrixType v(A.size1(), 1);
-      MatrixType matrix_1x1(1,1);
-
-      MatrixType Y(A.size1(), block_size); Y.clear(); Y.resize(A.size1(), block_size);
-      MatrixType W(A.size1(), block_size); W.clear(); W.resize(A.size1(), block_size);
-        
-      //run over A in a block-wise manner:
-      for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-      {
-        //determine Householder vectors:
-        for (std::size_t k = 0; k < block_size; ++k)
-        {
-          betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
-          
-          for (std::size_t l = k; l < block_size; ++l)
-            detail::householder_reflect_ublas(A, v, matrix_1x1, betas[j+k], j+k, j+l);
+      typedef typename viennacl::result_of::cpu_value_type<typename MatrixType::value_type>::type   ScalarType;
 
-          detail::write_householder_to_A_ublas(A, v, j+k);
-        }
-
-        //
-        // Setup Y:
-        //
-        Y.clear();  Y.resize(A.size1(), block_size);
-        for (std::size_t k = 0; k < block_size; ++k)
-        {
-          //write Householder to Y:
-          Y(j+k,k) = 1.0;
-          project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
-        }
-        
-        //
-        // Setup W:
-        //
-        
-        //first vector:
-        W.clear();  W.resize(A.size1(), block_size);
-        W(j, 0) = -betas[j];
-        project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
-        
-        
-        //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-        for (std::size_t k = 1; k < block_size; ++k)
-        {
-          MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
-          MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
-          MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
-          MatrixRange z     = project(W, range(j, A.size1()), range(k, k+1));
-          
-          MatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
-          z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
-        }
-
-        //
-        //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-        //
-        
-        if (A.size2() - j - block_size > 0)
-        {
-          
-          MatrixRange A_part(A, range(j, A.size1()), range(j+block_size, A.size2()));
-          MatrixRange W_part(W, range(j, A.size1()), range(0, block_size));
-          MatrixType temp = boost::numeric::ublas::prod(trans(W_part), A_part);
-          
-          A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
-                         temp);
-        }
-      }
-
-      return betas;
-    }
-
-
-    /** @brief Implementation of a OpenCL-only QR factorization for GPUs (or multi-core CPU) 
-     * 
-     * Performance is rather poor at small matrix sizes.
-     * Prefer the use of the hybrid version, which is automatically chosen using the interface function inplace_qr()
-     * 
-     * @param A            A dense ViennaCL matrix to be factored
-     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
-     */
-    template<typename MatrixType>
-    std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
-    inplace_qr_viennacl(MatrixType & A, std::size_t block_size = 16)
-    {
-      typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
-      typedef viennacl::matrix_range<MatrixType>  MatrixRange;
-      
-      //using boost::numeric::ublas::range;
-      //using boost::numeric::ublas::project;
-      using viennacl::range;
-      using viennacl::project;
-      
-      std::vector<ScalarType> betas(A.size2());
-      //boost::numeric::ublas::vector<ScalarType> v(A.size1());
-      MatrixType v(A.size1(), 1);
-      MatrixType matrix_1x1(1,1);
-
-      MatrixType Y(A.size1(), block_size); Y.clear();
-      MatrixType W(A.size1(), block_size); W.clear();
-
-      MatrixType YT_prod_v(block_size, 1);
-      MatrixType z(A.size1(), 1);      
-      
-      //run over A in a block-wise manner:
-      for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+      //
+      // Apply Q^T = (I - beta_m v_m v_m^T) \times ... \times (I - beta_0 v_0 v_0^T) by applying all the Householder reflectors to b:
+      //
+      for (vcl_size_t col_index=0; col_index<std::min(A.size1(), A.size2()); ++col_index)
       {
-        
-        //determine Householder vectors:
-        for (std::size_t k = 0; k < block_size; ++k)
-        {
-          betas[j+k] = detail::setup_householder_vector_viennacl(A, v, matrix_1x1, j+k);
-          for (std::size_t l = k; l < block_size; ++l)
-            detail::householder_reflect_viennacl(A, v, matrix_1x1, betas[j+k], j+k, j+l);
-
-          detail::write_householder_to_A_viennacl(A, v, j+k);
-        }
-
-        //
-        // Setup Y:
-        //
-        Y.clear();
-        for (std::size_t k = 0; k < block_size; ++k)
-        {
-          //write Householder to Y:
-          Y(j+k,k) = 1.0;
-          project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
-        }
-        
-        //
-        // Setup W:
-        //
-        
-        //first vector:
-        W.clear();
-        W(j, 0) = -betas[j];
-        //project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
-        project(W, range(j+1, A.size1()), range(0, 1)) = project(A, range(j+1, A.size1()), range(j, j+1));
-        project(W, range(j+1, A.size1()), range(0, 1)) *= -betas[j];
-        
-        
-        //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-        for (std::size_t k = 1; k < block_size; ++k)
-        {
-          MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
-          MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
-          MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
-          //MatrixRange z     = project(W, range(0, A.size1()), range(k, k+1));
-         
-          //std::cout << "should: " << k << std::endl;
-          project(YT_prod_v, range(0, k), range(0,1)) = prod(trans(Y_old), v_k);
-          project(z, range(j, A.size1()), range(0,1)) = prod(W_old, project(YT_prod_v, range(0, k), range(0,1)));
-          //project(W, range(0, A.size1()), range(k, k+1)) = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
-          project(W, range(j, A.size1()), range(k, k+1)) = project(z, range(j, A.size1()), range(0,1));
-          project(W, range(j, A.size1()), range(k, k+1)) += v_k;
-          project(W, range(j, A.size1()), range(k, k+1)) *= - betas[j+k];
-        }
+        ScalarType v_in_b = b[col_index];
+        for (vcl_size_t i=col_index+1; i<A.size1(); ++i)
+          v_in_b += A(i, col_index) * b[i];
 
-        //
-        //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-        //
-        
-        if (A.size2() - j - block_size > 0)
-        {
-          
-          MatrixRange A_part(A, range(j, A.size1()), range(j+block_size, A.size2()));
-          MatrixRange W_part(W, range(j, A.size1()), range(0, block_size));
-          MatrixType temp = prod(trans(W_part), A_part);
-          
-          A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
-                         temp);
-        }
+        b[col_index] -= betas[col_index] * v_in_b;
+        for (vcl_size_t i=col_index+1; i<A.size1(); ++i)
+          b[i] -= betas[col_index] * A(i, col_index) * v_in_b;
       }
-
-      return betas;
     }
 
-
-
-
-
-
-    //MatrixType is ViennaCL-matrix
-    /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU) 
-     * 
-     * Prefer the use of the convenience interface inplace_qr()
-     * 
-     * @param A            A dense ViennaCL matrix to be factored
-     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
-     */
-    template<typename MatrixType>
-    std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
-    inplace_qr_hybrid(MatrixType & A, std::size_t block_size = 16)
+    template <typename T, typename F, unsigned int ALIGNMENT, typename VectorType1, unsigned int A2>
+    void inplace_qr_apply_trans_Q(viennacl::matrix<T, F, ALIGNMENT> const & A, VectorType1 const & betas, viennacl::vector<T, A2> & b)
     {
-      typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
-
-      typedef viennacl::matrix_range<MatrixType>                    VCLMatrixRange;
-      typedef boost::numeric::ublas::matrix<ScalarType>             UblasMatrixType;
-      typedef boost::numeric::ublas::matrix_range<UblasMatrixType>  UblasMatrixRange;
-      
-      //using boost::numeric::ublas::range;
-      //using boost::numeric::ublas::project;
-      
-      std::vector<ScalarType> betas(A.size2());
-      UblasMatrixType v(A.size1(), 1);
-      UblasMatrixType matrix_1x1(1,1);
-
-      UblasMatrixType ublasW(A.size1(), block_size); ublasW.clear(); ublasW.resize(A.size1(), block_size);
-      UblasMatrixType ublasY(A.size1(), block_size); ublasY.clear(); ublasY.resize(A.size1(), block_size);
-      
-      UblasMatrixType ublasA(A.size1(), A.size1());
-      
-      MatrixType vclW(ublasW.size1(), ublasW.size2());
-      MatrixType vclY(ublasY.size1(), ublasY.size2());
-      
-        
-      //run over A in a block-wise manner:
-      for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-      {
-        UblasMatrixRange ublasA_part = boost::numeric::ublas::project(ublasA,
-                                                                      boost::numeric::ublas::range(0, A.size1()),
-                                                                      boost::numeric::ublas::range(j, j+block_size));
-        viennacl::copy(viennacl::project(A,
-                                         viennacl::range(0, A.size1()),
-                                         viennacl::range(j, j+block_size)),
-                       ublasA_part
-                      );
-        
-        //determine Householder vectors:
-        for (std::size_t k = 0; k < block_size; ++k)
-        {
-          betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
-          
-          for (std::size_t l = k; l < block_size; ++l)
-            detail::householder_reflect_ublas(ublasA, v, matrix_1x1, betas[j+k], j+k, j+l);
+      boost::numeric::ublas::matrix<T> ublas_A(A.size1(), A.size2());
+      viennacl::copy(A, ublas_A);
 
-          detail::write_householder_to_A_ublas(ublasA, v, j+k);
-        }
+      std::vector<T> stl_b(b.size());
+      viennacl::copy(b, stl_b);
 
-        //
-        // Setup Y:
-        //
-        ublasY.clear();  ublasY.resize(A.size1(), block_size);
-        for (std::size_t k = 0; k < block_size; ++k)
-        {
-          //write Householder to Y:
-          ublasY(j+k,k) = 1.0;
-          boost::numeric::ublas::project(ublasY, 
-                                         boost::numeric::ublas::range(j+k+1, A.size1()), 
-                                         boost::numeric::ublas::range(k, k+1)) 
-            = boost::numeric::ublas::project(ublasA, 
-                                             boost::numeric::ublas::range(j+k+1, A.size1()),
-                                             boost::numeric::ublas::range(j+k, j+k+1));
-        }
-        
-        //
-        // Setup W:
-        //
-        
-        //first vector:
-        ublasW.clear();  ublasW.resize(A.size1(), block_size);
-        ublasW(j, 0) = -betas[j];
-        boost::numeric::ublas::project(ublasW, 
-                                       boost::numeric::ublas::range(j+1, A.size1()), 
-                                       boost::numeric::ublas::range(0, 1)) 
-           = -betas[j] * boost::numeric::ublas::project(ublasA, 
-                                                        boost::numeric::ublas::range(j+1, A.size1()), 
-                                                        boost::numeric::ublas::range(j, j+1));
-        
-        
-        //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-        for (std::size_t k = 1; k < block_size; ++k)
-        {
-          UblasMatrixRange Y_old = boost::numeric::ublas::project(ublasY,
-                                                                  boost::numeric::ublas::range(j, A.size1()),
-                                                                  boost::numeric::ublas::range(0, k));
-          UblasMatrixRange v_k   = boost::numeric::ublas::project(ublasY,
-                                                                  boost::numeric::ublas::range(j, A.size1()),
-                                                                  boost::numeric::ublas::range(k, k+1));
-          UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW, 
-                                                                  boost::numeric::ublas::range(j, A.size1()), 
-                                                                  boost::numeric::ublas::range(0, k));
-          UblasMatrixRange z     = boost::numeric::ublas::project(ublasW, 
-                                                                  boost::numeric::ublas::range(j, A.size1()), 
-                                                                  boost::numeric::ublas::range(k, k+1));
-          
-          UblasMatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
-          z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
-        }
-        
-        
-
-        //
-        //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-        //
-        
-        VCLMatrixRange A_part = viennacl::project(A,
-                                                  viennacl::range(0, A.size1()),
-                                                  viennacl::range(j, j+block_size));
-        
-        viennacl::copy(boost::numeric::ublas::project(ublasA,
-                                                      boost::numeric::ublas::range(0, A.size1()),
-                                                      boost::numeric::ublas::range(j, j+block_size)),
-                       A_part);
-        
-        viennacl::copy(ublasW, vclW);
-        viennacl::copy(ublasY, vclY);
-        
-        if (A.size2() - j - block_size > 0)
-        {
-          
-          VCLMatrixRange A_part(A, range(j, A.size1()), range(j+block_size, A.size2()));
-          VCLMatrixRange W_part(vclW, range(j, A.size1()), range(0, block_size));
-          MatrixType temp = viennacl::linalg::prod(trans(W_part), A_part);
-          
-          A_part += prod(viennacl::project(vclY, 
-                                           viennacl::range(j, A.size1()), 
-                                           viennacl::range(0, vclY.size2())),
-                         temp);
-        }
-      }
+      inplace_qr_apply_trans_Q(ublas_A, betas, stl_b);
 
-      return betas;
+      viennacl::copy(stl_b, b);
     }
 
-
-
-    /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A 
-     * 
+    /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A
+     *
      * @param A            A dense ViennaCL matrix to be factored
-     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     * @param block_size   The block size to be used.
      */
     template<typename T, typename F, unsigned int ALIGNMENT>
-    std::vector<T> inplace_qr(viennacl::matrix<T, F, ALIGNMENT> & A, std::size_t block_size = 16)
+    std::vector<T> inplace_qr(viennacl::matrix<T, F, ALIGNMENT> & A, vcl_size_t block_size = 16)
     {
-      if (A.size2() % block_size != 0)
-        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
-      
-      return inplace_qr_hybrid(A, block_size);
+      return detail::inplace_qr_hybrid(A, block_size);
     }
 
-    /** @brief Overload of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
-     * 
+    /** @brief Overload of inplace-QR factorization for a general Boost.uBLAS compatible matrix A
+     *
      * @param A            A dense compatible to Boost.uBLAS
-     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     * @param block_size   The block size to be used.
      */
     template<typename MatrixType>
-    std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, std::size_t block_size = 16)
+    std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, vcl_size_t block_size = 16)
     {
-      if (A.size2() % block_size != 0)
-        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
-      
-      return inplace_qr_ublas(A, block_size);
+      return detail::inplace_qr_ublas(A, block_size);
     }
 
 
-        
+
   } //linalg
 } //viennacl
 
diff --git a/viennacl/linalg/row_scaling.hpp b/viennacl/linalg/row_scaling.hpp
index 45d045d..8795fb8 100644
--- a/viennacl/linalg/row_scaling.hpp
+++ b/viennacl/linalg/row_scaling.hpp
@@ -2,22 +2,23 @@
 #define VIENNACL_LINALG_ROW_SCALING_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file row_scaling.hpp
+/** @file viennacl/linalg/row_scaling.hpp
     @brief A row normalization preconditioner is implemented here
 */
 
@@ -34,9 +35,8 @@ namespace viennacl
 {
   namespace linalg
   {
-    
-    /** @brief A tag for a row preconditioner
-    */
+
+    /** @brief A tag for a row scaling preconditioner which merely normalizes the equation system such that each row of the system matrix has unit norm. */
     class row_scaling_tag
     {
       public:
@@ -45,176 +45,151 @@ namespace viennacl
         * @param p   Integer selecting the desired row norm.
         */
         row_scaling_tag(unsigned int p = 2) : norm_(p) {}
-        
-        /** @brief Returns the index p of the l^p-norm (1... sum(abs(x)), 2... sqrt(sum(x_i^2))). Currently only p=1 and p=2 supported*/
+
+        /** @brief Returns the index p of the l^p-norm (0 ... ||x||_sup, 1... sum(abs(x)), 2... sqrt(sum(x_i^2))). Currently only p=0, p=1, and p=2 supported.*/
         unsigned int norm() const { return norm_; }
-        
+
       private:
         unsigned int norm_;
     };
-    
 
-    /** @brief Jacobi preconditioner class, can be supplied to solve()-routines
-    */
-    template <typename MatrixType>
+
+    /** \cond */
+    namespace detail
+    {
+      template <typename T>
+      struct row_scaling_for_viennacl
+      {
+        enum { value = false };
+      };
+
+      template <typename ScalarType, unsigned int ALIGNMENT>
+      struct row_scaling_for_viennacl< viennacl::compressed_matrix<ScalarType, ALIGNMENT> >
+      {
+        enum { value = true };
+      };
+
+      template <typename ScalarType, unsigned int ALIGNMENT>
+      struct row_scaling_for_viennacl< viennacl::coordinate_matrix<ScalarType, ALIGNMENT> >
+      {
+        enum { value = true };
+      };
+    }
+    /** \endcond */
+
+
+    /** @brief Jacobi-type preconditioner class, can be supplied to solve()-routines. This is a diagonal preconditioner with the diagonal entries being (configurable) row norms of the matrix.
+     *
+     *  Default implementation for non-native ViennaCL matrices (e.g. uBLAS)
+     */
+    template <typename MatrixType,
+              bool is_viennacl = detail::row_scaling_for_viennacl<MatrixType>::value >
     class row_scaling
     {
       typedef typename MatrixType::value_type      ScalarType;
-      
+
       public:
         /** @brief Constructor for the preconditioner
         *
         * @param mat   The system matrix
         * @param tag   A row scaling tag holding the desired norm.
         */
-        row_scaling(MatrixType const & mat, row_scaling_tag const & tag) : system_matrix(mat), tag_(tag)
+        row_scaling(MatrixType const & mat, row_scaling_tag const & tag) : diag_M(viennacl::traits::size1(mat))
         {
-          assert(mat.size1() == mat.size2());
-          diag_M_inv.resize(mat.size1());  //resize without preserving values
-          
-          for (typename MatrixType::const_iterator1 row_it = system_matrix.begin1();
-                row_it != system_matrix.end1();
+          assert(mat.size1() == mat.size2() && bool("Size mismatch"));
+          init(mat, tag);
+        }
+
+        void init(MatrixType const & mat, row_scaling_tag const & tag)
+        {
+          diag_M.resize(mat.size1());  //resize without preserving values
+
+          for (typename MatrixType::const_iterator1 row_it = mat.begin1();
+                row_it != mat.end1();
                 ++row_it)
           {
-            diag_M_inv[row_it.index1()];
             for (typename MatrixType::const_iterator2 col_it = row_it.begin();
                   col_it != row_it.end();
                   ++col_it)
             {
-              if (tag_.norm() == 1)
-                diag_M_inv[col_it.index1()] += std::fabs(*col_it);
-              else
-                diag_M_inv[col_it.index1()] += (*col_it) * (*col_it);
+              if (tag.norm() == 0)
+                diag_M[col_it.index1()] = std::max<ScalarType>(diag_M[col_it.index1()], std::fabs(*col_it));
+              else if (tag.norm() == 1)
+                diag_M[col_it.index1()] += std::fabs(*col_it);
+              else if (tag.norm() == 2)
+                diag_M[col_it.index1()] += (*col_it) * (*col_it);
             }
-            if (diag_M_inv[row_it.index1()] == 0)
+            if (diag_M[row_it.index1()] == 0)
               throw "ViennaCL: Zero row encountered while setting up row scaling preconditioner!";
-            
-            if (tag_.norm() == 1)
-              diag_M_inv[row_it.index1()] = static_cast<ScalarType>(1.0) / diag_M_inv[row_it.index1()];
-            else
-              diag_M_inv[row_it.index1()] = static_cast<ScalarType>(1.0) / std::sqrt(diag_M_inv[row_it.index1()]);
+
+            if (tag.norm() == 2)
+              diag_M[row_it.index1()] = std::sqrt(diag_M[row_it.index1()]);
           }
         }
-        
-        
+
+
         /** @brief Apply to res = b - Ax, i.e. row applied vec (right hand side),  */
         template <typename VectorType>
         void apply(VectorType & vec) const
         {
-          assert(vec.size() == diag_M_inv.size());
-          for (size_t i=0; i<vec.size(); ++i)
-          {
-            vec[i] *= diag_M_inv[i];
-          }
+          assert(vec.size() == diag_M.size() && bool("Size mismatch"));
+          for (vcl_size_t i=0; i<vec.size(); ++i)
+            vec[i] /= diag_M[i];
         }
-        
+
       private:
-        MatrixType const & system_matrix;
-        row_scaling_tag const & tag_;
-        std::vector<ScalarType> diag_M_inv;
+        std::vector<ScalarType> diag_M;
     };
 
-    
+
     /** @brief Jacobi preconditioner class, can be supplied to solve()-routines.
     *
     *  Specialization for compressed_matrix
     */
-    template <typename ScalarType, unsigned int MAT_ALIGNMENT>
-    class row_scaling< compressed_matrix<ScalarType, MAT_ALIGNMENT> >
+    template <typename MatrixType>
+    class row_scaling< MatrixType, true>
     {
-      typedef compressed_matrix<ScalarType, MAT_ALIGNMENT>   MatrixType;
-      
+        typedef typename viennacl::result_of::cpu_value_type<typename MatrixType::value_type>::type  ScalarType;
+
+
       public:
         /** @brief Constructor for the preconditioner
         *
         * @param mat   The system matrix
         * @param tag   A row scaling tag holding the desired norm.
         */
-        row_scaling(MatrixType const & mat, row_scaling_tag const & tag) : system_matrix(mat), tag_(tag), diag_M_inv(mat.size1())
+        row_scaling(MatrixType const & mat, row_scaling_tag const & tag) : diag_M(mat.size1(), viennacl::traits::context(mat))
         {
-          assert(system_matrix.size1() == system_matrix.size2());
-          
-          init_gpu();
+          init(mat, tag);
         }
-        
-        /*
-        void init_cpu()
-        {
-          std::vector< std::map<unsigned int, ScalarType> > cpu_check;
-          std::vector<ScalarType> diag_M_inv_cpu(system_matrix.size1());
-          
-          copy(system_matrix, cpu_check);
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> cpu_check_adapter(cpu_check);
-          
-          for (typename viennacl::tools::const_sparse_matrix_adapter<ScalarType>::const_iterator1 row_it = cpu_check_adapter.begin1();
-                row_it != cpu_check_adapter.end1();
-                ++row_it)
-          {
-            diag_M_inv_cpu[row_it.index1()] = 0;
-            for (typename viennacl::tools::const_sparse_matrix_adapter<ScalarType>::const_iterator2 col_it = row_it.begin();
-                  col_it != row_it.end();
-                  ++col_it)
-            {
-              if (tag_.norm() == 1)
-                diag_M_inv_cpu[col_it.index1()] += std::fabs(*col_it);
-              else
-                diag_M_inv_cpu[col_it.index1()] += (*col_it) * (*col_it);
-            }
-            if (diag_M_inv_cpu[row_it.index1()] == 0)
-              throw "ViennaCL: Zero row encountered while setting up row scaling preconditioner!";
-            
-            if (tag_.norm() == 1)
-              diag_M_inv_cpu[row_it.index1()] = static_cast<ScalarType>(1.0) / diag_M_inv_cpu[row_it.index1()];
-            else
-              diag_M_inv_cpu[row_it.index1()] = static_cast<ScalarType>(1.0) / std::sqrt(diag_M_inv_cpu[row_it.index1()]);
-          }
-          
-          diag_M_inv.resize(system_matrix.size1(), false);
-          viennacl::fast_copy(diag_M_inv_cpu, diag_M_inv);
-        } */
-        
-        void init_gpu()
-        {
-          if (tag_.norm() == 1)
-          {
-            viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(
-                                                viennacl::linalg::kernels::compressed_matrix<ScalarType, MAT_ALIGNMENT>::program_name(),
-                                                "row_scaling_1");
 
-            viennacl::ocl::enqueue( k(system_matrix.handle1(), system_matrix.handle2(), system_matrix.handle(), 
-                                      diag_M_inv, static_cast<cl_uint>(diag_M_inv.size())) );        
-          }
-          else
+        void init(MatrixType const & mat, row_scaling_tag const & tag)
+        {
+          switch (tag.norm())
           {
-            viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(
-                                                viennacl::linalg::kernels::compressed_matrix<ScalarType, MAT_ALIGNMENT>::program_name(),
-                                                "row_scaling_2");
-
-            viennacl::ocl::enqueue( k(system_matrix.handle1(), system_matrix.handle2(), system_matrix.handle(), 
-                                      diag_M_inv, static_cast<cl_uint>(diag_M_inv.size())) );        
+            case 0:
+              detail::row_info(mat, diag_M, detail::SPARSE_ROW_NORM_INF);
+              break;
+            case 1:
+              detail::row_info(mat, diag_M, detail::SPARSE_ROW_NORM_1);
+              break;
+            case 2:
+              detail::row_info(mat, diag_M, detail::SPARSE_ROW_NORM_2);
+              break;
+            default:
+              throw "Unknown norm!";
           }
         }
-        
+
         template <unsigned int ALIGNMENT>
         void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
         {
-          assert(viennacl::traits::size1(system_matrix) == viennacl::traits::size(vec));
-          
-          //run kernel:
-          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
-                                                                "diag_precond");
-
-          viennacl::ocl::enqueue(
-             k(viennacl::traits::handle(diag_M_inv), cl_uint(viennacl::traits::start(diag_M_inv)), cl_uint(viennacl::traits::size(diag_M_inv)),
-               viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)) )
-                                );        
-          
+          assert(viennacl::traits::size(diag_M) == viennacl::traits::size(vec) && bool("Size mismatch"));
+          vec = element_div(vec, diag_M);
         }
-        
+
       private:
-        MatrixType const & system_matrix;
-        row_scaling_tag const & tag_;
-        viennacl::vector<ScalarType> diag_M_inv;
+        viennacl::vector<ScalarType> diag_M;
     };
 
   }
diff --git a/viennacl/linalg/scalar_operations.hpp b/viennacl/linalg/scalar_operations.hpp
new file mode 100644
index 0000000..d675ced
--- /dev/null
+++ b/viennacl/linalg/scalar_operations.hpp
@@ -0,0 +1,242 @@
+#ifndef VIENNACL_LINALG_SCALAR_OPERATIONS_HPP
+#define VIENNACL_LINALG_SCALAR_OPERATIONS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/scalar_operations.hpp
+    @brief Implementations of scalar operations.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/scalar_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/scalar_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/scalar_operations.hpp"
+#endif
+
+
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    /** @brief Interface for the generic operation s1 = s2 @ alpha, where s1 and s2 are GPU scalars, @ denotes multiplication or division, and alpha is either a GPU or a CPU scalar
+     *
+     * @param s1                The first  (GPU) scalar
+     * @param s2                The second (GPU) scalar
+     * @param alpha             The scalar alpha in the operation
+     * @param len_alpha         If alpha is obtained from summing over a small GPU vector (e.g. the final summation after a multi-group reduction), then supply the length of the array here
+     * @param reciprocal_alpha  If true, then s2 / alpha instead of s2 * alpha is computed
+     * @param flip_sign_alpha   If true, then (-alpha) is used instead of alpha
+     */
+    template <typename S1,
+              typename S2, typename ScalarType1>
+    typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                  && viennacl::is_scalar<S2>::value
+                                  && viennacl::is_any_scalar<ScalarType1>::value
+                                >::type
+    as(S1 & s1,
+       S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+    {
+      switch (viennacl::traits::handle(s1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::as(s1, s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::as(s1, s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::as(s1, s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    /** @brief Interface for the generic operation s1 = s2 @ alpha + s3 @ beta, where s1, s2 and s3 are GPU scalars, @ denotes multiplication or division, and alpha, beta are either a GPU or a CPU scalar
+     *
+     * @param s1                The first  (GPU) scalar
+     * @param s2                The second (GPU) scalar
+     * @param alpha             The scalar alpha in the operation
+     * @param len_alpha         If alpha is a small GPU vector, which needs to be summed in order to obtain the final scalar, then supply the length of the array here
+     * @param reciprocal_alpha  If true, then s2 / alpha instead of s2 * alpha is computed
+     * @param flip_sign_alpha   If true, then (-alpha) is used instead of alpha
+     * @param s3                The third (GPU) scalar
+     * @param beta              The scalar beta in the operation
+     * @param len_beta          If beta is obtained from summing over a small GPU vector (e.g. the final summation after a multi-group reduction), then supply the length of the array here
+     * @param reciprocal_beta   If true, then s2 / beta instead of s2 * beta is computed
+     * @param flip_sign_beta    If true, then (-beta) is used instead of beta
+     */
+    template <typename S1,
+              typename S2, typename ScalarType1,
+              typename S3, typename ScalarType2>
+    typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                  && viennacl::is_scalar<S2>::value
+                                  && viennacl::is_scalar<S3>::value
+                                  && viennacl::is_any_scalar<ScalarType1>::value
+                                  && viennacl::is_any_scalar<ScalarType2>::value
+                                >::type
+    asbs(S1 & s1,
+         S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+         S3 const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+    {
+      switch (viennacl::traits::handle(s1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::asbs(s1,
+                                             s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                             s3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::asbs(s1,
+                                         s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         s3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::asbs(s1,
+                                       s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                       s3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    /** @brief Interface for the generic operation s1 += s2 @ alpha + s3 @ beta, where s1, s2 and s3 are GPU scalars, @ denotes multiplication or division, and alpha, beta are either a GPU or a CPU scalar
+     *
+     * @param s1                The first  (GPU) scalar
+     * @param s2                The second (GPU) scalar
+     * @param alpha             The scalar alpha in the operation
+     * @param len_alpha         If alpha is a small GPU vector, which needs to be summed in order to obtain the final scalar, then supply the length of the array here
+     * @param reciprocal_alpha  If true, then s2 / alpha instead of s2 * alpha is computed
+     * @param flip_sign_alpha   If true, then (-alpha) is used instead of alpha
+     * @param s3                The third (GPU) scalar
+     * @param beta              The scalar beta in the operation
+     * @param len_beta          If beta is obtained from summing over a small GPU vector (e.g. the final summation after a multi-group reduction), then supply the length of the array here
+     * @param reciprocal_beta   If true, then s2 / beta instead of s2 * beta is computed
+     * @param flip_sign_beta    If true, then (-beta) is used instead of beta
+     */
+    template <typename S1,
+              typename S2, typename ScalarType1,
+              typename S3, typename ScalarType2>
+    typename viennacl::enable_if< viennacl::is_scalar<S1>::value
+                                  && viennacl::is_scalar<S2>::value
+                                  && viennacl::is_scalar<S3>::value
+                                  && viennacl::is_any_scalar<ScalarType1>::value
+                                  && viennacl::is_any_scalar<ScalarType2>::value
+                                >::type
+    asbs_s(S1 & s1,
+           S2 const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+           S3 const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+    {
+      switch (viennacl::traits::handle(s1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::asbs_s(s1,
+                                               s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                               s3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::asbs_s(s1,
+                                           s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                           s3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::asbs_s(s1,
+                                         s2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         s3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+
+    /** @brief Swaps the contents of two scalars
+    *
+    * @param s1   The first scalar
+    * @param s2   The second scalar
+    */
+    template <typename S1, typename S2>
+    typename viennacl::enable_if<    viennacl::is_scalar<S1>::value
+                                  && viennacl::is_scalar<S2>::value
+                                >::type
+    swap(S1 & s1, S2 & s2)
+    {
+      switch (viennacl::traits::handle(s1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::swap(s1, s2);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::swap(s1, s2);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::swap(s1, s2);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+  } //namespace linalg
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/spai.hpp b/viennacl/linalg/spai.hpp
index e10fcd9..e31bf79 100644
--- a/viennacl/linalg/spai.hpp
+++ b/viennacl/linalg/spai.hpp
@@ -2,24 +2,25 @@
 #define VIENNACL_LINALG_SPAI_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file viennacl/linalg/spai.hpp
-    @brief Main include file for the sparse approximate inverse preconditioner family (SPAI and FSPAI).  Experimental in 1.2.x.
-    
+    @brief Main include file for the sparse approximate inverse preconditioner family (SPAI and FSPAI).  Experimental.
+
     Most implementation contributed by Nikolay Lukash.
 */
 
@@ -33,9 +34,10 @@
 #include <math.h>
 #include <map>
 
-//local includes
+// ViennaCL includes
 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
 #include "viennacl/linalg/qr.hpp"
+#include "viennacl/linalg/prod.hpp"
 #include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
 #include "viennacl/linalg/detail/spai/spai-static.hpp"
 #include "viennacl/linalg/detail/spai/sparse_vector.hpp"
@@ -55,29 +57,16 @@
 #include "boost/numeric/ublas/triangular.hpp"
 #include "boost/numeric/ublas/matrix_expression.hpp"
 
-// ViennaCL includes
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/compressed_matrix_operations.hpp"
-#include "viennacl/linalg/matrix_operations.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/linalg/kernels/spai_source.h"
-#include "viennacl/linalg/kernels/spai_kernels.h"
-
 
 namespace viennacl
 {
     namespace linalg
     {
-        
+
         typedef viennacl::linalg::detail::spai::spai_tag         spai_tag;
         typedef viennacl::linalg::detail::spai::fspai_tag        fspai_tag;
-        
-        /** @brief Implementation of the SParse Approximate Inverse Algorithm
+
+        /** @brief Implementation of the SParse Approximate Inverse Algorithm for a generic, uBLAS-compatible matrix type.
          * @param Matrix matrix that is used for computations
          * @param Vector vector that is used for computations
          */
@@ -90,43 +79,47 @@ namespace viennacl
             typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
             /** @brief Constructor
              * @param A matrix whose approximate inverse is calculated. Must be quadratic.
-             * @param tag spai tag 
+             * @param tag spai tag
              */
             spai_precond(const MatrixType& A,
-                         const spai_tag& tag): _tag(tag){
-                
-                //VCLMatrixType vcl_Ap((unsigned int)A.size2(), (unsigned int)A.size1()), vcl_A((unsigned int)A.size1(), (unsigned int)A.size2()), 
+                         const spai_tag& tag): tag_(tag){
+
+                //VCLMatrixType vcl_Ap((unsigned int)A.size2(), (unsigned int)A.size1()), vcl_A((unsigned int)A.size1(), (unsigned int)A.size2()),
                 //vcl_At((unsigned int)A.size1(), (unsigned int)A.size2());
                 //UBLASDenseMatrixType dA = A;
                 MatrixType pA(A.size1(), A.size2());
                 MatrixType At;
                 //std::cout<<A<<std::endl;
-                if(!_tag.getIsRight()){
+                if(!tag_.getIsRight()){
                     viennacl::linalg::detail::spai::sparse_transpose(A, At);
                 }else{
                     At = A;
                 }
                 pA = At;
-                viennacl::linalg::detail::spai::initPreconditioner(pA, _spai_m);
-                viennacl::linalg::detail::spai::computeSPAI(At, _spai_m, _tag);
-                //(At, pA, _tag.getIsRight(), _tag.getIsStatic(), (ScalarType)_tag.getResidualNormThreshold(), (unsigned int)_tag.getIterationLimit(),
+                viennacl::linalg::detail::spai::initPreconditioner(pA, spai_m_);
+                viennacl::linalg::detail::spai::computeSPAI(At, spai_m_, tag_);
+                //(At, pA, tag_.getIsRight(), tag_.getIsStatic(), (ScalarType)_tag.getResidualNormThreshold(), (unsigned int)_tag.getIterationLimit(),
                  //_spai_m);
-                
+
             }
             /** @brief Application of current preconditioner, multiplication on the right-hand side vector
              * @param vec rhs vector
              */
             void apply(VectorType& vec) const {
-                vec = viennacl::linalg::prod(_spai_m, vec);
+                vec = viennacl::linalg::prod(spai_m_, vec);
             }
         private:
             // variables
-            spai_tag _tag;
+            spai_tag tag_;
             // result of SPAI
-            MatrixType _spai_m;
-        };   
-        
+            MatrixType spai_m_;
+        };
+
         //VIENNACL version
+        /** @brief Implementation of the SParse Approximate Inverse Algorithm for a ViennaCL compressed_matrix.
+         * @param Matrix matrix that is used for computations
+         * @param Vector vector that is used for computations
+         */
         template <typename ScalarType, unsigned int MAT_ALIGNMENT>
         class spai_precond< viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT> >
         {
@@ -134,24 +127,25 @@ namespace viennacl
             typedef boost::numeric::ublas::compressed_matrix<ScalarType> UBLASSparseMatrixType;
             typedef viennacl::vector<ScalarType> VectorType;
             typedef viennacl::matrix<ScalarType> VCLDenseMatrixType;
-            
+
             typedef boost::numeric::ublas::vector<ScalarType> UBLASVectorType;
         public:
-            
+
             /** @brief Constructor
              * @param A matrix whose approximate inverse is calculated. Must be quadratic.
              * @param tag spai tag
              */
             spai_precond(const MatrixType& A,
-                         const spai_tag& tag): _tag(tag)
+                         const spai_tag& tag): tag_(tag), spai_m_(viennacl::traits::context(A))
             {
-                viennacl::linalg::kernels::spai<ScalarType, 1>::init();
-              
-                MatrixType At(A.size1(), A.size2());
-                UBLASSparseMatrixType ubls_A, ubls_spai_m;
+                viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+                viennacl::linalg::opencl::kernels::spai<ScalarType>::init(ctx);
+
+                MatrixType At(A.size1(), A.size2(), viennacl::context(ctx));
+                UBLASSparseMatrixType ubls_A(A.size1(), A.size2()), ubls_spai_m;
                 UBLASSparseMatrixType ubls_At;
-                viennacl::copy(A, ubls_A);;
-                if(!_tag.getIsRight()){
+                viennacl::copy(A, ubls_A);
+                if(!tag_.getIsRight()){
                     viennacl::linalg::detail::spai::sparse_transpose(ubls_A, ubls_At);
                 }
                 else{
@@ -162,29 +156,31 @@ namespace viennacl
                 //execute SPAI with ublas matrix types
                 viennacl::linalg::detail::spai::initPreconditioner(ubls_At, ubls_spai_m);
                 viennacl::copy(ubls_At, At);
-                viennacl::linalg::detail::spai::computeSPAI(At, ubls_At, ubls_spai_m, _spai_m, _tag);
-                //viennacl::copy(ubls_spai_m, _spai_m);
-                
+                viennacl::linalg::detail::spai::computeSPAI(At, ubls_At, ubls_spai_m, spai_m_, tag_);
+                //viennacl::copy(ubls_spai_m, spai_m_);
+                tmp_.resize(A.size1(), viennacl::traits::context(A), false);
             }
             /** @brief Application of current preconditioner, multiplication on the right-hand side vector
              * @param vec rhs vector
              */
             void apply(VectorType& vec) const {
-                vec = viennacl::linalg::prod(_spai_m, vec);
+                tmp_ = viennacl::linalg::prod(spai_m_, vec);
+                vec = tmp_;
             }
         private:
             // variables
-            spai_tag _tag;
+            spai_tag tag_;
             // result of SPAI
-            MatrixType _spai_m;
+            MatrixType spai_m_;
+            mutable VectorType tmp_;
         };
-        
-        
+
+
         //
         // FSPAI
         //
-        
-        /** @brief Implementation of the Factored SParse Approximate Inverse Algorithm
+
+        /** @brief Implementation of the Factored SParse Approximate Inverse Algorithm for a generic, uBLAS-compatible matrix type.
         * @param Matrix matrix that is used for computations
         * @param Vector vector that is used for computations
         */
@@ -197,7 +193,7 @@ namespace viennacl
             typedef typename boost::numeric::ublas::matrix<ScalarType> UBLASDenseMatrixType;
             typedef typename viennacl::matrix<ScalarType> VCLMatrixType;
         public:
-            
+
             /** @brief Constructor
             * @param A matrix whose approximate inverse is calculated. Must be quadratic.
             * @param tag SPAI configuration tag
@@ -208,31 +204,35 @@ namespace viennacl
                 MatrixType pA = A;
                 viennacl::linalg::detail::spai::computeFSPAI(A, pA, L, L_trans, tag_);
             }
-            
+
             /** @brief Application of current preconditioner, multiplication on the right-hand side vector
             * @param vec rhs vector
             */
-            void apply(VectorType& vec) const 
+            void apply(VectorType& vec) const
             {
               VectorType temp = viennacl::linalg::prod(L_trans, vec);
               vec = viennacl::linalg::prod(L, temp);
             }
-            
+
         private:
             // variables
             const fspai_tag & tag_;
             // result of SPAI
             MatrixType L;
             MatrixType L_trans;
-        };   
-        
+        };
+
+
+
+
 
-        
-        
-        
         //
         // ViennaCL version
         //
+        /** @brief Implementation of the Factored SParse Approximate Inverse Algorithm for a ViennaCL compressed_matrix.
+        * @param Matrix matrix that is used for computations
+        * @param Vector vector that is used for computations
+        */
         template <typename ScalarType, unsigned int MAT_ALIGNMENT>
         class fspai_precond< viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT> >
         {
@@ -242,13 +242,14 @@ namespace viennacl
             typedef boost::numeric::ublas::compressed_matrix<ScalarType> UBLASSparseMatrixType;
             typedef boost::numeric::ublas::vector<ScalarType> UBLASVectorType;
         public:
-            
+
             /** @brief Constructor
             * @param A matrix whose approximate inverse is calculated. Must be quadratic.
             * @param tag SPAI configuration tag
             */
             fspai_precond(const MatrixType & A,
-                        const fspai_tag & tag): tag_(tag){
+                          const fspai_tag & tag) : tag_(tag), L(viennacl::traits::context(A)), L_trans(viennacl::traits::context(A)), temp_apply_vec_(A.size1(), viennacl::traits::context(A))
+            {
                 //UBLASSparseMatrixType ubls_A;
                 UBLASSparseMatrixType ublas_A(A.size1(), A.size2());
                 UBLASSparseMatrixType pA(A.size1(), A.size2());
@@ -266,26 +267,26 @@ namespace viennacl
                 viennacl::copy(ublas_L, L);
                 viennacl::copy(ublas_L_trans, L_trans);
             }
-            
-            
+
+
             /** @brief Application of current preconditioner, multiplication on the right-hand side vector
             * @param vec rhs vector
             */
-            void apply(VectorType& vec) const 
+            void apply(VectorType& vec) const
             {
-              VectorType temp(vec.size());
-              temp = viennacl::linalg::prod(L_trans, vec);
-              vec = viennacl::linalg::prod(L, temp);
+              temp_apply_vec_ = viennacl::linalg::prod(L_trans, vec);
+              vec = viennacl::linalg::prod(L, temp_apply_vec_);
             }
-            
+
         private:
             // variables
             const fspai_tag & tag_;
             MatrixType L;
             MatrixType L_trans;
+            mutable VectorType temp_apply_vec_;
         };
-        
-        
+
+
     }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/linalg/sparse_matrix_operations.hpp b/viennacl/linalg/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..8633ff2
--- /dev/null
+++ b/viennacl/linalg/sparse_matrix_operations.hpp
@@ -0,0 +1,375 @@
+#ifndef VIENNACL_LINALG_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/sparse_matrix_operations.hpp
+    @brief Implementations of operations using sparse matrices
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/sparse_matrix_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/sparse_matrix_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/sparse_matrix_operations.hpp"
+#endif
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    namespace detail
+    {
+
+      template<typename SparseMatrixType, typename SCALARTYPE, unsigned int VEC_ALIGNMENT>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value >::type
+      row_info(SparseMatrixType const & mat,
+               vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
+               row_info_types info_selector)
+      {
+        switch (viennacl::traits::handle(mat).get_active_handle_id())
+        {
+          case viennacl::MAIN_MEMORY:
+            viennacl::linalg::host_based::detail::row_info(mat, vec, info_selector);
+            break;
+#ifdef VIENNACL_WITH_OPENCL
+          case viennacl::OPENCL_MEMORY:
+            viennacl::linalg::opencl::detail::row_info(mat, vec, info_selector);
+            break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+          case viennacl::CUDA_MEMORY:
+            viennacl::linalg::cuda::detail::row_info(mat, vec, info_selector);
+            break;
+#endif
+          case viennacl::MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("not implemented");
+        }
+      }
+
+    }
+
+
+
+    // A * x
+
+    /** @brief Carries out matrix-vector multiplication involving a sparse matrix type
+    *
+    * Implementation of the convenience expression result = prod(mat, vec);
+    *
+    * @param mat    The matrix
+    * @param vec    The vector
+    * @param result The result vector
+    */
+    template<typename SparseMatrixType, class ScalarType>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+    prod_impl(const SparseMatrixType & mat,
+              const viennacl::vector_base<ScalarType> & vec,
+                    viennacl::vector_base<ScalarType> & result)
+    {
+      assert( (mat.size1() == result.size()) && bool("Size check failed for compressed matrix-vector product: size1(mat) != size(result)"));
+      assert( (mat.size2() == vec.size())    && bool("Size check failed for compressed matrix-vector product: size2(mat) != size(x)"));
+
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(mat, vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(mat, vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(mat, vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    // A * B
+    /** @brief Carries out matrix-matrix multiplication first matrix being sparse
+    *
+    * Implementation of the convenience expression result = prod(sp_mat, d_mat);
+    *
+    * @param sp_mat   The sparse matrix
+    * @param d_mat    The dense matrix
+    * @param result   The result matrix (dense)
+    */
+    template<typename SparseMatrixType, class ScalarType, typename F1, typename F2>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+    prod_impl(const SparseMatrixType & sp_mat,
+              const viennacl::matrix_base<ScalarType, F1> & d_mat,
+                    viennacl::matrix_base<ScalarType, F2> & result)
+    {
+      assert( (sp_mat.size1() == result.size1()) && bool("Size check failed for compressed matrix - dense matrix product: size1(sp_mat) != size1(result)"));
+      assert( (sp_mat.size2() == d_mat.size1()) && bool("Size check failed for compressed matrix - dense matrix product: size2(sp_mat) != size1(d_mat)"));
+
+      switch (viennacl::traits::handle(sp_mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(sp_mat, d_mat, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(sp_mat, d_mat, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(sp_mat, d_mat, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    // A * transpose(B)
+    /** @brief Carries out matrix-matrix multiplication first matrix being sparse, and the second transposed
+    *
+    * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+    *
+    * @param sp_mat   The sparse matrix
+    * @param d_mat    The dense matrix (wrapped in a transpose expression; not modified)
+    * @param result   The result matrix (dense); overwritten
+    */
+    template<typename SparseMatrixType, class ScalarType, typename F1, typename F2>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+    prod_impl(const SparseMatrixType & sp_mat,
+              const viennacl::matrix_expression<const viennacl::matrix_base<ScalarType, F1>,
+                                                const viennacl::matrix_base<ScalarType, F1>,
+                                                viennacl::op_trans>& d_mat,
+                    viennacl::matrix_base<ScalarType, F2> & result)
+    {
+      assert( (sp_mat.size1() == result.size1()) && bool("Size check failed for compressed matrix - dense matrix product: size1(sp_mat) != size1(result)"));
+      assert( (sp_mat.size2() == d_mat.size1()) && bool("Size check failed for compressed matrix - dense matrix product: size2(sp_mat) != size1(d_mat)"));
+
+      // Dispatch on the memory domain in which the sparse matrix currently resides:
+      switch (viennacl::traits::handle(sp_mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::prod_impl(sp_mat, d_mat, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(sp_mat, d_mat, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::prod_impl(sp_mat, d_mat, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+    /** @brief Carries out triangular inplace solves
+    *
+    * Overwrites vec with the solution x of the triangular system selected by tag.
+    *
+    * @param mat    The (square) sparse matrix
+    * @param vec    The right hand side vector; overwritten with the solution
+    * @param tag    The solver tag (lower_tag, unit_lower_tag, unit_upper_tag, or upper_tag)
+    */
+    template<typename SparseMatrixType, class ScalarType, typename SOLVERTAG>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+    inplace_solve(const SparseMatrixType & mat,
+                  viennacl::vector_base<ScalarType> & vec,
+                  SOLVERTAG tag)
+    {
+      assert( (mat.size1() == mat.size2()) && bool("Size check failed for triangular solve on compressed matrix: size1(mat) != size2(mat)"));
+      // Fixed diagnostic text: this is a triangular solve, not a matrix-vector product (message was copy-pasted).
+      assert( (mat.size2() == vec.size())    && bool("Size check failed for compressed matrix triangular solve: size2(mat) != size(x)"));
+
+      // Dispatch on the memory domain in which the sparse matrix currently resides:
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(mat, vec, tag);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(mat, vec, tag);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(mat, vec, tag);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    /** @brief Carries out transposed triangular inplace solves
+    *
+    * Overwrites vec with the solution of trans(A) * x = vec for the triangular part selected by tag.
+    *
+    * @param mat    The transposed matrix expression (as produced by trans(A))
+    * @param vec    The right hand side vector; overwritten with the solution
+    * @param tag    The solver tag (lower_tag, unit_lower_tag, unit_upper_tag, or upper_tag)
+    */
+    template<typename SparseMatrixType, class ScalarType, typename SOLVERTAG>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+    inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
+                  viennacl::vector_base<ScalarType> & vec,
+                  SOLVERTAG tag)
+    {
+      assert( (mat.size1() == mat.size2()) && bool("Size check failed for triangular solve on transposed compressed matrix: size1(mat) != size2(mat)"));
+      assert( (mat.size1() == vec.size())    && bool("Size check failed for transposed compressed matrix triangular solve: size1(mat) != size(x)"));
+
+      // The expression itself carries no memory handle; dispatch on the wrapped (untransposed) matrix.
+      switch (viennacl::traits::handle(mat.lhs()).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inplace_solve(mat, vec, tag);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inplace_solve(mat, vec, tag);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inplace_solve(mat, vec, tag);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+
+    namespace detail
+    {
+
+      /** @brief Carries out triangular solves restricted to diagonal blocks of a transposed sparse matrix.
+      *
+      * Dispatches to the backend-specific detail::block_inplace_solve implementations.
+      *
+      * @param mat               The transposed matrix expression (as produced by trans(A))
+      * @param block_index_array Memory handle describing the blocks (layout defined by the backend implementations — confirm against host_based/opencl/cuda detail)
+      * @param num_blocks        Number of blocks described by block_index_array
+      * @param mat_diagonal      Diagonal entries of the matrix, supplied separately to the backends
+      * @param vec               Right hand side vector; overwritten with the solution
+      * @param tag               The solver tag (lower_tag, unit_lower_tag, unit_upper_tag, or upper_tag)
+      */
+      template<typename SparseMatrixType, class ScalarType, typename SOLVERTAG>
+      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+      block_inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
+                          viennacl::backend::mem_handle const & block_index_array, vcl_size_t num_blocks,
+                          viennacl::vector_base<ScalarType> const & mat_diagonal,
+                          viennacl::vector_base<ScalarType> & vec,
+                          SOLVERTAG tag)
+      {
+        assert( (mat.size1() == mat.size2()) && bool("Size check failed for triangular solve on transposed compressed matrix: size1(mat) != size2(mat)"));
+        assert( (mat.size1() == vec.size())  && bool("Size check failed for transposed compressed matrix triangular solve: size1(mat) != size(x)"));
+
+        // Dispatch on the wrapped (untransposed) matrix, since the expression has no handle of its own:
+        switch (viennacl::traits::handle(mat.lhs()).get_active_handle_id())
+        {
+          case viennacl::MAIN_MEMORY:
+            viennacl::linalg::host_based::detail::block_inplace_solve(mat, block_index_array, num_blocks, mat_diagonal, vec, tag);
+            break;
+  #ifdef VIENNACL_WITH_OPENCL
+          case viennacl::OPENCL_MEMORY:
+            viennacl::linalg::opencl::detail::block_inplace_solve(mat, block_index_array, num_blocks, mat_diagonal, vec, tag);
+            break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+          case viennacl::CUDA_MEMORY:
+            viennacl::linalg::cuda::detail::block_inplace_solve(mat, block_index_array, num_blocks, mat_diagonal, vec, tag);
+            break;
+  #endif
+          case viennacl::MEMORY_NOT_INITIALIZED:
+            throw memory_exception("not initialised!");
+          default:
+            throw memory_exception("not implemented");
+        }
+      }
+
+
+    }
+
+
+
+  } //namespace linalg
+
+
+  /** @brief Returns an expression template class representing a transposed matrix
+  *
+  * @param mat   The sparse matrix; not copied and not modified — the expression only references it,
+  *              so 'mat' must outlive the returned expression.
+  * @return      An expression object encoding trans(mat)
+  */
+  template<typename M1>
+  typename viennacl::enable_if<viennacl::is_any_sparse_matrix<M1>::value,
+                                matrix_expression< const M1, const M1, op_trans>
+                              >::type
+  trans(const M1 & mat)
+  {
+    // Both operands of the binary expression refer to the same matrix; op_trans carries the meaning.
+    return matrix_expression< const M1, const M1, op_trans>(mat, mat);
+  }
+
+  //free functions:
+  /** @brief Implementation of the operation 'result += A * v2', where A is a sparse matrix
+  *
+  * Note: despite the operator+ syntax, 'result' is updated in place and a copy is returned by value.
+  *
+  * @param result The vector the product is accumulated into.
+  * @param proxy  An expression template proxy class holding A and v2.
+  */
+  template <typename SCALARTYPE, typename SparseMatrixType>
+  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                viennacl::vector<SCALARTYPE> >::type
+  operator+(viennacl::vector_base<SCALARTYPE> & result,
+            const viennacl::vector_expression< const SparseMatrixType, const viennacl::vector_base<SCALARTYPE>, viennacl::op_prod> & proxy)
+  {
+    assert(proxy.lhs().size1() == result.size() && bool("Dimensions for addition of sparse matrix-vector product to vector don't match!"));
+    // Compute A * v2 into a temporary first, then accumulate (avoids aliasing with 'result'):
+    vector<SCALARTYPE> temp(proxy.lhs().size1());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), temp);
+    result += temp;
+    return result;
+  }
+
+  /** @brief Implementation of the operation 'result -= A * v2', where A is a sparse matrix
+  *
+  * Note: despite the operator- syntax, 'result' is updated in place and a copy is returned by value.
+  *
+  * @param result The vector the product is subtracted from.
+  * @param proxy  An expression template proxy class holding A and v2.
+  */
+  template <typename SCALARTYPE, typename SparseMatrixType>
+  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                viennacl::vector<SCALARTYPE> >::type
+  operator-(viennacl::vector_base<SCALARTYPE> & result,
+            const viennacl::vector_expression< const SparseMatrixType, const viennacl::vector_base<SCALARTYPE>, viennacl::op_prod> & proxy)
+  {
+    assert(proxy.lhs().size1() == result.size() && bool("Dimensions for subtraction of sparse matrix-vector product from vector don't match!"));
+    // Compute A * v2 into a temporary first, then subtract (avoids aliasing with 'result'):
+    vector<SCALARTYPE> temp(proxy.lhs().size1());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), temp);
+    result -= temp;   // bug fix: was 'result += temp', which implemented addition instead of the documented subtraction
+    return result;
+  }
+
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/linalg/svd.hpp b/viennacl/linalg/svd.hpp
new file mode 100644
index 0000000..3f07411
--- /dev/null
+++ b/viennacl/linalg/svd.hpp
@@ -0,0 +1,532 @@
+#ifndef VIENNACL_LINALG_SVD_HPP
+#define VIENNACL_LINALG_SVD_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/svd.hpp
+    @brief Provides singular value decomposition using a block-based approach.  Experimental.
+
+    Contributed by Volodymyr Kysenko.
+*/
+
+
+// Note: Boost.uBLAS is required at the moment
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+
+
+#include <cmath>
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#include "viennacl/linalg/qr-method-common.hpp"
+
+namespace viennacl
+{
+  namespace linalg
+  {
+
+    namespace detail
+    {
+
+      /** @brief Applies a sequence of rotations (cosines in tmp1, sines in tmp2, as filled by svd_qr_shift)
+      *         to 'matrix' via the SVD_GIVENS_PREV_KERNEL OpenCL kernel for indices (l+1)..(k+1).
+      *
+      * @param matrix  The matrix to update (OpenCL memory)
+      * @param tmp1    Vector of rotation cosines
+      * @param tmp2    Vector of rotation sines
+      * @param n       Active problem size passed to the kernel
+      * @param l       Lower rotation index; the kernel receives l + 1
+      * @param k       Upper rotation index; the kernel receives k + 1
+      */
+      template<typename MatrixType, typename VectorType>
+      void givens_prev(MatrixType & matrix,
+                       VectorType & tmp1,
+                       VectorType & tmp2,
+                       int n,
+                       int l,
+                       int k
+                      )
+      {
+        typedef typename MatrixType::value_type                                   ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+        viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_GIVENS_PREV_KERNEL);
+
+        // One work group of 256 threads per 256-aligned chunk of rows:
+        kernel.global_work_size(0, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size1(matrix), 256));
+        kernel.local_work_size(0, 256);
+
+        viennacl::ocl::enqueue(kernel(
+                                      matrix,
+                                      tmp1,
+                                      tmp2,
+                                      static_cast<cl_uint>(n),
+                                      static_cast<cl_uint>(matrix.internal_size1()),
+                                      static_cast<cl_uint>(l + 1),
+                                      static_cast<cl_uint>(k + 1)
+                              ));
+      }
+
+
+      /** @brief Flips signs of matrix entries according to the 'signs' vector using the
+      *         SVD_INVERSE_SIGNS_KERNEL OpenCL kernel (16x16 work groups).
+      *         The exact row/column mapping of the sign factors is defined by the kernel — see
+      *         viennacl/linalg/opencl/kernels/svd.hpp.
+      *
+      * @param matrix  The matrix to update (OpenCL memory)
+      * @param signs   Vector of +/-1 factors (prepared in svd_qr_shift)
+      * @param n       Problem size passed to the kernel
+      */
+      template<typename MatrixType, typename VectorType>
+      void change_signs(MatrixType& matrix, VectorType& signs, int n)
+      {
+        typedef typename MatrixType::value_type                                   ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+        viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_INVERSE_SIGNS_KERNEL);
+
+        // 2D launch covering the matrix, padded to multiples of the 16x16 work group:
+        kernel.global_work_size(0, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size1(matrix), 16));
+        kernel.global_work_size(1, viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size2(matrix), 16));
+
+        kernel.local_work_size(0, 16);
+        kernel.local_work_size(1, 16);
+
+        viennacl::ocl::enqueue(kernel(
+                                      matrix,
+                                      signs,
+                                      static_cast<cl_uint>(n),
+                                      static_cast<cl_uint>(matrix.internal_size1())
+                              ));
+      }
+
+      /** @brief Diagonalizes a bidiagonal matrix with implicitly shifted QR iterations (Golub-Reinsch style),
+      *         accumulating the rotations into the singular-vector matrices on the device.
+      *
+      * @param vcl_u   Left singular-vector matrix; updated in place (transposed internally for the duration of the routine)
+      * @param vcl_v   Right singular-vector matrix; updated in place (transposed internally for the duration of the routine)
+      * @param q       Host vector holding the diagonal of the bidiagonal matrix; overwritten with the singular values
+      * @param e       Host vector holding the superdiagonal entries; driven to (numerical) zero
+      */
+      template<typename MatrixType, typename CPU_VectorType>
+      void svd_qr_shift(MatrixType & vcl_u,
+                        MatrixType & vcl_v,
+                        CPU_VectorType & q,
+                        CPU_VectorType & e)
+      {
+        typedef typename MatrixType::value_type                                   ScalarType;
+        typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
+
+        int n = static_cast<int>(q.size());
+        int m = static_cast<int>(vcl_u.size1());
+
+        // Work on the transposed matrices; transposed back before returning.
+        detail::transpose(vcl_u);
+        detail::transpose(vcl_v);
+
+        std::vector<CPU_ScalarType> signs_v(n, 1);
+        // Per-sweep rotation coefficients (cosines/sines) for the two Givens sequences:
+        std::vector<CPU_ScalarType> cs1(n), ss1(n), cs2(n), ss2(n);
+
+        viennacl::vector<CPU_ScalarType> tmp1(n), tmp2(n);
+
+        bool goto_test_conv = false;
+
+        // Deflate singular values from the bottom-right corner upwards:
+        for (int k = n - 1; k >= 0; k--)
+        {
+          // std::cout << "K = " << k << std::endl;
+
+          vcl_size_t iter = 0;
+          for (iter = 0; iter < detail::ITER_MAX; iter++)
+          {
+            // test for split: find the largest l such that e[l] is negligible (decoupled block)
+            int l;
+            for (l = k; l >= 0; l--)
+            {
+              goto_test_conv = false;
+              if (std::fabs(e[l]) <= detail::EPS)
+              {
+                // set it
+                goto_test_conv = true;
+                break;
+              }
+
+              if (std::fabs(q[l - 1]) <= detail::EPS)
+              {
+                // goto
+                break;
+              }
+            }
+
+            if (!goto_test_conv)
+            {
+              // Cancellation step: zero out e[l] when q[l-1] is negligible
+              CPU_ScalarType c = 0.0;
+              CPU_ScalarType s = 1.0;
+
+              //int l1 = l - 1;
+              //int l2 = k;
+
+              for (int i = l; i <= k; i++)
+              {
+                CPU_ScalarType f = s * e[i];
+                e[i] = c * e[i];
+
+                if (std::fabs(f) <= detail::EPS)
+                {
+                  //l2 = i - 1;
+                  break;
+                }
+
+                CPU_ScalarType g = q[i];
+                CPU_ScalarType h = detail::pythag(f, g);  // sqrt(f^2 + g^2) without overflow
+                q[i] = h;
+                c = g / h;
+                s = -f / h;
+
+                cs1[i] = c;
+                ss1[i] = s;
+              }
+
+              // std::cout << "Hit!" << l1 << " " << l2 << "\n";
+
+              // for(int i = l; i <= l2; i++)
+              // {
+              //   for (int j = 0; j < m; j++)
+              //   {
+              //     CPU_ScalarType y = u(j, l1);
+              //     CPU_ScalarType z = u(j, i);
+              //     u(j, l1) = y * cs1[i] + z * ss1[i];
+              //     u(j, i) = -y * ss1[i] + z * cs1[i];
+              //   }
+              // }
+            }
+
+            CPU_ScalarType z = q[k];
+
+            // Convergence for this k: the block is 1x1, fix the sign and deflate.
+            if (l == k)
+            {
+              if (z < 0)
+              {
+                q[k] = -z;
+
+                signs_v[k] *= -1;  // remember sign flip; applied to vcl_v at the end via change_signs
+              }
+
+              break;
+            }
+
+            if (iter >= detail::ITER_MAX - 1)
+              break;
+
+            // Compute the implicit Wilkinson-style shift from the trailing 2x2 block:
+            CPU_ScalarType x = q[l];
+            CPU_ScalarType y = q[k - 1];
+            CPU_ScalarType g = e[k - 1];
+            CPU_ScalarType h = e[k];
+            CPU_ScalarType f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2 * h * y);
+
+            g = detail::pythag<CPU_ScalarType>(f, 1);
+
+            if (f < 0) {
+              f = ((x - z) * (x + z) + h * (y / (f - g) - h)) / x;
+            } else {
+              f = ((x - z) * (x + z) + h * (y / (f + g) - h)) / x;
+            }
+
+            CPU_ScalarType c = 1;
+            CPU_ScalarType s = 1;
+
+            // QR sweep: chase the bulge down the bidiagonal band, recording both rotation sequences.
+            for (vcl_size_t i = l + 1; i <= static_cast<vcl_size_t>(k); i++)
+            {
+              g = e[i];
+              y = q[i];
+              h = s * g;
+              g = c * g;
+              CPU_ScalarType z = detail::pythag(f, h);
+              e[i - 1] = z;
+              c = f / z;
+              s = h / z;
+              f = x * c + g * s;
+              g = -x * s + g * c;
+              h = y * s;
+              y = y * c;
+
+              cs1[i] = c;
+              ss1[i] = s;
+
+              z = detail::pythag(f, h);
+              q[i - 1] = z;
+              c = f / z;
+              s = h / z;
+              f = c * g + s * y;
+              x = -s * g + c * y;
+
+              cs2[i] = c;
+              ss2[i] = s;
+            }
+
+            // Apply the accumulated rotations to the singular-vector matrices on the device:
+            {
+              viennacl::copy(cs1, tmp1);
+              viennacl::copy(ss1, tmp2);
+
+              givens_prev(vcl_v, tmp1, tmp2, n, l, k);
+            }
+
+            {
+              viennacl::copy(cs2, tmp1);
+              viennacl::copy(ss2, tmp2);
+
+              givens_prev(vcl_u, tmp1, tmp2, m, l, k);
+            }
+
+            e[l] = 0.0;
+            e[k] = f;
+            q[k] = x;
+          }
+
+        }
+
+
+        // Apply the recorded sign flips so that all singular values are non-negative:
+        viennacl::copy(signs_v, tmp1);
+        change_signs(vcl_v, tmp1, n);
+
+        // transpose singular matrices again
+        detail::transpose(vcl_u);
+        detail::transpose(vcl_v);
+      }
+
+
+      /*template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
+                          viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q,
+                          viennacl::vector<SCALARTYPE, ALIGNMENT> & D,
+                          vcl_size_t start)
+      {
+
+        vcl_size_t row_start = start;
+        vcl_size_t col_start = start;
+
+        if(row_start + 1 >= A.size1())
+          return false;
+
+        std::vector<SCALARTYPE> tmp(A.size1(), 0);
+
+        copy_vec(A, D, row_start, col_start, true);
+        fast_copy(D.begin(), D.begin() + (A.size1() - row_start), tmp.begin() + row_start);
+
+        detail::householder_vector(tmp, row_start);
+
+        fast_copy(tmp, D);
+
+        viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_COL_KERNEL);
+
+        //kernel.global_work_size(0, A.size1() << 1);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      Q,
+                                      D,
+                                      static_cast<cl_uint>(row_start),
+                                      static_cast<cl_uint>(col_start),
+                                      static_cast<cl_uint>(A.size1()),
+                                      static_cast<cl_uint>(A.size2()),
+                                      static_cast<cl_uint>(A.internal_size2()),
+                                      static_cast<cl_uint>(Q.internal_size2()),
+                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                              ));
+
+        return true;
+      }*/
+
+      /** @brief Applies one Householder reflection from the left to A (column step of the bidiagonalization)
+      *         and accumulates it into Q via the UPDATE_A_LEFT / UPDATE_QL OpenCL kernels.
+      *         NOTE(review): exact kernel semantics inferred from the kernel names — confirm against
+      *         viennacl/linalg/opencl/kernels/svd.hpp.
+      *
+      * @param A          Matrix being reduced; updated in place
+      * @param Q          Left orthogonal factor accumulator; updated in place
+      * @param D          Scratch vector receiving the Householder vector (filled by prepare_householder_vector)
+      * @param row_start  First row affected by the reflection
+      * @param col_start  Column the reflection acts on
+      * @return false if fewer than two rows remain (nothing to do), true otherwise
+      */
+      template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      bool householder_c(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                          viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
+                          viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
+                          vcl_size_t row_start, vcl_size_t col_start)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+        if(row_start + 1 >= A.size1())
+          return false;
+
+        // true => build the Householder vector from a column of A:
+        prepare_householder_vector(A, D, A.size1(), row_start, col_start, row_start, true);
+
+        {
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+
+          viennacl::ocl::enqueue(kernel(
+                                        A,
+                                        D,
+                                        static_cast<cl_uint>(row_start),
+                                        static_cast<cl_uint>(col_start),
+                                        static_cast<cl_uint>(A.size1()),
+                                        static_cast<cl_uint>(A.size2()),
+                                        static_cast<cl_uint>(A.internal_size2()),
+                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                ));
+        }
+
+        {
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+
+          viennacl::ocl::enqueue(kernel(
+                                        Q,
+                                        D,
+                                        static_cast<cl_uint>(A.size1()),
+                                        static_cast<cl_uint>(A.size2()),
+                                        static_cast<cl_uint>(Q.internal_size2()),
+                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                ));
+        }
+
+        return true;
+      }
+
+      /*
+      template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      bool householder_r(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
+                          viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
+                          viennacl::vector<SCALARTYPE, ALIGNMENT>& S,
+                          vcl_size_t start)
+      {
+
+        vcl_size_t row_start = start;
+        vcl_size_t col_start = start + 1;
+
+        if(col_start + 1 >= A.size2())
+          return false;
+
+        std::vector<SCALARTYPE> tmp(A.size2(), 0);
+
+        copy_vec(A, S, row_start, col_start, false);
+        fast_copy(S.begin(),
+                  S.begin() + (A.size2() - col_start),
+                  tmp.begin() + col_start);
+
+        detail::householder_vector(tmp, col_start);
+        fast_copy(tmp, S);
+
+        viennacl::ocl::kernel& kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_ROW_KERNEL);
+
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      Q,
+                                      S,
+                                      static_cast<cl_uint>(row_start),
+                                      static_cast<cl_uint>(col_start),
+                                      static_cast<cl_uint>(A.size1()),
+                                      static_cast<cl_uint>(A.size2()),
+                                      static_cast<cl_uint>(A.internal_size2()),
+                                      static_cast<cl_uint>(Q.internal_size2()),
+                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                ));
+        return true;
+      } */
+
+      /** @brief Applies one Householder reflection from the right to A (row step of the bidiagonalization)
+      *         and accumulates it into Q via the UPDATE_A_RIGHT / UPDATE_QR OpenCL kernels.
+      *         NOTE(review): exact kernel semantics inferred from the kernel names — confirm against
+      *         viennacl/linalg/opencl/kernels/svd.hpp.
+      *
+      * @param A          Matrix being reduced; updated in place
+      * @param Q          Right orthogonal factor accumulator; updated in place
+      * @param D          Scratch vector receiving the Householder vector (filled by prepare_householder_vector)
+      * @param row_start  Row the reflection acts on
+      * @param col_start  First column affected by the reflection
+      * @return false if fewer than two columns remain (nothing to do), true otherwise
+      */
+      template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      bool householder_r(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
+                          viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q,
+                          viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
+                          vcl_size_t row_start, vcl_size_t col_start)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+        if(col_start + 1 >= A.size2())
+          return false;
+
+        // false => build the Householder vector from a row of A:
+        prepare_householder_vector(A, D, A.size2(), row_start, col_start, col_start, false);
+
+        {
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+
+          viennacl::ocl::enqueue(kernel(
+                                        A,
+                                        D,
+                                        static_cast<cl_uint>(row_start),
+                                        static_cast<cl_uint>(col_start),
+                                        static_cast<cl_uint>(A.size1()),
+                                        static_cast<cl_uint>(A.size2()),
+                                        static_cast<cl_uint>(A.internal_size2()),
+                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                ));
+        }
+
+        {
+          viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QR_KERNEL);
+
+          viennacl::ocl::enqueue(kernel(
+                                        Q,
+                                        D,
+                                        static_cast<cl_uint>(A.size1()),
+                                        static_cast<cl_uint>(A.size2()),
+                                        static_cast<cl_uint>(Q.internal_size2()),
+                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
+                                ));
+        }
+
+        return true;
+      }
+
+      /** @brief Reduces Ai to bidiagonal form in place by alternating left and right Householder
+      *         reflections, accumulating the orthogonal factors in QL and QR (both reset to identity first).
+      *
+      * @param Ai  Input matrix; overwritten with its bidiagonal reduction
+      * @param QL  Receives the accumulated left orthogonal factor
+      * @param QR  Receives the accumulated right orthogonal factor
+      */
+      template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      void bidiag(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Ai,
+                  viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QL,
+                  viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QR)
+      {
+        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(QL).context());
+
+        vcl_size_t row_num = Ai.size1();
+        vcl_size_t col_num = Ai.size2();
+
+        vcl_size_t to = std::min(row_num, col_num);
+        vcl_size_t big_to = std::max(row_num, col_num);
+
+        //for storing householder vector (sized for the larger dimension, shared by both steps)
+        viennacl::vector<SCALARTYPE, ALIGNMENT> hh_vector(big_to);
+
+        QL = viennacl::identity_matrix<SCALARTYPE>(QL.size1(), ctx);
+        QR = viennacl::identity_matrix<SCALARTYPE>(QR.size1(), ctx);
+
+        // Column reflection at (i, i), then row reflection at (i, i+1), as in standard bidiagonalization:
+        for(vcl_size_t i = 0; i < to; i++)
+        {
+          householder_c(Ai, QL, hh_vector, i, i);
+          householder_r(Ai, QR, hh_vector, i, i+1);
+        }
+      }
+
+    } // namespace detail
+
+
+    /** @brief Computes the singular value decomposition of a matrix A. Experimental in 1.3.x
+     *
+     * Requires an OpenCL context (the SVD kernels are OpenCL-only) and Boost.uBLAS for host-side vectors.
+     *
+     * @param A     The input matrix. Will be overwritten with a diagonal matrix containing the singular values on return
+     * @param QL    The left orthogonal matrix
+     * @param QR    The right orthogonal matrix
+     */
+    template <typename SCALARTYPE, unsigned int ALIGNMENT>
+    void svd(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
+              viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QL,
+              viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & QR)
+    {
+      viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+      viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::init(ctx);  // compile the SVD kernel program for this context
+
+      vcl_size_t row_num = A.size1();
+      vcl_size_t col_num = A.size2();
+
+      // Number of singular values:
+      vcl_size_t to = std::min(row_num, col_num);
+
+
+      //viennacl::vector<SCALARTYPE, ALIGNMENT> d(to);
+      //viennacl::vector<SCALARTYPE, ALIGNMENT> s(to + 1);
+
+      // first stage: reduce A to bidiagonal form, accumulating QL and QR on the device
+      detail::bidiag(A, QL, QR);
+
+      // second stage: diagonalize the bidiagonal matrix with implicit QR shifts on host-side vectors
+      //std::vector<SCALARTYPE> dh(to, 0);
+      //std::vector<SCALARTYPE> sh(to + 1, 0);
+      boost::numeric::ublas::vector<SCALARTYPE> dh = boost::numeric::ublas::scalar_vector<SCALARTYPE>(to, 0);
+      boost::numeric::ublas::vector<SCALARTYPE> sh = boost::numeric::ublas::scalar_vector<SCALARTYPE>(to + 1, 0);
+
+      // Extract diagonal (dh) and superdiagonal (sh) of the bidiagonalized A:
+      detail::bidiag_pack(A, dh, sh);
+
+      detail::svd_qr_shift( QL, QR, dh, sh);
+
+      // Write resulting diagonal matrix with singular values to A:
+      boost::numeric::ublas::matrix<SCALARTYPE> h_Sigma(row_num, col_num);
+      h_Sigma.clear();
+
+      for (vcl_size_t i = 0; i < to; i++)
+        h_Sigma(i, i) = dh[i];
+
+      copy(h_Sigma, A);
+    }
+  }
+}
+#endif
diff --git a/viennacl/linalg/toeplitz_matrix_operations.hpp b/viennacl/linalg/toeplitz_matrix_operations.hpp
index 098ebbc..7901ea4 100644
--- a/viennacl/linalg/toeplitz_matrix_operations.hpp
+++ b/viennacl/linalg/toeplitz_matrix_operations.hpp
@@ -2,29 +2,28 @@
 #define VIENNACL_LINALG_TOEPLITZ_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file toeplitz_matrix_operations.hpp
-    @brief Implementations of operations using toeplitz_matrix
+    @brief Implementations of operations using toeplitz_matrix. Experimental.
 */
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/backend.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/tools.hpp"
@@ -34,48 +33,10 @@ namespace viennacl
 {
   namespace linalg
   {
-    
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const toeplitz_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const toeplitz_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const toeplitz_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
+
+
     // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a toeplitz_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                viennacl::op_prod > prod_impl(const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                                              const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                                              size_t NUM_THREADS)
-    {
-      return viennacl::vector_expression<const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT>,
-                               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               viennacl::op_prod >(mat, vec);
-    }
-    
+
     /** @brief Carries out matrix-vector multiplication with a toeplitz_matrix
     *
     * Implementation of the convenience expression result = prod(mat, vec);
@@ -84,131 +45,32 @@ namespace viennacl
     * @param vec    The vector
     * @param result The result vector
     */
-      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                     const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
-                           viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-        
-        viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tep(mat.elements().size() * 2);
-        viennacl::detail::fft::real_to_complex(mat.elements(), tep, mat.elements().size());
-
-        viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp(vec.size() * 4);
-        viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp2(vec.size() * 4);
-
-        tmp.clear();
-        copy(vec, tmp);
-        viennacl::detail::fft::real_to_complex(tmp, tmp2, vec.size() * 2);
-        viennacl::linalg::convolve(tep, tmp2, tmp);
-        viennacl::detail::fft::complex_to_real(tmp, tmp2, vec.size() * 2);
-        copy(tmp2.begin(), tmp2.begin() + vec.size(), result.begin());
-      }
+    template<class SCALARTYPE, unsigned int ALIGNMENT>
+    void prod_impl(const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT> & mat,
+                   const viennacl::vector_base<SCALARTYPE> & vec,
+                         viennacl::vector_base<SCALARTYPE> & result)
+    {
+      assert(mat.size1() == result.size());
+      assert(mat.size2() == vec.size());
 
-  } //namespace linalg
+      viennacl::vector<SCALARTYPE> tmp(vec.size() * 4); tmp.clear();
+      viennacl::vector<SCALARTYPE> tmp2(vec.size() * 4);
 
+      viennacl::vector<SCALARTYPE> tep(mat.elements().size() * 2);
+      viennacl::detail::fft::real_to_complex(mat.elements(), tep, mat.elements().size());
 
 
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
 
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
+      copy(vec, tmp);
+      viennacl::detail::fft::real_to_complex(tmp, tmp2, vec.size() * 2);
+      viennacl::linalg::convolve(tep, tmp2, tmp);
+      viennacl::detail::fft::complex_to_real(tmp, tmp2, vec.size() * 2);
+      copy(tmp2.begin(), tmp2.begin() + static_cast<vcl_ptrdiff_t>(vec.size()), result.begin());
     }
 
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.get_lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
+  } //namespace linalg
+
 
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
 
 } //namespace viennacl
 
diff --git a/viennacl/linalg/tred2.hpp b/viennacl/linalg/tred2.hpp
new file mode 100644
index 0000000..0f66c48
--- /dev/null
+++ b/viennacl/linalg/tred2.hpp
@@ -0,0 +1,68 @@
+#ifndef VIENNACL_LINALG_TRED2_HPP_
+#define VIENNACL_LINALG_TRED2_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/tred2.hpp
+*   @brief optimized CPU tridiagonalization
+*
+*   Contributed by Alex Christensen.
+*/
+
+#include "viennacl/linalg/host_based/sse_kernels.hpp"
+#include <boost/numeric/ublas/matrix.hpp>
+
+namespace viennacl
+{
+  namespace linalg
+  {
+    /** @brief Inplace reduction of a hermitian (or real symmetric) to tridiagonal form using householder similarity transforms (preserving eigenvalues)
+     *
+     * @param A            A dense matrix to be tridiagonalized
+     * @param block_size   The block size to be used
+     */
+    template<typename ScalarType>
+    void inplace_tred2(boost::numeric::ublas::matrix<ScalarType> const & A, vcl_size_t block_size = 1)
+    {
+
+#ifdef VIENNACL_WITH_OPENMP
+      vcl_size_t num_threads=(vcl_size_t)omp_get_max_threads();
+      omp_set_num_threads(omp_get_max_threads());
+#else
+      vcl_size_t num_threads=1;
+#endif
+
+      vcl_size_t n=A.size1();
+      if(n!=A.size2())
+        std::cerr << "ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl;
+      block_size=std::min(n,block_size);
+
+      //get pointers to the value arrays
+      ScalarType** rows=new ScalarType*[n];
+      for(vcl_size_t i=0;i<n;i++)
+        rows[i]=(ScalarType*)&A(i,0);
+
+      //call the optimized CPU code
+      inplace_tred2(rows,A.size1(),block_size,num_threads);
+
+      delete [] rows;
+    }
+
+  } //namespace linalg
+} //namespace viennacl
+#endif
diff --git a/viennacl/linalg/vandermonde_matrix_operations.hpp b/viennacl/linalg/vandermonde_matrix_operations.hpp
index 80002d5..7d1bf3e 100644
--- a/viennacl/linalg/vandermonde_matrix_operations.hpp
+++ b/viennacl/linalg/vandermonde_matrix_operations.hpp
@@ -2,81 +2,40 @@
 #define VIENNACL_LINALG_VANDERMONDE_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file vandermonde_matrix_operations.hpp
-    @brief Implementations of operations using vandermonde_matrix
+/** @file viennacl/linalg/vandermonde_matrix_operations.hpp
+    @brief Implementations of operations using vandermonde_matrix. Experimental.
 */
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/tools.hpp"
 #include "viennacl/fft.hpp"
-//#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
+#include "viennacl/linalg/opencl/vandermonde_matrix_operations.hpp"
 
 namespace viennacl
 {
   namespace linalg
   {
-    
-    
-    // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a compressed_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    vector_expression<const vandermonde_matrix<SCALARTYPE, ALIGNMENT>,
-                      const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                      op_prod > prod_impl(const vandermonde_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                     const vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
-    {
-      return vector_expression<const vandermonde_matrix<SCALARTYPE, ALIGNMENT>,
-                               const vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               op_prod >(mat, vec);
-    }
-    
+
     // A * x
-    /** @brief Returns a proxy class that represents matrix-vector multiplication with a vandermonde_matrix
-    *
-    * This is used for the convenience expression result = prod(mat, vec);
-    *
-    * @param mat    The matrix
-    * @param vec    The vector
-    * @param NUM_THREADS Number of threads per work group. Can be used for fine-tuning.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-    viennacl::vector_expression<const viennacl::vandermonde_matrix<SCALARTYPE, ALIGNMENT>,
-                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                                viennacl::op_prod > prod_impl(const viennacl::vandermonde_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                                                              const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec, 
-                                                              size_t NUM_THREADS)
-    {
-      return viennacl::vector_expression<const viennacl::vandermonde_matrix<SCALARTYPE, ALIGNMENT>,
-                               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>, 
-                               viennacl::op_prod >(mat, vec);
-    }
-    
+
     /** @brief Carries out matrix-vector multiplication with a vandermonde_matrix
     *
     * Implementation of the convenience expression result = prod(mat, vec);
@@ -85,126 +44,26 @@ namespace viennacl
     * @param vec    The vector
     * @param result The result vector
     */
-      template<class SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
-      void prod_impl(const viennacl::vandermonde_matrix<SCALARTYPE, ALIGNMENT> & mat, 
-                     const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
-                           viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
-      {
-        assert(mat.size1() == result.size());
-        assert(mat.size2() == vec.size());
-        
-        //fft::vandermonde_prod<SCALARTYPE>(mat.handle(), vec.handle(), result.handle(), mat.size1());      
-        viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-        
-        viennacl::ocl::kernel& kernel = viennacl::ocl::current_context()
-                                          .get_program(viennacl::linalg::kernels::fft<SCALARTYPE, 1>::program_name())
-                                          .get_kernel("vandermonde_prod");
-        viennacl::ocl::enqueue(kernel(mat, vec, result, static_cast<cl_uint>(mat.size1())));
-      }
-
-  } //namespace linalg
-
-
-
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
+    template<class SCALARTYPE, unsigned int ALIGNMENT>
+    void prod_impl(const viennacl::vandermonde_matrix<SCALARTYPE, ALIGNMENT> & mat,
+                   const viennacl::vector_base<SCALARTYPE> & vec,
+                         viennacl::vector_base<SCALARTYPE> & result)
     {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle().get() == this->handle().get())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
+      assert(mat.size1() == result.size());
+      assert(mat.size2() == vec.size());
+
+      switch (viennacl::traits::handle(mat).get_active_handle_id())
       {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::prod_impl(mat, vec, result);
+          break;
+        default:
+          throw "not implemented";
       }
-      return *this;
     }
 
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
-      return *this;
-    }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.get_lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
-      return *this;
-    }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
+  } //namespace linalg
 
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                op_prod> & proxy) 
-    {
-      assert(proxy.get_lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
 
 } //namespace viennacl
 
diff --git a/viennacl/linalg/vector_operations.hpp b/viennacl/linalg/vector_operations.hpp
index 2cceee3..9b6eb51 100644
--- a/viennacl/linalg/vector_operations.hpp
+++ b/viennacl/linalg/vector_operations.hpp
@@ -1,998 +1,880 @@
-#ifndef VIENNACL_VECTOR_OPERATIONS_HPP_
-#define VIENNACL_VECTOR_OPERATIONS_HPP_
+#ifndef VIENNACL_LINALG_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_VECTOR_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file vector_operations.hpp
+/** @file viennacl/linalg/vector_operations.hpp
     @brief Implementations of vector operations.
 */
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
 #include "viennacl/scalar.hpp"
 #include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/kernels/vector_kernels.h"
 #include "viennacl/meta/predicate.hpp"
 #include "viennacl/meta/enable_if.hpp"
 #include "viennacl/traits/size.hpp"
 #include "viennacl/traits/start.hpp"
 #include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/vector_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/vector_operations.hpp"
+#endif
 
 namespace viennacl
 {
   namespace linalg
   {
-    /** @brief Addition of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * @param vec1  The first addend. 
-    * @param vec2  The second addend.
-    * @param result The result vector.
-    */
-    template <typename V1, typename V2, typename V3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_vector<V3>::value
-                                >::type
-    add(const V1 & vec1, 
-        const V2 & vec2, 
-        V3 & result)
+    template <typename T, typename ScalarType1>
+    void av(vector_base<T> & vec1,
+            vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in add()!");
-
-      //unsigned int size = std::min(viennacl::traits::internal_size(vec1),
-      //                             viennacl::traits::internal_size(vec2));
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "add");
-      
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::traits::handle(result),  cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)) )
-                            );
+      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 = v2 @ alpha: size(v1) != size(v2)"));
+
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Inplace addition of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes vec1 += vec2.
-    * 
-    * @param vec1  The result. 
-    * @param vec2  The addend
-    */
-    template <typename V1, typename V2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                >::type
-    inplace_add(V1 & vec1,
-                const V2 & vec2)
+
+    template <typename T, typename ScalarType1, typename ScalarType2>
+    void avbv(vector_base<T> & vec1,
+              vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+              vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_add()!");
-      
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_add");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)))
-                            );
+      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v1) != size(v2)"));
+      assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v2) != size(v3)"));
+
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::avbv(vec1,
+                                                  vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                  vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::avbv(vec1,
+                                         vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::avbv(vec1,
+                                       vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                       vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
+    template <typename T, typename ScalarType1, typename ScalarType2>
+    void avbv_v(vector_base<T> & vec1,
+                vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+    {
+      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 += v2 @ alpha + v3 @ beta: size(v1) != size(v2)"));
+      assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 += v2 @ alpha + v3 @ beta: size(v2) != size(v3)"));
 
-    /** @brief Subtraction of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * result = vec1 - vec2
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::avbv_v(vec1,
+                                                    vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                                    vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::avbv_v(vec1,
+                                           vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                           vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::avbv_v(vec1,
+                                         vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                         vec3,  beta, len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
+    }
+
+
+    /** @brief Assign a constant value to a vector (-range/-slice)
     *
-    * @param vec1  The first operand. 
-    * @param vec2  The second operand.
-    * @param result The result vector.
+    * @param vec1   The vector to which the value should be assigned
+    * @param alpha  The value to be assigned
+    * @param up_to_internal_size    Whether 'alpha' should be written to padded memory as well. This is used for setting all entries to zero, including padded memory.
     */
-    template <typename V1, typename V2, typename V3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_vector<V3>::value
-                                >::type
-    sub(const V1 & vec1,
-        const V2 & vec2,
-        V3 & result)
+    template <typename T>
+    void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in sub()!");
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sub");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::traits::handle(result),  cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)) )
-                            );        
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::vector_assign(vec1, alpha, up_to_internal_size);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::vector_assign(vec1, alpha, up_to_internal_size);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::vector_assign(vec1, alpha, up_to_internal_size);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Inplace addition of two vectors. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
+
+    /** @brief Swaps the contents of two vectors, data is copied
     *
-    * Computes vec1 -= vec2.
-    * 
-    * @param vec1  The result. 
-    * @param vec2  The subtracted vector
+    * @param vec1   The first vector (or -range, or -slice)
+    * @param vec2   The second vector (or -range, or -slice)
     */
-    template <typename V1, typename V2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                >::type
-    inplace_sub(V1 & vec1,
-                const V2 & vec2)
+    template <typename T>
+    void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_sub()!");
-      
-      //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_sub");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)))
-                            );        
+      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in vector_swap()"));
+
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::vector_swap(vec1, vec2);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::vector_swap(vec1, vec2);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::vector_swap(vec1, vec2);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
-    //result = vec * scalar
-    /** @brief Scales a vector. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result = vec * alpha, where alpha is a gpu scalar
+    ///////////////////////// Elementwise operations /////////////
+
+
+
+    /** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
     *
-    * @param vec    The vector to be scaled.
-    * @param alpha  The scaling factor.
-    * @param result The result vector.
+    * @param vec1   The result vector (or -range, or -slice)
+    * @param proxy  The proxy object holding v2, v3 and the operation
     */
-    template <typename V1, typename S2, typename V3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_scalar<S2>::value
-                                  && viennacl::is_vector<V3>::value
-                                >::type
-    mult(const V1 & vec,
-         S2 const & alpha,
-         V3 & result)
+    template <typename T, typename OP>
+    void element_op(vector_base<T> & vec1,
+                    vector_expression<const vector_base<T>, const vector_base<T>, OP> const & proxy)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mult()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),
-                               alpha,
-                               viennacl::traits::handle(result),  cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
-                            );        
+      assert(viennacl::traits::size(vec1) == viennacl::traits::size(proxy) && bool("Incompatible vector sizes in element_op()"));
+
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::element_op(vec1, proxy);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::element_op(vec1, proxy);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::element_op(vec1, proxy);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Scales a vector. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result = vec * alpha, where alpha is a cpu scalar
-    *
-    * @param vec    The vector to be scaled.
-    * @param alpha  The scaling factor.
-    * @param result The result vector.
-    */
-    template <typename V1, typename SCALARTYPE, typename V3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                  && viennacl::is_vector<V3>::value
-                                >::type
-    mult(V1 const & vec,
-         SCALARTYPE alpha,
-         V3 & result)
+    /** \cond */
+
+// Helper macro for generating binary element-wise operations such as element_prod(), element_div(), element_pow() without unnecessary code duplication
+#define VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(OPNAME) \
+    template <typename T> \
+    viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_##OPNAME> > \
+    element_##OPNAME(vector_base<T> const & v1, vector_base<T> const & v2) \
+    { \
+      return viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_##OPNAME> >(v1, v2); \
+    } \
+\
+    template <typename V1, typename V2, typename OP, typename T> \
+    viennacl::vector_expression<const vector_expression<const V1, const V2, OP>, const vector_base<T>, op_element_binary<op_##OPNAME> > \
+    element_##OPNAME(vector_expression<const V1, const V2, OP> const & proxy, vector_base<T> const & v2) \
+    { \
+      return viennacl::vector_expression<const vector_expression<const V1, const V2, OP>, const vector_base<T>, op_element_binary<op_##OPNAME> >(proxy, v2); \
+    } \
+\
+    template <typename T, typename V2, typename V3, typename OP> \
+    viennacl::vector_expression<const vector_base<T>, const vector_expression<const V2, const V3, OP>, op_element_binary<op_##OPNAME> > \
+    element_##OPNAME(vector_base<T> const & v1, vector_expression<const V2, const V3, OP> const & proxy) \
+    { \
+      return viennacl::vector_expression<const vector_base<T>, const vector_expression<const V2, const V3, OP>, op_element_binary<op_##OPNAME> >(v1, proxy); \
+    } \
+\
+    template <typename V1, typename V2, typename OP1, \
+              typename V3, typename V4, typename OP2> \
+    viennacl::vector_expression<const vector_expression<const V1, const V2, OP1>, \
+                                const vector_expression<const V3, const V4, OP2>, \
+                                op_element_binary<op_##OPNAME> > \
+    element_##OPNAME(vector_expression<const V1, const V2, OP1> const & proxy1, \
+                     vector_expression<const V3, const V4, OP2> const & proxy2) \
+    {\
+      return viennacl::vector_expression<const vector_expression<const V1, const V2, OP1>, \
+                                         const vector_expression<const V3, const V4, OP2>, \
+                                         op_element_binary<op_##OPNAME> >(proxy1, proxy2); \
+    }
+
+    VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(prod)  //for element_prod()
+    VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(div)   //for element_div()
+    VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(pow)   //for element_pow()
+
+#undef VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS
+
+// Helper macro for generating unary element-wise operations such as element_exp(), element_sin(), etc. without unnecessary code duplication
+#define VIENNACL_MAKE_UNARY_ELEMENT_OP(funcname) \
+    template <typename T> \
+    viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_##funcname> > \
+    element_##funcname(vector_base<T> const & v) \
+    { \
+      return viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_##funcname> >(v, v); \
+    } \
+    template <typename LHS, typename RHS, typename OP> \
+    viennacl::vector_expression<const vector_expression<const LHS, const RHS, OP>, \
+                                const vector_expression<const LHS, const RHS, OP>, \
+                                op_element_unary<op_##funcname> > \
+    element_##funcname(vector_expression<const LHS, const RHS, OP> const & proxy) \
+    { \
+      return viennacl::vector_expression<const vector_expression<const LHS, const RHS, OP>, \
+                                         const vector_expression<const LHS, const RHS, OP>, \
+                                         op_element_unary<op_##funcname> >(proxy, proxy); \
+    } \
+
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(abs)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(acos)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(asin)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(atan)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(ceil)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(cos)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(cosh)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(exp)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(fabs)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(floor)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(log)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(log10)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sin)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sinh)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(sqrt)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(tan)
+    VIENNACL_MAKE_UNARY_ELEMENT_OP(tanh)
+
+#undef VIENNACL_MAKE_UNARY_ELEMENT_OP
+
+    /** \endcond */
+
+    ///////////////////////// Norms and inner product ///////////////////
+
+
+    //implementation of inner product:
+    //namespace {
+
+    /** @brief Computes the inner product of two vectors - dispatcher interface
+     *
+     * @param vec1 The first vector
+     * @param vec2 The second vector
+     * @param result The result scalar (on the gpu)
+     */
+    template <typename T>
+    void inner_prod_impl(vector_base<T> const & vec1,
+                         vector_base<T> const & vec2,
+                         scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mult()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),
-                               static_cast<value_type>(alpha),
-                               viennacl::traits::handle(result),  cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
-                            );        
+      assert( vec1.size() == vec2.size() && bool("Size mismatch") );
+
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inner_prod_impl(vec1, vec2, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inner_prod_impl(vec1, vec2, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inner_prod_impl(vec1, vec2, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Scales a vector inplace. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result *= alpha, where alpha is a gpu scalar
-    *
-    * @param vec    The vector to be scaled.
-    * @param alpha  The scaling factor.
-    */
-    template <typename V1, typename S2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_scalar<S2>::value
-                                >::type
-    inplace_mult(V1 & vec,
-                 S2 const & alpha)
+    // vector expression on lhs
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void inner_prod_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                         vector_base<T> const & vec2,
+                         scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),
-                               alpha)
-                            );
+      viennacl::vector<T> temp = vec1;
+      inner_prod_impl(temp, vec2, result);
     }
 
-    /** @brief Scales a vector inplace. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result *= alpha, where alpha is a cpu scalar
-    *
-    * @param vec    The vector to be scaled.
-    * @param alpha  The scaling factor.
-    */
-    template <typename V1, typename S2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_cpu_scalar<S2>::value
-                                >::type
-    inplace_mult(V1 & vec,
-                 S2 alpha)
+
+    // vector expression on rhs
+    template <typename T, typename LHS, typename RHS, typename OP>
+    void inner_prod_impl(vector_base<T> const & vec1,
+                         viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                         scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mult");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)), 
-                               static_cast<value_type>(alpha))
-                            );        
+      viennacl::vector<T> temp = vec2;
+      inner_prod_impl(vec1, temp, result);
     }
 
-    //result = vec / scalar
-    /** @brief Scales a vector. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result = vec / alpha, where alpha is a gpu scalar
-    *
-    * @param vec    The vector to be scaled.
-    * @param alpha  The (inverse) scaling factor.
-    * @param result The result vector.
-    */
-    template <typename V1, typename S2, typename V3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_scalar<S2>::value
-                                  && viennacl::is_vector<V3>::value
-                                >::type
-    divide(V1 const & vec,
-           S2 const & alpha,
-           V3 & result)
+
+    // vector expression on lhs and rhs
+    template <typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2, typename T>
+    void inner_prod_impl(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                         viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                         scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in divide()!");
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "divide");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)), 
-                               alpha,
-                               viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
-                            );
+      viennacl::vector<T> temp1 = vec1;
+      viennacl::vector<T> temp2 = vec2;
+      inner_prod_impl(temp1, temp2, result);
     }
 
-    /** @brief Scales a vector inplace. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result *= alpha, where alpha is a gpu scalar
-    *
-    * @param vec    The vector to be scaled.
-    * @param alpha  The (inverse) scaling factor.
-    */
-    template <typename V1, typename S2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_scalar<S2>::value
-                                >::type
-    inplace_divide(V1 & vec,
-                   S2 const & alpha)
+
+
+
+    /** @brief Computes the inner product of two vectors with the final reduction step on the CPU - dispatcher interface
+     *
+     * @param vec1 The first vector
+     * @param vec2 The second vector
+     * @param result The result scalar (on the host, since the final reduction is performed on the CPU)
+     */
+    template <typename T>
+    void inner_prod_cpu(vector_base<T> const & vec1,
+                        vector_base<T> const & vec2,
+                        T & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_divide");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)), 
-                               alpha) 
-                            );
+      assert( vec1.size() == vec2.size() && bool("Size mismatch") );
+
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inner_prod_impl(vec1, vec2, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inner_prod_cpu(vec1, vec2, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inner_prod_cpu(vec1, vec2, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    //result = factor * vec1 + vec2
-    /** @brief Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result = alpha * vec1 + vec2, where alpha is a gpu scalar
-    *
-    * @param vec1    The first added
-    * @param alpha  The scaling factor for the first addend.
-    * @param vec2    The second added.
-    * @param result The result vector.
-    */
-    template <typename V1, typename S2, typename V3, typename V4>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_scalar<S2>::value
-                                  && viennacl::is_vector<V3>::value
-                                  && viennacl::is_vector<V4>::value
-                                >::type
-    mul_add(V1 const & vec1,
-            S2 const & alpha,
-            V3 const & vec2,
-            V4 & result)
+    // vector expression on lhs
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void inner_prod_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+                        vector_base<T> const & vec2,
+                        T & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mul_add()!");
-      
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               alpha,
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
-                            );        
+      viennacl::vector<T> temp = vec1;
+      inner_prod_cpu(temp, vec2, result);
     }
 
-    /** @brief Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result = alpha * vec1 + vec2, where alpha is a cpu scalar
-    *
-    * @param vec1    The first added
-    * @param alpha   The scaling factor for the first addend.
-    * @param vec2    The second added.
-    * @param result  The result vector.
-    */
-    template <typename V1, typename SCALARTYPE, typename V3, typename V4>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                  && viennacl::is_vector<V3>::value
-                                  && viennacl::is_vector<V4>::value
-                                >::type
-    mul_add(V1 const & vec1,
-            SCALARTYPE alpha,
-            V3 const & vec2,
-            V4 & result)
+
+    // vector expression on rhs
+    template <typename T, typename LHS, typename RHS, typename OP>
+    void inner_prod_cpu(vector_base<T> const & vec1,
+                        viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+                        T & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mul_add()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mul_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               static_cast<value_type>(alpha),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
-                            );
+      viennacl::vector<T> temp = vec2;
+      inner_prod_cpu(vec1, temp, result);
     }
 
-    //vec1 += factor * vec2
-    /** @brief Inplace Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes vec1 += alpha * vec2, where alpha is a gpu scalar
-    *
-    * @param vec1    The first added
-    * @param alpha   The scaling factor for the first addend.
-    * @param vec2    The second added.
-    */
-    template <typename V1, typename V2, typename S3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_scalar<S3>::value
-                                >::type
-    inplace_mul_add(V1 & vec1,
-                    V2 const & vec2,
-                    S3 const & alpha)
+
+    // vector expression on lhs and rhs
+    template <typename LHS1, typename RHS1, typename OP1,
+              typename LHS2, typename RHS2, typename OP2, typename S3>
+    void inner_prod_cpu(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+                        viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+                        S3 & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_mul_add()!");
-      
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_add");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               alpha));
+      viennacl::vector<S3> temp1 = vec1;
+      viennacl::vector<S3> temp2 = vec2;
+      inner_prod_cpu(temp1, temp2, result);
     }
 
-    /** @brief Inplace Multiply-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes vec1 += alpha * vec2, where alpha is a cpu scalar
-    *
-    * @param vec1    The first added
-    * @param vec2    The second added.
-    * @param alpha   The scaling factor for the first addend.
-    */
-    template <typename V1, typename V2, typename SCALARTYPE>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                >::type
-    inplace_mul_add(V1 & vec1,
-                    V2 const & vec2,
-                    SCALARTYPE alpha)
+
+
+    /** @brief Computes the inner products <x, y1>, <x, y2>, ..., <x, y_N> and writes the result to a (sub-)vector
+     *
+     * @param x       The common vector
+     * @param y_tuple A collection of vectors, all of the same size.
+     * @param result  The result vector (on the gpu). Needs to match the number of elements in y_tuple
+     */
+    template <typename T>
+    void inner_prod_impl(vector_base<T> const & x,
+                         vector_tuple<T> const & y_tuple,
+                         vector_base<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_mul_add()!");
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mul_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               value_type(alpha)));
+      assert( x.size() == y_tuple.const_at(0).size() && bool("Size mismatch") );
+      assert( result.size() == y_tuple.const_size() && bool("Number of elements does not match result size") );
+
+      switch (viennacl::traits::handle(x).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::inner_prod_impl(x, y_tuple, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::inner_prod_impl(x, y_tuple, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::inner_prod_impl(x, y_tuple, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Multiply-subtract operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes result = alpha * vec1 - vec2, where alpha is a gpu scalar
+
+    /** @brief Computes the l^1-norm of a vector - dispatcher interface
     *
-    * @param vec1    The first vector operand
-    * @param alpha   The scaling factor for the first vector.
-    * @param vec2    The second operand.
-    * @param result  The result vector.
+    * @param vec The vector
+    * @param result The result scalar
     */
-    template <typename V1, typename S2, typename V3, typename V4>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_scalar<S2>::value
-                                  && viennacl::is_vector<V3>::value
-                                  && viennacl::is_vector<V4>::value
-                                >::type
-    mul_sub(V1 const & vec1,
-            S2 const & alpha,
-            V3 const & vec2,
-            V4 & result)
+    template <typename T>
+    void norm_1_impl(vector_base<T> const & vec,
+                     scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
-             && "Incompatible vector sizes in mul_sub()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_sub");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               alpha,
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
-                            );
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::norm_1_impl(vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::norm_1_impl(vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::norm_1_impl(vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
 
-    /** @brief Inplace Multiply-subtract operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
+    /** @brief Computes the l^1-norm of a vector - interface for a vector expression. Creates a temporary.
     *
-    * Computes vec1 -= alpha * vec2, where alpha is a gpu scalar
-    *
-    * @param vec1    The result vector which is updated
-    * @param vec2    The second operand.
-    * @param alpha   The scaling factor for the vector update.
+    * @param vec    The vector expression
+    * @param result The result scalar
     */
-    template <typename V1, typename V2, typename S3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_scalar<S3>::value
-                                >::type
-    inplace_mul_sub(V1 & vec1,
-                    V2 const & vec2,
-                    S3 const & alpha)
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     S2 & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_mul_sub()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_sub");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               alpha)
-                            );        
+      viennacl::vector<typename viennacl::result_of::cpu_value_type<S2>::type> temp = vec;
+      norm_1_impl(temp, result);
     }
 
-    /** @brief Inplace divide-add operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes vec1 += vec2 / alpha, where alpha is a gpu scalar
+
+
+    /** @brief Computes the l^1-norm of a vector with final reduction on the CPU
     *
-    * @param vec1    The first vector
-    * @param vec2    The vector update
-    * @param alpha   The scaling factor for the second vector.
+    * @param vec The vector
+    * @param result The result scalar
     */
-    template <typename V1, typename V2, typename S3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_scalar<S3>::value
-                                >::type
-    inplace_div_add(V1 & vec1,
-                    V2 const & vec2,
-                    S3 const & alpha)
+    template <typename T>
+    void norm_1_cpu(vector_base<T> const & vec,
+                    T & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_div_add()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_add");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)), 
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)), 
-                               alpha)
-                            );
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::norm_1_impl(vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::norm_1_cpu(vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::norm_1_cpu(vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    /** @brief Inplace divide-subtract operation. Try to use the overloaded operators for vector instead, unless you want to fine-tune the number of GPU threads involved.
-    *
-    * Computes vec1 -= vec2 / alpha, where alpha is a gpu scalar
+    /** @brief Computes the l^1-norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
     *
-    * @param vec1    The first vector
-    * @param vec2    The vector update
-    * @param alpha   The scaling factor for the second vector.
+    * @param vec    The vector expression
+    * @param result The result scalar
     */
-    template <typename V1, typename V2, typename S3>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_scalar<S3>::value
-                                >::type
-    inplace_div_sub(V1 & vec1,
-                    V2 const & vec2,
-                    S3 const & alpha)
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                    S2 & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inplace_div_sub()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_sub");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
-                               alpha)
-                            );        
+      viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+      norm_1_cpu(temp, result);
     }
 
 
-    ///////////////////////// Norms and inner product ///////////////////
 
 
-    //implementation of inner product:
-    //namespace {
-    /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
-     *
-     * @param vec1 The first vector
-     * @param vec2 The second vector
-     * @param result The result scalar (on the gpu)
-     */
-    template <typename V1, typename V2, typename S3>
-    void inner_prod_impl(V1 const & vec1,
-                         V2 const & vec2,
-                         S3 & result,
-                         typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                       && viennacl::is_vector<V2>::value
-                                                       && viennacl::is_scalar<S3>::value
-#ifdef _MSC_VER
-                                                     >::type * dummy = 0)
-#else
-                                                     >::type * dummy)
-#endif                                                   
+    /** @brief Computes the l^2-norm of a vector - dispatcher interface
+    *
+    * @param vec The vector
+    * @param result The result scalar
+    */
+    template <typename T>
+    void norm_2_impl(vector_base<T> const & vec,
+                     scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-    
-      assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
-             && "Incompatible vector sizes in inner_prod_impl()!");
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inner_prod");
-      //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      
-      static viennacl::vector<value_type> temp(work_groups);
-      
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( (k.global_work_size() / k.local_work_size()) == 1 
-              || (k.global_work_size() / k.local_work_size()) == 2 
-              || (k.global_work_size() / k.local_work_size()) == 4
-              || (k.global_work_size() / k.local_work_size()) == 8
-              || (k.global_work_size() / k.local_work_size()) == 16
-              || (k.global_work_size() / k.local_work_size()) == 32
-              || (k.global_work_size() / k.local_work_size()) == 64
-              || (k.global_work_size() / k.local_work_size()) == 128
-              || (k.global_work_size() / k.local_work_size()) == 256
-              || (k.global_work_size() / k.local_work_size()) == 512 );
-              
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
-                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                               temp));        
-
-      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
-      
-      ksum.local_work_size(0, work_groups);
-      ksum.global_work_size(0, work_groups);
-      viennacl::ocl::enqueue(ksum(viennacl::traits::handle(temp), cl_uint(viennacl::traits::start(temp)), cl_uint(viennacl::traits::size(temp)),
-                                  result)
-                            );
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::norm_2_impl(vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::norm_2_impl(vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::norm_2_impl(vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
 
-    //public interface of inner product
-    /** @brief Computes the inner product of two vectors.
+    /** @brief Computes the l^2-norm of a vector - interface for a vector expression. Creates a temporary.
     *
-    * @param vec1 The first vector
-    * @param vec2 The second vector
-    * @return The result
+    * @param vec    The vector expression
+    * @param result The result scalar
     */
-    template <typename V1, typename V2>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value,
-                                  viennacl::scalar_expression< const V1, 
-                                                               const V2,
-                                                               viennacl::op_inner_prod >
-                                >::type
-    inner_prod_impl(V1 const & vec1,
-                    V2 const & vec2)
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void norm_2_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                     scalar<T> & result)
     {
-      return viennacl::scalar_expression< const V1, 
-                                          const V2,
-                                          viennacl::op_inner_prod >(vec1, vec2);
+      viennacl::vector<T> temp = vec;
+      norm_2_impl(temp, result);
     }
 
 
-    
-    /** @brief Computes the l^1-norm of a vector
+    /** @brief Computes the l^2-norm of a vector with final reduction on the CPU - dispatcher interface
     *
     * @param vec The vector
     * @param result The result scalar
     */
-    template <typename V1, typename S2>
-    void norm_1_impl(V1 const & vec,
-                     S2 & result,
-                     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                   && viennacl::is_scalar<S2>::value
-#ifdef _MSC_VER
-                                                 >::type * dummy = 0)
-#else
-                                                 >::type * dummy)
-#endif                                                   
+    template <typename T>
+    void norm_2_cpu(vector_base<T> const & vec,
+                    T & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_1");
-      //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-      
-      if (k.local_work_size() != k.global_work_size())
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
       {
-        //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
-        k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
-        k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::norm_2_impl(vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::norm_2_cpu(vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::norm_2_cpu(vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
       }
-      
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      viennacl::vector<value_type> temp(work_groups);
-
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( (k.global_work_size() / k.local_work_size()) == 1 
-             || (k.global_work_size() / k.local_work_size()) == 2 
-             || (k.global_work_size() / k.local_work_size()) == 4
-             || (k.global_work_size() / k.local_work_size()) == 8
-             || (k.global_work_size() / k.local_work_size()) == 16
-             || (k.global_work_size() / k.local_work_size()) == 32
-             || (k.global_work_size() / k.local_work_size()) == 64
-             || (k.global_work_size() / k.local_work_size()) == 128
-             || (k.global_work_size() / k.local_work_size()) == 256
-             || (k.global_work_size() / k.local_work_size()) == 512 );
-               
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),                                 
-                                viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                                temp));        
-      
-      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
-      
-      ksum.local_work_size(0, work_groups);
-      ksum.global_work_size(0, work_groups);
-      viennacl::ocl::enqueue(ksum(viennacl::traits::handle(temp), cl_uint(viennacl::traits::start(temp)), cl_uint(viennacl::traits::size(temp)),
-                                  result)
-                            );
     }
 
-    /** @brief Computes the l^2-norm of a vector - implementation
+    /** @brief Computes the l^2-norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
+    *
+    * @param vec    The vector expression
+    * @param result The result scalar
+    */
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_2_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                    S2 & result)
+    {
+      viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+      norm_2_cpu(temp, result);
+    }
+
+
+
+
+    /** @brief Computes the supremum-norm of a vector - dispatcher interface
     *
     * @param vec The vector
     * @param result The result scalar
     */
-    template <typename V1, typename S2>
-    void norm_2_impl(V1 const & vec,
-                     S2 & result,
-                     typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                  && viennacl::is_scalar<S2>::value
-#ifdef _MSC_VER
-                                                 >::type * dummy = 0)
-#else
-                                                 >::type * dummy)
-#endif                                                   
+    template <typename T>
+    void norm_inf_impl(vector_base<T> const & vec,
+                       scalar<T> & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_2");
-      //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-      
-      if (k.local_work_size() != k.global_work_size())
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
       {
-        //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
-        k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
-        k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::norm_inf_impl(vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::norm_inf_impl(vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::norm_inf_impl(vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
       }
+    }
 
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      viennacl::vector<value_type> temp(work_groups);
-        
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( (k.global_work_size() / k.local_work_size()) == 1 
-             || (k.global_work_size() / k.local_work_size()) == 2 
-             || (k.global_work_size() / k.local_work_size()) == 4
-             || (k.global_work_size() / k.local_work_size()) == 8
-             || (k.global_work_size() / k.local_work_size()) == 16
-             || (k.global_work_size() / k.local_work_size()) == 32
-             || (k.global_work_size() / k.local_work_size()) == 64
-             || (k.global_work_size() / k.local_work_size()) == 128
-             || (k.global_work_size() / k.local_work_size()) == 256
-             || (k.global_work_size() / k.local_work_size()) == 512 );
-               
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),                                 
-                                 viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                                 temp)
-                              );
-
-        viennacl::ocl::kernel & sqrt_sum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sqrt_sum");
-        
-        sqrt_sum.local_work_size(0, work_groups);
-        sqrt_sum.global_work_size(0, work_groups);
-        viennacl::ocl::enqueue(
-                        sqrt_sum(viennacl::traits::handle(temp), cl_uint(viennacl::traits::start(temp)), cl_uint(viennacl::traits::size(temp)),
-                                 result)
-                              );
+    /** @brief Computes the supremum norm of a vector - interface for a vector expression. Creates a temporary.
+    *
+    * @param vec    The vector expression
+    * @param result The result scalar
+    */
+    template <typename LHS, typename RHS, typename OP, typename T>
+    void norm_inf_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                       scalar<T> & result)
+    {
+      viennacl::vector<T> temp = vec;
+      norm_inf_impl(temp, result);
     }
 
-    /** @brief Computes the supremum-norm of a vector
+
+    /** @brief Computes the supremum-norm of a vector with final reduction on the CPU
     *
     * @param vec The vector
     * @param result The result scalar
     */
-    template <typename V1, typename S2>
-    void norm_inf_impl(V1 const & vec,
-                       S2 & result,
-                       typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                                     && viennacl::is_scalar<S2>::value
-#ifdef _MSC_VER
-                                                   >::type * dummy = 0)
-#else
-                                                   >::type * dummy)
-#endif                                                   
+    template <typename T>
+    void norm_inf_cpu(vector_base<T> const & vec,
+                      T & result)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_inf");
-
-      if (k.local_work_size() != k.global_work_size())
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
       {
-        //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
-        k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
-        k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::norm_inf_impl(vec, result);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::norm_inf_cpu(vec, result);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::norm_inf_cpu(vec, result);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
       }
-      
-      unsigned int work_groups = k.global_work_size() / k.local_work_size();
-      viennacl::vector<value_type> temp(work_groups);
-        
-      //Note: Number of work groups MUST be a power of two!
-      //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
-      assert( work_groups * k.local_work_size() == k.global_work_size() );
-      assert( work_groups == 1 
-             || work_groups == 2 
-             || work_groups == 4
-             || work_groups == 8
-             || work_groups == 16
-             || work_groups == 32
-             || work_groups == 64
-             || work_groups == 128
-             || work_groups == 256
-             || work_groups == 512 );
-               
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),                                 
-                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                               temp));
-      //viennacl::ocl::get_queue().finish();
-      
-      //part 2: parallel reduction of reduced kernel:
-      viennacl::ocl::kernel & max_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "vmax");
-      max_kernel.local_work_size(0, work_groups);
-      max_kernel.global_work_size(0, work_groups);
-      
-      viennacl::ocl::enqueue(
-                       max_kernel(viennacl::traits::handle(temp), cl_uint(viennacl::traits::start(temp)), cl_uint(viennacl::traits::size(temp)),
-                                  result)
-                            );
     }
 
-    //This function should return a CPU scalar, otherwise statements like 
-    // vcl_rhs[index_norm_inf(vcl_rhs)] 
+    /** @brief Computes the supremum norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
+    *
+    * @param vec    The vector expression
+    * @param result The result scalar
+    */
+    template <typename LHS, typename RHS, typename OP, typename S2>
+    void norm_inf_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+                      S2 & result)
+    {
+      viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+      norm_inf_cpu(temp, result);
+    }
+
+
+    //This function should return a CPU scalar, otherwise statements like
+    // vcl_rhs[index_norm_inf(vcl_rhs)]
     // are ambiguous
     /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
     *
     * @param vec The vector
-    * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+    * @return The result. Note that the result must be a CPU scalar
     */
-    template <typename V1>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value,
-                                  cl_uint
-                                >::type
-    index_norm_inf(V1 const & vec)
+    template <typename T>
+    vcl_size_t index_norm_inf(vector_base<T> const & vec)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      viennacl::ocl::handle<cl_mem> h = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "index_norm_inf");
-      //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
-
-      k.global_work_size(0, k.local_work_size());
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec), cl_uint(viennacl::traits::start(vec)), cl_uint(viennacl::traits::size(vec)),                                 
-                               viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
-                               viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
-      
-      //read value:
-      cl_uint result;
-      cl_int err;
-      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-      return result;
+      switch (viennacl::traits::handle(vec).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          return viennacl::linalg::host_based::index_norm_inf(vec);
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          return viennacl::linalg::opencl::index_norm_inf(vec);
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          return viennacl::linalg::cuda::index_norm_inf(vec);
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
-    
-    //TODO: Special case vec1 == vec2 allows improvement!!
+
+    /** @brief Computes the supremum norm of a vector with final reduction on the CPU - interface for a vector expression. Creates a temporary.
+    *
+    * @param vec    The vector expression
+    */
+    template <typename LHS, typename RHS, typename OP>
+    vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec)
+    {
+      viennacl::vector<typename viennacl::result_of::cpu_value_type<LHS>::type> temp = vec;
+      return index_norm_inf(temp);
+    }
+
+
     /** @brief Computes a plane rotation of two vectors.
     *
     * Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
     *
     * @param vec1   The first vector
     * @param vec2   The second vector
-    * @param alpha  The first transformation coefficient
-    * @param beta   The second transformation coefficient
+    * @param alpha  The first transformation coefficient (CPU scalar)
+    * @param beta   The second transformation coefficient (CPU scalar)
     */
-    template <typename V1, typename V2, typename SCALARTYPE>
-    typename viennacl::enable_if< viennacl::is_vector<V1>::value
-                                  && viennacl::is_vector<V2>::value
-                                  && viennacl::is_cpu_scalar<SCALARTYPE>::value
-                                >::type
-    plane_rotation(V1 & vec1,
-                   V2 & vec2,
-                   SCALARTYPE alpha,
-                   SCALARTYPE beta)
+    template <typename T>
+    void plane_rotation(vector_base<T> & vec1,
+                        vector_base<T> & vec2,
+                        T alpha, T beta)
     {
-      typedef typename viennacl::result_of::cpu_value_type<V1>::type        value_type;
-      
-      //TODO: Ensure that correct alignment is chosen for the kernels.
-      const unsigned int ALIGNMENT = V1::alignment;
-      
-      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "plane_rotation");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),                                 
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),                                 
-                               alpha,
-                               beta)
-                            );
+      switch (viennacl::traits::handle(vec1).get_active_handle_id())
+      {
+        case viennacl::MAIN_MEMORY:
+          viennacl::linalg::host_based::plane_rotation(vec1, vec2, alpha, beta);
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case viennacl::OPENCL_MEMORY:
+          viennacl::linalg::opencl::plane_rotation(vec1, vec2, alpha, beta);
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case viennacl::CUDA_MEMORY:
+          viennacl::linalg::cuda::plane_rotation(vec1, vec2, alpha, beta);
+          break;
+#endif
+        case viennacl::MEMORY_NOT_INITIALIZED:
+          throw memory_exception("not initialised!");
+        default:
+          throw memory_exception("not implemented");
+      }
     }
-    
+
   } //namespace linalg
 } //namespace viennacl
 
diff --git a/viennacl/matrix.hpp b/viennacl/matrix.hpp
index f7f9a4c..9ac27cb 100644
--- a/viennacl/matrix.hpp
+++ b/viennacl/matrix.hpp
@@ -1,1044 +1,3048 @@
-#ifndef VIENNACL_MATRIX_HPP_
-#define VIENNACL_MATRIX_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix.hpp
-    @brief Implementation of the dense matrix class
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/linalg/matrix_operations.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/tools/matrix_size_deducer.hpp"
-#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
-#include "viennacl/meta/result_of.hpp"
-#include "viennacl/meta/enable_if.hpp"
-
-namespace viennacl
-{
-    /** @brief A tag for row-major storage of a dense matrix. */
-    struct row_major
-    {
-      /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
-      *
-      * @param i   row index
-      * @param j   column index
-      * @param num_rows  number of entries per row (including alignment)
-      * @param num_cols  number of entries per column (including alignment)
-      */
-      static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t num_cols)
-      {
-        return i * num_cols + j;
-      }
-      
-      static vcl_size_t internal_size1(vcl_size_t rows, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(rows, alignment);;
-      }
-      
-      static vcl_size_t internal_size2(vcl_size_t cols, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(cols, alignment);
-      }
-    };
-
-    struct column_major
-    {
-      /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
-      *
-      * @param i   row index
-      * @param j   column index
-      * @param num_rows  number of entries per row (including alignment)
-      * @param num_cols  number of entries per column (including alignment)
-      */
-      static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t num_cols)
-      {
-        return i + j * num_rows;
-      }
-      
-      static vcl_size_t internal_size1(vcl_size_t rows, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(rows, alignment);
-      }
-      
-      static vcl_size_t internal_size2(vcl_size_t cols, vcl_size_t alignment)
-      {
-        return viennacl::tools::roundUpToNextMultiple<vcl_size_t>(cols, alignment);
-      }
-    };
-    
-    template <typename LHS, typename RHS, typename OP>
-    class matrix_expression
-    {
-      public:
-        ///** @brief Extracts the vector type from the two operands.
-        //*/
-        //typedef typename viennacl::tools::VECTOR_EXTRACTOR<LHS, RHS>::ResultType    VectorType;
-      
-        matrix_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
-        
-        /** @brief Get left hand side operand
-        */
-        LHS & lhs() const { return _lhs; }
-        /** @brief Get right hand side operand
-        */
-        RHS & rhs() const { return _rhs; }
-        
-        /** @brief Returns the size of the result vector */
-        std::size_t size1() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size1(_lhs, _rhs); }
-        std::size_t size2() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size2(_lhs, _rhs); }
-        
-      private:
-        /** @brief The left hand side operand */
-        typename result_of::matrix_expression_internal_storage<LHS>::type _lhs;
-        /** @brief The right hand side operand */
-        typename result_of::matrix_expression_internal_storage<RHS>::type _rhs;
-    };
-    
-    
-    /** @brief A tag indicating iteration along increasing row index of a matrix */
-    struct row_iteration {};
-    
-    /** @brief A tag indicating iteration along increasing columns index of a matrix */
-    struct col_iteration {};
-
-    //STL-like iterator. TODO: STL-compliance...
-    template <typename ROWCOL, typename MATRIXTYPE>
-    class matrix_iterator
-    {
-        typedef matrix_iterator<ROWCOL, MATRIXTYPE>    self_type;
-      public:
-        typedef typename MATRIXTYPE::value_type       value_type;
-        
-        matrix_iterator(MATRIXTYPE & mat, 
-                        std::size_t start_row,
-                        std::size_t start_col) : mat_(mat), row_(start_row), col_(start_col) {};
-        
-        value_type operator*(void) { return mat_(row_, col_); }
-        self_type & operator++(void) { viennacl::tools::MATRIX_ITERATOR_INCREMENTER<ROWCOL, MATRIXTYPE>::apply(mat_, row_, col_); return *this; }
-        self_type & operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
-        
-        bool operator==(self_type const & other) { return (row_ == other.row_) && (col_ == other.col_); }
-        bool operator!=(self_type const & other) { return !(*this == other); }
-        
-        vcl_size_t index1() { return row_; }
-        vcl_size_t index2() { return col_; }
-        
-        MATRIXTYPE & operator()(void) const { return mat_; }
-      
-      private:
-        MATRIXTYPE & mat_;
-        vcl_size_t row_;
-        vcl_size_t col_;
-    };
-
-    /** @brief A dense matrix class
-    *
-    * @tparam SCALARTYPE   The underlying scalar type (either float or double)
-    * @tparam F            Storage layout: Either row_major or column_major (at present only row_major is supported)
-    * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values or usually 4, 8 or 16, higher values are usually a waste of memory.
-    */
-    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    class matrix
-    {
-      typedef matrix<SCALARTYPE, F, ALIGNMENT>          self_type;
-    public:
-      
-      typedef matrix_iterator<row_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator1;
-      typedef matrix_iterator<col_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator2;
-      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
-      typedef vcl_size_t                                                          size_type;
-      
-      /** @brief The default constructor. Does not allocate any memory. */
-      matrix() : rows_(0), columns_(0)
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-      };
-      
-      /** @brief Creates the matrix with the given dimensions
-      *
-      * @param rows     Number of rows
-      * @param columns  Number of columns
-      */
-      explicit matrix(size_type rows, size_type columns) :
-        rows_(rows), columns_(columns)
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-      }
-
-      explicit matrix(cl_mem mem, size_type rows, size_type columns) :
-        rows_(rows), columns_(columns)
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = mem;
-        elements_.inc(); //prevents that the user-provided memory is deleted once the matrix object is destroyed.
-      }
-
-      template <typename LHS, typename RHS, typename OP>
-      matrix(matrix_expression< LHS, RHS, OP> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-
-      matrix(matrix_range<self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-
-      matrix(matrix_range<const self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        KernelClass::init();
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        *this = proxy;
-      }
-
-
-
-      //copy constructor:
-      matrix(const matrix<SCALARTYPE, F, ALIGNMENT> & mat) :
-        rows_(mat.size1()), columns_(mat.size2()),
-        elements_(viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size()))
-      {
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix<SCALARTYPE, F, ALIGNMENT> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        return *this;
-      }
-      
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                            const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                            op_trans> & proxy)
-      {
-        assert(elements_.get() != proxy.lhs().handle().get() && "Self-assignment of matrix transpose not implemented");
-        assert(proxy.lhs().size1() == size2() && "Matrix dimensions do not match!");
-        assert(proxy.lhs().size2() == size1() && "Matrix dimensions do not match!");
-
-        resize(proxy.lhs().size2(), proxy.lhs().size1(), false);
-        
-        std::vector<SCALARTYPE> temp(proxy.lhs().internal_size());
-        
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         proxy.lhs().handle().get(), CL_TRUE, 0,
-                                         sizeof(SCALARTYPE)*proxy.lhs().internal_size(),
-                                         &(temp[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-
-        /*
-        for (size_t i=0; i<proxy.lhs().size1(); ++i)
-        {
-          for (size_t j=0; j<proxy.lhs().size2(); ++j)
-            std::cout << temp[F::mem_index(i,j, proxy.lhs().internal_size1(), proxy.lhs().internal_size2())] << ", ";
-        }*/
-        
-        std::vector<SCALARTYPE> temp_trans(internal_size());
-
-        for (vcl_size_t i=0; i<proxy.lhs().size1(); ++i)
-          for (vcl_size_t j=0; j<proxy.lhs().size2(); ++j)
-            temp_trans[F::mem_index(j,i, internal_size1(), internal_size2())] 
-             = temp[F::mem_index(i,j, proxy.lhs().internal_size1(), proxy.lhs().internal_size2())];
-
-        /*     
-        for (size_t i=0; i<proxy.lhs().size1(); ++i)
-        {
-          for (size_t j=0; j<proxy.lhs().size2(); ++j)
-            std::cout << temp_trans[F::mem_index(i,j, proxy.lhs().internal_size1(), proxy.lhs().internal_size2())] << ", ";
-        }*/
-        
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
-                                                                   sizeof(SCALARTYPE)*internal_size(),
-                                                                   &(temp_trans[0]));
-          
-        return *this;
-      }
-
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<self_type> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        
-        // clear matrix:
-        clear();
-        
-        // use inplace_add:
-        viennacl::linalg::inplace_add(*this, mat);
-        
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<const self_type> & mat)
-      {
-        resize(mat.size1(), mat.size2(), false);
-        
-        // clear matrix:
-        clear();
-        
-        // use inplace_add:
-        viennacl::linalg::inplace_add(*this, mat);
-        
-        return *this;
-      }
-
-
-
-
-      /** @brief Resizes the matrix.
-      *   Existing entries can be preserved, but 
-      *
-      * @param rows       New number of rows
-      * @param columns    New number of columns
-      * @param preserve   If true, existing values are preserved. 
-      */
-      void resize(size_type rows, size_type columns, bool preserve = true)
-      {
-        assert(rows > 0 && columns > 0);
-        if (preserve)
-        {
-          //get old entries:
-          std::vector< SCALARTYPE > old_entries(internal_size());
-          cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), //src
-                                           elements_.get(), //dest
-                                           CL_TRUE, //blocking
-                                           0, //offset
-                                           sizeof(SCALARTYPE)*internal_size(), //size
-                                           &(old_entries[0]), //destination
-                                           0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-          
-          //set up entries of new matrix:
-          std::vector< SCALARTYPE > new_entries(F::internal_size1(rows, ALIGNMENT) * F::internal_size2(columns, ALIGNMENT));
-          for (size_type i=0; i<rows; ++i)
-          {
-            if (i >= rows_)
-              continue;
-              
-            for (size_type j=0; j<columns; ++j)
-            {
-              if (j >= columns_)
-                continue;
-              new_entries[F::mem_index(i, j, F::internal_size1(rows, ALIGNMENT), F::internal_size2(columns, ALIGNMENT))] 
-                 = old_entries[F::mem_index(i, j, internal_size1(), internal_size2())];
-            }
-          }
-          
-          //copy new entries to GPU:
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, new_entries);
-          rows_ = rows;
-          columns_ = columns;
-        }
-        else //discard old entries:
-        {
-          rows_ = rows;
-          columns_ = columns;
-          
-          std::vector< SCALARTYPE > new_entries(F::internal_size1(rows, ALIGNMENT) * F::internal_size2(columns, ALIGNMENT));
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, new_entries);
-        }
-      }
-      
-      
-      //read-write access to an element of the vector
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index)
-      {
-        return entry_proxy<SCALARTYPE>(F::mem_index(row_index, col_index, internal_size1(), internal_size2()), elements_);
-      }
-      
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<SCALARTYPE> operator()(size_type row_index, size_type col_index) const
-      {
-        scalar<SCALARTYPE> tmp;
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                  elements_.get(),
-                                  tmp.handle().get(),
-                                  sizeof(SCALARTYPE) * F::mem_index(row_index, col_index, internal_size1(), internal_size2()),
-                                  0,
-                                  sizeof(SCALARTYPE),
-                                  0,
-                                  NULL,
-                                  NULL);
-        //assert(err == CL_SUCCESS);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      }
-      
-
-      matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         op_add >
-      operator + (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  op_add > (*this, other);
-      }
-
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_range< matrix<SCALARTYPE, F, ALIGNMENT> > & other) 
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-
-      matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         const matrix<SCALARTYPE, F, ALIGNMENT>,
-                         op_sub >
-      operator - (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                  op_sub > (*this, other);
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix< SCALARTYPE, F, ALIGNMENT> & other) 
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                               const vector<SCALARTYPE, A2>,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::rank_1_update(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                               const vector<SCALARTYPE, A2>,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::scaled_rank_1_update(*this, static_cast<SCALARTYPE>(-1.0), proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator += (const matrix_expression< const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                                                        const vector<SCALARTYPE, A2>,
-                                                                                                        op_prod >,
-                                                                               const SCALARTYPE,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::scaled_rank_1_update(*this, proxy.rhs(), proxy.lhs().lhs(), proxy.lhs().rhs());
-        return *this;
-      }
-
-      template <unsigned int A1, unsigned int A2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator -= (const matrix_expression< const matrix_expression< const vector<SCALARTYPE, A1>,
-                                                                                                        const vector<SCALARTYPE, A2>,
-                                                                                                        op_prod >,
-                                                                               const SCALARTYPE,
-                                                                               op_prod > & proxy) 
-      {
-        viennacl::linalg::scaled_rank_1_update(*this, static_cast<SCALARTYPE>(-1.0) * proxy.rhs(), proxy.lhs().lhs(), proxy.lhs().rhs());
-        return *this;
-      }
-      
-      
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator *= (SCALARTYPE val) 
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator *= (scalar<SCALARTYPE> const & val) 
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator /= (SCALARTYPE val) 
-      {
-        viennacl::linalg::inplace_mult(*this, SCALARTYPE(1.0) / val);
-        return *this;
-      }
-
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator /= (scalar<SCALARTYPE> const & val) 
-      {
-        viennacl::linalg::inplace_divide(*this, val);
-        return *this;
-      }
-
-
-      //this = A * B and related (with trans())
-      template <typename MatrixType1, typename MatrixType2>
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< MatrixType1,
-                                                                              MatrixType2,
-                                                                              op_prod > & proxy) 
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      //this = A + B
-      template <typename T1, typename T2>
-      matrix<SCALARTYPE, F, ALIGNMENT> &
-      operator = (const matrix_expression< const T1,
-                                           const T2,
-                                           op_add > & proxy) 
-      {
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      //this = A - B
-      template <typename T1, typename T2>
-      matrix<SCALARTYPE, F, ALIGNMENT> &
-      operator = (const matrix_expression< const T1,
-                                           const T2,
-                                           op_sub > & proxy) 
-      {
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      
-      
-
-      //this = A - B
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                               const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                               op_sub > & proxy) 
-      {
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-
-      /** @brief Returns the number of rows */
-      const size_type & size1() const { return rows_;}
-      /** @brief Returns the number of columns */
-      const size_type & size2() const { return columns_; }
-      
-      /** @brief Resets all entries to zero */
-      void clear()
-      {
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "clear");
-        viennacl::ocl::enqueue(k(elements_,
-                                 cl_uint(0), cl_uint(0),
-                                 cl_uint(size1()), cl_uint(size2()),
-                                 cl_uint(internal_size1()), cl_uint(internal_size2())
-                                )
-                              );
-      }
-      
-      
-      //const unsigned int row_stride() const { return roundUpToNextMultiple<unsigned int>(columns(), ALIGNMENT); }
-      /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
-      const size_type internal_size1() const { return F::internal_size1(size1(), ALIGNMENT); }
-      /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
-      const size_type internal_size2() const { return F::internal_size2(size2(), ALIGNMENT); }
-      /** @brief Returns the total amount of allocated memory in multiples of sizeof(SCALARTYPE) */
-      const size_type internal_size() const { return internal_size1() * internal_size2(); }
-      
-      /** @brief Returns the OpenCL handle */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
-      
-      #if defined(_MSC_VER) && _MSC_VER < 1500          //Visual Studio 2005 needs special treatment
-      template <typename CPU_MATRIX>
-      friend void copy(const CPU_MATRIX & cpu_matrix,
-                      matrix & gpu_matrix );
-      
-      template <typename SCALARTYPE2, typename A1, typename A2>
-      friend void copy(const std::vector< std::vector<SCALARTYPE2, A1>, A2> & cpu_matrix,
-                      matrix & gpu_matrix );
-      
-      template <typename SCALARTYPE2>
-      friend void fast_copy(SCALARTYPE2 * cpu_matrix_begin,
-                            SCALARTYPE2 * cpu_matrix_end,
-                            matrix & gpu_matrix);
-      
-      #ifdef VIENNACL_HAVE_EIGEN
-      friend void copy(const Eigen::MatrixXf & cpu_matrix,
-                       matrix & gpu_matrix);
-      
-      friend void copy(const Eigen::MatrixXd & cpu_matrix,
-                       matrix & gpu_matrix);
-      #endif
-      
-      #ifdef VIENNACL_HAVE_MTL4
-      template <typename SCALARTYPE2, typename T>
-      friend void copy(const mtl::dense2D<SCALARTYPE2, T>& cpu_matrix,
-                       matrix & gpu_matrix);
-      #endif
-      #else
-      template <typename CPU_MATRIX, typename SCALARTYPE2, typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const CPU_MATRIX & cpu_matrix,
-                      matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix );
-                      
-      template <typename SCALARTYPE2, typename A1, typename A2, typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const std::vector< std::vector<SCALARTYPE2, A1>, A2> & cpu_matrix,
-                       matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix );
-      
-      template <typename SCALARTYPE2, typename F2, unsigned int ALIGNMENT2>
-      friend void fast_copy(SCALARTYPE2 * cpu_matrix_begin,
-                            SCALARTYPE2 * cpu_matrix_end,
-                            matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix);
-      
-      #ifdef VIENNACL_HAVE_EIGEN
-      template <typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const Eigen::MatrixXf & cpu_matrix,
-                matrix<float, F2, ALIGNMENT2> & gpu_matrix);
-      
-      template <typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const Eigen::MatrixXd & cpu_matrix,
-                matrix<double, F2, ALIGNMENT2> & gpu_matrix);
-      #endif
-      
-      #ifdef VIENNACL_HAVE_MTL4
-      template <typename SCALARTYPE2, typename T, typename F2, unsigned int ALIGNMENT2>
-      friend void copy(const mtl::dense2D<SCALARTYPE2, T>& cpu_matrix,
-                       matrix<SCALARTYPE2, F2, ALIGNMENT2> & gpu_matrix);
-      #endif
-      #endif                 
-      
-    private:
-      size_type rows_;
-      size_type columns_;
-      viennacl::ocl::handle<cl_mem> elements_;
-    }; //matrix
-
-    /** @brief Prints the matrix. Output is compatible to boost::numeric::ublas
-    *
-    * @param s            STL output stream
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    std::ostream & operator<<(std::ostream & s, const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      std::vector<SCALARTYPE> tmp(gpu_matrix.internal_size());
-      cl_int err;
-      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &tmp[0], 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-      viennacl::ocl::get_queue().finish();
-      
-      s << "[" << gpu_matrix.size1() << "," << gpu_matrix.size2() << "]";
-      
-      s << "(";
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        s << "(";
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j)
-        {
-          s << tmp[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
-          if (j < gpu_matrix.size2() - 1)
-            s << ",";
-        }
-        s << ")";
-        if (i < gpu_matrix.size1() - 1)
-          s << ",";
-      }
-      s << ")";
-      return s;
-    }
-
-    /** @brief Prints the matrix. Output is compatible to boost::numeric::ublas
-    *
-    * @param s            STL output stream
-    * @param expr         A matrix expression
-    */
-    template<typename LHS, typename RHS, typename OP>
-    std::ostream & operator<<(std::ostream & s, const matrix_expression<LHS, RHS, OP> & expr)
-    {
-      typedef typename viennacl::tools::CPU_SCALAR_TYPE_DEDUCER< typename tools::CONST_REMOVER<LHS>::ResultType >::ResultType     ScalarType;
-
-      matrix<ScalarType> temp = expr;
-      s << temp;
-      return s;
-    }
-    
-    /** @brief Returns an expression template class representing a transposed matrix */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                       const matrix<SCALARTYPE, F, ALIGNMENT>,
-                       op_trans> trans(const matrix<SCALARTYPE, F, ALIGNMENT> & mat)
-    {
-      return matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                op_trans>(mat, mat);
-    }
-    
-    
-    /////////////////////// transfer operations: //////////////////////////////////////
-
-    //
-    //cpu to gpu, generic type:
-    //
-    /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense matrix on the host. Type requirements: .size1() returns number of rows, .size2() returns number of columns. Access to entries via operator()
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void copy(const CPU_MATRIX & cpu_matrix,
-              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      //std::cout << "Copying CPU_MATRIX!" << std::endl;
-      //std::cout << "Size at begin: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.size1(),
-                          cpu_matrix.size2(), false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == cpu_matrix.size1()) 
-               && (gpu_matrix.size2() == cpu_matrix.size2())
-              );
-      }
-
-      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-      //std::cout << "Size at end: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
-    }
-    
-    //
-    //cpu to gpu, STL type:
-    //
-    /** @brief Copies a dense STL-type matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense matrix on the host of type std::vector< std::vector<> >. cpu_matrix[i][j] returns the element in the i-th row and j-th columns (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
-    void copy(const std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix,
-              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.size(),
-                          cpu_matrix[0].size(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == cpu_matrix.size()) 
-               && (gpu_matrix.size2() == cpu_matrix[0].size())
-              );
-      }
-
-      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    
-    
-    //
-    //cpu to gpu, another STL type:
-    //
-    /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU) without temporary. Matrix-Layout on CPU must be equal to the matrix-layout on the GPU.
-    *
-    * @param cpu_matrix_begin   Pointer to the first matrix entry. Cf. iterator concept in STL
-    * @param cpu_matrix_end     Pointer past the last matrix entry. Cf. iterator concept in STL
-    * @param gpu_matrix         A dense ViennaCL matrix
-    */
-    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void fast_copy(SCALARTYPE * cpu_matrix_begin,
-                   SCALARTYPE * cpu_matrix_end,
-                   matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
-    {
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
-                                                                            sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin),
-                                                                            cpu_matrix_begin);
-    }
-    
-   
-    #ifdef VIENNACL_HAVE_EIGEN
-    /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense MTL matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th columns (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename F, unsigned int ALIGNMENT>
-    void copy(const Eigen::MatrixXf & cpu_matrix,
-              matrix<float, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.rows(),
-                          cpu_matrix.cols(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == static_cast<std::size_t>(cpu_matrix.rows())) 
-               && (gpu_matrix.size2() == static_cast<std::size_t>(cpu_matrix.cols()))
-              );
-      }
-
-      std::vector<float> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    
-    /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense MTL matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th columns (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename F, unsigned int ALIGNMENT>
-    void copy(const Eigen::MatrixXd & cpu_matrix,
-              matrix<double, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<double, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.rows(),
-                          cpu_matrix.cols(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == static_cast<std::size_t>(cpu_matrix.rows())) 
-               && (gpu_matrix.size2() == static_cast<std::size_t>(cpu_matrix.cols()))
-              );
-      }
-
-      std::vector<double> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    #endif
-    
-    #ifdef VIENNACL_HAVE_MTL4
-    /** @brief Copies a dense MTL matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
-    *
-    * @param cpu_matrix   A dense MTL matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th columns (both starting with zero)
-    * @param gpu_matrix   A dense ViennaCL matrix
-    */
-    template <typename SCALARTYPE, typename T, typename F, unsigned int ALIGNMENT>
-    void copy(const mtl::dense2D<SCALARTYPE, T>& cpu_matrix,
-              matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
-    {
-      typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
-      
-      if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
-      {
-        gpu_matrix.resize(cpu_matrix.num_rows(),
-                          cpu_matrix.num_cols(),
-                          false);
-      }
-      else
-      {
-        assert( (gpu_matrix.size1() == cpu_matrix.num_rows()) 
-               && (gpu_matrix.size2() == cpu_matrix.num_cols())
-              );
-      }
-
-      std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
-      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-      {
-        for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-          data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
-      }
-      
-      gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
-    }
-    #endif
-    
-    
-    
-    
-    //
-    //gpu to cpu, generic type
-    //
-    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
-    *
-    * @param gpu_matrix   A dense ViennaCL matrix
-    * @param cpu_matrix   A dense memory on the host. Must have at least as many rows and columns as the gpu_matrix! Type requirement: Access to entries via operator()
-    */
-    template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
-              CPU_MATRIX & cpu_matrix )
-    {
-      typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
-      
-      if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
-      {
-        std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        //now copy entries to cpu_matrix:
-        for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-          for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-            cpu_matrix(i,j) = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
-      }
-    }
-
-    //gpu to cpu, STL type
-    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
-    *
-    * @param gpu_matrix   A dense ViennaCL matrix
-    * @param cpu_matrix   A dense memory on the host using STL types, typically std::vector< std::vector<> > Must have at least as many rows and columns as the gpu_matrix! Type requirement: Access to entries via operator()
-    */
-    template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
-    void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
-              std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix)
-    {
-      typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
-      
-      if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) 
-         && (cpu_matrix.size() >= gpu_matrix.size1()) && (cpu_matrix[0].size() >= gpu_matrix.size2()))
-      {
-        std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        //now copy entries to cpu_matrix:
-        for (size_type i = 0; i < gpu_matrix.size1(); ++i)
-          for (size_type j = 0; j < gpu_matrix.size2(); ++j) 
-            cpu_matrix[i][j] = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
-      }
-    }
-
-    //gpu to cpu, STL type
-    /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU). 
-    *
-    * @param gpu_matrix         A dense ViennaCL matrix
-    * @param cpu_matrix_begin   Pointer to the output memory on the CPU. User must ensure that provided memory is large enough.
-    */
-    template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void fast_copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
-                   SCALARTYPE * cpu_matrix_begin)
-    {
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                       gpu_matrix.handle().get(), 
-                                       CL_TRUE, 0,
-                                       sizeof(SCALARTYPE)*gpu_matrix.internal_size(),
-                                       cpu_matrix_begin, 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-    }
-
-
-
-
-
-
-
-
-
-    // outer_prod(v1, v2) * val;
-    template<typename CPU_SCALAR, typename SCALARTYPE,unsigned int VECTOR_ALIGNMENT>
-    viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                    op_prod>,
-                                 const SCALARTYPE,
-                                 op_prod>  operator*(const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                                        const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                                        op_prod> & proxy,
-                                                     CPU_SCALAR const & val)
-    {
-      return viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                             const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
-                                                                             op_prod>,
-                                          const SCALARTYPE,
-                                          op_prod>(proxy, static_cast<SCALARTYPE>(val));
-    }
-
-    // val * outer_prod(v1, v2);
-    template <typename CPU_SCALAR, typename SCALARTYPE, unsigned int VA1, unsigned int VA2>
-    viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                                                    const viennacl::vector<SCALARTYPE, VA2>,
-                                                                    op_prod>,
-                                 const SCALARTYPE,
-                                 op_prod>  operator*(CPU_SCALAR const & val,
-                                                     viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                                                                  const viennacl::vector<SCALARTYPE, VA2>,
-                                                                                  op_prod> const & proxy)
-    {
-      return viennacl::matrix_expression< const viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
-                                                                             const viennacl::vector<SCALARTYPE, VA2>,
-                                                                             op_prod>,
-                                          const SCALARTYPE,
-                                          op_prod>(proxy, static_cast<SCALARTYPE>(val));
-    }
-    
-   
-
-} //namespace viennacl
-
-#endif
+#ifndef VIENNACL_MATRIX_HPP_
+#define VIENNACL_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/matrix.hpp
+    @brief Implementation of the dense matrix class
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/matrix_size_deducer.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/enable_if.hpp"
+//#include "viennacl/rand/utils.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+  /** @brief Base class for representing matrices where the individual entries are not all stored explicitly, e.g. identity_matrix<>
+    *
+    * Examples are identity_matrix, scalar_matrix, and zero_matrix.
+    */
+  template<typename SCALARTYPE>
+  class implicit_matrix_base
+  {
+    protected:
+      typedef vcl_size_t        size_type;
+      implicit_matrix_base(size_type size1, size_type size2, std::pair<SCALARTYPE, bool> value, bool diag) : size1_(size1), size2_(size2), value_(value), diag_(diag){ }
+    public:
+      typedef SCALARTYPE const & const_reference;
+      typedef SCALARTYPE cpu_value_type;
+
+      size_type size1() const { return size1_; }
+      size_type size2() const { return size2_; }
+
+      SCALARTYPE  value() const { return value_.first; }
+      bool is_value_static( ) const { return value_.second; }
+      bool diag() const { return diag_; }
+
+      const_reference operator()(size_type i, size_type j) const {
+        if(diag_) return (i == j) ? value_.first : 0;
+        return value_.first;
+      }
+
+    protected:
+      size_type size1_;
+      size_type size2_;
+      std::pair<SCALARTYPE, bool> value_;
+      bool diag_;
+  };
+
+  //
+  // Initializer types
+  //
+  /** @brief Represents a vector consisting of 1 at a given index and zeros otherwise. To be used as an initializer for viennacl::vector, vector_range, or vector_slize only. */
+  template <typename SCALARTYPE>
+  class identity_matrix
+  {
+    public:
+      typedef vcl_size_t         size_type;
+      typedef SCALARTYPE const & const_reference;
+
+      identity_matrix(size_type s, viennacl::context ctx = viennacl::context()) : size_(s), diag_(1), off_diag_(0), ctx_(ctx) {}
+
+      size_type size1() const { return size_; }
+      size_type size2() const { return size_; }
+      const_reference operator()(size_type i, size_type j) const { return (i == j) ? diag_ : off_diag_; }
+
+      viennacl::context context() const { return ctx_; }
+
+    private:
+      size_type size_;
+      SCALARTYPE diag_;
+      SCALARTYPE off_diag_;
+      viennacl::context ctx_;
+  };
+
+
+  /** @brief Represents a vector consisting of zeros only. To be used as an initializer for viennacl::vector, vector_range, or vector_slize only. */
+  template <typename SCALARTYPE>
+  class zero_matrix
+  {
+    public:
+      typedef vcl_size_t         size_type;
+      typedef SCALARTYPE const & const_reference;
+
+      zero_matrix(size_type s1, size_type s2, viennacl::context ctx = viennacl::context()) : size1_(s1), size2_(s2), val_(0), ctx_(ctx) {}
+
+      size_type size1() const { return size1_; }
+      size_type size2() const { return size2_; }
+      const_reference operator()(size_type /*i*/, size_type /*j*/) const { return val_; }
+
+      viennacl::context context() const { return ctx_; }
+
+    private:
+      size_type size1_;
+      size_type size2_;
+      SCALARTYPE val_;
+      viennacl::context ctx_;
+  };
+
+
+  /** @brief Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for viennacl::vector, vector_range, or vector_slize only. */
+  template <typename SCALARTYPE>
+  class scalar_matrix
+  {
+    public:
+      typedef vcl_size_t         size_type;
+      typedef SCALARTYPE const & const_reference;
+
+      scalar_matrix(size_type s1, size_type s2, const_reference val, viennacl::context ctx = viennacl::context()) : size1_(s1), size2_(s2), value_(val), ctx_(ctx) {}
+
+      size_type size1() const { return size1_; }
+      size_type size2() const { return size2_; }
+      const_reference operator()(size_type /*i*/, size_type /*j*/) const { return value_; }
+
+      viennacl::context context() const { return ctx_; }
+
+    private:
+      size_type size1_;
+      size_type size2_;
+      SCALARTYPE value_;
+      viennacl::context ctx_;
+  };
+
+
+
+//#ifdef VIENNACL_WITH_OPENCL
+//  template<class SCALARTYPE, class DISTRIBUTION>
+//  rand::random_matrix_t<SCALARTYPE, DISTRIBUTION> random_matrix(unsigned int size1, unsigned int size2, DISTRIBUTION const & distribution){
+//      return rand::random_matrix_t<SCALARTYPE,DISTRIBUTION>(size1,size2,distribution);
+//  }
+//#endif
+
+  /** @brief Expression template class for representing a tree of expressions which ultimately result in a matrix.
+    *
+    * @tparam LHS   The left hand side of the expression tree
+    * @tparam RHS   The right hand side of the expression tree
+    * @tparam OP    The operator to apply to LHS and RHS to obtain the result.
+    */
+  template <typename LHS, typename RHS, typename OP>
+  class matrix_expression
+  {
+      typedef typename viennacl::result_of::reference_if_nonscalar<LHS>::type     lhs_reference_type;
+      typedef typename viennacl::result_of::reference_if_nonscalar<RHS>::type     rhs_reference_type;
+
+    public:
+      typedef vcl_size_t       size_type;
+
+      matrix_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+      /** @brief Get left hand side operand
+      */
+      LHS & lhs() const { return lhs_; }
+      /** @brief Get right hand side operand
+      */
+      RHS & rhs() const { return rhs_; }
+
+      /** @brief Returns the size of the result vector */
+      vcl_size_t size1() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size1(lhs_, rhs_); }
+      vcl_size_t size2() const { return viennacl::tools::MATRIX_SIZE_DEDUCER<LHS, RHS, OP>::size2(lhs_, rhs_); }
+
+    private:
+      /** @brief The left hand side operand */
+      lhs_reference_type lhs_;
+      /** @brief The right hand side operand */
+      rhs_reference_type rhs_;
+  };
+
+
+  /** @brief A tag indicating iteration along increasing row index of a matrix */
+  struct row_iteration {};
+
+  /** @brief A tag indicating iteration along increasing columns index of a matrix */
+  struct col_iteration {};
+
+  //STL-like iterator. TODO: STL-compliance...
+  /** @brief uBLAS-like iterator class for iterating over the entries of a dense matrix. */
+  template <typename ROWCOL, typename MATRIXTYPE>
+  class matrix_iterator
+  {
+      typedef matrix_iterator<ROWCOL, MATRIXTYPE>    self_type;
+    public:
+      typedef typename MATRIXTYPE::value_type       value_type;
+
+      matrix_iterator(MATRIXTYPE & mat,
+                      vcl_size_t start_row,
+                      vcl_size_t start_col) : mat_(mat), row_(start_row), col_(start_col) {}
+
+      value_type operator*(void) { return mat_(row_, col_); }
+      self_type & operator++(void) { viennacl::tools::MATRIX_ITERATOR_INCREMENTER<ROWCOL, MATRIXTYPE>::apply(mat_, row_, col_); return *this; }
+      self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
+
+      bool operator==(self_type const & other) { return (row_ == other.row_) && (col_ == other.col_); }
+      bool operator!=(self_type const & other) { return !(*this == other); }
+
+      vcl_size_t index1() { return row_; }
+      vcl_size_t index2() { return col_; }
+
+      MATRIXTYPE & operator()(void) const { return mat_; }
+
+    private:
+      MATRIXTYPE & mat_;
+      vcl_size_t row_;
+      vcl_size_t col_;
+  };
+
+
  /** @brief Base class for dense matrix types; common implementation for matrix, matrix_range and matrix_slice
  *
  * The underlying buffer is padded: the internal number of rows and columns is rounded up to the next multiple of 'alignment'.
  *
  * @tparam SCALARTYPE    The underlying scalar type (either float or double)
  * @tparam F             Storage layout: Either row_major or column_major
  * @tparam SizeType      Integer type used for sizes and offsets (see forwards.h for the default type)
  * @tparam DistanceType  Integer type used for strides (see forwards.h for the default type)
  */
  template <class SCALARTYPE, typename F, typename SizeType /* see forwards.h for default type */, typename DistanceType /* see forwards.h for default type */>
  class matrix_base
  {
      typedef matrix_base<SCALARTYPE, F, SizeType, DistanceType>          self_type;
    public:

      typedef matrix_iterator<row_iteration, self_type >   iterator1;
      typedef matrix_iterator<col_iteration, self_type >   iterator2;
      typedef scalar<SCALARTYPE>                                                  value_type;
      typedef SCALARTYPE                                                          cpu_value_type;
      typedef SizeType                                                            size_type;
      typedef DistanceType                                                        difference_type;
      typedef viennacl::backend::mem_handle                                       handle_type;
      typedef F                                                                   orientation_functor;
      typedef typename F::orientation_category                                    orientation_category;

      /** @brief Row and column counts are rounded up to a multiple of this value when allocating the buffer */
      static const size_type alignment = 128;


      /** @brief The default constructor. Does not allocate any memory. */
      explicit matrix_base() : size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0) {}

      /** @brief Creates the matrix with the given dimensions and zero-initializes all entries (including padding)
      *
      * @param rows     Number of rows
      * @param columns  Number of columns
      * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
      */
      explicit matrix_base(size_type rows, size_type columns, viennacl::context ctx = viennacl::context())
          : size1_(rows), size2_(columns), start1_(0), start2_(0), stride1_(1), stride2_(1),
            internal_size1_(viennacl::tools::align_to_multiple<size_type>(rows, alignment)),
            internal_size2_(viennacl::tools::align_to_multiple<size_type>(columns, alignment))
      {
        if (rows > 0 && columns > 0)
        {
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), ctx);
          clear();
        }
      }


      /** @brief Constructor for creating a matrix_range or matrix_stride from some other matrix/matrix_range/matrix_stride.
      *          Shares the memory handle h; no allocation or copy takes place. */
      explicit matrix_base(viennacl::backend::mem_handle & h,
                           size_type mat_size1, size_type mat_start1, difference_type mat_stride1, size_type mat_internal_size1,
                           size_type mat_size2, size_type mat_start2, difference_type mat_stride2, size_type mat_internal_size2)
        : size1_(mat_size1), size2_(mat_size2),
          start1_(mat_start1), start2_(mat_start2),
          stride1_(mat_stride1), stride2_(mat_stride2),
          internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2),
          elements_(h) {}

      /** @brief Creates the matrix from a matrix expression: allocates memory in the expression's context and evaluates the expression into it */
      template <typename LHS, typename RHS, typename OP>
      explicit matrix_base(matrix_expression<const LHS, const RHS, OP> const & proxy) :
        size1_(viennacl::traits::size1(proxy)), size2_(viennacl::traits::size2(proxy)), start1_(0), start2_(0), stride1_(1), stride2_(1),
        internal_size1_(viennacl::tools::align_to_multiple<size_type>(size1_, alignment)),
        internal_size2_(viennacl::tools::align_to_multiple<size_type>(size2_, alignment))
      {
        elements_.switch_active_handle_id(viennacl::traits::active_handle_id(proxy));
        if (internal_size() > 0)
        {
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
          clear();
          self_type::operator=(proxy);
        }
      }

      // CUDA or host memory:
      /** @brief Wraps user-provided CUDA or host memory; the handle's reference count is incremented so the memory is not freed on destruction */
      explicit matrix_base(SCALARTYPE * ptr_to_mem, viennacl::memory_types mem_type,
                           size_type mat_size1, size_type mat_start1, difference_type mat_stride1, size_type mat_internal_size1,
                           size_type mat_size2, size_type mat_start2, difference_type mat_stride2, size_type mat_internal_size2)
        : size1_(mat_size1), size2_(mat_size2),
          start1_(mat_start1), start2_(mat_start2),
          stride1_(mat_stride1), stride2_(mat_stride2),
          internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2)
      {
        if (mem_type == viennacl::CUDA_MEMORY)
        {
#ifdef VIENNACL_WITH_CUDA
          elements_.switch_active_handle_id(viennacl::CUDA_MEMORY);
          elements_.cuda_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
          elements_.cuda_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed.
#else
          throw cuda_not_available_exception();
#endif
        }
        else if (mem_type == viennacl::MAIN_MEMORY)
        {
          elements_.switch_active_handle_id(viennacl::MAIN_MEMORY);
          elements_.ram_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
          elements_.ram_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed.
        }

        elements_.raw_size(sizeof(SCALARTYPE) * internal_size());
      }

#ifdef VIENNACL_WITH_OPENCL
      /** @brief Wraps a user-provided OpenCL buffer as a matrix (no padding: internal sizes equal the logical sizes) */
      explicit matrix_base(cl_mem mem, size_type rows, size_type columns, viennacl::context ctx = viennacl::context())
        : size1_(rows), size2_(columns),
          start1_(0), start2_(0),
          stride1_(1), stride2_(1),
          internal_size1_(rows), internal_size2_(columns)
      {
        elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
        elements_.opencl_handle() = mem;
        elements_.opencl_handle().inc();  //prevents that the user-provided memory is deleted once the vector object is destroyed.
        elements_.opencl_handle().context(ctx.opencl_context());
        elements_.raw_size(sizeof(SCALARTYPE)*internal_size());
      }

      /** @brief Wraps a user-provided OpenCL buffer with explicit start/stride/internal-size layout parameters */
      explicit matrix_base(cl_mem mem, viennacl::context ctx,
                           size_type mat_size1, size_type mat_start1, difference_type mat_stride1, size_type mat_internal_size1,
                           size_type mat_size2, size_type mat_start2, difference_type mat_stride2, size_type mat_internal_size2)
        : size1_(mat_size1), size2_(mat_size2),
          start1_(mat_start1), start2_(mat_start2),
          stride1_(mat_stride1), stride2_(mat_stride2),
          internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2)
      {
        elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
        elements_.opencl_handle() = mem;
        elements_.opencl_handle().inc();  //prevents that the user-provided memory is deleted once the vector object is destroyed.
        elements_.opencl_handle().context(ctx.opencl_context());
        elements_.raw_size(sizeof(SCALARTYPE)*internal_size());
      }
#endif


      /** @brief Assigns another matrix to this matrix. An empty left hand side is resized to match first. */
      self_type & operator=(const self_type & other)  //enables implicit conversions
      {
        if (internal_size() == 0)
        {
          if (other.internal_size() == 0)
            return *this;
          resize(other.size1(), other.size2(), false);
        }

        // delegate the actual copy to the backend: *this = 1.0 * other
        viennacl::linalg::am(*this,
                             other, cpu_value_type(1.0), 1, false, false);
        return *this;
      }

      /** @brief Creates the matrix from the supplied random matrix. */
      /*template<class DISTRIBUTION>
      matrix(rand::random_matrix_t<SCALARTYPE, DISTRIBUTION> const & m) : rows_(m.size1), columns_(m.size2)
      {
        if (internal_size() > 0)
        {
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size());
          rand::buffer_dumper<SCALARTYPE, DISTRIBUTION>::dump(elements_,m.distribution,0,internal_size());
        }
      }*/



      /** @brief Implementation of the operation m1 = m2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
      *
      * An empty left hand side is allocated to the proxy's dimensions; padding is zeroed before the assignment.
      *
      * @param proxy  An expression template proxy class.
      */
      template <typename LHS, typename RHS, typename OP>
      self_type & operator=(const matrix_expression<const LHS, const RHS, OP> & proxy)
      {
        assert(  (viennacl::traits::size1(proxy) == size1() || size1() == 0)
              && (viennacl::traits::size2(proxy) == size2() || size2() == 0)
              && bool("Incompatible matrix sizes!"));

        if (internal_size() == 0 && viennacl::traits::size1(proxy) > 0 && viennacl::traits::size2(proxy) > 0)
        {
          size1_ = viennacl::traits::size1(proxy);
          size2_ = viennacl::traits::size2(proxy);
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
          if (size1_ != internal_size1_ || size2_ != internal_size2_)
            clear();
        }

        if (internal_size() > 0)
          linalg::detail::op_executor<self_type, op_assign, matrix_expression<const LHS, const RHS, OP> >::apply(*this, proxy);

        return *this;
      }


      // A = trans(B). Currently achieved in CPU memory
      self_type & operator=(const matrix_expression< const self_type,
                                                     const self_type,
                                                     op_trans> & proxy)
      {
        assert( (handle() != proxy.lhs().handle()) && bool("Self-assignment of matrix transpose not implemented"));
        assert( ( (proxy.lhs().size1() == size2()) || (size2() == 0) ) && bool("Matrix dimensions do not match!"));
        assert( ( (proxy.lhs().size2() == size1()) || (size1() == 0) ) && bool("Matrix dimensions do not match!"));

        if (internal_size() == 0 && viennacl::traits::size1(proxy) > 0 && viennacl::traits::size2(proxy) > 0)
        {
          size1_ = viennacl::traits::size1(proxy);
          size2_ = viennacl::traits::size2(proxy);
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
        }

        // read the full (padded) source buffer to host memory:
        std::vector<SCALARTYPE> temp(proxy.lhs().internal_size());

        viennacl::backend::memory_read(proxy.lhs().handle(), 0, sizeof(SCALARTYPE)*proxy.lhs().internal_size(), &(temp[0]));

        // now transpose it
        std::vector<SCALARTYPE> temp_trans(internal_size());

        for (vcl_size_t i=0; i<proxy.lhs().size1(); ++i)
          for (vcl_size_t j=0; j<proxy.lhs().size2(); ++j)
            temp_trans[F::mem_index(start2() + stride2() * j,
                                    start1() + stride1() * i,
                                    internal_size1(), internal_size2())]
              = temp[F::mem_index(proxy.lhs().start1() + proxy.lhs().stride1() * i,
                                  proxy.lhs().start2() + proxy.lhs().stride2() * j,
                                  proxy.lhs().internal_size1(), proxy.lhs().internal_size2())];

        // write back
        viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy), &(temp_trans[0]));

        return *this;
      }

      /** @brief Adds a matrix expression to this matrix in place */
      template <typename LHS, typename RHS, typename OP>
      self_type & operator+=(const matrix_expression<const LHS, const RHS, OP> & proxy)
      {
        assert(  (viennacl::traits::size1(proxy) == size1())
              && (viennacl::traits::size2(proxy) == size2())
              && bool("Incompatible matrix sizes!"));
        // NOTE(review): the assertion messages below say "Vector" although this is a matrix (copy/paste artifact)
        assert( (size1() > 0) && bool("Vector not yet initialized!") );
        assert( (size2() > 0) && bool("Vector not yet initialized!") );

        linalg::detail::op_executor<self_type, op_inplace_add, matrix_expression<const LHS, const RHS, OP> >::apply(*this, proxy);

        return *this;
      }

      /** @brief Subtracts a matrix expression from this matrix in place */
      template <typename LHS, typename RHS, typename OP>
      self_type & operator-=(const matrix_expression<const LHS, const RHS, OP> & proxy)
      {
        assert(  (viennacl::traits::size1(proxy) == size1())
              && (viennacl::traits::size2(proxy) == size2())
              && bool("Incompatible matrix sizes!"));
        // NOTE(review): the assertion messages below say "Vector" although this is a matrix (copy/paste artifact)
        assert( (size1() > 0) && bool("Vector not yet initialized!") );
        assert( (size2() > 0) && bool("Vector not yet initialized!") );

        linalg::detail::op_executor<self_type, op_inplace_sub, matrix_expression<const LHS, const RHS, OP> >::apply(*this, proxy);

        return *this;
      }

      /** @brief Assigns the supplied identity matrix to the matrix. An empty left hand side is allocated to match. */
      self_type & operator = (identity_matrix<SCALARTYPE> const & m)
      {
        assert( (m.size1() == size1_ || size1_ == 0) && bool("Size mismatch!") );
        assert( (m.size2() == size2_ || size2_ == 0) && bool("Size mismatch!") );

        if (internal_size() == 0)
        {
          size1_ = m.size1();
          size2_ = m.size2();
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
          if (internal_size() > 0)
          {
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), m.context());
            clear();
          }
        }
        else
          viennacl::linalg::matrix_assign(*this, SCALARTYPE(0));

        if (internal_size() > 0)
          viennacl::linalg::matrix_diagonal_assign(*this, m(0,0));

        return *this;
      }

      /** @brief Assigns the supplied zero matrix to the matrix. An empty left hand side is allocated to match. */
      self_type & operator = (zero_matrix<SCALARTYPE> const & m)
      {
        assert( (m.size1() == size1_ || size1_ == 0) && bool("Size mismatch!") );
        assert( (m.size2() == size2_ || size2_ == 0) && bool("Size mismatch!") );

        if (internal_size() == 0)
        {
          size1_ = m.size1();
          size2_ = m.size2();
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
          if (internal_size() > 0)
          {
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), m.context());
            clear();
          }
        }
        else
          viennacl::linalg::matrix_assign(*this, SCALARTYPE(0));

        return *this;
      }

      /** @brief Assigns the supplied scalar matrix (all entries equal) to the matrix. An empty left hand side is allocated to match. */
      self_type & operator = (scalar_matrix<SCALARTYPE> const & m)
      {
        assert( (m.size1() == size1_ || size1_ == 0) && bool("Size mismatch!") );
        assert( (m.size2() == size2_ || size2_ == 0) && bool("Size mismatch!") );

        if (internal_size() == 0)
        {
          size1_ = m.size1();
          size2_ = m.size2();
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
          if (internal_size() > 0)
          {
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), m.context());
            clear();
          }
        }

        if (internal_size() > 0)
        {
          viennacl::linalg::matrix_assign(*this, m(0,0));
        }

        return *this;
      }


      //read-write access to an element of the matrix/matrix_range/matrix_slice
      /** @brief Read-write access to a single element of the matrix/matrix_range/matrix_slice
      */
      entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index)
      {
        return entry_proxy<SCALARTYPE>(F::mem_index(start1_ + stride1_ * row_index, start2_ + stride2_ * col_index, internal_size1(), internal_size2()), elements_);
      }

      /** @brief Read access to a single element of the matrix/matrix_range/matrix_slice
      */
      const_entry_proxy<SCALARTYPE> operator()(size_type row_index, size_type col_index) const
      {
        return const_entry_proxy<SCALARTYPE>(F::mem_index(start1_ + stride1_ * row_index, start2_ + stride2_ * col_index, internal_size1(), internal_size2()), elements_);
      }

      //
      // Operator overloads for enabling implicit conversions:
      //
      /** @brief In-place addition: *this = 1.0 * (*this) + 1.0 * other */
      self_type & operator += (const self_type & other)
      {
        viennacl::linalg::ambm(*this,
                                *this, SCALARTYPE(1.0), 1, false, false,
                                other, SCALARTYPE(1.0), 1, false, false);
        return *this;
      }

      /** @brief In-place subtraction (the trailing 'true' presumably negates the second operand — cf. ambm) */
      self_type & operator -= (const self_type & other)
      {
        viennacl::linalg::ambm(*this,
                                *this, SCALARTYPE(1.0), 1, false, false,
                                other, SCALARTYPE(1.0), 1, false, true);
        return *this;
      }

      /** @brief Scales a matrix by a CPU scalar value
      */
      self_type & operator *= (SCALARTYPE val)
      {
        //viennacl::linalg::inplace_mult(*this, val);
        viennacl::linalg::am(*this,
                              *this, val, 1, false, false);
        return *this;
      }

      /** @brief Scales this matrix by a CPU scalar value (the 'true' flag presumably selects division — cf. am)
      */
      self_type & operator /= (SCALARTYPE val)
      {
        //viennacl::linalg::inplace_mult(*this, static_cast<SCALARTYPE>(1) / val);
        viennacl::linalg::am(*this,
                              *this, val, 1, true, false);
        return *this;
      }


      /** @brief Sign flip for the matrix. Emulated to be equivalent to -1.0 * matrix */
      matrix_expression<const self_type, const SCALARTYPE, op_mult> operator-() const
      {
        return matrix_expression<const self_type, const SCALARTYPE, op_mult>(*this, SCALARTYPE(-1));
      }

      /** @brief Returns the number of rows */
      size_type size1() const { return size1_;}
      /** @brief Returns the number of columns */
      size_type size2() const { return size2_; }

      /** @brief Returns the row offset of the first entry within the underlying buffer (nonzero e.g. for a matrix_range) */
      size_type start1() const { return start1_;}
      /** @brief Returns the column offset of the first entry within the underlying buffer (nonzero e.g. for a matrix_range) */
      size_type start2() const { return start2_; }

      /** @brief Returns the stride between consecutive rows in the underlying buffer (larger than 1 e.g. for a matrix_slice) */
      size_type stride1() const { return stride1_;}
      /** @brief Returns the stride between consecutive columns in the underlying buffer (larger than 1 e.g. for a matrix_slice) */
      size_type stride2() const { return stride2_; }

      /** @brief Resets all entries to zero */
      void clear()
      {
        // trailing 'true' presumably also clears the padding area — cf. matrix_assign
        viennacl::linalg::matrix_assign(*this, SCALARTYPE(0), true);
      }


      /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
      size_type internal_size1() const { return internal_size1_; }
      /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
      size_type internal_size2() const { return internal_size2_; }
      /** @brief Returns the total amount of allocated memory in multiples of sizeof(SCALARTYPE) */
      size_type internal_size() const { return internal_size1() * internal_size2(); }

      /** @brief Returns the memory handle, non-const-version */
            handle_type & handle()       { return elements_; }
      /** @brief Returns the memory handle, const-version */
      const handle_type & handle() const { return elements_; }


      /** @brief Returns the memory domain (OpenCL, CUDA, or main memory) in which the matrix currently resides */
      viennacl::memory_types memory_domain() const
      {
        return elements_.get_active_handle_id();
      }

    protected:

      /** @brief Replaces the memory handle of this matrix (for use by derived proxy classes) */
      void set_handle(viennacl::backend::mem_handle const & h)
      {
        elements_ = h;
      }

      /** @brief Migrates the matrix data to another memory context (e.g. OpenCL to host) */
      void switch_memory_context(viennacl::context new_ctx)
      {
        viennacl::backend::switch_memory_context<SCALARTYPE>(elements_, new_ctx);
      }


      /** @brief Resizes the matrix.
      *   Existing entries can optionally be preserved; entries outside the preserved region are zero.
      *
      * @param rows       New number of rows
      * @param columns    New number of columns
      * @param preserve   If true, existing values are preserved.
      */
      void resize(size_type rows, size_type columns, bool preserve = true)
      {
        assert( (rows > 0 && columns > 0) && bool("Check failed in matrix::resize(): Number of rows and columns must be positive!"));

        if (preserve && internal_size() > 0)
        {
          //get old entries:
          std::vector< SCALARTYPE > old_entries(internal_size());
          viennacl::backend::memory_read(elements_, 0, sizeof(SCALARTYPE)*internal_size(), &(old_entries[0]));

          //set up entries of new matrix:
          std::vector< SCALARTYPE > new_entries(  viennacl::tools::align_to_multiple<vcl_size_t>(rows,    alignment)
                                                * viennacl::tools::align_to_multiple<vcl_size_t>(columns, alignment));
          for (size_type i=0; i<rows; ++i)
          {
            if (i >= size1_)
              continue;

            for (size_type j=0; j<columns; ++j)
            {
              if (j >= size2_)
                continue;
              new_entries[F::mem_index(i, j, viennacl::tools::align_to_multiple<vcl_size_t>(rows, alignment), viennacl::tools::align_to_multiple<vcl_size_t>(columns, alignment))]
                  = old_entries[F::mem_index(i, j, internal_size1(), internal_size2())];
            }
          }

          //copy new entries to GPU:
          size1_ = rows;
          size2_ = columns;
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*new_entries.size(), viennacl::traits::context(elements_), &(new_entries[0]));
        }
        else //discard old entries:
        {
          size1_ = rows;
          size2_ = columns;
          internal_size1_ = viennacl::tools::align_to_multiple<size_type>(size1_, alignment);
          internal_size2_ = viennacl::tools::align_to_multiple<size_type>(size2_, alignment);

          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(elements_));
          clear();
        }
      }

    private:
      size_type size1_;           // logical number of rows
      size_type size2_;           // logical number of columns
      size_type start1_;          // row offset into the buffer
      size_type start2_;          // column offset into the buffer
      difference_type stride1_;   // stride between rows
      difference_type stride2_;   // stride between columns
      size_type internal_size1_;  // padded number of rows
      size_type internal_size2_;  // padded number of columns
      handle_type elements_;      // backend memory handle (OpenCL/CUDA/host)
  }; //matrix
+
+
+
  /** @brief A dense matrix class
  *
  * All functionality is inherited from matrix_base; this class mainly supplies the convenience constructors.
  *
  * @tparam SCALARTYPE   The underlying scalar type (either float or double)
  * @tparam F            Storage layout: Either row_major or column_major (at present only row_major is supported)
  * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16, higher values are usually a waste of memory.
  */
  template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
  class matrix : public matrix_base<SCALARTYPE, F>
  {
      typedef matrix<SCALARTYPE, F, ALIGNMENT>          self_type;
      typedef matrix_base<SCALARTYPE, F>                base_type;
    public:
      typedef typename base_type::size_type             size_type;

      /** @brief The default constructor. Does not allocate any memory. */
      explicit matrix() : base_type() {}

      /** @brief Creates the matrix with the given dimensions
      *
      * @param rows     Number of rows
      * @param columns  Number of columns
      * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
      */
      explicit matrix(size_type rows, size_type columns, viennacl::context ctx = viennacl::context()) : base_type(rows, columns, ctx) {}

#ifdef VIENNACL_WITH_OPENCL
      /** @brief Wraps an existing OpenCL buffer as a matrix (see matrix_base for reference-count semantics) */
      explicit matrix(cl_mem mem, size_type rows, size_type columns) : base_type(mem, rows, columns) {}
#endif

      /** @brief Creates the matrix from a matrix expression (allocation and evaluation happen in the base class) */
      template <typename LHS, typename RHS, typename OP>
      matrix(matrix_expression< LHS, RHS, OP> const & proxy) : base_type(proxy) {}

      /** @brief Creates the matrix from the supplied identity matrix. */
      matrix(identity_matrix<SCALARTYPE> const & m) : base_type(m.size1(), m.size2(), m.context())
      {
        if (base_type::internal_size() > 0)
          base_type::operator=(m);
      }

      /** @brief Creates the matrix from the supplied zero matrix. */
      matrix(zero_matrix<SCALARTYPE> const & m) : base_type(m.size1(), m.size2(), m.context())
      {
        if (base_type::internal_size() > 0)
          base_type::operator=(m);
      }

      /** @brief Creates the matrix from the supplied scalar matrix. */
      matrix(scalar_matrix<SCALARTYPE> const & m) : base_type(m.size1(), m.size2(), m.context())
      {
        if (base_type::internal_size() > 0)
          base_type::operator=(m);
      }

      /** @brief Creates the matrix as a copy of another matrix_base (allocate, then assign) */
      matrix(const base_type & other) : base_type(other.size1(), other.size2(), viennacl::traits::context(other))
      {
        base_type::operator=(other);
      }


      //copy constructor:
      matrix(const self_type & other) : base_type(other.size1(), other.size2(), viennacl::traits::context(other))
      {
        base_type::operator=(other);
      }


      /*template <typename M1>
      self_type & operator=(const matrix_expression< const M1, const M1, op_trans> & proxy)
      {
        self_type temp(proxy.lhs());
        *this = trans(temp);
        return *this;
      }*/

      // make all assignment operators of the base class visible alongside the constructors above:
      using base_type::operator=;

      /** @brief Resizes the matrix.
      *   Existing entries can optionally be preserved
      *
      * @param rows       New number of rows
      * @param columns    New number of columns
      * @param preserve   If true, existing values are preserved.
      */
      void resize(size_type rows, size_type columns, bool preserve = true)
      {
        // re-exposes the protected base-class implementation as public API
        base_type::resize(rows, columns, preserve);
      }

  }; //matrix
+
+
+
+  /** @brief Prints the matrix. Output is compatible to boost::numeric::ublas
+  *
+  * @param s            STL output stream
+  * @param gpu_matrix   A dense ViennaCL matrix
+  */
+  template<class SCALARTYPE, typename F>
+  std::ostream & operator<<(std::ostream & s, const matrix_base<SCALARTYPE, F> & gpu_matrix)
+  {
+    typedef typename matrix_base<SCALARTYPE, F>::size_type      size_type;
+
+    std::vector<SCALARTYPE> tmp(gpu_matrix.internal_size());
+    viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &(tmp[0]));
+
+    s << "[" << gpu_matrix.size1() << "," << gpu_matrix.size2() << "]";
+
+    s << "(";
+    for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+    {
+      s << "(";
+      for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+      {
+        s << tmp[F::mem_index(i * gpu_matrix.stride1() + gpu_matrix.start1(), j * gpu_matrix.stride2() + gpu_matrix.start2(), gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
+        if (j < gpu_matrix.size2() - 1)
+          s << ",";
+      }
+      s << ")";
+      if (i < gpu_matrix.size1() - 1)
+        s << ",";
+    }
+    s << ")";
+    return s;
+  }
+
+  /** @brief Prints the matrix. Output is compatible to boost::numeric::ublas
+  *
+  * @param s            STL output stream
+  * @param expr         A matrix expression
+  */
+  template<typename LHS, typename RHS, typename OP>
+  std::ostream & operator<<(std::ostream & s, const matrix_expression<LHS, RHS, OP> & expr)
+  {
+    typedef typename viennacl::tools::CPU_SCALAR_TYPE_DEDUCER< typename tools::CONST_REMOVER<LHS>::ResultType >::ResultType     ScalarType;
+
+    matrix<ScalarType> temp = expr;
+    s << temp;
+    return s;
+  }
+
+  /** @brief Returns an expression template class representing a transposed matrix */
+  template<typename NumericT, typename F>
+  matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>
+  trans(const matrix_base<NumericT, F> & mat)
+  {
+    return matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>(mat, mat);
+  }
+
+  //diag():
+  template<typename NumericT, typename F>
+  vector_expression< const matrix_base<NumericT, F>, const int, op_matrix_diag>
+  diag(const matrix_base<NumericT, F> & A, int k = 0)
+  {
+    return vector_expression< const matrix_base<NumericT, F>, const int, op_matrix_diag>(A, k);
+  }
+
+  template<typename NumericT>
+  matrix_expression< const vector_base<NumericT>, const int, op_vector_diag>
+  diag(const vector_base<NumericT> & v, int k = 0)
+  {
+    return matrix_expression< const vector_base<NumericT>, const int, op_vector_diag>(v, k);
+  }
+
+  // row():
+  template<typename NumericT, typename F>
+  vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_row>
+  row(const matrix_base<NumericT, F> & A, unsigned int i)
+  {
+    return vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_row>(A, i);
+  }
+
+  // column():
+  template<typename NumericT, typename F>
+  vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_column>
+  column(const matrix_base<NumericT, F> & A, unsigned int j)
+  {
+    return vector_expression< const matrix_base<NumericT, F>, const unsigned int, op_column>(A, j);
+  }
+
+  /////////////////////// transfer operations: //////////////////////////////////////
+
+  //
+  //cpu to gpu, generic type:
+  //
+  /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+  *
+  * @param cpu_matrix   A dense matrix on the host. Type requirements: .size1() returns number of rows, .size2() returns number of columns. Access to entries via operator()
+  * @param gpu_matrix   A dense ViennaCL matrix
+  */
+  template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+  void copy(const CPU_MATRIX & cpu_matrix,
+            matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
+  {
+    typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+
+    //std::cout << "Copying CPU_MATRIX!" << std::endl;
+    //std::cout << "Size at begin: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
+    if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+    {
+      gpu_matrix.resize(cpu_matrix.size1(),
+                        cpu_matrix.size2(), false);
+    }
+
+    assert( (gpu_matrix.size1() == cpu_matrix.size1()) && (gpu_matrix.size2() == cpu_matrix.size2()) && bool("Matrix dimensions mismatch.") );
+
+    std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
+    for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+    {
+      for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+        data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
+    }
+
+    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+    //gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
+    //std::cout << "Size at end: " << gpu_matrix.size1() << ", " << gpu_matrix.size2() << std::endl;
+  }
+
+  //
+  //cpu to gpu, STL type:
+  //
+  /** @brief Copies a dense STL-type matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
+  *
+  * @param cpu_matrix   A dense matrix on the host of type std::vector< std::vector<> >. cpu_matrix[i][j] returns the element in the i-th row and j-th columns (both starting with zero)
+  * @param gpu_matrix   A dense ViennaCL matrix
+  */
+  template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
+  void copy(const std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix,
+            matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix )
+  {
+    typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;
+
+    if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
+    {
+      gpu_matrix.resize(cpu_matrix.size(),
+                        cpu_matrix[0].size(),
+                        false);
+    }
+
+    assert( (gpu_matrix.size1() == cpu_matrix.size()) && bool("Matrix dimensions mismatch.") );
+
+    std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
+    for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+    {
+      assert( (gpu_matrix.size2() == cpu_matrix[i].size()) && bool("Matrix dimensions mismatch.") );
+
+      for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+        data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
+    }
+
+    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
+    //gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, data);
+  }
+
+
+  //
+  //cpu to gpu, another STL type:
+  //
+  /** @brief Copies a dense matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU) without temporary. Matrix-Layout on CPU must be equal to the matrix-layout on the GPU.
+  *
+  * @param cpu_matrix_begin   Pointer to the first matrix entry. Cf. iterator concept in STL
+  * @param cpu_matrix_end     Pointer past the last matrix entry. Cf. iterator concept in STL
+  * @param gpu_matrix         A dense ViennaCL matrix
+  */
+  template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+  void fast_copy(SCALARTYPE * cpu_matrix_begin,
+                  SCALARTYPE * cpu_matrix_end,
+                  matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
+  {
+    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin), viennacl::traits::context(gpu_matrix), cpu_matrix_begin);
+    /*gpu_matrix.elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
+                                                                          sizeof(SCALARTYPE) * (cpu_matrix_end - cpu_matrix_begin),
+                                                                          cpu_matrix_begin);*/
+  }
+
+
+  #ifdef VIENNACL_WITH_EIGEN
  /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
  *
  * @param cpu_matrix   A dense Eigen matrix (Eigen::MatrixXf). cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
  * @param gpu_matrix   A dense ViennaCL matrix
  */
  template <typename F, unsigned int ALIGNMENT>
  void copy(const Eigen::MatrixXf & cpu_matrix,
            matrix<float, F, ALIGNMENT> & gpu_matrix)
  {
    typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;

    // An uninitialized target is resized to match the source; otherwise sizes must agree.
    if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
    {
      gpu_matrix.resize(cpu_matrix.rows(),
                        cpu_matrix.cols(),
                        false);
    }
    else
    {
      // Eigen reports signed sizes; cast for a warning-free comparison.
      assert( (gpu_matrix.size1() == static_cast<vcl_size_t>(cpu_matrix.rows()))
              && (gpu_matrix.size2() == static_cast<vcl_size_t>(cpu_matrix.cols()))
              && bool("matrix size mismatch")
            );
    }

    // Stage the entries in device layout (incl. alignment padding), then transfer once.
    std::vector<float> data(gpu_matrix.internal_size());
    for (size_type i = 0; i < gpu_matrix.size1(); ++i)
    {
      for (size_type j = 0; j < gpu_matrix.size2(); ++j)
        data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
    }

    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(float) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
  }
+
  /** @brief Copies a dense Eigen matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
  *
  * @param cpu_matrix   A dense Eigen matrix (Eigen::MatrixXd). cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
  * @param gpu_matrix   A dense ViennaCL matrix
  */
  template <typename F, unsigned int ALIGNMENT>
  void copy(const Eigen::MatrixXd & cpu_matrix,
            matrix<double, F, ALIGNMENT> & gpu_matrix)
  {
    typedef typename matrix<double, F, ALIGNMENT>::size_type      size_type;

    // An uninitialized target is resized to match the source; otherwise sizes must agree.
    if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
    {
      gpu_matrix.resize(cpu_matrix.rows(),
                        cpu_matrix.cols(),
                        false);
    }
    else
    {
      // Eigen reports signed sizes; cast for a warning-free comparison.
      assert( (gpu_matrix.size1() == static_cast<vcl_size_t>(cpu_matrix.rows()))
              && (gpu_matrix.size2() == static_cast<vcl_size_t>(cpu_matrix.cols()))
              && bool("matrix size mismatch")
            );
    }

    // Stage the entries in device layout (incl. alignment padding), then transfer once.
    std::vector<double> data(gpu_matrix.internal_size());
    for (size_type i = 0; i < gpu_matrix.size1(); ++i)
    {
      for (size_type j = 0; j < gpu_matrix.size2(); ++j)
        data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix(i,j);
    }

    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(double) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
  }
+  #endif
+
+  #ifdef VIENNACL_WITH_MTL4
  /** @brief Copies a dense MTL matrix from the host (CPU) to the OpenCL device (GPU or multi-core CPU)
  *
  * @param cpu_matrix   A dense MTL matrix. cpu_matrix(i, j) returns the element in the i-th row and j-th column (both starting with zero)
  * @param gpu_matrix   A dense ViennaCL matrix
  */
  template <typename SCALARTYPE, typename T, typename F, unsigned int ALIGNMENT>
  void copy(const mtl::dense2D<SCALARTYPE, T>& cpu_matrix,
            matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix)
  {
    typedef typename matrix<SCALARTYPE, F, ALIGNMENT>::size_type      size_type;

    // An uninitialized target is resized to match the source; otherwise sizes must agree.
    if (gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0)
    {
      gpu_matrix.resize(cpu_matrix.num_rows(),
                        cpu_matrix.num_cols(),
                        false);
    }
    else
    {
      assert( (gpu_matrix.size1() == cpu_matrix.num_rows())
              && (gpu_matrix.size2() == cpu_matrix.num_cols())
              && bool("matrix size mismatch")
            );
    }

    // Stage the entries in device layout (incl. alignment padding), then transfer once.
    // Note: entries are read via cpu_matrix[i][j] here, not operator().
    std::vector<SCALARTYPE> data(gpu_matrix.internal_size());
    for (size_type i = 0; i < gpu_matrix.size1(); ++i)
    {
      for (size_type j = 0; j < gpu_matrix.size2(); ++j)
        data[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())] = cpu_matrix[i][j];
    }

    viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(SCALARTYPE) * data.size(), viennacl::traits::context(gpu_matrix), &(data[0]));
  }
+  #endif
+
+
+
+
+  //
+  //gpu to cpu, generic type
+  //
+  /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU).
+  *
+  * @param gpu_matrix   A dense ViennaCL matrix
+  * @param cpu_matrix   A dense memory on the host. Must have at least as many rows and columns as the gpu_matrix! Type requirement: Access to entries via operator()
+  */
+  template <typename CPU_MATRIX, typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+  void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
+            CPU_MATRIX & cpu_matrix )
+  {
+    typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
+
+    if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
+    {
+      assert( viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1() && bool("Matrix dimensions mismatch: rows"));
+
+      std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
+      viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]));
+
+      //now copy entries to cpu_matrix:
+      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+      {
+        assert( viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2() && bool("Matrix dimensions mismatch: columns"));
+        for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+          cpu_matrix(i,j) = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
+      }
+    }
+  }
+
+  //gpu to cpu, STL type
+  /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU).
+  *
+  * @param gpu_matrix   A dense ViennaCL matrix
+  * @param cpu_matrix   A dense memory on the host using STL types, typically std::vector< std::vector<> > Must have at least as many rows and columns as the gpu_matrix! Type requirement: Access to entries via operator()
+  */
+  template <typename SCALARTYPE, typename A1, typename A2, typename F, unsigned int ALIGNMENT>
+  void copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
+            std::vector< std::vector<SCALARTYPE, A1>, A2> & cpu_matrix)
+  {
+    typedef typename matrix<float, F, ALIGNMENT>::size_type      size_type;
+
+    if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
+    {
+      assert( (cpu_matrix.size() == gpu_matrix.size1()) && bool("Matrix dimensions mismatch: rows"));
+
+      std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
+      viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]));
+
+      //now copy entries to cpu_matrix:
+      for (size_type i = 0; i < gpu_matrix.size1(); ++i)
+      {
+        assert( (cpu_matrix[i].size() == gpu_matrix.size2()) && bool("Matrix dimensions mismatch: columns"));
+
+        for (size_type j = 0; j < gpu_matrix.size2(); ++j)
+          cpu_matrix[i][j] = temp_buffer[F::mem_index(i, j, gpu_matrix.internal_size1(), gpu_matrix.internal_size2())];
+      }
+    }
+  }
+
+  //gpu to cpu, STL type
+  /** @brief Copies a dense matrix from the OpenCL device (GPU or multi-core CPU) to the host (CPU).
+  *
+  * @param gpu_matrix         A dense ViennaCL matrix
+  * @param cpu_matrix_begin   Pointer to the output memory on the CPU. User must ensure that provided memory is large enough.
+  */
+  template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+  void fast_copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
+                  SCALARTYPE * cpu_matrix_begin)
+  {
+    viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), cpu_matrix_begin);
+  }
+
+
+
+  /////////////////////// matrix operator overloads to follow ////////////////////////////////////////////
+
+
+  // operator +
+  /** @brief Generic 'catch-all' overload, which enforces a temporary if the expression tree gets too deep. */
+  template <typename LHS1, typename RHS1, typename OP1,
+            typename LHS2, typename RHS2, typename OP2>
+  matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                     op_add>
+  operator + (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+              matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+  {
+    assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+            && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+            && bool("Incompatible matrix sizes!"));
+    return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                              const matrix_expression<const LHS2, const RHS2, OP2>,
+                              op_add>(proxy1, proxy2);
+  }
+
+  template <typename LHS1, typename RHS1, typename OP1,
+            typename NumericT, typename F>
+  matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                     const matrix_base<NumericT, F>,
+                     op_add>
+  operator + (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+              matrix_base<NumericT, F> const & proxy2)
+  {
+    assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+            && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+            && bool("Incompatible matrix sizes!"));
+    return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                              const matrix_base<NumericT, F>,
+                              op_add>(proxy1, proxy2);
+  }
+
+  template <typename NumericT, typename F,
+            typename LHS2, typename RHS2, typename OP2>
+  matrix_expression< const matrix_base<NumericT, F>,
+                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                     op_add>
+  operator + (matrix_base<NumericT, F> const & proxy1,
+              matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+  {
+    assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+            && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+            && bool("Incompatible matrix sizes!"));
+    return  matrix_expression< const matrix_base<NumericT, F>,
+                               const matrix_expression<const LHS2, const RHS2, OP2>,
+                               op_add>(proxy1, proxy2);
+  }
+
+  /** @brief Operator overload for m1 + m2, where m1 and m2 are either dense matrices, matrix ranges, or matrix slices. No mixing of different storage layouts allowed at the moment. */
+  template <typename NumericT, typename F>
+  matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_add >
+  operator + (const matrix_base<NumericT, F> & m1, const matrix_base<NumericT, F> & m2)
+  {
+    return matrix_expression< const matrix_base<NumericT, F>,
+                              const matrix_base<NumericT, F>,
+                              op_add > (m1, m2);
+  }
+
+
+  // operator -
+  template <typename LHS1, typename RHS1, typename OP1,
+            typename LHS2, typename RHS2, typename OP2>
+  matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                     op_sub>
+  operator - (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+              matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+  {
+    assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+            && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+            && bool("Incompatible matrix sizes!"));
+    return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                              const matrix_expression<const LHS2, const RHS2, OP2>,
+                              op_sub>(proxy1, proxy2);
+  }
+
+  template <typename LHS1, typename RHS1, typename OP1,
+            typename NumericT, typename F>
+  matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                     const matrix_base<NumericT, F>,
+                     op_sub>
+  operator - (matrix_expression<const LHS1, const RHS1, OP1> const & proxy1,
+              matrix_base<NumericT, F> const & proxy2)
+  {
+    assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+            && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+            && bool("Incompatible matrix sizes!"));
+    return matrix_expression< const matrix_expression<const LHS1, const RHS1, OP1>,
+                              const matrix_base<NumericT, F>,
+                              op_sub>(proxy1, proxy2);
+  }
+
+  template <typename NumericT, typename F,
+            typename LHS2, typename RHS2, typename OP2>
+  matrix_expression< const matrix_base<NumericT, F>,
+                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                     op_sub>
+  operator - (matrix_base<NumericT, F> const & proxy1,
+              matrix_expression<const LHS2, const RHS2, OP2> const & proxy2)
+  {
+    assert(    (viennacl::traits::size1(proxy1) == viennacl::traits::size1(proxy2))
+            && (viennacl::traits::size2(proxy1) == viennacl::traits::size2(proxy2))
+            && bool("Incompatible matrix sizes!"));
+    return  matrix_expression< const matrix_base<NumericT, F>,
+                               const matrix_expression<const LHS2, const RHS2, OP2>,
+                               op_sub>(proxy1, proxy2);
+  }
+
+  /** @brief Operator overload for m1 - m2, where m1 and m2 are either dense matrices, matrix ranges, or matrix slices. No mixing of different storage layouts allowed at the moment. */
+  template <typename NumericT, typename F>
+  matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_sub >
+  operator - (const matrix_base<NumericT, F> & m1, const matrix_base<NumericT, F> & m2)
+  {
+    return matrix_expression< const matrix_base<NumericT, F>,
+                              const matrix_base<NumericT, F>,
+                              op_sub > (m1, m2);
+  }
+
+
+
+  // operator *
+  /** @brief Operator overload for the expression alpha * m1, where alpha is a host scalar (float or double) and m1 is a ViennaCL matrix.
+  *
+  * @param value   The host scalar (float or double)
+  * @param m1      A ViennaCL matrix
+  */
+  template <typename S1, typename NumericT, typename F>
+  typename viennacl::enable_if<    viennacl::is_any_scalar<S1>::value,
+                                matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult>
+                              >::type
+  operator * (S1 const & value, matrix_base<NumericT, F> const & m1)
+  {
+    return matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult>(m1, value);
+  }
+
+
+  /** @brief Operator overload for the multiplication of a matrix expression with a scalar from the right, e.g. (beta * m1) * alpha. Here, beta * m1 is wrapped into a matrix_expression and then multiplied with alpha from the right.
+  *
+  * @param proxy   Left hand side matrix expression
+  * @param val     Right hand side scalar
+  */
+  template <typename LHS, typename RHS, typename OP, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult> >::type
+  operator * (matrix_expression< LHS, RHS, OP> const & proxy,
+              S1 const & val)
+  {
+    return matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+  }
+
+
+  /** @brief Operator overload for the multiplication of a matrix expression with a ViennaCL scalar from the left, e.g. alpha * (beta * m1). Here, beta * m1 is wrapped into a matrix_expression and then multiplied with alpha from the left.
+  *
+  * @param val     Right hand side scalar
+  * @param proxy   Left hand side matrix expression
+  */
+  template <typename S1, typename LHS, typename RHS, typename OP>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult> >::type
+  operator * (S1 const & val,
+              matrix_expression< LHS, RHS, OP> const & proxy)
+  {
+    return matrix_expression< const matrix_expression< LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+  }
+
+  /** @brief Scales the matrix by a GPU scalar 'alpha' and returns an expression template
+  */
+  template <typename NumericT, typename F, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult> >::type
+  operator * (matrix_base<NumericT, F> const & m1, S1 const & s1)
+  {
+    return matrix_expression< const matrix_base<NumericT, F>, const S1, op_mult>(m1, s1);
+  }
+
+
+  // operator *=
+
+  /** @brief Scales a matrix by a GPU scalar value
+  */
+  template <typename NumericT, typename F, typename S1>
+  typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                matrix_base<NumericT, F> &
+                              >::type
+  operator *= (matrix_base<NumericT, F> & m1, S1 const & gpu_val)
+  {
+    //viennacl::linalg::inplace_mult(*this, gpu_val);
+    viennacl::linalg::am(m1,
+                         m1, gpu_val, 1, false, (viennacl::is_flip_sign_scalar<S1>::value ? true : false));
+    return m1;
+  }
+
+
+  // operator /
+
+
+  /** @brief Operator overload for the division of a matrix expression by a scalar from the right, e.g. (beta * m1) / alpha. Here, beta * m1 is wrapped into a matrix_expression and then divided by alpha.
+  *
+  * @param proxy   Left hand side matrix expression
+  * @param val     Right hand side scalar
+  */
+  template <typename LHS, typename RHS, typename OP, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                matrix_expression< const matrix_expression<const LHS, const RHS, OP>, const S1, op_div> >::type
+  operator / (matrix_expression<const LHS, const RHS, OP> const & proxy,
+              S1 const & val)
+  {
+    return matrix_expression< const matrix_expression<const LHS, const RHS, OP>, const S1, op_div>(proxy, val);
+  }
+
+
+  /** @brief Returns an expression template for scaling the matrix by a GPU scalar 'alpha'
+  */
+  template <typename NumericT, typename F, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                matrix_expression< const matrix_base<NumericT, F>, const S1, op_div> >::type
+  operator / (matrix_base<NumericT, F> const & m1, S1 const & s1)
+  {
+    return matrix_expression< const matrix_base<NumericT, F>, const S1, op_div>(m1, s1);
+  }
+
+
+  // operator /=
+
+  /** @brief Scales a matrix by a GPU scalar value
+  */
+  template <typename NumericT, typename F, typename S1>
+  typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                matrix_base<NumericT, F> &
+                              >::type
+  operator /= (matrix_base<NumericT, F> & m1, S1 const & gpu_val)
+  {
+    //viennacl::linalg::inplace_divide(*this, gpu_val);
+    viennacl::linalg::am(m1,
+                         m1, gpu_val, 1, true, (viennacl::is_flip_sign_scalar<S1>::value ? true : false));
+    return m1;
+  }
+
+
+
+
+
+  // outer_prod(v1, v2) * val;
+  template <typename NumericT, typename S1>
+  typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                             const S1,
+                                                             op_mult>
+                              >::type
+  operator*(const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy,
+            const S1 & val)
+  {
+    return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                        const S1,
+                                        op_mult>(proxy, val);
+  }
+
+  template <typename NumericT, typename S1>
+  typename viennacl::enable_if< viennacl::is_cpu_scalar<S1>::value,
+                                viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                              const NumericT,
+                                                              op_mult>
+                              >::type
+  operator*(const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy,
+            const S1 & val)
+  {
+    return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                        const NumericT,
+                                        op_mult>(proxy, NumericT(val));
+  }
+
+  // val * outer_prod(v1, v2);
+  template <typename NumericT, typename S1>
+  typename viennacl::enable_if< viennacl::is_scalar<S1>::value,
+                                viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                             const S1,
+                                                             op_mult>
+                              >::type
+  operator*(const S1 & val,
+            const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+  {
+    return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                        const S1,
+                                        op_mult>(proxy, val);
+  }
+
+  template<typename NumericT, typename S1>
+  typename viennacl::enable_if< viennacl::is_cpu_scalar<S1>::value,
+                                viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                                             const NumericT,
+                                                             op_mult>
+                              >::type
+  operator*(const S1 & val,
+            const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+  {
+    return viennacl::matrix_expression< const viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>,
+                                        const NumericT,
+                                        op_mult>(proxy, NumericT(val));
+  }
+
+
+
+  //
+  // Specify available operations:
+  //
+
+  /** \cond */
+
+  namespace linalg
+  {
+    namespace detail
+    {
+
      // x = y
      template <typename T, typename F>
      struct op_executor<matrix_base<T, F>, op_assign, matrix_base<T, F> >
      {
        // lhs <- 1 * rhs. The trailing flags select reciprocal scaling and
        // sign flip (cf. operator/= and operator*= above); both are off here.
        static void apply(matrix_base<T, F> & lhs, matrix_base<T, F> const & rhs)
        {
          viennacl::linalg::am(lhs, rhs, T(1), 1, false, false);
        }
      };
+
      // x += y
      template <typename T, typename F>
      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_base<T, F> >
      {
        // lhs <- 1*lhs + 1*rhs via the two-term kernel ambm (both flag pairs off).
        static void apply(matrix_base<T, F> & lhs, matrix_base<T, F> const & rhs)
        {
          viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, false);
        }
      };
+
      // x -= y
      template <typename T, typename F>
      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_base<T, F> >
      {
        // lhs <- 1*lhs - 1*rhs: the final 'true' flips the sign of the second term.
        static void apply(matrix_base<T, F> & lhs, matrix_base<T, F> const & rhs)
        {
          viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, true);
        }
      };
+
+      ///////////// x  OP  y * alpha ////////////////////////
+
+
      // x = alpha * y
      template <typename T, typename F, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> >
      {
        // lhs <- proxy.lhs() * proxy.rhs(): matrix times scalar, no reciprocal, no sign flip.
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> const & proxy)
        {
          viennacl::linalg::am(lhs, proxy.lhs(), proxy.rhs(), 1, false, false);
        }
      };
+
      // x += alpha * y
      template <typename T, typename F, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> >
      {
        // lhs <- 1*lhs + proxy.rhs()*proxy.lhs() in a single ambm kernel call.
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> const & proxy)
        {
          viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, false);
        }
      };
+
      // x -= alpha * y
      template <typename T, typename F, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> >
      {
        // lhs <- 1*lhs - proxy.rhs()*proxy.lhs(): the final 'true' negates the second term.
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult> const & proxy)
        {
          viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, true);
        }
      };
+
+
+      ///////////// x  OP  vec_expr * alpha ////////////////////////
+
      // x = mat_expr * alpha (note: despite the 'vec_expr' naming in the section
      // header, the nested expression here is a matrix expression)
      template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
      {
          // Evaluates the nested expression into a temporary matrix first,
          // then delegates to the matrix-times-scalar assignment above.
          static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
          {
            matrix<T, F> temp(proxy.lhs());
            lhs = temp * proxy.rhs();
          }
      };
+
      // x += mat_expr * alpha
      template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
      {
          // Evaluates the nested expression into a temporary, then reuses operator+=.
          static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
          {
            matrix<T, F> temp(proxy.lhs());
            lhs += temp * proxy.rhs();
          }
      };
+
      // x -= mat_expr * alpha
      template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
      {
          // Evaluates the nested expression into a temporary, then reuses operator-=.
          static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
          {
            matrix<T, F> temp(proxy.lhs());
            lhs -= temp * proxy.rhs();
          }
      };
+
+
+      ///////////// x  OP  y / alpha ////////////////////////
+
      // x = y / alpha
      template <typename T, typename F, typename ScalarType>
      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> >
      {
        // lhs <- proxy.lhs() / proxy.rhs(): 'true' selects reciprocal scaling.
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> const & proxy)
        {
          viennacl::linalg::am(lhs, proxy.lhs(), proxy.rhs(), 1, true, false);
        }
      };
+
+      // x += y / alpha, realized as x = 1*x + y/alpha in a single ambm() kernel
+      template <typename T, typename F, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> const & proxy)
+        {
+          // second operand flags: reciprocal=true (division by alpha), flip_sign=false (addition)
+          viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, false);
+        }
+      };
+
+      // x -= y / alpha, realized as x = 1*x - y/alpha in a single ambm() kernel
+      template <typename T, typename F, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const ScalarType, op_div> const & proxy)
+        {
+          // second operand flags: reciprocal=true (division by alpha), flip_sign=true (subtraction)
+          viennacl::linalg::ambm(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, true);
+        }
+      };
+
+
+      ///////////// x  OP  mat_expr / alpha ////////////////////////
+
+      // x = mat_expr / alpha  (matrix variant; evaluates the sub-expression first)
+      template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+      {
+          static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+          {
+            matrix<T, F> temp(proxy.lhs());  // materialize the inner expression into a temporary
+            lhs = temp / proxy.rhs();        // then forward to the plain 'matrix / scalar' handler
+          }
+      };
+
+      // x += mat_expr / alpha  (matrix variant; evaluates the sub-expression first)
+      template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+      {
+          static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+          {
+            matrix<T, F> temp(proxy.lhs());  // materialize the inner expression into a temporary
+            lhs += temp / proxy.rhs();       // then forward to the plain 'matrix / scalar' handler
+          }
+      };
+
+      // x -= mat_expr / alpha  (matrix variant; evaluates the sub-expression first)
+      template <typename T, typename F, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+      {
+          static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+          {
+            matrix<T, F> temp(proxy.lhs());  // materialize the inner expression into a temporary
+            lhs -= temp / proxy.rhs();       // then forward to the plain 'matrix / scalar' handler
+          }
+      };
+
+
+
+      // generic x = mat_expr1 + mat_expr2:
+      // The templated apply() handles arbitrary sub-expressions (taking a temporary
+      // when 'lhs' also appears on the right-hand side); the non-template overloads
+      // catch plain / scaled matrix operands and map them onto one ambm() kernel call.
+      // ambm coefficient flags are (value, length, reciprocal, flip_sign):
+      // reciprocal=true divides by the coefficient, flip_sign=true negates the term.
+      template <typename T, typename F, typename LHS, typename RHS>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_add> >
+      {
+        // generic x = mat_expr1 + mat_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_add> const & proxy)
+        {
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            // 'lhs' is referenced on the right-hand side: evaluate into a temporary first
+            matrix_base<T, F> temp(proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+            lhs = temp;
+          }
+          else
+          {
+            // no aliasing: assign the first operand, then accumulate the second
+            op_executor<matrix_base<T, F>, op_assign, LHS>::apply(lhs, proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x = y + z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x = alpha * y + z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x = y / alpha + z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x = y + beta * z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x = y + z / beta
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x = alpha * y + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x = alpha * y + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x = y / alpha + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x = y / alpha + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+      };
+
+      // dense = sparse * dense
+      // Dispatches both overloads straight to prod_impl(); the sparse matrix type
+      // is left generic so every sparse format can match.
+      template <typename T, typename F1, typename LHS, typename RHS>
+      struct op_executor<matrix_base<T, F1>, op_assign, matrix_expression<const LHS, const RHS, op_prod> >
+      {
+        template < typename SparseMatrixType, typename F2 >
+        static void apply(matrix_base<T, F1> & lhs, matrix_expression<const SparseMatrixType,
+                                                                     const viennacl::matrix_base<T, F2>,
+                                                                     viennacl::op_prod> const & proxy)
+        {
+          viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), lhs);
+        }
+
+        // dense = sparse * trans(dense)
+        template < typename SparseMatrixType, typename F2 >
+        static void apply(matrix_base<T, F1> & lhs, matrix_expression<const SparseMatrixType,
+                                                                     const viennacl::matrix_expression< const viennacl::matrix_base<T, F2>,
+                                                                                                        const viennacl::matrix_base<T, F2>,
+                                                                                                        viennacl::op_trans >,
+                                                                     viennacl::op_prod> const & proxy)
+        {
+          viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), lhs);
+        }
+
+      };
+
+      // generic x += mat_expr1 + mat_expr2:
+      // Same dispatch scheme as the op_assign case above, but all specialized
+      // overloads accumulate via ambm_m() instead of overwriting with ambm().
+      // Coefficient flags are (value, length, reciprocal, flip_sign).
+      template <typename T, typename F, typename LHS, typename RHS>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_add> >
+      {
+        // generic x += mat_expr1 + mat_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_add> const & proxy)
+        {
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            // 'lhs' appears in the expression: form the full sum in a temporary, then add it
+            matrix_base<T, F> temp(proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+            lhs += temp;
+          }
+          else
+          {
+            op_executor<matrix_base<T, F>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x += y + z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x += alpha * y + z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x += y / alpha + z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x += y + beta * z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x += y + z / beta
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x += alpha * y + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x += alpha * y + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x += y / alpha + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x += y / alpha + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+      };
+
+
+
+      // generic x -= mat_expr1 + mat_expr2:
+      // Subtracting a sum: every specialized ambm_m() call below sets flip_sign=true
+      // (the last flag of each coefficient triple) on BOTH operands, since both
+      // terms of the parenthesized sum are subtracted from x.
+      template <typename T, typename F, typename LHS, typename RHS>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_add> >
+      {
+        // generic x -= mat_expr1 + mat_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_add> const & proxy)
+        {
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            // 'lhs' appears in the expression: form the full sum in a temporary, then subtract it
+            matrix_base<T, F> temp(proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+            lhs -= temp;
+          }
+          else
+          {
+            op_executor<matrix_base<T, F>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x -= y + z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x -= alpha * y + z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x -= y / alpha + z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x -= y + beta * z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x -= y + z / beta
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x -= alpha * y + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x -= alpha * y + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x -= y / alpha + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x -= y / alpha + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::ambm_m(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+      };
+
+
+
+      ///////////////////////
+
+
+
+      // generic x = mat_expr1 - mat_expr2:
+      // Mirror of the op_add assignment executor; the second operand of each
+      // ambm() call sets flip_sign=true (last flag of its coefficient triple)
+      // to realize the subtraction.
+      template <typename T, typename F, typename LHS, typename RHS>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_sub> >
+      {
+        // generic x = mat_expr1 - mat_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_sub> const & proxy)
+        {
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            // 'lhs' appears in the expression: evaluate the difference into a temporary first
+            matrix_base<T, F> temp(proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
+            lhs = temp;
+          }
+          else
+          {
+            op_executor<matrix_base<T, F>, op_assign, LHS>::apply(lhs, proxy.lhs());
+            op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x = y - z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x = alpha * y - z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x = y / alpha - z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  const matrix_base<T, F>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x = y - beta * z
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x = y - z / beta
+        template <typename ScalarType>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x = alpha * y - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x = alpha * y - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x = y / alpha - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x = y / alpha - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
+                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::ambm(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+      };
+
+
      // generic x += vec_expr1 - vec_expr2:
      //
      // Dispatches x += (expr1 - expr2) on the structure of the two operands.
      // The non-generic overloads forward to viennacl::linalg::ambm_m(); per
      // operand the trailing arguments are (scalar, length, flag1, flag2).
      // Judging from the overload pattern below, flag1 == true corresponds to
      // the reciprocal 'y / alpha' cases and flag2 == true to the negated
      // '- z' contribution. NOTE(review): flag semantics inferred from usage
      // in this file; confirm against the ambm_m documentation.
      template <typename T, typename F, typename LHS, typename RHS>
      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_sub> >
      {
        // generic x += vec_expr1 - vec_expr2:
        template <typename LHS1, typename RHS1>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_sub> const & proxy)
        {
          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());

          if (op_aliasing_lhs || op_aliasing_rhs)
          {
            // lhs occurs inside the expression: evaluate expr1 - expr2 into a
            // temporary first, then accumulate the temporary onto lhs.
            matrix_base<T, F> temp(proxy.lhs());
            op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
            lhs += temp;
          }
          else
          {
            // No aliasing: fold both sub-expressions into lhs directly
            // (lhs += expr1; lhs -= expr2).
            op_executor<matrix_base<T, F>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
            op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
          }
        }

        // x += y - z
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs(), T(1), 1, false, false,
                                   proxy.rhs(), T(1), 1, false, true);
        }

        // x += alpha * y - z
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
                                                                  const matrix_base<T, F>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
                                   proxy.rhs(), T(1), 1, false, true);
        }

        // x += y / alpha - z
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
                                                                  const matrix_base<T, F>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
                                   proxy.rhs(), T(1), 1, false, true);
        }

        // x += y - beta * z
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs(), T(1), 1, false, false,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
        }

        // x += y - z / beta
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs(), T(1), 1, false, false,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
        }

        // x += alpha * y - beta * z
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
        }

        // x += alpha * y - z / beta
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
        }

        // x += y / alpha - beta * z
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
        }

        // x += y / alpha - z / beta
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
        }
      };
+
+
+
      // generic x -= vec_expr1 - vec_expr2:
      //
      // Dispatches x -= (expr1 - expr2). The flag pattern handed to
      // viennacl::linalg::ambm_m() is the mirror image of the op_inplace_add
      // case: here the expr1 term carries the negation flag (it is
      // subtracted) while the expr2 term is added back.
      template <typename T, typename F, typename LHS, typename RHS>
      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_sub> >
      {
        // generic x -= vec_expr1 - vec_expr2:
        template <typename LHS1, typename RHS1>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const LHS1, const RHS1, op_sub> const & proxy)
        {
          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());

          if (op_aliasing_lhs || op_aliasing_rhs)
          {
            // lhs occurs inside the expression: evaluate expr1 - expr2 into a
            // temporary first, then subtract the temporary from lhs.
            matrix_base<T, F> temp(proxy.lhs());
            op_executor<matrix_base<T, F>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());
            lhs -= temp;
          }
          else
          {
            // No aliasing: fold both sub-expressions into lhs directly
            // (lhs -= expr1; lhs += expr2).
            op_executor<matrix_base<T, F>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
            op_executor<matrix_base<T, F>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
          }
        }

        // x -= y - z
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs(), T(1), 1, false, true,
                                   proxy.rhs(), T(1), 1, false, false);
        }

        // x -= alpha * y - z
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
                                                                  const matrix_base<T, F>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
                                   proxy.rhs(), T(1), 1, false, false);
        }

        // x -= y / alpha - z
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
                                                                  const matrix_base<T, F>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
                                   proxy.rhs(), T(1), 1, false, false);
        }

        // x -= y - beta * z
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_mult>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs(), T(1), 1, false, true,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
        }

        // x -= y - z / beta
        template <typename ScalarType>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType, op_div>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs(), T(1), 1, false, true,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
        }

        // x -= alpha * y - beta * z
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
        }

        // x -= alpha * y - z / beta
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_mult>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
        }

        // x -= y / alpha - beta * z
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_mult>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
        }

        // x -= y / alpha - z / beta
        template <typename ScalarType1, typename ScalarType2>
        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F>, const ScalarType1, op_div>,
                                                                  const matrix_expression<const matrix_base<T, F>, const ScalarType2, op_div>,
                                                                  op_sub> const & proxy)
        {
          viennacl::linalg::ambm_m(lhs,
                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
        }
      };
+
+
+      //////////////////// diag(), row(), column() operations ////////////////////////////////////////
+
+      template <typename T, typename F, typename LHS>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const int, op_vector_diag> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const int, op_vector_diag> const & proxy)
+        {
+          viennacl::linalg::matrix_diag_from_vector(proxy.lhs(), proxy.rhs(), lhs);
+        }
+      };
+
+
+      template <typename T, typename LHS>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const int, op_matrix_diag> >
+      {
+        template <typename F>
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const int, op_matrix_diag> const & proxy)
+        {
+          viennacl::linalg::matrix_diag_to_vector(proxy.lhs(), proxy.rhs(), lhs);
+        }
+      };
+
+      template <typename T, typename LHS>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const unsigned int, op_row> >
+      {
+        template <typename F>
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const unsigned int, op_row> const & proxy)
+        {
+          viennacl::linalg::matrix_row(proxy.lhs(), proxy.rhs(), lhs);
+        }
+      };
+
+
+      template <typename T, typename LHS>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const unsigned int, op_column> >
+      {
+        template <typename F>
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const unsigned int, op_column> const & proxy)
+        {
+          viennacl::linalg::matrix_column(proxy.lhs(), proxy.rhs(), lhs);
+        }
+      };
+
+
+      //////////////////// Element-wise operations ////////////////////////////////////////
+
+      // generic x = mat_expr1 .* mat_expr2:
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_element_binary<OP> > >
+      {
+        // x = y .* z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+        {
+          viennacl::linalg::element_op(lhs, proxy);
+        }
+
+        // x = y .* mat_expr
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.rhs());
+          viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(proxy.lhs(), temp));
+        }
+
+        // x = mat_expr .* z
+        template <typename LHS1, typename RHS1, typename OP1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.lhs());
+          viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp, proxy.rhs()));
+        }
+
+        // x = mat_expr .* mat_expr
+        template <typename LHS1, typename RHS1, typename OP1,
+                  typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>,
+                                                                  const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp1(proxy.lhs());
+          matrix<T, F> temp2(proxy.rhs());
+          viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp1, temp2));
+        }
+      };
+
+      // generic x += mat_expr .* mat_expr:
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_element_binary<OP> > >
+      {
+        // x += y .* z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+        {
+          viennacl::matrix<T, F> temp(proxy);
+          lhs += temp;
+        }
+
+        // x += y .* mat_expr
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.rhs());
+          matrix<T, F> temp2(temp.size1(), temp.size2());
+          viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(proxy.lhs(), temp));
+          lhs += temp2;
+        }
+
+        // x += mat_expr .* z
+        template <typename LHS1, typename RHS1, typename OP1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.lhs());
+          matrix<T, F> temp2(temp.size1(), temp.size2());
+          viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp, proxy.rhs()));
+          lhs += temp2;
+        }
+
+        // x += mat_expr .* mat_expr
+        template <typename LHS1, typename RHS1, typename OP1,
+                  typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>,
+                                                                  const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp1(proxy.lhs());
+          matrix<T, F> temp2(proxy.rhs());
+          matrix<T, F> temp3(temp1.size1(), temp1.size2());
+          viennacl::linalg::element_op(temp3, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp1, temp2));
+          lhs += temp3;
+        }
+      };
+
+      // generic x -= mat_expr1 .* mat_expr2:
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_element_binary<OP> > >
+      {
+
+        // x -= y .* z
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+        {
+          viennacl::matrix<T, F> temp(proxy);
+          lhs -= temp;
+        }
+
+        // x -= y .* mat_expr
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.rhs());
+          matrix<T, F> temp2(temp.size1(), temp.size2());
+          viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(proxy.lhs(), temp));
+          lhs -= temp2;
+        }
+
+        // x -= mat_expr .* z
+        template <typename LHS1, typename RHS1, typename OP1>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.lhs());
+          matrix<T, F> temp2(temp.size1(), temp.size2());
+          viennacl::linalg::element_op(temp2, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp, proxy.rhs()));
+          lhs -= temp2;
+        }
+
+        // x -= mat_expr .* mat_expr
+        template <typename LHS1, typename RHS1, typename OP1,
+                  typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS1, const RHS1, OP1>,
+                                                                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                     op_element_binary<OP> > const & proxy)
+        {
+          matrix<T, F> temp1(proxy.lhs());
+          matrix<T, F> temp2(proxy.rhs());
+          matrix<T, F> temp3(temp1.size1(), temp1.size2());
+          viennacl::linalg::element_op(temp3, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> >(temp1, temp2));
+          lhs -= temp3;
+        }
+      };
+
+      //////////////// unary expressions
+
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const LHS, const RHS, op_element_unary<OP> > >
+      {
+        // x = OP(y)
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+        {
+          viennacl::linalg::element_op(lhs, proxy);
+        }
+
+        // x = OP(vec_expr)
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                     op_element_unary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.rhs());
+          viennacl::linalg::element_op(lhs, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> >(temp, temp));
+        }
+      };
+
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const LHS, const RHS, op_element_unary<OP> > >
+      {
+        // x += OP(y)
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy);
+          lhs += temp;
+        }
+
+        // x += OP(vec_expr)
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                  const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_unary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.rhs());
+          viennacl::linalg::element_op(temp, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+          lhs += temp;
+        }
+      };
+
+      template <typename T, typename F, typename LHS, typename RHS, typename OP>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const LHS, const RHS, op_element_unary<OP> > >
+      {
+        // x -= OP(y)
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy);
+          lhs -= temp;
+        }
+
+        // x -= OP(vec_expr)
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                     const matrix_expression<const LHS2, const RHS2, OP2>,
+                                                                     op_element_unary<OP> > const & proxy)
+        {
+          matrix<T, F> temp(proxy.rhs());
+          viennacl::linalg::element_op(temp, viennacl::matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+          lhs -= temp;
+        }
+      };
+
+
+
+      //////////////// Matrix - Matrix products ////////////////
+
+      // C = A * B
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+        }
+      };
+
+      // C = A * B^T
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_base<T, F1>,
+                                                                         const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                         op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>,
+                                                                     const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+        }
+      };
+
+      // C = A^T * B
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                         const matrix_base<T, F2>,
+                                                                         op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                     const matrix_base<T, F2>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+        }
+      };
+
+      // C = A^T * B^T
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                         const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                         op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                     const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(0));
+        }
+      };
+
+
+      // C += A * B
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+        }
+      };
+
+      // C += A * B^T
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_base<T, F1>,
+                                                                              const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                              op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>,
+                                                                     const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+        }
+      };
+
+      // C += A^T * B
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                              const matrix_base<T, F2>,
+                                                                              op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                     const matrix_base<T, F2>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+        }
+      };
+
+      // C += A^T * B^T
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                              const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                              op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                     const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(1.0), T(1.0));
+        }
+      };
+
+
+      // C -= A * B
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F2>, op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+        }
+      };
+
+      // C -= A * B^T
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_base<T, F1>,
+                                                                              const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                              op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_base<T, F1>,
+                                                                     const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+        }
+      };
+
+      // C -= A^T * B
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                              const matrix_base<T, F2>,
+                                                                              op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                     const matrix_base<T, F2>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+        }
+      };
+
+      // C -= A^T * B^T
+      template <typename T, typename F, typename F1, typename F2>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                              const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                              op_mat_mat_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const matrix_expression<const matrix_base<T, F1>, const matrix_base<T, F1>, op_trans>,
+                                                                     const matrix_expression<const matrix_base<T, F2>, const matrix_base<T, F2>, op_trans>,
+                                                                     op_mat_mat_prod> const & rhs)
+        {
+          viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs, T(-1.0), T(1.0));
+        }
+      };
+
+      ////////////////// Matrix-Vector Products ///////////////
+
+      // y = A * x
+      template <typename T, typename F>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> const & rhs)
+        {
+          // check for x = A * x
+          if (op_aliasing(lhs, rhs.rhs()))
+          {
+            vector_base<T> temp(rhs);
+            lhs = temp;
+          }
+          else
+            viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+        }
+      };
+
+      // y = A^T * x
+      template <typename T, typename F>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                      const vector_base<T>,
+                                                                      op_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                  const vector_base<T>,
+                                                                  op_prod> const & rhs)
+        {
+          // check for x = A^T * x
+          if (op_aliasing(lhs, rhs.rhs()))
+          {
+            vector_base<T> temp(rhs);
+            lhs = temp;
+          }
+          else
+            viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+        }
+      };
+
+
+      // y += A * x
+      template <typename T, typename F>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> const & rhs)
+        {
+          vector_base<T> temp(rhs);
+          lhs += temp;
+        }
+      };
+
+      // y += A^T * x
+      template <typename T, typename F>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                           const vector_base<T>,
+                                                                           op_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                  const vector_base<T>,
+                                                                  op_prod> const & rhs)
+        {
+          vector_base<T> temp(rhs);
+          lhs += temp;
+        }
+      };
+
+
+      // y -= A * x
+      template <typename T, typename F>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_base<T, F>, const vector_base<T>, op_prod> const & rhs)
+        {
+          vector_base<T> temp(rhs);
+          lhs -= temp;
+        }
+      };
+
+      // y -= A^T * x
+      template <typename T, typename F>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                           const vector_base<T>,
+                                                                           op_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_trans>,
+                                                                  const vector_base<T>,
+                                                                  op_prod> const & rhs)
+        {
+          vector_base<T> temp(rhs);
+          lhs -= temp;
+        }
+      };
+
+
+
+      ////////////////// Rank-1 Updates ///////////////
+
+      // A = v1 * v2^T
+      template <typename T, typename F>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> const & rhs)
+        {
+          lhs.clear();
+          viennacl::linalg::scaled_rank_1_update(lhs, T(1.0), 1, false, false, rhs.lhs(), rhs.rhs());
+        }
+      };
+
+      // A = alpha * v1 * v2^T
+      template <typename T, typename F, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_assign, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                          const ScalarType,
+                                                                          op_mult> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                      const ScalarType,
+                                                                      op_mult> const & rhs)
+        {
+          lhs.clear();
+          viennacl::linalg::scaled_rank_1_update(lhs, rhs.rhs(), 1, false, false, rhs.lhs().lhs(), rhs.lhs().rhs());
+        }
+      };
+
+      // A += v1 * v2^T
+      template <typename T, typename F>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> const & rhs)
+        {
+          viennacl::linalg::scaled_rank_1_update(lhs, T(1.0), 1, false, false, rhs.lhs(), rhs.rhs());
+        }
+      };
+
+      // A += alpha * v1 * v2^T
+      template <typename T, typename F, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_inplace_add, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                               const ScalarType,
+                                                                               op_mult> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                      const ScalarType,
+                                                                      op_mult> const & rhs)
+        {
+          viennacl::linalg::scaled_rank_1_update(lhs, rhs.rhs(), 1, false, false, rhs.lhs().lhs(), rhs.lhs().rhs());
+        }
+      };
+
+      // A -= v1 * v2^T
+      template <typename T, typename F>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression<const vector_base<T>, const vector_base<T>, op_prod> const & rhs)
+        {
+          viennacl::linalg::scaled_rank_1_update(lhs, T(1.0), 1, false, true, rhs.lhs(), rhs.rhs());
+        }
+      };
+
+      // A -= alpha * v1 * v2^T
+      template <typename T, typename F, typename ScalarType>
+      struct op_executor<matrix_base<T, F>, op_inplace_sub, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                               const ScalarType,
+                                                                               op_mult> >
+      {
+        static void apply(matrix_base<T, F> & lhs, matrix_expression< const matrix_expression<const vector_base<T>, const vector_base<T>, op_prod>,
+                                                                      const ScalarType,
+                                                                      op_mult> const & rhs)
+        {
+          viennacl::linalg::scaled_rank_1_update(lhs, rhs.rhs(), 1, false, true, rhs.lhs().lhs(), rhs.lhs().rhs());
+        }
+      };
+
+
+    } // namespace detail
+
+  } // namespace linalg
+
+  /** \endcond */
+
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/matrix_proxy.hpp b/viennacl/matrix_proxy.hpp
index d1ebc72..8941de2 100644
--- a/viennacl/matrix_proxy.hpp
+++ b/viennacl/matrix_proxy.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_MATRIX_PROXY_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -29,482 +30,419 @@
 namespace viennacl
 {
 
+  /** @brief Class for representing non-strided submatrices of a bigger matrix A.
+    *
+    * In MATLAB notation, this could for example refer to the submatrix A(3:8, 6:10) of a matrix A.
+    */
   template <typename MatrixType>
-  class matrix_range
+  class matrix_range : public matrix_base<typename MatrixType::cpu_value_type, typename MatrixType::orientation_functor>
   {
+      typedef matrix_base<typename MatrixType::cpu_value_type,
+                          typename MatrixType::orientation_functor>    base_type;
+      typedef matrix_range<MatrixType>                                 self_type;
+
     public:
+      typedef typename MatrixType::orientation_category       orientation_category;
+
       typedef typename MatrixType::value_type     value_type;
       typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
       typedef range::size_type                    size_type;
       typedef range::difference_type              difference_type;
       typedef value_type                          reference;
       typedef const value_type &                  const_reference;
-      
-      matrix_range(MatrixType & A, 
-                   range const & row_range,
-                   range const & col_range) : A_(&A), row_range_(row_range), col_range_(col_range) {}
-                   
-      size_type start1() const { return row_range_.start(); }
-      size_type size1() const { return row_range_.size(); }
-
-      size_type start2() const { return col_range_.start(); }
-      size_type size2() const { return col_range_.size(); }
-      
-      ////////// operator= //////////////////////////
-      
-      /** @brief Copy-constructor: Writes the entries from the matrix_range to the wrapped matrix.
-       * 
-       * Note: A generic overload of operator=() is insufficient, because then the compiler generates the copy-CTOR!
-       * 
-       * @param other    The submatrix to be assigned
-       */
-      matrix_range<MatrixType> & operator = (const matrix_range<MatrixType> & other) 
-      {
-        assert(size1() == other.size1());
-        assert(size2() == other.size2());
-
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "assign");
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(*A_),
-                                        cl_uint(start1()),             cl_uint(start2()), 
-                                        cl_uint(size1()),              cl_uint(size2()),
-                                        cl_uint(A_->internal_size1()), cl_uint(A_->internal_size2()),
-                                viennacl::traits::handle(other), 
-                                        cl_uint(viennacl::traits::start1(other)),            cl_uint(viennacl::traits::start2(other)), 
-                                        cl_uint(viennacl::traits::size1(other)),             cl_uint(viennacl::traits::size2(other)),
-                                        cl_uint(viennacl::traits::internal_size1(other)),    cl_uint(viennacl::traits::internal_size2(other))
-                                )
-                              );
-
-        return *this;
-      }
-
-      template <typename MatrixType2>
-      matrix_range<MatrixType> & operator = (const MatrixType2 & other) 
-      {
-        assert(size1() == other.size1());
-        assert(size2() == other.size2());
-
-        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
-        
-        std::size_t block_size = 16;
-        
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "assign");
-        k.global_work_size(0, block_size*block_size);
-        k.global_work_size(1, block_size*block_size);
-        k.local_work_size(0, block_size);
-        k.local_work_size(1, block_size);
-        
-        viennacl::ocl::enqueue(k(viennacl::traits::handle(*A_),
-                                        cl_uint(start1()),             cl_uint(start2()), 
-                                        cl_uint(size1()),              cl_uint(size2()),
-                                        cl_uint(A_->internal_size1()), cl_uint(A_->internal_size2()),
-                                viennacl::traits::handle(other), 
-                                        cl_uint(viennacl::traits::start1(other)),            cl_uint(viennacl::traits::start2(other)), 
-                                        cl_uint(viennacl::traits::size1(other)),             cl_uint(viennacl::traits::size2(other)),
-                                        cl_uint(viennacl::traits::internal_size1(other)),    cl_uint(viennacl::traits::internal_size2(other))
-                                )
-                              );
-
-        return *this;
-      }
-
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator = (const matrix_expression< MatrixType1,
-                                                                      MatrixType2,
-                                                                      op_prod > & proxy) 
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & 
-      operator = (const matrix_expression< MatrixType1,
-                                           MatrixType2,
-                                           op_add > & proxy) 
-      {
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & 
-      operator = (const matrix_expression< MatrixType1,
-                                           MatrixType2,
-                                           op_sub > & proxy) 
-      {
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-
-      ////////// operator+= //////////////////////////
-
-      matrix_range<MatrixType> & operator += (matrix_range<MatrixType> const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator += (const matrix_expression< MatrixType1,
-                                                                       MatrixType2,
-                                                                       op_prod > & proxy)
-      {
-        MatrixType temp = proxy;
-        viennacl::linalg::inplace_add(*this, temp);
-        return *this;
-      }
-      
-      
-      ////////// operator-= //////////////////////////
-      matrix_range<MatrixType> & operator -= (matrix_range<MatrixType> const & other)
-      {
-        viennacl::linalg::inplace_sub(*this, other);
-        return *this;
-      }
-      
-      template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator -= (const matrix_expression< MatrixType1,
-                                                                       MatrixType2,
-                                                                       op_prod > & proxy)
-      {
-        MatrixType temp = proxy;
-        viennacl::linalg::inplace_sub(*this, temp);
-        return *this;
-      }
-
-
-      ////////// operator*= //////////////////////////
-
-      template <typename T>
-      matrix_range<MatrixType> & operator *= (T const & val)
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-      
-      ////////// operator/= //////////////////////////
-
-      template <typename T>
-      matrix_range<MatrixType> & operator /= (T const & val)
-      {
-        viennacl::linalg::inplace_divide(*this, val);
-        return *this;
-      }
 
-      matrix_range<MatrixType> & operator /= (cpu_value_type val)
-      {
-        viennacl::linalg::inplace_mult(*this, cpu_value_type(1.0) / val);
-        return *this;
-      }
-
-
-      ////////// operator+ //////////////////////////
-      
-      template <typename MatrixType2>
-      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
-                                    matrix_expression< const matrix_range<MatrixType>,
-                                                       const MatrixType2,
-                                                       op_add > >::type
-      operator + (const MatrixType2 & other) 
-      {
-        return matrix_expression< const matrix_range<MatrixType>,
-                                  const MatrixType2,
-                                  op_add > (*this, other);
-      }
-      
-      ////////// operator- //////////////////////////
-      
-      template <typename MatrixType2>
-      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
-                                    matrix_expression< const matrix_range<MatrixType>,
-                                                       const MatrixType2,
-                                                       op_sub > >::type
-      operator - (const MatrixType2 & other) 
-      {
-        return matrix_expression< const matrix_range<MatrixType>,
-                                  const MatrixType2,
-                                  op_sub > (*this, other);
-      }
-      
-      
-      
-
-      //const_reference operator()(size_type i, size_type j) const { return A_(start1() + i, start2() + i); }
-      //reference operator()(size_type i, size_type j) { return A_(start1() + i, start2() + i); }
+      matrix_range(MatrixType & A,
+                   range const & row_range,
+                   range const & col_range) : base_type(A.handle(),
+                                                        row_range.size(), row_range.start(), 1, A.internal_size1(),
+                                                        col_range.size(), col_range.start(), 1, A.internal_size2()) {}
 
-      MatrixType & get() { return *A_; }
-      const MatrixType & get() const { return *A_; }
+      using base_type::operator=;
 
-    private:
-      MatrixType * A_;
-      range row_range_;
-      range col_range_;
   };
 
-  
-  /** @brief Returns an expression template class representing a transposed matrix */
-  template <typename MatrixType>
-  matrix_expression< const matrix_range<MatrixType>,
-                     const matrix_range<MatrixType>,
-                     op_trans> trans(const matrix_range<MatrixType> & mat)
-  {
-    return matrix_expression< const matrix_range<MatrixType>,
-                              const matrix_range<MatrixType>,
-                              op_trans>(mat, mat);
-  }
-  
-  
-  
-  
+
   /////////////////////////////////////////////////////////////
   ///////////////////////// CPU to GPU ////////////////////////
   /////////////////////////////////////////////////////////////
-  
+
   //row_major:
   template <typename CPU_MATRIX, typename SCALARTYPE>
   void copy(const CPU_MATRIX & cpu_matrix,
             matrix_range<matrix<SCALARTYPE, row_major, 1> > & gpu_matrix_range )
   {
     assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-           && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-    
-     if ( gpu_matrix_range.start2() != 0 ||  gpu_matrix_range.size2() !=  gpu_matrix_range.get().size2())
-     {
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
-       
-       //copy each stride separately:
-       for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
-       {
-         for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           entries[j] = cpu_matrix(i,j);
-         
-         size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
-         size_t num_entries = gpu_matrix_range.size2();
-         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        //std::cout << "Strided copy worked!" << std::endl;
-       }
-     }
-     else
-     {
-       //full block can be copied: 
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-       
-       //copy each stride separately:
-       for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           entries[i*gpu_matrix_range.get().internal_size2() + j] = cpu_matrix(i,j);
-       
-       size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
-       size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
-       //std::cout << "Block copy worked!" << std::endl;
-     }
+           && (cpu_matrix.size2() == gpu_matrix_range.size2())
+           && bool("Matrix size mismatch!"));
+
+    if ( gpu_matrix_range.start2() != 0)
+    {
+      std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
+
+      //copy each stride separately:
+      for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+      {
+        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+          entries[j] = cpu_matrix(i,j);
+
+        vcl_size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.internal_size2() + gpu_matrix_range.start2();
+        vcl_size_t num_entries = gpu_matrix_range.size2();
+        viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+      //std::cout << "Strided copy worked!" << std::endl;
+      }
+    }
+    else
+    {
+      //full block can be copied:
+      std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.internal_size2());
+
+      //copy each stride separately:
+      for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+          entries[i*gpu_matrix_range.internal_size2() + j] = cpu_matrix(i,j);
+
+      vcl_size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.internal_size2();
+      vcl_size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.internal_size2();
+      viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+      //std::cout << "Block copy worked!" << std::endl;
+    }
   }
-  
+
   //column_major:
   template <typename CPU_MATRIX, typename SCALARTYPE>
   void copy(const CPU_MATRIX & cpu_matrix,
             matrix_range<matrix<SCALARTYPE, column_major, 1> > & gpu_matrix_range )
   {
     assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-           && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-    
-     if ( gpu_matrix_range.start1() != 0 ||  gpu_matrix_range.size1() != gpu_matrix_range.get().size1())
+           && (cpu_matrix.size2() == gpu_matrix_range.size2())
+           && bool("Matrix size mismatch!"));
+
+     if ( gpu_matrix_range.start1() != 0)
      {
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1());
-       
+
        //copy each stride separately:
-       for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
+       for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
        {
-         for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
+         for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
            entries[i] = cpu_matrix(i,j);
-         
-         size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
-         size_t num_entries = gpu_matrix_range.size1();
-         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
+
+         vcl_size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.internal_size1() + gpu_matrix_range.start1();
+         vcl_size_t num_entries = gpu_matrix_range.size1();
+         viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
         //std::cout << "Strided copy worked!" << std::endl;
        }
      }
      else
      {
-       //full block can be copied: 
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-       
+       //full block can be copied:
+       std::vector<SCALARTYPE> entries(gpu_matrix_range.internal_size1()*gpu_matrix_range.size2());
+
        //copy each stride separately:
-       for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           entries[i + j*gpu_matrix_range.get().internal_size1()] = cpu_matrix(i,j);
-       
-       size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
-       size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
+       for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+           entries[i + j*gpu_matrix_range.internal_size1()] = cpu_matrix(i,j);
+
+       vcl_size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.internal_size1();
+       vcl_size_t num_entries = gpu_matrix_range.internal_size1() * gpu_matrix_range.size2();
+       viennacl::backend::memory_write(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
        //std::cout << "Block copy worked!" << std::endl;
      }
-    
+
   }
 
 
   /////////////////////////////////////////////////////////////
   ///////////////////////// GPU to CPU ////////////////////////
   /////////////////////////////////////////////////////////////
-  
-  
+
+
   //row_major:
   template <typename CPU_MATRIX, typename SCALARTYPE>
   void copy(matrix_range<matrix<SCALARTYPE, row_major, 1> > const & gpu_matrix_range,
             CPU_MATRIX & cpu_matrix)
   {
     assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-           && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-    
-     if ( gpu_matrix_range.start2() != 0 ||  gpu_matrix_range.size2() !=  gpu_matrix_range.get().size2())
+           && (cpu_matrix.size2() == gpu_matrix_range.size2())
+           && bool("Matrix size mismatch!"));
+
+     if ( gpu_matrix_range.start2() != 0)
      {
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size2());
-       
+
        //copy each stride separately:
-       for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
+       for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
        {
-         size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
-         size_t num_entries = gpu_matrix_range.size2();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
+         vcl_size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.internal_size2() + gpu_matrix_range.start2();
+         vcl_size_t num_entries = gpu_matrix_range.size2();
+         viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
         //std::cout << "Strided copy worked!" << std::endl;
-        
-        for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
+
+        for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
           cpu_matrix(i,j) = entries[j];
-         
        }
      }
      else
      {
-       //full block can be copied: 
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-       
-       size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
-       size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
+       //full block can be copied:
+       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.internal_size2());
+
+       vcl_size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.internal_size2();
+       vcl_size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.internal_size2();
+       viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
        //std::cout << "Block copy worked!" << std::endl;
 
-       for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           cpu_matrix(i,j) = entries[i*gpu_matrix_range.get().internal_size2() + j];
+       for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+           cpu_matrix(i,j) = entries[i*gpu_matrix_range.internal_size2() + j];
     }
-    
+
   }
-  
-  
+
+
   //column_major:
   template <typename CPU_MATRIX, typename SCALARTYPE>
   void copy(matrix_range<matrix<SCALARTYPE, column_major, 1> > const & gpu_matrix_range,
             CPU_MATRIX & cpu_matrix)
   {
     assert( (cpu_matrix.size1() == gpu_matrix_range.size1())
-           && (cpu_matrix.size2() == gpu_matrix_range.size2()) );
-    
-     if ( gpu_matrix_range.start1() != 0 ||  gpu_matrix_range.size1() !=  gpu_matrix_range.get().size1())
+           && (cpu_matrix.size2() == gpu_matrix_range.size2())
+           && bool("Matrix size mismatch!"));
+
+     if ( gpu_matrix_range.start1() != 0)
      {
        std::vector<SCALARTYPE> entries(gpu_matrix_range.size1());
-       
+
        //copy each stride separately:
-       for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
+       for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
        {
-         size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
-         size_t num_entries = gpu_matrix_range.size1();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                          sizeof(SCALARTYPE)*start_offset,
-                                          sizeof(SCALARTYPE)*num_entries,
-                                          &(entries[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
+         vcl_size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.internal_size1() + gpu_matrix_range.start1();
+         vcl_size_t num_entries = gpu_matrix_range.size1();
+         viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
         //std::cout << "Strided copy worked!" << std::endl;
-        
-        for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
+
+        for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
           cpu_matrix(i,j) = entries[i];
        }
      }
      else
      {
-       //full block can be copied: 
-       std::vector<SCALARTYPE> entries(gpu_matrix_range.size1()*gpu_matrix_range.size2());
-       
+       //full block can be copied:
+       std::vector<SCALARTYPE> entries(gpu_matrix_range.internal_size1()*gpu_matrix_range.size2());
+
        //copy each stride separately:
-       size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
-       size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
-       //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
-                                         sizeof(SCALARTYPE)*start_offset,
-                                         sizeof(SCALARTYPE)*num_entries,
-                                         &(entries[0]), 0, NULL, NULL);
-       VIENNACL_ERR_CHECK(err);
+       vcl_size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.internal_size1();
+       vcl_size_t num_entries = gpu_matrix_range.internal_size1() * gpu_matrix_range.size2();
+       viennacl::backend::memory_read(gpu_matrix_range.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
        //std::cout << "Block copy worked!" << std::endl;
-       
-       for (size_t i=0; i < gpu_matrix_range.size1(); ++i)
-         for (size_t j=0; j < gpu_matrix_range.size2(); ++j)
-           cpu_matrix(i,j) = entries[i + j*gpu_matrix_range.get().internal_size1()];
+
+       for (vcl_size_t i=0; i < gpu_matrix_range.size1(); ++i)
+         for (vcl_size_t j=0; j < gpu_matrix_range.size2(); ++j)
+           cpu_matrix(i,j) = entries[i + j*gpu_matrix_range.internal_size1()];
      }
-    
+
   }
 
 
-  template<typename MatrixType>
-  std::ostream & operator<<(std::ostream & s, matrix_range<MatrixType> const & proxy)
+  //
+  // Convenience function
+  //
+  template <typename MatrixType>
+  matrix_range<MatrixType> project(MatrixType & A, viennacl::range const & r1, viennacl::range const & r2)
   {
-    MatrixType temp = proxy;
-    s << temp;
-    return s;
+    assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of range invalid!"));
+
+    return matrix_range<MatrixType>(A, r1, r2);
   }
 
-  template<typename MatrixType>
-  std::ostream & operator<<(std::ostream & s, matrix_range<const MatrixType> const & proxy)
+
+  template <typename MatrixType>
+  matrix_range<MatrixType> project(matrix_range<MatrixType> & A, viennacl::range const & r1, viennacl::range const & r2)
   {
-    MatrixType temp = proxy;
-    s << temp;
-    return s;
+    assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of range invalid!"));
+
+    return matrix_range<MatrixType>(A,
+                                    viennacl::range(A.start1() + r1.start(), A.start1() + r1.start() + r1.size()),
+                                    viennacl::range(A.start2() + r2.start(), A.start2() + r2.start() + r2.size())
+                                   );
+  }
+
+
+
+
+//
+//
+//
+/////////////////////////////// Slice /////////////////////////////////////////////
+//
+//
+//
+
+
+
+
+
+  /** @brief Class for representing strided submatrices of a bigger matrix A.
+    *
+    * In MATLAB notation, this could for example refer to the submatrix A(3:2:8, 6:3:16) of a matrix A.
+    */
+  template <typename MatrixType>
+  class matrix_slice : public matrix_base<typename MatrixType::cpu_value_type, typename MatrixType::orientation_functor>
+  {
+      typedef matrix_base<typename MatrixType::cpu_value_type,
+                          typename MatrixType::orientation_functor>    base_type;
+      typedef matrix_slice<MatrixType>                                 self_type;
+
+    public:
+      typedef typename MatrixType::orientation_category       orientation_category;
+
+      typedef typename MatrixType::value_type     value_type;
+      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
+      typedef range::size_type                    size_type;
+      typedef range::difference_type              difference_type;
+      typedef value_type                          reference;
+      typedef const value_type &                  const_reference;
+
+      matrix_slice(MatrixType & A,
+                   slice const & row_slice,
+                   slice const & col_slice) : base_type(A.handle(),
+                                                        row_slice.size(), row_slice.start(), row_slice.stride(), A.internal_size1(),
+                                                        col_slice.size(), col_slice.start(), col_slice.stride(), A.internal_size2()) {}
+
+      using base_type::operator=;
+
+  };
+
+
+
+  /////////////////////////////////////////////////////////////
+  ///////////////////////// CPU to GPU ////////////////////////
+  /////////////////////////////////////////////////////////////
+
+  //row_major:
+  template <typename CPU_MATRIX, typename SCALARTYPE>
+  void copy(const CPU_MATRIX & cpu_matrix,
+            matrix_slice<matrix<SCALARTYPE, row_major, 1> > & gpu_matrix_slice )
+  {
+    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
+           && (cpu_matrix.size2() == gpu_matrix_slice.size2())
+           && bool("Matrix size mismatch!"));
+
+     if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
+     {
+       vcl_size_t num_entries = gpu_matrix_slice.size2() * gpu_matrix_slice.stride2(); //no. of entries per stride
+
+       std::vector<SCALARTYPE> entries(num_entries);
+
+       //copy each stride separately:
+       for (vcl_size_t i=0; i < gpu_matrix_slice.size1(); ++i)
+       {
+         vcl_size_t start_offset = (gpu_matrix_slice.start1() + i * gpu_matrix_slice.stride1()) * gpu_matrix_slice.internal_size2() + gpu_matrix_slice.start2();
+         viennacl::backend::memory_read(gpu_matrix_slice.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+
+         for (vcl_size_t j=0; j < gpu_matrix_slice.size2(); ++j)
+           entries[j * gpu_matrix_slice.stride2()] = cpu_matrix(i,j);
+
+         viennacl::backend::memory_write(gpu_matrix_slice.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+       }
+     }
+  }
+
+  //column_major:
+  template <typename CPU_MATRIX, typename SCALARTYPE>
+  void copy(const CPU_MATRIX & cpu_matrix,
+            matrix_slice<matrix<SCALARTYPE, column_major, 1> > & gpu_matrix_slice )
+  {
+    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
+           && (cpu_matrix.size2() == gpu_matrix_slice.size2())
+           && bool("Matrix size mismatch!"));
+
+
+    if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
+    {
+      vcl_size_t num_entries = gpu_matrix_slice.size1() * gpu_matrix_slice.stride1(); //no. of entries per stride
+
+      std::vector<SCALARTYPE> entries(num_entries);
+
+      //copy each column stride separately:
+      for (vcl_size_t j=0; j < gpu_matrix_slice.size2(); ++j)
+      {
+        vcl_size_t start_offset = gpu_matrix_slice.start1() + (gpu_matrix_slice.start2() + j * gpu_matrix_slice.stride2()) * gpu_matrix_slice.internal_size1();
+
+        viennacl::backend::memory_read(gpu_matrix_slice.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+
+        for (vcl_size_t i=0; i < gpu_matrix_slice.size1(); ++i)
+          entries[i * gpu_matrix_slice.stride1()] = cpu_matrix(i,j);
+
+        viennacl::backend::memory_write(gpu_matrix_slice.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+      }
+    }
+
+  }
+
+
+  /////////////////////////////////////////////////////////////
+  ///////////////////////// GPU to CPU ////////////////////////
+  /////////////////////////////////////////////////////////////
+
+
+  //row_major:
+  template <typename CPU_MATRIX, typename SCALARTYPE>
+  void copy(matrix_slice<matrix<SCALARTYPE, row_major, 1> > const & gpu_matrix_slice,
+            CPU_MATRIX & cpu_matrix)
+  {
+    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
+           && (cpu_matrix.size2() == gpu_matrix_slice.size2())
+           && bool("Matrix size mismatch!"));
+
+     if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
+     {
+       vcl_size_t num_entries = gpu_matrix_slice.size2() * gpu_matrix_slice.stride2(); //no. of entries per stride
+
+       std::vector<SCALARTYPE> entries(num_entries);
+
+       //copy each stride separately:
+       for (vcl_size_t i=0; i < gpu_matrix_slice.size1(); ++i)
+       {
+         vcl_size_t start_offset = (gpu_matrix_slice.start1() + i * gpu_matrix_slice.stride1()) * gpu_matrix_slice.internal_size2() + gpu_matrix_slice.start2();
+
+         viennacl::backend::memory_read(gpu_matrix_slice.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+
+         for (vcl_size_t j=0; j < gpu_matrix_slice.size2(); ++j)
+           cpu_matrix(i,j) = entries[j * gpu_matrix_slice.stride2()];
+       }
+     }
+
+  }
+
+
+  //column_major:
+  template <typename CPU_MATRIX, typename SCALARTYPE>
+  void copy(matrix_slice<matrix<SCALARTYPE, column_major, 1> > const & gpu_matrix_slice,
+            CPU_MATRIX & cpu_matrix)
+  {
+    assert( (cpu_matrix.size1() == gpu_matrix_slice.size1())
+           && (cpu_matrix.size2() == gpu_matrix_slice.size2())
+           && bool("Matrix size mismatch!"));
+
+    if ( (gpu_matrix_slice.size1() > 0) && (gpu_matrix_slice.size2() > 0) )
+    {
+      vcl_size_t num_entries = gpu_matrix_slice.size1() * gpu_matrix_slice.stride1(); //no. of entries per stride
+
+      std::vector<SCALARTYPE> entries(num_entries);
+
+      //copy each column stride separately:
+      for (vcl_size_t j=0; j < gpu_matrix_slice.size2(); ++j)
+      {
+        vcl_size_t start_offset = gpu_matrix_slice.start1() + (gpu_matrix_slice.start2() + j * gpu_matrix_slice.stride2()) * gpu_matrix_slice.internal_size1();
+
+        viennacl::backend::memory_read(gpu_matrix_slice.handle(), sizeof(SCALARTYPE)*start_offset, sizeof(SCALARTYPE)*num_entries, &(entries[0]));
+
+        for (vcl_size_t i=0; i < gpu_matrix_slice.size1(); ++i)
+          cpu_matrix(i,j) = entries[i * gpu_matrix_slice.stride1()];
+      }
+    }
+
   }
 
 
@@ -512,25 +450,37 @@ namespace viennacl
   // Convenience function
   //
   template <typename MatrixType>
-  matrix_range<MatrixType> project(MatrixType & A, viennacl::range const & r1, viennacl::range const & r2)
+  matrix_slice<MatrixType> project(MatrixType & A, viennacl::slice const & r1, viennacl::slice const & r2)
   {
-    return matrix_range<MatrixType>(A, r1, r2);
+    assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of slice invalid!"));
+
+    return matrix_slice<MatrixType>(A, r1, r2);
   }
 
-  /*template <typename MatrixType>
-  matrix_range<MatrixType> project(MatrixType const & A, viennacl::range const & r1, viennacl::range const & r2)
+  template <typename MatrixType>
+  matrix_slice<MatrixType> project(matrix_range<MatrixType> & A, viennacl::slice const & r1, viennacl::slice const & r2)
   {
-    return matrix_range<MatrixType>(A, r1, r2);
-  }*/
+    assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of slice invalid!"));
 
-  //TODO: Think about const-matrix...
-  /*template <typename MatrixType>
-  matrix_range<const MatrixType> project(MatrixType const & A, viennacl::range const & r1, viennacl::range const & r2)
+    return matrix_slice<MatrixType>(A,
+                                    viennacl::slice(A.start1() + r1.start(), r1.stride(), r1.size()),
+                                    viennacl::slice(A.start2() + r2.start(), r2.stride(), r2.size())
+                                   );
+  }
+
+  template <typename MatrixType>
+  matrix_slice<MatrixType> project(matrix_slice<MatrixType> & A, viennacl::slice const & r1, viennacl::slice const & r2)
   {
-    return matrix_range<MatrixType>(A, r1, r2);
-  }*/
+    assert(r1.size() <= A.size1() && r2.size() <= A.size2() && bool("Size of slice invalid!"));
+
+    return matrix_slice<MatrixType>(A,
+                                    viennacl::slice(A.start1() + r1.start(), A.stride1() * r1.stride(), r1.size()),
+                                    viennacl::slice(A.start2() + r2.start(), A.stride2() * r2.stride(), r2.size())
+                                   );
+  }
 
+  // TODO: Allow mix of range/slice
 
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/meta/enable_if.hpp b/viennacl/meta/enable_if.hpp
index b301f9d..f105d36 100644
--- a/viennacl/meta/enable_if.hpp
+++ b/viennacl/meta/enable_if.hpp
@@ -1,48 +1,42 @@
-#ifndef VIENNACL_META_ENABLE_IF_HPP_
-#define VIENNACL_META_ENABLE_IF_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file enable_if.hpp
-    @brief Simple enable-if variant that uses the SFINAE pattern
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-    /** @brief Simple enable-if variant that uses the SFINAE pattern */
-    template <bool b, class T = void> 
-    struct enable_if
-    {
-      typedef T   type;
-    };
-
-    template <class T> 
-    struct enable_if<false, T> {};
-
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_META_ENABLE_IF_HPP_
+#define VIENNACL_META_ENABLE_IF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/meta/enable_if.hpp
+    @brief Simple enable-if variant that uses the SFINAE pattern
+*/
+
+namespace viennacl
+{
+    /** @brief Simple enable-if variant that uses the SFINAE pattern */
+    template <bool b, class T = void>
+    struct enable_if
+    {
+      typedef T   type;
+    };
+
+    /** \cond */
+    template <class T>
+    struct enable_if<false, T> {};
+    /** \endcond */
+
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/meta/predicate.hpp b/viennacl/meta/predicate.hpp
index 88029b0..fac514e 100644
--- a/viennacl/meta/predicate.hpp
+++ b/viennacl/meta/predicate.hpp
@@ -1,115 +1,511 @@
-#ifndef VIENNACL_META_PREDICATE_HPP_
-#define VIENNACL_META_PREDICATE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file predicate.hpp
-    @brief All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-namespace viennacl
-{
-    //
-    // is_cpu_scalar: checks for float or double
-    //
-    template <typename T>
-    struct is_cpu_scalar
-    {
-      enum { value = false };
-    };
-  
-    template <>
-    struct is_cpu_scalar<float>
-    {
-      enum { value = true };
-    };
-
-    template <>
-    struct is_cpu_scalar<double>
-    {
-      enum { value = true };
-    };
-    
-    //
-    // is_scalar: checks for viennacl::scalar
-    //
-    template <typename T>
-    struct is_scalar
-    {
-      enum { value = false };
-    };
-  
-    template <typename T>
-    struct is_scalar<viennacl::scalar<T> >
-    {
-      enum { value = true };
-    };
-  
-    //
-    // is_vector
-    //
-    template <typename T>
-    struct is_vector
-    {
-      enum { value = false };
-    };
-
-    template <typename ScalarType, unsigned int ALIGNMENT>
-    struct is_vector<viennacl::vector<ScalarType, ALIGNMENT> >
-    {
-      enum { value = true };
-    };
-
-    template <typename T>
-    struct is_vector<viennacl::vector_range<T> >
-    {
-      enum { value = true };
-    };
-    
-    
-    
-    //
-    // is_matrix
-    //
-    template <typename T>
-    struct is_matrix
-    {
-      enum { value = false };
-    };
-
-    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-    struct is_matrix<viennacl::matrix<ScalarType, F, ALIGNMENT> >
-    {
-      enum { value = true };
-    };
-
-    template <typename T>
-    struct is_matrix<viennacl::matrix_range<T> >
-    {
-      enum { value = true };
-    };
-    
-
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_META_PREDICATE_HPP_
+#define VIENNACL_META_PREDICATE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file predicate.hpp
+    @brief All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include "CL/cl.h"
+#endif
+#endif
+
+namespace viennacl
+{
+
+    //
+    // is_cpu_scalar: checks for float or double
+    //
+    //template <typename T>
+    //struct is_cpu_scalar
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <> struct is_cpu_scalar<char>           { enum { value = true }; };
+    template <> struct is_cpu_scalar<unsigned char>  { enum { value = true }; };
+    template <> struct is_cpu_scalar<short>          { enum { value = true }; };
+    template <> struct is_cpu_scalar<unsigned short> { enum { value = true }; };
+    template <> struct is_cpu_scalar<int>            { enum { value = true }; };
+    template <> struct is_cpu_scalar<unsigned int>   { enum { value = true }; };
+    template <> struct is_cpu_scalar<long>           { enum { value = true }; };
+    template <> struct is_cpu_scalar<unsigned long>  { enum { value = true }; };
+    template <> struct is_cpu_scalar<float>          { enum { value = true }; };
+    template <> struct is_cpu_scalar<double>         { enum { value = true }; };
+    /** \endcond */
+
+
+    //
+    // is_scalar: checks for viennacl::scalar
+    //
+    //template <typename T>
+    //struct is_scalar
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename T>
+    struct is_scalar<viennacl::scalar<T> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_flip_sign_scalar: checks for viennacl::scalar modified with unary operator-
+    //
+    //template <typename T>
+    //struct is_flip_sign_scalar
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename T>
+    struct is_flip_sign_scalar<viennacl::scalar_expression< const scalar<T>,
+                                                            const scalar<T>,
+                                                            op_flip_sign> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_any_scalar: checks for either CPU and GPU scalars, i.e. is_cpu_scalar<>::value || is_scalar<>::value
+    //
+    //template <typename T>
+    //struct is_any_scalar
+    //{
+    //  enum { value = (is_scalar<T>::value || is_cpu_scalar<T>::value || is_flip_sign_scalar<T>::value )};
+    //};
+
+    //
+
+      /** \cond */
+  #define VIENNACL_MAKE_ANY_VECTOR_TRUE(type) template<> struct is_any_vector< type > { enum { value = 1 }; };
+  #define VIENNACL_MAKE_FOR_ALL_SCALARTYPE(type) \
+    VIENNACL_MAKE_ANY_VECTOR_TRUE(type<float>)\
+    VIENNACL_MAKE_ANY_VECTOR_TRUE(type<double>)
+
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::vector)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::vector_range)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::vector_slice)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::unit_vector)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::zero_vector)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::one_vector)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::scalar_vector)
+
+  #undef VIENNACL_MAKE_FOR_ALL_SCALARTYPE
+  #undef VIENNACL_MAKE_ANY_VECTOR_TRUE
+      /** \endcond */
+
+
+      /** \cond */
+  #define VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE)\
+    template<> struct is_any_dense_matrix< TYPE > { enum { value = 1 }; };
+
+  #define VIENNACL_MAKE_FOR_ALL_SCALARTYPE(TYPE) \
+    VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<float>)\
+    VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<double>)
+
+  #define COMMA ,
+  #define VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(TYPE) \
+    VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<float COMMA viennacl::row_major>)\
+    VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<double COMMA viennacl::row_major>)\
+    VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<float COMMA viennacl::column_major>)\
+    VIENNACL_MAKE_ANY_MATRIX_TRUE(TYPE<double COMMA viennacl::column_major>)
+
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(viennacl::matrix)
+//    VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(viennacl::matrix_range)
+//    VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT(viennacl::matrix_slice)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::identity_matrix)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::zero_matrix)
+    VIENNACL_MAKE_FOR_ALL_SCALARTYPE(viennacl::scalar_matrix)
+
+  #undef VIENNACL_MAKE_FOR_ALL_SCALARTYPE_LAYOUT
+  #undef VIENNACL_MAKE_FOR_ALL_SCALARTYPE
+  #undef VIENNACL_MAKE_ANY_MATRIX_TRUE
+      /** \endcond */
+
+    //
+    // is_row_major
+    //
+    //template <typename T>
+    //struct is_row_major
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename ScalarType>
+    struct is_row_major<viennacl::matrix_base<ScalarType, viennacl::row_major> >
+    {
+      enum { value = true };
+    };
+
+    template <>
+    struct is_row_major< viennacl::row_major >
+    {
+      enum { value = true };
+    };
+
+    template <typename T>
+    struct is_row_major<viennacl::matrix_expression<T, T, viennacl::op_trans> >
+    {
+      enum { value = is_row_major<T>::value };
+    };
+    /** \endcond */
+
+
+    //
+    // is_circulant_matrix
+    //
+    //template <typename T>
+    //struct is_circulant_matrix
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_circulant_matrix<viennacl::circulant_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_circulant_matrix<const viennacl::circulant_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_hankel_matrix
+    //
+    //template <typename T>
+    //struct is_hankel_matrix
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_hankel_matrix<viennacl::hankel_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_hankel_matrix<const viennacl::hankel_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_toeplitz_matrix
+    //
+    //template <typename T>
+    //struct is_toeplitz_matrix
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_toeplitz_matrix<viennacl::toeplitz_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_toeplitz_matrix<const viennacl::toeplitz_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_vandermonde_matrix
+    //
+    //template <typename T>
+    //struct is_vandermonde_matrix
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_vandermonde_matrix<viennacl::vandermonde_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_vandermonde_matrix<const viennacl::vandermonde_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+
+    //
+    // is_compressed_matrix
+    //
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_compressed_matrix<viennacl::compressed_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_coordinate_matrix
+    //
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_coordinate_matrix<viennacl::coordinate_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_ell_matrix
+    //
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_ell_matrix<viennacl::ell_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_hyb_matrix
+    //
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_hyb_matrix<viennacl::hyb_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+
+    //
+    // is_any_sparse_matrix
+    //
+    //template <typename T>
+    //struct is_any_sparse_matrix
+    //{
+    //  enum { value = false };
+    //};
+
+    /** \cond */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_any_sparse_matrix<viennacl::compressed_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType>
+    struct is_any_sparse_matrix<viennacl::compressed_compressed_matrix<ScalarType> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_any_sparse_matrix<viennacl::coordinate_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_any_sparse_matrix<viennacl::ell_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    struct is_any_sparse_matrix<viennacl::hyb_matrix<ScalarType, ALIGNMENT> >
+    {
+      enum { value = true };
+    };
+
+    template <typename T>
+    struct is_any_sparse_matrix<const T>
+    {
+      enum { value = is_any_sparse_matrix<T>::value };
+    };
+
+    /** \endcond */
+
+    //////////////// Part 2: Operator predicates ////////////////////
+
+    //
+    // is_addition
+    //
+    /** @brief Helper metafunction for checking whether the provided type is viennacl::op_add (for addition) */
+    template <typename T>
+    struct is_addition
+    {
+      enum { value = false };
+    };
+
+    /** \cond */
+    template <>
+    struct is_addition<viennacl::op_add>
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_subtraction
+    //
+    /** @brief Helper metafunction for checking whether the provided type is viennacl::op_sub (for subtraction) */
+    template <typename T>
+    struct is_subtraction
+    {
+      enum { value = false };
+    };
+
+    /** \cond */
+    template <>
+    struct is_subtraction<viennacl::op_sub>
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_product
+    //
+    /** @brief Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multiplication) */
+    template <typename T>
+    struct is_product
+    {
+      enum { value = false };
+    };
+
+    /** \cond */
+    template <>
+    struct is_product<viennacl::op_prod>
+    {
+      enum { value = true };
+    };
+
+    template <>
+    struct is_product<viennacl::op_mult>
+    {
+      enum { value = true };
+    };
+
+    template <>
+    struct is_product<viennacl::op_element_binary<op_prod> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    //
+    // is_division
+    //
+    /** @brief Helper metafunction for checking whether the provided type is viennacl::op_div (for division) */
+    template <typename T>
+    struct is_division
+    {
+      enum { value = false };
+    };
+
+    /** \cond */
+    template <>
+    struct is_division<viennacl::op_div>
+    {
+      enum { value = true };
+    };
+
+    template <>
+    struct is_division<viennacl::op_element_binary<op_div> >
+    {
+      enum { value = true };
+    };
+    /** \endcond */
+
+    // is_primitive_type
+    //
+
+    /** @brief Helper class for checking whether a type is a primitive type. */
+    template<class T>
+    struct is_primitive_type{ enum {value = false}; };
+
+    /** \cond */
+    template<> struct is_primitive_type<float>         { enum { value = true }; };
+    template<> struct is_primitive_type<double>        { enum { value = true }; };
+    template<> struct is_primitive_type<unsigned int>  { enum { value = true }; };
+    template<> struct is_primitive_type<int>           { enum { value = true }; };
+    template<> struct is_primitive_type<unsigned char> { enum { value = true }; };
+    template<> struct is_primitive_type<char>          { enum { value = true }; };
+    template<> struct is_primitive_type<unsigned long> { enum { value = true }; };
+    template<> struct is_primitive_type<long>          { enum { value = true }; };
+    template<> struct is_primitive_type<unsigned short>{ enum { value = true }; };
+    template<> struct is_primitive_type<short>         { enum { value = true }; };
+    /** \endcond */
+
+#ifdef VIENNACL_WITH_OPENCL
+
+    /** @brief Helper class for checking whether a particular type is a native OpenCL type. */
+    template<class T>
+    struct is_cl_type{ enum { value = false }; };
+
+    /** \cond */
+    template<> struct is_cl_type<cl_float> { enum { value = true }; };
+    template<> struct is_cl_type<cl_double>{ enum { value = true }; };
+    template<> struct is_cl_type<cl_uint>  { enum { value = true }; };
+    template<> struct is_cl_type<cl_int>   { enum { value = true }; };
+    template<> struct is_cl_type<cl_uchar> { enum { value = true }; };
+    template<> struct is_cl_type<cl_char>  { enum { value = true }; };
+    template<> struct is_cl_type<cl_ulong> { enum { value = true }; };
+    template<> struct is_cl_type<cl_long>  { enum { value = true }; };
+    template<> struct is_cl_type<cl_ushort>{ enum { value = true }; };
+    template<> struct is_cl_type<cl_short> { enum { value = true }; };
+    /** \endcond */
+
+#endif
+
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/meta/result_of.hpp b/viennacl/meta/result_of.hpp
index e73a5ab..579c5db 100644
--- a/viennacl/meta/result_of.hpp
+++ b/viennacl/meta/result_of.hpp
@@ -1,219 +1,631 @@
-#ifndef VIENNACL_META_RESULT_OF_HPP_
-#define VIENNACL_META_RESULT_OF_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file result_of.hpp
-    @brief A collection of compile time type deductions
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-    namespace result_of
-    {
-      //
-      // Retrieve size_type 
-      //
-      template <typename T>
-      struct size_type
-      {
-        typedef typename T::size_type   type;
-      };
-
-      #ifdef VIENNACL_HAVE_EIGEN
-      template <class T, int a, int b, int c, int d, int e>
-      struct size_type< Eigen::Matrix<T, a, b, c, d, e> >
-      {
-        typedef std::size_t   type;
-      };
-      
-      template <>
-      struct size_type<Eigen::VectorXf>
-      {
-        typedef std::size_t   type;
-      };
-      
-      template <>
-      struct size_type<Eigen::VectorXd>
-      {
-        typedef std::size_t   type;
-      };
-
-      template <typename T, int options>
-      struct size_type<Eigen::SparseMatrix<T, options> >
-      {
-        typedef std::size_t   type;
-      };
-      #endif
-      
-      //
-      // Retrieve value_type:
-      //
-      template <typename T>
-      struct value_type
-      {
-        typedef typename T::value_type    type; 
-      };
-
-      //
-      // Retrieve cpu value_type:
-      //
-      template <typename T>
-      struct cpu_value_type
-      {
-        typedef typename T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T    type; 
-      };
-
-      template <>
-      struct cpu_value_type<float>
-      {
-        typedef float    type; 
-      };
-      
-      template <>
-      struct cpu_value_type<double>
-      {
-        typedef double    type; 
-      };
-      
-      template <typename T>
-      struct cpu_value_type<viennacl::scalar<T> >
-      {
-        typedef T    type; 
-      };
-
-      template <typename T, unsigned int ALIGNMENT>
-      struct cpu_value_type<viennacl::vector<T, ALIGNMENT> >
-      {
-        typedef T    type; 
-      };
-
-      template <typename T>
-      struct cpu_value_type<viennacl::vector_range<T> >
-      {
-        typedef typename cpu_value_type<T>::type    type; 
-      };
-      
-      template <typename T1, typename T2, typename OP>
-      struct cpu_value_type<viennacl::vector_expression<T1, T2, OP> >
-      {
-        typedef typename cpu_value_type<T1>::type    type; 
-      };
-      
-      
-      
-      template <typename T, typename F, unsigned int ALIGNMENT>
-      struct cpu_value_type<viennacl::matrix<T, F, ALIGNMENT> >
-      {
-        typedef T    type; 
-      };
-      
-      template <typename T>
-      struct cpu_value_type<viennacl::matrix_range<T> >
-      {
-        typedef typename cpu_value_type<T>::type    type; 
-      };
-
-      template <typename T1, typename T2, typename OP>
-      struct cpu_value_type<viennacl::matrix_expression<T1, T2, OP> >
-      {
-        typedef typename cpu_value_type<T1>::type    type; 
-      };
-      
-      
-    #ifdef VIENNACL_HAVE_EIGEN  
-      template <>
-      struct value_type<Eigen::MatrixXf>
-      {
-        typedef Eigen::MatrixXf::RealScalar    type; 
-      };
-      
-      template <>
-      struct value_type<Eigen::MatrixXd>
-      {
-        typedef Eigen::MatrixXd::RealScalar    type; 
-      };
-
-      template <typename ScalarType, int option>
-      struct value_type<Eigen::SparseMatrix<ScalarType, option> >
-      {
-        typedef ScalarType    type; 
-      };
-
-      template <>
-      struct value_type<Eigen::VectorXf>
-      {
-        typedef Eigen::VectorXf::RealScalar    type; 
-      };
-
-      template <>
-      struct value_type<Eigen::VectorXd>
-      {
-        typedef Eigen::VectorXd::RealScalar    type; 
-      };
-      
-    #endif
-      
-      
-      
-      template <typename T>
-      struct matrix_expression_internal_storage
-      {
-        typedef T &     type;
-      };
-     
-      template <>
-      struct matrix_expression_internal_storage<const float>
-      {
-        typedef float type;
-      };
-      
-      template <>
-      struct matrix_expression_internal_storage<const double>
-      {
-        typedef double type;
-      };
-      
-      
-    } //namespace result_of
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_META_RESULT_OF_HPP_
+#define VIENNACL_META_RESULT_OF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/meta/result_of.hpp
+    @brief A collection of compile time type deductions
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+
+
+#ifdef VIENNACL_WITH_UBLAS
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+#include <Eigen/Core>
+#include <Eigen/Sparse>
+#endif
+
+#ifdef VIENNACL_WITH_MTL4
+#include <boost/numeric/mtl/mtl.hpp>
+#endif
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include "CL/cl.h"
+#endif
+#endif
+
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+    namespace result_of
+    {
+      //
+      // Retrieve alignment from vector
+      //
+      /** @brief Retrieves the alignment from a vector. Deprecated - will be replaced by a pure runtime facility in the future. */
+      template <typename T>
+      struct alignment
+      {
+        typedef typename T::ERROR_ARGUMENT_PROVIDED_IS_NOT_A_VECTOR_OR_A_MATRIX   error_type;
+        enum { value = 1 };
+      };
+
+      /** \cond */
+      template <typename T>
+      struct alignment<const T>
+      {
+        enum { value = alignment<T>::value };
+      };
+
+      template <typename SCALARTYPE, unsigned int ALIGNMENT>
+      struct alignment< vector<SCALARTYPE, ALIGNMENT> >
+      {
+        enum { value = ALIGNMENT };
+      };
+
+      template <typename T>
+      struct alignment< vector_range<T> >
+      {
+        enum { value = alignment<T>::value };
+      };
+
+      template <typename T>
+      struct alignment< vector_slice<T> >
+      {
+        enum { value = alignment<T>::value };
+      };
+
+      // support for a*x with scalar a and vector x
+      template <typename LHS, typename RHS, typename OP>
+      struct alignment< vector_expression<LHS, RHS, OP> >
+      {
+        enum { value = alignment<LHS>::value };
+      };
+
+
+      // Matrices
+      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+      struct alignment< matrix<SCALARTYPE, F, ALIGNMENT> >
+      {
+        enum { value = ALIGNMENT };
+      };
+
+      template <typename T>
+      struct alignment< matrix_range<T> >
+      {
+        enum { value = alignment<T>::value };
+      };
+
+      template <typename T>
+      struct alignment< matrix_slice<T> >
+      {
+        enum { value = alignment<T>::value };
+      };
+
+      template <typename LHS, typename RHS>
+      struct alignment< matrix_expression<LHS, RHS, op_trans> >
+      {
+        enum { value = alignment<LHS>::value };
+      };
+      /** \endcond */
+
+      //
+      // Majority specifier for matrices (row_major, column_major)
+      //
+      /** @brief Returns the orientation functor tag (either row_major or column_major) of a matrix */
+      template <typename T>
+      struct orientation_functor
+      {
+        typedef typename T::ERROR_ARGUMENT_PROVIDED_IS_NOT_A_MATRIX     type;
+      };
+
+      /** \cond */
+      template <typename T>
+      struct orientation_functor<const T>
+      {
+        typedef typename orientation_functor<T>::type  type;
+      };
+
+      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+      struct orientation_functor< matrix<SCALARTYPE, F, ALIGNMENT> >
+      {
+        typedef F     type;
+      };
+
+      template <typename T>
+      struct orientation_functor< matrix_range<T> >
+      {
+        typedef typename orientation_functor<T>::type  type;
+      };
+
+      template <typename T>
+      struct orientation_functor< matrix_slice<T> >
+      {
+        typedef typename orientation_functor<T>::type  type;
+      };
+
+      template <typename SCALARTYPE, typename F>
+      struct orientation_functor< matrix_base<SCALARTYPE, F> >
+      {
+        typedef F     type;
+      };
+
+      template <typename LHS, typename RHS>
+      struct orientation_functor< matrix_expression<LHS, RHS, op_trans> >
+      {
+        typedef typename orientation_functor<LHS>::type  type;
+      };
+      /** \endcond */
+
+
+      //
+      // Retrieve size_type
+      //
+      /** @brief Generic meta-function for retrieving the size_type associated with type T */
+      template <typename T>
+      struct size_type
+      {
+        typedef typename T::size_type   type;
+      };
+
+      /** \cond */
+      template <typename T, typename SizeType>
+      struct size_type< vector_base<T, SizeType> >
+      {
+        typedef SizeType   type;
+      };
+
+      #ifdef VIENNACL_WITH_EIGEN
+      template <class T, int a, int b, int c, int d, int e>
+      struct size_type< Eigen::Matrix<T, a, b, c, d, e> >
+      {
+        typedef vcl_size_t   type;
+      };
+
+      template <>
+      struct size_type<Eigen::VectorXf>
+      {
+        typedef vcl_size_t   type;
+      };
+
+      template <>
+      struct size_type<Eigen::VectorXd>
+      {
+        typedef vcl_size_t   type;
+      };
+
+      template <typename T, int options>
+      struct size_type<Eigen::SparseMatrix<T, options> >
+      {
+        typedef vcl_size_t   type;
+      };
+      #endif
+      /** \endcond */
+
+      //
+      // Retrieve value_type:
+      //
+      /** @brief Generic helper function for retrieving the value_type associated with type T */
+      template <typename T>
+      struct value_type
+      {
+        typedef typename T::value_type    type;
+      };
+
+      /** \cond */
+#ifdef VIENNACL_WITH_EIGEN
+      template <>
+      struct value_type<Eigen::MatrixXf>
+      {
+        typedef Eigen::MatrixXf::RealScalar    type;
+      };
+
+      template <>
+      struct value_type<Eigen::MatrixXd>
+      {
+        typedef Eigen::MatrixXd::RealScalar    type;
+      };
+
+      template <typename ScalarType, int option>
+      struct value_type<Eigen::SparseMatrix<ScalarType, option> >
+      {
+        typedef ScalarType    type;
+      };
+
+      template <>
+      struct value_type<Eigen::VectorXf>
+      {
+        typedef Eigen::VectorXf::RealScalar    type;
+      };
+
+      template <>
+      struct value_type<Eigen::VectorXd>
+      {
+        typedef Eigen::VectorXd::RealScalar    type;
+      };
+
+#endif
+      /** \endcond */
+
+
+      //
+      // Retrieve cpu value_type:
+      //
+      /** @brief Helper meta function for retrieving the main RAM-based value type. Particularly important to obtain T from viennacl::scalar<T> in a generic way. */
+      template <typename T>
+      struct cpu_value_type
+      {
+        typedef typename T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T    type;
+      };
+
+      /** \cond */
+      template <typename T>
+      struct cpu_value_type<const T>
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <>
+      struct cpu_value_type<char>
+      {
+        typedef char    type;
+      };
+
+      template <>
+      struct cpu_value_type<unsigned char>
+      {
+        typedef unsigned char    type;
+      };
+
+      template <>
+      struct cpu_value_type<short>
+      {
+        typedef short    type;
+      };
+
+      template <>
+      struct cpu_value_type<unsigned short>
+      {
+        typedef unsigned short    type;
+      };
+
+      template <>
+      struct cpu_value_type<int>
+      {
+        typedef int    type;
+      };
+
+      template <>
+      struct cpu_value_type<unsigned int>
+      {
+        typedef unsigned int    type;
+      };
+
+      template <>
+      struct cpu_value_type<long>
+      {
+        typedef long    type;
+      };
+
+      template <>
+      struct cpu_value_type<unsigned long>
+      {
+        typedef unsigned long    type;
+      };
+
+
+      template <>
+      struct cpu_value_type<float>
+      {
+        typedef float    type;
+      };
+
+      template <>
+      struct cpu_value_type<double>
+      {
+        typedef double    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::scalar<T> >
+      {
+        typedef T    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::vector_base<T> >
+      {
+        typedef T    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::implicit_vector_base<T> >
+      {
+        typedef T    type;
+      };
+
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::vector<T, ALIGNMENT> >
+      {
+        typedef T    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::vector_range<T> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::vector_slice<T> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T1, typename T2, typename OP>
+      struct cpu_value_type<viennacl::vector_expression<const T1, const T2, OP> >
+      {
+        typedef typename cpu_value_type<T1>::type    type;
+      };
+
+      template <typename T1, typename T2, typename OP>
+      struct cpu_value_type<const viennacl::vector_expression<const T1, const T2, OP> >
+      {
+        typedef typename cpu_value_type<T1>::type    type;
+      };
+
+
+      template <typename T, typename F>
+      struct cpu_value_type<viennacl::matrix_base<T, F> >
+      {
+        typedef T    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::implicit_matrix_base<T> >
+      {
+        typedef T    type;
+      };
+
+
+      template <typename T, typename F, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::matrix<T, F, ALIGNMENT> >
+      {
+        typedef T    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::matrix_range<T> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::matrix_slice<T> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::compressed_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T>
+      struct cpu_value_type<viennacl::compressed_compressed_matrix<T> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::coordinate_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::ell_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::hyb_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::circulant_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::hankel_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::toeplitz_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T, unsigned int ALIGNMENT>
+      struct cpu_value_type<viennacl::vandermonde_matrix<T, ALIGNMENT> >
+      {
+        typedef typename cpu_value_type<T>::type    type;
+      };
+
+      template <typename T1, typename T2, typename OP>
+      struct cpu_value_type<viennacl::matrix_expression<T1, T2, OP> >
+      {
+        typedef typename cpu_value_type<T1>::type    type;
+      };
+
+
+      //
+      // Deduce compatible vector type for a matrix type
+      //
+
+      template <typename T>
+      struct vector_for_matrix
+      {
+        typedef typename T::ERROR_CANNOT_DEDUCE_VECTOR_FOR_MATRIX_TYPE   type;
+      };
+
+      //ViennaCL
+      template <typename T, typename F, unsigned int A>
+      struct vector_for_matrix< viennacl::matrix<T, F, A> >
+      {
+        typedef viennacl::vector<T,A>   type;
+      };
+
+      template <typename T, unsigned int A>
+      struct vector_for_matrix< viennacl::compressed_matrix<T, A> >
+      {
+        typedef viennacl::vector<T,A>   type;
+      };
+
+      template <typename T, unsigned int A>
+      struct vector_for_matrix< viennacl::coordinate_matrix<T, A> >
+      {
+        typedef viennacl::vector<T,A>   type;
+      };
+
+      #ifdef VIENNACL_WITH_UBLAS
+      //Boost:
+      template <typename T, typename F, typename A>
+      struct vector_for_matrix< boost::numeric::ublas::matrix<T, F, A> >
+      {
+        typedef boost::numeric::ublas::vector<T>   type;
+      };
+
+      template <typename T, typename U, vcl_size_t A, typename B, typename C>
+      struct vector_for_matrix< boost::numeric::ublas::compressed_matrix<T, U, A, B, C> >
+      {
+        typedef boost::numeric::ublas::vector<T>   type;
+      };
+
+      template <typename T, typename U, vcl_size_t A, typename B, typename C>
+      struct vector_for_matrix< boost::numeric::ublas::coordinate_matrix<T, U, A, B, C> >
+      {
+        typedef boost::numeric::ublas::vector<T>   type;
+      };
+      #endif
+
+
+      template <typename T>
+      struct reference_if_nonscalar
+      {
+        typedef T &    type;
+      };
+
+#define VIENNACL_REFERENCE_IF_NONSCALAR_INT(TNAME) \
+      template <> struct reference_if_nonscalar<TNAME>                { typedef                TNAME  type; }; \
+      template <> struct reference_if_nonscalar<const TNAME>          { typedef          const TNAME  type; }; \
+      template <> struct reference_if_nonscalar<unsigned TNAME>       { typedef       unsigned TNAME  type; }; \
+      template <> struct reference_if_nonscalar<const unsigned TNAME> { typedef const unsigned TNAME  type; };
+
+      VIENNACL_REFERENCE_IF_NONSCALAR_INT(char)
+      VIENNACL_REFERENCE_IF_NONSCALAR_INT(short)
+      VIENNACL_REFERENCE_IF_NONSCALAR_INT(int)
+      VIENNACL_REFERENCE_IF_NONSCALAR_INT(long)
+
+#undef VIENNACL_REFERENCE_IF_NONSCALAR_INT
+
+      template <>
+      struct reference_if_nonscalar<float>
+      {
+        typedef float    type;
+      };
+
+      template <>
+      struct reference_if_nonscalar<const float>
+      {
+        typedef const float    type;
+      };
+
+      template <>
+      struct reference_if_nonscalar<double>
+      {
+        typedef double    type;
+      };
+
+      template <>
+      struct reference_if_nonscalar<const double>
+      {
+        typedef const double    type;
+      };
+
+      /** \endcond */
+
+      //OpenCL equivalent type
+      /** @brief Metafunction for deducing the OpenCL type for a numeric type, e.g. float -> cl_float */
+      template<typename T>
+      struct cl_type
+      {
+          typedef T type;
+      };
+
+      /** \cond */
+#ifdef VIENNACL_WITH_OPENCL
+      template<>
+      struct cl_type<float>{ typedef cl_float type; };
+
+      template<>
+      struct cl_type<double>{ typedef cl_double type; };
+
+      template<>
+      struct cl_type<int>{ typedef cl_int type; };
+
+      template<>
+      struct cl_type<unsigned int>{  typedef cl_uint type; };
+
+      template<>
+      struct cl_type<long>{  typedef cl_long type;  };
+
+      template<>
+      struct cl_type<unsigned long>{ typedef cl_ulong type; };
+
+      template<>
+      struct cl_type<short>{ typedef cl_short type;  };
+
+      template<>
+      struct cl_type<unsigned short>{ typedef cl_ushort type; };
+
+      template<>
+      struct cl_type<char>{ typedef cl_char type; };
+
+      template<>
+      struct cl_type<unsigned char>{ typedef cl_uchar type; };
+#endif
+      /** \endcond */
+
+    } //namespace result_of
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/meta/tag_of.hpp b/viennacl/meta/tag_of.hpp
index 85e833c..8329e61 100644
--- a/viennacl/meta/tag_of.hpp
+++ b/viennacl/meta/tag_of.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_META_TAGOF_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -22,18 +23,23 @@
     @brief Dispatch facility for distinguishing between ublas, STL and ViennaCL types
 */
 
-#ifdef VIENNACL_HAVE_UBLAS  
+#include <vector>
+#include <map>
+
+#include "viennacl/forwards.h"
+
+#ifdef VIENNACL_WITH_UBLAS
 #include <boost/numeric/ublas/matrix_sparse.hpp>
 #include <boost/numeric/ublas/matrix.hpp>
 #include <boost/numeric/ublas/vector.hpp>
 #endif
 
-#ifdef VIENNACL_HAVE_EIGEN  
+#ifdef VIENNACL_WITH_EIGEN
 #include <Eigen/Core>
 #include <Eigen/Sparse>
 #endif
 
-#ifdef VIENNACL_HAVE_MTL4
+#ifdef VIENNACL_WITH_MTL4
 #include <boost/numeric/mtl/mtl.hpp>
 #endif
 
@@ -43,11 +49,17 @@ namespace viennacl
   // ----------------------------------------------------
   // TAGS
   //
+  /** @brief A tag class for identifying 'unknown' types. */
   struct tag_none     {};
+  /** @brief A tag class for identifying types from MTL4. */
   struct tag_mtl4     {};
+  /** @brief A tag class for identifying types from Eigen. */
   struct tag_eigen    {};
+  /** @brief A tag class for identifying types from uBLAS. */
   struct tag_ublas    {};
+  /** @brief A tag class for identifying types from the C++ STL. */
   struct tag_stl      {};
+  /** @brief A tag class for identifying types from ViennaCL. */
   struct tag_viennacl {};
 
   namespace traits
@@ -56,7 +68,7 @@ namespace viennacl
     // GENERIC BASE
     //
     /** @brief Generic base for wrapping other linear algebra packages
-    * 
+    *
     *  Maps types to tags, e.g. viennacl::vector to tag_viennacl, ublas::vector to tag_ublas
     *  if the matrix type is unknown, tag_none is returned
     *
@@ -66,14 +78,15 @@ namespace viennacl
     */
     template< typename T, typename Active = void >
     struct tag_of;
-   
+
+    /** \cond */
     template < typename Sequence, typename Active >
     struct tag_of
     {
       typedef viennacl::tag_none  type;
     };
-    
-    #ifdef VIENNACL_HAVE_MTL4
+
+    #ifdef VIENNACL_WITH_MTL4
     // ----------------------------------------------------
     // MTL4
     //
@@ -95,9 +108,9 @@ namespace viennacl
       typedef viennacl::tag_mtl4  type;
     };
     #endif
-    
-    
-    #ifdef VIENNACL_HAVE_EIGEN
+
+
+    #ifdef VIENNACL_WITH_EIGEN
     // ----------------------------------------------------
     // Eigen
     //
@@ -130,10 +143,10 @@ namespace viennacl
     {
       typedef viennacl::tag_eigen  type;
     };
-    
+
     #endif
-    
-    #ifdef VIENNACL_HAVE_UBLAS
+
+    #ifdef VIENNACL_WITH_UBLAS
     // ----------------------------------------------------
     // UBLAS
     //
@@ -160,13 +173,13 @@ namespace viennacl
     {
       typedef viennacl::tag_ublas  type;
     };
-    
+
     #endif
 
     // ----------------------------------------------------
     // STL types
     //
-    
+
     //vector
     template< typename T, typename A >
     struct tag_of< std::vector<T, A> >
@@ -187,8 +200,8 @@ namespace viennacl
     {
       typedef viennacl::tag_stl  type;
     };
-    
-    
+
+
     // ----------------------------------------------------
     // VIENNACL
     //
@@ -215,7 +228,7 @@ namespace viennacl
     {
       typedef viennacl::tag_viennacl  type;
     };
-    
+
     template< typename T, unsigned int I>
     struct tag_of< viennacl::compressed_matrix<T,I> >
     {
@@ -227,7 +240,19 @@ namespace viennacl
     {
       typedef viennacl::tag_viennacl  type;
     };
-    
+
+    template< typename T, unsigned int I>
+    struct tag_of< viennacl::ell_matrix<T,I> >
+    {
+      typedef viennacl::tag_viennacl  type;
+    };
+
+    template< typename T, unsigned int I>
+    struct tag_of< viennacl::hyb_matrix<T,I> >
+    {
+      typedef viennacl::tag_viennacl  type;
+    };
+
     template< typename T, unsigned int I>
     struct tag_of< viennacl::circulant_matrix<T,I> >
     {
@@ -239,90 +264,98 @@ namespace viennacl
     {
       typedef viennacl::tag_viennacl  type;
     };
-    
+
     template< typename T, unsigned int I>
     struct tag_of< viennacl::toeplitz_matrix<T,I> >
     {
       typedef viennacl::tag_viennacl  type;
     };
-    
+
     template< typename T, unsigned int I>
     struct tag_of< viennacl::vandermonde_matrix<T,I> >
     {
       typedef viennacl::tag_viennacl  type;
     };
-    
-    
+    /** \endcond */
+
     // ----------------------------------------------------
   } // end namespace traits
 
 
-  /** @brief Meta function which checks whether a tag is tag_mtl4 
+  /** @brief Meta function which checks whether a tag is tag_mtl4
   *
   *  This is an internal function only, there is no need for a library user of ViennaCL to care about it any further
   */
   template <typename Tag>
   struct is_mtl4
   {
-     enum { value = false };  
+     enum { value = false };
   };
 
+  /** \cond */
   template <>
   struct is_mtl4< viennacl::tag_mtl4 >
   {
-     enum { value = true };  
+     enum { value = true };
   };
+  /** \endcond */
 
-  /** @brief Meta function which checks whether a tag is tag_eigen 
+  /** @brief Meta function which checks whether a tag is tag_eigen
   *
   *  This is an internal function only, there is no need for a library user of ViennaCL to care about it any further
   */
   template <typename Tag>
   struct is_eigen
   {
-     enum { value = false };  
+     enum { value = false };
   };
 
+  /** \cond */
   template <>
   struct is_eigen< viennacl::tag_eigen >
   {
-     enum { value = true };  
+     enum { value = true };
   };
+  /** \endcond */
 
 
-  /** @brief Meta function which checks whether a tag is tag_ublas 
+  /** @brief Meta function which checks whether a tag is tag_ublas
   *
   *  This is an internal function only, there is no need for a library user of ViennaCL to care about it any further
   */
   template <typename Tag>
   struct is_ublas
   {
-     enum { value = false };  
+     enum { value = false };
   };
 
+  /** \cond */
   template <>
   struct is_ublas< viennacl::tag_ublas >
   {
-     enum { value = true };  
+     enum { value = true };
   };
+  /** \endcond */
 
-  /** @brief Meta function which checks whether a tag is tag_ublas 
+  /** @brief Meta function which checks whether a tag is tag_ublas
   *
   *  This is an internal function only, there is no need for a library user of ViennaCL to care about it any further
   */
   template <typename Tag>
   struct is_stl
   {
-     enum { value = false };  
+     enum { value = false };
   };
 
+  /** \cond */
   template <>
   struct is_stl< viennacl::tag_stl >
   {
-     enum { value = true };  
+     enum { value = true };
   };
+  /** \endcond */
+
 
-  
   /** @brief Meta function which checks whether a tag is tag_viennacl
   *
   *  This is an internal function only, there is no need for a library user of ViennaCL to care about it any further
@@ -330,14 +363,16 @@ namespace viennacl
   template <typename Tag>
   struct is_viennacl
   {
-     enum { value = false };  
+     enum { value = false };
   };
 
+  /** \cond */
   template <>
   struct is_viennacl< viennacl::tag_viennacl >
   {
-     enum { value = true };  
+     enum { value = true };
   };
+  /** \endcond */
 
 } // end namespace viennacl
 
diff --git a/viennacl/misc/bandwidth_reduction.hpp b/viennacl/misc/bandwidth_reduction.hpp
index be237b8..8670d23 100644
--- a/viennacl/misc/bandwidth_reduction.hpp
+++ b/viennacl/misc/bandwidth_reduction.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_MISC_BANDWIDTH_REDUCTION_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,7 +20,7 @@
 
 
 /** @file viennacl/misc/bandwidth_reduction.hpp
-    @brief Convenience include for bandwidth reduction algorithms such as Cuthill-McKee or Gibbs-Poole-Stockmeyer.  Experimental in 1.2.x.
+    @brief Convenience include for bandwidth reduction algorithms such as Cuthill-McKee or Gibbs-Poole-Stockmeyer.  Experimental.
 */
 
 #include "viennacl/misc/cuthill_mckee.hpp"
@@ -29,9 +30,9 @@
 namespace viennacl
 {
   //TODO: Add convenience overload here. Which should be default?
-  
-  
+
+
 } //namespace viennacl
-    
+
 
 #endif
diff --git a/viennacl/misc/cuthill_mckee.hpp b/viennacl/misc/cuthill_mckee.hpp
index ef9555d..08258e1 100644
--- a/viennacl/misc/cuthill_mckee.hpp
+++ b/viennacl/misc/cuthill_mckee.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_MISC_CUTHILL_MCKEE_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,12 +20,13 @@
 
 
 /** @file viennacl/misc/cuthill_mckee.hpp
-*    @brief Implementation of several flavors of the Cuthill-McKee algorithm.  Experimental in 1.2.x.
-*    
-*   Contributed by Philipp Grabenweger, interface adjustments by Karl Rupp.
+*    @brief Implementation of several flavors of the Cuthill-McKee algorithm.  Experimental.
+*
+*   Contributed by Philipp Grabenweger, interface adjustments and performance tweaks by Karl Rupp.
 */
 
 #include <iostream>
+#include <iterator>
 #include <fstream>
 #include <string>
 #include <algorithm>
@@ -33,436 +35,574 @@
 #include <deque>
 #include <cmath>
 
+#include "viennacl/forwards.h"
 
 namespace viennacl
 {
-  
+
   namespace detail
   {
-    
+
+    // Calculate the bandwidth of a reordered matrix
+    template <typename IndexT, typename ValueT>
+    IndexT calc_reordered_bw(std::vector< std::map<IndexT, ValueT> > const & matrix,
+                             std::vector<bool> & dof_assigned_to_node,
+                             std::vector<IndexT> const & permutation)
+    {
+      IndexT bw = 0;
+
+      for (vcl_size_t i = 0; i < permutation.size(); i++)
+      {
+        if (!dof_assigned_to_node[i])
+          continue;
+
+        IndexT min_index = static_cast<IndexT>(matrix.size());
+        IndexT max_index = 0;
+        for (typename std::map<IndexT, ValueT>::const_iterator it = matrix[i].begin(); it != matrix[i].end(); it++)
+        {
+          if (!dof_assigned_to_node[it->first])
+            continue;
+
+          if (permutation[it->first] > max_index)
+            max_index = permutation[it->first];
+          if (permutation[it->first] < min_index)
+            min_index = permutation[it->first];
+        }
+        if (max_index > min_index)
+          bw = std::max(bw, max_index - min_index);
+      }
+
+      return bw;
+    }
+
+
+
     // function to calculate the increment of combination comb.
     // parameters:
     // comb: pointer to vector<int> of size m, m <= n
     //       1 <= comb[i] <= n for 0 <= i < m
     //       comb[i] < comb[i+1] for 0 <= i < m - 1
-    //       comb represents an unordered selection of m values out of n
+    //       comb represents an ordered selection of m values out of n
     // n: int
     //    total number of values out of which comb is taken as selection
-    inline bool comb_inc(std::vector<int> & comb, int n)
+    template <typename IndexT>
+    bool comb_inc(std::vector<IndexT> & comb, vcl_size_t n)
     {
-        int m;
-        int k;
-        
-        m = comb.size();
-        // calculate k as highest possible index such that (*comb)[k-1] can be incremented
-        k = m;
-        while ( (k > 0) && ( ((k == m) && (comb[k-1] == n)) || 
-                            ((k < m) && (comb[k-1] == comb[k] - 1) )) )
-        {
-            k--;
-        }
-        if (k == 0) // no further increment of comb possible -> return false
-        {
-            return false;
-        }
-        else
+      IndexT m;
+      IndexT k;
+
+      m = static_cast<IndexT>(comb.size());
+      // calculate k as highest possible index such that (*comb)[k-1] can be incremented
+      k = m;
+      while ( (k > 0) && ( ((k == m) && (comb[k-1] == static_cast<IndexT>(n)-1)) ||
+                           ((k <  m) && (comb[k-1] == comb[k] - 1) )) )
+      {
+        k--;
+      }
+
+      if (k == 0) // no further increment of comb possible -> return false
+        return false;
+
+      comb[k-1] += 1;
+
+      // and all higher index positions of comb are calculated just as directly following integer values
+      // Example (1, 4, 7) -> (1, 5, 6) -> (1, 5, 7) -> (1, 6, 7) -> done   for n=7
+      for (IndexT i = k; i < m; i++)
+        comb[i] = comb[k-1] + (i - k);
+      return true;
+    }
+
+
+    /** @brief Function to generate a node layering as a tree structure
+      *
+      *
+      */
+    // node s
+    template <typename MatrixT, typename IndexT>
+    void generate_layering(MatrixT const & matrix,
+                           std::vector< std::vector<IndexT> > & layer_list)
+    {
+      std::vector<bool> node_visited_already(matrix.size(), false);
+
+      //
+      // Step 1: Set root nodes to visited
+      //
+      for (vcl_size_t i=0; i<layer_list.size(); ++i)
+      {
+        for (typename std::vector<IndexT>::iterator it  = layer_list[i].begin();
+                                                    it != layer_list[i].end();
+                                                    it++)
+          node_visited_already[*it] = true;
+      }
+
+      //
+      // Step 2: Fill next layers
+      //
+      while (layer_list.back().size() > 0)
+      {
+        vcl_size_t layer_index = layer_list.size();  //parent nodes are at layer 0
+        layer_list.push_back(std::vector<IndexT>());
+
+        for (typename std::vector<IndexT>::iterator it  = layer_list[layer_index].begin();
+                                                    it != layer_list[layer_index].end();
+                                                    it++)
         {
-            (comb[k-1])++; // increment (*comb)[k-1],
-            for (int i = k; i < m; i++) // and all higher index positions of comb are 
-            // calculated just as directly following integer values (lowest possible values)
-            {
-                comb[i] = comb[k-1] + (i - k + 1);
-            }
-            return true;
+          for (typename MatrixT::value_type::const_iterator it2  = matrix[*it].begin();
+                                                            it2 != matrix[*it].end();
+                                                            it2++)
+          {
+            if (it2->first == *it) continue;
+            if (node_visited_already[it2->first]) continue;
+
+            layer_list.back().push_back(it2->first);
+            node_visited_already[it2->first] = true;
+          }
         }
+      }
+
+      // remove last (empty) nodelist:
+      layer_list.resize(layer_list.size()-1);
     }
 
 
-    // function to generate a node layering as a tree structure rooted at
-    // node s
+    // function to generate a node layering as a tree structure rooted at node s
     template <typename MatrixType>
-    void generate_layering(MatrixType const & matrix, 
+    void generate_layering(MatrixType const & matrix,
                            std::vector< std::vector<int> > & l,
                            int s)
     {
-      std::size_t n = matrix.size();
+      vcl_size_t n = matrix.size();
       //std::vector< std::vector<int> > l;
       std::vector<bool> inr(n, false);
       std::vector<int> nlist;
-      
+
       nlist.push_back(s);
       inr[s] = true;
       l.push_back(nlist);
-      
+
       for (;;)
       {
           nlist.clear();
-          for (std::vector<int>::iterator it  = l.back().begin(); 
+          for (std::vector<int>::iterator it  = l.back().begin();
                                           it != l.back().end();
                                           it++)
           {
-              for (typename MatrixType::value_type::const_iterator it2  = matrix[*it].begin(); 
+              for (typename MatrixType::value_type::const_iterator it2  = matrix[*it].begin();
                                                          it2 != matrix[*it].end();
                                                          it2++)
               {
                   if (it2->first == *it) continue;
                   if (inr[it2->first]) continue;
-                  
+
                   nlist.push_back(it2->first);
                   inr[it2->first] = true;
               }
           }
-          
+
           if (nlist.size() == 0)
               break;
 
           l.push_back(nlist);
       }
-      
+
+    }
+
+    /** @brief Fills the provided nodelist with all nodes of the same strongly connected component as the nodes in the node_list
+      *
+      *  If more than one node is provided, all nodes should be from the same strongly connected component.
+      */
+    template <typename MatrixT, typename IndexT>
+    void nodes_of_strongly_connected_component(MatrixT const & matrix,
+                                               std::vector<IndexT> & node_list)
+    {
+      std::vector<bool> node_visited_already(matrix.size(), false);
+      std::deque<IndexT> node_queue;
+
+      //
+      // Step 1: Push root nodes to queue:
+      //
+      for (typename std::vector<IndexT>::iterator it  = node_list.begin();
+                                                  it != node_list.end();
+                                                  it++)
+      {
+        node_queue.push_back(*it);
+      }
+      node_list.resize(0);
+
+      //
+      // Step 2: Fill with remaining nodes of strongly connected compontent
+      //
+      while (!node_queue.empty())
+      {
+        IndexT node_id = node_queue.front();
+        node_queue.pop_front();
+
+        if (!node_visited_already[node_id])
+        {
+          node_list.push_back(node_id);
+          node_visited_already[node_id] = true;
+
+          for (typename MatrixT::value_type::const_iterator it  = matrix[node_id].begin();
+                                                            it != matrix[node_id].end();
+                                                            it++)
+          {
+            IndexT neighbor_node_id = it->first;
+            if (neighbor_node_id == node_id) continue;
+            if (node_visited_already[neighbor_node_id]) continue;
+
+            node_queue.push_back(neighbor_node_id);
+          }
+        }
+      }
+
     }
 
-    
-    // comparison function for comparing two vector<int> values by their 
+
+    // comparison function for comparing two vector<int> values by their
     // [1]-element
     inline bool cuthill_mckee_comp_func(std::vector<int> const & a,
                                         std::vector<int> const & b)
     {
-        return (a[1] < b[1]);
+      return (a[1] < b[1]);
     }
-    
-  }
-  
+
+    template <typename IndexT>
+    bool cuthill_mckee_comp_func_pair(std::pair<IndexT, IndexT> const & a,
+                                      std::pair<IndexT, IndexT> const & b)
+    {
+        return (a.second < b.second);
+    }
+
+    /** @brief Runs the Cuthill-McKee algorithm on a strongly connected component of a graph
+      *
+      * @param matrix                  The matrix describing the full graph
+      * @param node_assignment_queue   A queue prepopulated with the root nodes
+      * @param dof_assigned_to_node    Boolean flag array indicating whether a dof got assigned to a certain node
+      * @param permutation             The permutation array to write the result to
+      * @param current_dof             The first dof to be used for assignment
+      *
+      * @return The next free dof available
+      */
+    template <typename IndexT, typename ValueT>
+    vcl_size_t cuthill_mckee_on_strongly_connected_component(std::vector< std::map<IndexT, ValueT> > const & matrix,
+                                                              std::deque<IndexT> & node_assignment_queue,
+                                                              std::vector<bool>  & dof_assigned_to_node,
+                                                              std::vector<IndexT> & permutation,
+                                                              vcl_size_t current_dof)
+    {
+      typedef std::pair<IndexT, IndexT> NodeIdDegreePair; //first member is the node ID, second member is the node degree
+
+      std::vector< NodeIdDegreePair > local_neighbor_nodes(matrix.size());
+
+      while (!node_assignment_queue.empty())
+      {
+        // Grab first node from queue
+        vcl_size_t node_id = node_assignment_queue.front();
+        node_assignment_queue.pop_front();
+
+        // Assign dof if a new dof hasn't been assigned yet
+        if (!dof_assigned_to_node[node_id])
+        {
+          permutation[node_id] = static_cast<IndexT>(current_dof);  //TODO: Invert this!
+          ++current_dof;
+          dof_assigned_to_node[node_id] = true;
+
+          //
+          // Get all neighbors of that node:
+          //
+          vcl_size_t num_neighbors = 0;
+          for (typename std::map<IndexT, ValueT>::const_iterator neighbor_it  = matrix[node_id].begin();
+                                                                 neighbor_it != matrix[node_id].end();
+                                                               ++neighbor_it)
+          {
+            if (!dof_assigned_to_node[neighbor_it->first])
+            {
+              local_neighbor_nodes[num_neighbors] = NodeIdDegreePair(neighbor_it->first, static_cast<IndexT>(matrix[neighbor_it->first].size()));
+              ++num_neighbors;
+            }
+          }
+
+          // Sort neighbors by increasing node degree
+          std::sort(local_neighbor_nodes.begin(), local_neighbor_nodes.begin() + num_neighbors, detail::cuthill_mckee_comp_func_pair<IndexT>);
+
+          // Push neighbors to queue
+          for (vcl_size_t i=0; i<num_neighbors; ++i)
+            node_assignment_queue.push_back(local_neighbor_nodes[i].first);
+
+        } // if node doesn't have a new dof yet
+
+      } // while nodes in queue
+
+      return current_dof;
+
+    }
+
+  } //namespace detail
+
   //
   // Part 1: The original Cuthill-McKee algorithm
   //
-  
-  
+
+  /** @brief A tag class for selecting the Cuthill-McKee algorithm for reducing the bandwidth of a sparse matrix. */
   struct cuthill_mckee_tag {};
-  
+
   /** @brief Function for the calculation of a node number permutation to reduce the bandwidth of an incidence matrix by the Cuthill-McKee algorithm
-   * 
+   *
    * references:
-   *    Algorithm was implemented similary as described in 
+   *    Algorithm was implemented similary as described in
    *      "Tutorial: Bandwidth Reduction - The CutHill-
    *      McKee Algorithm" posted by Ciprian Zavoianu as weblog at
    *    http://ciprian-zavoianu.blogspot.com/2009/01/project-bandwidth-reduction.html
    *    on January 15, 2009
-   *    (URL taken on June 14, 2011) 
-   * 
+   *    (URL taken on June 14, 2011)
+   *
    * @param matrix  vector of n matrix rows, where each row is a map<int, double> containing only the nonzero elements
    * @return permutation vector r. r[l] = i means that the new label of node i will be l.
    *
    */
-  template <typename MatrixType>
-  std::vector<int> reorder(MatrixType const & matrix, cuthill_mckee_tag)
+  template <typename IndexT, typename ValueT>
+  std::vector<IndexT> reorder(std::vector< std::map<IndexT, ValueT> > const & matrix, cuthill_mckee_tag)
   {
-    std::size_t n = matrix.size();
-    std::vector<int> r;
-    std::vector<bool> inr(n, false); // status array which remembers which nodes have been added to r
-    std::deque<int> q;
-    std::vector< std::vector<int> > nodes;
-    std::vector<int> tmp(2);
-    int p = 0;
-    int c;
-    
-    int deg;
-    int deg_min;
-    
-    r.reserve(n);
-    nodes.reserve(n);
-    
-    do
+    std::vector<IndexT> permutation(matrix.size());
+    std::vector<bool>   dof_assigned_to_node(matrix.size(), false);   //flag vector indicating whether node i has received a new dof
+    std::deque<IndexT>  node_assignment_queue;
+
+    vcl_size_t current_dof = 0;  //the dof to be assigned
+
+    while (current_dof < matrix.size()) //outer loop for each strongly connected component (there may be more than one)
     {
-        // under all nodes not yet in r determine one with minimal degree
-        deg_min = -1;
-        for (std::size_t i = 0; i < n; i++)
+      //
+      // preprocessing: Determine node degrees for nodes which have not been assigned
+      //
+      vcl_size_t current_min_degree = matrix.size();
+      vcl_size_t node_with_minimum_degree = 0;
+      bool found_unassigned_node = false;
+      for (vcl_size_t i=0; i<matrix.size(); ++i)
+      {
+        if (!dof_assigned_to_node[i])
         {
-            if (!inr[i])
-            {
-                deg = matrix[i].size() - 1; // node degree
-                if (deg_min < 0 || deg < deg_min)
-                {
-                    p = i; // node number
-                    deg_min = deg;
-                }
-            }
+          if (matrix[i].size() == 1)  //This is an isolated node, so assign DOF right away
+          {
+            permutation[i] = static_cast<IndexT>(current_dof);
+            dof_assigned_to_node[i] = true;
+            ++current_dof;
+            continue;
+          }
+
+          if (!found_unassigned_node) //initialize minimum degree on first node without new dof
+          {
+            current_min_degree = matrix[i].size();
+            node_with_minimum_degree = i;
+            found_unassigned_node = true;
+          }
+
+          if (matrix[i].size() < current_min_degree) //found a node with smaller degree
+          {
+            current_min_degree = matrix[i].size();
+            node_with_minimum_degree = i;
+          }
         }
-        q.push_back(p); // push that node p with minimal degree on q
-        
-        do
-        {
-            c = q.front();
-            q.pop_front();
-            if (!inr[c])
-            {
-                r.push_back(c);
-                inr[c] = true;
-                
-                // add all neighbouring nodes of c which are not yet in r to nodes
-                nodes.resize(0);
-                for (typename MatrixType::value_type::const_iterator it =  matrix[c].begin(); it != matrix[c].end(); it++)
-                {
-                    if (it->first == c) continue;
-                    if (inr[it->first]) continue;
-                    
-                    tmp[0] = it->first;
-                    tmp[1] = matrix[it->first].size() - 1;
-                    nodes.push_back(tmp);
-                }
-                
-                // sort nodes by node degree
-                std::sort(nodes.begin(), nodes.end(), detail::cuthill_mckee_comp_func);
-                for (std::vector< std::vector<int> >::iterator it = nodes.begin(); it != nodes.end(); it++)
-                {
-                    q.push_back((*it)[0]);
-                }
-            }
-        } while (q.size() != 0);
-    } while (r.size() < n);
-    
-    return r;
+      }
+
+      //
+      // Stage 2: Distribute dofs on this closely connected (sub-)graph in a breath-first manner using one root node
+      //
+      if (found_unassigned_node) // there's work to be done
+      {
+        node_assignment_queue.push_back(static_cast<IndexT>(node_with_minimum_degree));
+        current_dof = detail::cuthill_mckee_on_strongly_connected_component(matrix, node_assignment_queue, dof_assigned_to_node, permutation, current_dof);
+      }
+    }
+
+    return permutation;
   }
-  
-  
+
+
   //
   // Part 2: Advanced Cuthill McKee
   //
 
-  /** @brief Tag for the advanced Cuthill-McKee algorithm */ 
+  /** @brief Tag for the advanced Cuthill-McKee algorithm (i.e. running the 'standard' Cuthill-McKee algorithm for a couple of different seeds). */
   class advanced_cuthill_mckee_tag
   {
     public:
       /** @brief CTOR which may take the additional parameters for the advanced algorithm.
-        * 
+        *
         * additional parameters for CTOR:
         *   a:  0 <= a <= 1
         *     parameter which specifies which nodes are tried as starting nodes
-        *     of generated node layering (tree structure whith one ore more 
+        *     of generated node layering (tree structure whith one ore more
         *     starting nodes).
-        *     the relation deg_min <= deg <= deg_min + a * (deg_max - deg_min) 
+        *     the relation deg_min <= deg <= deg_min + a * (deg_max - deg_min)
         *     must hold for node degree deg for a starting node, where deg_min/
         *     deg_max is the minimal/maximal node degree of all yet unnumbered
         *     nodes.
         *    gmax:
         *      integer which specifies maximum number of nodes in the root
         *      layer of the tree structure (gmax = 0 means no limit)
-        * 
+        *
         * @return permutation vector r. r[l] = i means that the new label of node i will be l.
         *
        */
-      advanced_cuthill_mckee_tag(double a = 0.0, std::size_t gmax = 1) : starting_node_param_(a), max_root_nodes_(gmax) {}
-      
+      advanced_cuthill_mckee_tag(double a = 0.0, vcl_size_t gmax = 1) : starting_node_param_(a), max_root_nodes_(gmax) {}
+
       double starting_node_param() const { return starting_node_param_;}
       void starting_node_param(double a) { if (a >= 0) starting_node_param_ = a; }
-      
-      std::size_t max_root_nodes() const { return max_root_nodes_; }
-      void max_root_nodes(std::size_t gmax) { max_root_nodes_ = gmax; }      
-      
+
+      vcl_size_t max_root_nodes() const { return max_root_nodes_; }
+      void max_root_nodes(vcl_size_t gmax) { max_root_nodes_ = gmax; }
+
     private:
       double starting_node_param_;
-      std::size_t max_root_nodes_;
+      vcl_size_t max_root_nodes_;
   };
-  
+
 
 
   /** @brief Function for the calculation of a node number permutation to reduce the bandwidth of an incidence matrix by the advanced Cuthill-McKee algorithm
-   * 
+   *
    *
    *  references:
    *    see description of original Cuthill McKee implementation, and
    *    E. Cuthill and J. McKee: "Reducing the Bandwidth of sparse symmetric Matrices".
    *    Naval Ship Research and Development Center, Washington, D. C., 20007
    */
-  template <typename MatrixType>
-  std::vector<int> reorder(MatrixType const & matrix,
-                           advanced_cuthill_mckee_tag const & tag)
+  template <typename IndexT, typename ValueT>
+  std::vector<IndexT> reorder(std::vector< std::map<IndexT, ValueT> > const & matrix,
+                              advanced_cuthill_mckee_tag const & tag)
   {
-    std::size_t n = matrix.size();
+    vcl_size_t n = matrix.size();
     double a = tag.starting_node_param();
-    std::size_t gmax = tag.max_root_nodes();
-    std::vector<int> r;
-    std::vector<int> r_tmp;
-    std::vector<int> r_best;
-    std::vector<int> r2(n);
-    std::vector<bool> inr(n, false);
-    std::vector<bool> inr_tmp(n);
-    std::vector<bool> inr_best(n);
-    std::deque<int> q;
-    std::vector< std::vector<int> > nodes;
-    std::vector<int> nodes_p;
-    std::vector<int> tmp(2);
-    std::vector< std::vector<int> > l;
-    int deg_min;
-    int deg_max;
-    int deg_a;
-    int deg;
-    int bw;
-    int bw_best;
-    std::vector<int> comb;
-    std::size_t g;
-    int c;
-    
-    r.reserve(n);
-    r_tmp.reserve(n);
-    r_best.reserve(n);
-    nodes.reserve(n);
-    nodes_p.reserve(n);
+    vcl_size_t gmax = tag.max_root_nodes();
+    std::vector<IndexT> permutation(n);
+    std::vector<bool>   dof_assigned_to_node(n, false);
+    std::vector<IndexT> nodes_in_strongly_connected_component;
+    std::vector<IndexT> parent_nodes;
+    vcl_size_t deg_min;
+    vcl_size_t deg_max;
+    vcl_size_t deg_a;
+    vcl_size_t deg;
+    std::vector<IndexT> comb;
+
+    nodes_in_strongly_connected_component.reserve(n);
+    parent_nodes.reserve(n);
     comb.reserve(n);
-    
-    do
-    {   
-        // add to nodes_p all nodes not yet in r which are candidates for the root node layer  
-        // search unnumbered node and generate layering 
-        for (std::size_t i = 0; i < n; i++)
-        {
-            if (!inr[i])
-            {
-                detail::generate_layering(matrix, l, i);
-                break;
-            }
-        }
-        nodes.resize(0);
-        for (std::vector< std::vector<int> >::iterator it = l.begin();
-          it != l.end(); it++)
-        {
-            for (std::vector<int>::iterator it2 = it->begin();
-              it2 != it->end(); it2++)
-            {
-                tmp[0] = *it2;
-                tmp[1] = matrix[*it2].size() - 1;
-                nodes.push_back(tmp);
-            }
-        }
-        // determine minimum and maximum node degree
-        deg_min = -1;
-        deg_max = -1;
-        for (std::vector< std::vector<int> >::iterator it = nodes.begin(); 
-          it != nodes.end(); it++)
-        {
-            deg = (*it)[1];
-            if (deg_min < 0 || deg < deg_min)
-            {
-                deg_min = deg;
-            }
-            if (deg_max < 0 || deg > deg_max)
-            {
-                deg_max = deg;
-            }
-        }
-        deg_a = deg_min + (int) (a * (deg_max - deg_min));
-        nodes_p.resize(0);
-        for (std::vector< std::vector<int> >::iterator it = nodes.begin(); 
-          it != nodes.end(); it++)
+
+    vcl_size_t current_dof = 0;
+
+    while (current_dof < matrix.size()) // for all strongly connected components
+    {
+      // get all nodes of the strongly connected component:
+      nodes_in_strongly_connected_component.resize(0);
+      for (vcl_size_t i = 0; i < n; i++)
+      {
+        if (!dof_assigned_to_node[i])
         {
-            if ((*it)[1] <= deg_a)
-            {
-                nodes_p.push_back((*it)[0]);
-            }
+          nodes_in_strongly_connected_component.push_back(static_cast<IndexT>(i));
+          detail::nodes_of_strongly_connected_component(matrix, nodes_in_strongly_connected_component);
+          break;
         }
-        
-        inr_tmp = inr;
-        g = 1;
-        comb.resize(1);
-        comb[0] = 1;
-        bw_best = -1;
-        
-        for (;;) // for all combinations of g <= gmax root nodes repeat
+      }
+
+      // determine minimum and maximum node degree
+      deg_min = 0;
+      deg_max = 0;
+      for (typename std::vector<IndexT>::iterator it  = nodes_in_strongly_connected_component.begin();
+                                                  it != nodes_in_strongly_connected_component.end();
+                                                  it++)
+      {
+        deg = matrix[*it].size();
+        if (deg_min == 0 || deg < deg_min)
+          deg_min = deg;
+        if (deg_max == 0 || deg > deg_max)
+          deg_max = deg;
+      }
+      deg_a = deg_min + static_cast<vcl_size_t>(a * (deg_max - deg_min));
+
+      // fill array of parent nodes:
+      parent_nodes.resize(0);
+      for (typename std::vector<IndexT>::iterator it  = nodes_in_strongly_connected_component.begin();
+                                                  it != nodes_in_strongly_connected_component.end();
+                                                  it++)
+      {
+        if (matrix[*it].size() <= deg_a)
+          parent_nodes.push_back(*it);
+      }
+
+      //
+      // backup current state in order to restore for every new combination of parent nodes below
+      //
+      std::vector<bool> dof_assigned_to_node_backup = dof_assigned_to_node;
+      std::vector<bool> dof_assigned_to_node_best;
+
+      std::vector<IndexT> permutation_backup = permutation;
+      std::vector<IndexT> permutation_best = permutation;
+
+      vcl_size_t current_dof_backup = current_dof;
+
+      vcl_size_t g = 1;
+      comb.resize(1);
+      comb[0] = 0;
+
+      IndexT bw_best = 0;
+
+      //
+      // Loop over all combinations of g <= gmax root nodes
+      //
+
+      for (;;)
+      {
+        dof_assigned_to_node = dof_assigned_to_node_backup;
+        permutation          = permutation_backup;
+        current_dof          = current_dof_backup;
+
+        std::deque<IndexT>  node_queue;
+
+        // add the selected root nodes according to actual combination comb to q
+        for (typename std::vector<IndexT>::iterator it = comb.begin(); it != comb.end(); it++)
+          node_queue.push_back(parent_nodes[*it]);
+
+        current_dof = detail::cuthill_mckee_on_strongly_connected_component(matrix, node_queue, dof_assigned_to_node, permutation, current_dof);
+
+        // calculate resulting bandwidth for root node combination
+        // comb for current numbered component of the node graph
+        IndexT bw = detail::calc_reordered_bw(matrix, dof_assigned_to_node, permutation);
+
+        // remember best ordering:
+        if (bw_best == 0 || bw < bw_best)
         {
-            inr = inr_tmp;
-            r_tmp.resize(0);
-            
-            // add the selected root nodes according to actual combination comb to q
-            for (std::vector<int>::iterator it = comb.begin(); 
-              it != comb.end(); it++)
-            {
-                q.push_back(nodes_p[(*it)-1]);
-            }
-  
-            do // perform normal CutHill-McKee algorithm for given root nodes with 
-            // resulting numbering stored in r_tmp
-            {
-                c = q.front();
-                q.pop_front();
-                if (!inr[c])
-                {
-                    r_tmp.push_back(c);
-                    inr[c] = true;
-                    
-                    nodes.resize(0);
-                    for (typename MatrixType::value_type::const_iterator it = matrix[c].begin(); it != matrix[c].end(); it++)
-                    {
-                        if (it->first == c) continue;
-                        if (inr[it->first]) continue;
-                        
-                        tmp[0] = it->first;
-                        tmp[1] = matrix[it->first].size() - 1;
-                        nodes.push_back(tmp);
-                    }
-                    std::sort(nodes.begin(), nodes.end(), detail::cuthill_mckee_comp_func);
-                    for (std::vector< std::vector<int> >::iterator it = 
-                      nodes.begin(); it != nodes.end(); it++)
-                    {
-                        q.push_back((*it)[0]);
-                    }
-                }
-            } while (q.size() != 0);
-            
-            // calculate resulting bandwith for root node combination
-            // comb for current numbered component of the node graph
-            for (std::size_t i = 0; i < r_tmp.size(); i++)
-            {
-                r2[r_tmp[i]] = r.size() + i;
-            }
-            bw = 0;
-            for (std::size_t i = 0; i < r_tmp.size(); i++)
-            {
-                for (typename MatrixType::value_type::const_iterator it  = matrix[r_tmp[i]].begin(); 
-                                                                     it != matrix[r_tmp[i]].end();
-                                                                     it++)
-                {
-                    bw = std::max(bw, std::abs(static_cast<int>(r.size() + i) - r2[it->first]));
-                }
-            }
-            
-            // remember ordering r_tmp in r_best for smallest bandwith
-            if (bw_best < 0 || bw < bw_best)
-            {
-                r_best = r_tmp;
-                bw_best = bw;
-                inr_best = inr;
-            }
-            
-            // calculate next combination comb, if not existing
-            // increment g if g stays <= gmax, or else terminate loop
-            if (!detail::comb_inc(comb, nodes_p.size()))
-            {
-                g++;
-                if ( (gmax > 0 && g > gmax) || g > nodes_p.size())
-                {
-                    break;
-                }
-                comb.resize(g);
-                for (std::size_t i = 0; i < g; i++)
-                {
-                    comb[i] = i + 1;
-                }
-            }
+          permutation_best = permutation;
+          bw_best = bw;
+          dof_assigned_to_node_best = dof_assigned_to_node;
         }
-        
-        // store best order r_best in result array r
-        for (std::vector<int>::iterator it = r_best.begin(); 
-          it != r_best.end(); it++)
+
+        // compute the next combination 'comb'; if none exists,
+        // increment g while g <= gmax, otherwise terminate the loop
+        if (!detail::comb_inc(comb, parent_nodes.size()))
         {
-            r.push_back((*it));
+          ++g;
+          if ( (gmax > 0 && g > gmax) || g > parent_nodes.size())
+            break;
+
+          comb.resize(g);
+          for (vcl_size_t i = 0; i < g; i++)
+            comb[i] = static_cast<IndexT>(i);
         }
-        inr = inr_best;
-        
-    } while (r.size() < n);
-    
-    return r;
+      }
+
+      //
+      // restore best permutation
+      //
+      permutation = permutation_best;
+      dof_assigned_to_node = dof_assigned_to_node_best;
+
+    }
+
+    return permutation;
   }
-  
+
+
 } //namespace viennacl
-    
+
 
 #endif
diff --git a/viennacl/misc/gibbs_poole_stockmeyer.hpp b/viennacl/misc/gibbs_poole_stockmeyer.hpp
index 763f445..0fc4d83 100644
--- a/viennacl/misc/gibbs_poole_stockmeyer.hpp
+++ b/viennacl/misc/gibbs_poole_stockmeyer.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_MISC_GIBBS_POOLE_STOCKMEYER_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -19,8 +20,8 @@
 
 
 /** @file viennacl/misc/gibbs_poole_stockmeyer.hpp
- *  @brief Implementation of the Gibbs-Poole-Stockmeyer algorithm.  Experimental in 1.2.x.
- *    
+ *  @brief Implementation of the Gibbs-Poole-Stockmeyer algorithm.  Experimental.
+ *
  *  Contributed by Philipp Grabenweger, interface adjustments by Karl Rupp.
  */
 
@@ -33,11 +34,13 @@
 #include <deque>
 #include <cmath>
 
+#include "viennacl/forwards.h"
+
 #include "viennacl/misc/cuthill_mckee.hpp"
 
 namespace viennacl
 {
-  
+
   namespace detail
   {
 
@@ -45,16 +48,16 @@ namespace viennacl
     inline int calc_layering_width(std::vector< std::vector<int> > const & l)
     {
         int w;
-        
+
         w = 0;
-        for (std::size_t i = 0; i < l.size(); i++)
+        for (vcl_size_t i = 0; i < l.size(); i++)
         {
             w = std::max(w, static_cast<int>(l[i].size()));
         }
-        
+
         return w;
     }
-    
+
     // function to decompose a list of nodes rg into connected components
     // sorted by decreasing number of nodes per component
     template <typename MatrixType>
@@ -69,12 +72,12 @@ namespace viennacl
         int c;
         std::vector<bool> inr(n, true);
         std::deque<int> q;
-        
-        for (std::size_t i = 0; i < rg.size(); i++)
+
+        for (vcl_size_t i = 0; i < rg.size(); i++)
         {
             inr[rg[i]] = false;
         }
-        
+
         do
         {
             for (int i = 0; i < n; i++)
@@ -89,7 +92,7 @@ namespace viennacl
             {
                 break;
             }
-            
+
             tmp.resize(0);
             while (q.size() > 0)
             {
@@ -100,50 +103,51 @@ namespace viennacl
                 {
                     tmp.push_back(c);
                     inr[c] = true;
-                    
+
                     for (typename MatrixType::value_type::const_iterator it = matrix[c].begin(); it != matrix[c].end(); it++)
                     {
                         if (it->first == c) continue;
                         if (inr[it->first]) continue;
-                        
+
                         q.push_back(it->first);
                     }
                 }
             }
             rgc.push_back(tmp);
         } while (true);
-        
-        for (std::size_t i = 0; i < rgc.size(); i++)
+
+        for (vcl_size_t i = 0; i < rgc.size(); i++)
         {
-            ind[0] = i;
-            ind[1] = rgc[i].size();
+            ind[0] = static_cast<int>(i);
+            ind[1] = static_cast<int>(rgc[i].size());
             sort_ind.push_back(ind);
         }
         std::sort(sort_ind.begin(), sort_ind.end(), detail::cuthill_mckee_comp_func);
-        for (std::size_t i = 0; i < rgc.size(); i++)
+        for (vcl_size_t i = 0; i < rgc.size(); i++)
         {
             rgc_sorted.push_back(rgc[sort_ind[rgc.size()-1-i][0]]);
         }
-        
+
         return rgc_sorted;
     }
-    
+
   } // namespace detail
-  
-  
+
+
+  /** @brief Tag class for identifying the Gibbs-Poole-Stockmeyer algorithm for reducing the bandwidth of a sparse matrix. */
   struct gibbs_poole_stockmeyer_tag {};
-  
+
 
   /** @brief Function for the calculation of a node numbering permutation vector to reduce the bandwidth of a incidence matrix by the Gibbs-Poole-Stockmeyer algorithm
-   * 
+   *
    * references:
-   *   Werner Neudorf: "Bandbreitenreduktion - Teil 3. Algorithmus von 
+   *   Werner Neudorf: "Bandbreitenreduktion - Teil 3. Algorithmus von
    *   Gibbs-Poole-Stockmeyer. Testbeispiele mit CM und GPS", Preprint No.
    *   M 08/02, September 2002. Technische Universität Ilmenau, Fakultät
    *   für Mathematik und Naturwissenschaften, Institut für Mathematik.
    *   http://www.db-thueringen.de/servlets/DerivateServlet/Derivate-8673/IfM_Preprint_M_02_08.pdf
    *   (URL taken on June 14, 2011)
-   * 
+   *
    * @param matrix  vector of n matrix rows, where each row is a map<int, double> containing only the nonzero elements
    * @return permutation vector r. r[l] = i means that the new label of node i will be l.
    */
@@ -151,10 +155,10 @@ namespace viennacl
   std::vector<int> reorder(MatrixType const & matrix,
                            gibbs_poole_stockmeyer_tag)
   {
-    std::size_t n = matrix.size();
-    std::vector<int> r;
+    vcl_size_t n = matrix.size();
+    std::vector<int> r(n);
     std::vector< std::vector<int> > rl;
-    std::size_t l = 0;
+    vcl_size_t l = 0;
     int state;
     bool state_end;
     std::vector< std::vector<int> > nodes;
@@ -179,50 +183,51 @@ namespace viennacl
     int deg_min;
     int deg;
     int ind_min;
-    
-    r.reserve(n);
+
     nodes.reserve(n);
-    
-    while (r.size() < n) // for all components of the graph apply GPS algorithm
+
+    int current_dof = 0;
+
+    while (current_dof < static_cast<int>(n)) // for all components of the graph apply GPS algorithm
     {
         // determine node g with mimimal degree among all nodes which
         // are not yet in result array r
         deg_min = -1;
-        for (std::size_t i = 0; i < n; i++)
+        for (vcl_size_t i = 0; i < n; i++)
         {
             if (!inr[i])
             {
-                deg = matrix[i].size() - 1; // node degree
+                deg = static_cast<int>(matrix[i].size() - 1); // node degree
                 if (deg_min < 0 || deg < deg_min)
                 {
-                    g = i; // node number
+                    g = static_cast<int>(i); // node number
                     deg_min = deg;
                 }
             }
         }
-        
+
         // algorithm for determining nodes g, h as endpoints of a pseudo graph diameter
-        while (new_g) 
+        while (new_g)
         {
           lg.clear();
           detail::generate_layering(matrix, lg, g);
-            
+
           nodes.resize(0);
-          for (std::size_t i = 0; i < lg.back().size(); i++)
+          for (vcl_size_t i = 0; i < lg.back().size(); i++)
           {
               tmp[0] = lg.back()[i];
-              tmp[1] = matrix[lg.back()[i]].size() - 1;
+              tmp[1] = static_cast<int>(matrix[lg.back()[i]].size() - 1);
               nodes.push_back(tmp);
           }
           std::sort(nodes.begin(), nodes.end(), detail::cuthill_mckee_comp_func);
-          for (std::size_t i = 0; i < nodes.size(); i++)
+          for (vcl_size_t i = 0; i < nodes.size(); i++)
           {
               lg.back()[i] = nodes[i][0];
           }
-          
+
           m_min = -1;
           new_g = false;
-          for (std::size_t i = 0; i < lg.back().size(); i++)
+          for (vcl_size_t i = 0; i < lg.back().size(); i++)
           {
               lh.clear();
               detail::generate_layering(matrix, lh, lg.back()[i]);
@@ -240,32 +245,32 @@ namespace viennacl
               }
           }
         }
-        
+
         lh.clear();
         detail::generate_layering(matrix, lh, h);
-        
+
         // calculate ls as layering intersection and rg as remaining
         // graph
         lap.clear();
-        for (std::size_t i = 0; i < lg.size(); i++)
+        for (vcl_size_t i = 0; i < lg.size(); i++)
         {
-            for (std::size_t j = 0; j < lg[i].size(); j++)
+            for (vcl_size_t j = 0; j < lg[i].size(); j++)
             {
                 lap[lg[i][j]].resize(2);
-                lap[lg[i][j]][0] = i;
+                lap[lg[i][j]][0] = static_cast<int>(i);
             }
         }
-        for (std::size_t i = 0; i < lh.size(); i++)
+        for (vcl_size_t i = 0; i < lh.size(); i++)
         {
-            for (std::size_t j = 0; j < lh[i].size(); j++)
+            for (vcl_size_t j = 0; j < lh[i].size(); j++)
             {
-                lap[lh[i][j]][1] = lg.size() - 1 - i;
+                lap[lh[i][j]][1] = static_cast<int>(lg.size() - 1 - i);
             }
         }
         rg.clear();
         ls.clear();
         ls.resize(lg.size());
-        for (std::map< int, std::vector<int> >::iterator it = lap.begin(); 
+        for (std::map< int, std::vector<int> >::iterator it = lap.begin();
           it != lap.end(); it++)
         {
             if ((it->second)[0] == (it->second)[1])
@@ -277,8 +282,8 @@ namespace viennacl
                 rg.push_back(it->first);
             }
         }
-        // partition remaining graph in connected components 
-        rgc = detail::gps_rg_components(matrix, n, rg);
+        // partition remaining graph in connected components
+        rgc = detail::gps_rg_components(matrix, static_cast<int>(n), rg);
 
         // insert nodes of each component of rgc
         k1 = detail::calc_layering_width(lg);
@@ -286,22 +291,22 @@ namespace viennacl
         wvs.resize(ls.size());
         wvsg.resize(ls.size());
         wvsh.resize(ls.size());
-        for (std::size_t i = 0; i < rgc.size(); i++)
+        for (vcl_size_t i = 0; i < rgc.size(); i++)
         {
-            for (std::size_t j = 0; j < ls.size(); j++)
+            for (vcl_size_t j = 0; j < ls.size(); j++)
             {
-                wvs[j] = ls[j].size();
-                wvsg[j] = ls[j].size();
-                wvsh[j] = ls[j].size();
+                wvs[j]  = static_cast<int>(ls[j].size());
+                wvsg[j] = static_cast<int>(ls[j].size());
+                wvsh[j] = static_cast<int>(ls[j].size());
             }
-            for (std::size_t j = 0; j < rgc[i].size(); j++)
+            for (vcl_size_t j = 0; j < rgc[i].size(); j++)
             {
                 (wvsg[lap[rgc[i][j]][0]])++;
                 (wvsh[lap[rgc[i][j]][1]])++;
             }
             k3 = 0;
             k4 = 0;
-            for (std::size_t j = 0; j < ls.size(); j++)
+            for (vcl_size_t j = 0; j < ls.size(); j++)
             {
                 if (wvsg[j] > wvs[j])
                 {
@@ -314,20 +319,20 @@ namespace viennacl
             }
             if (k3 < k4 || (k3 == k4 && k1 <= k2) )
             {
-                for (std::size_t j = 0; j < rgc[i].size(); j++)
+                for (vcl_size_t j = 0; j < rgc[i].size(); j++)
                 {
                     ls[lap[rgc[i][j]][0]].push_back(rgc[i][j]);
                 }
             }
             else
             {
-                for (std::size_t j = 0; j < rgc[i].size(); j++)
+                for (vcl_size_t j = 0; j < rgc[i].size(); j++)
                 {
                     ls[lap[rgc[i][j]][1]].push_back(rgc[i][j]);
                 }
             }
         }
-        
+
         // renumber nodes in ls
         rl.clear();
         rl.resize(ls.size());
@@ -341,12 +346,12 @@ namespace viennacl
                 l = 0;
                 state = 4;
                 break;
-                
+
               case 2:
-                for (std::size_t i = 0; i < rl[l-1].size(); i++)
+                for (vcl_size_t i = 0; i < rl[l-1].size(); i++)
                 {
                     isn.assign(n, false);
-                    for (std::map<int, double>::const_iterator it = matrix[rl[l-1][i]].begin();  
+                    for (std::map<int, double>::const_iterator it = matrix[rl[l-1][i]].begin();
                                                                it != matrix[rl[l-1][i]].end();
                                                                it++)
                     {
@@ -354,28 +359,28 @@ namespace viennacl
                         isn[it->first] = true;
                     }
                     nodes.resize(0);
-                    for (std::size_t j = 0; j < ls[l].size(); j++)
+                    for (vcl_size_t j = 0; j < ls[l].size(); j++)
                     {
                         if (inr[ls[l][j]]) continue;
                         if (!isn[ls[l][j]]) continue;
                         tmp[0] = ls[l][j];
-                        tmp[1] = matrix[ls[l][j]].size() - 1;
+                        tmp[1] = static_cast<int>(matrix[ls[l][j]].size() - 1);
                         nodes.push_back(tmp);
                     }
                     std::sort(nodes.begin(), nodes.end(), detail::cuthill_mckee_comp_func);
-                    for (std::size_t j = 0; j < nodes.size(); j++)
+                    for (vcl_size_t j = 0; j < nodes.size(); j++)
                     {
                         rl[l].push_back(nodes[j][0]);
-                        r.push_back(nodes[j][0]);
+                        r[nodes[j][0]] = current_dof++;
                         inr[nodes[j][0]] = true;
                     }
                 }
-                
+
               case 3:
-                for (std::size_t i = 0; i < rl[l].size(); i++)
+                for (vcl_size_t i = 0; i < rl[l].size(); i++)
                 {
                     isn.assign(n, false);
-                    for (std::map<int, double>::const_iterator it = matrix[rl[l][i]].begin(); 
+                    for (std::map<int, double>::const_iterator it = matrix[rl[l][i]].begin();
                                                                it != matrix[rl[l][i]].end();
                                                                it++)
                     {
@@ -383,31 +388,31 @@ namespace viennacl
                         isn[it->first] = true;
                     }
                     nodes.resize(0);
-                    for (std::size_t j = 0; j < ls[l].size(); j++)
+                    for (vcl_size_t j = 0; j < ls[l].size(); j++)
                     {
                         if (inr[ls[l][j]]) continue;
                         if (!isn[ls[l][j]]) continue;
                         tmp[0] = ls[l][j];
-                        tmp[1] = matrix[ls[l][j]].size() - 1;
+                        tmp[1] = static_cast<int>(matrix[ls[l][j]].size() - 1);
                         nodes.push_back(tmp);
                     }
                     std::sort(nodes.begin(), nodes.end(), detail::cuthill_mckee_comp_func);
-                    for (std::size_t j = 0; j < nodes.size(); j++)
+                    for (vcl_size_t j = 0; j < nodes.size(); j++)
                     {
                         rl[l].push_back(nodes[j][0]);
-                        r.push_back(nodes[j][0]);
+                        r[nodes[j][0]] = current_dof++;
                         inr[nodes[j][0]] = true;
                     }
                 }
-                
+
               case 4:
                 if (rl[l].size() < ls[l].size())
                 {
                     deg_min = -1;
-                    for (std::size_t j = 0; j < ls[l].size(); j++)
+                    for (vcl_size_t j = 0; j < ls[l].size(); j++)
                     {
                         if (inr[ls[l][j]]) continue;
-                        deg = matrix[ls[l][j]].size() - 1;
+                        deg = static_cast<int>(matrix[ls[l][j]].size() - 1);
                         if (deg_min < 0 || deg < deg_min)
                         {
                             ind_min = ls[l][j];
@@ -415,12 +420,12 @@ namespace viennacl
                         }
                     }
                     rl[l].push_back(ind_min);
-                    r.push_back(ind_min);
+                    r[ind_min] = current_dof++;
                     inr[ind_min] = true;
                     state = 3;
                     break;
                 }
-                
+
               case 5:
                 l++;
                 if (l < ls.size())
@@ -432,19 +437,19 @@ namespace viennacl
                     state_end = true;
                 }
                 break;
-                
+
             default:
                 break;
             }
         }
 
     }
-    
+
     return r;
   }
-  
-  
+
+
 } //namespace viennacl
-    
+
 
 #endif
diff --git a/viennacl/ocl/backend.hpp b/viennacl/ocl/backend.hpp
index 36bab81..b171e76 100644
--- a/viennacl/ocl/backend.hpp
+++ b/viennacl/ocl/backend.hpp
@@ -2,22 +2,23 @@
 #define VIENNACL_OCL_BACKEND_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file backend.hpp
+/** @file viennacl/ocl/backend.hpp
     @brief Implementations of the OpenCL backend, where all contexts are stored in.
 */
 
@@ -29,13 +30,13 @@ namespace viennacl
 {
   namespace ocl
   {
-    
+
     /** @brief A backend that provides contexts for ViennaCL objects (vector, matrix, etc.) */
     template <bool dummy = false>  //never use parameter other than default (introduced for linkage issues only)
     class backend
     {
       public:
-        /** @brief Switches the current context to the context identified by i 
+        /** @brief Switches the current context to the context identified by i
         *
         * @param i   ID of the new active context
         */
@@ -43,27 +44,33 @@ namespace viennacl
         {
           current_context_id_ = i;
         }
-        
+
         /** @brief Returns the current active context */
-        static viennacl::ocl::context & current_context()
+        static viennacl::ocl::context & context(long id)
         {
-          if (!initialized_[current_context_id_])
+          if (!initialized_[id])
           {
             //std::cout << "Initializing context no. " << current_context_id_ << std::endl;
-            contexts_[current_context_id_].init();
+            contexts_[id].init();
             //create one queue per device:
-            std::vector<viennacl::ocl::device> devices = contexts_[current_context_id_].devices();
-            for (size_t j = 0; j<devices.size(); ++j)
-              contexts_[current_context_id_].add_queue(devices[j]);
-            initialized_[current_context_id_] = true;
+            std::vector<viennacl::ocl::device> devices = contexts_[id].devices();
+            for (vcl_size_t j = 0; j<devices.size(); ++j)
+              contexts_[id].add_queue(devices[j]);
+            initialized_[id] = true;
             /*
             std::cout << "Context no. " << current_context_id_ << " initialized with " << devices.size() << " devices" << std::endl;
             std::cout << "Device id: " << devices[0].id() << std::endl;
             std::cout << "Current device id: " << contexts_[current_context_id_].current_device().id() << std::endl; */
           }
-          return contexts_[current_context_id_];
+          return contexts_[id];
         }
-        
+
+        /** @brief Returns the current active context */
+        static viennacl::ocl::context & current_context()
+        {
+          return backend<dummy>::context(current_context_id_);
+        }
+
         /** @brief Returns the current queue for the active device in the active context */
         static viennacl::ocl::command_queue & get_queue()
         {
@@ -83,7 +90,7 @@ namespace viennacl
           else
           {
             //set devices for context:
-            for (size_t j = 0; j<devices.size(); ++j)
+            for (vcl_size_t j = 0; j<devices.size(); ++j)
               contexts_[i].add_device(devices[j]);
           }
         }
@@ -100,19 +107,19 @@ namespace viennacl
                                   std::vector<cl_device_id> const & devices,
                                   std::map< cl_device_id, std::vector< cl_command_queue > > const & queues)
         {
-          assert(devices.size() == queues.size() && "ViennaCL expects one queue per device!");
-          
+          assert(devices.size() == queues.size() && bool("ViennaCL expects one queue per device!"));
+
           if (initialized_[i])
             std::cerr << "ViennaCL: Warning in init_context(): Providing a list of devices has no effect, because context for ViennaCL is already created!" << std::endl;
           else
           {
             //set devices for context:
-            for (size_t j = 0; j<devices.size(); ++j)
+            for (vcl_size_t j = 0; j<devices.size(); ++j)
               contexts_[i].add_device(devices[j]);
-            
+
             //init context:
             contexts_[i].init(c);
-            
+
             //add queues:
             typedef typename std::map< cl_device_id, std::vector< cl_command_queue > >::const_iterator queue_iterator;
             for (queue_iterator qit = queues.begin();
@@ -120,10 +127,10 @@ namespace viennacl
                               ++qit)
             {
               std::vector<cl_command_queue> const & queues_for_device = qit->second;
-              for (size_t j=0; j<queues_for_device.size(); ++j)
+              for (vcl_size_t j=0; j<queues_for_device.size(); ++j)
                 contexts_[i].add_queue(qit->first, queues_for_device[j]);
             }
-            
+
             initialized_[i] = true;
           }
         }
@@ -137,13 +144,13 @@ namespace viennacl
         */
         static void setup_context(long i, cl_context c, std::vector<cl_device_id> const & devices, std::vector<cl_command_queue> const & queue)
         {
-          assert(devices.size() == queue.size() && "ViennaCL expects one queue per device!");
-          
+          assert(devices.size() == queue.size() && bool("ViennaCL expects one queue per device!"));
+
           //wrap queue vector into map
           std::map< cl_device_id, std::vector<cl_command_queue> > queues_map;
-          for (size_t j = 0; j<devices.size(); ++j)
+          for (vcl_size_t j = 0; j<devices.size(); ++j)
             queues_map[devices[j]].push_back(queue[j]);
-          
+
           setup_context(i, c, devices, queues_map);
         }
 
@@ -153,12 +160,24 @@ namespace viennacl
           contexts_[i].default_device_type(t);
         }
 
+        /** @brief Sets the maximum number of devices per context. Ignored if a device array is provided as well.  */
+        static void set_context_device_num(long i, vcl_size_t num)
+        {
+          contexts_[i].default_device_num(num);
+        }
+
+        /** @brief Sets the context device type */
+        static void set_context_platform_index(long i, vcl_size_t pf_index)
+        {
+          contexts_[i].platform_index(pf_index);
+        }
+
       private:
         static long current_context_id_;
         static std::map<long, bool> initialized_;
         static std::map<long, viennacl::ocl::context> contexts_;
     };
-    
+
     template <bool dummy>
     long backend<dummy>::current_context_id_ = 0;
 
@@ -167,7 +186,7 @@ namespace viennacl
 
     template <bool dummy>
     std::map<long, viennacl::ocl::context> backend<dummy>::contexts_;
-    
+
     ////////////////////// current context //////////////////
     /** @brief Convenience function for returning the current context */
     inline viennacl::ocl::context & current_context()
@@ -180,7 +199,12 @@ namespace viennacl
     {
       viennacl::ocl::backend<>::switch_context(i);
     }
-    
+
+    /** @brief Convenience function for returning the current context */
+    inline viennacl::ocl::context & get_context(long i)
+    {
+      return viennacl::ocl::backend<>::context(i);
+    }
 
     /** @brief Convenience function for setting devices for a context */
     inline void setup_context(long i,
@@ -189,6 +213,15 @@ namespace viennacl
       viennacl::ocl::backend<>::setup_context(i, devices);
     }
 
+    /** @brief Convenience function for setting devices for a context */
+    inline void setup_context(long i,
+                              viennacl::ocl::device const & device)
+    {
+      std::vector<cl_device_id> device_id_array(1);
+      device_id_array[0] = device.id();
+      viennacl::ocl::backend<>::setup_context(i, device_id_array);
+    }
+
     /** @brief Convenience function for setting up a context in ViennaCL from an existing OpenCL context */
     inline void setup_context(long i,
                               cl_context c,
@@ -197,7 +230,7 @@ namespace viennacl
     {
       viennacl::ocl::backend<>::setup_context(i, c, devices, queues);
     }
-    
+
     /** @brief Convenience function for setting up a context in ViennaCL from an existing OpenCL context */
     inline void setup_context(long i, cl_context c, std::vector<cl_device_id> const & devices, std::vector<cl_command_queue> const & queues)
     {
@@ -244,13 +277,30 @@ namespace viennacl
       set_context_device_type(i, CL_DEVICE_TYPE_ACCELERATOR);
     }
 
+    /** @brief Convenience function for setting the number of default devices per context */
+    inline void set_context_device_num(long i, vcl_size_t num)
+    {
+      viennacl::ocl::backend<>::set_context_device_num(i, num);
+    }
+
+
+    /** @brief Convenience function for setting the platform index
+     *
+     * @param i         Context ID
+     * @param pf_index  The platform index as returned by clGetPlatformIDs(). This is not the ID of type cl_platform_id!
+     */
+    inline void set_context_platform_index(long i, vcl_size_t pf_index)
+    {
+      viennacl::ocl::backend<>::set_context_platform_index(i, pf_index);
+    }
+
     ///////////////////////// get queues ///////////////////
     /** @brief Convenience function for getting the default queue for the currently active device in the active context */
     inline viennacl::ocl::command_queue & get_queue()
     {
       return viennacl::ocl::current_context().get_queue();
     }
-    
+
     /** @brief Convenience function for getting the queue for a particular device in the current active context */
     inline viennacl::ocl::command_queue & get_queue(viennacl::ocl::device d, unsigned int queue_id = 0)
     {
@@ -275,7 +325,7 @@ namespace viennacl
     {
       viennacl::ocl::current_context().switch_device(d);
     }
-    
+
     /** @brief Convenience function for returning the active device in the current context */
     inline viennacl::ocl::device const & current_device()
     {
diff --git a/viennacl/ocl/command_queue.hpp b/viennacl/ocl/command_queue.hpp
index 75519a2..de06c3d 100644
--- a/viennacl/ocl/command_queue.hpp
+++ b/viennacl/ocl/command_queue.hpp
@@ -1,88 +1,94 @@
-#ifndef VIENNACL_OCL_COMMAND_QUEUE_HPP_
-#define VIENNACL_OCL_COMMAND_QUEUE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file command_queue.hpp
-    @brief Implementations of command queue representations
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include <vector>
-#include <string>
-#include <sstream>
-#include "viennacl/ocl/context.hpp"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-
-namespace viennacl
-{
-  namespace ocl
-  {
-    
-    /** @brief A class representing a command queue
-    *
-    */
-    class command_queue
-    {
-      public:
-        command_queue() {};
-        command_queue(viennacl::ocl::handle<cl_command_queue> h, cl_device_id dev) : handle_(h) {}
-        
-        //Copy constructor:
-        command_queue(command_queue const & other)
-        {
-          handle_ = other.handle_;
-        }
-
-        //assignment operator:
-        command_queue & operator=(command_queue const & other)
-        {
-          handle_ = other.handle_;
-          return *this;
-        }
-        
-        /** @brief Waits until all kernels in the queue have finished their execution */
-        void finish() const
-        {
-          clFinish(handle_.get());
-        }
-        
-        /** @brief Waits until all kernels in the queue have started their execution */
-        void flush() const
-        {
-          clFlush(handle_.get());
-        }
-
-        viennacl::ocl::handle<cl_command_queue> const & handle() const { return handle_; }
-
-      private:
-        
-        viennacl::ocl::handle<cl_command_queue> handle_;
-    };
-
- 
-    
-  } //namespace ocl
-} //namespace viennacl
-
-#endif
+#ifndef VIENNACL_OCL_COMMAND_QUEUE_HPP_
+#define VIENNACL_OCL_COMMAND_QUEUE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/command_queue.hpp
+    @brief Implementations of command queue representations
+*/
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <vector>
+#include <string>
+#include <sstream>
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+
+namespace viennacl
+{
+  namespace ocl
+  {
+
+    /** @brief A class representing a command queue
+    *
+    */
+    class command_queue
+    {
+      public:
+        command_queue() {}
+        command_queue(viennacl::ocl::handle<cl_command_queue> h) : handle_(h) {}
+
+        //Copy constructor:
+        command_queue(command_queue const & other)
+        {
+          handle_ = other.handle_;
+        }
+
+        //assignment operator:
+        command_queue & operator=(command_queue const & other)
+        {
+          handle_ = other.handle_;
+          return *this;
+        }
+
+        bool operator==(command_queue const & other) const
+        {
+          return handle_ == other.handle_;
+        }
+
+        /** @brief Waits until all kernels in the queue have finished their execution */
+        void finish() const
+        {
+          clFinish(handle_.get());
+        }
+
+        /** @brief Waits until all kernels in the queue have started their execution */
+        void flush() const
+        {
+          clFlush(handle_.get());
+        }
+
+        viennacl::ocl::handle<cl_command_queue> const & handle() const { return handle_; }
+        viennacl::ocl::handle<cl_command_queue>       & handle()       { return handle_; }
+
+      private:
+
+        viennacl::ocl::handle<cl_command_queue> handle_;
+    };
+
+
+
+  } //namespace ocl
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/ocl/context.hpp b/viennacl/ocl/context.hpp
index 28fe598..c782adc 100644
--- a/viennacl/ocl/context.hpp
+++ b/viennacl/ocl/context.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_OCL_CONTEXT_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -32,6 +33,7 @@
 #include <map>
 #include "viennacl/ocl/forwards.h"
 #include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
 #include "viennacl/ocl/program.hpp"
 #include "viennacl/ocl/device.hpp"
 #include "viennacl/ocl/platform.hpp"
@@ -41,59 +43,66 @@ namespace viennacl
 {
   namespace ocl
   {
+    /** @brief Manages an OpenCL context and provides the respective convenience functions for creating buffers, etc.
+      *
+      * This class was originally written before the OpenCL C++ bindings were standardized.
+      * Regardless, it provides a couple of convience functionality which is not covered by the OpenCL C++ bindings.
+    */
     class context
     {
       typedef std::vector< viennacl::ocl::program >   ProgramContainer;
-      
+
       public:
         context() : initialized_(false),
                     device_type_(CL_DEVICE_TYPE_DEFAULT),
-                    current_device_id(0),
-                    default_device_num_(1) {}
-        
+                    current_device_id_(0),
+                    default_device_num_(1),
+                    pf_index_(0),
+                    current_queue_id_(0) {}
+
         //////// Get and set default number of devices per context */
         /** @brief Returns the maximum number of devices to be set up for the context */
-        std::size_t default_device_num() const { return default_device_num_; }
-        
+        vcl_size_t default_device_num() const { return default_device_num_; }
+
         /** @brief Sets the maximum number of devices to be set up for the context */
-        void default_device_num(std::size_t new_num) { default_device_num_ = new_num; }
-        
+        void default_device_num(vcl_size_t new_num) { default_device_num_ = new_num; }
+
         ////////// get and set preferred device type /////////////////////
         /** @brief Returns the default device type for the context */
         cl_device_type default_device_type()
         {
           return device_type_;
         }
-        
+
         /** @brief Sets the device type for this context */
-        void default_device_type(cl_device_type dtype) 
-        { 
+        void default_device_type(cl_device_type dtype)
+        {
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Setting new device type for context " << h_ << std::endl;
           #endif
           if (!initialized_)
             device_type_ = dtype; //assume that the user provided a correct value
         }
-        
+
         //////////////////// get devices //////////////////
         /** @brief Returns a vector with all devices in this context */
         std::vector<viennacl::ocl::device> const & devices() const
         {
           return devices_;
         }
-        
+
         /** @brief Returns the current device */
         viennacl::ocl::device const & current_device() const
         {
-          //std::cout << "Current device id in context: " << current_device_id << std::endl;
-          return devices_[current_device_id];
+          //std::cout << "Current device id in context: " << current_device_id_ << std::endl;
+          return devices_[current_device_id_];
         }
-        
+
         /** @brief Switches the current device to the i-th device in this context */
-        void switch_device(size_t i)
+        void switch_device(vcl_size_t i)
         {
-          assert(i >= 0 && i < devices_.size());
-          current_device_id = i;
+          assert(i < devices_.size() && bool("Provided device index out of range!"));
+          current_device_id_ = i;
         }
 
         /** @brief If the supplied device is used within the context, it becomes the current active device. */
@@ -103,23 +112,23 @@ namespace viennacl
           std::cout << "ViennaCL: Setting new current device for context " << h_ << std::endl;
           #endif
           bool found = false;
-          for (size_t i=0; i<devices_.size(); ++i)
+          for (vcl_size_t i=0; i<devices_.size(); ++i)
           {
             if (devices_[i] == d)
             {
               found = true;
-              current_device_id = i;
+              current_device_id_ = i;
               break;
             }
           }
           if (found == false)
             std::cerr << "ViennaCL: Warning: Could not set device " << d.name() << " for context." << std::endl;
         }
-        
+
         /** @brief Add a device to the context. Must be done before the context is initialized */
         void add_device(viennacl::ocl::device const & d)
         {
-          assert(!initialized_ && "Device must be added to context before it is initialized!");
+          assert(!initialized_ && bool("Device must be added to context before it is initialized!"));
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Adding new device to context " << h_ << std::endl;
           #endif
@@ -130,13 +139,13 @@ namespace viennacl
         /** @brief Add a device to the context. Must be done before the context is initialized */
         void add_device(cl_device_id d)
         {
-          assert(!initialized_ && "Device must be added to context before it is initialized!");
+          assert(!initialized_ && bool("Device must be added to context before it is initialized!"));
           add_device(viennacl::ocl::device(d));
         }
 
 
         /////////////////////// initialize context ///////////////////
-        
+
         /** @brief Initializes a new context */
         void init()
         {
@@ -151,55 +160,71 @@ namespace viennacl
 
 /*        void existing_context(cl_context context_id)
         {
-          assert(!initialized_ && "ViennaCL: FATAL error: Provided a new context for an already initialized context.");
+          assert(!initialized_ && bool("ViennaCL: FATAL error: Provided a new context for an already initialized context."));
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Reusing existing context " << h_ << std::endl;
           #endif
           h_ = context_id;
         }*/
-        
+
         ////////////////////// create memory /////////////////////////////
-        /** @brief Creates a memory buffer within the context
+
+        /** @brief Creates a memory buffer within the context. Does not wrap the OpenCL handle into the smart-pointer-like viennacl::ocl::handle, which saves an OpenCL backend call, yet the user has to ensure that the OpenCL memory handle is free'd or passed to a viennacl::ocl::handle later on.
         *
         *  @param flags  OpenCL flags for the buffer creation
         *  @param size   Size of the memory buffer in bytes
         *  @param ptr    Optional pointer to CPU memory, with which the OpenCL memory should be initialized
+        *  @return       A plain OpenCL handle. Either assign it to a viennacl::ocl::handle<cl_mem> directly, or make sure that you free to memory manually if you no longer need the allocated memory.
         */
-        viennacl::ocl::handle<cl_mem> create_memory(cl_mem_flags flags, unsigned int size, void * ptr = NULL)
+        cl_mem create_memory_without_smart_handle(cl_mem_flags flags, unsigned int size, void * ptr = NULL) const
         {
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
-          std::cout << "ViennaCL: Creating memory of size " << size << " for context " << h_ << std::endl;
+          std::cout << "ViennaCL: Creating memory of size " << size << " for context " << h_ << " (unsafe, returning cl_mem directly)" << std::endl;
           #endif
           if (ptr)
             flags |= CL_MEM_COPY_HOST_PTR;
           cl_int err;
-          viennacl::ocl::handle<cl_mem> mem = clCreateBuffer(h_.get(), flags, size, ptr, &err);
+          cl_mem mem = clCreateBuffer(h_.get(), flags, size, ptr, &err);
           VIENNACL_ERR_CHECK(err);
           return mem;
         }
 
+
+        /** @brief Creates a memory buffer within the context
+        *
+        *  @param flags  OpenCL flags for the buffer creation
+        *  @param size   Size of the memory buffer in bytes
+        *  @param ptr    Optional pointer to CPU memory, with which the OpenCL memory should be initialized
+        */
+        viennacl::ocl::handle<cl_mem> create_memory(cl_mem_flags flags, unsigned int size, void * ptr = NULL) const
+        {
+          return viennacl::ocl::handle<cl_mem>(create_memory_without_smart_handle(flags, size, ptr), *this);
+        }
+
         /** @brief Creates a memory buffer within the context initialized from the supplied data
         *
         *  @param flags  OpenCL flags for the buffer creation
-        *  @param _buffer A vector (STL vector, ublas vector, etc.)
+        *  @param buffer A vector (STL vector, ublas vector, etc.)
         */
         template < typename SCALARTYPE, typename A, template <typename, typename> class VectorType >
-        viennacl::ocl::handle<cl_mem> create_memory(cl_mem_flags flags, const VectorType<SCALARTYPE, A> & _buffer)
+        viennacl::ocl::handle<cl_mem> create_memory(cl_mem_flags flags, const VectorType<SCALARTYPE, A> & buffer) const
         {
-          return create_memory(flags, static_cast<cl_uint>(sizeof(SCALARTYPE) * _buffer.size()), (void*)&_buffer[0]);
+          return viennacl::ocl::handle<cl_mem>(create_memory_without_smart_handle(flags, static_cast<cl_uint>(sizeof(SCALARTYPE) * buffer.size()), (void*)&buffer[0]), *this);
         }
-        
+
         //////////////////// create queues ////////////////////////////////
-        
+
         /** @brief Adds an existing queue for the given device to the context */
         void add_queue(cl_device_id dev, cl_command_queue q)
         {
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Adding existing queue " << q << " for device " << dev << " to context " << h_ << std::endl;
           #endif
-          queues_[dev].push_back(viennacl::ocl::command_queue(q, dev));
+          viennacl::ocl::handle<cl_command_queue> queue_handle(q, *this);
+          queues_[dev].push_back(viennacl::ocl::command_queue(queue_handle));
+          queues_[dev].back().handle().inc();
         }
-        
+
         /** @brief Adds a queue for the given device to the context */
         void add_queue(cl_device_id dev)
         {
@@ -207,10 +232,14 @@ namespace viennacl
           std::cout << "ViennaCL: Adding new queue for device " << dev << " to context " << h_ << std::endl;
           #endif
           cl_int err;
-          viennacl::ocl::handle<cl_command_queue> temp = clCreateCommandQueue(h_.get(), dev, 0, &err);
+#ifdef VIENNACL_PROFILING_ENABLED
+          viennacl::ocl::handle<cl_command_queue> temp(clCreateCommandQueue(h_.get(), dev, CL_QUEUE_PROFILING_ENABLE, &err), *this);
+#else
+          viennacl::ocl::handle<cl_command_queue> temp(clCreateCommandQueue(h_.get(), dev, 0, &err), *this);
+#endif
           VIENNACL_ERR_CHECK(err);
-          
-          queues_[dev].push_back(viennacl::ocl::command_queue(temp, dev));
+
+          queues_[dev].push_back(viennacl::ocl::command_queue(temp));
         }
 
         /** @brief Adds a queue for the given device to the context */
@@ -219,14 +248,31 @@ namespace viennacl
         //get queue for default device:
         viennacl::ocl::command_queue & get_queue()
         {
-          return queues_[devices_[current_device_id].id()][0];
+          return queues_[devices_[current_device_id_].id()][current_queue_id_];
+        }
+
+        viennacl::ocl::command_queue const & get_queue() const
+        {
+          typedef std::map< cl_device_id, std::vector<viennacl::ocl::command_queue> >    QueueContainer;
+
+          // find queue:
+          QueueContainer::const_iterator it = queues_.find(devices_[current_device_id_].id());
+          if (it != queues_.end())
+            return (it->second)[current_queue_id_];
+
+          std::cerr << "ViennaCL: FATAL ERROR: Could not obtain current command queue!" << std::endl;
+          std::cout << "Number of queues in context: " << queues_.size() << std::endl;
+          std::cout << "Number of devices in context: " << devices_.size() << std::endl;
+          throw "queue not found!";
+
+          //return (it->second)[current_queue_id_];
         }
-        
+
         //get a particular queue:
         /** @brief Returns the queue with the provided index for the given device */
-        viennacl::ocl::command_queue & get_queue(cl_device_id dev, size_t i = 0)
+        viennacl::ocl::command_queue & get_queue(cl_device_id dev, vcl_size_t i = 0)
         {
-          assert(i >= 0 && i < queues_.size() && "In class 'context': id invalid in get_queue()");
+          assert(i < queues_.size() && bool("In class 'context': id invalid in get_queue()"));
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Getting queue " << i << " for device " << dev << " in context " << h_ << std::endl;
           #endif
@@ -236,53 +282,134 @@ namespace viennacl
             if (devices_[device_index] == dev)
               break;
           }
-          
-          assert(device_index < devices_.size() && "Device not within context");
-          
+
+          assert(device_index < devices_.size() && bool("Device not within context"));
+
           return queues_[devices_[device_index].id()][i];
         }
-        
+
+        /** @brief Returns the current device */
+        // TODO: work out the const issues
+        viennacl::ocl::command_queue const & current_queue() //const
+        {
+          return queues_[devices_[current_device_id_].id()][current_queue_id_];
+        }
+
+        /** @brief Switches the current device to the i-th device in this context */
+        void switch_queue(vcl_size_t i)
+        {
+          assert(i < queues_[devices_[current_device_id_].id()].size() && bool("In class 'context': Provided queue index out of range for device!"));
+          current_queue_id_ = i;
+        }
+
+#if 1
+        /** @brief If the supplied command_queue is used within the context, it becomes the current active command_queue, the command_queue's device becomes current active device. */
+        void switch_queue(viennacl::ocl::command_queue const & q)
+        {
+          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
+          std::cout << "ViennaCL: Setting new current queue for context " << h_ << std::endl;
+          #endif
+          bool found = false;
+          typedef std::map< cl_device_id, std::vector<viennacl::ocl::command_queue> >    QueueContainer;
+
+          // For each device:
+          vcl_size_t j = 0;
+          for (QueueContainer::const_iterator it=queues_.begin(); it != queues_.end(); it++,j++)
+          {
+              const std::vector<viennacl::ocl::command_queue> & qv = (it->second);
+              // For each queue candidate
+              for (vcl_size_t i=0; i<qv.size(); ++i)
+              {
+                  if (qv[i] == q)
+                  {
+                      found = true;
+                      current_device_id_ = j;
+                      current_queue_id_ = i;
+                      break;
+                  }
+              }
+          }
+          if (found == false)
+            std::cerr << "ViennaCL: Warning: Could not set queue " << q.handle().get() << " for context." << std::endl;
+        }
+#endif
+
         /////////////////// create program ///////////////////////////////
         /** @brief Adds a program to the context
         */
         viennacl::ocl::program & add_program(cl_program p, std::string const & prog_name)
         {
-          programs_.push_back(viennacl::ocl::program(p, prog_name));
+          programs_.push_back(viennacl::ocl::program(p, *this, prog_name));
           return programs_.back();
         }
-        
-        /** @brief Adds a new program with the provided source to the context
+
+        /** @brief Adds a new program with the provided source to the context. Compiles the program and extracts all kernels from it
         */
         viennacl::ocl::program & add_program(std::string const & source, std::string const & prog_name)
         {
           const char * source_text = source.c_str();
-          size_t source_size = source.size();
+          vcl_size_t source_size = source.size();
           cl_int err;
-          
+
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Adding program '" << prog_name << "' to context " << h_ << std::endl;
           #endif
-          
-          viennacl::ocl::handle<cl_program> temp = clCreateProgramWithSource(h_.get(), 1, (const char **)&source_text, &source_size, &err);
+
+          //
+          // Build program
+          //
+          cl_program temp = clCreateProgramWithSource(h_.get(), 1, (const char **)&source_text, &source_size, &err);
           VIENNACL_ERR_CHECK(err);
-          
-          err = clBuildProgram(temp.get(), 0, NULL, NULL, NULL, NULL);
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_BUILD)
-            char buffer[1024];
+
+          const char * options = build_options_.c_str();
+          err = clBuildProgram(temp, 0, NULL, options, NULL, NULL);
+          if (err != CL_SUCCESS)
+          {
+            char buffer[8192];
             cl_build_status status;
             clGetProgramBuildInfo(temp, devices_[0].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
-            clGetProgramBuildInfo(temp, devices_[0].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
+            clGetProgramBuildInfo(temp, devices_[0].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &buffer, NULL);
             std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
             std::cout << "Log: " << buffer << std::endl;
-            //std::cout << "Sources: " << source << std::endl;
-          #endif
+            std::cout << "Sources: " << source << std::endl;
+          }
           VIENNACL_ERR_CHECK(err);
 
-          programs_.push_back(viennacl::ocl::program(temp, prog_name));
-          
-          return programs_.back();
+          programs_.push_back(viennacl::ocl::program(temp, *this, prog_name));
+
+          viennacl::ocl::program & prog = programs_.back();
+
+          //
+          // Extract kernels
+          //
+          cl_kernel kernels[1024];
+          cl_uint   num_kernels_in_prog;
+          err = clCreateKernelsInProgram(prog.handle().get(), 1024, kernels, &num_kernels_in_prog);
+          VIENNACL_ERR_CHECK(err);
+
+          for (cl_uint i=0; i<num_kernels_in_prog; ++i)
+          {
+            char kernel_name[128];
+            err = clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, 128, kernel_name, NULL);
+            prog.add_kernel(kernels[i], std::string(kernel_name));
+          }
+
+          return prog;
         }
-        
+
+        /** @brief Delete the program with the provided name */
+        void delete_program(std::string const & name){
+          for (ProgramContainer::iterator it = programs_.begin();
+                it != programs_.end();
+                ++it)
+          {
+            if (it->name() == name){
+              programs_.erase(it);
+              return;
+            }
+          }
+        }
+
         /** @brief Returns the program with the provided name */
         viennacl::ocl::program & get_program(std::string const & name)
         {
@@ -297,42 +424,94 @@ namespace viennacl
               return *it;
           }
           std::cerr << "Could not find program '" << name << "'" << std::endl;
-          assert(!"In class 'context': name invalid in get_program()");
-          return programs_[0];  //return a defined object
+          throw "In class 'context': name invalid in get_program()";
+          //return programs_[0];  //return a defined object
         }
-        
+
+        viennacl::ocl::program const & get_program(std::string const & name) const
+        {
+          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
+          std::cout << "ViennaCL: Getting program '" << name << "' from context " << h_ << std::endl;
+          #endif
+          for (ProgramContainer::const_iterator it = programs_.begin();
+                it != programs_.end();
+                ++it)
+          {
+            if (it->name() == name)
+              return *it;
+          }
+          std::cerr << "Could not find program '" << name << "'" << std::endl;
+          throw "In class 'context': name invalid in get_program()";
+          //return programs_[0];  //return a defined object
+        }
+
+        /** @brief Returns whether the program with the provided name exists or not */
+        bool has_program(std::string const & name){
+            for (ProgramContainer::iterator it = programs_.begin();
+                  it != programs_.end();
+                  ++it)
+            {
+              if (it->name() == name) return true;
+            }
+            return false;
+        }
+
         /** @brief Returns the program with the provided id */
-        viennacl::ocl::program & get_program(size_t id)
+        viennacl::ocl::program & get_program(vcl_size_t id)
         {
-          assert(id >= 0 && id < programs_.size() && "In class 'context': id invalid in get_program()");
+          assert(id < programs_.size() && bool("In class 'context': id invalid in get_program()"));
           return programs_[id];
         }
-        
+
         /** @brief Returns the number of programs within this context */
-        size_t program_num() { return programs_.size(); }
+        vcl_size_t program_num() { return programs_.size(); }
+
+        /** @brief Convenience function for retrieving the kernel of a program directly from the context */
+        viennacl::ocl::kernel & get_kernel(std::string const & program_name, std::string const & kernel_name) { return get_program(program_name).get_kernel(kernel_name); }
 
         /** @brief Returns the number of devices within this context */
-        size_t device_num() { return devices_.size(); }
-        
+        vcl_size_t device_num() { return devices_.size(); }
+
         /** @brief Returns the context handle */
         const viennacl::ocl::handle<cl_context> & handle() const { return h_; }
-        
+
+        /** @brief Returns the current build option string */
+        std::string build_options() const { return build_options_; }
+
+        /** @brief Sets the build option string, which is passed to the OpenCL compiler in subsequent compilations. Does not effect programs already compiled previously. */
+        void build_options(std::string op) { build_options_ = op; }
+
+        /** @brief Returns the platform ID of the platform to be used for the context */
+        vcl_size_t platform_index() const  { return pf_index_; }
+
+        /** @brief Sets the platform ID of the platform to be used for the context */
+        void platform_index(vcl_size_t new_index)
+        {
+          assert(!initialized_ && bool("Platform ID must be set before context is initialized!"));
+          pf_index_ = new_index;
+        }
+
         /** @brief Less-than comparable for compatibility with std:map  */
         bool operator<(context const & other) const
         {
           return h_.get() < other.h_.get();
         }
-        
+
+        bool operator==(context const & other) const
+        {
+          return h_.get() == other.h_.get();
+        }
+
       private:
         /** @brief Initialize a new context. Reuse any previously supplied information (devices, queues) */
         void init_new()
         {
-          assert(!initialized_ && "ViennaCL FATAL error: Context already created!");
+          assert(!initialized_ && bool("ViennaCL FATAL error: Context already created!"));
 
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Initializing new ViennaCL context." << std::endl;
           #endif
-          
+
           cl_int err;
           std::vector<cl_device_id> device_id_array;
           if (devices_.empty()) //get the default device if user has not yet specified a list of devices
@@ -341,15 +520,16 @@ namespace viennacl
             #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
             std::cout << "ViennaCL: Setting all devices for context..." << std::endl;
             #endif
-            
-            platform pf;
+
+            platform pf(pf_index_);
             std::vector<device> devices = pf.devices(device_type_);
             #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
             std::cout << "ViennaCL: Number of devices for context: " << devices.size() << std::endl;
             #endif
-            for (size_t i=0; i<devices.size(); ++i)
+            vcl_size_t device_num = std::min<vcl_size_t>(default_device_num_, devices.size());
+            for (vcl_size_t i=0; i<device_num; ++i)
               devices_.push_back(devices[i]);
-            
+
             if (devices.size() == 0)
             {
               std::cerr << "ViennaCL: FATAL ERROR: No devices of type '";
@@ -365,81 +545,140 @@ namespace viennacl
               std::cout << "' found!" << std::endl;
             }
           }
-          
+
           //extract list of device ids:
           for (std::vector< viennacl::ocl::device >::const_iterator iter = devices_.begin();
                                                                     iter != devices_.end();
                                                                   ++iter)
             device_id_array.push_back(iter->id());
-            
-          cl_uint device_num = std::max(default_device_num_, device_id_array.size());
-          h_ = clCreateContext(0, 
-                               device_num,
+
+          h_ = clCreateContext(0,
+                               static_cast<cl_uint>(devices_.size()),
                                &(device_id_array[0]),
                                NULL, NULL, &err);
           VIENNACL_ERR_CHECK(err);
-          
+
           initialized_ = true;
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Initialization of new ViennaCL context done." << std::endl;
           #endif
         }
-        
+
         /** @brief Reuses a supplied context. */
         void init_existing(cl_context c)
         {
-          assert(!initialized_ && "ViennaCL FATAL error: Context already created!");
+          assert(!initialized_ && bool("ViennaCL FATAL error: Context already created!"));
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Initialization of ViennaCL context from existing context." << std::endl;
           #endif
-          
+
           //set context handle:
           h_ = c;
-          
+          h_.inc(); // if the user provides the context, then the user will also call release() on the context. Without inc(), we would get a seg-fault due to double-free at program termination.
+
           if (devices_.empty())
           {
             //get devices for context:
             cl_int err;
             cl_uint num_devices;
-            size_t temp;
+            vcl_size_t temp;
             //Note: The obvious
             //  err = clGetContextInfo(h_, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
             //does not work with NVIDIA OpenCL stack!
             err = clGetContextInfo(h_.get(), CL_CONTEXT_DEVICES, VIENNACL_OCL_MAX_DEVICE_NUM * sizeof(cl_device_id), NULL, &temp);
             VIENNACL_ERR_CHECK(err);
-            assert(temp > 0 && "ViennaCL: FATAL error: Provided context does not contain any devices!");
-            num_devices = temp / sizeof(cl_device_id);
-            
+            assert(temp > 0 && bool("ViennaCL: FATAL error: Provided context does not contain any devices!"));
+            num_devices = static_cast<cl_uint>(temp / sizeof(cl_device_id));
+
             #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
             std::cout << "ViennaCL: Reusing context with " << num_devices << " devices." << std::endl;
             #endif
-            
+
             std::vector<cl_device_id> device_ids(num_devices);
             err = clGetContextInfo(h_.get(), CL_CONTEXT_DEVICES, num_devices * sizeof(cl_device_id), &(device_ids[0]), NULL);
             VIENNACL_ERR_CHECK(err);
-            
-            for (size_t i=0; i<num_devices; ++i)
+
+            for (vcl_size_t i=0; i<num_devices; ++i)
               devices_.push_back(viennacl::ocl::device(device_ids[i]));
           }
-          current_device_id = 0;
-          
+          current_device_id_ = 0;
+
           initialized_ = true;
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Initialization of ViennaCL context from existing context done." << std::endl;
           #endif
-        }       
-        
-        
+        }
+
+
         bool initialized_;
         cl_device_type device_type_;
         viennacl::ocl::handle<cl_context> h_;
         std::vector< viennacl::ocl::device > devices_;
-        unsigned int current_device_id;
-        std::size_t default_device_num_;
+        vcl_size_t current_device_id_;
+        vcl_size_t default_device_num_;
         ProgramContainer programs_;
         std::map< cl_device_id, std::vector< viennacl::ocl::command_queue> > queues_;
+        std::string build_options_;
+        vcl_size_t pf_index_;
+        vcl_size_t current_queue_id_;
     }; //context
-    
+
+
+
+    /** @brief Adds a kernel to the program */
+    inline viennacl::ocl::kernel & viennacl::ocl::program::add_kernel(cl_kernel kernel_handle, std::string const & kernel_name)
+    {
+      assert(p_context_ != NULL && bool("Pointer to context invalid in viennacl::ocl::program object"));
+      viennacl::ocl::kernel temp(kernel_handle, *this, *p_context_, kernel_name);
+      kernels_.push_back(temp);
+      return kernels_.back();
+    }
+
+    /** @brief Returns the kernel with the provided name */
+    inline viennacl::ocl::kernel & viennacl::ocl::program::get_kernel(std::string const & name)
+    {
+      //std::cout << "Requiring kernel " << name << " from program " << name_ << std::endl;
+      for (KernelContainer::iterator it = kernels_.begin();
+            it != kernels_.end();
+           ++it)
+      {
+        if (it->name() == name)
+          return *it;
+      }
+      std::cerr << "ViennaCL: FATAL ERROR: Could not find kernel '" << name << "' from program '" << name_ << "'" << std::endl;
+      std::cout << "Number of kernels in program: " << kernels_.size() << std::endl;
+      throw "Kernel not found";
+      //return kernels_[0];  //return a defined object
+    }
+
+
+    inline void viennacl::ocl::kernel::set_work_size_defaults()
+    {
+      assert( p_program_ != NULL && bool("Kernel not initialized, program pointer invalid."));
+      assert( p_context_ != NULL && bool("Kernel not initialized, context pointer invalid."));
+
+      if (   (p_context_->current_device().type() == CL_DEVICE_TYPE_GPU)
+          || (p_context_->current_device().type() == CL_DEVICE_TYPE_ACCELERATOR) // Xeon Phi
+         )
+      {
+        local_work_size_[0] = 128;      local_work_size_[1] = 0;  local_work_size_[2] = 0;
+        global_work_size_[0] = 128*128; global_work_size_[1] = 0; global_work_size_[2] = 0;
+      }
+      else //assume CPU type:
+      {
+        //conservative assumption: one thread per CPU core:
+        local_work_size_[0] = 1; local_work_size_[1] = 0; local_work_size_[2] = 0;
+
+        size_type units = p_context_->current_device().max_compute_units();
+        size_type s = 1;
+
+        while (s < units) // find next power of 2. Important to make reductions work on e.g. six-core CPUs.
+          s *= 2;
+
+        global_work_size_[0] = s; global_work_size_[1] = 0; global_work_size_[2] = 0;
+      }
+    }
+
   }
 }
 
diff --git a/viennacl/ocl/device.hpp b/viennacl/ocl/device.hpp
index 8614b99..f04cf53 100644
--- a/viennacl/ocl/device.hpp
+++ b/viennacl/ocl/device.hpp
@@ -1,270 +1,1452 @@
-#ifndef VIENNACL_OCL_DEVICE_HPP_
-#define VIENNACL_OCL_DEVICE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file device.hpp
-    @brief Represents an OpenCL device within ViennaCL
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include<stdio.h>
-
-#include <vector>
-#include <string>
-#include <sstream>
-#include <assert.h>
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/error.hpp"
-
-namespace viennacl
-{
-  namespace ocl
-  {
-    
-    /** @brief A class representing a compute device (e.g. a GPU)
-    *
-    */
-    class device
-    {
-      public:
-        explicit device() : device_(0) {}
-        
-        explicit device(cl_device_id dev) : device_(dev)
-        {
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
-          std::cout << "ViennaCL: Creating device object (CTOR with cl_device_id)" << std::endl;
-          #endif
-          init(dev);
-        }
-        
-        device(const device & other)
-        {
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
-          std::cout << "ViennaCL: Creating device object (Copy CTOR)" << std::endl;
-          #endif
-          device_ = other.device_;
-          init(device_);
-        }
-        
-        /** @brief Initializes the class from a given device ID */
-        void init(cl_device_id dev)
-        {
-          cl_int err;
-
-          //query a little bit of info:
-          err = clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size_, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units_, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(cl_device_type), &type_, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &global_memory_, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_memory_alloc_, NULL);
-          VIENNACL_ERR_CHECK(err);
-          err = clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_memory_, NULL);
-          VIENNACL_ERR_CHECK(err);
-        }
-
-        /** @brief Returns true if the device supports double precision */
-        bool double_support() const
-        { 
-          char buffer[1024];
-          bool ret = false;
-          
-          //get extensions and search for double precision
-          clGetDeviceInfo(device_, CL_DEVICE_EXTENSIONS, sizeof(char)*1024, buffer, NULL);
-          std::string extensions(buffer);
-          if (extensions.find("cl_khr_fp64") != std::string::npos
-              || extensions.find("cl_amd_fp64") != std::string::npos)
-          {
-            ret = true;
-          }
-          
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
-          std::cout << "ViennaCL: Device extensions: " << std::endl;
-          std::cout << extensions << std::endl;
-          if (ret)
-            std::cout << "ViennaCL: Device " << name() << " supports double precision." << std::endl;
-          else
-            std::cout << "ViennaCL: No double precision for device " << name() << "." << std::endl;
-          #endif
-          
-          return ret;
-        }
-        
-        std::string double_support_extension() const
-        {
-          char buffer[1024];
-          clGetDeviceInfo(device_, CL_DEVICE_EXTENSIONS, sizeof(char)*1024, buffer, NULL);
-          std::string extensions(buffer);
-          
-          if (extensions.find("cl_amd_fp64") != std::string::npos) //AMD extension
-            return "cl_amd_fp64";
-          
-          if (extensions.find("cl_khr_fp64") != std::string::npos) //Khronos-certified standard extension for double precision
-            return "cl_khr_fp64";
-          
-          return "";
-        }
-        
-        /** @brief Returns the OpenCL device id */
-        cl_device_id id() const
-        {
-          assert(device_ != 0);
-          return device_;
-        }
-        
-        /** @brief Returns the device name */
-        std::string name() const
-        {
-          std::ostringstream oss;        
-          char buffer[1024]; 
-          cl_int err;          
-          err = clGetDeviceInfo(device_, CL_DEVICE_NAME, sizeof(char)*1024, &buffer, NULL);
-          VIENNACL_ERR_CHECK(err);
-          oss << buffer;
-          return oss.str();          
-        }
-        
-        /** @brief Returns the driver version */
-        std::string driver_version() const
-        {
-          std::ostringstream oss;
-          char buffer[1024]; buffer[0] = 0;
-          cl_int err;          
-          err = clGetDeviceInfo(device_, CL_DRIVER_VERSION, sizeof(char)*1024, buffer, NULL);
-          VIENNACL_ERR_CHECK(err);
-          oss << buffer;
-          return oss.str();          
-        }        
-        
-        /** @brief Returns the number of compute units on the device */
-        cl_uint max_compute_units() const
-        {
-          return compute_units_;
-        }
-        
-        /** @brief Returns the maximum work group size for the device*/
-        size_t max_workgroup_size() const
-        {
-          return max_work_group_size_;
-        }                        
-
-        /** @brief Returns the global memory for the device*/
-        cl_ulong global_memory() const
-        {
-          return global_memory_;
-        }           
-
-        /** @brief Returns the local memory for the device*/
-        cl_ulong local_memory() const
-        {
-          return local_memory_;
-        }       
-
-        /** @brief Returns the maximum allocable memory for the device*/
-        cl_ulong max_allocable_memory() const
-        {
-          return max_memory_alloc_;
-        }           
-        
-        /** @brief Returns an info string with a few properties of the device */
-        std::string info() const
-        {
-          std::ostringstream oss;
-          char buffer[1024]; buffer[0] = 0;
-          cl_int err;
-          cl_uint vendor_id;
-          cl_ulong local_mem_size;
-          cl_ulong global_mem_size;
-          
-          err = clGetDeviceInfo(device_, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &vendor_id, NULL);
-          VIENNACL_ERR_CHECK(err);
-          oss << "CL Device Vendor ID: " << vendor_id << std::endl;
-
-          err = clGetDeviceInfo(device_, CL_DEVICE_NAME, sizeof(char)*1024, buffer, NULL);
-          VIENNACL_ERR_CHECK(err);
-          oss << "CL Device Name: " << buffer << std::endl;
-
-          err = clGetDeviceInfo(device_, CL_DRIVER_VERSION, sizeof(char)*1024, buffer, NULL);
-          VIENNACL_ERR_CHECK(err);
-          std::string test = buffer;
-          oss << "CL Driver Version: " << test << std::endl;
-
-          oss << "--------------------------------" << std::endl;
-          
-          oss << "CL Device Max Compute Units: " << compute_units_ << std::endl;
-
-  //         err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(char)*1024, buffer, NULL);
-  //         CL_ERR_CHECK(err);
-  //         oss << "CL Device Max Work Item Dimensions: " << buffer << std::endl;
-  // 
-  //         err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(char)*1024, buffer, NULL);
-  //         CL_ERR_CHECK(err);
-  //         oss << "CL Device Max Work Item Sizes: " << buffer << std::endl;
-
-          oss << "CL Device Max Work Group Size: " << max_work_group_size_ << std::endl;
-
-          err = clGetDeviceInfo(device_, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &global_mem_size, NULL);
-          VIENNACL_ERR_CHECK(err);
-          oss << "CL Device Global Mem Size: " << global_mem_size << std::endl;
-          
-          err = clGetDeviceInfo(device_, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, NULL);
-          VIENNACL_ERR_CHECK(err);
-          oss << "CL Device Local Mem Size: " << local_mem_size << std::endl;
-          
-          //return info string:
-          std::string ret(oss.str());
-          return ret;
-        }
-        
-        size_t max_work_group_size() const { return max_work_group_size_; }
-        cl_uint compute_units() const { return compute_units_; }
-        cl_device_type type() const { return type_; }
-        
-        bool operator==(device const & other) const
-        {
-          return device_ == other.device_;
-        }
-
-        bool operator==(cl_device_id other) const
-        {
-          return device_ == other;
-        }
-
-      private:
-        
-        cl_device_id    device_;
-        size_t          max_work_group_size_;
-        cl_uint         compute_units_;
-        cl_device_type  type_; //device type
-        cl_ulong        max_memory_alloc_;
-        cl_ulong        global_memory_;
-        cl_ulong        local_memory_;
-    };
-
-  } //namespace ocl
-} //namespace viennacl
-
-#endif
+#ifndef VIENNACL_OCL_DEVICE_HPP_
+#define VIENNACL_OCL_DEVICE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/device.hpp
+    @brief Represents an OpenCL device within ViennaCL
+*/
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include<stdio.h>
+
+#include <vector>
+#include <string>
+#include <sstream>
+#include <assert.h>
+#include "viennacl/ocl/device_utils.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/error.hpp"
+
+namespace viennacl
+{
+  namespace ocl
+  {
+
+    /** @brief A class representing a compute device (e.g. a GPU)
+    *
+    */
+    class device
+    {
+      public:
+        explicit device() : device_(0) { flush_cache(); }
+
+        explicit device(cl_device_id dev) : device_(dev)
+        {
+          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
+          std::cout << "ViennaCL: Creating device object (CTOR with cl_device_id)" << std::endl;
+          #endif
+          flush_cache();
+        }
+
+        device(const device & other) : device_(0)
+        {
+          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
+          std::cout << "ViennaCL: Creating device object (Copy CTOR)" << std::endl;
+          #endif
+          if (device_ != other.device_)
+          {
+            device_ = other.device_;
+            flush_cache();
+          }
+        }
+
+        /** @brief The default compute device address space size specified as an unsigned integer value in bits. Currently supported values are 32 or 64 bits. */
+        cl_uint address_bits() const
+        {
+          if (!address_bits_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), static_cast<void *>(&address_bits_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            address_bits_valid_ = true;
+          }
+          return address_bits_;
+        }
+
+        /** @brief Is CL_TRUE if the device is available and CL_FALSE if the device is not available. */
+        cl_bool available() const
+        {
+          if (!available_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_AVAILABLE, sizeof(cl_bool), static_cast<void *>(&available_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            available_valid_ = true;
+          }
+          return available_;
+        }
+
+        /** @brief Is CL_FALSE if the implementation does not have a compiler available to compile the program source. Is CL_TRUE if the compiler is available. This can be CL_FALSE for the embedded platform profile only. */
+        cl_bool compiler_available() const
+        {
+          if (!compiler_available_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_COMPILER_AVAILABLE , sizeof(cl_bool), static_cast<void *>(&compiler_available_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            compiler_available_valid_ = true;
+          }
+          return compiler_available_;
+        }
+
+#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+        /** @brief Describes the OPTIONAL double precision floating-point capability of the OpenCL device.
+          *
+          * This is a bit-field that describes one or more of the following values:
+          *   CL_FP_DENORM - denorms are supported.
+          *   CL_FP_INF_NAN - INF and NaNs are supported.
+          *   CL_FP_ROUND_TO_NEAREST - round to nearest even rounding mode supported.
+          *   CL_FP_ROUND_TO_ZERO - round to zero rounding mode supported.
+          *   CL_FP_ROUND_TO_INF - round to +ve and -ve infinity rounding modes supported.
+          *   CP_FP_FMA - IEEE754-2008 fused multiply-add is supported.
+          *
+          * The mandated minimum double precision floating-point capability is
+          * CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM.
+          */
+        cl_device_fp_config double_fp_config() const
+        {
+          if (!double_fp_config_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cl_device_fp_config), static_cast<void *>(&double_fp_config_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            double_fp_config_valid_ = true;
+          }
+          return double_fp_config_;
+        }
+#endif
+
+        /** @brief Is CL_TRUE if the OpenCL device is a little endian device and CL_FALSE otherwise. */
+        cl_bool endian_little() const
+        {
+          if (!endian_little_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), static_cast<void *>(&endian_little_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            endian_little_valid_ = true;
+          }
+          return endian_little_;
+        }
+
+        /** @brief Is CL_TRUE if the device implements error correction for all accesses to compute device memory (global and constant) and CL_FALSE otherwise. */
+        cl_bool error_correction_support() const
+        {
+          if (!error_correction_support_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_ERROR_CORRECTION_SUPPORT , sizeof(cl_bool), static_cast<void *>(&error_correction_support_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            error_correction_support_valid_ = true;
+          }
+          return error_correction_support_;
+        }
+
+        /** @brief Describes the execution capabilities of the device.
+          *
+          * This is a bit-field that describes one or more of the following values:
+          *   CL_EXEC_KERNEL - The OpenCL device can execute OpenCL kernels.
+          *   CL_EXEC_NATIVE_KERNEL - The OpenCL device can execute native kernels.
+          * The mandated minimum capability is CL_EXEC_KERNEL.
+          */
+        cl_device_exec_capabilities execution_capabilities() const
+        {
+          if (!execution_capabilities_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_EXECUTION_CAPABILITIES  , sizeof(cl_device_exec_capabilities), static_cast<void *>(&execution_capabilities_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            execution_capabilities_valid_ = true;
+          }
+          return execution_capabilities_;
+        }
+
+        /** @brief Returns a space-separated list of extension names (the extension names themselves do not contain any spaces).
+          *
+          * The list of extension names returned currently can include one or more of the following approved extension names:
+          *   cl_khr_fp64
+          *   cl_khr_int64_base_atomics
+          *   cl_khr_int64_extended_atomics
+          *   cl_khr_fp16
+          *   cl_khr_gl_sharing
+          *   cl_khr_gl_event
+          *   cl_khr_d3d10_sharing
+          */
+        std::string extensions() const
+        {
+          if (!extensions_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_EXTENSIONS, sizeof(char) * 2048, static_cast<void *>(&extensions_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            extensions_valid_ = true;
+          }
+          return extensions_;
+        }
+
+        /** @brief Size of global memory cache in bytes. */
+        cl_ulong global_mem_cache_size() const
+        {
+          if (!global_mem_cache_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_,  CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(cl_ulong), static_cast<void *>(&global_mem_cache_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            global_mem_cache_size_valid_ = true;
+          }
+          return global_mem_cache_size_;
+        }
+
+        /** @brief Type of global memory cache supported. Valid values are: CL_NONE, CL_READ_ONLY_CACHE, and CL_READ_WRITE_CACHE. */
+        cl_device_mem_cache_type global_mem_cache_type() const
+        {
+          if (!global_mem_cache_type_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cl_device_mem_cache_type), static_cast<void *>(&global_mem_cache_type_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            global_mem_cache_type_valid_ = true;
+          }
+          return global_mem_cache_type_;
+        }
+
+        /** @brief Size of a global memory cache line in bytes (CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE). */
+        cl_uint global_mem_cacheline_size() const
+        {
+          if (!global_mem_cacheline_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_,  CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cl_uint), static_cast<void *>(&global_mem_cacheline_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            global_mem_cacheline_size_valid_ = true;
+          }
+          return global_mem_cacheline_size_;
+        }
+
+        /** @brief Size of global memory in bytes. */
+        cl_ulong global_mem_size() const
+        {
+          if (!global_mem_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_,  CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), static_cast<void *>(&global_mem_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            global_mem_size_valid_ = true;
+          }
+          return global_mem_size_;
+        }
+
+#ifdef CL_DEVICE_HALF_FP_CONFIG
+        /** @brief Describes the OPTIONAL half precision floating-point capability of the OpenCL device.
+          *
+          * This is a bit-field that describes one or more of the following values:
+          *   CL_FP_DENORM - denorms are supported.
+          *   CL_FP_INF_NAN - INF and NaNs are supported.
+          *   CL_FP_ROUND_TO_NEAREST - round to nearest even rounding mode supported.
+          *   CL_FP_ROUND_TO_ZERO - round to zero rounding mode supported.
+          *   CL_FP_ROUND_TO_INF - round to +ve and -ve infinity rounding modes supported.
+          *   CL_FP_FMA - IEEE754-2008 fused multiply-add is supported.
+          *
+          * The required minimum half precision floating-point capability as implemented by this extension is CL_FP_ROUND_TO_ZERO or CL_FP_ROUND_TO_INF | CL_FP_INF_NAN.
+          */
+        cl_device_fp_config half_fp_config() const
+        {
+          if (!half_fp_config_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_HALF_FP_CONFIG, sizeof(cl_device_fp_config), static_cast<void *>(&half_fp_config_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            half_fp_config_valid_ = true;
+          }
+          return half_fp_config_;
+        }
+#endif
+
+        // The image limits below are only meaningful when image_support() returns CL_TRUE;
+        // each value is queried lazily on first call and cached, like the other getters here.
+        /** @brief Is CL_TRUE if the device and the host have a unified memory subsystem and is CL_FALSE otherwise. */
+        cl_bool host_unified_memory() const
+        {
+          if (!host_unified_memory_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), static_cast<void *>(&host_unified_memory_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            host_unified_memory_valid_ = true;
+          }
+          return host_unified_memory_;
+        }
+
+        /** @brief Is CL_TRUE if images are supported by the OpenCL device and CL_FALSE otherwise. */
+        cl_bool image_support() const
+        {
+          if (!image_support_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), static_cast<void *>(&image_support_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            image_support_valid_ = true;
+          }
+          return image_support_;
+        }
+
+        /** @brief Max height of 2D image in pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        size_t image2d_max_height() const
+        {
+          if (!image2d_max_height_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), static_cast<void *>(&image2d_max_height_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            image2d_max_height_valid_ = true;
+          }
+          return image2d_max_height_;
+        }
+
+        /** @brief Max width of 2D image in pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        size_t image2d_max_width() const
+        {
+          if (!image2d_max_width_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), static_cast<void *>(&image2d_max_width_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            image2d_max_width_valid_ = true;
+          }
+          return image2d_max_width_;
+        }
+
+        /** @brief Max depth of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        size_t image3d_max_depth() const
+        {
+          if (!image3d_max_depth_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), static_cast<void *>(&image3d_max_depth_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            image3d_max_depth_valid_ = true;
+          }
+          return image3d_max_depth_;
+        }
+
+        /** @brief Max height of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        size_t image3d_max_height() const
+        {
+          if (!image3d_max_height_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), static_cast<void *>(&image3d_max_height_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            image3d_max_height_valid_ = true;
+          }
+          return image3d_max_height_;
+        }
+
+        /** @brief Max width of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        size_t image3d_max_width() const
+        {
+          if (!image3d_max_width_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), static_cast<void *>(&image3d_max_width_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            image3d_max_width_valid_ = true;
+          }
+          return image3d_max_width_;
+        }
+
+        /** @brief Size of local memory arena in bytes. The minimum value is 32 KB. */
+        cl_ulong local_mem_size() const
+        {
+          if (!local_mem_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), static_cast<void *>(&local_mem_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            local_mem_size_valid_ = true;
+          }
+          return local_mem_size_;
+        }
+
+        /** @brief Type of local memory supported. This can be set to CL_LOCAL implying dedicated local memory storage such as SRAM, or CL_GLOBAL. */
+        cl_device_local_mem_type local_mem_type() const
+        {
+          if (!local_mem_type_valid_)
+          {
+            // CL_GLOBAL means "local" memory is emulated in global memory (typical for CPU devices).
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), static_cast<void *>(&local_mem_type_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            local_mem_type_valid_ = true;
+          }
+          return local_mem_type_;
+        }
+
+        /** @brief Maximum configured clock frequency of the device in MHz. */
+        cl_uint max_clock_frequency() const
+        {
+          if (!max_clock_frequency_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), static_cast<void *>(&max_clock_frequency_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_clock_frequency_valid_ = true;
+          }
+          return max_clock_frequency_;
+        }
+
+        /** @brief The number of parallel compute cores on the OpenCL device. The minimum value is 1. */
+        cl_uint max_compute_units() const
+        {
+          if (!max_compute_units_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), static_cast<void *>(&max_compute_units_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_compute_units_valid_ = true;
+          }
+          return max_compute_units_;
+        }
+
+        /** @brief Max number of arguments declared with the __constant qualifier in a kernel. The minimum value is 8. */
+        cl_uint max_constant_args() const
+        {
+          if (!max_constant_args_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(cl_uint), static_cast<void *>(&max_constant_args_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_constant_args_valid_ = true;
+          }
+          return max_constant_args_;
+        }
+
+        /** @brief Max size in bytes of a constant buffer allocation. The minimum value is 64 KB. */
+        cl_ulong max_constant_buffer_size() const
+        {
+          if (!max_constant_buffer_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), static_cast<void *>(&max_constant_buffer_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_constant_buffer_size_valid_ = true;
+          }
+          return max_constant_buffer_size_;
+        }
+
+        /** @brief Max size of memory object allocation in bytes. The minimum value is max(1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024) */
+        cl_ulong max_mem_alloc_size() const
+        {
+          if (!max_mem_alloc_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), static_cast<void *>(&max_mem_alloc_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_mem_alloc_size_valid_ = true;
+          }
+          return max_mem_alloc_size_;
+        }
+
+        /** @brief Max size in bytes of the arguments that can be passed to a kernel. The minimum value is 1024.
+          *
+          * For this minimum value, only a maximum of 128 arguments can be passed to a kernel.
+          */
+        size_t max_parameter_size() const
+        {
+          if (!max_parameter_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof(size_t), static_cast<void *>(&max_parameter_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_parameter_size_valid_ = true;
+          }
+          return max_parameter_size_;
+        }
+
+        /** @brief Max number of simultaneous image objects that can be read by a kernel. The minimum value is 128 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        cl_uint max_read_image_args() const
+        {
+          if (!max_read_image_args_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(cl_uint), static_cast<void *>(&max_read_image_args_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_read_image_args_valid_ = true;
+          }
+          return max_read_image_args_;
+        }
+
+        /** @brief Maximum number of samplers that can be used in a kernel. The minimum value is 16 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        cl_uint max_samplers() const
+        {
+          if (!max_samplers_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_SAMPLERS, sizeof(cl_uint), static_cast<void *>(&max_samplers_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_samplers_valid_ = true;
+          }
+          return max_samplers_;
+        }
+
+        /** @brief Maximum number of work-items in a work-group executing a kernel using the data parallel execution model. The minimum value is 1. */
+        size_t max_work_group_size() const
+        {
+          if (!max_work_group_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), static_cast<void *>(&max_work_group_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_work_group_size_valid_ = true;
+          }
+          return max_work_group_size_;
+        }
+
+        /** @brief Maximum dimensions that specify the global and local work-item IDs used by the data parallel execution model. The minimum value is 3. */
+        cl_uint max_work_item_dimensions() const
+        {
+          if (!max_work_item_dimensions_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), static_cast<void *>(&max_work_item_dimensions_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_work_item_dimensions_valid_ = true;
+          }
+          return max_work_item_dimensions_;
+        }
+
+        /** @brief Maximum number of work-items that can be specified in each dimension of the work-group.
+          *
+          * Returns n size_t entries, where n is the value returned by the query for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. The minimum value is (1, 1, 1).
+          */
+        std::vector<size_t> max_work_item_sizes() const
+        {
+          std::vector<size_t> result(max_work_item_dimensions());
+
+          // The cached buffer is written with sizeof(size_t) * 16 below, i.e. it holds 16 entries,
+          // so exactly 16 supported dimensions is still valid. (The previous check used '<', which
+          // wrongly rejected a device reporting 16 dimensions.)
+          assert(result.size() <= 16 && bool("Supported work item dimensions exceed available capacity!"));
+
+          if (!max_work_item_sizes_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 16, static_cast<void *>(&max_work_item_sizes_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_work_item_sizes_valid_ = true;
+          }
+
+          // Copy only the first n cached entries into the result vector.
+          for (vcl_size_t i=0; i<result.size(); ++i)
+            result[i] = max_work_item_sizes_[i];
+
+          return result;
+        }
+
+        /** @brief Max number of simultaneous image objects that can be written to by a kernel. The minimum value is 8 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. */
+        cl_uint max_write_image_args() const
+        {
+          if (!max_write_image_args_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(cl_uint), static_cast<void *>(&max_write_image_args_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            max_write_image_args_valid_ = true;
+          }
+          return max_write_image_args_;
+        }
+
+        /** @brief Describes the alignment in bits of the base address of any allocated memory object. */
+        cl_uint mem_base_addr_align() const
+        {
+          if (!mem_base_addr_align_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), static_cast<void *>(&mem_base_addr_align_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            mem_base_addr_align_valid_ = true;
+          }
+          return mem_base_addr_align_;
+        }
+
+        /** @brief The smallest alignment in bytes which can be used for any data type. */
+        cl_uint min_data_type_align_size() const
+        {
+          if (!min_data_type_align_size_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(cl_uint), static_cast<void *>(&min_data_type_align_size_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            min_data_type_align_size_valid_ = true;
+          }
+          return min_data_type_align_size_;
+        }
+
+        /** @brief Device name string. */
+        std::string name() const
+        {
+          if (!name_valid_)
+          {
+            // Fixed 256-byte buffer; device names are expected to fit -- TODO confirm for exotic devices.
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NAME, sizeof(char) * 256, static_cast<void *>(name_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            name_valid_ = true;
+          }
+          return name_;
+        }
+
+        /** @brief Device architecture family. */
+        device_architecture_family architecture_family() const
+        {
+          // Derived from the vendor ID and device name rather than a direct OpenCL query.
+          if( !architecture_family_valid_)
+          {
+            architecture_family_ = get_device_architecture(vendor_id(), name());
+            architecture_family_valid_ = true;
+          }
+          return architecture_family_;
+        }
+
+        // Native (as opposed to preferred) ISA vector widths, one getter per scalar type.
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint native_vector_width_char() const
+        {
+          if (!native_vector_width_char_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, sizeof(cl_uint), static_cast<void *>(&native_vector_width_char_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_char_valid_ = true;
+          }
+          return native_vector_width_char_;
+        }
+
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint native_vector_width_short() const
+        {
+          if (!native_vector_width_short_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, sizeof(cl_uint), static_cast<void *>(&native_vector_width_short_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_short_valid_ = true;
+          }
+          return native_vector_width_short_;
+        }
+
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint native_vector_width_int() const
+        {
+          if (!native_vector_width_int_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof(cl_uint), static_cast<void *>(&native_vector_width_int_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_int_valid_ = true;
+          }
+          return native_vector_width_int_;
+        }
+
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint native_vector_width_long() const
+        {
+          if (!native_vector_width_long_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, sizeof(cl_uint), static_cast<void *>(&native_vector_width_long_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_long_valid_ = true;
+          }
+          return native_vector_width_long_;
+        }
+
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint native_vector_width_float() const
+        {
+          if (!native_vector_width_float_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), static_cast<void *>(&native_vector_width_float_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_float_valid_ = true;
+          }
+          return native_vector_width_float_;
+        }
+
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector.
+          *
+          * If the cl_khr_fp64 extension is not supported, this function returns 0.
+          */
+        cl_uint native_vector_width_double() const
+        {
+          if (!native_vector_width_double_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), static_cast<void *>(&native_vector_width_double_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_double_valid_ = true;
+          }
+          return native_vector_width_double_;
+        }
+
+        /** @brief Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector.
+          *
+          * If the cl_khr_fp16 extension is not supported, this function returns 0.
+          */
+        cl_uint native_vector_width_half() const
+        {
+          if (!native_vector_width_half_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, sizeof(cl_uint), static_cast<void *>(&native_vector_width_half_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            native_vector_width_half_valid_ = true;
+          }
+          return native_vector_width_half_;
+        }
+
+        /** @brief OpenCL C version string. Returns the highest OpenCL C version supported by the compiler for this device.
+          *
+          * This version string has the following format:
+          *   OpenCL[space]C[space][major_version.minor_version][space][vendor-specific information]
+          * The major_version.minor_version value must be 1.1 if CL_DEVICE_VERSION is OpenCL 1.1.
+          * The major_version.minor_version value returned can be 1.0 or 1.1 if CL_DEVICE_VERSION is OpenCL 1.0.
+          * If OpenCL C 1.1 is returned, this implies that the language feature set defined in section 6 of the OpenCL 1.1 specification is supported by the OpenCL 1.0 device.
+          */
+        std::string opencl_c_version() const
+        {
+          if (!opencl_c_version_valid_)
+          {
+            // Fixed 128-byte buffer for the cached version string.
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_OPENCL_C_VERSION, sizeof(char) * 128, static_cast<void *>(opencl_c_version_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            opencl_c_version_valid_ = true;
+          }
+          return opencl_c_version_;
+        }
+
+        /** @brief The platform associated with this device. */
+        cl_platform_id platform() const
+        {
+          if (!platform_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), static_cast<void *>(&platform_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            platform_valid_ = true;
+          }
+          return platform_;
+        }
+
+        // Preferred (compiler-chosen) vector widths, one getter per scalar type.
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint preferred_vector_width_char() const
+        {
+          if (!preferred_vector_width_char_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_char_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_char_valid_ = true;
+          }
+          return preferred_vector_width_char_;
+        }
+
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint preferred_vector_width_short() const
+        {
+          if (!preferred_vector_width_short_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_short_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_short_valid_ = true;
+          }
+          return preferred_vector_width_short_;
+        }
+
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint preferred_vector_width_int() const
+        {
+          if (!preferred_vector_width_int_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_int_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_int_valid_ = true;
+          }
+          return preferred_vector_width_int_;
+        }
+
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint preferred_vector_width_long() const
+        {
+          if (!preferred_vector_width_long_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_long_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_long_valid_ = true;
+          }
+          return preferred_vector_width_long_;
+        }
+
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. */
+        cl_uint preferred_vector_width_float() const
+        {
+          if (!preferred_vector_width_float_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_float_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_float_valid_ = true;
+          }
+          return preferred_vector_width_float_;
+        }
+
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector.
+          *
+          * If the cl_khr_fp64 extension is not supported, this function returns 0.
+          */
+        cl_uint preferred_vector_width_double() const
+        {
+          if (!preferred_vector_width_double_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_double_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_double_valid_ = true;
+          }
+          return preferred_vector_width_double_;
+        }
+
+        /** @brief Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector.
+          *
+          * If the cl_khr_fp16 extension is not supported, this function returns 0.
+          */
+        cl_uint preferred_vector_width_half() const
+        {
+          if (!preferred_vector_width_half_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, sizeof(cl_uint), static_cast<void *>(&preferred_vector_width_half_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            preferred_vector_width_half_valid_ = true;
+          }
+          return preferred_vector_width_half_;
+        }
+
+        /** @brief OpenCL profile string. Returns the profile name supported by the device.
+          *
+          * The profile name returned can be one of the following strings:
+          *   FULL_PROFILE - if the device supports the OpenCL specification
+          *   EMBEDDED_PROFILE - if the device supports the OpenCL embedded profile.
+          */
+        std::string profile() const
+        {
+          if (!profile_valid_)
+          {
+            // 32 bytes suffice for both "FULL_PROFILE" and "EMBEDDED_PROFILE".
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PROFILE, sizeof(char) * 32, static_cast<void *>(profile_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            profile_valid_ = true;
+          }
+          return profile_;
+        }
+
+        /** @brief Describes the resolution of device timer. This is measured in nanoseconds. */
+        size_t profiling_timer_resolution() const
+        {
+          if (!profiling_timer_resolution_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t), static_cast<void *>(&profiling_timer_resolution_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            profiling_timer_resolution_valid_ = true;
+          }
+          return profiling_timer_resolution_;
+        }
+
+        /** @brief Describes the command-queue properties supported by the device.
+          *
+          * This is a bit-field that describes one or more of the following values:
+          *   CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+          *   CL_QUEUE_PROFILING_ENABLE
+          * These properties are described in the table for clCreateCommandQueue in the OpenCL standard.
+          * The mandated minimum capability is CL_QUEUE_PROFILING_ENABLE.
+          */
+        cl_command_queue_properties queue_properties() const
+        {
+          if (!queue_properties_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), static_cast<void *>(&queue_properties_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            queue_properties_valid_ = true;
+          }
+          return queue_properties_;
+        }
+
+        /** @brief Describes single precision floating-point capability of the OpenCL device.
+          *
+          * This is a bit-field that describes one or more of the following values:
+          *   CL_FP_DENORM - denorms are supported.
+          *   CL_FP_INF_NAN - INF and NaNs are supported.
+          *   CL_FP_ROUND_TO_NEAREST - round to nearest even rounding mode supported.
+          *   CL_FP_ROUND_TO_ZERO - round to zero rounding mode supported.
+          *   CL_FP_ROUND_TO_INF - round to +ve and -ve infinity rounding modes supported.
+          *   CL_FP_FMA - IEEE754-2008 fused multiply-add is supported.
+          *   CL_FP_SOFT_FLOAT - Basic floating-point operations (such as addition, subtraction, multiplication) are implemented in software.
+          *
+          * The mandated minimum floating-point capability is CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN.
+          */
+        cl_device_fp_config single_fp_config() const
+        {
+          if (!single_fp_config_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), static_cast<void *>(&single_fp_config_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            single_fp_config_valid_ = true;
+          }
+          return single_fp_config_;
+        }
+
+        /** @brief The OpenCL device type.
+          *
+          * Currently supported values are one of or a combination of: CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR, or CL_DEVICE_TYPE_DEFAULT.
+          */
+        cl_device_type type() const
+        {
+          if (!type_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_TYPE, sizeof(cl_device_type), static_cast<void *>(&type_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            type_valid_ = true;
+          }
+          return type_;
+        }
+
+        /** @brief Vendor name string. */
+        std::string vendor() const
+        {
+          if (!vendor_valid_)
+          {
+            // Fixed 256-byte buffer for the cached vendor string.
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_VENDOR, sizeof(char) * 256, static_cast<void *>(vendor_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            vendor_valid_ = true;
+          }
+          return vendor_;
+        }
+
+        /** @brief A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID. */
+        cl_uint vendor_id() const
+        {
+          if (!vendor_id_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), static_cast<void *>(&vendor_id_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            vendor_id_valid_ = true;
+          }
+          return vendor_id_;
+        }
+
+        /** @brief OpenCL version string (CL_DEVICE_VERSION), e.g. "OpenCL 1.1 <vendor-specific information>". */
+        std::string version() const
+        {
+          if (!version_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DEVICE_VERSION, sizeof(char) * 256, static_cast<void *>(version_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            version_valid_ = true;
+          }
+          return version_;
+        }
+
+        /** @brief OpenCL software driver version string (CL_DRIVER_VERSION). */
+        std::string driver_version() const
+        {
+          if (!driver_version_valid_)
+          {
+            cl_int err = clGetDeviceInfo(device_, CL_DRIVER_VERSION, sizeof(char) * 256, static_cast<void *>(driver_version_), NULL);
+            VIENNACL_ERR_CHECK(err);
+            driver_version_valid_ = true;
+          }
+          return driver_version_;
+        }
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+        /** @brief ViennaCL convenience function: Returns true if the device supports double precision */
+        bool double_support() const
+        {
+          const std::string ext = extensions();
+
+          // Either the Khronos-certified extension or AMD's vendor-specific one suffices:
+          const bool has_khr_fp64 = (ext.find("cl_khr_fp64") != std::string::npos);
+          const bool has_amd_fp64 = (ext.find("cl_amd_fp64") != std::string::npos);
+
+          return has_khr_fp64 || has_amd_fp64;
+        }
+
+        /** @brief ViennaCL convenience function: Returns the device extension which enables double precision (usually cl_khr_fp64, but AMD used cl_amd_fp64 in the past) */
+        std::string double_support_extension() const
+        {
+          const std::string ext = extensions();
+
+          // AMD's vendor-specific extension is checked first (older AMD drivers):
+          if (ext.find("cl_amd_fp64") != std::string::npos)
+            return "cl_amd_fp64";
+
+          // Khronos-certified standard extension for double precision; empty string if unsupported:
+          return (ext.find("cl_khr_fp64") != std::string::npos) ? "cl_khr_fp64" : "";
+        }
+
+        /** @brief Returns the raw OpenCL device id wrapped by this object.
+          *
+          * Asserts (debug builds only) that the handle was initialized to a nonzero value.
+          */
+        cl_device_id id() const
+        {
+          assert(device_ != 0 && bool("Device ID invalid!"));
+          return device_;
+        }
+
+        /** @brief Returns an info string with a few properties of the device. Use full_info() to get all details.
+          *
+          * Returns the following device properties:
+          * name, vendor, type, availability, max compute units, max work group size, global mem size, local mem size, local mem type, host unified memory
+          *
+          * @param indent      Number of optional blanks to be added at the start of each line
+          * @param indent_char Character to be used for indenting
+          */
+        std::string info(vcl_size_t indent = 0, char indent_char = ' ') const
+        {
+          std::string line_indent(indent, indent_char);
+          std::ostringstream oss;
+          oss << line_indent << "Name:                " << name() << std::endl;
+          oss << line_indent << "Vendor:              " << vendor() << std::endl;
+          oss << line_indent << "Type:                " << device_type_to_string(type()) << std::endl;
+          oss << line_indent << "Available:           " << available() << std::endl;
+          oss << line_indent << "Max Compute Units:   " << max_compute_units() << std::endl;
+          oss << line_indent << "Max Work Group Size: " << max_work_group_size() << std::endl;
+          oss << line_indent << "Global Mem Size:     " << global_mem_size() << std::endl;
+          oss << line_indent << "Local Mem Size:      " << local_mem_size() << std::endl;
+          // Use the string helper for consistency with full_info(); the previous code
+          // printed the raw numeric cl_device_local_mem_type value here.
+          oss << line_indent << "Local Mem Type:      " << local_mem_type_to_string(local_mem_type()) << std::endl;
+          oss << line_indent << "Host Unified Memory: " << host_unified_memory() << std::endl;
+
+          return oss.str();
+        }
+
+        /** @brief Returns an info string with all device properties defined in the OpenCL 1.1 standard, listed in alphabetical order. Use info() for a short overview.
+        *
+        * @param indent   Number of optional blanks to be added at the start of each line
+        * @param indent_char Character to be used for indenting
+        */
+        std::string full_info(vcl_size_t indent = 0, char indent_char = ' ') const
+        {
+          std::string line_indent(indent, indent_char);
+          std::ostringstream oss;
+          oss << line_indent << "Address Bits:                  " << address_bits() << std::endl;
+          oss << line_indent << "Available:                     " << available() << std::endl;
+          oss << line_indent << "Compiler Available:            " << compiler_available() << std::endl;
+#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+          oss << line_indent << "Double FP Config:              " << fp_config_to_string(double_fp_config()) << std::endl;
+#endif
+          oss << line_indent << "Endian Little:                 " << endian_little() << std::endl;
+          oss << line_indent << "Error Correction Support:      " << error_correction_support() << std::endl;
+          oss << line_indent << "Execution Capabilities:        " << exec_capabilities_to_string(execution_capabilities()) << std::endl;
+          oss << line_indent << "Extensions:                    " << extensions() << std::endl;
+          oss << line_indent << "Global Mem Cache Size:         " << global_mem_cache_size() << " Bytes" << std::endl;
+          oss << line_indent << "Global Mem Cache Type:         " << mem_cache_type_to_string(global_mem_cache_type()) << std::endl;
+          oss << line_indent << "Global Mem Cacheline Size:     " << global_mem_cacheline_size() << " Bytes" << std::endl;
+          oss << line_indent << "Global Mem Size:               " << global_mem_size() << " Bytes" << std::endl;
+#ifdef CL_DEVICE_HALF_FP_CONFIG
+          // Label fixed: used to read "Half PF Config" (cf. "Double FP Config" above).
+          oss << line_indent << "Half FP Config:                " << fp_config_to_string(half_fp_config()) << std::endl;
+#endif
+          oss << line_indent << "Host Unified Memory:           " << host_unified_memory() << std::endl;
+          oss << line_indent << "Image Support:                 " << image_support() << std::endl;
+          oss << line_indent << "Image2D Max Height:            " << image2d_max_height() << std::endl;
+          oss << line_indent << "Image2D Max Width:             " << image2d_max_width() << std::endl;
+          oss << line_indent << "Image3D Max Depth:             " << image3d_max_depth() << std::endl;
+          oss << line_indent << "Image3D Max Height:            " << image3d_max_height() << std::endl;
+          oss << line_indent << "Image3D Max Width:             " << image3d_max_width() << std::endl;
+          oss << line_indent << "Local Mem Size:                " << local_mem_size() << " Bytes" << std::endl;
+          oss << line_indent << "Local Mem Type:                " << local_mem_type_to_string(local_mem_type()) << std::endl;
+          oss << line_indent << "Max Clock Frequency:           " << max_clock_frequency() << " MHz" << std::endl;
+          oss << line_indent << "Max Compute Units:             " << max_compute_units() << std::endl;
+          oss << line_indent << "Max Constant Args:             " << max_constant_args() << std::endl;
+          oss << line_indent << "Max Constant Buffer Size:      " << max_constant_buffer_size() << " Bytes" << std::endl;
+          oss << line_indent << "Max Mem Alloc Size:            " << max_mem_alloc_size() << " Bytes" << std::endl;
+          oss << line_indent << "Max Parameter Size:            " << max_parameter_size() << " Bytes" << std::endl;
+          oss << line_indent << "Max Read Image Args:           " << max_read_image_args() << std::endl;
+          oss << line_indent << "Max Samplers:                  " << max_samplers() << std::endl;
+          oss << line_indent << "Max Work Group Size:           " << max_work_group_size() << std::endl;
+          oss << line_indent << "Max Work Item Dimensions:      " << max_work_item_dimensions() << std::endl;
+          oss << line_indent << "Max Work Item Sizes:           " << convert_to_string(max_work_item_sizes()) << std::endl;
+          oss << line_indent << "Max Write Image Args:          " << max_write_image_args() << std::endl;
+          oss << line_indent << "Mem Base Addr Align:           " << mem_base_addr_align() << std::endl;
+          oss << line_indent << "Min Data Type Align Size:      " << min_data_type_align_size() << " Bytes" << std::endl;
+          oss << line_indent << "Name:                          " << name() << std::endl;
+          oss << line_indent << "Native Vector Width char:      " << native_vector_width_char() << std::endl;
+          oss << line_indent << "Native Vector Width short:     " << native_vector_width_short() << std::endl;
+          oss << line_indent << "Native Vector Width int:       " << native_vector_width_int() << std::endl;
+          oss << line_indent << "Native Vector Width long:      " << native_vector_width_long() << std::endl;
+          oss << line_indent << "Native Vector Width float:     " << native_vector_width_float() << std::endl;
+          oss << line_indent << "Native Vector Width double:    " << native_vector_width_double() << std::endl;
+          oss << line_indent << "Native Vector Width half:      " << native_vector_width_half() << std::endl;
+          oss << line_indent << "OpenCL C Version:              " << opencl_c_version() << std::endl;
+          oss << line_indent << "Platform:                      " << platform() << std::endl;
+          oss << line_indent << "Preferred Vector Width char:   " << preferred_vector_width_char() << std::endl;
+          oss << line_indent << "Preferred Vector Width short:  " << preferred_vector_width_short() << std::endl;
+          oss << line_indent << "Preferred Vector Width int:    " << preferred_vector_width_int() << std::endl;
+          oss << line_indent << "Preferred Vector Width long:   " << preferred_vector_width_long() << std::endl;
+          oss << line_indent << "Preferred Vector Width float:  " << preferred_vector_width_float() << std::endl;
+          oss << line_indent << "Preferred Vector Width double: " << preferred_vector_width_double() << std::endl;
+          oss << line_indent << "Preferred Vector Width half:   " << preferred_vector_width_half() << std::endl;
+          oss << line_indent << "Profile:                       " << profile() << std::endl;
+          oss << line_indent << "Profiling Timer Resolution:    " << profiling_timer_resolution() << " ns" << std::endl;
+          oss << line_indent << "Queue Properties:              " << queue_properties_to_string(queue_properties()) << std::endl;
+          oss << line_indent << "Single FP Config:              " << fp_config_to_string(single_fp_config()) << std::endl;
+          oss << line_indent << "Type:                          " << device_type_to_string(type()) << std::endl;
+          oss << line_indent << "Vendor:                        " << vendor() << std::endl;
+          oss << line_indent << "Vendor ID:                     " << vendor_id() << std::endl;
+          oss << line_indent << "Version:                       " << version() << std::endl;
+          oss << line_indent << "Driver Version:                " << driver_version() << std::endl;
+
+          return oss.str();
+        }
+
+        /** @brief Two device proxies compare equal iff they wrap the same OpenCL device id */
+        bool operator==(device const & other) const
+        {
+          return other.device_ == device_;
+        }
+
+        /** @brief Compares the wrapped device id against a raw OpenCL device id */
+        bool operator==(cl_device_id other) const
+        {
+          return other == device_;
+        }
+
+      private:
+
+        /** @brief Helper function converting a floating point configuration bitfield to a whitespace-separated string of flag names */
+        std::string fp_config_to_string(cl_device_fp_config conf) const
+        {
+          // Table of all flags defined by the OpenCL 1.1 standard, tested in order:
+          struct { cl_device_fp_config flag; const char * label; } const entries[] = {
+            { CL_FP_DENORM,           "CL_FP_DENORM "           },
+            { CL_FP_INF_NAN,          "CL_FP_INF_NAN "          },
+            { CL_FP_ROUND_TO_NEAREST, "CL_FP_ROUND_TO_NEAREST " },
+            { CL_FP_ROUND_TO_ZERO,    "CL_FP_ROUND_TO_ZERO "    },
+            { CL_FP_ROUND_TO_INF,     "CL_FP_ROUND_TO_INF "     },
+            { CL_FP_FMA,              "CL_FP_FMA "              },
+            { CL_FP_SOFT_FLOAT,       "CL_FP_SOFT_FLOAT "       }
+          };
+
+          std::ostringstream oss;
+          for (vcl_size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); ++i)
+            if (conf & entries[i].flag)
+              oss << entries[i].label;
+
+          return oss.str();
+        }
+
+        /** @brief Helper function converting an execution-capabilities bitfield to a whitespace-separated string of flag names */
+        std::string exec_capabilities_to_string(cl_device_exec_capabilities cap) const
+        {
+          std::ostringstream oss;
+          if (cap & CL_EXEC_KERNEL)        oss << "CL_EXEC_KERNEL ";
+          if (cap & CL_EXEC_NATIVE_KERNEL) oss << "CL_EXEC_NATIVE_KERNEL ";
+          return oss.str();
+        }
+
+        /** @brief Helper function converting a global memory cache type to its name (empty string for unrecognized values) */
+        std::string mem_cache_type_to_string(cl_device_mem_cache_type cachetype) const
+        {
+          std::ostringstream oss;
+          switch (cachetype)
+          {
+            case CL_NONE:             oss << "CL_NONE ";             break;
+            case CL_READ_ONLY_CACHE:  oss << "CL_READ_ONLY_CACHE ";  break;
+            case CL_READ_WRITE_CACHE: oss << "CL_READ_WRITE_CACHE "; break;
+            default:                  break;
+          }
+          return oss.str();
+        }
+
+        /** @brief Helper function converting a local memory type to its name.
+          *
+          * Note: cl_device_local_mem_type is an enumerated value (CL_LOCAL or CL_GLOBAL),
+          * not a bitfield, so equality comparison is the correct test (the previous code
+          * used bitwise '&', which only worked because the two enumerators happen to
+          * occupy disjoint bits).
+          */
+        std::string local_mem_type_to_string(cl_device_local_mem_type loc_mem_type) const
+        {
+          std::ostringstream oss;
+          if (loc_mem_type == CL_LOCAL)
+            oss << "CL_LOCAL ";
+          if (loc_mem_type == CL_GLOBAL)
+            oss << "CL_GLOBAL ";
+
+          return oss.str();
+        }
+
+        /** @brief Converts a vector of sizes (e.g. max work item sizes) to a whitespace-separated string */
+        std::string convert_to_string(std::vector<size_t> const & vec) const
+        {
+          std::ostringstream result;
+          for (std::vector<size_t>::const_iterator it = vec.begin(); it != vec.end(); ++it)
+            result << *it << " ";
+
+          return result.str();
+        }
+
+        /** @brief Helper function converting a command-queue-properties bitfield to a whitespace-separated string of flag names */
+        std::string queue_properties_to_string(cl_command_queue_properties queue_prop) const
+        {
+          std::ostringstream oss;
+          if (queue_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) oss << "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ";
+          if (queue_prop & CL_QUEUE_PROFILING_ENABLE)              oss << "CL_QUEUE_PROFILING_ENABLE ";
+          return oss.str();
+        }
+
+        /** @brief Helper function converting a device-type bitfield to a human-readable string */
+        std::string device_type_to_string(cl_device_type dev_type) const
+        {
+          // CL_DEVICE_TYPE is a bitfield; several bits may be set simultaneously.
+          std::ostringstream oss;
+          if (dev_type & CL_DEVICE_TYPE_GPU)         oss << "GPU ";
+          if (dev_type & CL_DEVICE_TYPE_CPU)         oss << "CPU ";
+          if (dev_type & CL_DEVICE_TYPE_ACCELERATOR) oss << "Accelerator ";
+          if (dev_type & CL_DEVICE_TYPE_DEFAULT)     oss << "(default)";
+          return oss.str();
+        }
+
+        /** @brief Invalidates every cached device property so that the next call to each getter re-queries the OpenCL runtime. */
+        void flush_cache()
+        {
+          address_bits_valid_       = false;
+          architecture_family_valid_ = false;
+          available_valid_          = false;
+          compiler_available_valid_ = false;
+#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+          double_fp_config_valid_   = false;
+#endif
+          endian_little_valid_      = false;
+          error_correction_support_valid_  = false;
+          execution_capabilities_valid_    = false;
+          extensions_valid_                = false;
+          global_mem_cache_size_valid_     = false;
+          global_mem_cache_type_valid_     = false;
+          global_mem_cacheline_size_valid_ = false;
+          global_mem_size_valid_           = false;
+#ifdef CL_DEVICE_HALF_FP_CONFIG
+          half_fp_config_valid_      = false;
+#endif
+          host_unified_memory_valid_ = false;
+          image_support_valid_       = false;
+          image2d_max_height_valid_  = false;
+          image2d_max_width_valid_   = false;
+          image3d_max_depth_valid_   = false;
+          image3d_max_height_valid_  = false;
+          image3d_max_width_valid_   = false;
+          local_mem_size_valid_      = false;
+          local_mem_type_valid_      = false;
+          max_clock_frequency_valid_ = false;
+          max_compute_units_valid_   = false;
+          max_constant_args_valid_   = false;
+          max_constant_buffer_size_valid_ = false;
+          max_mem_alloc_size_valid_  = false;
+          max_parameter_size_valid_  = false;
+          max_read_image_args_valid_ = false;
+          max_samplers_valid_        = false;
+          max_work_group_size_valid_ = false;
+          max_work_item_dimensions_valid_ = false;
+          max_work_item_sizes_valid_  = false;
+          max_write_image_args_valid_ = false;
+          mem_base_addr_align_valid_  = false;
+          min_data_type_align_size_valid_ = false;
+          name_valid_ = false;
+          native_vector_width_char_valid_   = false;
+          native_vector_width_short_valid_  = false;
+          native_vector_width_int_valid_    = false;
+          native_vector_width_long_valid_   = false;
+          native_vector_width_float_valid_  = false;
+          native_vector_width_double_valid_ = false;
+          native_vector_width_half_valid_   = false;
+          opencl_c_version_valid_ = false;
+          platform_valid_ = false;
+          preferred_vector_width_char_valid_   = false;
+          preferred_vector_width_short_valid_  = false;
+          preferred_vector_width_int_valid_    = false;
+          preferred_vector_width_long_valid_   = false;
+          preferred_vector_width_float_valid_  = false;
+          preferred_vector_width_double_valid_ = false;
+          preferred_vector_width_half_valid_   = false;
+          profile_valid_ = false;
+          profiling_timer_resolution_valid_ = false;
+          queue_properties_valid_ = false;
+          single_fp_config_valid_ = false;
+          type_valid_             = false;
+          vendor_valid_           = false;
+          vendor_id_valid_        = false;
+          version_valid_          = false;
+          driver_version_valid_   = false;
+        }
+
+        cl_device_id    device_;
+
+        //
+        // Device information supported by OpenCL 1.0 to follow
+        // cf. http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clGetDeviceInfo.html
+        // Note that all members are declared 'mutable', as they represent a caching mechanism in order to circumvent repeated potentially expensive calls to the OpenCL SDK
+        //
+
+        mutable bool    address_bits_valid_;
+        mutable cl_uint address_bits_;
+
+        mutable bool    available_valid_;
+        mutable cl_bool available_;
+
+        mutable bool    compiler_available_valid_;
+        mutable cl_bool compiler_available_;
+
+#ifdef CL_DEVICE_DOUBLE_FP_CONFIG
+        mutable bool                double_fp_config_valid_;
+        mutable cl_device_fp_config double_fp_config_;
+#endif
+
+        mutable bool    endian_little_valid_;
+        mutable cl_bool endian_little_;
+
+        mutable bool    error_correction_support_valid_;
+        mutable cl_bool error_correction_support_;
+
+        mutable bool                        execution_capabilities_valid_;
+        mutable cl_device_exec_capabilities execution_capabilities_;
+
+        mutable bool extensions_valid_;
+        mutable char extensions_[2048];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool     global_mem_cache_size_valid_;
+        mutable cl_ulong global_mem_cache_size_;
+
+        mutable bool                     global_mem_cache_type_valid_;
+        mutable cl_device_mem_cache_type global_mem_cache_type_;
+
+        mutable bool    global_mem_cacheline_size_valid_;
+        mutable cl_uint global_mem_cacheline_size_;
+
+        mutable bool     global_mem_size_valid_;
+        mutable cl_ulong global_mem_size_;
+
+#ifdef CL_DEVICE_HALF_FP_CONFIG
+        mutable bool                half_fp_config_valid_;
+        mutable cl_device_fp_config half_fp_config_;
+#endif
+
+        mutable bool    host_unified_memory_valid_;
+        mutable cl_bool host_unified_memory_;
+
+        mutable bool    image_support_valid_;
+        mutable cl_bool image_support_;
+
+        mutable bool   image2d_max_height_valid_;
+        mutable size_t image2d_max_height_;
+
+        mutable bool   image2d_max_width_valid_;
+        mutable size_t image2d_max_width_;
+
+        mutable bool   image3d_max_depth_valid_;
+        mutable size_t image3d_max_depth_;
+
+        mutable bool   image3d_max_height_valid_;
+        mutable size_t image3d_max_height_;
+
+        mutable bool   image3d_max_width_valid_;
+        mutable size_t image3d_max_width_;
+
+        mutable bool     local_mem_size_valid_;
+        mutable cl_ulong local_mem_size_;
+
+        mutable bool                     local_mem_type_valid_;
+        mutable cl_device_local_mem_type local_mem_type_;
+
+        mutable bool    max_clock_frequency_valid_;
+        mutable cl_uint max_clock_frequency_;
+
+        mutable bool    max_compute_units_valid_;
+        mutable cl_uint max_compute_units_;
+
+        mutable bool    max_constant_args_valid_;
+        mutable cl_uint max_constant_args_;
+
+        mutable bool     max_constant_buffer_size_valid_;
+        mutable cl_ulong max_constant_buffer_size_;
+
+        mutable bool     max_mem_alloc_size_valid_;
+        mutable cl_ulong max_mem_alloc_size_;
+
+        mutable bool   max_parameter_size_valid_;
+        mutable size_t max_parameter_size_;
+
+        mutable bool    max_read_image_args_valid_;
+        mutable cl_uint max_read_image_args_;
+
+        mutable bool    max_samplers_valid_;
+        mutable cl_uint max_samplers_;
+
+        mutable bool   max_work_group_size_valid_;
+        mutable size_t max_work_group_size_;
+
+        mutable bool    max_work_item_dimensions_valid_;
+        mutable cl_uint max_work_item_dimensions_;
+
+        mutable bool   max_work_item_sizes_valid_;
+        mutable size_t max_work_item_sizes_[16];   //we do not support execution models with more than 16 dimensions. This should totally suffice in practice, though.
+
+        mutable bool    max_write_image_args_valid_;
+        mutable cl_uint max_write_image_args_;
+
+        mutable bool    mem_base_addr_align_valid_;
+        mutable cl_uint mem_base_addr_align_;
+
+        mutable bool    min_data_type_align_size_valid_;
+        mutable cl_uint min_data_type_align_size_;
+
+        mutable bool name_valid_;
+        mutable char name_[256];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool    native_vector_width_char_valid_;
+        mutable cl_uint native_vector_width_char_;
+
+        mutable bool    native_vector_width_short_valid_;
+        mutable cl_uint native_vector_width_short_;
+
+        mutable bool    native_vector_width_int_valid_;
+        mutable cl_uint native_vector_width_int_;
+
+        mutable bool    native_vector_width_long_valid_;
+        mutable cl_uint native_vector_width_long_;
+
+        mutable bool    native_vector_width_float_valid_;
+        mutable cl_uint native_vector_width_float_;
+
+        mutable bool    native_vector_width_double_valid_;
+        mutable cl_uint native_vector_width_double_;
+
+        mutable bool    native_vector_width_half_valid_;
+        mutable cl_uint native_vector_width_half_;
+
+        mutable bool opencl_c_version_valid_;
+        mutable char opencl_c_version_[128];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool           platform_valid_;
+        mutable cl_platform_id platform_;
+
+        mutable bool    preferred_vector_width_char_valid_;
+        mutable cl_uint preferred_vector_width_char_;
+
+        mutable bool    preferred_vector_width_short_valid_;
+        mutable cl_uint preferred_vector_width_short_;
+
+        mutable bool    preferred_vector_width_int_valid_;
+        mutable cl_uint preferred_vector_width_int_;
+
+        mutable bool    preferred_vector_width_long_valid_;
+        mutable cl_uint preferred_vector_width_long_;
+
+        mutable bool    preferred_vector_width_float_valid_;
+        mutable cl_uint preferred_vector_width_float_;
+
+        mutable bool    preferred_vector_width_double_valid_;
+        mutable cl_uint preferred_vector_width_double_;
+
+        mutable bool    preferred_vector_width_half_valid_;
+        mutable cl_uint preferred_vector_width_half_;
+
+        mutable bool profile_valid_;
+        mutable char profile_[32];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool   profiling_timer_resolution_valid_;
+        mutable size_t profiling_timer_resolution_;
+
+        mutable bool                        queue_properties_valid_;
+        mutable cl_command_queue_properties queue_properties_;
+
+        mutable bool                single_fp_config_valid_;
+        mutable cl_device_fp_config single_fp_config_;
+
+        mutable bool           type_valid_;
+        mutable cl_device_type type_;
+
+        mutable bool vendor_valid_;
+        mutable char vendor_[256];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool    vendor_id_valid_;
+        mutable cl_uint vendor_id_;
+
+        mutable bool version_valid_;
+        mutable char version_[256];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool driver_version_valid_;
+        mutable char driver_version_[256];    // don't forget to adjust member function accordingly when changing array size
+
+        mutable bool architecture_family_valid_;
+        mutable device_architecture_family architecture_family_;
+    };
+
+  } //namespace ocl
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/ocl/device_utils.hpp b/viennacl/ocl/device_utils.hpp
new file mode 100644
index 0000000..88af6e7
--- /dev/null
+++ b/viennacl/ocl/device_utils.hpp
@@ -0,0 +1,155 @@
+#ifndef VIENNACL_OCL_DEVICE_UTILS_HPP_
+#define VIENNACL_OCL_DEVICE_UTILS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/device_utils.hpp
+    @brief Various utility implementations for dispatching with respect to the different devices available on the market.
+*/
+
+#define VIENNACL_OCL_MAX_DEVICE_NUM  8
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+
+#include <stddef.h>
+#include <map>
+#include <string>
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+  namespace ocl
+  {
+
+    // PCI vendor IDs as reported by CL_DEVICE_VENDOR_ID (cf. device::vendor_id()):
+    static const cl_uint intel_id = 32902;   // 0x8086 (Intel)
+    static const cl_uint nvidia_id = 4318;   // 0x10DE (NVIDIA)
+    static const cl_uint amd_id = 4098;      // 0x1002 (AMD)
+    static const cl_uint unknown_id = 0;     // placeholder for unrecognized vendors
+
+    //Architecture Family
+    /** @brief Enumeration of GPU hardware generations recognized by get_device_architecture(). */
+    enum device_architecture_family{
+      //NVidia (see the GeForce digit / Tesla letter mapping in get_device_architecture())
+      Tesla,
+      Fermi,
+      Kepler,
+
+      //AMD (matched by codename, e.g. Cypress, Cayman, Tahiti)
+      Evergreen,
+      NorthernIslands,
+      SouthernIslands,
+
+      UNKNOWN  // any device that could not be classified
+    };
+
+    /** @brief Heuristically determines the architecture family of an OpenCL device.
+      *
+      * @param vendor_id  PCI vendor ID as reported by CL_DEVICE_VENDOR_ID
+      * @param name       Device name string as reported by the OpenCL runtime
+      * @return The detected device_architecture_family, or UNKNOWN if the device cannot be classified
+      */
+    static device_architecture_family get_device_architecture(cl_uint vendor_id, std::string const & name){
+
+      /*-NVidia-*/
+      if(vendor_id==nvidia_id){
+        //GeForce: classify by the leading digit of the model number (e.g. GTX 480 -> '4' -> Fermi)
+        vcl_size_t found=0;
+        if((found= name.find("GeForce",0)) != std::string::npos){
+          if((found = name.find_first_of("123456789", found)) != std::string::npos){
+            switch (name[found]) {
+              case '2' : return Tesla;
+              case '3' : return Tesla;
+
+              case '4' : return Fermi;
+              case '5' : return Fermi;
+
+              case '6' : return Kepler;
+              case '7' : return Kepler;
+
+              default: return UNKNOWN;
+            }
+          }
+          else
+            return UNKNOWN;
+        }
+
+        //Tesla: classify by the model-line letter (e.g. Tesla C2050 / M2090 / K20).
+        //Bugfix: this must be find_first_of (any of 'C', 'M', 'K'); the previous code
+        //searched for the literal substring "CMK", which never occurs in a device name,
+        //so every Tesla-branded board was reported as UNKNOWN.
+        else if((found = name.find("Tesla",0)) != std::string::npos){
+          if((found = name.find_first_of("CMK", found)) != std::string::npos){
+            switch(name[found]){
+              case 'C' : return Fermi;
+              case 'M' : return Fermi;
+
+              case 'K' : return Kepler;
+
+              default : return UNKNOWN;
+            }
+          }
+          else
+            return UNKNOWN;
+        }
+
+        else
+          return UNKNOWN;
+      }
+
+      /*-AMD-*/
+      else if(vendor_id==amd_id){
+
+#define VIENNACL_DEVICE_MAP(device,arch)if(name.find(device,0)!=std::string::npos) return arch;
+
+        //Evergreen
+        VIENNACL_DEVICE_MAP("Cedar",Evergreen);
+        VIENNACL_DEVICE_MAP("Redwood",Evergreen);
+        VIENNACL_DEVICE_MAP("Juniper",Evergreen);
+        VIENNACL_DEVICE_MAP("Cypress",Evergreen);
+        VIENNACL_DEVICE_MAP("Hemlock",Evergreen);
+
+        //NorthernIslands
+        VIENNACL_DEVICE_MAP("Caicos",NorthernIslands);
+        VIENNACL_DEVICE_MAP("Turks",NorthernIslands);
+        VIENNACL_DEVICE_MAP("Barts",NorthernIslands);
+        VIENNACL_DEVICE_MAP("Cayman",NorthernIslands);
+        VIENNACL_DEVICE_MAP("Antilles",NorthernIslands);
+
+        //SouthernIslands
+        VIENNACL_DEVICE_MAP("Cape",SouthernIslands);
+        VIENNACL_DEVICE_MAP("Bonaire",SouthernIslands);
+        VIENNACL_DEVICE_MAP("Pitcairn",SouthernIslands);  //Bugfix: was misspelled "Pitcaim", so Pitcairn GPUs were never matched
+        VIENNACL_DEVICE_MAP("Tahiti",SouthernIslands);
+        VIENNACL_DEVICE_MAP("Malta",SouthernIslands);
+
+#undef VIENNACL_DEVICE_MAP
+
+        return UNKNOWN;
+
+      }
+
+      /*-Other-*/
+      else{
+        return UNKNOWN;
+      }
+
+    }
+
+
+  }
+} //namespace viennacl
+
+#endif
+
+/*@}*/
diff --git a/viennacl/ocl/enqueue.hpp b/viennacl/ocl/enqueue.hpp
index 1b001b2..f2af576 100644
--- a/viennacl/ocl/enqueue.hpp
+++ b/viennacl/ocl/enqueue.hpp
@@ -1,141 +1,129 @@
-#ifndef VIENNACL_OCL_ENQUEUE_HPP_
-#define VIENNACL_OCL_ENQUEUE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file enqueue.hpp
-    @brief Enqueues kernels into command queues
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/ocl/command_queue.hpp"
-
-namespace viennacl
-{
-  namespace ocl
-  {
-
-    /** @brief Enqueues a kernel in the provided queue */
-    template <typename KernelType>
-    void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue)
-    {
-      // 1D kernel:
-      if (k.local_work_size(1) == 0)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl;
-        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size() << "'..." << std::endl;
-        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size() << "'..." << std::endl;
-        #endif
-      
-        size_t tmp_global = k.global_work_size();
-        size_t tmp_local = k.local_work_size();
-        
-        cl_int err;
-        if (tmp_global == 1 && tmp_local == 1)
-          err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL);
-        else
-          err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
-
-        if (err != CL_SUCCESS)  //if not successful, try to start with smaller work size
-        {
-          //std::cout << "FAIL: " << std::endl; exit(0);
-          while (err != CL_SUCCESS && tmp_local > 1)
-          {
-            //std::cout << "Flushing queue, then enqueuing again with half the size..." << std::endl;
-            //std::cout << "Error code: " << err << std::endl;
-            
-            tmp_global /= 2;
-            tmp_local /= 2;
-
-            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-            std::cout << "ViennaCL: Kernel start failed for '" << k.name() << "'." << std::endl;
-            std::cout << "ViennaCL: Global work size: '"  << tmp_global << "'..." << std::endl;
-            std::cout << "ViennaCL: Local work size: '"   << tmp_local << "'..." << std::endl;
-            #endif
-            
-            queue.finish();
-            err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
-          }
-          
-          if (err != CL_SUCCESS)
-          {
-            //could not start kernel with any parameters
-            std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
-            std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl;
-            VIENNACL_ERR_CHECK(err);
-          }
-          else
-          {
-            //remember parameters:
-            k.local_work_size(0, tmp_local);
-            k.global_work_size(0, tmp_global);
-            #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-            std::cout << "ViennaCL: Kernel '" << k.name() << "' now uses global work size " << tmp_global << " and local work size " << tmp_local << "."  << std::endl;
-            #endif
-          }          
-        }
-      }
-      else //2D kernel
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Starting 2D-kernel '" << k.name() << "'..." << std::endl;
-        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size(0) << ", " << k.global_work_size(1) << "'..." << std::endl;
-        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size(0) << ", " << k.local_work_size(1) << "'..." << std::endl;
-        #endif
-
-        size_t tmp_global[2]; 
-        tmp_global[0] = k.global_work_size(0);
-        tmp_global[1] = k.global_work_size(1);
-        
-        size_t tmp_local[2];
-        tmp_local[0] = k.local_work_size(0);
-        tmp_local[1] = k.local_work_size(1);
-        
-        cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 2, NULL, tmp_global, tmp_local, 0, NULL, NULL);
-
-        if (err != CL_SUCCESS)
-        {
-          //could not start kernel with any parameters
-          std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
-          VIENNACL_ERR_CHECK(err);
-        }
-        
-      }
-            
-      #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-      queue.finish();
-      std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl;
-      #endif
-    } //enqueue()
-    
-    
-    /** @brief Convenience function that enqueues the provided kernel into the first queue of the currently active device in the currently active context */
-    template <typename KernelType>
-    void enqueue(KernelType & k)
-    {
-      enqueue(k, viennacl::ocl::current_context().get_queue());
-    }
-  } // namespace ocl
-} // namespace viennacl
-#endif
+#ifndef VIENNACL_OCL_ENQUEUE_HPP_
+#define VIENNACL_OCL_ENQUEUE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/enqueue.hpp
+    @brief Enqueues kernels into command queues
+*/
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/command_queue.hpp"
+#include "viennacl/ocl/context.hpp"
+
+namespace viennacl
+{
+  namespace generator{
+      class custom_operation;
+      void enqueue_custom_op(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue);
+  }
+
+  namespace ocl
+  {
+
+    /** @brief Enqueues a kernel in the provided queue */
+    template <typename KernelType>
+    void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue)
+    {
+      // 1D kernel:
+      if (k.local_work_size(1) == 0)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Starting 1D-kernel '" << k.name() << "'..." << std::endl;
+        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size() << "'..." << std::endl;
+        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size() << "'..." << std::endl;
+        #endif
+
+        vcl_size_t tmp_global = k.global_work_size();
+        vcl_size_t tmp_local = k.local_work_size();
+
+        cl_int err;
+        if (tmp_global == 1 && tmp_local == 1)
+          err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL);
+        else
+          err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
+
+        if (err != CL_SUCCESS)
+        {
+          std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
+          std::cerr << "ViennaCL: Smaller work sizes could not solve the problem. " << std::endl;
+          VIENNACL_ERR_CHECK(err);
+        }
+      }
+      else //2D or 3D kernel
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Starting 2D/3D-kernel '" << k.name() << "'..." << std::endl;
+        std::cout << "ViennaCL: Global work size: '"  << k.global_work_size(0) << ", " << k.global_work_size(1) << ", " << k.global_work_size(2) << "'..." << std::endl;
+        std::cout << "ViennaCL: Local work size: '"   << k.local_work_size(0) << ", " << k.local_work_size(1) << ", " << k.local_work_size(2) << "'..." << std::endl;
+        #endif
+
+        vcl_size_t tmp_global[3];
+        tmp_global[0] = k.global_work_size(0);
+        tmp_global[1] = k.global_work_size(1);
+        tmp_global[2] = k.global_work_size(2);
+
+        vcl_size_t tmp_local[3];
+        tmp_local[0] = k.local_work_size(0);
+        tmp_local[1] = k.local_work_size(1);
+        tmp_local[2] = k.local_work_size(2);
+
+        cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), (tmp_global[2] == 0) ? 2 : 3, NULL, tmp_global, tmp_local, 0, NULL, NULL);
+
+        if (err != CL_SUCCESS)
+        {
+          //could not start kernel with any parameters
+          std::cerr << "ViennaCL: FATAL ERROR: Kernel start failed for '" << k.name() << "'." << std::endl;
+          VIENNACL_ERR_CHECK(err);
+        }
+      }
+
+      #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+      queue.finish();
+      std::cout << "ViennaCL: Kernel " << k.name() << " finished!" << std::endl;
+      #endif
+    } //enqueue()
+
+
+    /** @brief Convenience function that enqueues the provided kernel into the first queue of the currently active device in the currently active context */
+    template <typename KernelType>
+    void enqueue(KernelType & k)
+    {
+      enqueue(k, k.context().get_queue());
+    }
+
+    inline void enqueue(viennacl::generator::custom_operation & op, viennacl::ocl::command_queue const & queue)
+    {
+      generator::enqueue_custom_op(op,queue);
+    }
+
+    inline void enqueue(viennacl::generator::custom_operation & op)
+    {
+      enqueue(op, viennacl::ocl::current_context().get_queue());
+    }
+
+  } // namespace ocl
+} // namespace viennacl
+#endif
diff --git a/viennacl/ocl/error.hpp b/viennacl/ocl/error.hpp
index 988e083..6dcf131 100644
--- a/viennacl/ocl/error.hpp
+++ b/viennacl/ocl/error.hpp
@@ -1,599 +1,661 @@
-#ifndef VIENNACL_OCL_ERROR_HPP_
-#define VIENNACL_OCL_ERROR_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file error.hpp
-    @brief Error handling for the OpenCL layer of ViennaCL
-*/
-
-//error levels:
-//#define VIENNACL_DEBUG_ALL           //print all of the following
-//#define VIENNACL_DEBUG_KERNEL        //debug any modifications on viennacl::ocl::kernel objects
-//#define VIENNACL_DEBUG_COPY          //print infos related to setting up/modifying memory objects
-//#define VIENNACL_DEBUG_OPENCL        //display debug info for the OpenCL layer (platform/context/queue creation,
-//#define VIENNACL_DEBUG_DEVICE        //Show device info upon allocation
-//#define VIENNACL_DEBUG_CONTEXT       //Debug queries to context
-//#define VIENNACL_DEBUG_BUILD         //Show debug info from OpenCL compiler
-
-
-//backwards compatibility:
-#ifdef VIENNACL_BUILD_INFO
-  #define VIENNACL_DEBUG_ALL
-#endif
-
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include <string>
-#include <iostream>
-#include <exception>
-
-#define VIENNACL_BUG_REPORT_STRING  \
-               "\nIf you think that this is a bug in ViennaCL, please report it at viennacl-support at lists.sourceforge.net and supply at least the following information:\n"\
-               " * Operating System\n"\
-               " * Which OpenCL implementation (AMD, NVIDIA, etc.)\n"\
-               " * ViennaCL version\n"\
-               "Many thanks in advance!";\
-
-namespace viennacl
-{
-  namespace ocl
-  {
-    //Wrapper for OpenCL exceptions:
-    class device_not_found : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_DEVICE_NOT_FOUND \n ViennaCL could not find a suitable device. Please check whether an OpenCL implementation is properly installed and a suitable device available."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class device_not_available : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_DEVICE_NOT_AVAILABLE \n ViennaCL could not use the compute device because it is not available."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class compiler_not_available : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_COMPILER_NOT_AVAILABLE \n Your OpenCL framework does not provide an OpenCL compiler. Unfortunately, ViennaCL cannot be used without such a compiler."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class mem_object_allocation_failure : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_MEM_OBJECT_ALLOCATION_FAILURE \n ViennaCL could not allocate memory on the device. Most likely the device simply ran out of memory."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class out_of_resources : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_OUT_OF_RESOURCES \n ViennaCL tried to launch a compute kernel, but the device does not provide enough resources. Try changing the global and local work item sizes."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class out_of_host_memory : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_OUT_OF_HOST_MEMORY \n The host ran out of memory (usually CPU RAM). Please try again on smaller problems."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class profiling_info_not_available : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_PROFILING_INFO_NOT_AVAILABLE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class mem_copy_overlap : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_MEM_COPY_OVERLAP."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class image_format_mismatch : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_IMAGE_FORMAT_MISMATCH."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class image_format_not_supported : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_IMAGE_FORMAT_NOT_SUPPORTED."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class build_program_failure : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_BUILD_PROGRAM_FAILURE \n The OpenCL compiler encountered an error during the compilation of ViennaCL sources. This is most likely a bug in ViennaCL."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class map_failure : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_MAP_FAILURE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_value : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_VALUE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_device_type : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_DEVICE_TYPE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_platform : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_PLATFORM."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_device : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_DEVICE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_context : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_CONTEXT."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_queue_properties : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_QUEUE_PROPERTIES."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_command_queue : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_COMMAND_QUEUE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_host_ptr : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_HOST_PTR."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_mem_object : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_MEM_OBJECT."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_image_format_descriptor : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_IMAGE_FORMAT_DESCRIPTOR."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_image_size : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_IMAGE_SIZE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_sampler : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_SAMPLER."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_binary : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_BINARY."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_build_options : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_BUILD_OPTIONS."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_program : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_program_executable : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_kernel_name : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL_NAME \n The supplied kernel name is invalid. If you have written your own OpenCL kernel, please check that the correct kernel name is used in the initalization of the kernel object."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class invalid_kernel_definition : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL_DEFINITION."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class invalid_kernel : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL \n The supplied kernel argument is invalid."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_arg_index : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_ARG_INDEX."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_arg_value : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_ARG_VALUE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class invalid_arg_size : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_ARG_SIZE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_kernel_args : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL_ARGS \n The supplied kernel arguments do not fit the kernel parameter list. If you have written your own OpenCL kernel, please check that the correct kernel arguments are set in the appropriate order."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_work_dimension : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_WORK_DIMENSION"
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_work_group_size : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_WORK_GROUP_SIZE \n The supplied work group size is invalid. If you have set this value manually, please reconsider your choice."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class invalid_work_item_size : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_WORK_ITEM_SIZE \n The work item size is invalid. If you have set this value manually, please reconsider your choice."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_global_offset : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_GLOBAL_OFFSET."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_event_wait_list : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_EVENT_WAIT_LIST."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_event : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_EVENT."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_operation : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_OPERATION."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_gl_object : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_GL_OBJECT."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_buffer_size : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_BUFFER_SIZE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_mip_level : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_MIP_LEVEL."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    class invalid_global_work_size : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_GLOBAL_WORK_SIZE."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class invalid_property : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: CL_INVALID_PROPERTY."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    class unknown_error : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: ViennaCL encountered an unknown OpenCL error. In some cases, this might be due to an invalid global work size, but it can also be due to several compilation errors."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-
-    
-    class double_precision_not_provided_error : public std::exception
-    {
-      virtual const char* what() const throw()
-      {
-        return "ViennaCL: FATAL ERROR: You requested to create a ViennaCL type using double precision. However, double precision is not supported by your device."
-               VIENNACL_BUG_REPORT_STRING;
-      }
-    };
-    
-    
-    /** @brief An error reporting class. Template argument is used to avoid problems with external linkage.
-    *
-    *  Do not use this class directly, use the macro CL_ERROR_CHECK instead.
-    *  @tparam T   Useless. Helps to avoid troubles with external linkage of namespace functions.
-    */
-    template <typename T>
-    struct error_checker
-    {
-      
-      /** @brief Trows exceptions that reflect OpenCL error codes */
-      static void raise_exception(cl_int err)
-      {
-        switch (err)
-        {
-          case CL_DEVICE_NOT_FOUND:               throw device_not_found(); break;
-          case CL_DEVICE_NOT_AVAILABLE:           throw device_not_available(); break;
-          case CL_COMPILER_NOT_AVAILABLE:         throw compiler_not_available(); break;
-          case CL_MEM_OBJECT_ALLOCATION_FAILURE:  throw mem_object_allocation_failure(); break;
-          case CL_OUT_OF_RESOURCES:               throw out_of_resources(); break;
-          case CL_OUT_OF_HOST_MEMORY:             throw out_of_host_memory(); break;
-          case CL_PROFILING_INFO_NOT_AVAILABLE:   throw profiling_info_not_available(); break;
-          case CL_MEM_COPY_OVERLAP:               throw mem_copy_overlap(); break;
-          case CL_IMAGE_FORMAT_MISMATCH:          throw image_format_mismatch(); break;
-          case CL_IMAGE_FORMAT_NOT_SUPPORTED:     throw image_format_not_supported(); break;
-          case CL_BUILD_PROGRAM_FAILURE:          throw build_program_failure(); break;
-          case CL_MAP_FAILURE:                    throw map_failure(); break;
-
-          case CL_INVALID_VALUE:                  throw invalid_value(); break;
-          case CL_INVALID_DEVICE_TYPE:            throw invalid_device_type(); break;
-          case CL_INVALID_PLATFORM:               throw invalid_platform(); break;
-          case CL_INVALID_DEVICE:                 throw invalid_device(); break;
-          case CL_INVALID_CONTEXT:                throw invalid_context(); break;
-          case CL_INVALID_QUEUE_PROPERTIES:       throw invalid_queue_properties(); break;
-          case CL_INVALID_COMMAND_QUEUE:          throw invalid_command_queue(); break;
-          case CL_INVALID_HOST_PTR:               throw invalid_host_ptr(); break;
-          case CL_INVALID_MEM_OBJECT:             throw invalid_mem_object(); break;
-          case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor(); break;
-          case CL_INVALID_IMAGE_SIZE:             throw invalid_image_size(); break;
-          case CL_INVALID_SAMPLER:                throw invalid_sampler(); break;
-          case CL_INVALID_BINARY:                 throw invalid_binary(); break;
-          case CL_INVALID_BUILD_OPTIONS:          throw invalid_build_options(); break;
-          case CL_INVALID_PROGRAM:                throw invalid_program(); break;
-          case CL_INVALID_PROGRAM_EXECUTABLE:     throw invalid_program_executable(); break;
-          case CL_INVALID_KERNEL_NAME:            throw invalid_kernel_name(); break;
-          case CL_INVALID_KERNEL_DEFINITION:      throw invalid_kernel_definition(); break;          
-          case CL_INVALID_KERNEL:                 throw invalid_kernel(); break;
-          case CL_INVALID_ARG_INDEX:              throw invalid_arg_index(); break;
-          case CL_INVALID_ARG_VALUE:              throw invalid_arg_value(); break;
-          case CL_INVALID_ARG_SIZE:               throw invalid_arg_size(); break;
-          case CL_INVALID_KERNEL_ARGS:            throw invalid_kernel_args(); break;
-          case CL_INVALID_WORK_DIMENSION:         throw invalid_work_dimension(); break;
-          case CL_INVALID_WORK_GROUP_SIZE:        throw invalid_work_group_size(); break;
-          case CL_INVALID_WORK_ITEM_SIZE:         throw invalid_work_item_size(); break;
-          case CL_INVALID_GLOBAL_OFFSET:          throw invalid_global_offset(); break;
-          case CL_INVALID_EVENT_WAIT_LIST:        throw invalid_event_wait_list(); break;
-          case CL_INVALID_EVENT:                  throw invalid_event(); break;
-          case CL_INVALID_OPERATION:              throw invalid_operation(); break;
-          case CL_INVALID_GL_OBJECT:              throw invalid_gl_object(); break;
-          case CL_INVALID_BUFFER_SIZE:            throw invalid_buffer_size(); break;
-          case CL_INVALID_MIP_LEVEL:              throw invalid_mip_level(); break;
-          case CL_INVALID_GLOBAL_WORK_SIZE:       throw invalid_global_work_size(); break;
-      #ifdef CL_INVALID_PROPERTY
-	  case CL_INVALID_PROPERTY:               throw invalid_property(); break;
-      #endif
-          //  return "CL_INVALID_GLOBAL_WORK_SIZE";
-            
-          default: throw unknown_error();
-        }
-
-      } //getErrorString
-    
-      /** @brief Checks whether an OpenCL error has occured. 
-      * 
-      *  Do not use this function directly, use the macro CL_ERROR_CHECK instead.
-      */
-      static void checkError(cl_int err, const std::string & file, const std::string & func, int line)
-      {
-        if (err != CL_SUCCESS)
-        {
-          #ifdef VIENNACL_DEBUG_ALL
-          std::cerr << "ViennaCL: Error " << err  << " in function " << func << " ( "<< file << ":" << line << " ) " << std::endl;
-          #endif
-          raise_exception(err);
-        }
-      } //checkError()
-      
-    }; //struct 
-    
-    #define VIENNACL_ERR_CHECK(err) viennacl::ocl::error_checker<void>::checkError(err, __FILE__, __FUNCTION__, __LINE__);
-    
-  } //namespace ocl
-} //namespace viennacl
-
-#endif
-
+#ifndef VIENNACL_OCL_ERROR_HPP_
+#define VIENNACL_OCL_ERROR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/error.hpp
+    @brief Error handling for the OpenCL layer of ViennaCL
+*/
+
+//error levels:
+//#define VIENNACL_DEBUG_ALL           //print all of the following
+//#define VIENNACL_DEBUG_KERNEL        //debug any modifications on viennacl::ocl::kernel objects
+//#define VIENNACL_DEBUG_COPY          //print infos related to setting up/modifying memory objects
+//#define VIENNACL_DEBUG_OPENCL        //display debug info for the OpenCL layer (platform/context/queue creation,
+//#define VIENNACL_DEBUG_DEVICE        //Show device info upon allocation
+//#define VIENNACL_DEBUG_CONTEXT       //Debug queries to context
+//#define VIENNACL_DEBUG_BUILD         //Show debug info from OpenCL compiler
+
+
+//backwards compatibility:
+#ifdef VIENNACL_BUILD_INFO
+  #define VIENNACL_DEBUG_ALL
+#endif
+
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <string>
+#include <iostream>
+#include <exception>
+
+#define VIENNACL_BUG_REPORT_STRING  \
+               "\nIf you think that this is a bug in ViennaCL, please report it at viennacl-support at lists.sourceforge.net and supply at least the following information:\n"\
+               " * Operating System\n"\
+               " * Which OpenCL implementation (AMD, NVIDIA, etc.)\n"\
+               " * ViennaCL version\n"\
+               "Many thanks in advance!";\
+
+namespace viennacl
+{
+  namespace ocl
+  {
+    //Wrapper for OpenCL exceptions:
+
+    /** @brief Exception thrown in the case that a requested compute device was not found.
+      *
+      * This exception usually shows up if a user requests a GPU for computation, but the OpenCL SDK does not support the GPU.
+      */
+    class device_not_found : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_DEVICE_NOT_FOUND \n ViennaCL could not find a suitable device. Please check whether an OpenCL implementation is properly installed and a suitable device available."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the selected compute device is not available (maybe locked by another process). */
+    class device_not_available : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_DEVICE_NOT_AVAILABLE \n ViennaCL could not use the compute device because it is not available."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the OpenCL just-in-time compiler is not available. */
+    class compiler_not_available : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_COMPILER_NOT_AVAILABLE \n Your OpenCL framework does not provide an OpenCL compiler. Unfortunately, ViennaCL cannot be used without such a compiler."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if a memory object cannot be allocated. Usually the requested memory buffer is simply too large. */
+    class mem_object_allocation_failure : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_MEM_OBJECT_ALLOCATION_FAILURE \n ViennaCL could not allocate memory on the device. Most likely the device simply ran out of memory."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the compute device is out of resources (either global memory, registers, etc.) for the requested operation. */
+    class out_of_resources : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_OUT_OF_RESOURCES \n ViennaCL tried to launch a compute kernel, but the device does not provide enough resources. Try changing the global and local work item sizes."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the host cannot provide enough memory for the datastructures in the OpenCL backend (temporary arrays, etc.) to perform the requested operation. */
+    class out_of_host_memory : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_OUT_OF_HOST_MEMORY \n The host ran out of memory (usually CPU RAM). Please try again on smaller problems."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the OpenCL context does not have CL_QUEUE_PROFILING_ENABLE set, if the execution is not complete, or the event object is a user event object. */
+    class profiling_info_not_available : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_PROFILING_INFO_NOT_AVAILABLE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the source buffer overlaps the destination buffer when copying from device memory to device memory. */
+    class mem_copy_overlap : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_MEM_COPY_OVERLAP."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if there is a mismatch in image formats for the operands. */
+    class image_format_mismatch : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_IMAGE_FORMAT_MISMATCH."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the requested image format is not supported. */
+    class image_format_not_supported : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_IMAGE_FORMAT_NOT_SUPPORTED."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the OpenCL program cannot be built, usually due to a syntax error in the OpenCL code. */
+    class build_program_failure : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_BUILD_PROGRAM_FAILURE \n The OpenCL compiler encountered an error during the compilation of ViennaCL sources. This is most likely a bug in ViennaCL."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the mapping of device memory to the host memory space failed. */
+    class map_failure : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_MAP_FAILURE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown is an invalid value is provided to an OpenCL function. */
+    class invalid_value : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_VALUE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid device type is specified. */
+    class invalid_device_type : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_DEVICE_TYPE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid OpenCL platform is provided to an OpenCL function. */
+    class invalid_platform : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_PLATFORM."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid OpenCL device is provided to an OpenCL function. */
+    class invalid_device : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_DEVICE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid OpenCL context is provided to an OpenCL function. */
+    class invalid_context : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_CONTEXT."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if invalid OpenCL command queue properties are provided when creating a command queue. */
+    class invalid_queue_properties : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_QUEUE_PROPERTIES."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid OpenCL command queue is provided to an OpenCL function. */
+    class invalid_command_queue : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_COMMAND_QUEUE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided pointer to host memory is invalid. */
+    class invalid_host_ptr : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_HOST_PTR."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid OpenCL memory object (of type cl_mem) is passed to an OpenCL funciton. */
+    class invalid_mem_object : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_MEM_OBJECT."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid image format descriptor is provided. */
+    class invalid_image_format_descriptor : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_IMAGE_FORMAT_DESCRIPTOR."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the image size provided is invalid (e.g. zero). */
+    class invalid_image_size : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_IMAGE_SIZE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid sampler is provided for an image. */
+    class invalid_sampler : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_SAMPLER."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the OpenCL binary (generated from the jit-compiler or loaded from some other location) won't work on the device (e.g. due to a lack of double precision support). */
+    class invalid_binary : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_BINARY."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if invalid build options are passed to the OpenCL just-in-time compiler. */
+    class invalid_build_options : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_BUILD_OPTIONS."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an OpenCL program object handle is invalid (e.g. not initialized). */
+    class invalid_program : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if there is no built program exectuable available for the device. */
+    class invalid_program_executable : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_PROGRAM_EXECUTABLE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided kernel name is invalid (e.g. not part of the program provided). */
+    class invalid_kernel_name : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL_NAME \n The supplied kernel name is invalid. If you have written your own OpenCL kernel, please check that the correct kernel name is used in the initalization of the kernel object."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the kernel definition (number of arguments, argument types, etc.) is not the same for all devices for which the program has been built. */
+    class invalid_kernel_definition : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL_DEFINITION."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided kernel object (of type cl_kernel) is invalid (e.g. not initialized, from different context, or corrupted). */
+    class invalid_kernel : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL \n The supplied kernel argument is invalid."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the kernel argument index is invalid, e.g. an arg index larger than the number of kernel arguments was provided. */
+    class invalid_arg_index : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_ARG_INDEX."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the kernel argument provided has an invalid value. */
+    class invalid_arg_value : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_ARG_VALUE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the arguments to an OpenCL kernel have an invalid size e.g. not sizeof(cl_mem)). */
+    class invalid_arg_size : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_ARG_SIZE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the kernel arguments are invalid and/or do not fit the kernel parameter list. */
+    class invalid_kernel_args : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_KERNEL_ARGS \n The supplied kernel arguments do not fit the kernel parameter list. If you have written your own OpenCL kernel, please check that the correct kernel arguments are set in the appropriate order."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the work dimension is invalid (usually this means that the work dimension was set to be larger than three. */
+    class invalid_work_dimension : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_WORK_DIMENSION"
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the number of work groups is invalid (usually this means that more than 256/512/768/1024 work groups have been specified, but the device(s) cannot support this. */
+    class invalid_work_group_size : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_WORK_GROUP_SIZE \n The supplied work group size is invalid. If you have set this value manually, please reconsider your choice."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the number of work items per work group invalid (usually this means that more than 256/512/768/1024 work items have been specified, but the device(s) cannot support this. */
+    class invalid_work_item_size : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_WORK_ITEM_SIZE \n The work item size is invalid. If you have set this value manually, please reconsider your choice."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided offset for get_global_id() in OpenCL kernels is invalid. */
+    class invalid_global_offset : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_GLOBAL_OFFSET."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided event wait list is invalid. */
+    class invalid_event_wait_list : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_EVENT_WAIT_LIST."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided event object (of type cl_event) is invalid. */
+    class invalid_event : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_EVENT."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if interoperability of OpenCL with other frameworks collide. */
+    class invalid_operation : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_OPERATION."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided OpenGL (not OpenCL) object is invalid. */
+    class invalid_gl_object : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_GL_OBJECT."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided buffer size is invalid (e.g. zero) */
+    class invalid_buffer_size : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_BUFFER_SIZE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the provided miplevel is greater than zero, but the OpenGL implementation does not support creating from non-zero mipmap levels. */
+    class invalid_mip_level : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_MIP_LEVEL."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the total number of work items is invalid (for example, not divisible by the number of work items per work group). */
+    class invalid_global_work_size : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_GLOBAL_WORK_SIZE."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if an invalid property is provided to a function (vague value). */
+    class invalid_property : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: CL_INVALID_PROPERTY."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the returned error cannot be resolved to some defined error constant. Might result from invalid sources, invalid memory operations, etc. */
+    class unknown_error : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: ViennaCL encountered an unknown OpenCL error. In some cases, this might be due to an invalid global work size, but it can also be due to several compilation errors."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+    /** @brief Exception thrown if the user wants to use double precision arithmetics, but the device does not support double precision. */
+    class double_precision_not_provided_error : public std::exception
+    {
+      virtual const char* what() const throw()
+      {
+        return "ViennaCL: FATAL ERROR: You requested to create a ViennaCL type using double precision. However, double precision is not supported by your device."
+               VIENNACL_BUG_REPORT_STRING;
+      }
+    };
+
+
+    /** @brief An error reporting class. Template argument is used to avoid problems with external linkage.
+    *
+    *  Do not use this class directly, use the macro CL_ERROR_CHECK instead.
+    *  @tparam T   Useless. Helps to avoid troubles with external linkage of namespace functions.
+    */
+    template <typename T>
+    struct error_checker
+    {
+
+      /** @brief Trows exceptions that reflect OpenCL error codes */
+      static void raise_exception(cl_int err)
+      {
+        switch (err)
+        {
+          case CL_DEVICE_NOT_FOUND:               throw device_not_found();
+          case CL_DEVICE_NOT_AVAILABLE:           throw device_not_available();
+          case CL_COMPILER_NOT_AVAILABLE:         throw compiler_not_available();
+          case CL_MEM_OBJECT_ALLOCATION_FAILURE:  throw mem_object_allocation_failure();
+          case CL_OUT_OF_RESOURCES:               throw out_of_resources();
+          case CL_OUT_OF_HOST_MEMORY:             throw out_of_host_memory();
+          case CL_PROFILING_INFO_NOT_AVAILABLE:   throw profiling_info_not_available();
+          case CL_MEM_COPY_OVERLAP:               throw mem_copy_overlap();
+          case CL_IMAGE_FORMAT_MISMATCH:          throw image_format_mismatch();
+          case CL_IMAGE_FORMAT_NOT_SUPPORTED:     throw image_format_not_supported();
+          case CL_BUILD_PROGRAM_FAILURE:          throw build_program_failure();
+          case CL_MAP_FAILURE:                    throw map_failure();
+
+          case CL_INVALID_VALUE:                  throw invalid_value();
+          case CL_INVALID_DEVICE_TYPE:            throw invalid_device_type();
+          case CL_INVALID_PLATFORM:               throw invalid_platform();
+          case CL_INVALID_DEVICE:                 throw invalid_device();
+          case CL_INVALID_CONTEXT:                throw invalid_context();
+          case CL_INVALID_QUEUE_PROPERTIES:       throw invalid_queue_properties();
+          case CL_INVALID_COMMAND_QUEUE:          throw invalid_command_queue();
+          case CL_INVALID_HOST_PTR:               throw invalid_host_ptr();
+          case CL_INVALID_MEM_OBJECT:             throw invalid_mem_object();
+          case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor();
+          case CL_INVALID_IMAGE_SIZE:             throw invalid_image_size();
+          case CL_INVALID_SAMPLER:                throw invalid_sampler();
+          case CL_INVALID_BINARY:                 throw invalid_binary();
+          case CL_INVALID_BUILD_OPTIONS:          throw invalid_build_options();
+          case CL_INVALID_PROGRAM:                throw invalid_program();
+          case CL_INVALID_PROGRAM_EXECUTABLE:     throw invalid_program_executable();
+          case CL_INVALID_KERNEL_NAME:            throw invalid_kernel_name();
+          case CL_INVALID_KERNEL_DEFINITION:      throw invalid_kernel_definition();
+          case CL_INVALID_KERNEL:                 throw invalid_kernel();
+          case CL_INVALID_ARG_INDEX:              throw invalid_arg_index();
+          case CL_INVALID_ARG_VALUE:              throw invalid_arg_value();
+          case CL_INVALID_ARG_SIZE:               throw invalid_arg_size();
+          case CL_INVALID_KERNEL_ARGS:            throw invalid_kernel_args();
+          case CL_INVALID_WORK_DIMENSION:         throw invalid_work_dimension();
+          case CL_INVALID_WORK_GROUP_SIZE:        throw invalid_work_group_size();
+          case CL_INVALID_WORK_ITEM_SIZE:         throw invalid_work_item_size();
+          case CL_INVALID_GLOBAL_OFFSET:          throw invalid_global_offset();
+          case CL_INVALID_EVENT_WAIT_LIST:        throw invalid_event_wait_list();
+          case CL_INVALID_EVENT:                  throw invalid_event();
+          case CL_INVALID_OPERATION:              throw invalid_operation();
+          case CL_INVALID_GL_OBJECT:              throw invalid_gl_object();
+          case CL_INVALID_BUFFER_SIZE:            throw invalid_buffer_size();
+          case CL_INVALID_MIP_LEVEL:              throw invalid_mip_level();
+          case CL_INVALID_GLOBAL_WORK_SIZE:       throw invalid_global_work_size();
+      #ifdef CL_INVALID_PROPERTY
+    case CL_INVALID_PROPERTY:               throw invalid_property();
+      #endif
+          //  return "CL_INVALID_GLOBAL_WORK_SIZE";
+
+          default: throw unknown_error();
+        }
+
+      } //getErrorString
+
+      /** @brief Checks whether an OpenCL error has occured.
+      *
+      *  Do not use this function directly, use the macro CL_ERROR_CHECK instead.
+      */
+      static void checkError(cl_int err,
+          #ifdef VIENNACL_DEBUG_ALL
+                             const char * file,
+                             const char * func,
+                             int line)
+          #else
+                             const char *,
+                             const char *,
+                             int)
+          #endif
+      {
+        if (err != CL_SUCCESS)
+        {
+          #ifdef VIENNACL_DEBUG_ALL
+          std::cerr << "ViennaCL: Error " << err  << " in function " << func << " ( "<< file << ":" << line << " ) " << std::endl;
+          #endif
+          raise_exception(err);
+        }
+      } //checkError()
+
+    }; //struct
+
+    #define VIENNACL_ERR_CHECK(err) viennacl::ocl::error_checker<void>::checkError(err, __FILE__, __FUNCTION__, __LINE__);
+
+  } //namespace ocl
+} //namespace viennacl
+
+#endif
+
diff --git a/viennacl/ocl/forwards.h b/viennacl/ocl/forwards.h
index b74c3be..30dbf08 100644
--- a/viennacl/ocl/forwards.h
+++ b/viennacl/ocl/forwards.h
@@ -2,22 +2,23 @@
 #define VIENNACL_OCL_FORWARDS_H_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file ocl/forwards.h
+/** @file viennacl/ocl/forwards.h
     @brief This file provides the forward declarations for the OpenCL layer of ViennaCL
 */
 
@@ -30,12 +31,16 @@ namespace viennacl
   namespace ocl
   {
     //device type tags (cf. OpenCL standard)
+    /** @brief A tag identifying OpenCL devices as GPUs. */
     struct gpu_tag {};
+    /** @brief A tag identifying OpenCL devices as CPUs. */
     struct cpu_tag {};
+    /** @brief A tag identifying OpenCL devices as accelerators (e.g. Intel Xeon Phi) */
     struct accelerator_tag {};
+    /** @brief A tag denoting the default OpenCL device type (SDK-specific) */
     struct default_tag {};
-    
-    
+
+
     class kernel;
     class device;
     class command_queue;
@@ -47,7 +52,7 @@ namespace viennacl
 
     template <typename KernelType>
     void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue);
-    
+
     inline viennacl::ocl::context & current_context();
     inline viennacl::ocl::device const & current_device();
   }
diff --git a/viennacl/ocl/handle.hpp b/viennacl/ocl/handle.hpp
index 07e1038..722fae0 100644
--- a/viennacl/ocl/handle.hpp
+++ b/viennacl/ocl/handle.hpp
@@ -1,196 +1,226 @@
-#ifndef VIENNACL_OCL_HANDLE_HPP_
-#define VIENNACL_OCL_HANDLE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/ocl/handle.hpp
-    @brief Implementation of a smart-pointer-like class for handling OpenCL handles.
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include <assert.h>
-#include <string>
-#include <iostream>
-#include "viennacl/ocl/error.hpp"
-
-namespace viennacl
-{
-  namespace ocl
-  {
-    /** @brief Helper for OpenCL reference counting used by class handle.
-    *   @tparam OCL_TYPE Must be one out of cl_mem, cl_program, cl_kernel, cl_command_queue and cl_context, otherwise a compile time error is thrown.
-    */
-    template<class OCL_TYPE>
-    class handle_inc_dec_helper
-    {
-      typedef typename OCL_TYPE::ERROR_TEMPLATE_ARGUMENT_FOR_CLASS_INVALID   ErrorType;
-    };
-    
-    
-    //cl_mem:
-    template <>
-    struct handle_inc_dec_helper<cl_mem>
-    {
-      static void inc(cl_mem & something)
-      {
-        cl_int err = clRetainMemObject(something);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      static void dec(cl_mem & something)
-      {
-        #ifndef __APPLE__
-        cl_int err = clReleaseMemObject(something);
-        VIENNACL_ERR_CHECK(err);
-        #endif
-      }
-    };
-    
-    //cl_program:
-    template <>
-    struct handle_inc_dec_helper<cl_program>
-    {
-      static void inc(cl_program & something)
-      {
-        cl_int err = clRetainProgram(something);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      static void dec(cl_program & something)
-      {
-        #ifndef __APPLE__
-        cl_int err = clReleaseProgram(something);
-        VIENNACL_ERR_CHECK(err);
-        #endif
-      }
-    };
-    
-    //cl_kernel:
-    template <>
-    struct handle_inc_dec_helper<cl_kernel>
-    {
-      static void inc(cl_kernel & something)
-      {
-        cl_int err = clRetainKernel(something);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      static void dec(cl_kernel & something)
-      {
-        #ifndef __APPLE__
-        cl_int err = clReleaseKernel(something);
-        VIENNACL_ERR_CHECK(err);
-        #endif
-      }
-    };
-
-    //cl_command_queue:
-    template <>
-    struct handle_inc_dec_helper<cl_command_queue>
-    {
-      static void inc(cl_command_queue & something)
-      {
-        cl_int err = clRetainCommandQueue(something);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      static void dec(cl_command_queue & something)
-      {
-        #ifndef __APPLE__
-        cl_int err = clReleaseCommandQueue(something);
-        VIENNACL_ERR_CHECK(err);
-        #endif
-      }
-    };
-    
-    //cl_context:
-    template <>
-    struct handle_inc_dec_helper<cl_context>
-    {
-      static void inc(cl_context & something)
-      {
-        cl_int err = clRetainContext(something);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      static void dec(cl_context & something)
-      {
-        #ifndef __APPLE__
-        cl_int err = clReleaseContext(something);
-        VIENNACL_ERR_CHECK(err);
-        #endif
-      }
-    };
-    
-    /** @brief Handle class the effectively represents a smart pointer for OpenCL handles */
-    template<class OCL_TYPE>
-    class handle
-    {
-    public:
-      handle() : h_(0) {}
-      handle(const OCL_TYPE & _something) : h_(_something) {}
-      handle(const handle & other) : h_(other.h_) { if (h_ != 0) inc(); }
-      ~handle() { if (h_ != 0) dec(); }
-      handle & operator=(const handle & other)
-      {
-        if (h_ != 0) 
-          dec();
-        h_ = other.h_;
-        inc();
-        return *this;
-      }
-      handle & operator=(const OCL_TYPE & _something)
-      {
-        if (h_ != 0) dec();
-        h_ = _something;
-        return *this;
-      }
-      
-      /** @brief Implicit conversion to the plain OpenCL handle. DEPRECATED and will be removed some time in the future. */
-      operator OCL_TYPE() const { return h_; }
-      
-      const OCL_TYPE & get() const { return h_; }
-      
-      
-      
-      /** @brief Swaps the OpenCL handle of two handle objects */
-      handle & swap(handle & other)
-      {
-        OCL_TYPE tmp = other.h_;
-        other.h_ = this->h_;
-        this->h_ = tmp;
-        return *this;
-      }
-      
-      /** @brief Manually increment the OpenCL reference count. Typically called automatically, but is necessary if user-supplied memory objects are wrapped. */
-      void inc() { handle_inc_dec_helper<OCL_TYPE>::inc(h_); };
-      /** @brief Manually decrement the OpenCL reference count. Typically called automatically, but might be useful with user-supplied memory objects.  */
-      void dec() { handle_inc_dec_helper<OCL_TYPE>::dec(h_); };
-    private:
-      OCL_TYPE h_;
-    };
-
-    
-  } //namespace ocl
-} //namespace viennacl
-
-#endif
+#ifndef VIENNACL_OCL_HANDLE_HPP_
+#define VIENNACL_OCL_HANDLE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/handle.hpp
+    @brief Implementation of a smart-pointer-like class for handling OpenCL handles.
+*/
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <assert.h>
+#include <string>
+#include <iostream>
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/ocl/error.hpp"
+
+namespace viennacl
+{
+  namespace ocl
+  {
+    /** @brief Helper for OpenCL reference counting used by class handle.
+    *   @tparam OCL_TYPE Must be one out of cl_mem, cl_program, cl_kernel, cl_command_queue and cl_context, otherwise a compile time error is thrown.
+    */
+    template<class OCL_TYPE>
+    class handle_inc_dec_helper
+    {
+      typedef typename OCL_TYPE::ERROR_TEMPLATE_ARGUMENT_FOR_CLASS_INVALID   ErrorType;
+    };
+
+    /** \cond */
+    //cl_mem:
+    template <>
+    struct handle_inc_dec_helper<cl_mem>
+    {
+      static void inc(cl_mem & something)
+      {
+        cl_int err = clRetainMemObject(something);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      static void dec(cl_mem & something)
+      {
+        #ifndef __APPLE__
+        cl_int err = clReleaseMemObject(something);
+        VIENNACL_ERR_CHECK(err);
+        #endif
+      }
+    };
+
+    //cl_program:
+    template <>
+    struct handle_inc_dec_helper<cl_program>
+    {
+      static void inc(cl_program & something)
+      {
+        cl_int err = clRetainProgram(something);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      static void dec(cl_program & something)
+      {
+        #ifndef __APPLE__
+        cl_int err = clReleaseProgram(something);
+        VIENNACL_ERR_CHECK(err);
+        #endif
+      }
+    };
+
+    //cl_kernel:
+    template <>
+    struct handle_inc_dec_helper<cl_kernel>
+    {
+      static void inc(cl_kernel & something)
+      {
+        cl_int err = clRetainKernel(something);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      static void dec(cl_kernel & something)
+      {
+        #ifndef __APPLE__
+        cl_int err = clReleaseKernel(something);
+        VIENNACL_ERR_CHECK(err);
+        #endif
+      }
+    };
+
+    //cl_command_queue:
+    template <>
+    struct handle_inc_dec_helper<cl_command_queue>
+    {
+      static void inc(cl_command_queue & something)
+      {
+        cl_int err = clRetainCommandQueue(something);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      static void dec(cl_command_queue & something)
+      {
+        #ifndef __APPLE__
+        cl_int err = clReleaseCommandQueue(something);
+        VIENNACL_ERR_CHECK(err);
+        #endif
+      }
+    };
+
+    //cl_context:
+    template <>
+    struct handle_inc_dec_helper<cl_context>
+    {
+      static void inc(cl_context & something)
+      {
+        cl_int err = clRetainContext(something);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      static void dec(cl_context & something)
+      {
+        #ifndef __APPLE__
+        cl_int err = clReleaseContext(something);
+        VIENNACL_ERR_CHECK(err);
+        #endif
+      }
+    };
+    /** \endcond */
+
+    /** @brief Handle class the effectively represents a smart pointer for OpenCL handles */
+    template<class OCL_TYPE>
+    class handle
+    {
+      public:
+        handle() : h_(0), p_context_(NULL) {}
+        handle(const OCL_TYPE & something, viennacl::ocl::context const & c) : h_(something), p_context_(&c) {}
+        handle(const handle & other) : h_(other.h_), p_context_(other.p_context_) { if (h_ != 0) inc(); }
+        ~handle() { if (h_ != 0) dec(); }
+
+        /** @brief Copies the OpenCL handle from the provided handle. Does not take ownership like e.g. std::auto_ptr<>, so both handle objects are valid (more like shared_ptr). */
+        handle & operator=(const handle & other)
+        {
+          if (h_ != 0)
+            dec();
+          h_         = other.h_;
+          p_context_ = other.p_context_;
+          inc();
+          return *this;
+        }
+
+        /** @brief Wraps an OpenCL handle. Does not change the context of this handle object! Decreases the reference count if the handle object is destroyed or another OpenCL handle is assigned. */
+        handle & operator=(const OCL_TYPE & something)
+        {
+          if (h_ != 0) dec();
+          h_ = something;
+          return *this;
+        }
+
+        /** @brief Wraps an OpenCL handle including its associated context. Decreases the reference count if the handle object is destroyed or another OpenCL handle is assigned. */
+        handle & operator=(std::pair<OCL_TYPE, cl_context> p)
+        {
+          if (h_ != 0) dec();
+          h_         = p.first;
+          p_context_ = p.second;
+          return *this;
+        }
+
+
+        /** @brief Implicit conversion to the plain OpenCL handle. DEPRECATED and will be removed some time in the future. */
+        operator OCL_TYPE() const { return h_; }
+
+        const OCL_TYPE & get() const { return h_; }
+
+        viennacl::ocl::context const & context() const
+        {
+          assert(p_context_ != NULL && bool("Logic error: Accessing dangling context from handle."));
+          return *p_context_;
+        }
+        void context(viennacl::ocl::context const & c) { p_context_ = &c; }
+
+
+        /** @brief Swaps the OpenCL handle of two handle objects */
+        handle & swap(handle & other)
+        {
+          OCL_TYPE tmp = other.h_;
+          other.h_ = this->h_;
+          this->h_ = tmp;
+
+          viennacl::ocl::context const * tmp2 = other.p_context_;
+          other.p_context_ = this->p_context_;
+          this->p_context_ = tmp2;
+
+          return *this;
+        }
+
+        /** @brief Manually increment the OpenCL reference count. Typically called automatically, but is necessary if user-supplied memory objects are wrapped. */
+        void inc() { handle_inc_dec_helper<OCL_TYPE>::inc(h_); }
+        /** @brief Manually decrement the OpenCL reference count. Typically called automatically, but might be useful with user-supplied memory objects.  */
+        void dec() { handle_inc_dec_helper<OCL_TYPE>::dec(h_); }
+      private:
+        OCL_TYPE h_;
+        viennacl::ocl::context const * p_context_;
+    };
+
+
+  } //namespace ocl
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/ocl/infos.hpp b/viennacl/ocl/infos.hpp
new file mode 100644
index 0000000..3534f6c
--- /dev/null
+++ b/viennacl/ocl/infos.hpp
@@ -0,0 +1,268 @@
+#ifndef VIENNACL_OCL_INFOS_HPP_
+#define VIENNACL_OCL_INFOS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/infos.hpp
+    @brief Implementation of convenience functions to get infos
+*/
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#include <vector>
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/ocl/error.hpp"
+
+namespace viennacl{
+
+    namespace ocl{
+
+      /** @brief Implementation details for the OpenCL managment layer in ViennaCL */
+    namespace detail{
+
+    /** @brief Helper class for obtaining informations from the OpenCL backend. Deprecated! */
+    template<typename T>
+    struct info;
+
+    /** \cond */
+    template<>
+    struct info<cl_mem>{
+      typedef cl_mem_info type;
+      static void get(cl_mem mem, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
+          cl_int err = clGetMemObjectInfo(mem,param_name,param_value_size,param_value,param_value_size_ret);
+          VIENNACL_ERR_CHECK(err);
+      }
+    };
+
+    template<>
+    struct info<cl_device_id>{
+      typedef cl_device_info type;
+      static void get(cl_device_id device, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
+          cl_int err = clGetDeviceInfo(device,param_name,param_value_size,param_value,param_value_size_ret);
+          VIENNACL_ERR_CHECK(err);
+      }
+    };
+
+    template<>
+    struct info<cl_kernel>{
+      typedef cl_kernel_info type;
+      static void get(cl_kernel kernel, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
+          cl_int err = clGetKernelInfo(kernel,param_name,param_value_size,param_value,param_value_size_ret);
+          VIENNACL_ERR_CHECK(err);
+      }
+
+      static void get(cl_kernel kernel, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
+          cl_int err = clGetKernelWorkGroupInfo(kernel, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
+          VIENNACL_ERR_CHECK(err);
+      }
+    };
+
+    template<>
+    struct info<cl_context>{
+      typedef cl_context_info type;
+      static void get(cl_context context, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
+          cl_int err = clGetContextInfo(context,param_name,param_value_size,param_value,param_value_size_ret);
+          VIENNACL_ERR_CHECK(err);
+      }
+    };
+
+    template<>
+    struct info<cl_program>{
+      typedef cl_program_info type;
+      static void get(cl_program context, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
+          cl_int err = clGetProgramInfo(context,param_name,param_value_size,param_value,param_value_size_ret);
+          VIENNACL_ERR_CHECK(err);
+      }
+    };
+
+    template<class RES_T>
+    struct get_info_impl{
+
+        template<class MEM_T, class INFO_T>
+        RES_T operator()(MEM_T const & mem, INFO_T const & info){
+            RES_T res;
+            detail::info<MEM_T>::get(mem,info,sizeof(RES_T),&res,NULL);
+            return res;
+        }
+
+        template<class MEM_T, class MEM2_T, class INFO_T>
+        RES_T operator()(MEM_T const & mem, MEM2_T const & mem2, INFO_T const & info){
+            RES_T res;
+            detail::info<MEM_T>::get(mem,mem2, info,sizeof(RES_T),&res,NULL);
+            return res;
+        }
+    };
+
+    template<>
+    struct get_info_impl<std::string>{
+
+        template<class MEM_T, class INFO_T>
+        std::string operator()(const MEM_T &mem, const INFO_T &info){
+            char buff[1024];
+            detail::info<MEM_T>::get(mem,info,1024,buff,NULL);
+            return std::string(buff);
+        }
+    };
+
+    template<class T>
+    struct get_info_impl<std::vector<T> >{
+        template<class MEM_T, class INFO_T>
+        std::vector<T> operator()(const MEM_T &mem, const INFO_T &info){
+            size_t vec_size;
+            detail::info<MEM_T>::get(mem,info,0,NULL,&vec_size);
+            std::vector<T> res(vec_size/sizeof(T));
+            detail::info<MEM_T>::get(mem,info,vec_size,res.data(),NULL);
+            return res;
+        }
+    };
+
+    template<typename T, typename info<T>::type param>
+    struct return_type;
+    /** \endcond */
+
+    /** \cond */
+     #define SET_INFO_RETURN_TYPE(DATA_TYPE,NAME,RETURN_TYPE) template<> struct return_type<DATA_TYPE, NAME> { typedef RETURN_TYPE Result; }
+
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_TYPE, cl_mem_object_type);
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_FLAGS, cl_mem_flags);
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_SIZE, size_t);
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_HOST_PTR, void*);
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_MAP_COUNT, cl_uint);
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_REFERENCE_COUNT, cl_uint);
+     SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_CONTEXT, cl_context);
+
+     SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_REFERENCE_COUNT,cl_uint);
+
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_ADDRESS_BITS, cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_AVAILABLE, cl_bool);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_COMPILER_AVAILABLE, cl_bool);
+//      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_ENDIAN_LITTLE, cl_bool);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_EXTENSIONS, std::string);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong);
+//      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_IMAGE_SUPPORT, cl_bool);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_IMAGE2D_MAX_HEIGHT , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_IMAGE2D_MAX_WIDTH , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_IMAGE3D_MAX_DEPTH , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_IMAGE3D_MAX_HEIGHT , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_IMAGE3D_MAX_WIDTH , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_CLOCK_FREQUENCY , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_COMPUTE_UNITS , cl_uint); //The minimum value is 1
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_CONSTANT_ARGS  , cl_uint); //The minimum value is 8
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE   , cl_ulong); //The minimum value is 64 KB
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_MEM_ALLOC_SIZE , cl_ulong); //The minimum value is max (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024)
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_PARAMETER_SIZE  , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_READ_IMAGE_ARGS  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_SAMPLERS , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_WORK_GROUP_SIZE , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_WORK_ITEM_SIZES , std::vector<size_t>);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MAX_WRITE_IMAGE_ARGS , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MEM_BASE_ADDR_ALIGN  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_NAME , std::string);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PLATFORM , cl_platform_id);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PROFILE , std::string);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_PROFILING_TIMER_RESOLUTION , size_t);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_QUEUE_PROPERTIES , cl_command_queue_properties);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_SINGLE_FP_CONFIG  , cl_device_fp_config);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_TYPE , cl_device_type);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_VENDOR , std::string);
+      SET_INFO_RETURN_TYPE(cl_device_id,  CL_DEVICE_VENDOR_ID  , cl_uint);
+      SET_INFO_RETURN_TYPE(cl_device_id,   CL_DEVICE_VERSION  , std::string);
+      SET_INFO_RETURN_TYPE(cl_device_id,   CL_DRIVER_VERSION  , std::string);
+
+
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_FUNCTION_NAME, std::string);
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_NUM_ARGS, cl_uint);
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_REFERENCE_COUNT, cl_uint);
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_CONTEXT, cl_context);
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_PROGRAM, cl_program);
+
+
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_WORK_GROUP_SIZE, size_t);
+//      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_COMPILE_WORK_GROUP_SIZE, size_t[3]);
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong);
+      SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_t);
+
+      SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_NUM_DEVICES, cl_uint);
+      SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_REFERENCE_COUNT, cl_uint);
+      SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_PROPERTIES, cl_context_properties);
+
+      #undef SET_INFO_RETURN_TYPE
+
+      /** \endcond */
+    }
+
+    template<cl_device_info param>
+    typename detail::return_type<cl_device_id, param>::Result info(cl_device_id const & handle){
+        typedef typename detail::return_type<cl_device_id, param>::Result res_t;
+        return detail::get_info_impl<res_t>()(handle,param);
+    }
+
+    template<cl_mem_info param>
+    typename detail::return_type<cl_mem, param>::Result info(cl_mem const & handle){
+        typedef typename detail::return_type<cl_mem, param>::Result res_t;
+        return detail::get_info_impl<res_t>()(handle,param);
+    }
+    template<cl_program_info param>
+    typename detail::return_type<cl_program, param>::Result info(cl_program const & handle){
+        typedef typename detail::return_type<cl_program, param>::Result res_t;
+        return detail::get_info_impl<res_t>()(handle,param);
+    }
+
+//    template<cl_kernel_info param>
+//    typename detail::return_type<cl_kernel, param>::Result info(cl_kernel const & handle){
+//        typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+//        return detail::get_info_impl<res_t>()(handle,param);
+//    }
+
+//    template<cl_kernel_work_group_info param>
+//    typename detail::return_type<cl_kernel, param>::Result info(cl_kernel const & handle, cl_device_id const & handle2){
+//        typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+//        return detail::get_info_impl<res_t>()(handle,handle2,param);
+//    }
+
+    template<cl_context_info param>
+    typename detail::return_type<cl_context, param>::Result info(cl_context const & handle){
+        typedef typename detail::return_type<cl_context, param>::Result res_t;
+        return detail::get_info_impl<res_t>()(handle,param);
+    }
+
+    template<class OCL_TYPE, typename detail::info<OCL_TYPE>::type param>
+    typename detail::return_type<OCL_TYPE, param>::Result info(OCL_TYPE const & handle){
+        return viennacl::ocl::info(handle.get());
+    }
+
+    }
+}
+#endif // INFOS_HPP
diff --git a/viennacl/ocl/kernel.hpp b/viennacl/ocl/kernel.hpp
index 2cd386b..5b98b97 100644
--- a/viennacl/ocl/kernel.hpp
+++ b/viennacl/ocl/kernel.hpp
@@ -1,554 +1,836 @@
-#ifndef VIENNACL_OCL_KERNEL_HPP_
-#define VIENNACL_OCL_KERNEL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file kernel.hpp
-    @brief Representation of an OpenCL kernel in ViennaCL.
-*/
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#include "viennacl/ocl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/program.hpp"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/local_mem.hpp"
-
-namespace viennacl
-{
-  namespace ocl
-  {
-    
-    /** @brief Represents an OpenCL kernel within ViennaCL */
-    class kernel
-    {
-      template <typename KernelType>
-      friend void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue);
-      
-      
-    public:
-      kernel() : handle_(0)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Creating kernel object (default CTOR)" << std::endl;
-        #endif
-        set_work_size_defaults();
-      }
-      
-      kernel(viennacl::ocl::handle<cl_program> const & prog, std::string const & name) 
-       : handle_(0), program_(prog), name_(name), init_done_(false)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Creating kernel object (full CTOR)" << std::endl;
-        #endif
-        set_work_size_defaults();
-      }
-      
-      kernel(kernel const & other) 
-       : handle_(other.handle_), program_(other.program_), name_(other.name_), init_done_(other.init_done_)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Creating kernel object (Copy CTOR)" << std::endl;
-        #endif
-        local_work_size_[0] = other.local_work_size_[0];
-        local_work_size_[1] = other.local_work_size_[1];
-        
-        global_work_size_[0] = other.global_work_size_[0];
-        global_work_size_[1] = other.global_work_size_[1];
-      }
-      
-      viennacl::ocl::kernel & operator=(const kernel & other)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Assigning kernel object" << std::endl;
-        #endif
-        handle_ = other.handle_;
-        program_ = other.program_;
-        name_ = other.name_;
-        init_done_ = other.init_done_;
-        local_work_size_[0] = other.local_work_size_[0];
-        local_work_size_[1] = other.local_work_size_[1];
-        global_work_size_[0] = other.global_work_size_[0];
-        global_work_size_[1] = other.global_work_size_[1];
-        return *this;
-      }
-      
-      
-      /** @brief Sets an unsigned integer argument at the provided position */
-      void arg(unsigned int pos, cl_uint val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting unsigned long kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uint), (void*)&val);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      /** @brief Sets a single precision floating point argument at the provided position */
-      void arg(unsigned int pos, float val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting floating point kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(float), (void*)&val);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      /** @brief Sets a double precision floating point argument at the provided position */
-      void arg(unsigned int pos, double val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting double precision kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(double), (void*)&val);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      //generic handling: call .handle() member
-      /** @brief Sets an OpenCL memory object at the provided position */
-      template<class VCL_TYPE>
-      void arg(unsigned int pos, VCL_TYPE const & val)
-      {
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting generic kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_mem temp = val.handle().get();
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      //forward handles directly:
-      /** @brief Sets an OpenCL object at the provided position */
-      template<class CL_TYPE>
-      void arg(unsigned int pos, viennacl::ocl::handle<CL_TYPE> const & h)
-      {
-        //arg(pos, h);
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting handle kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        CL_TYPE temp = h.get();
-        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      
-      //local buffer argument:
-      /** @brief Sets an OpenCL local memory object at the provided position */
-      void arg(unsigned int pos, const local_mem & mem)
-      {
-        unsigned int size =  mem.size();
-        init();
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting local memory kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
-        #endif
-        cl_int err = clSetKernelArg(handle_.get(), pos, size, 0);
-        VIENNACL_ERR_CHECK(err);
-      }
-      
-      
-      
-      /** @brief Convenience function for setting one kernel parameter */
-      template <typename T0>
-      kernel & operator()(T0 const & t0)
-      {
-         arg(0, t0);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting two kernel parameters */
-      template <typename T0, typename T1>
-      kernel & operator()(T0 const & t0, T1 const & t1)
-      {
-         arg(0, t0); arg(1, t1);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting three kernel parameters */
-      template <typename T0, typename T1, typename T2>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting four kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting five kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting six kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting seven kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting eight kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting nine kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting ten kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4,
-                typename T5, typename T6, typename T7, typename T8, typename T9>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4,
-                          T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting eleven kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twelve kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting thirteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11, typename T12>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11, T12 const & t12)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11); arg(12, t12);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting fourteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting fifteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting sixteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting seventeen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting eighteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17)
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting nineteen kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twenty kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twentyone kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twentytwo kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);
-         return *this;
-      }     
-
-      /** @brief Convenience function for setting twentythree kernel parameters */
-      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
-                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
-                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
-                typename T18, typename T19, typename T20, typename T21, typename T22>
-      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
-                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
-                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
-                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22
-                         )
-      {
-         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
-         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
-         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
-         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);  arg(22, t22);
-         return *this;
-      }     
-
-      /** @brief Returns the local work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      */
-      size_t local_work_size(int index = 0) const
-      {
-        assert(index == 0 || index == 1);
-        return local_work_size_[index];
-      }
-      /** @brief Returns the global work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      */
-      size_t global_work_size(int index = 0) const
-      { 
-        assert(index == 0 || index == 1);
-        return global_work_size_[index];
-      }
-
-      /** @brief Sets the local work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      * @param s       The new local work size
-      */
-      void local_work_size(int index, size_t s)
-      {
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting local work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
-        #endif
-        assert(index == 0 || index == 1);
-        local_work_size_[index] = s;
-      }
-      /** @brief Sets the global work size at the respective dimension
-      *
-      * @param index   Dimension index (currently either 0 or 1)
-      * @param s       The new global work size
-      */
-      void global_work_size(int index, size_t s)
-      { 
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Setting global work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
-        #endif
-        assert(index == 0 || index == 1);
-        global_work_size_[index] = s;
-      }
-
-      std::string const & name() const { return name_; }
-
-      viennacl::ocl::handle<cl_kernel> const & handle() const { return handle_; }
-
-
-    private:
-      void create_kernel()
-      {
-        cl_int err;
-        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-        std::cout << "ViennaCL: Building kernel " << name_ << std::endl;
-        #endif
-        handle_ = clCreateKernel(program_.get(), name_.c_str(), &err);
-        
-        if (err != CL_SUCCESS)
-        {
-          #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
-          std::cout << "ViennaCL: Could not create kernel '" << name_ << "'." << std::endl;
-          #endif
-          //std::cerr << "Could not build kernel '" << name_ << "'." << std::endl;
-        }
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      void set_work_size_defaults()
-      {
-        if (viennacl::ocl::current_device().type() == CL_DEVICE_TYPE_GPU)
-        {
-          local_work_size_[0] = 128; local_work_size_[1] = 0;
-          global_work_size_[0] = 128*128; global_work_size_[1] = 0;
-        }
-        else //assume CPU type:
-        {
-          //conservative assumption: one thread per CPU core:
-          local_work_size_[0] = 1; local_work_size_[1] = 0;
-          global_work_size_[0] = viennacl::ocl::current_device().max_compute_units(); global_work_size_[1] = 0;
-        }
-      }
-
-      void init()
-      {
-        if (!init_done_)
-        {
-          create_kernel();
-          init_done_ = true;
-        }
-      }
-      
-      viennacl::ocl::handle<cl_kernel> handle_;
-      viennacl::ocl::handle<cl_program> program_;
-      std::string name_;
-      bool init_done_;
-      size_t local_work_size_[2];
-      size_t global_work_size_[2];
-    };
-    
-  } //namespace ocl
-} //namespace viennacl
-
-#endif
+#ifndef VIENNACL_OCL_KERNEL_HPP_
+#define VIENNACL_OCL_KERNEL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ocl/kernel.hpp
+    @brief Representation of an OpenCL kernel in ViennaCL.
+*/
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/program.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/local_mem.hpp"
+#include "viennacl/ocl/infos.hpp"
+
+namespace viennacl
+{
+  namespace ocl
+  {
+    /** @brief Helper class for packing four cl_uint numbers into a uint4 type for access inside an OpenCL kernel.
+      *
+      * Since the primary use is for dealing with ranges and strides, the four members are termed accordingly.
+      */
+    struct packed_cl_uint
+    {
+      /** @brief Starting value of the integer stride. */
+      cl_uint start;
+      /** @brief Increment between integers. */
+      cl_uint stride;
+      /** @brief Number of values in the stride. */
+      cl_uint size;
+      /** @brief Internal length of the buffer. Might be larger than 'size' due to padding. */
+      cl_uint internal_size;
+    };
+
+    /** @brief Represents an OpenCL kernel within ViennaCL */
+    class kernel
+    {
+      template <typename KernelType>
+      friend void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue);
+
+      template<cl_kernel_info param>
+      friend typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k);
+
+      template<cl_kernel_info param>
+      friend typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k, viennacl::ocl::device const & d);
+
+
+    public:
+      typedef vcl_size_t            size_type;
+
+      kernel() : handle_(), p_program_(NULL), p_context_(NULL), name_()
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Creating kernel object (default CTOR)" << std::endl;
+        #endif
+      }
+
+      kernel(cl_kernel kernel_handle, viennacl::ocl::program const & kernel_program, viennacl::ocl::context const & kernel_context, std::string const & name)
+        : handle_(kernel_handle, kernel_context), p_program_(&kernel_program), p_context_(&kernel_context), name_(name)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Creating kernel object (full CTOR)" << std::endl;
+        #endif
+        set_work_size_defaults();
+      }
+
+      kernel(kernel const & other)
+        : handle_(other.handle_), p_program_(other.p_program_), p_context_(other.p_context_), name_(other.name_)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Creating kernel object (Copy CTOR)" << std::endl;
+        #endif
+        local_work_size_[0] = other.local_work_size_[0];
+        local_work_size_[1] = other.local_work_size_[1];
+        local_work_size_[2] = other.local_work_size_[2];
+
+        global_work_size_[0] = other.global_work_size_[0];
+        global_work_size_[1] = other.global_work_size_[1];
+        global_work_size_[2] = other.global_work_size_[2];
+      }
+
+      viennacl::ocl::kernel & operator=(const kernel & other)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Assigning kernel object" << std::endl;
+        #endif
+        handle_ = other.handle_;
+        p_program_ = other.p_program_;
+        p_context_ = other.p_context_;
+        name_ = other.name_;
+        local_work_size_[0] = other.local_work_size_[0];
+        local_work_size_[1] = other.local_work_size_[1];
+        local_work_size_[2] = other.local_work_size_[2];
+        global_work_size_[0] = other.global_work_size_[0];
+        global_work_size_[1] = other.global_work_size_[1];
+        global_work_size_[2] = other.global_work_size_[2];
+        return *this;
+      }
+
+      /** @brief Sets a char argument at the provided position */
+      void arg(unsigned int pos, cl_char val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting char kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_char), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets an unsigned char argument at the provided position */
+      void arg(unsigned int pos, cl_uchar val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting unsigned char kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uchar), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets an argument of type short at the provided position */
+      void arg(unsigned int pos, cl_short val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting short kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_short), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets an argument of type unsigned short at the provided position */
+      void arg(unsigned int pos, cl_ushort val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting unsigned short kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_ushort), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+
+      /** @brief Sets an unsigned integer argument at the provided position */
+      void arg(unsigned int pos, cl_uint val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting unsigned int kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uint), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets four packed unsigned integers as argument at the provided position */
+      void arg(unsigned int pos, packed_cl_uint val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting packed_cl_uint kernel argument (" << val.start << ", " << val.stride << ", " << val.size << ", " << val.internal_size << ") at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(packed_cl_uint), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets a single precision floating point argument at the provided position */
+      void arg(unsigned int pos, float val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting floating point kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(float), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets a double precision floating point argument at the provided position */
+      void arg(unsigned int pos, double val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting double precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(double), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets an int argument at the provided position */
+      void arg(unsigned int pos, cl_int val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting int precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_int), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      /** @brief Sets an unsigned long argument at the provided position */
+      void arg(unsigned int pos, cl_ulong val)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting ulong precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_ulong), (void*)&val);
+        VIENNACL_ERR_CHECK(err);
+      }
+
      /** @brief Sets a signed long (cl_long) argument at the provided position.
      *
      * NOTE(review): the original brief said "unsigned long" — a copy-paste from the
      * cl_ulong overload above; this overload handles the signed 64-bit cl_long type.
      */
      void arg(unsigned int pos, cl_long val)
      {
        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
        std::cout << "ViennaCL: Setting long precision kernel argument " << val << " at pos " << pos << " for kernel " << name_ << std::endl;
        #endif
        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_long), (void*)&val);
        VIENNACL_ERR_CHECK(err);
      }
+
+      //generic handling: call .handle() member
+      /** @brief Sets an OpenCL memory object at the provided position */
+      template<class VCL_TYPE>
+      void arg(unsigned int pos, VCL_TYPE const & val)
+      {
+        assert(&val.handle().opencl_handle().context() == &handle_.context() && bool("Kernel and memory object not in the same context!"));
+
+        cl_mem temp = val.handle().opencl_handle().get();
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting generic kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+      //forward handles directly:
+      /** @brief Sets an OpenCL object at the provided position */
+      template<class CL_TYPE>
+      void arg(unsigned int pos, viennacl::ocl::handle<CL_TYPE> const & h)
+      {
+        CL_TYPE temp = h.get();
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting handle kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+
+      //local buffer argument:
+      /** @brief Sets an OpenCL local memory object at the provided position */
+      void arg(unsigned int pos, const local_mem & mem)
+      {
+        cl_uint size = static_cast<cl_uint>(mem.size());
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting local memory kernel argument of size " << size << " bytes at pos " << pos << " for kernel " << name_ << std::endl;
+        #endif
+        cl_int err = clSetKernelArg(handle_.get(), pos, size, 0);
+        VIENNACL_ERR_CHECK(err);
+      }
+
+
+
      // The operator() overloads below forward each argument ti to arg(i, ti) in
      // positional order and return *this, so all kernel arguments can be set in a
      // single chained call before enqueueing the kernel.

      /** @brief Convenience function for setting one kernel parameter */
      template <typename T0>
      kernel & operator()(T0 const & t0)
      {
         arg(0, t0);
         return *this;
      }

      /** @brief Convenience function for setting two kernel parameters */
      template <typename T0, typename T1>
      kernel & operator()(T0 const & t0, T1 const & t1)
      {
         arg(0, t0); arg(1, t1);
         return *this;
      }

      /** @brief Convenience function for setting three kernel parameters */
      template <typename T0, typename T1, typename T2>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2)
      {
         arg(0, t0); arg(1, t1); arg(2, t2);
         return *this;
      }

      /** @brief Convenience function for setting four kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3);
         return *this;
      }

      /** @brief Convenience function for setting five kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4);
         return *this;
      }

      /** @brief Convenience function for setting six kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         return *this;
      }

      /** @brief Convenience function for setting seven kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6);
         return *this;
      }

      /** @brief Convenience function for setting eight kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7);
         return *this;
      }

      /** @brief Convenience function for setting nine kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8);
         return *this;
      }
+
      // operator() overloads for 10 to 20 arguments: each forwards ti to arg(i, ti)
      // in positional order and returns *this for call chaining.

      /** @brief Convenience function for setting ten kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4,
                typename T5, typename T6, typename T7, typename T8, typename T9>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4,
                          T5 const & t5, T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9);
         return *this;
      }

      /** @brief Convenience function for setting eleven kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5); arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10);
         return *this;
      }

      /** @brief Convenience function for setting twelve kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         return *this;
      }

      /** @brief Convenience function for setting thirteen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11, typename T12>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11, T12 const & t12)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11); arg(12, t12);
         return *this;
      }

      /** @brief Convenience function for setting fourteen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13);
         return *this;
      }

      /** @brief Convenience function for setting fifteen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14);
         return *this;
      }

      /** @brief Convenience function for setting sixteen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15);
         return *this;
      }

      /** @brief Convenience function for setting seventeen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16);
         return *this;
      }

      /** @brief Convenience function for setting eighteen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17)
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         return *this;
      }

      /** @brief Convenience function for setting nineteen kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18);
         return *this;
      }

      /** @brief Convenience function for setting twenty kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18, typename T19>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18, T19 const & t19
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18); arg(19, t19);
         return *this;
      }
+
      // operator() overloads for 21 to 25 arguments: each forwards ti to arg(i, ti)
      // in positional order and returns *this for call chaining.

      /** @brief Convenience function for setting twenty-one kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18, typename T19, typename T20>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18, T19 const & t19, T20 const & t20
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18); arg(19, t19); arg(20, t20);
         return *this;
      }

      /** @brief Convenience function for setting twenty-two kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18, typename T19, typename T20, typename T21>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);
         return *this;
      }

      /** @brief Convenience function for setting 23 kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18, typename T19, typename T20, typename T21, typename T22>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21);  arg(22, t22);
         return *this;
      }

      /** @brief Convenience function for setting 24 kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
         return *this;
      }

      /** @brief Convenience function for setting 25 kernel parameters */
      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
                typename T24>
      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
                          T24 const & t24
                         )
      {
         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
         arg(24, t24);
         return *this;
      }
+
+      /** @brief Convenience function for setting 27 kernel parameters */
+      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                typename T24, typename T25, typename T26>
+      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                          T24 const & t24, T25 const & t25, T26 const & t26
+                         )
+      {
+         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+         arg(24, t24); arg(25, t25); arg(26, t26);
+         return *this;
+      }
+
+      /** @brief Convenience function for setting 28 kernel parameters */
+      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                typename T24, typename T25, typename T26, typename T27>
+      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27
+                         )
+      {
+         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27);
+         return *this;
+      }
+
+      /** @brief Convenience function for setting 29 kernel parameters */
+      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                typename T24, typename T25, typename T26, typename T27, typename T28>
+      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28
+                         )
+      {
+         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28);
+         return *this;
+      }
+
+      /** @brief Convenience function for setting 30 kernel parameters */
+      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29>
+      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29
+                         )
+      {
+         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
+         return *this;
+      }
+
+      /** @brief Convenience function for setting 31 kernel parameters */
+      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
+                typename T30>
+      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
+                          T30 const & t30
+                         )
+      {
+         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
+         arg(30, t30);
+         return *this;
+      }
+
+      /** @brief Convenience function for setting 32 kernel parameters */
+      template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5,
+                typename T6, typename T7, typename T8, typename T9, typename T10, typename T11,
+                typename T12, typename T13, typename T14, typename T15, typename T16, typename T17,
+                typename T18, typename T19, typename T20, typename T21, typename T22, typename T23,
+                typename T24, typename T25, typename T26, typename T27, typename T28, typename T29,
+                typename T30, typename T31>
+      kernel & operator()(T0 const & t0, T1 const & t1, T2 const & t2, T3 const & t3, T4 const & t4, T5 const & t5,
+                          T6 const & t6, T7 const & t7, T8 const & t8, T9 const & t9, T10 const & t10, T11 const & t11,
+                          T12 const & t12, T13 const & t13, T14 const & t14, T15 const & t15, T16 const & t16, T17 const & t17,
+                          T18 const & t18, T19 const & t19, T20 const & t20, T21 const & t21, T22 const & t22, T23 const & t23,
+                          T24 const & t24, T25 const & t25, T26 const & t26, T27 const & t27, T28 const & t28, T29 const & t29,
+                          T30 const & t30, T31 const & t31
+                         )
+      {
+         arg(0, t0); arg(1, t1); arg(2, t2); arg(3, t3); arg(4, t4); arg(5, t5);
+         arg(6, t6); arg(7, t7); arg(8, t8); arg(9, t9); arg(10, t10); arg(11, t11);
+         arg(12, t12); arg(13, t13); arg(14, t14); arg(15, t15); arg(16, t16); arg(17, t17);
+         arg(18, t18); arg(19, t19); arg(20, t20); arg(21, t21); arg(22, t22); arg(23, t23);
+         arg(24, t24); arg(25, t25); arg(26, t26); arg(27, t27); arg(28, t28); arg(29, t29);
+         arg(30, t30); arg(31, t31);
+         return *this;
+      }
+
+
+
+
+      /** @brief Returns the local work size at the respective dimension
+      *
+      * @param index   Dimension index (currently either 0 or 1)
+      */
+      size_type local_work_size(int index = 0) const
+      {
+        assert(index < 3 && bool("Work size index out of bounds"));
+        return local_work_size_[index];
+      }
+      /** @brief Returns the global work size at the respective dimension
+      *
+      * @param index   Dimension index (currently either 0 or 1)
+      */
+      size_type global_work_size(int index = 0) const
+      {
+        assert(index < 3 && bool("Work size index out of bounds"));
+        return global_work_size_[index];
+      }
+
+      /** @brief Sets the local work size at the respective dimension
+      *
+      * @param index   Dimension index (currently either 0 or 1)
+      * @param s       The new local work size
+      */
+      void local_work_size(int index, size_type s)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting local work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
+        #endif
+        assert(index < 3 && bool("Work size index out of bounds"));
+        local_work_size_[index] = s;
+      }
+      /** @brief Sets the global work size at the respective dimension
+      *
+      * @param index   Dimension index (currently either 0 or 1)
+      * @param s       The new global work size
+      */
+      void global_work_size(int index, size_type s)
+      {
+        #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
+        std::cout << "ViennaCL: Setting global work size to " << s << " at index " << index << " for kernel " << name_ << std::endl;
+        #endif
+        assert(index < 3 && bool("Work size index out of bounds"));
+        global_work_size_[index] = s;
+      }
+
+      std::string const & name() const { return name_; }
+
+      viennacl::ocl::handle<cl_kernel> const & handle() const { return handle_; }
+
+      viennacl::ocl::context const & context() const { return *p_context_; }
+
+    private:
+
+      inline void set_work_size_defaults();    //see context.hpp for implementation
+
+      viennacl::ocl::handle<cl_kernel> handle_;
+      viennacl::ocl::program const * p_program_;
+      viennacl::ocl::context const * p_context_;
+      std::string name_;
+      size_type local_work_size_[3];
+      size_type global_work_size_[3];
+    };
+
+    /** @brief Queries information about a kernel
+  *
+  * @param k Corresponding kernel
+  */
+    template<cl_kernel_info param>
+    typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k)
+    {
+        typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+        return detail::get_info_impl<res_t>()(k.handle_.get(),param);
+    }
+
+  /** @brief Queries information about the execution of a kernel on a particular device
+   *
+   * @param k Corresponding kernel
+   * @param d Corresponding device
+   */
+    template<cl_kernel_info param>
+    typename detail::return_type<cl_kernel, param>::Result info(viennacl::ocl::kernel & k, viennacl::ocl::device const & d)
+    {
+        typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+        return detail::get_info_impl<res_t>()(k.handle_.get(),d.id(),param);
+    }
+
+  } //namespace ocl
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/ocl/local_mem.hpp b/viennacl/ocl/local_mem.hpp
index 11ba2df..30f9885 100644
--- a/viennacl/ocl/local_mem.hpp
+++ b/viennacl/ocl/local_mem.hpp
@@ -2,26 +2,29 @@
 #define VIENNACL_OCL_LOCAL_MEM_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 
-/** @file local_mem.hpp
+/** @file viennacl/ocl/local_mem.hpp
     @brief A local (shared) memory object for OpenCL
 */
 
+#include "viennacl/forwards.h"
+
 namespace viennacl
 {
   namespace ocl
@@ -30,18 +33,18 @@ namespace viennacl
     class local_mem
     {
       public:
-        local_mem(unsigned int s) : size_(s) {}
-        
+        local_mem(vcl_size_t s) : size_(s) {}
+
         /** @brief Returns size in bytes */
-        unsigned int size() const { return size_; }
+        vcl_size_t size() const { return size_; }
 
         /** @brief Sets the size of the local memory in bytes */
-        void size(unsigned int s) { size_ = s; }
+        void size(vcl_size_t s) { size_ = s; }
 
       private:
-        unsigned int size_;
+        vcl_size_t size_;
     };
-    
+
   }
 }
 #endif
diff --git a/viennacl/ocl/platform.hpp b/viennacl/ocl/platform.hpp
index 663c2d8..11a4708 100644
--- a/viennacl/ocl/platform.hpp
+++ b/viennacl/ocl/platform.hpp
@@ -2,22 +2,23 @@
 #define VIENNACL_OCL_PLATFORM_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file platform.hpp
+/** @file viennacl/ocl/platform.hpp
     @brief Implements a OpenCL platform within ViennaCL
 */
 
@@ -35,29 +36,45 @@ namespace viennacl
 {
   namespace ocl
   {
+
+    /** @brief Wrapper class for an OpenCL platform.
+      *
+      * This class was written when the OpenCL C++ bindings haven't been standardized yet.
+      * Regardless, it takes care about some additional details and is supposed to provide higher convenience.
+      */
     class platform
     {
-      
+
       public:
-        platform()
+        platform(vcl_size_t pf_index = 0)
         {
           cl_int err;
           cl_uint num_platforms;
-          cl_platform_id ids[3];
+          cl_platform_id ids[42];   //no more than 42 platforms supported...
           #if defined(VIENNACL_DEBUG_ALL)
           std::cout << "ViennaCL: Getting platform..." << std::endl;
           #endif
-          err = clGetPlatformIDs(1, ids, &num_platforms);
+          err = clGetPlatformIDs(42, ids, &num_platforms);
           VIENNACL_ERR_CHECK(err);
-          id_ = ids[0];
-          assert(num_platforms > 0 && "ViennaCL: ERROR: No platform found!");          
+          assert(num_platforms > pf_index && bool("ViennaCL: ERROR: Not enough platforms found!"));
+          id_ = ids[pf_index];
+          assert(num_platforms > 0 && bool("ViennaCL: ERROR: No platform found!"));
+        }
+
+        platform(cl_platform_id pf_id) : id_(pf_id) {}
+
+        platform(platform const & other) : id_(other.id_) {}
+
+        void operator=(cl_platform_id pf_id)
+        {
+          id_ = pf_id;
         }
-        
+
         cl_platform_id id() const
         {
           return id_;
         }
-        
+
         /** @brief Returns an information string */
         std::string info() const
         {
@@ -65,7 +82,7 @@ namespace viennacl
           cl_int err;
           err = clGetPlatformInfo(id_, CL_PLATFORM_VENDOR, 1024 * sizeof(char), buffer, NULL);
           VIENNACL_ERR_CHECK(err);
-          
+
           std::stringstream ss;
           ss << buffer << ": ";
 
@@ -73,10 +90,10 @@ namespace viennacl
           VIENNACL_ERR_CHECK(err);
 
           ss << buffer;
-          
+
           return ss.str();
         }
-        
+
         //////////////////// get device //////////////////
         /** @brief Returns the available devices of the supplied device type */
         std::vector<device> devices(cl_device_type dtype = CL_DEVICE_TYPE_DEFAULT)
@@ -93,25 +110,44 @@ namespace viennacl
             //workaround for ATI Stream SDK v2.3: No CPUs detected with default device type:
             err = clGetDeviceIDs(id_, CL_DEVICE_TYPE_CPU, VIENNACL_OCL_MAX_DEVICE_NUM, device_ids, &num_devices);
           }
-          
+
           VIENNACL_ERR_CHECK(err);
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
           std::cout << "ViennaCL: Found " << num_devices << " devices." << std::endl;
           #endif
-          
-          assert(num_devices > 0 && "Error in viennacl::ocl::platform::devices(): No OpenCL devices available!");
+
+          assert(num_devices > 0 && bool("Error in viennacl::ocl::platform::devices(): No OpenCL devices available!"));
           std::vector<device> devices;
-          
+
           for (cl_uint i=0; i<num_devices; ++i)
             devices.push_back(device(device_ids[i]));
 
           return devices;
         }
-        
+
       private:
         cl_platform_id id_;
     };
-    
+
+
+
+    inline std::vector< platform > get_platforms()
+    {
+      std::vector< platform > ret;
+      cl_int err;
+      cl_uint num_platforms;
+      cl_platform_id ids[42];   //no more than 42 platforms supported...
+      #if defined(VIENNACL_DEBUG_ALL)
+      std::cout << "ViennaCL: Getting platform..." << std::endl;
+      #endif
+      err = clGetPlatformIDs(42, ids, &num_platforms);
+      VIENNACL_ERR_CHECK(err);
+
+      for (cl_uint i = 0; i < num_platforms; ++i)
+        ret.push_back( platform(ids[i]) );
+
+      return ret;
+    }
   }
 }
 
diff --git a/viennacl/ocl/program.hpp b/viennacl/ocl/program.hpp
index a30e5de..7c70df6 100644
--- a/viennacl/ocl/program.hpp
+++ b/viennacl/ocl/program.hpp
@@ -2,22 +2,23 @@
 #define VIENNACL_OCL_PROGRAM_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file program.hpp
+/** @file viennacl/ocl/program.hpp
     @brief Implements an OpenCL program class for ViennaCL
 */
 
@@ -31,62 +32,45 @@ namespace viennacl
 {
   namespace ocl
   {
+    /** @brief Wrapper class for an OpenCL program.
+      *
+      * This class was written when the OpenCL C++ bindings haven't been standardized yet.
+      * Regardless, it takes care about some additional details and is supposed to provide higher convenience by holding the kernels defined in the program.
+      */
     class program
     {
-      friend class kernel;
-      
       typedef std::vector<viennacl::ocl::kernel>    KernelContainer;
-      
+
     public:
-      program() {}
-      program(viennacl::ocl::handle<cl_program> const & h, std::string const & prog_name = std::string()) : handle_(h), name_(prog_name) {}
-      
-      program(program const & other)
-      {
-        handle_ = other.handle_;
-        name_ = other.name_;
-        kernels_ = other.kernels_;
-      }
-      
+      program() : p_context_(NULL) {}
+      program(cl_program program_handle, viennacl::ocl::context const & program_context, std::string const & prog_name = std::string())
+        : handle_(program_handle, program_context), p_context_(&program_context), name_(prog_name) {}
+
+      program(program const & other) : handle_(other.handle_), p_context_(other.p_context_), name_(other.name_), kernels_(other.kernels_) {}
+
       viennacl::ocl::program & operator=(const program & other)
       {
         handle_ = other.handle_;
         name_ = other.name_;
+        p_context_ = other.p_context_;
         kernels_ = other.kernels_;
         return *this;
       }
 
       std::string const & name() const { return name_; }
-      
+
       /** @brief Adds a kernel to the program */
-      viennacl::ocl::kernel & add_kernel(std::string const & kernel_name)
-      {
-        viennacl::ocl::kernel temp(handle_, kernel_name);
-        kernels_.push_back(temp);
-        return kernels_.back();
-      }
-      
+      inline viennacl::ocl::kernel & add_kernel(cl_kernel kernel_handle, std::string const & kernel_name);   //see context.hpp for implementation
+
       /** @brief Returns the kernel with the provided name */
-      viennacl::ocl::kernel & get_kernel(std::string const & name)
-      {
-        //std::cout << "Requiring kernel " << name << " from program " << name_ << std::endl;
-        for (KernelContainer::iterator it = kernels_.begin();
-              it != kernels_.end();
-             ++it)
-        {
-          if (it->name() == name)
-            return *it;
-        }
-        std::cerr << "ViennaCL: FATAL ERROR: Could not find kernel '" << name << "'" << std::endl;
-        std::cout << "Number of kernels in program: " << kernels_.size() << std::endl;
-        assert(!"Kernel not found");
-        return kernels_[0];  //return a defined object
-      }
+      inline viennacl::ocl::kernel & get_kernel(std::string const & name);    //see context.hpp for implementation
 
-    private:
       const viennacl::ocl::handle<cl_program> & handle() const { return handle_; }
-      
+
+    private:
+
       viennacl::ocl::handle<cl_program> handle_;
+      viennacl::ocl::context const * p_context_;
       std::string name_;
       KernelContainer kernels_;
     };
diff --git a/viennacl/ocl/utils.hpp b/viennacl/ocl/utils.hpp
index 92e572b..5e297d1 100644
--- a/viennacl/ocl/utils.hpp
+++ b/viennacl/ocl/utils.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_OCL_UTILS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -22,6 +23,7 @@
 */
 
 #include <vector>
+#include <string>
 #include "viennacl/ocl/backend.hpp"
 #include "viennacl/ocl/device.hpp"
 
@@ -29,26 +31,54 @@ namespace viennacl
 {
   namespace ocl
   {
-    
+
     /** @brief Ensures that double precision types are only allocated if it is supported by the device. If double precision is requested for a device not capable of providing that, a double_precision_not_provided_error is thrown.
      */
     template <typename ScalarType>
     struct DOUBLE_PRECISION_CHECKER
     {
-      static void apply() {} 
+      static void apply(viennacl::ocl::context const &) {}
     };
-    
+
+    /** \cond */
     template <>
     struct DOUBLE_PRECISION_CHECKER<double>
     {
-      static void apply()
+      static void apply(viennacl::ocl::context const & ctx)
       {
-        if (!viennacl::ocl::current_device().double_support())
+        if (!ctx.current_device().double_support())
           throw viennacl::ocl::double_precision_not_provided_error();
       }
     };
-    
-    
+    /** \endcond */
+
+    /** \brief Helper class for converting a type to its string representation. */
+    template <typename T>
+    struct type_to_string;
+
+    /** \cond */
+    template <> struct type_to_string<char>   { static std::string apply() { return "char";  } };
+    template <> struct type_to_string<short>  { static std::string apply() { return "short"; } };
+    template <> struct type_to_string<int>    { static std::string apply() { return "int";   } };
+    template <> struct type_to_string<long>   { static std::string apply() { return "long";  } };
+
+    template <> struct type_to_string<unsigned char>   { static std::string apply() { return "uchar";  } };
+    template <> struct type_to_string<unsigned short>  { static std::string apply() { return "ushort"; } };
+    template <> struct type_to_string<unsigned int>    { static std::string apply() { return "uint";   } };
+    template <> struct type_to_string<unsigned long>   { static std::string apply() { return "ulong";  } };
+
+    template <> struct type_to_string<float>  { static std::string apply() { return "float";  } };
+    template <> struct type_to_string<double> { static std::string apply() { return "double"; } };
+    /** \endcond */
+
+    template <typename T>
+    void append_double_precision_pragma(viennacl::ocl::context const & /*ctx*/, std::string & /*source*/) {}
+
+    template <>
+    inline void append_double_precision_pragma<double>(viennacl::ocl::context const & ctx, std::string & source)
+    {
+      source.append("#pragma OPENCL EXTENSION " + ctx.current_device().double_support_extension() + " : enable\n\n");
+    }
 
   } //ocl
 } //viennacl
diff --git a/viennacl/rand/gaussian.hpp b/viennacl/rand/gaussian.hpp
new file mode 100644
index 0000000..c039ea1
--- /dev/null
+++ b/viennacl/rand/gaussian.hpp
@@ -0,0 +1,54 @@
+#ifndef VIENNACL_RAND_GAUSSIAN_HPP_
+#define VIENNACL_RAND_GAUSSIAN_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/backend/mem_handle.hpp"
+#include "viennacl/rand/utils.hpp"
+
+/** @file   viennacl/rand/gaussian.hpp
+    @brief  Unused: Generation of random numbers with a Gaussian normal distribution */
+
+/** \cond */
+
+
+namespace viennacl{
+
+namespace rand{
+
+struct gaussian_tag{
+    gaussian_tag(float _mu = 0, float _sigma = 1) : mu(_mu), sigma(_sigma){ }
+    float mu;
+    float sigma;
+};
+
+template<class ScalarType>
+struct buffer_dumper<ScalarType, gaussian_tag>{
+    static void dump(viennacl::backend::mem_handle const & buff, gaussian_tag tag, cl_uint start, cl_uint size){
+      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::rand<ScalarType,1>::program_name(),"dump_gaussian");
+      k.global_work_size(0, viennacl::tools::align_to_multiple<unsigned int>(size/2,k.local_work_size(0)));
+      viennacl::ocl::enqueue(k(buff.opencl_handle(), start, size, cl_float(tag.mu), cl_float(tag.sigma) , cl_uint(time(0))));
+    }
+};
+
+}
+
+}
+
+/** \endcond */
+#endif
diff --git a/viennacl/rand/uniform.hpp b/viennacl/rand/uniform.hpp
new file mode 100644
index 0000000..c43d6af
--- /dev/null
+++ b/viennacl/rand/uniform.hpp
@@ -0,0 +1,56 @@
+#ifndef VIENNACL_RAND_UNIFORM_HPP_
+#define VIENNACL_RAND_UNIFORM_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/backend/mem_handle.hpp"
+#include "viennacl/rand/utils.hpp"
+
+/** @file   viennacl/rand/uniform.hpp
+    @brief  Unused: Generation of uniformly distributed random numbers. */
+
+/** \cond */
+
+namespace viennacl{
+
+namespace rand{
+
+struct uniform_tag{
+    uniform_tag(unsigned int _a = 0, unsigned int _b = 1) : a(_a), b(_b){ }
+    float a;
+    float b;
+};
+
+template<class ScalarType>
+struct buffer_dumper<ScalarType, uniform_tag>{
+  static void dump(viennacl::backend::mem_handle const & buff, uniform_tag tag, cl_uint start, cl_uint size){
+    viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::rand<ScalarType,1>::program_name(),"dump_uniform");
+    k.global_work_size(0, viennacl::tools::align_to_multiple<unsigned int>(size,k.local_work_size(0)));
+    viennacl::ocl::enqueue(k(buff.opencl_handle(), start, size, cl_float(tag.a), cl_float(tag.b) , cl_uint(time(0))));
+  }
+};
+
+
+
+}
+
+}
+
+/** \endcond */
+
+#endif
diff --git a/viennacl/rand/utils.hpp b/viennacl/rand/utils.hpp
new file mode 100644
index 0000000..dea4dc2
--- /dev/null
+++ b/viennacl/rand/utils.hpp
@@ -0,0 +1,71 @@
+#ifndef VIENNACL_RAND_UTILS_HPP_
+#define VIENNACL_RAND_UTILS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+/** @file   viennacl/rand/utils.hpp
+    @brief  Unused: Helper functionality random number generation. */
+
+/** \cond */
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/linalg/kernels/rand_kernels.h"
+
+namespace viennacl{
+
+namespace rand{
+
+
+template<class SCALARTYPE, class DISTRIBUTION>
+struct random_matrix_t{
+    typedef size_t size_type;
+    random_matrix_t(size_type _size1, unsigned int _size2, DISTRIBUTION const & _distribution) : size1(_size1), size2(_size2), distribution(_distribution){
+        #ifdef VIENNACL_WITH_OPENCL
+        viennacl::linalg::kernels::rand<SCALARTYPE,1>::init();
+        #endif
+    }
+    size_type size1;
+    size_type size2;
+    DISTRIBUTION distribution;
+};
+
+
+template<class SCALARTYPE, class DISTRIBUTION>
+struct random_vector_t{
+    typedef size_t size_type;
+    random_vector_t(size_type _size, DISTRIBUTION const & _distribution) : size(_size), distribution(_distribution){
+        #ifdef VIENNACL_WITH_OPENCL
+        viennacl::linalg::kernels::rand<SCALARTYPE,1>::init();
+        #endif
+    }
+    size_type size;
+    DISTRIBUTION distribution;
+};
+
+template<class ScalarType, class Distribution>
+struct buffer_dumper;
+
+
+}
+
+}
+
+#endif
+
+/** \endcond */
+
+#endif
diff --git a/viennacl/range.hpp b/viennacl/range.hpp
index 88b13d8..12cb7bc 100644
--- a/viennacl/range.hpp
+++ b/viennacl/range.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_RANGE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -30,7 +31,7 @@ namespace viennacl
 {
 
   /** @brief A range class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
-   * 
+   *
    * Similar to the boost::numeric::ublas::basic_range class.
    */
   template <typename SizeType /* see forwards.h for default argument*/,
@@ -43,33 +44,33 @@ namespace viennacl
       typedef size_type            value_type;
       typedef value_type           const_reference;
       typedef const_reference      reference;
-      
+
       basic_range() : start_(0), size_(0) {}
       basic_range(size_type start_index, size_type stop_index) : start_(start_index), size_(stop_index - start_index)
       {
         assert(start_index <= stop_index);
       }
-        
-        
+
+
       size_type start() const { return start_; }
       size_type size() const { return size_; }
-      
-      const_reference operator()(size_type i) const 
+
+      const_reference operator()(size_type i) const
       {
         assert(i < size());
         return start_ + i;
       }
       const_reference operator[](size_type i) const { return operator()(i); }
-      
+
       bool operator==(const basic_range & r) const { return (start_ == r.start_) && (size_ == r.size_); }
       bool operator!=(const basic_range & r) const { return !(*this == r); }
-      
+
     private:
       size_type start_;
       size_type size_;
   };
-  
-  
+
+
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/scalar.hpp b/viennacl/scalar.hpp
index 195db29..a678263 100644
--- a/viennacl/scalar.hpp
+++ b/viennacl/scalar.hpp
@@ -1,524 +1,745 @@
-#ifndef VIENNACL_SCALAR_HPP_
-#define VIENNACL_SCALAR_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file scalar.hpp
-    @brief Implementation of the ViennaCL scalar class
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/linalg/kernels/scalar_kernels.h"
-
-#include <iostream>
-
-namespace viennacl
-{
-    /** @brief A proxy for scalar expressions (e.g. from inner vector products)
-    * 
-    * assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix = 2)
-    * @tparam LHS   The left hand side operand
-    * @tparam RHS   The right hand side operand
-    * @tparam OP    The operation tag
-    */
-    template <typename LHS, typename RHS, typename OP>
-    class scalar_expression
-    {
-        typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
-      public:
-        typedef typename DummyType::value_type    ScalarType;
-        
-        scalar_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
-        
-        /** @brief Returns the left hand side operand */
-        LHS & get_lhs() const { return _lhs; }
-        /** @brief Returns the left hand side operand */
-        RHS & get_rhs() const { return _rhs; }
-
-        /** @brief Conversion operator to a ViennaCL scalar */
-        operator ScalarType () const
-        {
-          viennacl::scalar<ScalarType> temp;
-          temp = *this;
-          return temp;
-        }
-
-      private:
-        LHS & _lhs;
-        RHS & _rhs;
-    };
-    
-    /** @brief This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type like float or double.
-    *
-    * Since every read and write operation requires a CPU->GPU or GPU->CPU transfer, this type should be used with care.
-    * The advantage of this type is that the GPU command queue can be filled without blocking read operations.
-    *
-    * @tparam TYPE  Either float or double. Checked at compile time.
-    */
-    template<class TYPE>
-    class scalar
-    {
-    public:
-      /** @brief Returns the underlying host scalar type. */
-      typedef typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<TYPE>::ResultType   value_type;
-      
-      /** @brief Allocates the memory for the scalar, but does not set it to zero. */
-      scalar()
-      {
-        viennacl::linalg::kernels::scalar<TYPE, 1>::init(); 
-        val_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(TYPE));
-      }
-      /** @brief Allocates the memory for the scalar and sets it to the supplied value. */
-      scalar(TYPE val)
-      {
-        viennacl::linalg::kernels::scalar<TYPE, 1>::init(); 
-        val_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(TYPE), &val);
-      }
-      
-      /** @brief Wraps an existing memory entry into a scalar 
-      *
-      * @param mem    The OpenCL memory handle
-      * @param size   Ignored - Only necessary to avoid ambiguities. Users are advised to set this parameter to '1'.
-      */
-      explicit scalar(cl_mem mem, size_t size) : val_(mem) { val_.inc(); }
-
-      /** @brief Allocates memory for the scalar and sets it to the result of supplied expression. */
-      template <typename T1, typename T2, typename OP>
-      scalar(scalar_expression<T1, T2, OP> const & proxy)
-      {
-        viennacl::linalg::kernels::scalar<TYPE, 1>::init(); 
-        val_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(TYPE));
-        *this = proxy;
-      }
-
-      //copy constructor
-      /** @brief Copy constructor. Allocates new memory for the scalar and copies the value of the supplied scalar */
-      scalar(const scalar & other) : val_(viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(TYPE)))
-      {
-        //copy value:
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), other.handle().get(), val_.get(), 0, 0, sizeof(TYPE), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-
-      /** @brief Reads the value of the scalar from the GPU and returns the float or double value. */
-      operator TYPE() const
-      {
-        TYPE tmp;
-        cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &tmp, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      } 
-      
-      /** @brief Assigns a vector entry. */
-      scalar<TYPE> & operator= (entry_proxy<TYPE> const & other)
-      {
-        //copy value:
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), other.handle().get(), val_.get(), other.index() * sizeof(TYPE), 0, sizeof(TYPE), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        return *this;
-      }
-
-      /** @brief Assigns the value from another scalar. */
-      scalar<TYPE> & operator= (scalar<TYPE> const & other)
-      {
-        //copy value:
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), other.handle().get(), val_.get(), 0, 0, sizeof(TYPE), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-
-      scalar<TYPE> & operator= (float cpu_other)
-      {
-        //copy value:
-        TYPE other = static_cast<TYPE>(cpu_other);
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-
-      scalar<TYPE> & operator= (double cpu_other)
-      {
-        //copy value:
-        TYPE other = static_cast<TYPE>(cpu_other);
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-
-      scalar<TYPE> & operator= (long cpu_other)
-      {
-        //copy value:
-        TYPE other = static_cast<TYPE>(cpu_other);
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-
-      scalar<TYPE> & operator= (unsigned long cpu_other)
-      {
-        //copy value:
-        TYPE other = static_cast<TYPE>(cpu_other);
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-
-      scalar<TYPE> & operator= (int cpu_other)
-      {
-        //copy value:
-        TYPE other = static_cast<TYPE>(cpu_other);
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-
-      scalar<TYPE> & operator= (unsigned int cpu_other)
-      {
-        //copy value:
-        TYPE other = static_cast<TYPE>(cpu_other);
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        
-        return *this;
-      }
-      /** @brief Sets the scalar to the result of supplied inner product expression. */
-      template <typename T1, typename T2>
-      scalar<TYPE> & operator= (scalar_expression<T1, T2, op_inner_prod> const & proxy)
-      {
-        viennacl::linalg::inner_prod_impl(proxy.get_lhs(), proxy.get_rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Sets the scalar to the result of supplied norm_1 expression. */
-      template <typename T1, typename T2>
-      scalar<TYPE> & operator= (scalar_expression<T1, T2, op_norm_1> const & proxy)
-      {
-        viennacl::linalg::norm_1_impl(proxy.get_lhs(), *this);
-        return *this;
-      }
-
-      /** @brief Sets the scalar to the result of supplied norm_2 expression. */
-      template <typename T1, typename T2>
-      scalar<TYPE> & operator= (scalar_expression<T1, T2, op_norm_2> const & proxy)
-      {
-        viennacl::linalg::norm_2_impl(proxy.get_lhs(), *this);
-        return *this;
-      }
-
-      /** @brief Sets the scalar to the result of supplied norm_inf expression. */
-      template <typename T1, typename T2>
-      scalar<TYPE> & operator= (scalar_expression<T1, T2, op_norm_inf> const & proxy)
-      {
-        viennacl::linalg::norm_inf_impl(proxy.get_lhs(), *this);
-        return *this;
-      }
-
-      /** @brief Inplace addition of a ViennaCL scalar */
-      scalar<TYPE> & operator += (scalar<TYPE> const & other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "inplace_add");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-        
-        viennacl::ocl::enqueue(k(val_, other.val_));
-        return *this;
-      }
-      /** @brief Inplace addition of a host scalar (float or double) */
-      scalar<TYPE> & operator += (TYPE other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_inplace_add");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_));        
-        return *this;
-      }
-
-
-      /** @brief Inplace subtraction of a ViennaCL scalar */
-      scalar<TYPE> & operator -= (scalar<TYPE> const & other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "inplace_sub");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-        
-        viennacl::ocl::enqueue(k(val_, other.val_));
-        return *this;
-      }
-      /** @brief Inplace subtraction of a host scalar (float or double) */
-      scalar<TYPE> & operator -= (TYPE other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_inplace_sub");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_));        
-        return *this;
-      }
-
-
-      /** @brief Inplace multiplication with a ViennaCL scalar */
-      scalar<TYPE> & operator *= (scalar<TYPE> const & other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "inplace_mul");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-        
-        viennacl::ocl::enqueue(k(val_, other.val_));
-        return *this;
-      }
-      /** @brief Inplace  multiplication with a host scalar (float or double) */
-      scalar<TYPE> & operator *= (TYPE other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_inplace_mul");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_));        
-        return *this;
-      }
-
-
-      //////////////// operator /=    ////////////////////////////
-      /** @brief Inplace division with a ViennaCL scalar */
-      scalar<TYPE> & operator /= (scalar<TYPE> const & other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "inplace_div");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-        
-        viennacl::ocl::enqueue(k(val_, other.val_));
-        return *this;
-      }
-      /** @brief Inplace division with a host scalar (float or double) */
-      scalar<TYPE> & operator /= (TYPE other)
-      {
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_inplace_div");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_));        
-        return *this;
-      }
-      
-      
-      //////////////// operator + ////////////////////////////
-      /** @brief Addition of two ViennaCL scalars */
-      scalar<TYPE> operator + (scalar<TYPE> const & other)
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "add");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_, result));        
-        return result;
-      }
-      /** @brief Addition of a ViennaCL scalar with a scalar expression */
-      template <typename T1, typename T2, typename OP>
-      scalar<TYPE> operator + (scalar_expression<T1, T2, OP> const & proxy) const
-      {
-        scalar<TYPE> result = proxy;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "add");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, result, result));        
-        return result;
-      }
-      /** @brief Addition of a ViennaCL scalar with a host scalar (float, double) */
-      scalar<TYPE> operator + (TYPE other)
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_add");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other, result));        
-        return result;
-      }
-
-
-      //////////////// operator - ////////////////////////////
-      /** @brief Subtraction of two ViennaCL scalars */
-      scalar<TYPE> operator - (scalar<TYPE> const & other) const
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "sub");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_, result));        
-        return result;
-      }
-      /** @brief Subtraction of a ViennaCL scalar from a scalar expression */
-      template <typename T1, typename T2, typename OP>
-      scalar<TYPE> operator - (scalar_expression<T1, T2, OP> const & proxy) const
-      {
-        scalar<TYPE> result = *this;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "sub");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, result, result));        
-        return result;
-      }
-      /** @brief Subtraction of a host scalar (float, double) from a ViennaCL scalar */
-      scalar<TYPE> operator - (TYPE other) const
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_sub");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other, result));        
-        return result;
-        
-        return result;
-      }
-
-      //////////////// operator * ////////////////////////////
-      /** @brief Multiplication of two ViennaCL scalars */
-      scalar<TYPE> operator * (scalar<TYPE> const & other) const
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "mul");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_, result));        
-        return result;
-      }
-      /** @brief Multiplication of a ViennaCL scalar with a scalar expression */
-      template <typename T1, typename T2, typename OP>
-      scalar<TYPE> operator * (scalar_expression<T1, T2, OP> const & proxy) const
-      {
-        scalar<TYPE> result = proxy;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "mul");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, result, result));        
-        return result;
-      }
-      /** @brief Multiplication of a host scalar (float, double) with a ViennaCL scalar */
-      scalar<TYPE> operator * (TYPE other) const
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_mul");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other, result));        
-        return result;
-      }
-      
-      //////////////// operator /    ////////////////////////////
-      /** @brief Division of two ViennaCL scalars */
-      scalar<TYPE> operator / (scalar<TYPE> const & other) const
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "divide");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other.val_, result));        
-        return result;
-      }
-      /** @brief Division of a ViennaCL scalar by a scalar expression */
-      template <typename T1, typename T2, typename OP>
-      scalar<TYPE> operator / (scalar_expression<T1, T2, OP> const & proxy) const
-      {
-        scalar<TYPE> result = proxy;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "divide");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, result, result));        
-        return result;
-      }
-      /** @brief Division of a ViennaCL scalar by a host scalar (float, double)*/
-      scalar<TYPE> operator / (TYPE other) const
-      {
-        scalar<TYPE> result;
-        //get kernel:
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::scalar<TYPE, 1>::program_name(), "cpu_div");
-        k.local_work_size(0, 1);
-        k.global_work_size(0, 1);
-
-        viennacl::ocl::enqueue(k(val_, other, result));        
-        return result;
-      }
-
-      /** @brief Returns the OpenCL handle */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return val_; }
-      
-    private:
-      viennacl::ocl::handle<cl_mem> val_;
-    };
-    
-    
-    //stream operators:
-    /** @brief Allows to directly print the value of a scalar to an output stream */
-    template<class SCALARTYPE>
-    std::ostream & operator<<(std::ostream & s, const scalar<SCALARTYPE> & val)
-    {
-      SCALARTYPE temp = val;
-      s << temp;
-      return s;
-    }
-
-    /** @brief Allows to directly read a value of a scalar from an input stream */
-    template<class SCALARTYPE>
-    std::istream & operator>>(std::istream & s, const scalar<SCALARTYPE> & val)
-    {
-      SCALARTYPE temp;
-      s >> temp;
-      val = temp;
-      return s;
-    }
-
-} //namespace viennacl
-
-#endif
+#ifndef VIENNACL_SCALAR_HPP_
+#define VIENNACL_SCALAR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/scalar.hpp
+    @brief Implementation of the ViennaCL scalar class
+*/
+
+#include <iostream>
+
+#include "viennacl/forwards.h"
+#include "viennacl/backend/memory.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/scalar_operations.hpp"
+#include "viennacl/traits/handle.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/ocl/backend.hpp"
+#endif
+
+namespace viennacl
+{
+    /** @brief A proxy for scalar expressions (e.g. from inner vector products)
+    *
+    * assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix = 2)
+    * @tparam LHS   The left hand side operand
+    * @tparam RHS   The right hand side operand
+    * @tparam OP    The operation tag
+    */
+    template <typename LHS, typename RHS, typename OP>
+    class scalar_expression
+    {
+        typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
+      public:
+        typedef typename viennacl::result_of::cpu_value_type<DummyType>::type    ScalarType;
+
+        scalar_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+        /** @brief Returns the left hand side operand */
+        LHS & lhs() const { return lhs_; }
+        /** @brief Returns the left hand side operand */
+        RHS & rhs() const { return rhs_; }
+
+        /** @brief Conversion operator to a ViennaCL scalar */
+        operator ScalarType () const
+        {
+          viennacl::scalar<ScalarType> temp;
+          temp = *this;
+          return temp;
+        }
+
+      private:
+        LHS & lhs_;
+        RHS & rhs_;
+    };
+
+
+    /** @brief Specialization of a scalar expression for inner products. Allows for a final reduction on the CPU
+    *
+    * assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix = 2)
+    * @tparam LHS   The left hand side operand
+    * @tparam RHS   The right hand side operand
+    * @tparam OP    The operation tag
+    */
+    template <typename LHS, typename RHS>
+    class scalar_expression<LHS, RHS, op_inner_prod>
+    {
+        //typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
+      public:
+        typedef typename viennacl::result_of::cpu_value_type<LHS>::type    ScalarType;
+
+        scalar_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+        /** @brief Returns the left hand side operand */
+        LHS & lhs() const { return lhs_; }
+        /** @brief Returns the left hand side operand */
+        RHS & rhs() const { return rhs_; }
+
+        /** @brief Conversion operator to a ViennaCL scalar */
+        operator ScalarType () const
+        {
+          ScalarType result;
+          viennacl::linalg::inner_prod_cpu(lhs_, rhs_, result);
+          return result;
+        }
+
+      private:
+        LHS & lhs_;
+        RHS & rhs_;
+    };
+
+
+    /** @brief Specialization of a scalar expression for norm_1. Allows for a final reduction on the CPU
+    *
+    * @tparam LHS   The left hand side operand
+    * @tparam RHS   The right hand side operand
+    */
+    template <typename LHS, typename RHS>
+    class scalar_expression<LHS, RHS, op_norm_1>
+    {
+        //typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
+      public:
+        typedef typename viennacl::result_of::cpu_value_type<LHS>::type    ScalarType;
+
+        scalar_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+        /** @brief Returns the left hand side operand */
+        LHS & lhs() const { return lhs_; }
+        /** @brief Returns the left hand side operand */
+        RHS & rhs() const { return rhs_; }
+
+        /** @brief Conversion operator to a ViennaCL scalar */
+        operator ScalarType () const
+        {
+          ScalarType result;
+          viennacl::linalg::norm_1_cpu(lhs_, result);
+          return result;
+        }
+
+      private:
+        LHS & lhs_;
+        RHS & rhs_;
+    };
+
+    /** @brief Specialization of a scalar expression for norm_2. Allows for a final reduction on the CPU
+    *
+    * @tparam LHS   The left hand side operand
+    * @tparam RHS   The right hand side operand
+    */
+    template <typename LHS, typename RHS>
+    class scalar_expression<LHS, RHS, op_norm_2>
+    {
+        //typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
+      public:
+        typedef typename viennacl::result_of::cpu_value_type<LHS>::type    ScalarType;
+
+        scalar_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+        /** @brief Returns the left hand side operand */
+        LHS & lhs() const { return lhs_; }
+        /** @brief Returns the left hand side operand */
+        RHS & rhs() const { return rhs_; }
+
+        /** @brief Conversion operator to a ViennaCL scalar */
+        operator ScalarType () const
+        {
+          ScalarType result;
+          viennacl::linalg::norm_2_cpu(lhs_, result);
+          return result;
+        }
+
+      private:
+        LHS & lhs_;
+        RHS & rhs_;
+    };
+
+
+    /** @brief Specialization of a scalar expression for norm_inf. Allows for a final reduction on the CPU
+    *
+    * @tparam LHS   The left hand side operand
+    * @tparam RHS   The right hand side operand
+    */
+    template <typename LHS, typename RHS>
+    class scalar_expression<LHS, RHS, op_norm_inf>
+    {
+        //typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
+      public:
+        typedef typename viennacl::result_of::cpu_value_type<LHS>::type    ScalarType;
+
+        scalar_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+        /** @brief Returns the left hand side operand */
+        LHS & lhs() const { return lhs_; }
+        /** @brief Returns the left hand side operand */
+        RHS & rhs() const { return rhs_; }
+
+        /** @brief Conversion operator to a ViennaCL scalar */
+        operator ScalarType () const
+        {
+          ScalarType result;
+          viennacl::linalg::norm_inf_cpu(lhs_, result);
+          return result;
+        }
+
+      private:
+        LHS & lhs_;
+        RHS & rhs_;
+    };
+
+    /** @brief Specialization of a scalar expression for norm_frobenius. Allows for a final reduction on the CPU
+    *
+    * @tparam LHS   The left hand side operand
+    * @tparam RHS   The right hand side operand
+    */
+    template <typename LHS, typename RHS>
+    class scalar_expression<LHS, RHS, op_norm_frobenius>
+    {
+        //typedef typename LHS::value_type          DummyType; //Visual C++ 2005 does not allow to write LHS::value_type::value_type
+      public:
+        typedef typename viennacl::result_of::cpu_value_type<LHS>::type    ScalarType;
+
+        scalar_expression(LHS & lhs, RHS & rhs) : lhs_(lhs), rhs_(rhs) {}
+
+        /** @brief Returns the left hand side operand */
+        LHS & lhs() const { return lhs_; }
+        /** @brief Returns the left hand side operand */
+        RHS & rhs() const { return rhs_; }
+
+        /** @brief Conversion operator to a ViennaCL scalar */
+        operator ScalarType () const
+        {
+          ScalarType result;
+          viennacl::linalg::norm_frobenius_cpu(lhs_, result);
+          return result;
+        }
+
+      private:
+        LHS & lhs_;
+        RHS & rhs_;
+    };
+
+
+
+
+    /** @brief This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type like float or double.
+    *
+    * Since every read and write operation requires a CPU->GPU or GPU->CPU transfer, this type should be used with care.
+    * The advantage of this type is that the GPU command queue can be filled without blocking read operations.
+    *
+    * @tparam SCALARTYPE  Either float or double. Checked at compile time.
+    */
+    template<class SCALARTYPE>
+    class scalar
+    {
+      typedef scalar<SCALARTYPE>         self_type;
+    public:
+      typedef viennacl::backend::mem_handle                     handle_type;
+      typedef vcl_size_t                                        size_type;
+
+      /** @brief Returns the underlying host scalar type. */
+      typedef SCALARTYPE   value_type;
+
+      /** @brief Creates the scalar object, but does not yet allocate memory. Thus, scalar<> can also be a global variable (if really necessary). */
+      scalar() {}
+
+      /** @brief Allocates the memory for the scalar and sets it to the supplied value. */
+      scalar(SCALARTYPE val, viennacl::context ctx = viennacl::context())
+      {
+        viennacl::backend::memory_create(val_, sizeof(SCALARTYPE), ctx, &val);
+      }
+
+#ifdef VIENNACL_WITH_OPENCL
+      /** @brief Wraps an existing memory entry into a scalar
+      *
+      * @param mem    The OpenCL memory handle
+      * @param size   Ignored - Only necessary to avoid ambiguities. Users are advised to set this parameter to '1'.
+      */
+      explicit scalar(cl_mem mem, size_type /*size*/)
+      {
+        val_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+        val_.opencl_handle() = mem;
+        val_.opencl_handle().inc();  //prevents that the user-provided memory is deleted once the vector object is destroyed.
+      }
+#endif
+
+      /** @brief Allocates memory for the scalar and sets it to the result of supplied expression. */
+      template <typename T1, typename T2, typename OP>
+      scalar(scalar_expression<T1, T2, OP> const & proxy)
+      {
+        val_.switch_active_handle_id(viennacl::traits::handle(proxy.lhs()).get_active_handle_id());
+        viennacl::backend::memory_create(val_, sizeof(SCALARTYPE), viennacl::traits::context(proxy));
+        *this = proxy;
+      }
+
+      //copy constructor
+      /** @brief Copy constructor. Allocates new memory for the scalar and copies the value of the supplied scalar */
+      scalar(const scalar & other)
+      {
+        if (other.handle().get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED)
+        {
+          //copy value:
+          val_.switch_active_handle_id(other.handle().get_active_handle_id());
+          viennacl::backend::memory_create(val_, sizeof(SCALARTYPE), viennacl::traits::context(other));
+          viennacl::backend::memory_copy(other.handle(), val_, 0, 0, sizeof(SCALARTYPE));
+        }
+      }
+
      /** @brief Reads the value of the scalar from the GPU and returns the float or double value.
      *
      * Triggers a device-to-host transfer of sizeof(SCALARTYPE) bytes.
      */
      operator SCALARTYPE() const
      {
        // make sure the scalar contains reasonable data:
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized, cannot read!"));

        SCALARTYPE tmp;
        viennacl::backend::memory_read(val_, 0, sizeof(SCALARTYPE), &tmp);
        return tmp;
      }
+
      /** @brief Assigns a vector entry. The value is copied between the two buffers via the memory backend. */
      self_type & operator= (entry_proxy<SCALARTYPE> const & other)
      {
        init_if_necessary(viennacl::traits::context(other));  // lazily allocate on first assignment
        // copy a single element, starting at the proxied entry's byte offset:
        viennacl::backend::memory_copy(other.handle(), val_, other.index() * sizeof(SCALARTYPE), 0, sizeof(SCALARTYPE));
        return *this;
      }

      /** @brief Assigns the value from another scalar. */
      self_type & operator= (scalar<SCALARTYPE> const & other)
      {
        init_if_necessary(viennacl::traits::context(other));  // lazily allocate on first assignment
        viennacl::backend::memory_copy(other.handle(), val_, 0, 0, sizeof(SCALARTYPE));
        return *this;
      }
+
      /** @brief Assigns a host float value. Allocates in the default context if the scalar is uninitialized. */
      self_type & operator= (float cpu_other)
      {
        init_if_necessary(viennacl::context());

        //copy value:
        SCALARTYPE value = static_cast<SCALARTYPE>(cpu_other);
        viennacl::backend::memory_write(val_, 0, sizeof(SCALARTYPE), &value);
        return *this;
      }

      /** @brief Assigns a host double value. Allocates in the default context if the scalar is uninitialized. */
      self_type & operator= (double cpu_other)
      {
        init_if_necessary(viennacl::context());

        SCALARTYPE value = static_cast<SCALARTYPE>(cpu_other);
        viennacl::backend::memory_write(val_, 0, sizeof(SCALARTYPE), &value);
        return *this;
      }

      /** @brief Assigns a host long value. Allocates in the default context if the scalar is uninitialized. */
      self_type & operator= (long cpu_other)
      {
        init_if_necessary(viennacl::context());

        SCALARTYPE value = static_cast<SCALARTYPE>(cpu_other);
        viennacl::backend::memory_write(val_, 0, sizeof(SCALARTYPE), &value);
        return *this;
      }

      /** @brief Assigns a host unsigned long value. Allocates in the default context if the scalar is uninitialized. */
      self_type & operator= (unsigned long cpu_other)
      {
        init_if_necessary(viennacl::context());

        SCALARTYPE value = static_cast<SCALARTYPE>(cpu_other);
        viennacl::backend::memory_write(val_, 0, sizeof(SCALARTYPE), &value);
        return *this;
      }

      /** @brief Assigns a host int value. Allocates in the default context if the scalar is uninitialized. */
      self_type & operator= (int cpu_other)
      {
        init_if_necessary(viennacl::context());

        SCALARTYPE value = static_cast<SCALARTYPE>(cpu_other);
        viennacl::backend::memory_write(val_, 0, sizeof(SCALARTYPE), &value);
        return *this;
      }

      /** @brief Assigns a host unsigned int value. Allocates in the default context if the scalar is uninitialized. */
      self_type & operator= (unsigned int cpu_other)
      {
        init_if_necessary(viennacl::context());

        SCALARTYPE value = static_cast<SCALARTYPE>(cpu_other);
        viennacl::backend::memory_write(val_, 0, sizeof(SCALARTYPE), &value);
        return *this;
      }
+
      /** @brief Sets the scalar to the result of supplied inner product expression. */
      template <typename T1, typename T2>
      self_type & operator= (scalar_expression<T1, T2, op_inner_prod> const & proxy)
      {
        init_if_necessary(viennacl::traits::context(proxy));  // lazily allocate in the expression's context

        viennacl::linalg::inner_prod_impl(proxy.lhs(), proxy.rhs(), *this);
        return *this;
      }

      /** @brief Sets the scalar to the result of supplied norm_1 expression. */
      template <typename T1, typename T2>
      self_type & operator= (scalar_expression<T1, T2, op_norm_1> const & proxy)
      {
        init_if_necessary(viennacl::traits::context(proxy));

        viennacl::linalg::norm_1_impl(proxy.lhs(), *this);
        return *this;
      }

      /** @brief Sets the scalar to the result of supplied norm_2 expression. */
      template <typename T1, typename T2>
      self_type & operator= (scalar_expression<T1, T2, op_norm_2> const & proxy)
      {
        init_if_necessary(viennacl::traits::context(proxy));

        viennacl::linalg::norm_2_impl(proxy.lhs(), *this);
        return *this;
      }

      /** @brief Sets the scalar to the result of supplied norm_inf expression. */
      template <typename T1, typename T2>
      self_type & operator= (scalar_expression<T1, T2, op_norm_inf> const & proxy)
      {
        init_if_necessary(viennacl::traits::context(proxy));

        viennacl::linalg::norm_inf_impl(proxy.lhs(), *this);
        return *this;
      }

      /** @brief Sets the scalar to the result of supplied norm_frobenius expression. */
      template <typename T1, typename T2>
      self_type & operator= (scalar_expression<T1, T2, op_norm_frobenius> const & proxy)
      {
        init_if_necessary(viennacl::traits::context(proxy));

        viennacl::linalg::norm_frobenius_impl(proxy.lhs(), *this);
        return *this;
      }

      /** @brief Sets the scalar to the inverse with respect to addition of the supplied sub-expression */
      template <typename T1, typename T2>
      self_type & operator= (scalar_expression<T1, T2, op_flip_sign> const & proxy)
      {
        init_if_necessary(viennacl::traits::context(proxy));

        // scale the sub-expression's LHS by -1 into *this:
        viennacl::linalg::as(*this, proxy.lhs(), SCALARTYPE(-1.0), 1, false, true);
        return *this;
      }
+
+
      /** @brief Inplace addition of a ViennaCL scalar */
      self_type & operator += (scalar<SCALARTYPE> const & other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        // asbs computes s1 = alpha * s2 + beta * s3; alpha = beta = 1 yields s1 += s2
        viennacl::linalg::asbs(*this,                                       // s1 =
                               *this, SCALARTYPE(1.0), 1, false, false,     //       s1 * 1.0
                               other, SCALARTYPE(1.0), 1, false, false);    //     + s2 * 1.0
        return *this;
      }
      /** @brief Inplace addition of a host scalar (float or double) */
      self_type & operator += (SCALARTYPE other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        viennacl::linalg::asbs(*this,                                       // s1 =
                               *this, SCALARTYPE(1.0), 1, false, false,     //       s1 * 1.0
                               other, SCALARTYPE(1.0), 1, false, false);    //     + s2 * 1.0
        return *this;
      }
+
+
      /** @brief Inplace subtraction of a ViennaCL scalar */
      self_type & operator -= (scalar<SCALARTYPE> const & other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        // subtraction expressed as addition with coefficient -1 on the second operand
        viennacl::linalg::asbs(*this,                                       // s1 =
                               *this, SCALARTYPE(1.0), 1, false, false,     //       s1 * 1.0
                               other, SCALARTYPE(-1.0), 1, false, false);   //     + s2 * (-1.0)
        return *this;
      }
      /** @brief Inplace subtraction of a host scalar (float or double) */
      self_type & operator -= (SCALARTYPE other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        viennacl::linalg::asbs(*this,                                       // s1 =
                               *this, SCALARTYPE(1.0), 1, false, false,     //       s1 * 1.0
                               other, SCALARTYPE(-1.0), 1, false, false);   //     + s2 * (-1.0)
        return *this;
      }
+
+
      /** @brief Inplace multiplication with a ViennaCL scalar */
      self_type & operator *= (scalar<SCALARTYPE> const & other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        // as computes s1 = s2 * alpha (here: s1 = s1 * other)
        viennacl::linalg::as(*this,                                       // s1 =
                             *this, other, 1, false, false);              //      s1 * s2
        return *this;
      }
      /** @brief Inplace  multiplication with a host scalar (float or double) */
      self_type & operator *= (SCALARTYPE other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        viennacl::linalg::as(*this,                                       // s1 =
                             *this, other, 1, false, false);              //      s1 * s2
        return *this;
      }
+
+
      //////////////// operator /=    ////////////////////////////
      /** @brief Inplace division with a ViennaCL scalar */
      self_type & operator /= (scalar<SCALARTYPE> const & other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        // the 'true' flag requests division by the coefficient instead of multiplication
        viennacl::linalg::as(*this,                                       // s1 =
                             *this, other, 1, true, false);              //      s1 / s2
        return *this;
      }
      /** @brief Inplace division with a host scalar (float or double) */
      self_type & operator /= (SCALARTYPE other)
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        viennacl::linalg::as(*this,                                       // s1 =
                             *this, other, 1, true, false);              //      s1 / s2
        return *this;
      }
+
+
+      //////////////// operator + ////////////////////////////
+      /** @brief Addition of two ViennaCL scalars */
+      self_type operator + (scalar<SCALARTYPE> const & other)
+      {
+        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));
+
+        self_type result = 0;
+
+        viennacl::linalg::asbs(result,                                       // result =
+                               *this, SCALARTYPE(1.0), 1, false, false,      //            *this * 1.0
+                               other, SCALARTYPE(1.0), 1, false, false);     //          + other * 1.0
+
+        return result;
+      }
+      /** @brief Addition of a ViennaCL scalar with a scalar expression */
+      template <typename T1, typename T2, typename OP>
+      self_type operator + (scalar_expression<T1, T2, OP> const & proxy) const
+      {
+        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));
+
+        self_type result = proxy;
+
+        viennacl::linalg::asbs(result,                                       // result =
+                               *this, SCALARTYPE(1.0), 1, false, false,      //            *this * 1.0
+                               result, SCALARTYPE(1.0), 1, false, false);     //        + result * 1.0
+
+        return result;
+      }
+      /** @brief Addition of a ViennaCL scalar with a host scalar (float, double) */
+      self_type operator + (SCALARTYPE other)
+      {
+        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));
+
+        self_type result = 0;
+
+        viennacl::linalg::asbs(result,                                       // result =
+                               *this, SCALARTYPE(1.0), 1, false, false,      //            *this * 1.0
+                               other, SCALARTYPE(1.0), 1, false, false);     //          + other * 1.0
+
+        return result;
+      }
+
+
      //////////////// operator - ////////////////////////////

      /** @brief Sign flip of the scalar. Does not evaluate immediately, but instead returns an expression template object
      *
      * The returned proxy is consumed by the operator=(scalar_expression<..., op_flip_sign>) overload.
      */
      scalar_expression<const self_type, const self_type, op_flip_sign> operator-() const
      {
        return scalar_expression<const self_type, const self_type, op_flip_sign>(*this, *this);
      }
+
+
      /** @brief Subtraction of two ViennaCL scalars */
      self_type operator - (scalar<SCALARTYPE> const & other) const
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        self_type result = 0;

        viennacl::linalg::asbs(result,                                       // result =
                               *this, SCALARTYPE(1.0), 1, false, false,      //            *this * 1.0
                               other, SCALARTYPE(-1.0), 1, false, false);    //          + other * (-1.0)

        return result;
      }
      /** @brief Subtraction of a ViennaCL scalar from a scalar expression */
      template <typename T1, typename T2, typename OP>
      self_type operator - (scalar_expression<T1, T2, OP> const & proxy) const
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        self_type result = proxy;   // evaluate the expression first, then form *this - result

        viennacl::linalg::asbs(result,                                       // result =
                                *this, SCALARTYPE(1.0), 1 , false, false,    //            *this * 1.0
                               result, SCALARTYPE(-1.0), 1, false, false);  //          + result * (-1.0)

        return result;
      }
      /** @brief Subtraction of a host scalar (float, double) from a ViennaCL scalar */
      scalar<SCALARTYPE> operator - (SCALARTYPE other) const
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        self_type result = 0;

        viennacl::linalg::asbs(result,                                       // result =
                               *this, SCALARTYPE(1.0), 1, false, false,      //            *this * 1.0
                               other, SCALARTYPE(-1.0), 1, false, false);    //          + other * (-1.0)

        return result;
      }
+
+      //////////////// operator * ////////////////////////////
+      /** @brief Multiplication of two ViennaCL scalars */
+      self_type operator * (scalar<SCALARTYPE> const & other) const
+      {
+        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));
+
+        scalar<SCALARTYPE> result = 0;
+
+        viennacl::linalg::as(result,                                     // result =
+                             *this, other, 1, false, false);              //          *this * other
+
+        return result;
+      }
+      /** @brief Multiplication of a ViennaCL scalar with a scalar expression */
+      template <typename T1, typename T2, typename OP>
+      self_type operator * (scalar_expression<T1, T2, OP> const & proxy) const
+      {
+        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));
+
+        self_type result = proxy;
+
+        viennacl::linalg::as(result,                                       // result =
+                             *this, result, 1, false, false);              //            *this * proxy
+
+        return result;
+      }
+      /** @brief Multiplication of a host scalar (float, double) with a ViennaCL scalar */
+      self_type operator * (SCALARTYPE other) const
+      {
+        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));
+
+        scalar<SCALARTYPE> result = 0;
+
+        viennacl::linalg::as(result,                                     // result =
+                             *this, other, 1, false, false);              //          *this * other
+
+        return result;
+      }
+
      //////////////// operator /    ////////////////////////////
      /** @brief Division of two ViennaCL scalars */
      self_type operator / (scalar<SCALARTYPE> const & other) const
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        self_type result = 0;

        // the 'true' flag requests division by the coefficient (cf. operator/=)
        viennacl::linalg::as(result,                                     // result =
                             *this, other, 1, true, false);              //           *this / other

        return result;
      }
      /** @brief Division of a ViennaCL scalar by a scalar expression */
      template <typename T1, typename T2, typename OP>
      self_type operator / (scalar_expression<T1, T2, OP> const & proxy) const
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        self_type result = proxy;   // evaluate the expression first, then form *this / result

        viennacl::linalg::as(result,                                     // result =
                             *this, result, 1, true, false);              //          *this / proxy

        return result;
      }
      /** @brief Division of a ViennaCL scalar by a host scalar (float, double)*/
      self_type operator / (SCALARTYPE other) const
      {
        assert( val_.get_active_handle_id() != viennacl::MEMORY_NOT_INITIALIZED && bool("Scalar not initialized!"));

        self_type result = 0;

        viennacl::linalg::as(result,                                     // result =
                             *this, other, 1, true, false);              //            *this / other

        return result;
      }
+
      /** @brief Returns the memory handle, non-const version */
      handle_type & handle() { return val_; }

      /** @brief Returns the memory handle, const version */
      const handle_type & handle() const { return val_; }

    private:

      /** @brief Allocates the backing memory in 'ctx' if the scalar has not been initialized yet; no-op otherwise. */
      void init_if_necessary(viennacl::context ctx)
      {
        if (val_.get_active_handle_id() == viennacl::MEMORY_NOT_INITIALIZED)
        {
          viennacl::backend::memory_create(val_, sizeof(SCALARTYPE), ctx);
        }
      }

      handle_type val_;  // memory handle holding the single scalar value
+    };
+
+
+    //stream operators:
+    /** @brief Allows to directly print the value of a scalar to an output stream */
+    template<class SCALARTYPE>
+    std::ostream & operator<<(std::ostream & s, const scalar<SCALARTYPE> & val)
+    {
+      SCALARTYPE temp = val;
+      s << temp;
+      return s;
+    }
+
+    /** @brief Allows to directly read a value of a scalar from an input stream */
+    template<class SCALARTYPE>
+    std::istream & operator>>(std::istream & s, const scalar<SCALARTYPE> & val)
+    {
+      SCALARTYPE temp;
+      s >> temp;
+      val = temp;
+      return s;
+    }
+
+} //namespace viennacl
+
+#endif
diff --git a/viennacl/scheduler/execute.hpp b/viennacl/scheduler/execute.hpp
new file mode 100644
index 0000000..c96066b
--- /dev/null
+++ b/viennacl/scheduler/execute.hpp
@@ -0,0 +1,247 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute.hpp
+    @brief Provides the datastructures for dealing with a single statement such as 'x = y + z;'
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/scheduler/execute_scalar_assign.hpp"
+#include "viennacl/scheduler/execute_axbx.hpp"
+#include "viennacl/scheduler/execute_elementwise.hpp"
+#include "viennacl/scheduler/execute_matrix_prod.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      /** @brief Deals with x = RHS where RHS is an expression and x is either a scalar, a vector, or a matrix */
+      void execute_composite(statement const & s, statement_node const & root_node)
+      {
+        statement::container_type const & expr = s.array();
+
+        statement_node const & leaf = expr[root_node.rhs.node_index];
+
+        if (leaf.op.type  == OPERATION_BINARY_ADD_TYPE || leaf.op.type  == OPERATION_BINARY_SUB_TYPE) // x = (y) +- (z)  where y and z are either data objects or expressions
+        {
+          execute_axbx(s, root_node);
+        }
+        else if (leaf.op.type == OPERATION_BINARY_MULT_TYPE || leaf.op.type == OPERATION_BINARY_DIV_TYPE) // x = (y) * / alpha;
+        {
+          bool scalar_is_temporary = (leaf.rhs.type_family != SCALAR_TYPE_FAMILY);
+
+          statement_node scalar_temp_node;
+          if (scalar_is_temporary)
+          {
+            lhs_rhs_element temp;
+            temp.type_family  = SCALAR_TYPE_FAMILY;
+            temp.subtype      = DEVICE_SCALAR_TYPE;
+            temp.numeric_type = root_node.lhs.numeric_type;
+            detail::new_element(scalar_temp_node.lhs, temp);
+
+            scalar_temp_node.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+            scalar_temp_node.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+            scalar_temp_node.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+            scalar_temp_node.rhs.subtype      = INVALID_SUBTYPE;
+            scalar_temp_node.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+            scalar_temp_node.rhs.node_index   = leaf.rhs.node_index;
+
+            // work on subexpression:
+            // TODO: Catch exception, free temporary, then rethrow
+            execute_composite(s, scalar_temp_node);
+          }
+
+          if (leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY)  //(y) is an expression, so introduce a temporary z = (y):
+          {
+            statement_node new_root_y;
+
+            new_root_y.lhs.type_family  = root_node.lhs.type_family;
+            new_root_y.lhs.subtype      = root_node.lhs.subtype;
+            new_root_y.lhs.numeric_type = root_node.lhs.numeric_type;
+            detail::new_element(new_root_y.lhs, root_node.lhs);
+
+            new_root_y.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+            new_root_y.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+            new_root_y.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+            new_root_y.rhs.subtype      = INVALID_SUBTYPE;
+            new_root_y.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+            new_root_y.rhs.node_index   = leaf.lhs.node_index;
+
+            // work on subexpression:
+            // TODO: Catch exception, free temporary, then rethrow
+            execute_composite(s, new_root_y);
+
+            // now compute x = z * / alpha:
+            lhs_rhs_element u = root_node.lhs;
+            lhs_rhs_element v = new_root_y.lhs;
+            lhs_rhs_element alpha = scalar_is_temporary ? scalar_temp_node.lhs : leaf.rhs;
+
+            bool is_division = (leaf.op.type  == OPERATION_BINARY_DIV_TYPE);
+            switch (root_node.op.type)
+            {
+              case OPERATION_BINARY_ASSIGN_TYPE:
+                detail::ax(u,
+                           v, alpha, 1, is_division, false);
+                break;
+              case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                detail::axbx(u,
+                             u,   1.0, 1, false,       false,
+                             v, alpha, 1, is_division, false);
+                break;
+              case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                detail::axbx(u,
+                             u,   1.0, 1, false,       false,
+                             v, alpha, 1, is_division, true);
+                break;
+              default:
+                throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+            }
+
+            detail::delete_element(new_root_y.lhs);
+          }
+          else if (leaf.lhs.type_family != COMPOSITE_OPERATION_FAMILY)
+          {
+            lhs_rhs_element u = root_node.lhs;
+            lhs_rhs_element v = leaf.lhs;
+            lhs_rhs_element alpha = scalar_is_temporary ? scalar_temp_node.lhs : leaf.rhs;
+
+            bool is_division = (leaf.op.type  == OPERATION_BINARY_DIV_TYPE);
+            switch (root_node.op.type)
+            {
+              case OPERATION_BINARY_ASSIGN_TYPE:
+                detail::ax(u,
+                           v, alpha, 1, is_division, false);
+                break;
+              case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                detail::axbx(u,
+                             u,   1.0, 1, false,       false,
+                             v, alpha, 1, is_division, false);
+                break;
+              case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                detail::axbx(u,
+                             u,   1.0, 1, false,       false,
+                             v, alpha, 1, is_division, true);
+                break;
+              default:
+                throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Unsupported binary operator for OPERATION_BINARY_MULT_TYPE || OPERATION_BINARY_DIV_TYPE on leaf node.");
+
+          // clean up
+          if (scalar_is_temporary)
+            detail::delete_element(scalar_temp_node.lhs);
+        }
+        else if (   leaf.op.type == OPERATION_BINARY_INNER_PROD_TYPE
+                 || leaf.op.type == OPERATION_UNARY_NORM_1_TYPE
+                 || leaf.op.type == OPERATION_UNARY_NORM_2_TYPE
+                 || leaf.op.type == OPERATION_UNARY_NORM_INF_TYPE)
+        {
+          execute_scalar_assign_composite(s, root_node);
+        }
+        else if (   (leaf.op.type_family == OPERATION_UNARY_TYPE_FAMILY && leaf.op.type != OPERATION_UNARY_TRANS_TYPE)
+                 || leaf.op.type == OPERATION_BINARY_ELEMENT_PROD_TYPE
+                 || leaf.op.type == OPERATION_BINARY_ELEMENT_DIV_TYPE) // element-wise operations
+        {
+          execute_element_composite(s, root_node);
+        }
+        else if (   leaf.op.type == OPERATION_BINARY_MAT_VEC_PROD_TYPE
+                 || leaf.op.type == OPERATION_BINARY_MAT_MAT_PROD_TYPE)
+        {
+          execute_matrix_prod(s, root_node);
+        }
+        else
+          throw statement_not_supported_exception("Unsupported binary operator");
+      }
+
+
+      /** @brief Deals with x = y  for a scalar/vector/matrix x, y */
+      inline void execute_single(statement const &, statement_node const & root_node)
+      {
+        lhs_rhs_element u = root_node.lhs;
+        lhs_rhs_element v = root_node.rhs;
+        switch (root_node.op.type)
+        {
+          case OPERATION_BINARY_ASSIGN_TYPE:
+            detail::ax(u,
+                       v, 1.0, 1, false, false);
+            break;
+          case OPERATION_BINARY_INPLACE_ADD_TYPE:
+            detail::axbx(u,
+                         u, 1.0, 1, false, false,
+                         v, 1.0, 1, false, false);
+            break;
+          case OPERATION_BINARY_INPLACE_SUB_TYPE:
+            detail::axbx(u,
+                         u, 1.0, 1, false, false,
+                         v, 1.0, 1, false, true);
+            break;
+          default:
+            throw statement_not_supported_exception("Unsupported binary operator for operation in root note (should be =, +=, or -=)");
+        }
+
+      }
+
+
+      inline void execute_impl(statement const & s, statement_node const & root_node)
+      {
+        if (   root_node.lhs.type_family != SCALAR_TYPE_FAMILY
+            && root_node.lhs.type_family != VECTOR_TYPE_FAMILY
+            && root_node.lhs.type_family != MATRIX_TYPE_FAMILY)
+          throw statement_not_supported_exception("Unsupported lvalue encountered in head node.");
+
+        switch (root_node.rhs.type_family)
+        {
+          case COMPOSITE_OPERATION_FAMILY:
+            execute_composite(s, root_node);
+            break;
+          case SCALAR_TYPE_FAMILY:
+          case VECTOR_TYPE_FAMILY:
+          case MATRIX_TYPE_FAMILY:
+            execute_single(s, root_node);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid rvalue encountered in vector assignment");
+        }
+
+      }
+    }
+
    /** @brief Executes the supplied statement by recursively processing its expression tree, starting at the root node. */
    inline void execute(statement const & s)
    {
      // simply start execution from the root node:
      detail::execute_impl(s, s.array()[s.root()]);
    }
+
+
+  }
+
+} //namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_axbx.hpp b/viennacl/scheduler/execute_axbx.hpp
new file mode 100644
index 0000000..e590a0e
--- /dev/null
+++ b/viennacl/scheduler/execute_axbx.hpp
@@ -0,0 +1,379 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_AXBX_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_AXBX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_axbx.hpp
+    @brief Provides the data structures for dealing with statements of the type x = (y) +- (z)
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/scheduler/execute_scalar_assign.hpp"
+#include "viennacl/scheduler/execute_generic_dispatcher.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      /** @brief Deals with x = (y) +- (z)  where y and z are either data objects or expressions */
+      inline void execute_axbx(statement const & s, statement_node const & root_node)
+      {
+        statement::container_type const & expr = s.array();
+
+        statement_node const & leaf = expr[root_node.rhs.node_index];
+
+        if (leaf.op.type  == OPERATION_BINARY_ADD_TYPE || leaf.op.type  == OPERATION_BINARY_SUB_TYPE) // x = (y) +- (z)  where y and z are either data objects or expressions
+        {
+          bool flip_sign_z = (leaf.op.type  == OPERATION_BINARY_SUB_TYPE);
+
+          if (   leaf.lhs.type_family != COMPOSITE_OPERATION_FAMILY
+              && leaf.rhs.type_family != COMPOSITE_OPERATION_FAMILY)
+          {
+            lhs_rhs_element u = root_node.lhs;
+            lhs_rhs_element v = leaf.lhs;
+            lhs_rhs_element w = leaf.rhs;
+            switch (root_node.op.type)
+            {
+              case OPERATION_BINARY_ASSIGN_TYPE:
+                detail::axbx(u,
+                             v, 1.0, 1, false, false,
+                             w, 1.0, 1, false, flip_sign_z);
+                break;
+              case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                detail::axbx_x(u,
+                               v, 1.0, 1, false, false,
+                               w, 1.0, 1, false, flip_sign_z);
+                break;
+              case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                detail::axbx_x(u,
+                               v, 1.0, 1, false, true,
+                               w, 1.0, 1, false, !flip_sign_z);
+                break;
+              default:
+                throw statement_not_supported_exception("Unsupported binary operator for operation in root note (should be =, +=, or -=)");
+            }
+          }
+          else if (  leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY
+                  && leaf.rhs.type_family != COMPOSITE_OPERATION_FAMILY) // x = (y) + z, y being a subtree itself, z being a scalar, vector, or matrix
+          {
+            statement_node const & y = expr[leaf.lhs.node_index];
+
+            if (y.op.type_family == OPERATION_BINARY_TYPE_FAMILY)
+            {
+              // y might be  'v * alpha' or 'v / alpha' with {scalar|vector|matrix} v
+              if (   (y.op.type == OPERATION_BINARY_MULT_TYPE || y.op.type == OPERATION_BINARY_DIV_TYPE)
+                  &&  y.lhs.type_family != COMPOSITE_OPERATION_FAMILY
+                  &&  y.rhs.type_family == SCALAR_TYPE_FAMILY)
+              {
+                lhs_rhs_element u = root_node.lhs;
+                lhs_rhs_element v = y.lhs;
+                lhs_rhs_element w = leaf.rhs;
+                lhs_rhs_element alpha = y.rhs;
+
+                bool is_division = (y.op.type == OPERATION_BINARY_DIV_TYPE);
+                switch (root_node.op.type)
+                {
+                  case OPERATION_BINARY_ASSIGN_TYPE:
+                    detail::axbx(u,
+                                 v, alpha, 1, is_division, false,
+                                 w,   1.0, 1, false,       flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                    detail::axbx_x(u,
+                                   v, alpha, 1, is_division, false,
+                                   w,   1.0, 1, false,       flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                    detail::axbx_x(u,
+                                   v, alpha, 1, is_division, true,
+                                   w,   1.0, 1, false,       !flip_sign_z);
+                    break;
+                  default:
+                    throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+                }
+              }
+              else // no built-in kernel, we use a temporary.
+              {
+                statement_node new_root_y;
+
+                detail::new_element(new_root_y.lhs, root_node.lhs);
+
+                new_root_y.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+                new_root_y.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+                new_root_y.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+                new_root_y.rhs.subtype      = INVALID_SUBTYPE;
+                new_root_y.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+                new_root_y.rhs.node_index   = leaf.lhs.node_index;
+
+                // work on subexpression:
+                // TODO: Catch exception, free temporary, then rethrow
+                execute_composite(s, new_root_y);
+
+                // now add:
+                lhs_rhs_element u = root_node.lhs;
+                lhs_rhs_element v = new_root_y.lhs;
+                lhs_rhs_element w = leaf.rhs;
+                switch (root_node.op.type)
+                {
+                  case OPERATION_BINARY_ASSIGN_TYPE:
+                    detail::axbx(u,
+                                 v, 1.0, 1, false, false,
+                                 w, 1.0, 1, false, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                    detail::axbx_x(u,
+                                   v, 1.0, 1, false, false,
+                                   w, 1.0, 1, false, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                    detail::axbx_x(u,
+                                   v, 1.0, 1, false, true,
+                                   w, 1.0, 1, false, !flip_sign_z);
+                    break;
+                  default:
+                    throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+                }
+
+                detail::delete_element(new_root_y.lhs);
+              }
+            }
+            else
+              throw statement_not_supported_exception("Cannot deal with unary operations on vectors");
+
+          }
+          else if (  leaf.lhs.type_family != COMPOSITE_OPERATION_FAMILY
+                  && leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY) // x = y + (z), y being vector, z being a subtree itself
+          {
+            statement_node const & z = expr[leaf.rhs.node_index];
+
+            if (z.op.type_family == OPERATION_BINARY_TYPE_FAMILY)
+            {
+              // z might be  'v * alpha' or 'v / alpha' with vector v
+              if (   (z.op.type == OPERATION_BINARY_MULT_TYPE || z.op.type == OPERATION_BINARY_DIV_TYPE)
+                  &&  z.lhs.type_family != COMPOSITE_OPERATION_FAMILY
+                  &&  z.rhs.type_family == SCALAR_TYPE_FAMILY)
+              {
+                lhs_rhs_element u = root_node.lhs;
+                lhs_rhs_element v = leaf.lhs;
+                lhs_rhs_element w = z.lhs;
+                lhs_rhs_element beta = z.rhs;
+
+                bool is_division = (z.op.type == OPERATION_BINARY_DIV_TYPE);
+                switch (root_node.op.type)
+                {
+                  case OPERATION_BINARY_ASSIGN_TYPE:
+                    detail::axbx(u,
+                                 v,  1.0, 1, false, false,
+                                 w, beta, 1, is_division, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                    detail::axbx_x(u,
+                                   v,  1.0, 1, false, false,
+                                   w, beta, 1, is_division, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                    detail::axbx_x(u,
+                                   v,  1.0, 1, false, true,
+                                   w, beta, 1, is_division, !flip_sign_z);
+                    break;
+                  default:
+                    throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+                }
+              }
+              else // no built-in kernel, we use a temporary.
+              {
+                statement_node new_root_z;
+
+                detail::new_element(new_root_z.lhs, root_node.lhs);
+
+                new_root_z.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+                new_root_z.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+                new_root_z.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+                new_root_z.rhs.subtype      = INVALID_SUBTYPE;
+                new_root_z.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+                new_root_z.rhs.node_index   = leaf.rhs.node_index;
+
+                // work on subexpression:
+                // TODO: Catch exception, free temporary, then rethrow
+                execute_composite(s, new_root_z);
+
+                // now add:
+                lhs_rhs_element u = root_node.lhs;
+                lhs_rhs_element v = leaf.lhs;
+                lhs_rhs_element w = new_root_z.lhs;
+                switch (root_node.op.type)
+                {
+                  case OPERATION_BINARY_ASSIGN_TYPE:
+                    detail::axbx(u,
+                                 v, 1.0, 1, false, false,
+                                 w, 1.0, 1, false, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                    detail::axbx_x(u,
+                                   v, 1.0, 1, false, false,
+                                   w, 1.0, 1, false, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                    detail::axbx_x(u,
+                                   v, 1.0, 1, false, true,
+                                   w, 1.0, 1, false, !flip_sign_z);
+                    break;
+                  default:
+                    throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+                }
+
+                detail::delete_element(new_root_z.lhs);
+              }
+            }
+            else
+              throw statement_not_supported_exception("Cannot deal with unary operations on vectors");
+
+          }
+          else if (  leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY
+                  && leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY) // x = (y) + (z), y and z being subtrees
+          {
+            statement_node const & y = expr[leaf.lhs.node_index];
+            statement_node const & z = expr[leaf.rhs.node_index];
+
+            if (   y.op.type_family == OPERATION_BINARY_TYPE_FAMILY
+                && z.op.type_family == OPERATION_BINARY_TYPE_FAMILY)
+            {
+              // z might be  'v * alpha' or 'v / alpha' with vector v
+              if (   (y.op.type == OPERATION_BINARY_MULT_TYPE || y.op.type == OPERATION_BINARY_DIV_TYPE)
+                  &&  y.lhs.type_family != COMPOSITE_OPERATION_FAMILY
+                  &&  y.rhs.type_family == SCALAR_TYPE_FAMILY
+                  && (z.op.type == OPERATION_BINARY_MULT_TYPE || z.op.type == OPERATION_BINARY_DIV_TYPE)
+                  &&  z.lhs.type_family != COMPOSITE_OPERATION_FAMILY
+                  &&  z.rhs.type_family == SCALAR_TYPE_FAMILY)
+              {
+                lhs_rhs_element u = root_node.lhs;
+                lhs_rhs_element v = y.lhs;
+                lhs_rhs_element w = z.lhs;
+                lhs_rhs_element alpha = y.rhs;
+                lhs_rhs_element beta  = z.rhs;
+
+                bool is_division_y = (y.op.type == OPERATION_BINARY_DIV_TYPE);
+                bool is_division_z = (z.op.type == OPERATION_BINARY_DIV_TYPE);
+                switch (root_node.op.type)
+                {
+                  case OPERATION_BINARY_ASSIGN_TYPE:
+                    detail::axbx(u,
+                                 v, alpha, 1, is_division_y, false,
+                                 w,  beta, 1, is_division_z, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                    detail::axbx_x(u,
+                                   v, alpha, 1, is_division_y, false,
+                                   w,  beta, 1, is_division_z, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                    detail::axbx_x(u,
+                                   v, alpha, 1, is_division_y, true,
+                                   w,  beta, 1, is_division_z, !flip_sign_z);
+                    break;
+                  default:
+                    throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+                }
+              }
+              else // no built-in kernel, we use a temporary.
+              {
+                statement_node new_root_y;
+
+                detail::new_element(new_root_y.lhs, root_node.lhs);
+
+                new_root_y.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+                new_root_y.op.type   = OPERATION_BINARY_ASSIGN_TYPE;
+
+                new_root_y.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+                new_root_y.rhs.subtype      = INVALID_SUBTYPE;
+                new_root_y.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+                new_root_y.rhs.node_index   = leaf.lhs.node_index;
+
+                // work on subexpression:
+                // TODO: Catch exception, free temporary, then rethrow
+                execute_composite(s, new_root_y);
+
+                statement_node new_root_z;
+
+                detail::new_element(new_root_z.lhs, root_node.lhs);
+
+                new_root_z.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+                new_root_z.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+                new_root_z.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+                new_root_z.rhs.subtype      = INVALID_SUBTYPE;
+                new_root_z.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+                new_root_z.rhs.node_index   = leaf.rhs.node_index;
+
+                // work on subexpression:
+                // TODO: Catch exception, free temporaries, then rethrow
+                execute_composite(s, new_root_z);
+
+                // now add:
+                lhs_rhs_element u = root_node.lhs;
+                lhs_rhs_element v = new_root_y.lhs;
+                lhs_rhs_element w = new_root_z.lhs;
+
+                switch (root_node.op.type)
+                {
+                  case OPERATION_BINARY_ASSIGN_TYPE:
+                    detail::axbx(u,
+                                 v, 1.0, 1, false, false,
+                                 w, 1.0, 1, false, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_ADD_TYPE:
+                    detail::axbx_x(u,
+                                   v, 1.0, 1, false, false,
+                                   w, 1.0, 1, false, flip_sign_z);
+                    break;
+                  case OPERATION_BINARY_INPLACE_SUB_TYPE:
+                    detail::axbx_x(u,
+                                   v, 1.0, 1, false, true,
+                                   w, 1.0, 1, false, !flip_sign_z);
+                    break;
+                  default:
+                    throw statement_not_supported_exception("Unsupported binary operator for vector operation in root note (should be =, +=, or -=)");
+                }
+
+                detail::delete_element(new_root_y.lhs);
+                detail::delete_element(new_root_z.lhs);
+              }
+            }
+            else
+              throw statement_not_supported_exception("Cannot deal with unary operations on vectors");
+          }
+          else
+            throw statement_not_supported_exception("Cannot deal with addition of vectors");
+        }
+        else
+          throw statement_not_supported_exception("Unsupported binary operator for vector operations");
+      }
+
+    } // namespace detail
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_elementwise.hpp b/viennacl/scheduler/execute_elementwise.hpp
new file mode 100644
index 0000000..2070b38
--- /dev/null
+++ b/viennacl/scheduler/execute_elementwise.hpp
@@ -0,0 +1,466 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_ELEMENTWISE_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_ELEMENTWISE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_elementwise.hpp
+    @brief Deals with the execution of unary and binary element-wise operations
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_util.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      // result = element_op(x,y) for vectors or matrices x, y
+      inline void element_op(lhs_rhs_element result,
+                             lhs_rhs_element const & x,
+                             operation_node_type  op_type)
+      {
+        assert( result.numeric_type == x.numeric_type && bool("Numeric type not the same!"));
+        assert( result.type_family == x.type_family && bool("Subtype not the same!"));
+
+        if (x.subtype == DENSE_VECTOR_TYPE)
+        {
+          assert( result.subtype == x.subtype && bool("result not of vector type for unary elementwise operation"));
+          if (x.numeric_type == FLOAT_TYPE)
+          {
+            switch (op_type)
+            {
+#define VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPNAME, SCALARTYPE, OPTAG) \
+            case OPNAME:  viennacl::linalg::element_op(*result.vector_##SCALARTYPE, \
+                                                       viennacl::vector_expression<const vector_base<SCALARTYPE>, const vector_base<SCALARTYPE>, \
+                                                                                   op_element_unary<OPTAG> >(*x.vector_##SCALARTYPE, *x.vector_##SCALARTYPE)); break;
+
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ABS_TYPE,   float, op_abs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ACOS_TYPE,  float, op_acos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ASIN_TYPE,  float, op_asin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ATAN_TYPE,  float, op_atan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_CEIL_TYPE,  float, op_ceil)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COS_TYPE,   float, op_cos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COSH_TYPE,  float, op_cosh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_EXP_TYPE,   float, op_exp)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FABS_TYPE,  float, op_fabs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FLOOR_TYPE, float, op_floor)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG_TYPE,   float, op_log)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG10_TYPE, float, op_log10)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SIN_TYPE,   float, op_sin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SINH_TYPE,  float, op_sinh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SQRT_TYPE,  float, op_sqrt)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TAN_TYPE,   float, op_tan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TANH_TYPE,  float, op_tanh)
+            default:
+              throw statement_not_supported_exception("Invalid op_type in unary elementwise operations");
+            }
+          }
+          else if (x.numeric_type == DOUBLE_TYPE)
+          {
+            switch (op_type)
+            {
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ABS_TYPE,   double, op_abs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ACOS_TYPE,  double, op_acos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ASIN_TYPE,  double, op_asin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ATAN_TYPE,  double, op_atan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_CEIL_TYPE,  double, op_ceil)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COS_TYPE,   double, op_cos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COSH_TYPE,  double, op_cosh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_EXP_TYPE,   double, op_exp)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FABS_TYPE,  double, op_fabs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FLOOR_TYPE, double, op_floor)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG_TYPE,   double, op_log)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG10_TYPE, double, op_log10)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SIN_TYPE,   double, op_sin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SINH_TYPE,  double, op_sinh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SQRT_TYPE,  double, op_sqrt)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TAN_TYPE,   double, op_tan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TANH_TYPE,  double, op_tanh)
+
+#undef VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP
+            default:
+              throw statement_not_supported_exception("Invalid op_type in unary elementwise operations");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Invalid numeric type in unary elementwise operator");
+        }
+        else if (x.subtype == DENSE_ROW_MATRIX_TYPE)
+        {
+          if (x.numeric_type == FLOAT_TYPE)
+          {
+            switch (op_type)
+            {
+#define VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPNAME, SCALARTYPE, OPTAG) \
+            case OPNAME:  viennacl::linalg::element_op(*result.matrix_row_##SCALARTYPE, \
+                                                       viennacl::matrix_expression<const matrix_base<SCALARTYPE, viennacl::row_major>, const matrix_base<SCALARTYPE, viennacl::row_major>, \
+                                                                                   op_element_unary<OPTAG> >(*x.matrix_row_##SCALARTYPE, *x.matrix_row_##SCALARTYPE)); break;
+
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ABS_TYPE,   float, op_abs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ACOS_TYPE,  float, op_acos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ASIN_TYPE,  float, op_asin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ATAN_TYPE,  float, op_atan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_CEIL_TYPE,  float, op_ceil)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COS_TYPE,   float, op_cos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COSH_TYPE,  float, op_cosh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_EXP_TYPE,   float, op_exp)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FABS_TYPE,  float, op_fabs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FLOOR_TYPE, float, op_floor)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG_TYPE,   float, op_log)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG10_TYPE, float, op_log10)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SIN_TYPE,   float, op_sin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SINH_TYPE,  float, op_sinh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SQRT_TYPE,  float, op_sqrt)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TAN_TYPE,   float, op_tan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TANH_TYPE,  float, op_tanh)
+            default:
+              throw statement_not_supported_exception("Invalid op_type in unary elementwise operations");
+            }
+
+          }
+          else if (x.numeric_type == DOUBLE_TYPE)
+          {
+            switch (op_type)
+            {
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ABS_TYPE,   double, op_abs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ACOS_TYPE,  double, op_acos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ASIN_TYPE,  double, op_asin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ATAN_TYPE,  double, op_atan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_CEIL_TYPE,  double, op_ceil)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COS_TYPE,   double, op_cos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COSH_TYPE,  double, op_cosh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_EXP_TYPE,   double, op_exp)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FABS_TYPE,  double, op_fabs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FLOOR_TYPE, double, op_floor)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG_TYPE,   double, op_log)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG10_TYPE, double, op_log10)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SIN_TYPE,   double, op_sin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SINH_TYPE,  double, op_sinh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SQRT_TYPE,  double, op_sqrt)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TAN_TYPE,   double, op_tan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TANH_TYPE,  double, op_tanh)
+            default:
+              throw statement_not_supported_exception("Invalid op_type in unary elementwise operations");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Invalid numeric type in unary elementwise operator");
+
+#undef VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP
+
+        }
+        else if (x.subtype == DENSE_COL_MATRIX_TYPE)
+        {
+          if (x.numeric_type == FLOAT_TYPE)
+          {
+            switch (op_type)
+            {
+#define VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPNAME, SCALARTYPE, OPTAG) \
+            case OPNAME:  viennacl::linalg::element_op(*result.matrix_col_##SCALARTYPE, \
+                                                       viennacl::matrix_expression<const matrix_base<SCALARTYPE, viennacl::column_major>, const matrix_base<SCALARTYPE, viennacl::column_major>, \
+                                                                                   op_element_unary<OPTAG> >(*x.matrix_col_##SCALARTYPE, *x.matrix_col_##SCALARTYPE)); break;
+
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ABS_TYPE,   float, op_abs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ACOS_TYPE,  float, op_acos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ASIN_TYPE,  float, op_asin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ATAN_TYPE,  float, op_atan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_CEIL_TYPE,  float, op_ceil)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COS_TYPE,   float, op_cos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COSH_TYPE,  float, op_cosh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_EXP_TYPE,   float, op_exp)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FABS_TYPE,  float, op_fabs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FLOOR_TYPE, float, op_floor)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG_TYPE,   float, op_log)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG10_TYPE, float, op_log10)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SIN_TYPE,   float, op_sin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SINH_TYPE,  float, op_sinh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SQRT_TYPE,  float, op_sqrt)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TAN_TYPE,   float, op_tan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TANH_TYPE,  float, op_tanh)
+            default:
+              throw statement_not_supported_exception("Invalid op_type in unary elementwise operations");
+            }
+
+          }
+          else if (x.numeric_type == DOUBLE_TYPE)
+          {
+            switch (op_type)
+            {
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ABS_TYPE,   double, op_abs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ACOS_TYPE,  double, op_acos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ASIN_TYPE,  double, op_asin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_ATAN_TYPE,  double, op_atan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_CEIL_TYPE,  double, op_ceil)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COS_TYPE,   double, op_cos)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_COSH_TYPE,  double, op_cosh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_EXP_TYPE,   double, op_exp)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FABS_TYPE,  double, op_fabs)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_FLOOR_TYPE, double, op_floor)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG_TYPE,   double, op_log)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_LOG10_TYPE, double, op_log10)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SIN_TYPE,   double, op_sin)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SINH_TYPE,  double, op_sinh)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_SQRT_TYPE,  double, op_sqrt)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TAN_TYPE,   double, op_tan)
+              VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP(OPERATION_UNARY_TANH_TYPE,  double, op_tanh)
+            default:
+              throw statement_not_supported_exception("Invalid op_type in unary elementwise operations");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Invalid numeric type in unary elementwise operator");
+
+#undef VIENNACL_SCHEDULER_GENERATE_UNARY_ELEMENT_OP
+        }
+      }
+
+      // result = element_op(x,y): binary elementwise division or product for vectors/matrices x, y
+      inline void element_op(lhs_rhs_element result,
+                             lhs_rhs_element const & x,
+                             lhs_rhs_element const & y,
+                             operation_node_type  op_type)
+      {
+        assert(      x.numeric_type == y.numeric_type && bool("Numeric type not the same!"));
+        assert( result.numeric_type == y.numeric_type && bool("Numeric type not the same!"));
+
+        assert(      x.type_family == y.type_family && bool("Subtype not the same!"));
+        assert( result.type_family == y.type_family && bool("Subtype not the same!"));
+
+        switch (op_type)
+        {
+
+        case OPERATION_BINARY_ELEMENT_DIV_TYPE:
+          if (x.subtype == DENSE_VECTOR_TYPE)
+          {
+            switch (x.numeric_type)
+            {
+              case FLOAT_TYPE:
+                viennacl::linalg::element_op(*result.vector_float,
+                                             vector_expression<const vector_base<float>,
+                                                               const vector_base<float>,
+                                                               op_element_binary<op_div> >(*x.vector_float, *y.vector_float));
+                break;
+              case DOUBLE_TYPE:
+                viennacl::linalg::element_op(*result.vector_double,
+                                             vector_expression<const vector_base<double>,
+                                                               const vector_base<double>,
+                                                               op_element_binary<op_div> >(*x.vector_double, *y.vector_double));
+                break;
+              default:
+                throw statement_not_supported_exception("Invalid numeric type for binary elementwise division");
+            }
+          }
+          else if (x.subtype == DENSE_ROW_MATRIX_TYPE)
+          {
+            switch (x.numeric_type)
+            {
+              case FLOAT_TYPE:
+                viennacl::linalg::element_op(*result.matrix_row_float,
+                                             matrix_expression< const matrix_base<float, row_major>,
+                                                                const matrix_base<float, row_major>,
+                                                                op_element_binary<op_div> >(*x.matrix_row_float, *y.matrix_row_float));
+                break;
+              case DOUBLE_TYPE:
+                viennacl::linalg::element_op(*result.matrix_row_double,
+                                             matrix_expression< const matrix_base<double, row_major>,
+                                                                const matrix_base<double, row_major>,
+                                                                op_element_binary<op_div> >(*x.matrix_row_double, *y.matrix_row_double));
+                break;
+              default:
+                throw statement_not_supported_exception("Invalid numeric type for binary elementwise division");
+            }
+          }
+          else if (x.subtype == DENSE_COL_MATRIX_TYPE)
+          {
+            switch (x.numeric_type)
+            {
+              case FLOAT_TYPE:
+                viennacl::linalg::element_op(*result.matrix_col_float,
+                                             matrix_expression< const matrix_base<float, column_major>,
+                                                                const matrix_base<float, column_major>,
+                                                                op_element_binary<op_div> >(*x.matrix_col_float, *y.matrix_col_float));
+                break;
+              case DOUBLE_TYPE:
+                viennacl::linalg::element_op(*result.matrix_col_double,
+                                             matrix_expression< const matrix_base<double, column_major>,
+                                                                const matrix_base<double, column_major>,
+                                                                op_element_binary<op_div> >(*x.matrix_col_double, *y.matrix_col_double));
+                break;
+              default:
+                throw statement_not_supported_exception("Invalid numeric type for binary elementwise division");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Invalid operand type for binary elementwise division");
+          break;
+
+
+        case OPERATION_BINARY_ELEMENT_PROD_TYPE:
+          if (x.subtype == DENSE_VECTOR_TYPE)
+          {
+            switch (x.numeric_type)
+            {
+              case FLOAT_TYPE:
+                viennacl::linalg::element_op(*result.vector_float,
+                                             vector_expression<const vector_base<float>,
+                                                               const vector_base<float>,
+                                                               op_element_binary<op_prod> >(*x.vector_float, *y.vector_float));
+                break;
+              case DOUBLE_TYPE:
+                viennacl::linalg::element_op(*result.vector_double,
+                                             vector_expression<const vector_base<double>,
+                                                               const vector_base<double>,
+                                                               op_element_binary<op_prod> >(*x.vector_double, *y.vector_double));
+                break;
+              default:  // error strings below fixed: this is the product branch, not division
+                throw statement_not_supported_exception("Invalid numeric type for binary elementwise multiplication");
+            }
+          }
+          else if (x.subtype == DENSE_ROW_MATRIX_TYPE)
+          {
+            switch (x.numeric_type)
+            {
+              case FLOAT_TYPE:
+                viennacl::linalg::element_op(*result.matrix_row_float,
+                                             matrix_expression< const matrix_base<float, row_major>,
+                                                                const matrix_base<float, row_major>,
+                                                                op_element_binary<op_prod> >(*x.matrix_row_float, *y.matrix_row_float));
+                break;
+              case DOUBLE_TYPE:
+                viennacl::linalg::element_op(*result.matrix_row_double,
+                                             matrix_expression< const matrix_base<double, row_major>,
+                                                                const matrix_base<double, row_major>,
+                                                                op_element_binary<op_prod> >(*x.matrix_row_double, *y.matrix_row_double));
+                break;
+              default:
+                throw statement_not_supported_exception("Invalid numeric type for binary elementwise multiplication");
+            }
+          }
+          else if (x.subtype == DENSE_COL_MATRIX_TYPE)
+          {
+            switch (x.numeric_type)
+            {
+              case FLOAT_TYPE:
+                viennacl::linalg::element_op(*result.matrix_col_float,
+                                             matrix_expression< const matrix_base<float, column_major>,
+                                                                const matrix_base<float, column_major>,
+                                                                op_element_binary<op_prod> >(*x.matrix_col_float, *y.matrix_col_float));
+                break;
+              case DOUBLE_TYPE:
+                viennacl::linalg::element_op(*result.matrix_col_double,
+                                             matrix_expression< const matrix_base<double, column_major>,
+                                                                const matrix_base<double, column_major>,
+                                                                op_element_binary<op_prod> >(*x.matrix_col_double, *y.matrix_col_double));
+                break;
+              default:
+                throw statement_not_supported_exception("Invalid numeric type for binary elementwise multiplication");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Invalid operand type for binary elementwise multiplication");
+          break;
+        default:
+          throw statement_not_supported_exception("Invalid operation type for binary elementwise operations");
+        }
+      }
+    }
+
+    /** @brief Deals with x = RHS where RHS is an elementwise operation (unary, or binary prod/div) on vectors or matrices */
+    inline void execute_element_composite(statement const & s, statement_node const & root_node)
+    {
+      statement_node const & leaf = s.array()[root_node.rhs.node_index];
+
+      statement_node new_root_lhs;  // holds the temporary for a composite lhs operand (only its .lhs is used below)
+      statement_node new_root_rhs;  // likewise for a composite rhs operand
+
+      // If the lhs operand is itself a subexpression, evaluate it into a temporary first:
+      if (leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY)
+      {
+        detail::new_element(new_root_lhs.lhs, root_node.lhs);
+
+        new_root_lhs.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+        new_root_lhs.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+        new_root_lhs.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+        new_root_lhs.rhs.subtype      = INVALID_SUBTYPE;
+        new_root_lhs.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+        new_root_lhs.rhs.node_index   = leaf.lhs.node_index;
+
+        // Recursively execute 'temporary = subexpression':
+        // TODO: Catch exception, free temporary, then rethrow
+        detail::execute_composite(s, new_root_lhs);
+      }
+
+      if (leaf.op.type == OPERATION_BINARY_ELEMENT_PROD_TYPE || leaf.op.type == OPERATION_BINARY_ELEMENT_DIV_TYPE)
+      {
+        // Binary case: the rhs operand may also be a subexpression needing a temporary:
+        if (leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY)
+        {
+          detail::new_element(new_root_rhs.lhs, root_node.lhs);
+
+          new_root_rhs.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+          new_root_rhs.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+          new_root_rhs.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+          new_root_rhs.rhs.subtype      = INVALID_SUBTYPE;
+          new_root_rhs.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+          new_root_rhs.rhs.node_index   = leaf.rhs.node_index;
+
+          // Recursively execute 'temporary = subexpression':
+          // TODO: Catch exception, free temporary, then rethrow
+          detail::execute_composite(s, new_root_rhs);
+        }
+
+        lhs_rhs_element x = (leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY) ? new_root_lhs.lhs : leaf.lhs;  // temporary if one was created, original operand otherwise
+        lhs_rhs_element y = (leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY) ? new_root_rhs.lhs : leaf.rhs;
+
+        // compute element-wise operation:
+        detail::element_op(root_node.lhs, x, y, leaf.op.type);
+
+        if (leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY)
+          detail::delete_element(new_root_rhs.lhs);  // free rhs temporary right after use
+      }
+      else if (leaf.op.type_family  == OPERATION_UNARY_TYPE_FAMILY)
+      {
+        lhs_rhs_element x = (leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY) ? new_root_lhs.lhs : leaf.lhs;
+
+        // compute element-wise operation:
+        detail::element_op(root_node.lhs, x, leaf.op.type);
+      }
+      else
+        throw statement_not_supported_exception("Unsupported elementwise operation.");
+
+      // Free the lhs temporary (if one was created) in both the binary and unary paths:
+      if (leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY)
+        detail::delete_element(new_root_lhs.lhs);
+
+    }
+
+
+  } // namespace scheduler
+
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_generic_dispatcher.hpp b/viennacl/scheduler/execute_generic_dispatcher.hpp
new file mode 100644
index 0000000..765be4a
--- /dev/null
+++ b/viennacl/scheduler/execute_generic_dispatcher.hpp
@@ -0,0 +1,135 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_GENERIC_DISPATCHER_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_GENERIC_DISPATCHER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_generic_dispatcher.hpp
+    @brief Provides unified wrappers for the common routines {as(), asbs(), asbs_s()}, {av(), avbv(), avbv_v()}, and {am(), ambm(), ambm_m()} such that scheduler logic is not cluttered with numeric type deductions
+*/
+
+#include <assert.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_util.hpp"
+#include "viennacl/scheduler/execute_scalar_dispatcher.hpp"
+#include "viennacl/scheduler/execute_vector_dispatcher.hpp"
+#include "viennacl/scheduler/execute_matrix_dispatcher.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+
+      /** @brief Dispatches x1 = alpha * x2 to as(), av(), or am() depending on the type family of the operands, taking care of the argument unwrapping */
+      template <typename ScalarType1>
+      void ax(lhs_rhs_element & x1,
+              lhs_rhs_element const & x2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        assert(x1.type_family == x2.type_family && bool("Arguments are not of the same type family!"));
+
+        switch (x1.type_family)
+        {
+        case SCALAR_TYPE_FAMILY:
+          detail::as(x1, x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+        case VECTOR_TYPE_FAMILY:
+          detail::av(x1, x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+        case MATRIX_TYPE_FAMILY:
+          detail::am(x1, x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+          break;
+        default:
+          throw statement_not_supported_exception("Invalid argument in scheduler ax() while dispatching.");
+        }
+      }
+
+      /** @brief Dispatches x1 = alpha * x2 + beta * x3 to asbs(), avbv(), or ambm() depending on the type family, taking care of the argument unwrapping */
+      template <typename ScalarType1, typename ScalarType2>
+      void axbx(lhs_rhs_element & x1,
+                lhs_rhs_element const & x2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                lhs_rhs_element const & x3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   x1.type_family == x2.type_family
+               && x2.type_family == x3.type_family
+               && bool("Arguments are not of the same type family!"));
+
+        switch (x1.type_family)
+        {
+        case SCALAR_TYPE_FAMILY:
+          detail::asbs(x1,
+                       x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                       x3, beta,  len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+        case VECTOR_TYPE_FAMILY:
+            detail::avbv(x1,
+                         x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                         x3, beta,  len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+        case MATRIX_TYPE_FAMILY:
+            detail::ambm(x1,
+                         x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                         x3, beta,  len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+        default:  // fixed copy-paste: message previously referred to ax()
+          throw statement_not_supported_exception("Invalid argument in scheduler axbx() while dispatching.");
+        }
+      }
+
+      /** @brief Dispatches x1 += alpha * x2 + beta * x3 to asbs_s(), avbv_v(), or ambm_m() depending on the type family, taking care of the argument unwrapping */
+      template <typename ScalarType1, typename ScalarType2>
+      void axbx_x(lhs_rhs_element & x1,
+                  lhs_rhs_element const & x2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  lhs_rhs_element const & x3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   x1.type_family == x2.type_family
+               && x2.type_family == x3.type_family
+               && bool("Arguments are not of the same type family!"));
+
+        switch (x1.type_family)
+        {
+        case SCALAR_TYPE_FAMILY:
+          detail::asbs_s(x1,
+                         x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                         x3, beta,  len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+        case VECTOR_TYPE_FAMILY:
+            detail::avbv_v(x1,
+                           x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                           x3, beta,  len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+        case MATRIX_TYPE_FAMILY:
+            detail::ambm_m(x1,
+                           x2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+                           x3, beta,  len_beta,  reciprocal_beta,  flip_sign_beta);
+          break;
+        default:  // fixed copy-paste: message previously referred to ax()
+          throw statement_not_supported_exception("Invalid argument in scheduler axbx_x() while dispatching.");
+        }
+      }
+
+
+    } // namespace detail
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_matrix_dispatcher.hpp b/viennacl/scheduler/execute_matrix_dispatcher.hpp
new file mode 100644
index 0000000..8367855
--- /dev/null
+++ b/viennacl/scheduler/execute_matrix_dispatcher.hpp
@@ -0,0 +1,210 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_MATRIX_DISPATCHER_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_MATRIX_DISPATCHER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_matrix_dispatcher.hpp
+    @brief Provides wrappers for am(), ambm(), ambm_m(), etc. in viennacl/linalg/matrix_operations.hpp such that scheduler logic is not cluttered with numeric type deductions
+*/
+
+#include <assert.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_util.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+
+      /** @brief Wrapper for viennacl::linalg::am(), taking care of the argument unwrapping (dense row- or column-major matrices, float or double) */
+      template <typename ScalarType1>
+      void am(lhs_rhs_element & mat1,
+              lhs_rhs_element const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        assert(   mat1.type_family == MATRIX_TYPE_FAMILY && mat2.type_family == MATRIX_TYPE_FAMILY
+               && bool("Arguments are not matrix types!"));
+
+        assert(mat1.numeric_type == mat2.numeric_type && bool("Matrices do not have the same scalar type"));
+
+        if (mat1.subtype == DENSE_ROW_MATRIX_TYPE)
+        {
+          switch (mat1.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::am(*mat1.matrix_row_float,
+                                 *mat2.matrix_row_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::am(*mat1.matrix_row_double,
+                                 *mat2.matrix_row_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling am()");
+          }
+        }
+        else if (mat1.subtype == DENSE_COL_MATRIX_TYPE)
+        {
+          switch (mat1.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::am(*mat1.matrix_col_float,
+                                 *mat2.matrix_col_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::am(*mat1.matrix_col_double,
+                                 *mat2.matrix_col_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling am()");
+          }
+        }
+        else
+        {
+          throw statement_not_supported_exception("Invalid arguments in scheduler when calling am()");
+        }
+      }
+
+      /** @brief Wrapper for viennacl::linalg::ambm(), taking care of the argument unwrapping (dense row- or column-major matrices, float or double) */
+      template <typename ScalarType1, typename ScalarType2>
+      void ambm(lhs_rhs_element & mat1,
+                lhs_rhs_element const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                lhs_rhs_element const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   mat1.type_family == MATRIX_TYPE_FAMILY
+               && mat2.type_family == MATRIX_TYPE_FAMILY
+               && mat3.type_family == MATRIX_TYPE_FAMILY
+               && bool("Arguments are not matrix types!"));
+
+        assert(   (mat1.subtype == mat2.subtype)
+               && (mat2.subtype == mat3.subtype)
+               && bool("Matrices do not have the same layout"));
+
+        assert(   (mat1.numeric_type == mat2.numeric_type)
+               && (mat2.numeric_type == mat3.numeric_type)
+               && bool("Matrices do not have the same scalar type"));
+
+        if (mat1.subtype == DENSE_ROW_MATRIX_TYPE)
+        {
+          switch (mat1.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::ambm(*mat1.matrix_row_float,
+                                   *mat2.matrix_row_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *mat3.matrix_row_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::ambm(*mat1.matrix_row_double,
+                                   *mat2.matrix_row_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *mat3.matrix_row_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling ambm()");
+          }
+        }
+        else if (mat1.subtype == DENSE_COL_MATRIX_TYPE)
+        {
+          switch (mat1.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::ambm(*mat1.matrix_col_float,
+                                   *mat2.matrix_col_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *mat3.matrix_col_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::ambm(*mat1.matrix_col_double,
+                                   *mat2.matrix_col_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *mat3.matrix_col_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling ambm()");
+          }
+        }
+        else
+        {
+          // Consistency fix: am() throws on an unsupported subtype, but ambm() previously returned silently
+          throw statement_not_supported_exception("Invalid arguments in scheduler when calling ambm()");
+        }
+      }
+
+      /** @brief Wrapper for viennacl::linalg::ambm_m(), taking care of the argument unwrapping (dense row- or column-major matrices, float or double) */
+      template <typename ScalarType1, typename ScalarType2>
+      void ambm_m(lhs_rhs_element & mat1,
+                  lhs_rhs_element const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  lhs_rhs_element const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   mat1.type_family == MATRIX_TYPE_FAMILY
+               && mat2.type_family == MATRIX_TYPE_FAMILY
+               && mat3.type_family == MATRIX_TYPE_FAMILY
+               && bool("Arguments are not matrix types!"));
+
+        assert(   (mat1.subtype == mat2.subtype)
+               && (mat2.subtype == mat3.subtype)
+               && bool("Matrices do not have the same layout"));
+
+        assert(   (mat1.numeric_type == mat2.numeric_type)
+               && (mat2.numeric_type == mat3.numeric_type)
+               && bool("Matrices do not have the same scalar type"));
+
+        if (mat1.subtype == DENSE_ROW_MATRIX_TYPE)
+        {
+          switch (mat1.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::ambm_m(*mat1.matrix_row_float,
+                                     *mat2.matrix_row_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *mat3.matrix_row_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::ambm_m(*mat1.matrix_row_double,
+                                     *mat2.matrix_row_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *mat3.matrix_row_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling ambm_m()");
+          }
+        }
+        else if (mat1.subtype == DENSE_COL_MATRIX_TYPE)
+        {
+          switch (mat1.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::ambm_m(*mat1.matrix_col_float,
+                                     *mat2.matrix_col_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *mat3.matrix_col_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::ambm_m(*mat1.matrix_col_double,
+                                     *mat2.matrix_col_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *mat3.matrix_col_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling ambm_m()");
+          }
+        }
+        else
+        {
+          // Consistency fix: am() throws on an unsupported subtype, but ambm_m() previously returned silently
+          throw statement_not_supported_exception("Invalid arguments in scheduler when calling ambm_m()");
+        }
+      }
+
+
+    } // namespace detail
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_matrix_prod.hpp b/viennacl/scheduler/execute_matrix_prod.hpp
new file mode 100644
index 0000000..afc23db
--- /dev/null
+++ b/viennacl/scheduler/execute_matrix_prod.hpp
@@ -0,0 +1,498 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_MATRIX_PROD_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_MATRIX_PROD_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_matrix_prod.hpp
+    @brief Deals with matrix-vector and matrix-matrix products.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_util.hpp"
+#include "viennacl/scheduler/execute_generic_dispatcher.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/ell_matrix.hpp"
+#include "viennacl/hyb_matrix.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      inline bool matrix_prod_temporary_required(statement const & s, lhs_rhs_element const & elem)
+      {
+        if (elem.type_family != COMPOSITE_OPERATION_FAMILY)
+          return false;
+
+        // check composite node for being a transposed matrix proxy:
+        statement_node const & leaf = s.array()[elem.node_index];
+        if (   leaf.op.type == OPERATION_UNARY_TRANS_TYPE && leaf.lhs.type_family == MATRIX_TYPE_FAMILY)
+          return false;
+
+        return true;
+      }
+
+      inline void matrix_matrix_prod(statement const & s,
+                                     lhs_rhs_element result,
+                                     lhs_rhs_element const & A,
+                                     lhs_rhs_element const & B,
+                                     double alpha,
+                                     double beta)
+      {
+        if (A.type_family == MATRIX_TYPE_FAMILY && B.type_family == MATRIX_TYPE_FAMILY)        // C = A * B
+        {
+          assert(      A.numeric_type == B.numeric_type && bool("Numeric type not the same!"));
+          assert( result.numeric_type == B.numeric_type && bool("Numeric type not the same!"));
+
+#define VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(LAYOUTA, MEMBERA, LAYOUTB, MEMBERB, LAYOUTC, MEMBERC)\
+          if (A.subtype == LAYOUTA && B.subtype == LAYOUTB && result.subtype == LAYOUTC)\
+          {\
+            switch (result.numeric_type)\
+            {\
+            case FLOAT_TYPE:\
+              viennacl::linalg::prod_impl(*A.matrix_##MEMBERA##_float, *B.matrix_##MEMBERB##_float, *result.matrix_##MEMBERC##_float, static_cast<float>(alpha), static_cast<float>(beta)); break;\
+            case DOUBLE_TYPE:\
+              viennacl::linalg::prod_impl(*A.matrix_##MEMBERA##_double, *B.matrix_##MEMBERB##_double, *result.matrix_##MEMBERC##_double, alpha, beta); break;\
+            default:\
+              throw statement_not_supported_exception("Invalid numeric type in matrix-matrix multiplication");\
+            }\
+          }
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col)
+
+#undef VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD
+        }
+        else if (A.type_family == MATRIX_TYPE_FAMILY && B.type_family == COMPOSITE_OPERATION_FAMILY)        // C = A * B^T
+        {
+          statement_node const & leaf = s.array()[B.node_index];
+
+          assert(leaf.lhs.type_family  == MATRIX_TYPE_FAMILY && leaf.op.type == OPERATION_UNARY_TRANS_TYPE && bool("Logic error: Argument not a matrix transpose!"));
+          assert(leaf.lhs.numeric_type == result.numeric_type && bool("Numeric type not the same!"));
+          assert(result.numeric_type == A.numeric_type && bool("Numeric type not the same!"));
+
+#define VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(LAYOUTA, MEMBERA, LAYOUTB, MEMBERB, MAJORB, LAYOUTC, MEMBERC)\
+          if (A.subtype == LAYOUTA && leaf.lhs.subtype == LAYOUTB && result.subtype == LAYOUTC)\
+          {\
+            switch (result.numeric_type)\
+            {\
+            case FLOAT_TYPE:\
+              viennacl::linalg::prod_impl(*A.matrix_##MEMBERA##_float, \
+                                          viennacl::matrix_expression< const matrix_base<float, MAJORB>,\
+                                                                       const matrix_base<float, MAJORB>,\
+                                                                       op_trans> (*(leaf.lhs.matrix_##MEMBERB##_float), *(leaf.lhs.matrix_##MEMBERB##_float)), \
+                                          *result.matrix_##MEMBERC##_float, static_cast<float>(alpha), static_cast<float>(beta)); break;\
+            case DOUBLE_TYPE:\
+              viennacl::linalg::prod_impl(*A.matrix_##MEMBERA##_double,\
+                                          viennacl::matrix_expression< const matrix_base<double, MAJORB>,\
+                                                                       const matrix_base<double, MAJORB>,\
+                                                                       op_trans>(*(leaf.lhs.matrix_##MEMBERB##_double), *(leaf.lhs.matrix_##MEMBERB##_double)), \
+                                          *result.matrix_##MEMBERC##_double, alpha, beta); break;\
+            default:\
+              throw statement_not_supported_exception("Invalid numeric type in matrix-matrix multiplication");\
+            }\
+          }
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col)
+
+#undef VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD
+        }
+        else if (A.type_family == COMPOSITE_OPERATION_FAMILY && B.type_family == MATRIX_TYPE_FAMILY)        // C = A^T * B
+        {
+          statement_node const & leaf = s.array()[A.node_index];
+
+          assert(leaf.lhs.type_family  == MATRIX_TYPE_FAMILY && leaf.op.type == OPERATION_UNARY_TRANS_TYPE && bool("Logic error: Argument not a matrix transpose!"));
+          assert(leaf.lhs.numeric_type == result.numeric_type && bool("Numeric type not the same!"));
+          assert(result.numeric_type == B.numeric_type && bool("Numeric type not the same!"));
+
+#define VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(LAYOUTA, MEMBERA, MAJORA, LAYOUTB, MEMBERB, LAYOUTC, MEMBERC)\
+          if (leaf.lhs.subtype == LAYOUTA && B.subtype == LAYOUTB && result.subtype == LAYOUTC)\
+          {\
+            switch (result.numeric_type)\
+            {\
+            case FLOAT_TYPE:\
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<float, MAJORA>,\
+                                                                       const matrix_base<float, MAJORA>,\
+                                                                       op_trans>(*leaf.lhs.matrix_##MEMBERA##_float, *leaf.lhs.matrix_##MEMBERA##_float), \
+                                          *B.matrix_##MEMBERB##_float,\
+                                          *result.matrix_##MEMBERC##_float, static_cast<float>(alpha), static_cast<float>(beta)); break;\
+            case DOUBLE_TYPE:\
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<double, MAJORA>,\
+                                                                       const matrix_base<double, MAJORA>,\
+                                                                       op_trans>(*leaf.lhs.matrix_##MEMBERA##_double, *leaf.lhs.matrix_##MEMBERA##_double), \
+                                          *B.matrix_##MEMBERB##_double,\
+                                          *result.matrix_##MEMBERC##_double, alpha, beta); break;\
+            default:\
+              throw statement_not_supported_exception("Invalid numeric type in matrix-matrix multiplication");\
+            }\
+          }
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col, DENSE_COL_MATRIX_TYPE, col)
+
+#undef VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD
+        }
+        else if (A.type_family == COMPOSITE_OPERATION_FAMILY && B.type_family == COMPOSITE_OPERATION_FAMILY)        // C = A^T * B^T
+        {
+          statement_node const & leafA = s.array()[A.node_index];
+          statement_node const & leafB = s.array()[B.node_index];
+
+          assert(leafA.lhs.type_family  == MATRIX_TYPE_FAMILY && leafA.op.type == OPERATION_UNARY_TRANS_TYPE && bool("Logic error: Argument not a matrix transpose!"));
+          assert(leafB.lhs.type_family  == MATRIX_TYPE_FAMILY && leafB.op.type == OPERATION_UNARY_TRANS_TYPE && bool("Logic error: Argument not a matrix transpose!"));
+          assert(leafA.lhs.numeric_type == result.numeric_type && bool("Numeric type not the same!"));
+          assert(leafB.lhs.numeric_type == result.numeric_type && bool("Numeric type not the same!"));
+
+#define VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(LAYOUTA, MEMBERA, MAJORA, LAYOUTB, MEMBERB, MAJORB, LAYOUTC, MEMBERC)\
+          if (leafA.lhs.subtype == LAYOUTA && leafB.lhs.subtype == LAYOUTB && result.subtype == LAYOUTC)\
+          {\
+            switch (result.numeric_type)\
+            {\
+            case FLOAT_TYPE:\
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<float, MAJORA>,\
+                                                                       const matrix_base<float, MAJORA>,\
+                                                                       op_trans>(*leafA.lhs.matrix_##MEMBERA##_float, *leafA.lhs.matrix_##MEMBERA##_float), \
+                                          viennacl::matrix_expression< const matrix_base<float, MAJORB>,\
+                                                                       const matrix_base<float, MAJORB>,\
+                                                                       op_trans>(*leafB.lhs.matrix_##MEMBERB##_float, *leafB.lhs.matrix_##MEMBERB##_float), \
+                                          *result.matrix_##MEMBERC##_float, static_cast<float>(alpha), static_cast<float>(beta)); break;\
+            case DOUBLE_TYPE:\
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<double, MAJORA>,\
+                                                                       const matrix_base<double, MAJORA>,\
+                                                                       op_trans>(*leafA.lhs.matrix_##MEMBERA##_double, *leafA.lhs.matrix_##MEMBERA##_double), \
+                                          viennacl::matrix_expression< const matrix_base<double, MAJORB>,\
+                                                                       const matrix_base<double, MAJORB>,\
+                                                                       op_trans>(*leafB.lhs.matrix_##MEMBERB##_double, *leafB.lhs.matrix_##MEMBERB##_double), \
+                                          *result.matrix_##MEMBERC##_double, alpha, beta); break;\
+            default:\
+              throw statement_not_supported_exception("Invalid numeric type in matrix-matrix multiplication");\
+            }\
+          }
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row, row_major, DENSE_COL_MATRIX_TYPE, col)
+
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_ROW_MATRIX_TYPE, row)
+          VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD(DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col, column_major, DENSE_COL_MATRIX_TYPE, col)
+
+#undef VIENNACL_SCHEDULER_GENERATE_MATRIX_MATRIX_PROD
+        }
+        else
+          throw statement_not_supported_exception("Matrix-matrix multiplication encountered operands being neither dense matrices nor transposed dense matrices");
+      }
+
+      inline void matrix_vector_prod(statement const & s,
+                                     lhs_rhs_element result,
+                                     lhs_rhs_element const & A,
+                                     lhs_rhs_element const & x)
+      {
+        assert( result.numeric_type == x.numeric_type && bool("Numeric type not the same!"));
+        assert( result.type_family == x.type_family && bool("Subtype not the same!"));
+        assert( result.subtype == DENSE_VECTOR_TYPE && bool("Result node for matrix-vector product not a vector type!"));
+
+        // deal with transposed product first:
+        // switch: trans for A
+        if (A.type_family == COMPOSITE_OPERATION_FAMILY) // prod(trans(A), x)
+        {
+          statement_node const & leaf = s.array()[A.node_index];
+
+          assert(leaf.lhs.type_family  == MATRIX_TYPE_FAMILY && leaf.op.type == OPERATION_UNARY_TRANS_TYPE && bool("Logic error: Argument not a matrix transpose!"));
+          assert(leaf.lhs.numeric_type == x.numeric_type && bool("Numeric type not the same!"));
+
+          if (leaf.lhs.subtype == DENSE_ROW_MATRIX_TYPE)
+          {
+            switch (leaf.lhs.numeric_type)
+            {
+            case FLOAT_TYPE:
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<float, row_major>,
+                                                                       const matrix_base<float, row_major>,
+                                                                       op_trans>(*leaf.lhs.matrix_row_float, *leaf.lhs.matrix_row_float),
+                                          *x.vector_float,
+                                          *result.vector_float); break;
+            case DOUBLE_TYPE:
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<double, row_major>,
+                                                                       const matrix_base<double, row_major>,
+                                                                       op_trans>(*leaf.lhs.matrix_row_double, *leaf.lhs.matrix_row_double),
+                                          *x.vector_double,
+                                          *result.vector_double); break;
+            default:
+              throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+            }
+          }
+          else if (leaf.lhs.subtype == DENSE_COL_MATRIX_TYPE)
+          {
+            switch (leaf.lhs.numeric_type)
+            {
+            case FLOAT_TYPE:
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<float, column_major>,
+                                                                       const matrix_base<float, column_major>,
+                                                                       op_trans>(*leaf.lhs.matrix_col_float, *leaf.lhs.matrix_col_float),
+                                          *x.vector_float,
+                                          *result.vector_float); break;
+            case DOUBLE_TYPE:
+              viennacl::linalg::prod_impl(viennacl::matrix_expression< const matrix_base<double, column_major>,
+                                                                       const matrix_base<double, column_major>,
+                                                                       op_trans>(*leaf.lhs.matrix_col_double, *leaf.lhs.matrix_col_double),
+                                          *x.vector_double,
+                                          *result.vector_double); break;
+            default:
+              throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Invalid matrix type for transposed matrix-vector product");
+        }
+        else if (A.subtype == DENSE_ROW_MATRIX_TYPE)
+        {
+          switch (A.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::prod_impl(*A.matrix_row_float, *x.vector_float, *result.vector_float);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::prod_impl(*A.matrix_row_double, *x.vector_double, *result.vector_double);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+          }
+        }
+        else if (A.subtype == DENSE_COL_MATRIX_TYPE)
+        {
+          switch (A.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::prod_impl(*A.matrix_col_float, *x.vector_float, *result.vector_float);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::prod_impl(*A.matrix_col_double, *x.vector_double, *result.vector_double);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+          }
+        }
+        else if (A.subtype == COMPRESSED_MATRIX_TYPE)
+        {
+          switch (A.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::prod_impl(*A.compressed_matrix_float, *x.vector_float, *result.vector_float);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::prod_impl(*A.compressed_matrix_double, *x.vector_double, *result.vector_double);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+          }
+        }
+        else if (A.subtype == COORDINATE_MATRIX_TYPE)
+        {
+          switch (A.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::prod_impl(*A.coordinate_matrix_float, *x.vector_float, *result.vector_float);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::prod_impl(*A.coordinate_matrix_double, *x.vector_double, *result.vector_double);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+          }
+        }
+        else if (A.subtype == ELL_MATRIX_TYPE)
+        {
+          switch (A.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::prod_impl(*A.ell_matrix_float, *x.vector_float, *result.vector_float);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::prod_impl(*A.ell_matrix_double, *x.vector_double, *result.vector_double);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+          }
+        }
+        else if (A.subtype == HYB_MATRIX_TYPE)
+        {
+          switch (A.numeric_type)
+          {
+          case FLOAT_TYPE:
+            viennacl::linalg::prod_impl(*A.hyb_matrix_float, *x.vector_float, *result.vector_float);
+            break;
+          case DOUBLE_TYPE:
+            viennacl::linalg::prod_impl(*A.hyb_matrix_double, *x.vector_double, *result.vector_double);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in matrix-{matrix,vector} multiplication");
+          }
+        }
+        else
+        {
+          std::cout << "A.subtype: " << A.subtype << std::endl;
+          throw statement_not_supported_exception("Invalid matrix type for matrix-vector product");
+        }
+      }
+
+    } // namespace detail
+
+    inline void execute_matrix_prod(statement const & s, statement_node const & root_node)
+    {
+      statement_node const & leaf = s.array()[root_node.rhs.node_index];
+
+      // Part 1: Check whether temporaries are required //
+
+      statement_node new_root_lhs;
+      statement_node new_root_rhs;
+
+      bool lhs_needs_temporary = detail::matrix_prod_temporary_required(s, leaf.lhs);
+      bool rhs_needs_temporary = detail::matrix_prod_temporary_required(s, leaf.rhs);
+
+      // check for temporary on lhs:
+      if (lhs_needs_temporary)
+      {
+        std::cout << "Temporary for LHS!" << std::endl;
+        detail::new_element(new_root_lhs.lhs, root_node.lhs);
+
+        new_root_lhs.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+        new_root_lhs.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+        new_root_lhs.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+        new_root_lhs.rhs.subtype      = INVALID_SUBTYPE;
+        new_root_lhs.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+        new_root_lhs.rhs.node_index   = leaf.lhs.node_index;
+
+        // work on subexpression:
+        // TODO: Catch exception, free temporary, then rethrow
+        detail::execute_composite(s, new_root_lhs);
+      }
+
+      // check for temporary on rhs:
+      if (rhs_needs_temporary)
+      {
+        detail::new_element(new_root_rhs.lhs, root_node.lhs);
+
+        new_root_rhs.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+        new_root_rhs.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+        new_root_rhs.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+        new_root_rhs.rhs.subtype      = INVALID_SUBTYPE;
+        new_root_rhs.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+        new_root_rhs.rhs.node_index   = leaf.rhs.node_index;
+
+        // work on subexpression:
+        // TODO: Catch exception, free temporary, then rethrow
+        detail::execute_composite(s, new_root_rhs);
+      }
+
+      // Part 2: Run the actual computations //
+
+      lhs_rhs_element x = lhs_needs_temporary ? new_root_lhs.lhs : leaf.lhs;
+      lhs_rhs_element y = rhs_needs_temporary ? new_root_rhs.lhs : leaf.rhs;
+
+      if (root_node.lhs.type_family == VECTOR_TYPE_FAMILY)
+      {
+        if (root_node.op.type != OPERATION_BINARY_ASSIGN_TYPE)
+        {
+          //split y += A*x
+          statement_node new_root_z;
+          detail::new_element(new_root_z.lhs, root_node.lhs);
+
+          // compute z = A * x
+          detail::matrix_vector_prod(s, new_root_z.lhs, x, y);
+
+          // assignment y = z
+          double alpha = 0;
+          if (root_node.op.type == OPERATION_BINARY_INPLACE_ADD_TYPE)
+            alpha = 1.0;
+          else if (root_node.op.type == OPERATION_BINARY_INPLACE_SUB_TYPE)
+            alpha = -1.0;
+          else
+            throw statement_not_supported_exception("Invalid assignment type for matrix-vector product");
+
+          lhs_rhs_element y = root_node.lhs;
+          detail::axbx(y,
+                       y, 1.0, 1, false, false,
+                       new_root_z.lhs, alpha, 1, false, false);
+
+          detail::delete_element(new_root_z.lhs);
+        }
+        else
+          detail::matrix_vector_prod(s, root_node.lhs, x, y);
+      }
+      else
+      {
+        double alpha = (root_node.op.type == OPERATION_BINARY_INPLACE_SUB_TYPE) ? -1.0 : 1.0;
+        double beta  = (root_node.op.type != OPERATION_BINARY_ASSIGN_TYPE)      ?  1.0 : 0.0;
+
+        detail::matrix_matrix_prod(s, root_node.lhs, x, y, alpha, beta);
+      }
+
+      // Part 3: Clean up //
+
+      if (lhs_needs_temporary)
+        detail::delete_element(new_root_lhs.lhs);
+
+      if (rhs_needs_temporary)
+        detail::delete_element(new_root_rhs.lhs);
+    }
+
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_scalar_assign.hpp b/viennacl/scheduler/execute_scalar_assign.hpp
new file mode 100644
index 0000000..f3265d2
--- /dev/null
+++ b/viennacl/scheduler/execute_scalar_assign.hpp
@@ -0,0 +1,189 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_SCALAR_ASSIGN_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_SCALAR_ASSIGN_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_scalar_assign.hpp
+    @brief Deals with the execution of x = RHS; for a vector x and any compatible right hand side expression RHS.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_vector_dispatcher.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    /** @brief Deals with x = RHS where RHS is a vector expression */
+    inline void execute_scalar_assign_composite(statement const & s, statement_node const & root_node)
+    {
+      statement_node const & leaf = s.array()[root_node.rhs.node_index];
+
+      if (leaf.op.type  == OPERATION_BINARY_INNER_PROD_TYPE) // alpha = inner_prod( (x), (y) ) with x, y being either vectors or expressions
+      {
+        assert(root_node.lhs.type_family == SCALAR_TYPE_FAMILY && bool("Inner product requires assignment to scalar type!"));
+
+        if (   leaf.lhs.type_family == VECTOR_TYPE_FAMILY
+            && leaf.rhs.type_family == VECTOR_TYPE_FAMILY)
+
+        {
+          detail::inner_prod_impl(leaf.lhs, leaf.rhs, root_node.lhs);
+        }
+        else if (   leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY  // temporary for (x)
+                 && leaf.rhs.type_family == VECTOR_TYPE_FAMILY)
+        {
+          statement_node new_root_x;
+
+          detail::new_element(new_root_x.lhs, leaf.rhs);
+
+          new_root_x.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+          new_root_x.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+          new_root_x.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+          new_root_x.rhs.subtype      = INVALID_SUBTYPE;
+          new_root_x.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+          new_root_x.rhs.node_index   = leaf.lhs.node_index;
+
+          // work on subexpression:
+          // TODO: Catch exception, free temporary, then rethrow
+          detail::execute_composite(s, new_root_x);
+
+          detail::inner_prod_impl(new_root_x.lhs, leaf.rhs, root_node.lhs);
+
+          detail::delete_element(new_root_x.lhs);
+        }
+        else if (   leaf.lhs.type_family == VECTOR_TYPE_FAMILY
+                 && leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY) // temporary for (y)
+        {
+          statement_node new_root_y;
+
+          detail::new_element(new_root_y.lhs, leaf.lhs);
+
+          new_root_y.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+          new_root_y.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+          new_root_y.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+          new_root_y.rhs.subtype      = INVALID_SUBTYPE;
+          new_root_y.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+          new_root_y.rhs.node_index   = leaf.rhs.node_index;
+
+          // work on subexpression:
+          // TODO: Catch exception, free temporary, then rethrow
+          detail::execute_composite(s, new_root_y);
+
+          detail::inner_prod_impl(leaf.lhs, new_root_y.lhs, root_node.lhs);
+
+          detail::delete_element(new_root_y.lhs);
+        }
+        else if (   leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY   // temporary for (x)
+                 && leaf.rhs.type_family == COMPOSITE_OPERATION_FAMILY)  // temporary for (y)
+        {
+          // extract size information from vectors:
+          lhs_rhs_element const & temp_node = detail::extract_representative_vector(s, leaf.lhs);
+
+          // temporary for (x)
+          statement_node new_root_x;
+          detail::new_element(new_root_x.lhs, temp_node);
+
+          new_root_x.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+          new_root_x.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+          new_root_x.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+          new_root_x.rhs.subtype      = INVALID_SUBTYPE;
+          new_root_x.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+          new_root_x.rhs.node_index   = leaf.lhs.node_index;
+
+          // work on subexpression:
+          // TODO: Catch exception, free temporary, then rethrow
+          detail::execute_composite(s, new_root_x);
+
+          // temporary for (y)
+          statement_node new_root_y;
+          detail::new_element(new_root_y.lhs, temp_node);
+
+          new_root_y.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+          new_root_y.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+          new_root_y.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+          new_root_y.rhs.subtype      = INVALID_SUBTYPE;
+          new_root_y.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+          new_root_y.rhs.node_index   = leaf.rhs.node_index;
+
+          // work on subexpression:
+          // TODO: Catch exception, free temporary, then rethrow
+          detail::execute_composite(s, new_root_y);
+
+          // compute inner product:
+          detail::inner_prod_impl(new_root_x.lhs, new_root_y.lhs, root_node.lhs);
+
+          detail::delete_element(new_root_x.lhs);
+          detail::delete_element(new_root_y.lhs);
+        }
+        else
+          throw statement_not_supported_exception("Cannot deal with inner product of the provided arguments");
+      }
+      else if (   leaf.op.type  == OPERATION_UNARY_NORM_1_TYPE
+               || leaf.op.type  == OPERATION_UNARY_NORM_2_TYPE
+               || leaf.op.type  == OPERATION_UNARY_NORM_INF_TYPE)
+      {
+        assert(root_node.lhs.type_family == SCALAR_TYPE_FAMILY && bool("Inner product requires assignment to scalar type!"));
+
+        if (leaf.lhs.type_family == VECTOR_TYPE_FAMILY)
+        {
+          detail::norm_impl(leaf.lhs, root_node.lhs, leaf.op.type);
+        }
+        else if (leaf.lhs.type_family == COMPOSITE_OPERATION_FAMILY) //introduce temporary:
+        {
+          lhs_rhs_element const & temp_node = detail::extract_representative_vector(s, leaf.lhs);
+
+          statement_node new_root_y;
+
+          detail::new_element(new_root_y.lhs, temp_node);
+
+          new_root_y.op.type_family = OPERATION_BINARY_TYPE_FAMILY;
+          new_root_y.op.type        = OPERATION_BINARY_ASSIGN_TYPE;
+
+          new_root_y.rhs.type_family  = COMPOSITE_OPERATION_FAMILY;
+          new_root_y.rhs.subtype      = INVALID_SUBTYPE;
+          new_root_y.rhs.numeric_type = INVALID_NUMERIC_TYPE;
+          new_root_y.rhs.node_index   = leaf.lhs.node_index;
+
+          // work on subexpression:
+          // TODO: Catch exception, free temporary, then rethrow
+          detail::execute_composite(s, new_root_y);
+
+          detail::norm_impl(new_root_y.lhs, root_node.lhs, leaf.op.type);
+
+          detail::delete_element(new_root_y.lhs);
+        }
+        else
+          throw statement_not_supported_exception("Cannot deal with norm_inf of the provided arguments");
+      }
+      else
+        throw statement_not_supported_exception("Unsupported operation for scalar.");
+    }
+
+
+  }
+
+} //namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_scalar_dispatcher.hpp b/viennacl/scheduler/execute_scalar_dispatcher.hpp
new file mode 100644
index 0000000..e932a92
--- /dev/null
+++ b/viennacl/scheduler/execute_scalar_dispatcher.hpp
@@ -0,0 +1,131 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_SCALAR_DISPATCHER_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_SCALAR_DISPATCHER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_scalar_dispatcher.hpp
+    @brief Provides wrappers for as(), asbs(), asbs_s(), etc. in viennacl/linalg/scalar_operations.hpp such that scheduler logic is not cluttered with numeric type decutions
+*/
+
+#include <assert.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_util.hpp"
+#include "viennacl/linalg/scalar_operations.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      /** @brief Wrapper for viennacl::linalg::av(), taking care of the argument unwrapping */
+      template <typename ScalarType1>
+      void as(lhs_rhs_element & s1,
+              lhs_rhs_element const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        assert(   s1.type_family == SCALAR_TYPE_FAMILY && (s1.subtype == HOST_SCALAR_TYPE || s1.subtype == DEVICE_SCALAR_TYPE)
+               && s2.type_family == SCALAR_TYPE_FAMILY && (s2.subtype == HOST_SCALAR_TYPE || s2.subtype == DEVICE_SCALAR_TYPE)
+               && bool("Arguments are not vector types!"));
+
+        switch (s1.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(s2.numeric_type == FLOAT_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::av(*s1.vector_float,
+                                 *s2.vector_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+          case DOUBLE_TYPE:
+            assert(s2.numeric_type == DOUBLE_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::av(*s1.vector_double,
+                                 *s2.vector_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling av()");
+        }
+      }
+
+      /** @brief Wrapper for viennacl::linalg::avbv(), taking care of the argument unwrapping */
+      template <typename ScalarType1, typename ScalarType2>
+      void asbs(lhs_rhs_element & s1,
+                lhs_rhs_element const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                lhs_rhs_element const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   s1.type_family == SCALAR_TYPE_FAMILY && (s1.subtype == HOST_SCALAR_TYPE || s1.subtype == DEVICE_SCALAR_TYPE)
+               && s2.type_family == SCALAR_TYPE_FAMILY && (s2.subtype == HOST_SCALAR_TYPE || s2.subtype == DEVICE_SCALAR_TYPE)
+               && s3.type_family == SCALAR_TYPE_FAMILY && (s3.subtype == HOST_SCALAR_TYPE || s3.subtype == DEVICE_SCALAR_TYPE)
+               && bool("Arguments are not vector types!"));
+
+        switch (s1.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(s2.numeric_type == FLOAT_TYPE && s3.numeric_type == FLOAT_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv(*s1.vector_float,
+                                   *s2.vector_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *s3.vector_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            assert(s2.numeric_type == DOUBLE_TYPE && s3.numeric_type == DOUBLE_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv(*s1.vector_double,
+                                   *s2.vector_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *s3.vector_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling avbv()");
+        }
+      }
+
+      /** @brief Wrapper for viennacl::linalg::avbv_v(), taking care of the argument unwrapping */
+      template <typename ScalarType1, typename ScalarType2>
+      void asbs_s(lhs_rhs_element & s1,
+                  lhs_rhs_element const & s2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  lhs_rhs_element const & s3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   s1.type_family == SCALAR_TYPE_FAMILY && (s1.subtype == HOST_SCALAR_TYPE || s1.subtype == DEVICE_SCALAR_TYPE)
+               && s2.type_family == SCALAR_TYPE_FAMILY && (s2.subtype == HOST_SCALAR_TYPE || s2.subtype == DEVICE_SCALAR_TYPE)
+               && s3.type_family == SCALAR_TYPE_FAMILY && (s3.subtype == HOST_SCALAR_TYPE || s3.subtype == DEVICE_SCALAR_TYPE)
+               && bool("Arguments are not vector types!"));
+
+        switch (s1.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(s2.numeric_type == FLOAT_TYPE && s3.numeric_type == FLOAT_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv_v(*s1.vector_float,
+                                     *s2.vector_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *s3.vector_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            assert(s2.numeric_type == DOUBLE_TYPE && s3.numeric_type == DOUBLE_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv_v(*s1.vector_double,
+                                     *s2.vector_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *s3.vector_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling avbv_v()");
+        }
+      }
+
+
+    } // namespace detail
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_util.hpp b/viennacl/scheduler/execute_util.hpp
new file mode 100644
index 0000000..c8a58c3
--- /dev/null
+++ b/viennacl/scheduler/execute_util.hpp
@@ -0,0 +1,253 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_UTIL_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_UTIL_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_util.hpp
+    @brief Provides various utilities for implementing the execution of statements
+*/
+
+#include <assert.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      //
+      inline lhs_rhs_element const & extract_representative_vector(statement const & s, lhs_rhs_element const & element)
+      {
+        switch (element.type_family)
+        {
+        case VECTOR_TYPE_FAMILY:
+          return element;
+        case COMPOSITE_OPERATION_FAMILY:
+        {
+          statement_node const & leaf = s.array()[element.node_index];
+
+          if (leaf.op.type_family == OPERATION_UNARY_TYPE_FAMILY)
+            return extract_representative_vector(s, leaf.lhs);
+          switch (leaf.op.type)
+          {
+          case OPERATION_BINARY_ADD_TYPE:
+          case OPERATION_BINARY_SUB_TYPE:
+          case OPERATION_BINARY_MULT_TYPE:
+          case OPERATION_BINARY_DIV_TYPE:
+          case OPERATION_BINARY_ELEMENT_PROD_TYPE:
+          case OPERATION_BINARY_ELEMENT_DIV_TYPE:
+            return extract_representative_vector(s, leaf.lhs);
+          case OPERATION_BINARY_MAT_VEC_PROD_TYPE:
+            return extract_representative_vector(s, leaf.rhs);
+          default:
+            throw statement_not_supported_exception("Vector leaf encountered an invalid binary operation!");
+          }
+        }
+        default:
+          throw statement_not_supported_exception("Vector leaf encountered an invalid node type!");
+        }
+      }
+
+
+      // helper routines for extracting the scalar type
+      inline float convert_to_float(float f) { return f; }
+      inline float convert_to_float(double d) { return static_cast<float>(d); }
+      inline float convert_to_float(lhs_rhs_element const & el)
+      {
+        if (el.type_family == SCALAR_TYPE_FAMILY && el.subtype == HOST_SCALAR_TYPE && el.numeric_type == FLOAT_TYPE)
+          return el.host_float;
+        if (el.type_family == SCALAR_TYPE_FAMILY && el.subtype == DEVICE_SCALAR_TYPE && el.numeric_type == FLOAT_TYPE)
+          return *el.scalar_float;
+
+        throw statement_not_supported_exception("Cannot convert to float");
+      }
+
+      // helper routines for extracting the scalar type
+      inline double convert_to_double(float d) { return static_cast<double>(d); }
+      inline double convert_to_double(double d) { return d; }
+      inline double convert_to_double(lhs_rhs_element const & el)
+      {
+        if (el.type_family == SCALAR_TYPE_FAMILY && el.subtype == HOST_SCALAR_TYPE && el.numeric_type == DOUBLE_TYPE)
+          return el.host_double;
+        if (el.type_family == SCALAR_TYPE_FAMILY && el.subtype == DEVICE_SCALAR_TYPE && el.numeric_type == DOUBLE_TYPE)
+          return *el.scalar_double;
+
+        throw statement_not_supported_exception("Cannot convert to double");
+      }
+
+      /////////////////// Create/Destory temporary vector ///////////////////////
+
+      inline void new_element(lhs_rhs_element & new_elem, lhs_rhs_element const & old_element)
+      {
+        new_elem.type_family  = old_element.type_family;
+        new_elem.subtype      = old_element.subtype;
+        new_elem.numeric_type = old_element.numeric_type;
+        if (new_elem.type_family == SCALAR_TYPE_FAMILY)
+        {
+          assert(new_elem.subtype == DEVICE_SCALAR_TYPE && bool("Expected a device scalar in root node"));
+
+          switch (new_elem.numeric_type)
+          {
+            case FLOAT_TYPE:
+              new_elem.scalar_float = new viennacl::scalar<float>();
+              return;
+            case DOUBLE_TYPE:
+              new_elem.scalar_double = new viennacl::scalar<double>();
+              return;
+            default:
+              throw statement_not_supported_exception("Invalid vector type for vector construction");
+          }
+        }
+        else if (new_elem.type_family == VECTOR_TYPE_FAMILY)
+        {
+          assert(new_elem.subtype == DENSE_VECTOR_TYPE && bool("Expected a dense vector in root node"));
+
+          switch (new_elem.numeric_type)
+          {
+            case FLOAT_TYPE:
+              new_elem.vector_float = new viennacl::vector<float>((old_element.vector_float)->size());
+              return;
+            case DOUBLE_TYPE:
+              new_elem.vector_double = new viennacl::vector<double>((old_element.vector_float)->size());
+              return;
+            default:
+              throw statement_not_supported_exception("Invalid vector type for vector construction");
+          }
+        }
+        else if (new_elem.type_family == MATRIX_TYPE_FAMILY)
+        {
+          assert( (new_elem.subtype == DENSE_COL_MATRIX_TYPE || new_elem.subtype == DENSE_ROW_MATRIX_TYPE)
+                 && bool("Expected a dense matrix in root node"));
+
+          if (new_elem.subtype == DENSE_COL_MATRIX_TYPE)
+          {
+            switch (new_elem.numeric_type)
+            {
+              case FLOAT_TYPE:
+                new_elem.matrix_col_float = new viennacl::matrix<float, viennacl::column_major>((old_element.matrix_col_float)->size1(), (old_element.matrix_col_float)->size2());
+                return;
+              case DOUBLE_TYPE:
+                new_elem.matrix_col_double = new viennacl::matrix<double, viennacl::column_major>((old_element.matrix_col_double)->size1(), (old_element.matrix_col_double)->size2());
+                return;
+              default:
+                throw statement_not_supported_exception("Invalid vector type for vector construction");
+            }
+          }
+          else if (new_elem.subtype == DENSE_ROW_MATRIX_TYPE)
+          {
+            switch (new_elem.numeric_type)
+            {
+              case FLOAT_TYPE:
+                new_elem.matrix_row_float = new viennacl::matrix<float, viennacl::row_major>((old_element.matrix_row_float)->size1(), (old_element.matrix_row_float)->size2());
+                return;
+              case DOUBLE_TYPE:
+                new_elem.matrix_row_double = new viennacl::matrix<double, viennacl::row_major>((old_element.matrix_row_double)->size1(), (old_element.matrix_row_double)->size2());
+                return;
+              default:
+                throw statement_not_supported_exception("Invalid vector type for vector construction");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Expected a dense matrix in root node when creating a temporary");
+        }
+        else
+          throw statement_not_supported_exception("Unknown type familty when creating new temporary object");
+      }
+
+      inline void delete_element(lhs_rhs_element & elem)
+      {
+        if (elem.type_family == SCALAR_TYPE_FAMILY)
+        {
+          switch (elem.numeric_type)
+          {
+            case FLOAT_TYPE:
+              delete elem.scalar_float;
+              return;
+            case DOUBLE_TYPE:
+              delete elem.scalar_double;
+              return;
+            default:
+              throw statement_not_supported_exception("Invalid vector type for vector destruction");
+          }
+        }
+        else if (elem.type_family == VECTOR_TYPE_FAMILY)
+        {
+          switch (elem.numeric_type)
+          {
+            case FLOAT_TYPE:
+              delete elem.vector_float;
+              return;
+            case DOUBLE_TYPE:
+              delete elem.vector_double;
+              return;
+            default:
+              throw statement_not_supported_exception("Invalid vector type for vector destruction");
+          }
+        }
+        else if (elem.type_family == MATRIX_TYPE_FAMILY)
+        {
+          if (elem.subtype == DENSE_COL_MATRIX_TYPE)
+          {
+            switch (elem.numeric_type)
+            {
+              case FLOAT_TYPE:
+                delete elem.matrix_col_float;
+                return;
+              case DOUBLE_TYPE:
+                delete elem.matrix_col_double;
+                return;
+              default:
+                throw statement_not_supported_exception("Invalid vector type for vector destruction");
+            }
+          }
+          else if (elem.subtype == DENSE_ROW_MATRIX_TYPE)
+          {
+            switch (elem.numeric_type)
+            {
+              case FLOAT_TYPE:
+                delete elem.matrix_row_float;
+                return;
+              case DOUBLE_TYPE:
+                delete elem.matrix_row_double;
+                return;
+              default:
+                throw statement_not_supported_exception("Invalid vector type for vector destruction");
+            }
+          }
+          else
+            throw statement_not_supported_exception("Expected a dense matrix in root node when deleting temporary");
+        }
+        else
+          throw statement_not_supported_exception("Unknown type familty when deleting temporary object");
+      }
+
+    } // namespace detail
+
+
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/execute_vector_dispatcher.hpp b/viennacl/scheduler/execute_vector_dispatcher.hpp
new file mode 100644
index 0000000..b9c6c64
--- /dev/null
+++ b/viennacl/scheduler/execute_vector_dispatcher.hpp
@@ -0,0 +1,191 @@
+#ifndef VIENNACL_SCHEDULER_EXECUTE_VECTOR_DISPATCHER_HPP
+#define VIENNACL_SCHEDULER_EXECUTE_VECTOR_DISPATCHER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/execute_vector_dispatcher.hpp
+    @brief Provides wrappers for av(), avbv(), avbv_v(), etc. in viennacl/linalg/vector_operations.hpp such that scheduler logic is not cluttered with numeric type decutions
+*/
+
+#include <assert.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/execute_util.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+    namespace detail
+    {
+      /** @brief Wrapper for viennacl::linalg::av(), taking care of the argument unwrapping */
+      template <typename ScalarType1>
+      void av(lhs_rhs_element & vec1,
+              lhs_rhs_element const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+      {
+        assert(   vec1.type_family == VECTOR_TYPE_FAMILY && vec1.subtype == DENSE_VECTOR_TYPE
+               && vec2.type_family == VECTOR_TYPE_FAMILY && vec2.subtype == DENSE_VECTOR_TYPE
+               && bool("Arguments are not vector types!"));
+
+        switch (vec1.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(vec2.numeric_type == FLOAT_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::av(*vec1.vector_float,
+                                 *vec2.vector_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+          case DOUBLE_TYPE:
+            assert(vec2.numeric_type == DOUBLE_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::av(*vec1.vector_double,
+                                 *vec2.vector_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling av()");
+        }
+      }
+
+      /** @brief Wrapper for viennacl::linalg::avbv(), taking care of the argument unwrapping */
+      template <typename ScalarType1, typename ScalarType2>
+      void avbv(lhs_rhs_element & vec1,
+                lhs_rhs_element const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                lhs_rhs_element const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        assert(   vec1.type_family == VECTOR_TYPE_FAMILY && vec1.subtype == DENSE_VECTOR_TYPE
+               && vec2.type_family == VECTOR_TYPE_FAMILY && vec2.subtype == DENSE_VECTOR_TYPE
+               && vec3.type_family == VECTOR_TYPE_FAMILY && vec3.subtype == DENSE_VECTOR_TYPE
+               && bool("Arguments are not vector types!"));
+
+        switch (vec1.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(vec2.numeric_type == FLOAT_TYPE && vec3.numeric_type == FLOAT_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv(*vec1.vector_float,
+                                   *vec2.vector_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *vec3.vector_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            assert(vec2.numeric_type == DOUBLE_TYPE && vec3.numeric_type == DOUBLE_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv(*vec1.vector_double,
+                                   *vec2.vector_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                   *vec3.vector_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling avbv()");
+        }
+      }
+
+      /** @brief Wrapper for viennacl::linalg::avbv_v(), taking care of the argument unwrapping
+        *
+        * Unwraps the three vector operands from their lhs_rhs_element wrappers, converts the two
+        * coefficients to the matching precision, and forwards everything to viennacl::linalg::avbv_v().
+        * Presumably this realizes vec1 += alpha * vec2 + beta * vec3 — confirm against viennacl::linalg::avbv_v().
+        *
+        * @param vec1             Destination operand; must wrap a dense vector
+        * @param vec2             First source operand; must wrap a dense vector of the same numeric type as vec1
+        * @param alpha            Coefficient for vec2; converted via convert_to_float()/convert_to_double()
+        * @param len_alpha        Forwarded unchanged to viennacl::linalg::avbv_v()
+        * @param reciprocal_alpha Forwarded unchanged to viennacl::linalg::avbv_v()
+        * @param flip_sign_alpha  Forwarded unchanged to viennacl::linalg::avbv_v()
+        * @param vec3             Second source operand; must wrap a dense vector of the same numeric type as vec1
+        * @param beta             Coefficient for vec3; converted via convert_to_float()/convert_to_double()
+        * @param len_beta         Forwarded unchanged to viennacl::linalg::avbv_v()
+        * @param reciprocal_beta  Forwarded unchanged to viennacl::linalg::avbv_v()
+        * @param flip_sign_beta   Forwarded unchanged to viennacl::linalg::avbv_v()
+        *
+        * @throws statement_not_supported_exception if the numeric type is neither float nor double
+        */
+      template <typename ScalarType1, typename ScalarType2>
+      void avbv_v(lhs_rhs_element & vec1,
+                  lhs_rhs_element const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+                  lhs_rhs_element const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+      {
+        // All three operands must be dense vectors (checked in debug builds only)
+        assert(   vec1.type_family == VECTOR_TYPE_FAMILY && vec1.subtype == DENSE_VECTOR_TYPE
+               && vec2.type_family == VECTOR_TYPE_FAMILY && vec2.subtype == DENSE_VECTOR_TYPE
+               && vec3.type_family == VECTOR_TYPE_FAMILY && vec3.subtype == DENSE_VECTOR_TYPE
+               && bool("Arguments are not vector types!"));
+
+        // Dispatch on the runtime numeric type; only float and double are supported here
+        switch (vec1.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(vec2.numeric_type == FLOAT_TYPE && vec3.numeric_type == FLOAT_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv_v(*vec1.vector_float,
+                                     *vec2.vector_float, convert_to_float(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *vec3.vector_float, convert_to_float(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          case DOUBLE_TYPE:
+            assert(vec2.numeric_type == DOUBLE_TYPE && vec3.numeric_type == DOUBLE_TYPE && bool("Vectors do not have the same scalar type"));
+            viennacl::linalg::avbv_v(*vec1.vector_double,
+                                     *vec2.vector_double, convert_to_double(alpha), len_alpha, reciprocal_alpha, flip_sign_alpha,
+                                     *vec3.vector_double, convert_to_double(beta),  len_beta,  reciprocal_beta,  flip_sign_beta);
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling avbv_v()");
+        }
+      }
+
+
+      /** @brief Dispatcher interface for computing s = norm_1(x), s = norm_2(x), or s = norm_inf(x)
+        *
+        * @param x        Operand; must wrap a dense vector
+        * @param s        Result; must wrap a device scalar of the same numeric type as x
+        * @param op_type  One of OPERATION_UNARY_NORM_1_TYPE, OPERATION_UNARY_NORM_2_TYPE,
+        *                 OPERATION_UNARY_NORM_INF_TYPE, selecting the norm to compute
+        *
+        * @throws statement_not_supported_exception for an unknown norm type or unsupported numeric type
+        */
+      inline void norm_impl(lhs_rhs_element const & x,
+                            lhs_rhs_element const & s,
+                            operation_node_type op_type)
+      {
+        assert( x.type_family == VECTOR_TYPE_FAMILY && x.subtype == DENSE_VECTOR_TYPE && bool("Argument is not a dense vector type!"));
+        assert( s.type_family == SCALAR_TYPE_FAMILY && s.subtype == DEVICE_SCALAR_TYPE && bool("Argument is not a scalar type!"));
+
+        switch (x.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(s.numeric_type == FLOAT_TYPE && bool("Vector and scalar do not have the same numeric type"));
+            if (op_type == OPERATION_UNARY_NORM_1_TYPE)
+              viennacl::linalg::norm_1_impl(*x.vector_float, *s.scalar_float);
+            else if (op_type == OPERATION_UNARY_NORM_2_TYPE)
+              viennacl::linalg::norm_2_impl(*x.vector_float, *s.scalar_float);
+            else if (op_type == OPERATION_UNARY_NORM_INF_TYPE)
+              viennacl::linalg::norm_inf_impl(*x.vector_float, *s.scalar_float);
+            else
+              throw statement_not_supported_exception("Invalid norm type in scheduler::detail::norm_impl()");
+            break;
+          case DOUBLE_TYPE:
+            // Check type agreement here as well (the FLOAT_TYPE branch above performs the same check)
+            assert(s.numeric_type == DOUBLE_TYPE && bool("Vector and scalar do not have the same numeric type"));
+            if (op_type == OPERATION_UNARY_NORM_1_TYPE)
+              viennacl::linalg::norm_1_impl(*x.vector_double, *s.scalar_double);
+            else if (op_type == OPERATION_UNARY_NORM_2_TYPE)
+              viennacl::linalg::norm_2_impl(*x.vector_double, *s.scalar_double);
+            else if (op_type == OPERATION_UNARY_NORM_INF_TYPE)
+              viennacl::linalg::norm_inf_impl(*x.vector_double, *s.scalar_double);
+            else
+              throw statement_not_supported_exception("Invalid norm type in scheduler::detail::norm_impl()");
+            break;
+          default:
+            throw statement_not_supported_exception("Invalid numeric type in scheduler when calling norm_impl()");
+        }
+      }
+
+      /** @brief Dispatcher interface for computing s = inner_prod(x, y)
+        *
+        * @param x  First operand; must wrap a dense vector
+        * @param y  Second operand; must wrap a dense vector of the same numeric type as x
+        * @param s  Result; must wrap a device scalar of the same numeric type as x
+        *
+        * @throws statement_not_supported_exception if the numeric type is neither float nor double
+        */
+      inline void inner_prod_impl(lhs_rhs_element const & x,
+                                  lhs_rhs_element const & y,
+                                  lhs_rhs_element const & s)
+      {
+        assert( x.type_family == VECTOR_TYPE_FAMILY && x.subtype == DENSE_VECTOR_TYPE && bool("Argument is not a dense vector type!"));
+        assert( y.type_family == VECTOR_TYPE_FAMILY && y.subtype == DENSE_VECTOR_TYPE && bool("Argument is not a dense vector type!"));
+        assert( s.type_family == SCALAR_TYPE_FAMILY && s.subtype == DEVICE_SCALAR_TYPE && bool("Argument is not a scalar type!"));
+
+        switch (x.numeric_type)
+        {
+          case FLOAT_TYPE:
+            assert(y.numeric_type == FLOAT_TYPE && s.numeric_type == FLOAT_TYPE && bool("Vector and scalar do not have the same numeric type"));
+            viennacl::linalg::inner_prod_impl(*x.vector_float, *y.vector_float, *s.scalar_float);
+            break;
+          case DOUBLE_TYPE:
+            assert(y.numeric_type == DOUBLE_TYPE && s.numeric_type == DOUBLE_TYPE && bool("Vector and scalar do not have the same numeric type"));
+            viennacl::linalg::inner_prod_impl(*x.vector_double, *y.vector_double, *s.scalar_double);
+            break;
+          default:
+            // Fixed copy-paste artifact: the message previously referred to av() instead of inner_prod()
+            throw statement_not_supported_exception("Invalid arguments in scheduler when calling inner_prod()");
+        }
+      }
+
+    } // namespace detail
+  } // namespace scheduler
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/forwards.h b/viennacl/scheduler/forwards.h
new file mode 100644
index 0000000..e811466
--- /dev/null
+++ b/viennacl/scheduler/forwards.h
@@ -0,0 +1,710 @@
+#ifndef VIENNACL_SCHEDULER_STATEMENT_HPP
+#define VIENNACL_SCHEDULER_STATEMENT_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/forwards.h
+    @brief Provides the datastructures for dealing with a single statement such as 'x = y + z;'
+*/
+
+#include "viennacl/forwards.h"
+
+#include <vector>
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+
+    /** @brief Exception for the case the scheduler is unable to deal with the operation */
+    class statement_not_supported_exception : public std::exception
+    {
+    public:
+      // Default-constructed exception carries an empty message (what() returns "")
+      statement_not_supported_exception() : message_() {}
+      // NOTE(review): single-argument constructor is intentionally non-explicit so a plain
+      // string can be thrown/converted at call sites; the fixed prefix marks this as internal
+      statement_not_supported_exception(std::string message) : message_("ViennaCL: Internal error: The scheduler encountered a problem with the operation provided: " + message) {}
+
+      // Returns the stored message; valid only while this exception object is alive
+      virtual const char* what() const throw() { return message_.c_str(); }
+
+      virtual ~statement_not_supported_exception() throw() {}
+    private:
+      std::string message_;
+    };
+
+
+    /** @brief Optimization enum for grouping operations into unary or binary operations. Just for optimization of lookups. */
+    enum operation_node_type_family
+    {
+      OPERATION_INVALID_TYPE_FAMILY = 0,
+
+      // operation takes a single operand (element-wise functions, trans, norms)
+      OPERATION_UNARY_TYPE_FAMILY,
+      // operation takes two operands (add, sub, products, etc.)
+      OPERATION_BINARY_TYPE_FAMILY
+    };
+
+    /** @brief Enumeration for identifying the possible operations. Each value belongs to exactly one operation_node_type_family. */
+    enum operation_node_type
+    {
+      OPERATION_INVALID_TYPE = 0,
+
+      // unary expression
+      OPERATION_UNARY_ABS_TYPE,
+      OPERATION_UNARY_ACOS_TYPE,
+      OPERATION_UNARY_ASIN_TYPE,
+      OPERATION_UNARY_ATAN_TYPE,
+      OPERATION_UNARY_CEIL_TYPE,
+      OPERATION_UNARY_COS_TYPE,
+      OPERATION_UNARY_COSH_TYPE,
+      OPERATION_UNARY_EXP_TYPE,
+      OPERATION_UNARY_FABS_TYPE,
+      OPERATION_UNARY_FLOOR_TYPE,
+      OPERATION_UNARY_LOG_TYPE,
+      OPERATION_UNARY_LOG10_TYPE,
+      OPERATION_UNARY_SIN_TYPE,
+      OPERATION_UNARY_SINH_TYPE,
+      OPERATION_UNARY_SQRT_TYPE,
+      OPERATION_UNARY_TAN_TYPE,
+      OPERATION_UNARY_TANH_TYPE,
+      OPERATION_UNARY_TRANS_TYPE,
+      // unary reductions (vector -> scalar), dispatched via detail::norm_impl()
+      OPERATION_UNARY_NORM_1_TYPE,
+      OPERATION_UNARY_NORM_2_TYPE,
+      OPERATION_UNARY_NORM_INF_TYPE,
+
+      // binary expression
+      OPERATION_BINARY_ACCESS_TYPE,
+      OPERATION_BINARY_ASSIGN_TYPE,
+      OPERATION_BINARY_INPLACE_ADD_TYPE,
+      OPERATION_BINARY_INPLACE_SUB_TYPE,
+      OPERATION_BINARY_ADD_TYPE,
+      OPERATION_BINARY_SUB_TYPE,
+      OPERATION_BINARY_MAT_VEC_PROD_TYPE,
+      OPERATION_BINARY_MAT_MAT_PROD_TYPE,
+      OPERATION_BINARY_MULT_TYPE,    // scalar times vector/matrix
+      OPERATION_BINARY_DIV_TYPE,     // vector/matrix divided by scalar
+      OPERATION_BINARY_ELEMENT_PROD_TYPE,
+      OPERATION_BINARY_ELEMENT_DIV_TYPE,
+      OPERATION_BINARY_INNER_PROD_TYPE
+    };
+
+
+
+    namespace result_of
+    {
+      /** @brief Helper metafunction for obtaining the operation ID as well as the operation family for unary and binary operations on vectors and matrices. */
+      template <typename T>
+      struct op_type_info
+      {
+        // Primary template: instantiating it for an unmapped operation tag fails to compile,
+        // because no operation tag provides a nested ERROR_UNKNOWN_OP_TYPE type
+        typedef typename T::ERROR_UNKNOWN_OP_TYPE   error_type;
+      };
+
+      /** \cond */
+
+      // unary operations
+      template <> struct op_type_info<op_element_unary<op_abs>   > { enum { id = OPERATION_UNARY_ABS_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_acos>  > { enum { id = OPERATION_UNARY_ACOS_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_asin>  > { enum { id = OPERATION_UNARY_ASIN_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_atan>  > { enum { id = OPERATION_UNARY_ATAN_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_ceil>  > { enum { id = OPERATION_UNARY_CEIL_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_cos>   > { enum { id = OPERATION_UNARY_COS_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_cosh>  > { enum { id = OPERATION_UNARY_COSH_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_exp>   > { enum { id = OPERATION_UNARY_EXP_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_fabs>  > { enum { id = OPERATION_UNARY_FABS_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_floor> > { enum { id = OPERATION_UNARY_FLOOR_TYPE, family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_log>   > { enum { id = OPERATION_UNARY_LOG_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_log10> > { enum { id = OPERATION_UNARY_LOG10_TYPE, family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_sin>   > { enum { id = OPERATION_UNARY_SIN_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_sinh>  > { enum { id = OPERATION_UNARY_SINH_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_sqrt>  > { enum { id = OPERATION_UNARY_SQRT_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_tan>   > { enum { id = OPERATION_UNARY_TAN_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_unary<op_tanh>  > { enum { id = OPERATION_UNARY_TANH_TYPE,  family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_norm_1                  > { enum { id = OPERATION_UNARY_NORM_1_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_norm_2                  > { enum { id = OPERATION_UNARY_NORM_2_TYPE,   family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_norm_inf                > { enum { id = OPERATION_UNARY_NORM_INF_TYPE, family = OPERATION_UNARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_trans                   > { enum { id = OPERATION_UNARY_TRANS_TYPE, family = OPERATION_UNARY_TYPE_FAMILY }; };
+
+      // binary operations
+      template <> struct op_type_info<op_assign>                   { enum { id = OPERATION_BINARY_ASSIGN_TYPE,       family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_inplace_add>              { enum { id = OPERATION_BINARY_INPLACE_ADD_TYPE,  family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_inplace_sub>              { enum { id = OPERATION_BINARY_INPLACE_SUB_TYPE,  family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_add>                      { enum { id = OPERATION_BINARY_ADD_TYPE,          family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_sub>                      { enum { id = OPERATION_BINARY_SUB_TYPE,          family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_prod>                     { enum { id = OPERATION_BINARY_MAT_VEC_PROD_TYPE, family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_mat_mat_prod>             { enum { id = OPERATION_BINARY_MAT_MAT_PROD_TYPE, family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_mult>                     { enum { id = OPERATION_BINARY_MULT_TYPE,         family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_div>                      { enum { id = OPERATION_BINARY_DIV_TYPE,          family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_binary<op_prod> > { enum { id = OPERATION_BINARY_ELEMENT_PROD_TYPE, family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_element_binary<op_div>  > { enum { id = OPERATION_BINARY_ELEMENT_DIV_TYPE,  family = OPERATION_BINARY_TYPE_FAMILY }; };
+      template <> struct op_type_info<op_inner_prod>               { enum { id = OPERATION_BINARY_INNER_PROD_TYPE,   family = OPERATION_BINARY_TYPE_FAMILY }; };
+
+      /** \endcond */
+    } // namespace result_of
+
+
+
+
+
+    /** @brief Groups the type of a node in the statement tree. Used for faster dispatching */
+    enum statement_node_type_family
+    {
+      INVALID_TYPE_FAMILY = 0,
+
+      // LHS or RHS are again an expression:
+      COMPOSITE_OPERATION_FAMILY,
+
+      // device scalars:
+      SCALAR_TYPE_FAMILY,
+
+      // vector:
+      VECTOR_TYPE_FAMILY,
+
+      // matrices (dense and sparse):
+      MATRIX_TYPE_FAMILY
+    };
+
+    /** @brief Encodes the subtype (concrete container kind) of a leaf node in the statement tree. Refines statement_node_type_family. */
+    enum statement_node_subtype
+    {
+      INVALID_SUBTYPE = 0, //when type is COMPOSITE_OPERATION_FAMILY
+
+      // scalars:
+      HOST_SCALAR_TYPE,
+      DEVICE_SCALAR_TYPE,
+
+      // vectors:
+      DENSE_VECTOR_TYPE,
+      IMPLICIT_VECTOR_TYPE,
+
+      // dense matrices:
+      DENSE_ROW_MATRIX_TYPE,
+      DENSE_COL_MATRIX_TYPE,
+      IMPLICIT_MATRIX_TYPE,
+
+      // sparse matrices:
+      COMPRESSED_MATRIX_TYPE,
+      COORDINATE_MATRIX_TYPE,
+      ELL_MATRIX_TYPE,
+      HYB_MATRIX_TYPE
+
+      // other matrix types to be added here
+    };
+
+    /** @brief Encodes the underlying numeric (element) type of a leaf node in the statement tree. */
+    enum statement_node_numeric_type
+    {
+      INVALID_NUMERIC_TYPE = 0, //when type is COMPOSITE_OPERATION_FAMILY
+
+      CHAR_TYPE,
+      UCHAR_TYPE,
+      SHORT_TYPE,
+      USHORT_TYPE,
+      INT_TYPE,
+      UINT_TYPE,
+      LONG_TYPE,
+      ULONG_TYPE,
+      HALF_TYPE,  // note: result_of::numeric_type_id below has no mapping for half precision
+      FLOAT_TYPE,
+      DOUBLE_TYPE
+    };
+
+
+    namespace result_of
+    {
+      ///////////// numeric type ID deduction /////////////
+
+      /** @brief Helper metafunction for obtaining the runtime type ID for a numerical type.
+        *
+        * The unspecialized template is intentionally empty, so using an unmapped type
+        * (e.g. a half-precision type) fails at compile time when 'value' is accessed.
+        */
+      template <typename T>
+      struct numeric_type_id {};
+
+      /** \cond */
+
+      template <> struct numeric_type_id<char>           { enum { value = CHAR_TYPE   }; };
+      template <> struct numeric_type_id<unsigned char>  { enum { value = UCHAR_TYPE  }; };
+      template <> struct numeric_type_id<short>          { enum { value = SHORT_TYPE  }; };
+      template <> struct numeric_type_id<unsigned short> { enum { value = USHORT_TYPE }; };
+      template <> struct numeric_type_id<int>            { enum { value = INT_TYPE    }; };
+      template <> struct numeric_type_id<unsigned int>   { enum { value = UINT_TYPE   }; };
+      template <> struct numeric_type_id<long>           { enum { value = LONG_TYPE   }; };
+      template <> struct numeric_type_id<unsigned long>  { enum { value = ULONG_TYPE  }; };
+      template <> struct numeric_type_id<float>          { enum { value = FLOAT_TYPE  }; };
+      template <> struct numeric_type_id<double>         { enum { value = DOUBLE_TYPE }; };
+
+      /** \endcond */
+
+      ///////////// matrix layout ID deduction /////////////
+
+      /** @brief Helper metafunction for obtaining the memory layout (row-/column-major) for a matrix. */
+      template <typename F>
+      struct layout_type_id {};
+
+      /** \cond */
+
+      template <> struct layout_type_id<viennacl::column_major> { enum { value = DENSE_COL_MATRIX_TYPE }; };
+      template <> struct layout_type_id<viennacl::row_major   > { enum { value = DENSE_ROW_MATRIX_TYPE }; };
+
+      /** \endcond */
+    }
+
+
+
+    /** @brief A class representing the 'data' for the LHS or RHS operand of the respective node.
+      *
+      * If it represents a compound expression, the union holds the array index within the respective statement array.
+      * If it represents a object with data (vector, matrix, etc.) it holds the respective pointer (scalar, vector, matrix) or value (host scalar)
+      *
+      * The member 'type_family' is an optimization for quickly retrieving the 'type', which denotes the currently 'active' member in the union
+      */
+    struct lhs_rhs_element
+    {
+      statement_node_type_family   type_family;   // coarse kind: composite, scalar, vector, or matrix
+      statement_node_subtype       subtype;       // concrete container kind (dense/implicit/sparse format)
+      statement_node_numeric_type  numeric_type;  // element type of the active union member
+
+      union
+      {
+        /////// Case 1: Node is another compound expression:
+        vcl_size_t        node_index;
+
+        /////// Case 2: Node is a leaf, hence carries an operand:
+
+        // host scalars (stored by value, cheap to copy):
+        char               host_char;
+        unsigned char      host_uchar;
+        short              host_short;
+        unsigned short     host_ushort;
+        int                host_int;
+        unsigned int       host_uint;
+        long               host_long;
+        unsigned long      host_ulong;
+        float              host_float;
+        double             host_double;
+
+        // Note: ViennaCL types have potentially expensive copy-CTORs, hence using pointers:
+        // (the element does NOT own the pointees; the user-provided objects must outlive the statement)
+
+        // scalars:
+        //viennacl::scalar<char>             *scalar_char;
+        //viennacl::scalar<unsigned char>    *scalar_uchar;
+        //viennacl::scalar<short>            *scalar_short;
+        //viennacl::scalar<unsigned short>   *scalar_ushort;
+        //viennacl::scalar<int>              *scalar_int;
+        //viennacl::scalar<unsigned int>     *scalar_uint;
+        //viennacl::scalar<long>             *scalar_long;
+        //viennacl::scalar<unsigned long>    *scalar_ulong;
+        viennacl::scalar<float>            *scalar_float;
+        viennacl::scalar<double>           *scalar_double;
+
+        // vectors:
+        //viennacl::vector_base<char>             *vector_char;
+        //viennacl::vector_base<unsigned char>    *vector_uchar;
+        //viennacl::vector_base<short>            *vector_short;
+        //viennacl::vector_base<unsigned short>   *vector_ushort;
+        //viennacl::vector_base<int>              *vector_int;
+        //viennacl::vector_base<unsigned int>     *vector_uint;
+        //viennacl::vector_base<long>             *vector_long;
+        //viennacl::vector_base<unsigned long>    *vector_ulong;
+        viennacl::vector_base<float>            *vector_float;
+        viennacl::vector_base<double>           *vector_double;
+
+        // implicit vectors:
+        //viennacl::implicit_vector_base<char>             *implicit_vector_char;
+        //viennacl::implicit_vector_base<unsigned char>    *implicit_vector_uchar;
+        //viennacl::implicit_vector_base<short>            *implicit_vector_short;
+        //viennacl::implicit_vector_base<unsigned short>   *implicit_vector_ushort;
+        //viennacl::implicit_vector_base<int>              *implicit_vector_int;
+        //viennacl::implicit_vector_base<unsigned int>     *implicit_vector_uint;
+        //viennacl::implicit_vector_base<long>             *implicit_vector_long;
+        //viennacl::implicit_vector_base<unsigned long>    *implicit_vector_ulong;
+        viennacl::implicit_vector_base<float>            *implicit_vector_float;
+        viennacl::implicit_vector_base<double>           *implicit_vector_double;
+
+        // row-major matrices:
+        //viennacl::matrix_base<char>             *matrix_row_char;
+        //viennacl::matrix_base<unsigned char>    *matrix_row_uchar;
+        //viennacl::matrix_base<short>            *matrix_row_short;
+        //viennacl::matrix_base<unsigned short>   *matrix_row_ushort;
+        //viennacl::matrix_base<int>              *matrix_row_int;
+        //viennacl::matrix_base<unsigned int>     *matrix_row_uint;
+        //viennacl::matrix_base<long>             *matrix_row_long;
+        //viennacl::matrix_base<unsigned long>    *matrix_row_ulong;
+        viennacl::matrix_base<float>            *matrix_row_float;
+        viennacl::matrix_base<double>           *matrix_row_double;
+
+        // column-major matrices:
+        //viennacl::matrix_base<char,           viennacl::column_major>    *matrix_col_char;
+        //viennacl::matrix_base<unsigned char,  viennacl::column_major>    *matrix_col_uchar;
+        //viennacl::matrix_base<short,          viennacl::column_major>    *matrix_col_short;
+        //viennacl::matrix_base<unsigned short, viennacl::column_major>    *matrix_col_ushort;
+        //viennacl::matrix_base<int,            viennacl::column_major>    *matrix_col_int;
+        //viennacl::matrix_base<unsigned int,   viennacl::column_major>    *matrix_col_uint;
+        //viennacl::matrix_base<long,           viennacl::column_major>    *matrix_col_long;
+        //viennacl::matrix_base<unsigned long,  viennacl::column_major>    *matrix_col_ulong;
+        viennacl::matrix_base<float,          viennacl::column_major>    *matrix_col_float;
+        viennacl::matrix_base<double,         viennacl::column_major>    *matrix_col_double;
+
+        // implicit matrices:
+        //viennacl::implicit_matrix_base<char>             *implicit_matrix_char;
+        //viennacl::implicit_matrix_base<unsigned char>    *implicit_matrix_uchar;
+        //viennacl::implicit_matrix_base<short>            *implicit_matrix_short;
+        //viennacl::implicit_matrix_base<unsigned short>   *implicit_matrix_ushort;
+        //viennacl::implicit_matrix_base<int>              *implicit_matrix_int;
+        //viennacl::implicit_matrix_base<unsigned int>     *implicit_matrix_uint;
+        //viennacl::implicit_matrix_base<long>             *implicit_matrix_long;
+        //viennacl::implicit_matrix_base<unsigned long>    *implicit_matrix_ulong;
+        viennacl::implicit_matrix_base<float>            *implicit_matrix_float;
+        viennacl::implicit_matrix_base<double>           *implicit_matrix_double;
+
+        // sparse matrices (CSR); element types in the disabled lines fixed from a float/double copy-paste:
+        //viennacl::compressed_matrix<char>             *compressed_matrix_char;
+        //viennacl::compressed_matrix<unsigned char>    *compressed_matrix_uchar;
+        //viennacl::compressed_matrix<short>            *compressed_matrix_short;
+        //viennacl::compressed_matrix<unsigned short>   *compressed_matrix_ushort;
+        //viennacl::compressed_matrix<int>              *compressed_matrix_int;
+        //viennacl::compressed_matrix<unsigned int>     *compressed_matrix_uint;
+        //viennacl::compressed_matrix<long>             *compressed_matrix_long;
+        //viennacl::compressed_matrix<unsigned long>    *compressed_matrix_ulong;
+        viennacl::compressed_matrix<float>    *compressed_matrix_float;
+        viennacl::compressed_matrix<double>   *compressed_matrix_double;
+
+        //viennacl::coordinate_matrix<char>             *coordinate_matrix_char;
+        //viennacl::coordinate_matrix<unsigned char>    *coordinate_matrix_uchar;
+        //viennacl::coordinate_matrix<short>            *coordinate_matrix_short;
+        //viennacl::coordinate_matrix<unsigned short>   *coordinate_matrix_ushort;
+        //viennacl::coordinate_matrix<int>              *coordinate_matrix_int;
+        //viennacl::coordinate_matrix<unsigned int>     *coordinate_matrix_uint;
+        //viennacl::coordinate_matrix<long>             *coordinate_matrix_long;
+        //viennacl::coordinate_matrix<unsigned long>    *coordinate_matrix_ulong;
+        viennacl::coordinate_matrix<float>    *coordinate_matrix_float;
+        viennacl::coordinate_matrix<double>   *coordinate_matrix_double;
+
+        //viennacl::ell_matrix<char>             *ell_matrix_char;
+        //viennacl::ell_matrix<unsigned char>    *ell_matrix_uchar;
+        //viennacl::ell_matrix<short>            *ell_matrix_short;
+        //viennacl::ell_matrix<unsigned short>   *ell_matrix_ushort;
+        //viennacl::ell_matrix<int>              *ell_matrix_int;
+        //viennacl::ell_matrix<unsigned int>     *ell_matrix_uint;
+        //viennacl::ell_matrix<long>             *ell_matrix_long;
+        //viennacl::ell_matrix<unsigned long>    *ell_matrix_ulong;
+        viennacl::ell_matrix<float>    *ell_matrix_float;
+        viennacl::ell_matrix<double>   *ell_matrix_double;
+
+        //viennacl::hyb_matrix<char>             *hyb_matrix_char;
+        //viennacl::hyb_matrix<unsigned char>    *hyb_matrix_uchar;
+        //viennacl::hyb_matrix<short>            *hyb_matrix_short;
+        //viennacl::hyb_matrix<unsigned short>   *hyb_matrix_ushort;
+        //viennacl::hyb_matrix<int>              *hyb_matrix_int;
+        //viennacl::hyb_matrix<unsigned int>     *hyb_matrix_uint;
+        //viennacl::hyb_matrix<long>             *hyb_matrix_long;
+        //viennacl::hyb_matrix<unsigned long>    *hyb_matrix_ulong;
+        viennacl::hyb_matrix<float>    *hyb_matrix_float;
+        viennacl::hyb_matrix<double>   *hyb_matrix_double;
+      };
+    };
+
+
+    /** @brief Struct for holding the type family as well as the type of an operation (could be addition, subtraction, norm, etc.) */
+    struct op_element
+    {
+      operation_node_type_family   type_family;  // coarse grouping (unary/binary) for fast dispatch
+      operation_node_type          type;         // exact operation identifier
+    };
+
+    /** @brief Main datastructure for a node in the statement tree: left operand, operation, right operand */
+    struct statement_node
+    {
+      lhs_rhs_element    lhs;  // left operand (leaf object or index of a child node)
+      op_element         op;   // the operation applied to lhs and rhs
+      lhs_rhs_element    rhs;  // right operand (leaf object or index of a child node)
+    };
+
+    namespace result_of
+    {
+
+      /** @brief Helper metafunction for obtaining the number of nodes of an expression template tree.
+        *
+        * Leaf terminals (vectors, matrices, scalars) contribute 0 via the primary template;
+        * each expression node contributes 1 plus the node counts of its two children.
+        */
+      template <class T> struct num_nodes { enum { value = 0 }; };
+      /** \cond */
+      template <class LHS, class OP, class RHS> struct num_nodes<       vector_expression<LHS, RHS, OP> > { enum { value = 1 + num_nodes<LHS>::value + num_nodes<RHS>::value }; };
+      template <class LHS, class OP, class RHS> struct num_nodes< const vector_expression<LHS, RHS, OP> > { enum { value = 1 + num_nodes<LHS>::value + num_nodes<RHS>::value }; };
+      template <class LHS, class OP, class RHS> struct num_nodes<       matrix_expression<LHS, RHS, OP> > { enum { value = 1 + num_nodes<LHS>::value + num_nodes<RHS>::value }; };
+      template <class LHS, class OP, class RHS> struct num_nodes< const matrix_expression<LHS, RHS, OP> > { enum { value = 1 + num_nodes<LHS>::value + num_nodes<RHS>::value }; };
+      template <class LHS, class OP, class RHS> struct num_nodes<       scalar_expression<LHS, RHS, OP> > { enum { value = 1 + num_nodes<LHS>::value + num_nodes<RHS>::value }; };
+      template <class LHS, class OP, class RHS> struct num_nodes< const scalar_expression<LHS, RHS, OP> > { enum { value = 1 + num_nodes<LHS>::value + num_nodes<RHS>::value }; };
+      /** \endcond */
+
+    }
+
+    /** \brief The main class for representing a statement such as x = inner_prod(y,z); at runtime.
+      *
+      * This is the equivalent to an expression template tree, but entirely built at runtime in order to perform really cool stuff such as kernel fusion.
+      */
+    class statement
+    {
+      public:
+        typedef statement_node              value_type;
+        typedef viennacl::vcl_size_t        size_type;
+        typedef std::vector<value_type>     container_type;
+
+        statement(container_type const & custom_array) : array_(custom_array) {}
+
+        /** @brief Generate the runtime statement from an expression template.
+          *
+          * Constructing a runtime statement from expression templates makes perfect sense, because this way only a single allocation is needed when creating the statement. */
+        template <typename LHS, typename OP, typename RHS>
+        statement(LHS & lhs, OP const &, RHS const & rhs) : array_(1 + result_of::num_nodes<RHS>::value)
+        {
+          // set OP:
+          array_[0].op.type_family = operation_node_type_family(result_of::op_type_info<OP>::family);
+          array_[0].op.type        = operation_node_type(result_of::op_type_info<OP>::id);
+
+          // set LHS:
+          add_lhs(0, 1, lhs);
+
+          // set RHS:
+          add_rhs(0, 1, rhs);
+        }
+
+        container_type const & array() const { return array_; }
+
+        size_type root() const { return 0; }
+
+      private:
+
+        ///////////// Scalar node helper ////////////////
+
+        // TODO: add integer vector overloads here
+        void assign_element(lhs_rhs_element & elem, viennacl::scalar<float>  const & t) { elem.scalar_float  = const_cast<viennacl::scalar<float> *>(&t); }
+        void assign_element(lhs_rhs_element & elem, viennacl::scalar<double> const & t) { elem.scalar_double = const_cast<viennacl::scalar<double> *>(&t); }
+
+        ///////////// Vector node helper ////////////////
+        // TODO: add integer vector overloads here
+        void assign_element(lhs_rhs_element & elem, viennacl::vector_base<float>  const & t) { elem.vector_float  = const_cast<viennacl::vector_base<float> *>(&t); }
+        void assign_element(lhs_rhs_element & elem, viennacl::vector_base<double> const & t) { elem.vector_double = const_cast<viennacl::vector_base<double> *>(&t); }
+
+        ///////////// Matrix node helper ////////////////
+        // TODO: add integer matrix overloads here
+        void assign_element(lhs_rhs_element & elem, viennacl::matrix_base<float,  viennacl::column_major> const & t) { elem.matrix_col_float  = const_cast<viennacl::matrix_base<float,  viennacl::column_major> *>(&t); }
+        void assign_element(lhs_rhs_element & elem, viennacl::matrix_base<float,  viennacl::row_major>    const & t) { elem.matrix_row_float  = const_cast<viennacl::matrix_base<float,  viennacl::row_major>    *>(&t); }
+        void assign_element(lhs_rhs_element & elem, viennacl::matrix_base<double, viennacl::column_major> const & t) { elem.matrix_col_double = const_cast<viennacl::matrix_base<double, viennacl::column_major> *>(&t); }
+        void assign_element(lhs_rhs_element & elem, viennacl::matrix_base<double, viennacl::row_major>    const & t) { elem.matrix_row_double = const_cast<viennacl::matrix_base<double, viennacl::row_major>    *>(&t); }
+
+        void assign_element(lhs_rhs_element & elem, viennacl::compressed_matrix<float>  const & m) { elem.compressed_matrix_float  = const_cast<viennacl::compressed_matrix<float>  *>(&m); }
+        void assign_element(lhs_rhs_element & elem, viennacl::compressed_matrix<double> const & m) { elem.compressed_matrix_double = const_cast<viennacl::compressed_matrix<double> *>(&m); }
+
+        void assign_element(lhs_rhs_element & elem, viennacl::coordinate_matrix<float>  const & m) { elem.coordinate_matrix_float  = const_cast<viennacl::coordinate_matrix<float>  *>(&m); }
+        void assign_element(lhs_rhs_element & elem, viennacl::coordinate_matrix<double> const & m) { elem.coordinate_matrix_double = const_cast<viennacl::coordinate_matrix<double> *>(&m); }
+
+        void assign_element(lhs_rhs_element & elem, viennacl::ell_matrix<float>  const & m) { elem.ell_matrix_float  = const_cast<viennacl::ell_matrix<float>  *>(&m); }
+        void assign_element(lhs_rhs_element & elem, viennacl::ell_matrix<double> const & m) { elem.ell_matrix_double = const_cast<viennacl::ell_matrix<double> *>(&m); }
+
+        void assign_element(lhs_rhs_element & elem, viennacl::hyb_matrix<float>  const & m) { elem.hyb_matrix_float  = const_cast<viennacl::hyb_matrix<float>  *>(&m); }
+        void assign_element(lhs_rhs_element & elem, viennacl::hyb_matrix<double> const & m) { elem.hyb_matrix_double = const_cast<viennacl::hyb_matrix<double> *>(&m); }
+
+        //////////// Tree leaves (terminals) ////////////////////
+
+        vcl_size_t add_element(vcl_size_t       next_free,
+                                lhs_rhs_element & elem,
+                                float const &     t)
+        {
+          elem.type_family  = SCALAR_TYPE_FAMILY;
+          elem.subtype      = HOST_SCALAR_TYPE;
+          elem.numeric_type = FLOAT_TYPE;
+          elem.host_float   = t;
+          return next_free;
+        }
+
+        vcl_size_t add_element(vcl_size_t       next_free,
+                                lhs_rhs_element & elem,
+                                double const &    t)
+        {
+          elem.type_family  = SCALAR_TYPE_FAMILY;
+          elem.subtype      = HOST_SCALAR_TYPE;
+          elem.numeric_type = DOUBLE_TYPE;
+          elem.host_double  = t;
+          return next_free;
+        }
+
+        template <typename T>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::scalar<T> const & t)
+        {
+          elem.type_family  = SCALAR_TYPE_FAMILY;
+          elem.subtype      = DEVICE_SCALAR_TYPE;
+          elem.numeric_type = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+
+        template <typename T>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::vector_base<T> const & t)
+        {
+          elem.type_family           = VECTOR_TYPE_FAMILY;
+          elem.subtype               = DENSE_VECTOR_TYPE;
+          elem.numeric_type          = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+        template <typename T, typename F>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::matrix_base<T, F> const & t)
+        {
+          elem.type_family  = MATRIX_TYPE_FAMILY;
+          elem.subtype      = statement_node_subtype(result_of::layout_type_id<F>::value);
+          elem.numeric_type = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+        template <typename T>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::compressed_matrix<T> const & t)
+        {
+          elem.type_family  = MATRIX_TYPE_FAMILY;
+          elem.subtype      = COMPRESSED_MATRIX_TYPE;
+          elem.numeric_type = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+        template <typename T>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::coordinate_matrix<T> const & t)
+        {
+          elem.type_family  = MATRIX_TYPE_FAMILY;
+          elem.subtype      = COORDINATE_MATRIX_TYPE;
+          elem.numeric_type = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+        template <typename T>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::ell_matrix<T> const & t)
+        {
+          elem.type_family  = MATRIX_TYPE_FAMILY;
+          elem.subtype      = ELL_MATRIX_TYPE;
+          elem.numeric_type = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+        template <typename T>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element            & elem,
+                                viennacl::hyb_matrix<T> const & t)
+        {
+          elem.type_family  = MATRIX_TYPE_FAMILY;
+          elem.subtype      = HYB_MATRIX_TYPE;
+          elem.numeric_type = statement_node_numeric_type(result_of::numeric_type_id<T>::value);
+          assign_element(elem, t);
+          return next_free;
+        }
+
+
+        //////////// Tree nodes (non-terminals) ////////////////////
+
+        template <typename LHS, typename RHS, typename OP>
+        vcl_size_t add_element(vcl_size_t       next_free,
+                                lhs_rhs_element & elem,
+                                viennacl::scalar_expression<LHS, RHS, OP> const & t)
+        {
+          elem.type_family  = COMPOSITE_OPERATION_FAMILY;
+          elem.subtype      = INVALID_SUBTYPE;
+          elem.numeric_type = INVALID_NUMERIC_TYPE;
+          elem.node_index   = next_free;
+          return add_node(next_free, next_free + 1, t);
+        }
+
+        template <typename LHS, typename RHS, typename OP>
+        vcl_size_t add_element(vcl_size_t       next_free,
+                                lhs_rhs_element & elem,
+                                viennacl::vector_expression<LHS, RHS, OP> const & t)
+        {
+          elem.type_family  = COMPOSITE_OPERATION_FAMILY;
+          elem.subtype      = INVALID_SUBTYPE;
+          elem.numeric_type = INVALID_NUMERIC_TYPE;
+          elem.node_index   = next_free;
+          return add_node(next_free, next_free + 1, t);
+        }
+
+        template <typename LHS, typename RHS, typename OP>
+        vcl_size_t add_element(vcl_size_t next_free,
+                                lhs_rhs_element & elem,
+                                viennacl::matrix_expression<LHS, RHS, OP> const & t)
+        {
+          elem.type_family  = COMPOSITE_OPERATION_FAMILY;
+          elem.subtype      = INVALID_SUBTYPE;
+          elem.numeric_type = INVALID_NUMERIC_TYPE;
+          elem.node_index   = next_free;
+          return add_node(next_free, next_free + 1, t);
+        }
+
+
+        //////////// Helper routines ////////////////////
+
+
+        template <typename T>
+        vcl_size_t add_lhs(vcl_size_t current_index, vcl_size_t next_free, T const & t)
+        {
+          return add_element(next_free, array_[current_index].lhs, t);
+        }
+
+        template <typename T>
+        vcl_size_t add_rhs(vcl_size_t current_index, vcl_size_t next_free, T const & t)
+        {
+          return add_element(next_free, array_[current_index].rhs, t);
+        }
+
+        //////////// Internal interfaces ////////////////////
+
+        template <template <typename, typename, typename> class ExpressionT, typename LHS, typename RHS, typename OP>
+        vcl_size_t add_node(vcl_size_t current_index, vcl_size_t next_free, ExpressionT<LHS, RHS, OP> const & proxy)
+        {
+          // set OP:
+          array_[current_index].op.type_family = operation_node_type_family(result_of::op_type_info<OP>::family);
+          array_[current_index].op.type        = operation_node_type(result_of::op_type_info<OP>::id);
+
+          // set LHS and RHS:
+          if (array_[current_index].op.type_family == OPERATION_UNARY_TYPE_FAMILY)
+          {
+            // unary expression: set rhs to invalid:
+            array_[current_index].rhs.type_family  = INVALID_TYPE_FAMILY;
+            array_[current_index].rhs.subtype      = INVALID_SUBTYPE;
+            array_[current_index].rhs.numeric_type = INVALID_NUMERIC_TYPE;
+            return add_lhs(current_index, next_free, proxy.lhs());
+          }
+
+          return add_rhs(current_index, add_lhs(current_index, next_free, proxy.lhs()), proxy.rhs());
+
+        }
+
+        container_type   array_;
+    };
+
+    namespace detail
+    {
+      /** @brief Deals with x = RHS where RHS is an expression and x is either a scalar, a vector, or a matrix */
+      inline void execute_composite(statement const & /* s */, statement_node const & /* root_node */);
+    }
+
+  } // namespace scheduler
+
+} // namespace viennacl
+
+#endif
+
diff --git a/viennacl/scheduler/io.hpp b/viennacl/scheduler/io.hpp
new file mode 100644
index 0000000..729cfd9
--- /dev/null
+++ b/viennacl/scheduler/io.hpp
@@ -0,0 +1,290 @@
+#ifndef VIENNACL_SCHEDULER_IO_HPP
+#define VIENNACL_SCHEDULER_IO_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/scheduler/io.hpp
+    @brief Some helper routines for reading/writing/printing scheduler expressions
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scheduler/forwards.h"
+
+
+namespace viennacl
+{
+  namespace scheduler
+  {
+
+    namespace detail
+    {
+#define VIENNACL_TRANSLATE_OP_TO_STRING(NAME)   case NAME: return #NAME;
+
+      /** @brief Helper routine for converting the operation enums to string */
+      inline std::string to_string(viennacl::scheduler::op_element op_elem)
+      {
+        if (op_elem.type_family == OPERATION_UNARY_TYPE_FAMILY)
+        {
+          switch (op_elem.type)
+          {
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_ABS_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_ACOS_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_ASIN_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_ATAN_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_CEIL_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_COS_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_COSH_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_EXP_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_FABS_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_FLOOR_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_LOG_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_LOG10_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_SIN_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_SINH_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_SQRT_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_TAN_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_TANH_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_TRANS_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_NORM_1_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_NORM_2_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_UNARY_NORM_INF_TYPE)
+
+            default: throw statement_not_supported_exception("Cannot convert unary operation to string");
+          }
+        }
+        else if (op_elem.type_family == OPERATION_BINARY_TYPE_FAMILY)
+        {
+          switch (op_elem.type)
+          {
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_ASSIGN_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_INPLACE_ADD_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_INPLACE_SUB_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_ADD_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_SUB_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_MAT_VEC_PROD_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_MAT_MAT_PROD_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_MULT_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_DIV_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_ELEMENT_PROD_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_ELEMENT_DIV_TYPE)
+            VIENNACL_TRANSLATE_OP_TO_STRING(OPERATION_BINARY_INNER_PROD_TYPE)
+
+            default: throw statement_not_supported_exception("Cannot convert binary operation to string");
+          }
+        }
+        else if (op_elem.type_family == OPERATION_INVALID_TYPE_FAMILY)
+        {
+          if (op_elem.type == OPERATION_INVALID_TYPE)
+            return "OPERATION_INVALID_TYPE";
+          else
+            throw statement_not_supported_exception("Unknown invalid operation type when converting to string");
+        }
+        else
+          throw statement_not_supported_exception("Unknown operation family when converting to string");
+      }
+
+#undef VIENNACL_TRANSLATE_OP_TO_STRING
+
+#define VIENNACL_TRANSLATE_ELEMENT_TO_STRING(NAME, ELEMENT)   case NAME: ss << "(" << element.ELEMENT << ")"; return #NAME + ss.str();
+
+      /** @brief Helper routine converting the enum and union values inside a statement node to a string */
+      inline std::string to_string(viennacl::scheduler::lhs_rhs_element element)
+      {
+        std::stringstream ss;
+
+        if (element.type_family == COMPOSITE_OPERATION_FAMILY)
+        {
+          ss << "(" << element.node_index << ")";
+          return "COMPOSITE_OPERATION_FAMILY" + ss.str();
+        }
+        else if (element.type_family == SCALAR_TYPE_FAMILY)
+        {
+          if (element.subtype == HOST_SCALAR_TYPE)
+          {
+            ss << ", HOST_SCALAR_TYPE ";
+            switch (element.numeric_type)
+            {
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(CHAR_TYPE,   host_char)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UCHAR_TYPE,  host_uchar)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(SHORT_TYPE,  host_short)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(USHORT_TYPE, host_ushort)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(INT_TYPE,    host_int)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UINT_TYPE,   host_uint)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(LONG_TYPE,   host_long)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(ULONG_TYPE,  host_ulong)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(FLOAT_TYPE,  host_float)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(DOUBLE_TYPE, host_double)
+
+              default: throw statement_not_supported_exception("Cannot convert host scalar type to string");
+            }
+          }
+          else
+          {
+            ss << ", DEVICE_SCALAR_TYPE ";
+            switch (element.numeric_type)
+            {
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(CHAR_TYPE,   scalar_char)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UCHAR_TYPE,  scalar_uchar)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(SHORT_TYPE,  scalar_short)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(USHORT_TYPE, scalar_ushort)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(INT_TYPE,    scalar_int)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UINT_TYPE,   scalar_uint)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(LONG_TYPE,   scalar_long)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(ULONG_TYPE,  scalar_ulong)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(HALF_TYPE,   scalar_half)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(FLOAT_TYPE,  scalar_float)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(DOUBLE_TYPE, scalar_double)
+            default: throw statement_not_supported_exception("Cannot convert scalar type to string");
+            }
+          }
+        }
+        else if (element.type_family == VECTOR_TYPE_FAMILY)
+        {
+          ss << ", DENSE_VECTOR_TYPE ";
+          switch (element.numeric_type)
+          {
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(CHAR_TYPE,   vector_char)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UCHAR_TYPE,  vector_uchar)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(SHORT_TYPE,  vector_short)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(USHORT_TYPE, vector_ushort)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(INT_TYPE,    vector_int)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UINT_TYPE,   vector_uint)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(LONG_TYPE,   vector_long)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(ULONG_TYPE,  vector_ulong)
+            //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(HALF_TYPE,   vector_half)
+            VIENNACL_TRANSLATE_ELEMENT_TO_STRING(FLOAT_TYPE,  vector_float)
+            VIENNACL_TRANSLATE_ELEMENT_TO_STRING(DOUBLE_TYPE, vector_double)
+
+            default: throw statement_not_supported_exception("Cannot convert vector type to string");
+          }
+        }
+        else if (element.type_family == MATRIX_TYPE_FAMILY)
+        {
+          if (element.subtype == DENSE_ROW_MATRIX_TYPE)
+          {
+            ss << ", DENSE_ROW_MATRIX_TYPE ";
+            switch (element.numeric_type)
+            {
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(CHAR_TYPE,   matrix_row_char)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UCHAR_TYPE,  matrix_row_uchar)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(SHORT_TYPE,  matrix_row_short)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(USHORT_TYPE, matrix_row_ushort)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(INT_TYPE,    matrix_row_int)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UINT_TYPE,   matrix_row_uint)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(LONG_TYPE,   matrix_row_long)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(ULONG_TYPE,  matrix_row_ulong)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(HALF_TYPE,   matrix_row_half)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(FLOAT_TYPE,  matrix_row_float)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(DOUBLE_TYPE, matrix_row_double)
+
+              default: throw statement_not_supported_exception("Cannot convert row-major matrix type to string");
+            }
+          }
+          else
+          {
+            ss << ", DENSE_COL_MATRIX_TYPE ";
+            switch (element.numeric_type)
+            {
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(CHAR_TYPE,   matrix_col_char)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UCHAR_TYPE,  matrix_col_uchar)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(SHORT_TYPE,  matrix_col_short)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(USHORT_TYPE, matrix_col_ushort)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(INT_TYPE,    matrix_col_int)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(UINT_TYPE,   matrix_col_uint)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(LONG_TYPE,   matrix_col_long)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(ULONG_TYPE,  matrix_col_ulong)
+              //VIENNACL_TRANSLATE_ELEMENT_TO_STRING(HALF_TYPE,   matrix_col_half)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(FLOAT_TYPE,  matrix_col_float)
+              VIENNACL_TRANSLATE_ELEMENT_TO_STRING(DOUBLE_TYPE, matrix_col_double)
+
+              default: throw statement_not_supported_exception("Cannot convert column-major matrix type to string");
+            }
+          }
+        }
+        else if (element.type_family == INVALID_TYPE_FAMILY)
+        {
+          return "INVALID_TYPE_FAMILY";
+        }
+        else
+          throw statement_not_supported_exception("Unknown operation family when converting to string");
+      }
+
+#undef VIENNACL_TRANSLATE_ELEMENT_TO_STRING
+
+    } // namespace detail
+
+
+    /** @brief Print a single statement_node. Non-recursive */
+    inline std::ostream & operator<<(std::ostream & os, viennacl::scheduler::statement_node const & s_node)
+    {
+      os << "LHS: " << detail::to_string(s_node.lhs) << ", "
+         << "OP: "  << detail::to_string(s_node.op) << ", "
+         << "RHS: " << detail::to_string(s_node.rhs);
+
+      return os;
+    }
+
+
+
+
+
+    namespace detail
+    {
+      /** @brief Recursive worker routine for printing a whole statement */
+      inline void print_node(std::ostream & os, viennacl::scheduler::statement const & s, vcl_size_t node_index, vcl_size_t indent = 0)
+      {
+        typedef viennacl::scheduler::statement::container_type   StatementNodeContainer;
+        typedef viennacl::scheduler::statement::value_type       StatementNode;
+
+        StatementNodeContainer const & nodes = s.array();
+        StatementNode const & current_node = nodes[node_index];
+
+        for (vcl_size_t i=0; i<indent; ++i)
+          os << " ";
+
+        os << "Node " << node_index << ": " << current_node << std::endl;
+
+        if (current_node.lhs.type_family == COMPOSITE_OPERATION_FAMILY)
+          print_node(os, s, current_node.lhs.node_index, indent+1);
+
+        if (current_node.rhs.type_family == COMPOSITE_OPERATION_FAMILY)
+          print_node(os, s, current_node.rhs.node_index, indent+1);
+      }
+    }
+
+    /** @brief Writes a string identifying the scheduler statement to an output stream.
+      *
+      * Typically used for debugging
+      * @param os    The output stream
+      * @param s     The statement object
+      */
+    inline std::ostream & operator<<(std::ostream & os, viennacl::scheduler::statement const & s)
+    {
+      detail::print_node(os, s, s.root());
+      return os;
+    }
+  }
+
+} //namespace viennacl
+
+#endif
+
diff --git a/viennacl/range.hpp b/viennacl/slice.hpp
similarity index 57%
copy from viennacl/range.hpp
copy to viennacl/slice.hpp
index 88b13d8..3218745 100644
--- a/viennacl/range.hpp
+++ b/viennacl/slice.hpp
@@ -1,24 +1,25 @@
-#ifndef VIENNACL_RANGE_HPP_
-#define VIENNACL_RANGE_HPP_
+#ifndef VIENNACL_SLICE_HPP_
+#define VIENNACL_SLICE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file range.hpp
-    @brief Implementation of a range object for use with proxy objects
+/** @file slice.hpp
+    @brief Implementation of a slice object for use with proxy objects
 */
 
 #include <vector>
@@ -29,13 +30,13 @@
 namespace viennacl
 {
 
-  /** @brief A range class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
-   * 
+  /** @brief A slice class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
+   *
    * Similar to the boost::numeric::ublas::basic_range class.
    */
   template <typename SizeType /* see forwards.h for default argument*/,
             typename DistanceType /* see forwards.h for default argument*/>
-  class basic_range
+  class basic_slice
   {
     public:
       typedef SizeType             size_type;
@@ -43,33 +44,34 @@ namespace viennacl
       typedef size_type            value_type;
       typedef value_type           const_reference;
       typedef const_reference      reference;
-      
-      basic_range() : start_(0), size_(0) {}
-      basic_range(size_type start_index, size_type stop_index) : start_(start_index), size_(stop_index - start_index)
-      {
-        assert(start_index <= stop_index);
-      }
-        
-        
-      size_type start() const { return start_; }
-      size_type size() const { return size_; }
-      
-      const_reference operator()(size_type i) const 
+
+      basic_slice() : start_(0), stride_(1), size_(0) {}
+      basic_slice(size_type start_index,
+                  difference_type stride_arg,
+                  size_type size_arg) : start_(start_index), stride_(stride_arg), size_(size_arg) {}
+
+
+      size_type       start() const { return start_; }
+      difference_type stride() const { return stride_; }
+      size_type       size() const { return size_; }
+
+      const_reference operator()(size_type i) const
       {
         assert(i < size());
-        return start_ + i;
+        return start_ + i * stride_;
       }
       const_reference operator[](size_type i) const { return operator()(i); }
-      
-      bool operator==(const basic_range & r) const { return (start_ == r.start_) && (size_ == r.size_); }
-      bool operator!=(const basic_range & r) const { return !(*this == r); }
-      
+
+      bool operator==(const basic_slice & s) const { return (start_ == s.start_) && (stride_ == s.stride_) && (size_ == s.size_); }
+      bool operator!=(const basic_slice & s) const { return !(*this == s); }
+
     private:
       size_type start_;
+      difference_type stride_;
       size_type size_;
   };
-  
-  
+
+
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/viennacl/toeplitz_matrix.hpp b/viennacl/toeplitz_matrix.hpp
index e50b1b9..af7adc5 100644
--- a/viennacl/toeplitz_matrix.hpp
+++ b/viennacl/toeplitz_matrix.hpp
@@ -2,28 +2,29 @@
 #define VIENNACL_TOEPLITZ_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
 /** @file toeplitz_matrix.hpp
-    @brief Implementation of the toeplitz_matrix class for efficient manipulation of Toeplitz matrices.  Experimental in 1.2.x.
+    @brief Implementation of the toeplitz_matrix class for efficient manipulation of Toeplitz matrices.  Experimental.
 */
 
 #include "viennacl/forwards.h"
 #include "viennacl/vector.hpp"
-#include "viennacl/ocl/context.hpp"
+#include "viennacl/ocl/backend.hpp"
 
 #include "viennacl/fft.hpp"
 
@@ -41,27 +42,26 @@ namespace viennacl {
     class toeplitz_matrix
     {
       public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
 
         /**
          * @brief The default constructor. Does not allocate any memory.
          *
          */
-        explicit toeplitz_matrix()
-        {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-        }
+        explicit toeplitz_matrix() {}
 
         /** @brief         Creates the matrix with the given size
         *
         * @param rows      Number of rows of the matrix
         * @param cols      Number of columns of the matrix
         */
-        explicit toeplitz_matrix(std::size_t rows, std::size_t cols) : elements_(rows * 2)
+        explicit toeplitz_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows * 2)
         {
-          assert(rows == cols && "Toeplitz matrix must be square!");
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
+          assert(rows == cols && bool("Toeplitz matrix must be square!"));
+          (void)cols;  // avoid 'unused parameter' warning in optimized builds
         }
-        
+
 
         /** @brief Resizes the matrix.
         *   Existing entries can be preserved
@@ -69,7 +69,7 @@ namespace viennacl {
         * @param sz         New size of matrix
         * @param preserve   If true, existing values are preserved.
         */
-        void resize(size_t sz, bool preserve = true) {
+        void resize(vcl_size_t sz, bool preserve = true) {
             elements_.resize(sz * 2, preserve);
         }
 
@@ -77,7 +77,7 @@ namespace viennacl {
         *
         *   @return OpenCL handle
         */
-        viennacl::ocl::handle<cl_mem> handle() const { return elements_.handle(); }
+        handle_type const & handle() const { return elements_.handle(); }
 
         /**
          * @brief Returns an internal viennacl::vector, which represents a Toeplitz matrix elements
@@ -90,19 +90,19 @@ namespace viennacl {
         /**
          * @brief Returns the number of rows of the matrix
          */
-        std::size_t size1() const { return elements_.size() / 2; }
-        
+        vcl_size_t size1() const { return elements_.size() / 2; }
+
         /**
          * @brief Returns the number of columns of the matrix
          */
-        std::size_t size2() const { return elements_.size() / 2; }
+        vcl_size_t size2() const { return elements_.size() / 2; }
 
         /** @brief Returns the internal size of matrix representtion.
         *   Usually required for launching OpenCL kernels only
         *
         *   @return Internal size of matrix representation
         */
-        std::size_t internal_size() const { return elements_.internal_size(); }
+        vcl_size_t internal_size() const { return elements_.internal_size(); }
 
 
         /**
@@ -112,16 +112,16 @@ namespace viennacl {
          * @param col_index  Column index of accessed element
          * @return Proxy for matrix entry
          */
-        entry_proxy<SCALARTYPE> operator()(std::size_t row_index, std::size_t col_index) 
+        entry_proxy<SCALARTYPE> operator()(vcl_size_t row_index, vcl_size_t col_index)
         {
-            assert(row_index < size1() && col_index < size2() && "Invalid access");
-            
-            int index = static_cast<int>(col_index) - static_cast<int>(row_index);
-            
+            assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
+            long index = static_cast<long>(col_index) - static_cast<long>(row_index);
+
             if (index < 0)
               index = -index;
             else if
-              (index > 0) index = 2 * size1() - index;
+              (index > 0) index = 2 * static_cast<long>(size1()) - index;
             return elements_[index];
         }
 
@@ -138,10 +138,10 @@ namespace viennacl {
         }
 
     private:
-        toeplitz_matrix(toeplitz_matrix const & t) {}
-        toeplitz_matrix & operator=(toeplitz_matrix const & t) {}
-        
-      
+        toeplitz_matrix(toeplitz_matrix const &) {}
+        toeplitz_matrix & operator=(toeplitz_matrix const & t);
+
+
         viennacl::vector<SCALARTYPE, ALIGNMENT> elements_;
     };
 
@@ -154,8 +154,9 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(std::vector<SCALARTYPE> const & cpu_vec, toeplitz_matrix<SCALARTYPE, ALIGNMENT>& gpu_mat)
     {
-        std::size_t size = gpu_mat.size1();
-        assert((size * 2 - 1)  == cpu_vec.size() && "Size mismatch");
+        assert( (gpu_mat.size1() == 0 || (gpu_mat.size1() * 2 - 1)  == cpu_vec.size()) && bool("Size mismatch"));
+
+        vcl_size_t size = gpu_mat.size1();
         std::vector<SCALARTYPE> rvrs(cpu_vec.size());
         std::copy(cpu_vec.begin(), cpu_vec.end(), rvrs.begin());
         std::reverse(rvrs.begin(), rvrs.end());
@@ -176,8 +177,9 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(toeplitz_matrix<SCALARTYPE, ALIGNMENT> const & gpu_mat, std::vector<SCALARTYPE> & cpu_vec)
     {
-        std::size_t size = gpu_mat.size1();
-        assert((size * 2 - 1)  == cpu_vec.size() && "Size mismatch");
+        assert((gpu_mat.size1() * 2 - 1)  == cpu_vec.size() && bool("Size mismatch"));
+
+        vcl_size_t size = gpu_mat.size1();
         std::vector<SCALARTYPE> tmp(size * 2);
         copy(gpu_mat.elements(), tmp);
         std::reverse(tmp.begin(), tmp.end());
@@ -196,14 +198,15 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
     void copy(toeplitz_matrix<SCALARTYPE, ALIGNMENT> const & tep_src, MATRIXTYPE & com_dst)
     {
-        std::size_t size = tep_src.size1();
-        assert(size == com_dst.size1() && "Size mismatch");
-        assert(size == com_dst.size2() && "Size mismatch");
+        assert(tep_src.size1() == viennacl::traits::size1(com_dst) && bool("Size mismatch"));
+        assert(tep_src.size2() == viennacl::traits::size2(com_dst) && bool("Size mismatch"));
+
+        vcl_size_t size = tep_src.size1();
         std::vector<SCALARTYPE> tmp(tep_src.size1() * 2 - 1);
         copy(tep_src, tmp);
 
-        for(std::size_t i = 0; i < size; i++)
-            for(std::size_t j = 0; j < size; j++)
+        for(vcl_size_t i = 0; i < size; i++)
+            for(vcl_size_t j = 0; j < size; j++)
                 com_dst(i, j) = tmp[static_cast<int>(j) - static_cast<int>(i) + static_cast<int>(size) - 1];
     }
 
@@ -216,16 +219,17 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
     void copy(MATRIXTYPE const & com_src, toeplitz_matrix<SCALARTYPE, ALIGNMENT>& tep_dst)
     {
-        std::size_t size = tep_dst.size1();
-        assert(size == com_src.size1() && "Size mismatch");
-        assert(size == com_src.size2() && "Size mismatch");
+        assert( (tep_dst.size1() == 0 || tep_dst.size1() == viennacl::traits::size1(com_src)) && bool("Size mismatch"));
+        assert( (tep_dst.size2() == 0 || tep_dst.size2() == viennacl::traits::size2(com_src)) && bool("Size mismatch"));
+
+        vcl_size_t size = tep_dst.size1();
 
         std::vector<SCALARTYPE> tmp(2*size - 1);
 
-        for(int i = size - 1; i >= 0; i--)
+        for(long i = static_cast<long>(size) - 1; i >= 0; i--)
             tmp[size - i - 1] = com_src(i, 0);
 
-        for(std::size_t i = 1; i < size; i++)
+        for(vcl_size_t i = 1; i < size; i++)
             tmp[size + i - 1] = com_src(0, i);
 
         copy(tmp, tep_dst);
@@ -257,15 +261,15 @@ namespace viennacl {
     template<class SCALARTYPE, unsigned int ALIGNMENT>
     std::ostream & operator<<(std::ostream & s, toeplitz_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix)
     {
-        std::size_t size = gpu_matrix.size1();
+        vcl_size_t size = gpu_matrix.size1();
         std::vector<SCALARTYPE> tmp(2*size - 1);
         copy(gpu_matrix, tmp);
         s << "[" << size << "," << size << "](";
 
-        for(std::size_t i = 0; i < size; i++) {
+        for(vcl_size_t i = 0; i < size; i++) {
             s << "(";
-            for(std::size_t j = 0; j < size; j++) {
-                s << tmp[(int)j - (int)i + (int)size - 1];
+            for(vcl_size_t j = 0; j < size; j++) {
+                s << tmp[static_cast<int>(j) - static_cast<int>(i) + static_cast<int>(size - 1)];
                 //s << (int)i - (int)j;
                 if(j < (size - 1)) s << ",";
             }
@@ -275,6 +279,99 @@ namespace viennacl {
         return s;
     }
 
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const toeplitz_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const toeplitz_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const toeplitz_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const toeplitz_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const toeplitz_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const toeplitz_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const toeplitz_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const toeplitz_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const toeplitz_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const toeplitz_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const toeplitz_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const toeplitz_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+      } // namespace detail
+    } // namespace linalg
+
+    /** \endcond */
+
 }
 
-#endif // _VIENNACL_TOEPLITZ_MATRIX_HPP
+#endif // VIENNACL_TOEPLITZ_MATRIX_HPP
diff --git a/viennacl/tools/adapter.hpp b/viennacl/tools/adapter.hpp
index 99467c7..0b753ea 100644
--- a/viennacl/tools/adapter.hpp
+++ b/viennacl/tools/adapter.hpp
@@ -2,23 +2,24 @@
 #define VIENNACL_TOOLS_ADAPTER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file adapter.hpp
-    @brief Adapter classes for sparse matrices made of the STL type std::vector<std::map<unsigned int, SCALARTYPE> >
+/** @file viennacl/tools/adapter.hpp
+    @brief Adapter classes for sparse matrices made of the STL type std::vector<std::map<SizeType, SCALARTYPE> >
 */
 
 #include <string>
@@ -34,39 +35,39 @@ namespace viennacl
 {
   namespace tools
   {
-    
-    /** @brief A const iterator for sparse matrices of type std::vector<std::map<unsigned int, SCALARTYPE> >
-    *  
+
+    /** @brief A const iterator for sparse matrices of type std::vector<std::map<SizeType, SCALARTYPE> >
+    *
     *  The iterator behaves like ublas iterators. Attention: Iteration along first columns and then rows via .begin() is untested!
     *
     *  @tparam SCALARTYPE     either float or double
     *  @tparam is_iterator1   if true, this iterator iterates along increasing row indices, otherwise along increasing column indices
     *  @tparam increment      if +1, this is a forward iterator, if -1 we have a reverse iterator
     */
-    template <typename SCALARTYPE, bool is_iterator1, bool is_forward>
+    template <typename SCALARTYPE, typename SizeType, bool is_iterator1, bool is_forward>
     class const_sparse_matrix_adapted_iterator
     {
-      typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, is_iterator1, is_forward>    self_type;
-      
+      typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, is_iterator1, is_forward>    self_type;
+
       public:
         typedef self_type     iterator1;
         typedef self_type     iterator2;
-        typedef std::size_t   size_type;
-        
-        const_sparse_matrix_adapted_iterator(std::vector<std::map<unsigned int, SCALARTYPE> > const & mat, int i, int j)
+        typedef vcl_size_t   size_type;
+
+        const_sparse_matrix_adapted_iterator(std::vector<std::map<SizeType, SCALARTYPE> > const & mat, int i, int j)
          : mat_(mat), i_(i), j_(j)
         {
           if (i < 0) //reverse iterator end
           {
-            //iter2 = _mat[0].rend();  //reverse iterator end
+            //iter2 = mat_[0].rend();  //reverse iterator end
           }
-          else  //_i is valid
+          else  //i_ is valid
           {
             if (j < 0)
             {
-              //iter2 = _mat[i].rend();
+              //iter2 = mat_[i].rend();
             }
-            else //_j is valid
+            else //j_ is valid
             {
               if (i_ < mat_.size() && mat_[i].size() > 0 )
               {
@@ -83,14 +84,14 @@ namespace viennacl
             }
           }
         }
-         
+
         SCALARTYPE operator*(void) const
         {
           if (is_iterator1)
           {
-            typedef typename std::map<unsigned int, SCALARTYPE>::const_iterator  col_iterator;
-            
-            col_iterator colit = mat_[i_].find(j_);
+            typedef typename std::map<SizeType, SCALARTYPE>::const_iterator  col_iterator;
+
+            col_iterator colit = mat_[i_].find(static_cast<unsigned int>(j_));
 
             if (colit != mat_[i_].end())
               return colit->second;
@@ -99,7 +100,7 @@ namespace viennacl
           else
             return iter2->second;
         }
-        
+
         self_type & operator++(void)
         {
           if (is_iterator1)
@@ -113,9 +114,9 @@ namespace viennacl
             ++iter2;
           return *this;
         }
-        self_type & operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
-        
-        self_type operator+=(unsigned int offset)
+        self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
+
+        self_type operator+=(SizeType offset)
         {
           if (is_iterator1)
           {
@@ -126,87 +127,84 @@ namespace viennacl
           }
           else
           {
-            for (unsigned int k=0; k<offset; ++k)
+            for (SizeType k=0; k<offset; ++k)
               ++iter2;  //Note: User must ensure that this is always valid...
           }
           return *this;
         }
-        
+
         bool operator==(self_type const & other) const
         {
-          if (is_iterator1)
-            return (i_ == other.i_);
-          return (iter2 == other.iter2);
+          return is_iterator1 ? (i_ == other.i_) : (iter2 == other.iter2);
         }
-        
+
         bool operator!=(self_type const & other) const { return !(*this == other); }
-        
-        int index1() const { return i_; }
-        int index2() const
-        { 
+
+        size_type index1() const { return i_; }
+        size_type index2() const
+        {
           if (is_iterator1)
             return 0;
           else
             return iter2->first;
         }
-        
-        const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true> begin() const
+
+        const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1, true> begin() const
         {
-          return const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true>(mat_, i_, 0);
+          return const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1, true>(mat_, static_cast<int>(i_), 0);
         }
-        const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true> end() const
+        const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1, true> end() const
         {
           int end_ = static_cast<int>(mat_[i_].size());
           if (end_ > 0)
             end_ = mat_[i_].rbegin()->first;
-          return const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true>(mat_, i_, end_ + 1);
+          return const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1, true>(mat_, static_cast<int>(i_), end_ + 1);
         }
-        
+
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > const & mat_;
-        typename std::map<unsigned int, SCALARTYPE>::const_iterator iter2;
+        std::vector<std::map<SizeType, SCALARTYPE> > const & mat_;
+        typename std::map<SizeType, SCALARTYPE>::const_iterator iter2;
         size_type i_;
         size_type j_;
     };
-    
-    /** @brief Adapts a constant sparse matrix type made up from std::vector<std::map<unsigned int, SCALARTYPE> > to basic ublas-compatibility.
+
+    /** @brief Adapts a constant sparse matrix type made up from std::vector<std::map<SizeType, SCALARTYPE> > to basic ublas-compatibility.
     *
     *  @tparam SCALARTYPE   either float or double
     */
-    template <typename SCALARTYPE>
+    template <typename SCALARTYPE, typename SizeType = unsigned int>
     class const_sparse_matrix_adapter
     {
       public:
-        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, true, true>      const_iterator1;
-        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, false, true>     const_iterator2;
+        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, true, true>      const_iterator1;
+        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, false, true>     const_iterator2;
 
-        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, true, false>   const_reverse_iterator1;
+        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, true, false>   const_reverse_iterator1;
         typedef SCALARTYPE    value_type;
-        typedef std::size_t   size_type;
-        
-        const_sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > const & mat) 
-         : mat_(mat), size1_(mat_.size()), size2_(mat_.size()) {};
-
-        const_sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > const & mat, size_type num_rows, size_type num_cols) 
-         : mat_(mat), size1_(num_rows), size2_(num_cols) {};
-         
+        typedef vcl_size_t   size_type;
+
+        const_sparse_matrix_adapter(std::vector<std::map<SizeType, SCALARTYPE> > const & mat)
+         : mat_(mat), size1_(mat_.size()), size2_(mat_.size()) {}
+
+        const_sparse_matrix_adapter(std::vector<std::map<SizeType, SCALARTYPE> > const & mat, size_type num_rows, size_type num_cols)
+         : mat_(mat), size1_(num_rows), size2_(num_cols) {}
+
         size_type size1() const { return size1_; }
         size_type size2() const { return size2_; }
-        //size_type size2() const { return (_mat.size() > 0) ? _mat.back().size() : 0; }
 
         const_iterator1 begin1() const { return const_iterator1(mat_, 0, 0); }
-        const_iterator1 end1() const   { return const_iterator1(mat_, size1(), size2()); }
+        const_iterator1 end1() const   { return const_iterator1(mat_, static_cast<int>(size1()), static_cast<int>(size2())); }
 
-        const_reverse_iterator1 rbegin1() const { return const_reverse_iterator1(mat_, size1() - 1, 0); }
-        const_reverse_iterator1 rend1() const   { return const_reverse_iterator1(mat_, -1, size2()); }
+        const_reverse_iterator1 rbegin1() const { return const_reverse_iterator1(mat_, static_cast<int>(size1() - 1), 0); }
+        const_reverse_iterator1 rend1() const   { return const_reverse_iterator1(mat_, -1, static_cast<int>(size2())); }
 
         const_iterator2 begin2() const { return const_iterator2(mat_, 0, 0); }
         const_iterator2 end2() const   { return const_iterator2(mat_, size1(), size2()); }
 
-        SCALARTYPE operator()(unsigned int i, unsigned int j) const
+        SCALARTYPE operator()(SizeType i, SizeType j) const
         {
-          typedef typename std::map<unsigned int, SCALARTYPE>::const_iterator  col_iterator;
-          
+          typedef typename std::map<SizeType, SCALARTYPE>::const_iterator  col_iterator;
+
           col_iterator colit = mat_[i].find(j);
 
           if (colit != mat_[i].end())
@@ -215,41 +213,41 @@ namespace viennacl
         }
 
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > const & mat_;
+        std::vector<std::map<SizeType, SCALARTYPE> > const & mat_;
         size_type size1_;
         size_type size2_;
     };
-    
-    
-    /** @brief A non-const iterator for sparse matrices of type std::vector<std::map<unsigned int, SCALARTYPE> >
-    *  
+
+
+    /** @brief A non-const iterator for sparse matrices of type std::vector<std::map<SizeType, SCALARTYPE> >
+    *
     *  The iterator behaves like ublas iterators. Attention: Iteration along first columns and then rows via .begin() is untested! Reverse iterators are missing!
     *
     *  @tparam SCALARTYPE     either float or double
     *  @tparam is_iterator1   if true, this iterator iterates along increasing row indices, otherwise along increasiong column indices
     */
-    template <typename SCALARTYPE, bool is_iterator1>
+    template <typename SCALARTYPE, typename SizeType, bool is_iterator1>
     class sparse_matrix_adapted_iterator
     {
-      typedef sparse_matrix_adapted_iterator<SCALARTYPE, is_iterator1>    self_type;
-      
+      typedef sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, is_iterator1>    self_type;
+
       public:
         typedef self_type     iterator1;
         typedef self_type     iterator2;
-        typedef std::size_t   size_type;
-        
-        sparse_matrix_adapted_iterator(std::vector<std::map<unsigned int, SCALARTYPE> > & mat, int i, int j)
+        typedef vcl_size_t   size_type;
+
+        sparse_matrix_adapted_iterator(std::vector<std::map<SizeType, SCALARTYPE> > & mat, int i, int j)
          : mat_(mat), i_(i), j_(j)
         {
           if (i < 0) //reverse iterator end
           {
-            //iter2 = _mat[0].rend();  //reverse iterator end
+            //iter2 = mat_[0].rend();  //reverse iterator end
           }
           else  //_i is valid
           {
             if (j < 0)
             {
-              //iter2 = _mat[i].rend();
+              //iter2 = mat_[i].rend();
             }
             else //_j is valid
             {
@@ -268,17 +266,17 @@ namespace viennacl
             }
           }
         }
-         
+
         SCALARTYPE & operator*(void)
         {
           if (is_iterator1)
           {
-            return mat_[i_][j_];
+            return mat_[i_][static_cast<SizeType>(j_)];
           }
           else
             return iter2->second;
         }
-        
+
         self_type & operator++(void)
         {
           if (is_iterator1)
@@ -287,8 +285,8 @@ namespace viennacl
             ++iter2;
           return *this;
         }
-        self_type & operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
-        
+        self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
+
         self_type operator+=(size_type offset)
         {
           if (is_iterator1)
@@ -300,7 +298,7 @@ namespace viennacl
           }
           return *this;
         }
-        
+
         bool operator==(self_type const & other) const
         {
           if (is_iterator1)
@@ -308,96 +306,104 @@ namespace viennacl
           return (iter2 == other.iter2);
         }
         bool operator!=(self_type const & other) const { return !(*this == other); }
-        
-        unsigned int index1() const { return i_; }
-        unsigned int index2() const
-        { 
+
+        size_type index1() const { return i_; }
+        size_type index2() const
+        {
           if (is_iterator1)
             return 0;
           else
             return iter2->first;
         }
-        
-        sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1> begin() const
+
+        sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1> begin() const
         {
-          return sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1>(mat_, i_, 0);
+          return sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1>(mat_, static_cast<int>(i_), 0);
         }
-        sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1> end() const
+        sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1> end() const
         {
           int end_ = static_cast<int>(mat_[i_].size());
           if (end_ > 0)
             end_ = mat_[i_].rbegin()->first;
-          return sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1>(mat_, i_, end_ + 1);
+          return sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, !is_iterator1>(mat_, static_cast<int>(i_), end_ + 1);
         }
-        
+
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > & mat_;
-        typename std::map<unsigned int, SCALARTYPE>::iterator iter2;
+        std::vector<std::map<SizeType, SCALARTYPE> > & mat_;
+        typename std::map<SizeType, SCALARTYPE>::iterator iter2;
         size_type i_;
         size_type j_;
     };
-    
-    
-    
-    /** @brief Adapts a non-const sparse matrix type made up from std::vector<std::map<unsigned int, SCALARTYPE> > to basic ublas-compatibility.
+
+
+
+    /** @brief Adapts a non-const sparse matrix type made up from std::vector<std::map<SizeType, SCALARTYPE> > to basic ublas-compatibility.
     *
     *  @tparam SCALARTYPE   either float or double
     */
-    template <typename SCALARTYPE>
-    class sparse_matrix_adapter : public const_sparse_matrix_adapter<SCALARTYPE>
+    template <typename SCALARTYPE, typename SizeType = unsigned int>
+    class sparse_matrix_adapter : public const_sparse_matrix_adapter<SCALARTYPE, SizeType>
     {
-        typedef const_sparse_matrix_adapter<SCALARTYPE>   BaseType;
+        typedef const_sparse_matrix_adapter<SCALARTYPE, SizeType>   BaseType;
       public:
-        typedef sparse_matrix_adapted_iterator<SCALARTYPE, true>      iterator1;
-        typedef sparse_matrix_adapted_iterator<SCALARTYPE, false>     iterator2;
-        typedef std::size_t                                           size_type;
-        
-        sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > & mat) 
-         : BaseType(mat), mat_(mat), size1_(mat_.size()), size2_(mat_.size()) { };
-
-        sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > & mat,
-                              std::size_t num_rows,
-                              std::size_t num_cols) 
-         : BaseType(mat, num_rows, num_cols), mat_(mat), size1_(num_rows), size2_(num_cols) { };
-         
+        typedef sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, true>      iterator1;
+        typedef sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, false>     iterator2;
+        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, true, true>      const_iterator1;
+        typedef const_sparse_matrix_adapted_iterator<SCALARTYPE, SizeType, false, true>     const_iterator2;
+        typedef SizeType                                              size_type;
+
+        sparse_matrix_adapter(std::vector<std::map<SizeType, SCALARTYPE> > & mat)
+         : BaseType(mat), mat_(mat), size1_(mat_.size()), size2_(mat_.size()) {}
+
+        sparse_matrix_adapter(std::vector<std::map<SizeType, SCALARTYPE> > & mat,
+                              vcl_size_t num_rows,
+                              vcl_size_t num_cols)
+         : BaseType(mat, num_rows, num_cols), mat_(mat), size1_(static_cast<size_type>(num_rows)), size2_(static_cast<size_type>(num_cols)) {}
+
         iterator1 begin1() { return iterator1(mat_, 0, 0); }
-        iterator1 end1() { return iterator1(mat_, mat_.size(), mat_.back().size()); }
+        iterator1 end1() { return iterator1(mat_, static_cast<int>(mat_.size()), static_cast<int>(mat_.back().size())); }
+
+        const_iterator1 begin1() const { return const_iterator1(mat_, 0, 0); }
+        const_iterator1 end1() const   { return const_iterator1(mat_, size1(), size2()); }
 
         iterator2 begin2() { return iterator2(mat_, 0, 0); }
         iterator2 end2() { return iterator2(mat_, mat_.size(), mat_.back().size()); }
-        
-        SCALARTYPE & operator()(size_type i, size_type j) { return mat_[i][j]; }
-        
-        void resize(size_type i, size_type j, bool preserve = true)
+
+        const_iterator2 begin2() const { return const_iterator2(mat_, 0, 0); }
+        const_iterator2 end2() const   { return const_iterator2(mat_, size1(), size2()); }
+
+        SCALARTYPE & operator()(vcl_size_t i, vcl_size_t j) { return mat_[i][static_cast<size_type>(j)]; }
+
+        void resize(vcl_size_t i, vcl_size_t j, bool preserve = true)
         {
           if (i>0)
             mat_.resize(i);
           if (!preserve)
             clear();
-          
-          size1_ = i;
-          size2_ = j;
+
+          size1_ = static_cast<size_type>(i);
+          size2_ = static_cast<size_type>(j);
         }
-        
+
         void clear()
         {
           for (size_type i=0; i<mat_.size(); ++i)
             mat_[i].clear();
         }
-        
+
         size_type size1() { return size1_; }
         size_type size1() const { return size1_; } //Note: Due to name hiding it is not sufficient to have it in the base class
-        
+
         //assume a square matrix
         size_type size2() { return size2_; }
         size_type size2() const { return size2_; } //Note: Due to name hiding it is not sufficient to have it in the base class
-        
+
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > & mat_;
+        std::vector<std::map<SizeType, SCALARTYPE> > & mat_;
         size_type size1_;
         size_type size2_;
     };
-    
+
 
   }
 }
diff --git a/viennacl/tools/entry_proxy.hpp b/viennacl/tools/entry_proxy.hpp
index f7d4543..16ebc65 100644
--- a/viennacl/tools/entry_proxy.hpp
+++ b/viennacl/tools/entry_proxy.hpp
@@ -2,28 +2,28 @@
 #define VIENNACL_TOOLS_ENTRY_PROXY_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file entry_proxy.hpp
+/** @file viennacl/tools/entry_proxy.hpp
     @brief A proxy class for entries in a vector
 */
 
 
 #include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
 #include "viennacl/scalar.hpp"
 
 namespace viennacl
@@ -40,24 +40,26 @@ namespace viennacl
     class entry_proxy
     {
       public:
+        typedef viennacl::backend::mem_handle      handle_type;
+
         /** @brief The constructor for the proxy class. Declared explicit to avoid any surprises created by the compiler.
         *
         * @param mem_offset The memory offset in multiples of sizeof(SCALARTYPE) relative to the memory pointed to by the handle
         * @param mem_handle A viennacl::ocl::handle for the memory buffer on the GPU.
         */
-        explicit entry_proxy(unsigned int mem_offset, 
-                             viennacl::ocl::handle<cl_mem> const & mem_handle) 
-         : _index(mem_offset), _mem_handle(mem_handle) {};
-        
-         
+        explicit entry_proxy(vcl_size_t mem_offset,
+                             handle_type & mem_handle)
+         : index_(mem_offset), mem_handle_(mem_handle) {}
+
+
         //operators:
         /** @brief Inplace addition of a CPU floating point value
         */
         entry_proxy & operator+=(SCALARTYPE value)
         {
           SCALARTYPE temp = read();
-          temp += value; 
-          write(temp);         
+          temp += value;
+          write(temp);
           return *this;
         }
 
@@ -66,8 +68,8 @@ namespace viennacl
         entry_proxy &  operator-=(SCALARTYPE value)
         {
           SCALARTYPE temp = read();
-          temp -= value; 
-          write(temp);         
+          temp -= value;
+          write(temp);
           return *this;
         }
 
@@ -76,8 +78,8 @@ namespace viennacl
         entry_proxy &  operator*=(SCALARTYPE value)
         {
           SCALARTYPE temp = read();
-          temp *= value; 
-          write(temp);         
+          temp *= value;
+          write(temp);
           return *this;
         }
 
@@ -86,8 +88,8 @@ namespace viennacl
         entry_proxy &  operator/=(SCALARTYPE value)
         {
           SCALARTYPE temp = read();
-          temp /= value; 
-          write(temp);         
+          temp /= value;
+          write(temp);
           return *this;
         }
 
@@ -103,9 +105,7 @@ namespace viennacl
         */
         entry_proxy & operator=(scalar<SCALARTYPE> const & value)
         {
-          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), value.handle().get(), _mem_handle.get(), 0, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
+          viennacl::backend::memory_copy(value.handle(), mem_handle_, 0, sizeof(SCALARTYPE)*index_, sizeof(SCALARTYPE));
           return *this;
         }
 
@@ -113,13 +113,7 @@ namespace viennacl
         */
         entry_proxy &  operator=(entry_proxy const & other)
         {
-          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                           other._mem_handle.get(), //src
-                                           _mem_handle.get(),       //dest
-                                           sizeof(SCALARTYPE) * other._index, //offset src
-                                           sizeof(SCALARTYPE) * _index,       //offset dest
-                                           sizeof(SCALARTYPE), 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
+          viennacl::backend::memory_copy(other.handle(), mem_handle_, sizeof(SCALARTYPE) * other.index_, sizeof(SCALARTYPE)*index_, sizeof(SCALARTYPE));
           return *this;
         }
 
@@ -137,14 +131,14 @@ namespace viennacl
           SCALARTYPE temp = read();
           return temp;
         }
-        
+
         /** @brief Returns the index of the represented element
         */
-        unsigned int index() const { return _index; }
-        
+        vcl_size_t index() const { return index_; }
+
         /** @brief Returns the memory viennacl::ocl::handle
         */
-        viennacl::ocl::handle<cl_mem> const & handle() const { return _mem_handle; }
+        handle_type const & handle() const { return mem_handle_; }
 
       private:
         /** @brief Reads an element from the GPU to the CPU
@@ -152,28 +146,88 @@ namespace viennacl
         SCALARTYPE read() const
         {
           SCALARTYPE temp;
-          cl_int err;
-          err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), _mem_handle.get(), CL_TRUE, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), &temp, 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
-          viennacl::ocl::get_queue().finish();
+          viennacl::backend::memory_read(mem_handle_, sizeof(SCALARTYPE)*index_, sizeof(SCALARTYPE), &temp);
           return temp;
         }
-        
+
         /** @brief Writes a floating point value to the GPU
         */
         void write(SCALARTYPE value)
         {
-          cl_int err;
-          err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), _mem_handle.get(), CL_TRUE, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), &value, 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
+          viennacl::backend::memory_write(mem_handle_, sizeof(SCALARTYPE)*index_, sizeof(SCALARTYPE), &value);
+        }
+
+        vcl_size_t index_;
+        viennacl::backend::mem_handle & mem_handle_;
+    }; //entry_proxy
+
+
+
+
+
+
+
+    /**
+    * @brief A proxy class for a single element of a vector or matrix. This proxy should not be noticed by end-users of the library.
+    *
+    * This proxy provides access to a single entry of a vector. If the element is assigned to a GPU object, no unnecessary transfers to the CPU and back to GPU are initiated.
+    *
+    * @tparam SCALARTYPE Either float or double
+    */
+    template <typename SCALARTYPE>
+    class const_entry_proxy
+    {
+        typedef const_entry_proxy<SCALARTYPE>      self_type;
+      public:
+        typedef viennacl::backend::mem_handle      handle_type;
+
+        /** @brief The constructor for the proxy class. Declared explicit to avoid any surprises created by the compiler.
+        *
+        * @param mem_offset The memory offset in multiples of sizeof(SCALARTYPE) relative to the memory pointed to by the handle
+        * @param mem_handle A viennacl::ocl::handle for the memory buffer on the GPU.
+        */
+        explicit const_entry_proxy(vcl_size_t mem_offset,
+                                   handle_type const & mem_handle)
+         : index_(mem_offset), mem_handle_(mem_handle) {}
+
+
+        //type conversion:
+        // allows to write something like:
+        //  double test = vector(4);
+        /** @brief Conversion to a CPU floating point value.
+        *
+        *  This conversion allows to write something like
+        *    double test = vector(4);
+        *  However, one has to keep in mind that CPU<->GPU transfers are very slow compared to CPU<->CPU operations.
+        */
+        operator SCALARTYPE () const
+        {
+          SCALARTYPE temp = read();
+          return temp;
+        }
+
+        /** @brief Returns the index of the represented element
+        */
+        unsigned int index() const { return index_; }
+
+        /** @brief Returns the memory handle
+        */
+        handle_type const & handle() const { return mem_handle_; }
+
+      private:
+        /** @brief Reads an element from the GPU to the CPU
+        */
+        SCALARTYPE read() const
+        {
+          SCALARTYPE temp;
+          viennacl::backend::memory_read(mem_handle_, sizeof(SCALARTYPE)*index_, sizeof(SCALARTYPE), &temp);
+          return temp;
         }
-        
-        unsigned int _index;
-        viennacl::ocl::handle<cl_mem> const & _mem_handle;
+
+        vcl_size_t index_;
+        viennacl::backend::mem_handle const & mem_handle_;
     }; //entry_proxy
-    
+
 }
 
 #endif
diff --git a/viennacl/tools/matrix_kernel_class_deducer.hpp b/viennacl/tools/matrix_kernel_class_deducer.hpp
deleted file mode 100644
index 898a779..0000000
--- a/viennacl/tools/matrix_kernel_class_deducer.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef VIENNACL_TOOLS_MATRIX_KERNEL_CLASS_DEDUCER_HPP_
-#define VIENNACL_TOOLS_MATRIX_KERNEL_CLASS_DEDUCER_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix_kernel_class_deducer.hpp
-    @brief Implementation of a helper meta class for deducing the correct kernels for the supplied matrix
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/linalg/kernels/matrix_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_row_kernels.h"
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    /**     @brief Implementation of a helper meta class for deducing the correct kernels for the supplied matrix */
-    template <typename MatrixType1>
-    struct MATRIX_KERNEL_CLASS_DEDUCER
-    {};
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    //support for matrix range:
-    template <typename T>
-    struct MATRIX_KERNEL_CLASS_DEDUCER< viennacl::matrix_range<T> >
-    {
-      typedef typename MATRIX_KERNEL_CLASS_DEDUCER<T>::ResultType    ResultType;
-    };
-    
-    
-  }
-
-}
-
-#endif
diff --git a/viennacl/tools/matrix_prod_kernel_class_deducer.hpp b/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
deleted file mode 100644
index 6905537..0000000
--- a/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifndef VIENNACL_TOOLS_MATRIX_PROD_KERNEL_CLASS_DEDUCER_HPP_
-#define VIENNACL_TOOLS_MATRIX_PROD_KERNEL_CLASS_DEDUCER_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix_prod_kernel_class_deducer.hpp
-    @brief Implementation of a helper meta class for deducing the correct kernels for matrix-matrix products
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_col_row_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_prod_row_row_row_kernels.h"
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    namespace detail
-    {
-      template <typename MatrixType>
-      struct extract_matrix
-      {
-        typedef typename MatrixType::ERROR_UNKNOWN_MATRIX_TYPE_PROVIDED   error_type;
-      };
-      
-      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-      struct extract_matrix < viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-      {
-        typedef viennacl::matrix<SCALARTYPE, F, ALIGNMENT>   type;
-      };
-
-      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
-      struct extract_matrix < const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-      {
-        typedef viennacl::matrix<SCALARTYPE, F, ALIGNMENT>   type;
-      };
-
-      
-      template <typename MatrixType>
-      struct extract_matrix < viennacl::matrix_range<MatrixType> >
-      {
-        typedef typename extract_matrix<MatrixType>::type   type;
-      };
-
-      template <typename MatrixType>
-      struct extract_matrix < const viennacl::matrix_range<MatrixType> >
-      {
-        typedef typename extract_matrix<MatrixType>::type   type;
-      };
-      
-      
-    }
-    
-    
-    
-    /** @brief deduces kernel type for C=A*B, where A, B, C are MatrixType1, MatrixType2 and MatrixType3 respectively */
-    template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER
-    {
-      typedef typename MATRIX_PROD_KERNEL_CLASS_DEDUCER< typename detail::extract_matrix<MatrixType1>::type,
-                                                         typename detail::extract_matrix<MatrixType2>::type,
-                                                         typename detail::extract_matrix<MatrixType3>::type>::ResultType   ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_row_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_row_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_col_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_row_col_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_row_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_row_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_col_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                             viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_prod_col_col_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-  }
-
-}
-
-#endif
diff --git a/viennacl/tools/matrix_size_deducer.hpp b/viennacl/tools/matrix_size_deducer.hpp
index 0f6e564..b15dc1b 100644
--- a/viennacl/tools/matrix_size_deducer.hpp
+++ b/viennacl/tools/matrix_size_deducer.hpp
@@ -2,34 +2,36 @@
 #define VIENNACL_TOOLS_MATRIX_SIZE_DEDUCER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 
-/** @file matrix_size_deducer.hpp
+/** @file viennacl/tools/matrix_size_deducer.hpp
     @brief Helper implementations that deduce the dimensions of the supplied matrix-valued expressions.
 */
 
 #include <string>
 #include <fstream>
 #include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/tools/adapter.hpp"
-
+#include <cmath>
 #include <vector>
 #include <map>
 
+#include "viennacl/forwards.h"
+#include "viennacl/tools/adapter.hpp"
+
 namespace viennacl
 {
   namespace tools
@@ -45,98 +47,167 @@ namespace viennacl
     struct MATRIX_SIZE_DEDUCER
     {
       //Standard case: size1 from lhs, size2 from rhs (fits most cases)
-      static size_t size1(LHS & lhs, RHS & rhs) { return lhs.size1(); }
-      static size_t size2(LHS & lhs, RHS & rhs) { return rhs.size2(); }
+      static vcl_size_t size1(LHS & lhs, RHS & /*rhs*/) { return lhs.size1(); }
+      static vcl_size_t size2(LHS & /*lhs*/, RHS & rhs) { return rhs.size2(); }
     };
-    
+
+    /** \cond */
     //special case: outer vector product:
-    template <typename ScalarType, unsigned int A1, unsigned int A2>
-    struct MATRIX_SIZE_DEDUCER<viennacl::vector<ScalarType, A1>,
-                               viennacl::vector<ScalarType, A2>,
+    template <typename ScalarType>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::vector_base<ScalarType>,
+                               const viennacl::vector_base<ScalarType>,
                                viennacl::op_prod>
     {
-      static size_t size1(viennacl::vector<ScalarType, A1> & lhs,
-                          viennacl::vector<ScalarType, A2> & rhs) { return lhs.size1(); }
+      static vcl_size_t size1(viennacl::vector_base<ScalarType> const & lhs,
+                               viennacl::vector_base<ScalarType> const & /*rhs*/) { return lhs.size(); }
 
-      static size_t size2(viennacl::vector<ScalarType, A1> & lhs,
-                          viennacl::vector<ScalarType, A2> & rhs) { return rhs.size2(); }
+      static vcl_size_t size2(viennacl::vector_base<ScalarType> const & /*lhs*/,
+                               viennacl::vector_base<ScalarType> const & rhs) { return rhs.size(); }
     };
 
-    //special case: transposed matrix-Something product: Return the number of rows of the matrix
-    /*template <typename MatrixType, typename ScalarType, unsigned int A>
-    struct MATRIX_SIZE_DEDUCER<MatrixType, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
+
+    //special case: multiplication with a scalar
+    template <typename LHS, typename RHS, typename OP, typename ScalarType>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<const LHS, const RHS, OP>,
+                               const ScalarType,
+                               viennacl::op_mult>
     {
-      static unsigned int size(MatrixType & lhs, const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };*/
+      static vcl_size_t size1(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                               ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size1(lhs.lhs(), lhs.rhs()); }
 
-    // A^T * B
-    template <typename ScalarType, typename T1, typename F2, unsigned int A2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
-                                                                 T1, op_trans>,
-                               const viennacl::matrix<ScalarType, F2, A2>,
-                               viennacl::op_prod>
+      static vcl_size_t size2(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                               ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size2(lhs.lhs(), lhs.rhs()); }
+    };
+
+    //special case: multiplication with a scalar
+    template <typename T, typename F, typename ScalarType>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<T, F>,
+                               const ScalarType,
+                               viennacl::op_mult>
     {
-      static std::size_t size1(viennacl::matrix_expression<T1,
-                                                           T1,
-                                                           op_trans> const & lhs,
-                               viennacl::matrix<ScalarType, F2, A2> const & rhs) { return lhs.lhs().size2(); }
-      static std::size_t size2(viennacl::matrix_expression<T1,
-                                                           T1,
-                                                           op_trans> const & lhs,
-                               viennacl::matrix<ScalarType, F2, A2> const & rhs) { return rhs.size2(); }
+      static vcl_size_t size1(viennacl::matrix_base<T, F> const & lhs,
+                               ScalarType const & /*rhs*/) { return lhs.size1(); }
+
+      static vcl_size_t size2(viennacl::matrix_base<T, F> const & lhs,
+                               ScalarType const & /*rhs*/) { return lhs.size2(); }
+    };
+
+
+    //special case: division with a scalar
+    template <typename LHS, typename RHS, typename OP, typename ScalarType>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<const LHS, const RHS, OP>,
+                               const ScalarType,
+                               viennacl::op_div>
+    {
+      static vcl_size_t size1(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                               ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size1(lhs.lhs(), lhs.rhs()); }
+
+      static vcl_size_t size2(viennacl::matrix_expression<const LHS, const RHS, OP> const & lhs,
+                               ScalarType const & /*rhs*/) { return MATRIX_SIZE_DEDUCER<const LHS, const RHS, OP>::size2(lhs.lhs(), lhs.rhs()); }
+    };
+
+    //special case: division with a scalar
+    template <typename T, typename F, typename ScalarType>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<T, F>,
+                               const ScalarType,
+                               viennacl::op_div>
+    {
+      static vcl_size_t size1(viennacl::matrix_base<T, F> const & lhs,
+                               ScalarType const & /*rhs*/) { return lhs.size1(); }
+
+      static vcl_size_t size2(viennacl::matrix_base<T, F> const & lhs,
+                               ScalarType const & /*rhs*/) { return lhs.size2(); }
     };
 
-    template <typename T1, typename MatrixType2>
+    //special case: diagonal from vector
+    template <typename T>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::vector_base<T>,
+                               const int,
+                               viennacl::op_vector_diag>
+    {
+      static vcl_size_t size1(viennacl::vector_base<T> const & lhs,
+                               const int k) { return lhs.size() + static_cast<vcl_size_t>(std::fabs(double(k))); }
+
+      static vcl_size_t size2(viennacl::vector_base<T> const & lhs,
+                               const int k) { return lhs.size() + static_cast<vcl_size_t>(std::fabs(double(k))); }
+    };
+
+
+
+
+
+
+
+
+    //special case: transposed matrix-vector product: Return the number of rows of the matrix
+    template <typename MatrixType>
+    struct MATRIX_SIZE_DEDUCER<MatrixType,
+                               MatrixType,
+                               viennacl::op_trans>
+    {
+      static vcl_size_t size1(const MatrixType & lhs,
+                               const MatrixType & /*rhs*/) { return lhs.size2(); }
+      static vcl_size_t size2(const MatrixType & lhs,
+                               const MatrixType & /*rhs*/) { return lhs.size1(); }
+    };
+
+    // A^T * B
+    template <typename ScalarType, typename T1, typename F2>
     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
                                                                  T1, op_trans>,
-                               const viennacl::matrix_range<MatrixType2>,
-                               viennacl::op_prod>
+                               const viennacl::matrix_base<ScalarType, F2>,
+                               viennacl::op_mat_mat_prod>
     {
-      static std::size_t size1(viennacl::matrix_expression<T1,
+      static vcl_size_t size1(viennacl::matrix_expression<T1,
                                                            T1,
                                                            op_trans> const & lhs,
-                               viennacl::matrix_range<MatrixType2> const & rhs) { return lhs.lhs().size2(); }
-      static std::size_t size2(viennacl::matrix_expression<T1,
+                               viennacl::matrix_base<ScalarType, F2> const & /*rhs*/) { return lhs.lhs().size2(); }
+      static vcl_size_t size2(viennacl::matrix_expression<T1,
                                                            T1,
-                                                           op_trans> const & lhs,
-                               viennacl::matrix_range<MatrixType2> const & rhs) { return rhs.size2(); }
+                                                           op_trans> const & /*lhs*/,
+                               viennacl::matrix_base<ScalarType, F2> const & rhs) { return rhs.size2(); }
     };
-    
-    
-    // A * B^T 
-    
-    template <typename ScalarType, typename F1, unsigned int A1, typename T2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F1, A1>,
+
+
+    // A * B^T
+
+    template <typename ScalarType, typename F1, typename T2>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_base<ScalarType, F1>,
                                const viennacl::matrix_expression<T2,
                                                                  T2, op_trans>,
-                               viennacl::op_prod>
+                               viennacl::op_mat_mat_prod>
     {
-      static std::size_t size1(viennacl::matrix<ScalarType, F1, A1> const & lhs,
+      static vcl_size_t size1(viennacl::matrix_base<ScalarType, F1> const & lhs,
                                viennacl::matrix_expression<T2,
                                                            T2,
-                                                           op_trans> const & rhs) { return lhs.size1(); }
-      static std::size_t size2(viennacl::matrix<ScalarType, F1, A1> const & lhs,
+                                                           op_trans> const & /*rhs*/) { return lhs.size1(); }
+      static vcl_size_t size2(viennacl::matrix_base<ScalarType, F1> const & /*lhs*/,
                                viennacl::matrix_expression<T2,
                                                            T2,
                                                            op_trans> const & rhs) { return rhs.lhs().size1(); }
     };
 
-    template <typename MatrixType1, typename T2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_range<MatrixType1>,
+
+
+
+    // A^T * B^T
+
+    template <typename T1, typename T2>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
+                                                                 T1, op_trans>,
                                const viennacl::matrix_expression<T2,
                                                                  T2, op_trans>,
-                               viennacl::op_prod>
+                               viennacl::op_mat_mat_prod>
     {
-      static std::size_t size1(viennacl::matrix_range<MatrixType1> const & lhs,
-                               viennacl::matrix_expression<T2,
-                                                           T2,
-                                                           op_trans> const & rhs) { return lhs.size1(); }
-      static std::size_t size2(viennacl::matrix_range<MatrixType1> const & lhs,
-                               viennacl::matrix_expression<T2,
-                                                           T2,
-                                                           op_trans> const & rhs) { return rhs.lhs().size1(); }
+      typedef viennacl::matrix_expression<T1, T1, op_trans>   LHSType;
+      typedef viennacl::matrix_expression<T2, T2, op_trans>   RHSType;
+
+      static vcl_size_t size1(LHSType const & lhs,
+                               RHSType const & /*rhs*/) { return lhs.lhs().size2(); }
+      static vcl_size_t size2(LHSType const & /*lhs*/,
+                               RHSType const & rhs) { return rhs.lhs().size1(); }
     };
-    
+    /** \endcond */
   }
 }
 
diff --git a/viennacl/tools/matrix_solve_kernel_class_deducer.hpp b/viennacl/tools/matrix_solve_kernel_class_deducer.hpp
deleted file mode 100644
index d9694c2..0000000
--- a/viennacl/tools/matrix_solve_kernel_class_deducer.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef VIENNACL_TOOLS_MATRIX_SOLVE_KERNEL_CLASS_DEDUCER_HPP_
-#define VIENNACL_TOOLS_MATRIX_SOLVE_KERNEL_CLASS_DEDUCER_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file matrix_solve_kernel_class_deducer.hpp
-    @brief Implementation of a helper meta class for deducing the correct kernels for the dense matrix solver
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/linalg/kernels/matrix_solve_col_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_solve_col_row_kernels.h"
-#include "viennacl/linalg/kernels/matrix_solve_row_col_kernels.h"
-#include "viennacl/linalg/kernels/matrix_solve_row_row_kernels.h"
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    /** @brief deduces kernel type for A \ B, where A, B, C are MatrixType1 and MatrixType2 */
-    template <typename MatrixType1, typename MatrixType2>
-    struct MATRIX_SOLVE_KERNEL_CLASS_DEDUCER
-    {};
-    
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                              viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_solve_row_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
-                                              viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_solve_row_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                              viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_solve_col_row<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    struct MATRIX_SOLVE_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT>,
-                                              viennacl::matrix<SCALARTYPE, viennacl::column_major, ALIGNMENT> >
-    {
-      typedef viennacl::linalg::kernels::matrix_solve_col_col<SCALARTYPE, ALIGNMENT>     ResultType;
-    };
-    
-  }
-
-}
-
-#endif
diff --git a/viennacl/tools/shared_ptr.hpp b/viennacl/tools/shared_ptr.hpp
new file mode 100644
index 0000000..b178fcd
--- /dev/null
+++ b/viennacl/tools/shared_ptr.hpp
@@ -0,0 +1,163 @@
+#ifndef VIENNACL_TOOLS_SHARED_PTR_HPP
+#define VIENNACL_TOOLS_SHARED_PTR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file tools/shared_ptr.hpp
+    @brief Implementation of a shared pointer class (cf. std::shared_ptr, boost::shared_ptr). Will be used until C++11 is widely available.
+
+    Contributed by Philippe Tillet.
+*/
+
+#include <cstdlib>
+#include <algorithm>
+
+namespace viennacl
+{
+  namespace tools
+  {
+
+    namespace detail
+    {
+
+      /** @brief Reference counting class for the shared_ptr implementation */
+      class count
+      {
+        public:
+          count(unsigned int val) : val_(val){ }
+          void dec(){ --val_; }
+          void inc(){ ++val_; }
+          bool is_null(){ return val_ == 0; }
+          unsigned int val(){ return val_; }
+        private:
+          unsigned int val_;
+      };
+
+      /** @brief Interface for the reference counter inside the shared_ptr */
+      struct aux
+      {
+        detail::count count;
+
+        aux() :count(1) {}
+        virtual void destroy()=0;
+        virtual ~aux() {}
+      };
+
+      /** @brief Implementation helper for the reference counting mechanism inside shared_ptr. */
+      template<class U, class Deleter>
+      struct auximpl: public detail::aux
+      {
+        U* p;
+        Deleter d;
+
+        auximpl(U* pu, Deleter x) :p(pu), d(x) {}
+        virtual void destroy() { d(p); }
+      };
+
+      /** @brief Default deleter class for a pointer. The default is to just call 'delete' on the pointer. Provide your own implementations for 'delete[]' and 'free'. */
+      template<class U>
+      struct default_deleter
+      {
+        void operator()(U* p) const { delete p; }
+      };
+
+    }
+
+    /** @brief A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependency. Will be replaced by std::shared_ptr as soon as C++11 is widely available. */
+    template<class T>
+    class shared_ptr
+    {
+        template<class U>
+        friend class shared_ptr;
+
+        detail::aux* pa;
+        T* pt;
+
+      public:
+
+        shared_ptr() :pa(NULL), pt(NULL) {}
+
+        template<class U, class Deleter>
+        shared_ptr(U* pu, Deleter d) : pa(new detail::auximpl<U, Deleter>(pu, d)), pt(pu) {}
+
+        template<class U>
+        explicit shared_ptr(U* pu) : pa(new detail::auximpl<U, detail::default_deleter<U> >(pu, detail::default_deleter<U>())), pt(pu) {}
+
+        shared_ptr(const shared_ptr& s) :pa(s.pa), pt(s.pt) { inc(); }
+
+        template<class U>
+        shared_ptr(const shared_ptr<U>& s) :pa(s.pa), pt(s.pt) { inc(); }
+
+        ~shared_ptr() { dec(); }
+
+        void reset(){
+            shared_ptr<T>().swap(*this);
+        }
+
+        void reset(T * ptr){
+            shared_ptr<T>(ptr).swap(*this);
+        }
+
+        void swap(shared_ptr<T> & other){
+            std::swap(pt,other.pt);
+            std::swap(pa, other.pa);
+        }
+
+
+        shared_ptr& operator=(const shared_ptr& s)
+        {
+            if(this!=&s)
+            {
+                dec();
+                pa = s.pa;
+                pt = s.pt;
+                inc();
+            }
+            return *this;
+        }
+
+        T* get() const {  return pt; }
+
+        T* operator->() const {  return pt; }
+
+        T& operator*() const { return *pt; }
+
+        void inc() { if(pa) pa->count.inc(); }
+
+        void dec()
+        {
+          if(pa)
+          {
+            pa->count.dec();
+
+            if(pa->count.is_null())
+            {
+                pa->destroy();
+                delete pa;
+                pa = NULL;
+            }
+          }
+        }
+
+    };
+
+  }
+
+}
+
+#endif // VIENNACL_TOOLS_SHARED_PTR_HPP
diff --git a/viennacl/tools/timer.hpp b/viennacl/tools/timer.hpp
new file mode 100644
index 0000000..8d7a4f0
--- /dev/null
+++ b/viennacl/tools/timer.hpp
@@ -0,0 +1,122 @@
+#ifndef _VIENNACL_TOOLS_TIMER_HPP_
+#define _VIENNACL_TOOLS_TIMER_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file   viennacl/tools/timer.hpp
+    @brief  A simple, yet (mostly) sufficiently accurate timer for benchmarking and profiling. */
+
+#include <iostream>
+
+
+#ifdef _WIN32
+
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+namespace viennacl{
+
+  namespace tools{
+
+    /** @brief Simple timer class based on gettimeofday (POSIX) or QueryPerformanceCounter (Windows).
+      *
+      * Avoids messing with Boost and should be sufficient for benchmarking purposes.
+      */
+    class timer
+    {
+    public:
+
+      timer()
+      {
+        QueryPerformanceFrequency(&freq);
+      }
+
+      void start()
+      {
+        QueryPerformanceCounter((LARGE_INTEGER*) &start_time);
+      }
+
+      double get() const
+      {
+        LARGE_INTEGER  end_time;
+        QueryPerformanceCounter((LARGE_INTEGER*) &end_time);
+        return (static_cast<double>(end_time.QuadPart) - static_cast<double>(start_time.QuadPart)) / static_cast<double>(freq.QuadPart);
+      }
+
+
+    private:
+      LARGE_INTEGER freq;
+      LARGE_INTEGER start_time;
+    };
+
+  }
+
+}
+
+
+#else
+
+#include <sys/time.h>
+
+namespace viennacl{
+
+  namespace tools{
+
+    /** @brief Simple timer class based on gettimeofday (POSIX) or QueryPerformanceCounter (Windows).
+      *
+      * Avoids messing with Boost and should be sufficient for benchmarking purposes.
+      */
+    class timer
+    {
+    public:
+
+      timer() : ts(0)
+      {}
+
+      void start()
+      {
+        struct timeval tval;
+        gettimeofday(&tval, NULL);
+        ts = static_cast<double>(tval.tv_sec * 1000000 + tval.tv_usec);
+      }
+
+      double get() const
+      {
+        struct timeval tval;
+        gettimeofday(&tval, NULL);
+        double end_time = tval.tv_sec * 1000000 + tval.tv_usec;
+
+        return static_cast<double>(end_time-ts) / 1000000.0;
+      }
+
+    private:
+      double ts;
+    };
+
+  }
+
+}
+
+
+
+#endif
+
+#endif
diff --git a/viennacl/tools/tools.hpp b/viennacl/tools/tools.hpp
index 243e90c..028d3aa 100644
--- a/viennacl/tools/tools.hpp
+++ b/viennacl/tools/tools.hpp
@@ -1,339 +1,289 @@
-#ifndef VIENNACL_TOOLS_TOOLS_HPP_
-#define VIENNACL_TOOLS_TOOLS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file tools.hpp
-    @brief Various little tools used here and there in ViennaCL.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/tools/adapter.hpp"
-
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace tools
-  {
-    
-    /** @brief Supply suitable increment functions for the iterators: */
-    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    struct MATRIX_ITERATOR_INCREMENTER<viennacl::row_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-    {
-      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, unsigned int & row, unsigned int & col) { ++row; }
-    };
-
-    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    struct MATRIX_ITERATOR_INCREMENTER<viennacl::col_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
-    {
-      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat, unsigned int & row, unsigned int & col) { ++col; }
-    };
-
-    
-    /** @brief A guard that checks whether the floating point type of GPU types is either float or double */
-    template <typename T>
-    struct CHECK_SCALAR_TEMPLATE_ARGUMENT
-    {
-        typedef typename T::ERROR_SCALAR_MUST_HAVE_TEMPLATE_ARGUMENT_FLOAT_OR_DOUBLE  ResultType;
-    };
-    
-    template <>
-    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<float>
-    {
-        typedef float  ResultType;
-    };
-    
-    template <>
-    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<double>
-    {
-        typedef double  ResultType;
-    };
-
-    
-    
-    /** @brief Reads a text from a file into a std::string
-    *
-    * @param filename   The filename
-    * @return The text read from the file
-    */
-    inline std::string readTextFromFile(const std::string & filename)
-    {
-      std::ifstream f(filename.c_str());
-      if (!f) return std::string();
-
-      std::stringstream result;
-      std::string tmp;
-      while (std::getline(f, tmp))
-        result << tmp << std::endl;
-
-      return result.str();
-    }
-
-    /** @brief Replaces all occurances of a substring by another stringstream
-    *
-    * @param text   The string to search in
-    * @param to_search  The substring to search for
-    * @param to_replace The replacement for found substrings
-    * @return The resulting string
-    */
-    inline std::string strReplace(const std::string & text, std::string to_search, std::string to_replace)
-    {
-      std::string::size_type pos = 0;
-      std::string result;
-      std::string::size_type found;
-      while( (found = text.find(to_search, pos)) != std::string::npos )
-      {
-        result.append(text.substr(pos,found-pos));
-        result.append(to_replace);
-        pos = found + to_search.length();
-      }
-      if (pos < text.length())
-        result.append(text.substr(pos));
-      return result;
-    }
-
-    /** @brief Rounds an integer to the next multiple of another integer
-    *
-    * @tparam INT_TYPE  The integer type
-    * @param to_reach   The integer to be rounded up (ceil operation)
-    * @param base       The base
-    * @return The smallest multiple of 'base' such that to_reach <= base
-    */
-    template <class INT_TYPE>
-    INT_TYPE roundUpToNextMultiple(INT_TYPE to_reach, INT_TYPE base)
-    {
-      if (to_reach % base == 0) return to_reach;
-      return ((to_reach / base) + 1) * base;
-    }
-    
-    
-    /** @brief Create a double precision kernel out of a single precision kernel
-    *
-    * @param source          The source string
-    * @param platform_info   An info string that contains the OpenCL platform vendor
-    * @return   The double precision kernel
-    */
-    inline std::string make_double_kernel(std::string const & source, std::string const & fp_extension)
-    {
-      std::stringstream ss;
-      ss << "#pragma OPENCL EXTENSION " << fp_extension << " : enable\n\n";
-      
-      std::string result = ss.str();
-      result.append(strReplace(source, "float", "double"));
-      return result;
-    }
-    
-    
-    /** @brief Removes the const qualifier from a type */
-    template <typename T>
-    struct CONST_REMOVER
-    {
-      typedef T   ResultType;
-    };
-
-    template <typename T>
-    struct CONST_REMOVER<const T>
-    {
-      typedef T   ResultType;
-    };
-
-
-    /** @brief Extracts the vector type from one of the two arguments. Used for the vector_expression type.
-    *
-    * @tparam LHS   The left hand side operand of the vector_expression
-    * @tparam RHS   The right hand side operand of the vector_expression
-    */
-    template <typename LHS, typename RHS>
-    struct VECTOR_EXTRACTOR_IMPL
-    {
-      typedef typename LHS::ERROR_COULD_NOT_EXTRACT_VECTOR_INFORMATION_FROM_VECTOR_EXPRESSION  ResultType;
-    };
-    
-    template <typename LHS, typename ScalarType, unsigned int A>
-    struct VECTOR_EXTRACTOR_IMPL<LHS, viennacl::vector<ScalarType, A> >
-    {
-      typedef viennacl::vector<ScalarType, A>   ResultType;
-    };
-
-    template <typename RHS, typename ScalarType, unsigned int A>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector<ScalarType, A>, RHS>
-    {
-      typedef viennacl::vector<ScalarType, A>   ResultType;
-    };
-
-    //resolve ambiguities for previous cases:
-    template <typename ScalarType, unsigned int A>
-    struct VECTOR_EXTRACTOR_IMPL<viennacl::vector<ScalarType, A>, viennacl::vector<ScalarType, A> >
-    {
-      typedef viennacl::vector<ScalarType, A>   ResultType;
-    };
-
-    template <typename LHS, typename RHS>
-    struct VECTOR_EXTRACTOR
-    {
-      typedef typename VECTOR_EXTRACTOR_IMPL<typename CONST_REMOVER<LHS>::ResultType,
-                                              typename CONST_REMOVER<RHS>::ResultType>::ResultType      ResultType;
-    };
-
-    /** @brief Deduces the size of the resulting vector represented by a vector_expression from the operands
-    *
-    * @tparam LHS   The left hand side operand
-    * @tparam RHS   The right hand side operand
-    * @tparam OP    The operation tag
-    */
-    template <typename LHS, typename RHS, typename OP>
-    struct VECTOR_SIZE_DEDUCER
-    {
-      //take care: using a plain, naive .size() on the left hand side type can cause subtle side-effects!
-    };
-
-    //Standard case: LHS is the vector type and carries the correct size
-    template <typename ScalarType, unsigned int A, typename RHS>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
-                         const RHS & rhs) { return lhs.size(); }
-    };
-
-    template <typename ScalarType, unsigned int A, typename RHS>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::vector<ScalarType, A>, RHS, viennacl::op_div>
-    {
-      static size_t size(const viennacl::vector<ScalarType, A> & lhs,
-                         const RHS & rhs) { return lhs.size(); }
-    };
-    
-    //special case: matrix-vector product: Return the number of rows of the matrix
-    template <typename ScalarType, typename F, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::matrix<ScalarType, F, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::circulant_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::circulant_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-    
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::compressed_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::compressed_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-
-    template <typename ScalarType, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::coordinate_matrix<ScalarType, Amat>, const viennacl::vector<ScalarType, A>, viennacl::op_prod>
-    {
-      static size_t size(const viennacl::coordinate_matrix<ScalarType, Amat> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
-    };
-    
-    //special case: transposed matrix-vector product: Return the number of cols(!) of the matrix
-    template <typename ScalarType, typename F, unsigned int Amat, unsigned int A>
-    struct VECTOR_SIZE_DEDUCER<const viennacl::matrix_expression< const viennacl::matrix<ScalarType, F, Amat>,
-                                                                  const viennacl::matrix<ScalarType, F, Amat>,
-                                                                  op_trans>,
-                               const viennacl::vector<ScalarType, A>,
-                               viennacl::op_prod>
-    {
-      static size_t size(const viennacl::matrix_expression< const viennacl::matrix<ScalarType, F, Amat>,
-                                                            const viennacl::matrix<ScalarType, F, Amat>,
-                                                            op_trans> & lhs,
-                         const viennacl::vector<ScalarType, A> & rhs) { return lhs.lhs().size2(); }
-    };
-
-    
-    
-    
-    
-    /** @brief Obtain the cpu scalar type from a type, including a GPU type like viennacl::scalar<T>
-    *
-    * @tparam T   Either a CPU scalar type or a GPU scalar type
-    */
-    template <typename T>
-    struct CPU_SCALAR_TYPE_DEDUCER
-    {
-      //force compiler error if type cannot be deduced
-      //typedef T       ResultType;
-    };
-
-    template <>
-    struct CPU_SCALAR_TYPE_DEDUCER< float >
-    {
-      typedef float       ResultType;
-    };
-
-    template <>
-    struct CPU_SCALAR_TYPE_DEDUCER< double >
-    {
-      typedef double       ResultType;
-    };
-    
-    template <typename T>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::scalar<T> >
-    {
-      typedef T       ResultType;
-    };
-
-    template <typename T, unsigned int A>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::vector<T, A> >
-    {
-      typedef T       ResultType;
-    };
-
-    template <typename T, typename F, unsigned int A>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix<T, F, A> >
-    {
-      typedef T       ResultType;
-    };
-
-    
-    template <typename T, typename F, unsigned int A>
-    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix_expression<const matrix<T, F, A>, const matrix<T, F, A>, op_trans> >
-    {
-      typedef T       ResultType;
-    };
-
-        
-  } //namespace tools
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_TOOLS_TOOLS_HPP_
+#define VIENNACL_TOOLS_TOOLS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/tools/tools.hpp
+    @brief Various little tools used here and there in ViennaCL.
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/adapter.hpp"
+
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+  namespace tools
+  {
+
+    /** \cond */
+    /** @brief Supply suitable increment functions for the iterators: */
+    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
+    struct MATRIX_ITERATOR_INCREMENTER<viennacl::row_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
+    {
+      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & /*mat*/, unsigned int & row, unsigned int & /*col*/) { ++row; }
+    };
+
+    template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
+    struct MATRIX_ITERATOR_INCREMENTER<viennacl::col_iteration, viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
+    {
+      static void apply(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & /*mat*/, unsigned int & /*row*/, unsigned int & col) { ++col; }
+    };
+    /** \endcond */
+
+
+    /** @brief A guard that checks whether the floating point type of GPU types is either float or double */
+    template <typename T>
+    struct CHECK_SCALAR_TEMPLATE_ARGUMENT
+    {
+        typedef typename T::ERROR_SCALAR_MUST_HAVE_TEMPLATE_ARGUMENT_FLOAT_OR_DOUBLE  ResultType;
+    };
+
+    /** \cond */
+    template <>
+    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<float>
+    {
+        typedef float  ResultType;
+    };
+
+    template <>
+    struct CHECK_SCALAR_TEMPLATE_ARGUMENT<double>
+    {
+        typedef double  ResultType;
+    };
+    /** \endcond */
+
+
+
+    /** @brief Reads a text from a file into a std::string
+    *
+    * @param filename   The filename
+    * @return The text read from the file
+    */
+    inline std::string readTextFromFile(const std::string & filename)
+    {
+      std::ifstream f(filename.c_str());
+      if (!f) return std::string();
+
+      std::stringstream result;
+      std::string tmp;
+      while (std::getline(f, tmp))
+        result << tmp << std::endl;
+
+      return result.str();
+    }
+
+    /** @brief Replaces all occurrences of a substring by another string
+    *
+    * @param text   The string to search in
+    * @param to_search  The substring to search for
+    * @param to_replace The replacement for found substrings
+    * @return The resulting string
+    */
+    inline std::string strReplace(const std::string & text, std::string to_search, std::string to_replace)
+    {
+      std::string::size_type pos = 0;
+      std::string result;
+      std::string::size_type found;
+      while( (found = text.find(to_search, pos)) != std::string::npos )
+      {
+        result.append(text.substr(pos,found-pos));
+        result.append(to_replace);
+        pos = found + to_search.length();
+      }
+      if (pos < text.length())
+        result.append(text.substr(pos));
+      return result;
+    }
+
+    /** @brief Rounds an integer to the next multiple of another integer
+    *
+    * @tparam INT_TYPE  The integer type
+    * @param to_reach   The integer to be rounded up (ceil operation)
+    * @param base       The base
+    * @return The smallest multiple of 'base' greater than or equal to 'to_reach'
+    */
+    template <class INT_TYPE>
+    INT_TYPE align_to_multiple(INT_TYPE to_reach, INT_TYPE base)
+    {
+      if (to_reach % base == 0) return to_reach;
+      return ((to_reach / base) + 1) * base;
+    }
+
+
+    /** @brief Rounds an integer to the previous multiple of another integer
+    *
+    * @tparam INT_TYPE  The integer type
+    * @param to_reach   The integer to be rounded down (floor operation)
+    * @param base       The base
+    * @return The largest multiple of 'base' less than or equal to 'to_reach'
+    */
+    template <class INT_TYPE>
+    INT_TYPE roundDownToPreviousMultiple(INT_TYPE to_reach, INT_TYPE base)
+    {
+      if (to_reach % base == 0) return to_reach;
+      return (to_reach / base) * base;
+    }
+
+    /** @brief Replace in a source string a pattern by another
+     *
+     * @param source The source string
+     * @param find String to find
+     * @param replace String to replace
+     */
+    int inline find_and_replace(std::string & source, std::string const & find, std::string const & replace)
+    {
+        int num=0;
+        vcl_size_t fLen = find.size();
+        vcl_size_t rLen = replace.size();
+        for (vcl_size_t pos=0; (pos=source.find(find, pos))!=std::string::npos; pos+=rLen)
+        {
+            num++;
+            source.replace(pos, fLen, replace);
+        }
+        return num;
+    }
+
+    /** @brief Create a double precision kernel out of a single precision kernel
+    *
+    * @param source          The source string
+    * @param fp_extension    An info string that specifies the OpenCL double precision extension
+    * @return   The double precision kernel
+    */
+    inline std::string make_double_kernel(std::string const & source, std::string const & fp_extension)
+    {
+      std::stringstream ss;
+      ss << "#pragma OPENCL EXTENSION " << fp_extension << " : enable\n\n";
+
+      std::string result = ss.str();
+      result.append(strReplace(source, "float", "double"));
+      return result;
+    }
+
+
+    /** @brief Removes the const qualifier from a type */
+    template <typename T>
+    struct CONST_REMOVER
+    {
+      typedef T   ResultType;
+    };
+
+    /** \cond */
+    template <typename T>
+    struct CONST_REMOVER<const T>
+    {
+      typedef T   ResultType;
+    };
+    /** \endcond */
+
+
+    /////// CPU scalar type deducer ///////////
+
+    /** @brief Obtain the cpu scalar type from a type, including a GPU type like viennacl::scalar<T>
+    *
+    * @tparam T   Either a CPU scalar type or a GPU scalar type
+    */
+    template <typename T>
+    struct CPU_SCALAR_TYPE_DEDUCER
+    {
+      //force compiler error if type cannot be deduced
+      //typedef T       ResultType;
+    };
+
+    /** \cond */
+    template <>
+    struct CPU_SCALAR_TYPE_DEDUCER< float >
+    {
+      typedef float       ResultType;
+    };
+
+    template <>
+    struct CPU_SCALAR_TYPE_DEDUCER< double >
+    {
+      typedef double       ResultType;
+    };
+
+    template <typename T>
+    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::scalar<T> >
+    {
+      typedef T       ResultType;
+    };
+
+    template <typename T, unsigned int A>
+    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::vector<T, A> >
+    {
+      typedef T       ResultType;
+    };
+
+    template <typename T, typename F, unsigned int A>
+    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix<T, F, A> >
+    {
+      typedef T       ResultType;
+    };
+
+
+    template <typename T, typename F, unsigned int A>
+    struct CPU_SCALAR_TYPE_DEDUCER< viennacl::matrix_expression<const matrix<T, F, A>, const matrix<T, F, A>, op_trans> >
+    {
+      typedef T       ResultType;
+    };
+    /** \endcond */
+
+    //
+    // Converts a scalar type when necessary unless it is a viennacl::scalar<> (typical use-case: convert user-provided floats to double (and vice versa) for OpenCL kernels)
+    //
+
+    template <typename HostScalarType>
+    viennacl::scalar<HostScalarType> const & promote_if_host_scalar(viennacl::scalar<HostScalarType> const & s) { return s; }
+
+    template <typename HostScalarType>
+    viennacl::scalar_expression<const viennacl::scalar<HostScalarType>,
+                                const viennacl::scalar<HostScalarType>,
+                                viennacl::op_flip_sign> const &
+    promote_if_host_scalar(viennacl::scalar_expression<const viennacl::scalar<HostScalarType>,
+                                                       const viennacl::scalar<HostScalarType>,
+                                                       viennacl::op_flip_sign> const & s) { return s; }
+
+    template <typename HostScalarType>
+    HostScalarType promote_if_host_scalar(float s) { return s; }
+
+    template <typename HostScalarType>
+    HostScalarType promote_if_host_scalar(double s) { return s; }
+
+    template <typename HostScalarType>
+    HostScalarType promote_if_host_scalar(long s) { return s; }
+
+    template <typename HostScalarType>
+    HostScalarType promote_if_host_scalar(unsigned long s) { return s; }
+
+    template <typename HostScalarType>
+    HostScalarType promote_if_host_scalar(int s) { return s; }
+
+    template <typename HostScalarType>
+    HostScalarType promote_if_host_scalar(unsigned int s) { return s; }
+
+  } //namespace tools
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/clear.hpp b/viennacl/traits/clear.hpp
index 14f547b..7163592 100644
--- a/viennacl/traits/clear.hpp
+++ b/viennacl/traits/clear.hpp
@@ -1,72 +1,75 @@
-#ifndef VIENNACL_TRAITS_CLEAR_HPP_
-#define VIENNACL_TRAITS_CLEAR_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/traits/clear.hpp
-    @brief Generic clear functionality for different vector and matrix types
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include "viennacl/traits/size.hpp"
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-  namespace traits
-  {
-    
-    //clear:
-    template <typename VectorType>
-    void clear(VectorType & vec)
-    {
-      typedef typename viennacl::result_of::size_type<VectorType>::type  size_type;
-      
-      for (size_type i=0; i<viennacl::traits::size(vec); ++i)
-        vec[i] = 0;  //TODO: Quantity access can also be wrapped...
-    }
-
-    template <typename ScalarType, unsigned int ALIGNMENT>
-    void clear(viennacl::vector<ScalarType, ALIGNMENT> & vec)
-    {
-      vec.clear();
-    }
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_TRAITS_CLEAR_HPP_
+#define VIENNACL_TRAITS_CLEAR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/clear.hpp
+    @brief Generic clear functionality for different vector and matrix types
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+
+#ifdef VIENNACL_WITH_UBLAS
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+#include <Eigen/Core>
+#include <Eigen/Sparse>
+#endif
+
+#ifdef VIENNACL_WITH_MTL4
+#include <boost/numeric/mtl/mtl.hpp>
+#endif
+
+#include "viennacl/traits/size.hpp"
+
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+  namespace traits
+  {
+
+    //clear:
+    /** @brief Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL objects. */
+    template <typename VectorType>
+    void clear(VectorType & vec)
+    {
+      typedef typename viennacl::result_of::size_type<VectorType>::type  size_type;
+
+      for (size_type i=0; i<viennacl::traits::size(vec); ++i)
+        vec[i] = 0;  //TODO: Quantity access can also be wrapped...
+    }
+
+    /** @brief Generic routine for setting all entries of a vector to zero. This is the version for ViennaCL objects. */
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    void clear(viennacl::vector<ScalarType, ALIGNMENT> & vec)
+    {
+      vec.clear();
+    }
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/context.hpp b/viennacl/traits/context.hpp
new file mode 100644
index 0000000..b237874
--- /dev/null
+++ b/viennacl/traits/context.hpp
@@ -0,0 +1,66 @@
+#ifndef VIENNACL_TRAITS_CONTEXT_HPP_
+#define VIENNACL_TRAITS_CONTEXT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/context.hpp
+    @brief Extracts the underlying context from objects
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+#include "viennacl/context.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+  namespace traits
+  {
+    //
+    // Context
+    //
+    /** @brief Returns an ID for the currently active memory domain of an object */
+    template <typename T>
+    viennacl::context context(T const & t)
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (traits::active_handle_id(t) == OPENCL_MEMORY)
+        return viennacl::context(traits::opencl_handle(t).context());
+#endif
+
+      return viennacl::context(traits::active_handle_id(t));
+    }
+
+    /** @brief Returns an ID for the currently active memory domain of an object */
+    inline viennacl::context context(viennacl::backend::mem_handle const & h)
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (h.get_active_handle_id() == OPENCL_MEMORY)
+        return viennacl::context(h.opencl_handle().context());
+#endif
+
+      return viennacl::context(h.get_active_handle_id());
+    }
+
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/fill.hpp b/viennacl/traits/fill.hpp
index 32afed0..673fa4d 100644
--- a/viennacl/traits/fill.hpp
+++ b/viennacl/traits/fill.hpp
@@ -1,69 +1,70 @@
-#ifndef VIENNACL_TRAITS_FILL_HPP_
-#define VIENNACL_TRAITS_FILL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file fill.hpp
-    @brief Generic fill functionality for different matrix types
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/meta/result_of.hpp"
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-
-  namespace traits
-  {
-    //
-    // Resize: Change the size of vectors and matrices
-    //
-    template <typename MatrixType, typename SCALARTYPE>
-    void fill(MatrixType & matrix, std::size_t row_index, std::size_t col_index, SCALARTYPE value)
-    {
-      matrix(row_index, col_index) = value; 
-    }
-    
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <typename T, int options, typename SCALARTYPE>
-    inline void fill(Eigen::SparseMatrix<T, options> & m,
-                     std::size_t row_index,
-                     std::size_t col_index,
-                     SCALARTYPE value
-                    )
-    {
-      m.fill(row_index, col_index) = value;
-    }    
-    #endif
-
- 
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_TRAITS_FILL_HPP_
+#define VIENNACL_TRAITS_FILL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/fill.hpp
+    @brief Generic fill functionality for different matrix types
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+#include "viennacl/meta/result_of.hpp"
+
+#ifdef VIENNACL_WITH_EIGEN
+#include <Eigen/Core>
+#include <Eigen/Sparse>
+#endif
+
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+
+  namespace traits
+  {
+
+    /** @brief Generic filler routine for setting an entry of a matrix to a particular value */
+    template <typename MatrixType, typename SCALARTYPE>
+    void fill(MatrixType & matrix, vcl_size_t row_index, vcl_size_t col_index, SCALARTYPE value)
+    {
+      matrix(row_index, col_index) = value;
+    }
+
+    #ifdef VIENNACL_WITH_EIGEN
+    /** @brief Generic filler routine for setting an entry of a matrix to a particular value. Special case for Eigen sparse matrices. */
+    template <typename T, int options, typename SCALARTYPE>
+    inline void fill(Eigen::SparseMatrix<T, options> & m,
+                     vcl_size_t row_index,
+                     vcl_size_t col_index,
+                     SCALARTYPE value
+                    )
+    {
+      m.insert(row_index, col_index) = value;
+    }
+    #endif
+
+
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/handle.hpp b/viennacl/traits/handle.hpp
index a5ea9b3..fa196cc 100644
--- a/viennacl/traits/handle.hpp
+++ b/viennacl/traits/handle.hpp
@@ -1,75 +1,245 @@
-#ifndef VIENNACL_TRAITS_HANDLE_HPP_
-#define VIENNACL_TRAITS_HANDLE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file traits/handle.hpp
-    @brief Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-namespace viennacl
-{
-  namespace traits
-  {
-    
-    // Returns the OpenCL handle of a ViennaCL object
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(T & obj)
-    {
-      return obj.handle();
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_range<T> & obj)
-    {
-      return handle(obj.get());
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::vector_range<T> const & obj)
-    {
-      return handle(obj.get());
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_range<T> & obj)
-    {
-      return handle(obj.get());
-    }
-
-    template <typename T>
-    viennacl::ocl::handle<cl_mem> handle(viennacl::matrix_range<T> const & obj)
-    {
-      return handle(obj.get());
-    }
-
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_TRAITS_HANDLE_HPP_
+#define VIENNACL_TRAITS_HANDLE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/handle.hpp
+    @brief Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc.
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+
+#include "viennacl/backend/mem_handle.hpp"
+
+namespace viennacl
+{
+  namespace traits
+  {
+    //
+    // Generic memory handle
+    //
+    /** @brief Returns the generic memory handle of an object. Non-const version. */
+    template <typename T>
+    viennacl::backend::mem_handle & handle(T & obj)
+    {
+      return obj.handle();
+    }
+
+    /** @brief Returns the generic memory handle of an object. Const-version. */
+    template <typename T>
+    viennacl::backend::mem_handle const & handle(T const & obj)
+    {
+      return obj.handle();
+    }
+
+    /** \cond */
+    inline char   handle(char val)   { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline short  handle(short val)  { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline int    handle(int val)    { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline long   handle(long val)   { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline float  handle(float val)  { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline double handle(double val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::backend::mem_handle       & handle(viennacl::scalar_expression< const LHS, const RHS, OP> & obj)
+    {
+      return handle(obj.lhs());
+    }
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::backend::mem_handle const & handle(viennacl::matrix_expression<LHS, RHS, OP> const & obj);
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::backend::mem_handle const & handle(viennacl::vector_expression<LHS, RHS, OP> const & obj);
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::backend::mem_handle const & handle(viennacl::scalar_expression< const LHS, const RHS, OP> const & obj)
+    {
+      return handle(obj.lhs());
+    }
+
+    // proxy objects require extra care (at the moment)
+    template <typename T>
+    viennacl::backend::mem_handle       & handle(viennacl::vector_base<T>       & obj)
+    {
+      return obj.handle();
+    }
+
+    template <typename T>
+    viennacl::backend::mem_handle const & handle(viennacl::vector_base<T> const & obj)
+    {
+      return obj.handle();
+    }
+
+
+
+    template <typename T>
+    viennacl::backend::mem_handle       & handle(viennacl::matrix_range<T>       & obj)
+    {
+      return obj.get().handle();
+    }
+
+    template <typename T>
+    viennacl::backend::mem_handle const & handle(viennacl::matrix_range<T> const & obj)
+    {
+      return obj.get().handle();
+    }
+
+
+    template <typename T>
+    viennacl::backend::mem_handle       & handle(viennacl::matrix_slice<T>      & obj)
+    {
+      return obj.get().handle();
+    }
+
+    template <typename T>
+    viennacl::backend::mem_handle const & handle(viennacl::matrix_slice<T> const & obj)
+    {
+      return obj.get().handle();
+    }
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::backend::mem_handle const & handle(viennacl::vector_expression<LHS, RHS, OP> const & obj)
+    {
+      return handle(obj.lhs());
+    }
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::backend::mem_handle const & handle(viennacl::matrix_expression<LHS, RHS, OP> const & obj)
+    {
+      return handle(obj.lhs());
+    }
+
+    /** \endcond */
+
+    //
+    // RAM handle extraction
+    //
+    /** @brief Generic helper routine for extracting the RAM handle of a ViennaCL object. Non-const version. */
+    template <typename T>
+    typename viennacl::backend::mem_handle::ram_handle_type & ram_handle(T & obj)
+    {
+      return viennacl::traits::handle(obj).ram_handle();
+    }
+
+    /** @brief Generic helper routine for extracting the RAM handle of a ViennaCL object. Const version. */
+    template <typename T>
+    typename viennacl::backend::mem_handle::ram_handle_type const & ram_handle(T const & obj)
+    {
+      return viennacl::traits::handle(obj).ram_handle();
+    }
+
+    /** \cond */
+    inline viennacl::backend::mem_handle::ram_handle_type & ram_handle(viennacl::backend::mem_handle & h)
+    {
+      return h.ram_handle();
+    }
+
+    inline viennacl::backend::mem_handle::ram_handle_type const & ram_handle(viennacl::backend::mem_handle const & h)
+    {
+      return h.ram_handle();
+    }
+    /** \endcond */
+
+    //
+    // OpenCL handle extraction
+    //
+#ifdef VIENNACL_WITH_OPENCL
+    /** @brief Generic helper routine for extracting the OpenCL handle of a ViennaCL object. Non-const version. */
+    template <typename T>
+    viennacl::ocl::handle<cl_mem> & opencl_handle(T & obj)
+    {
+      return viennacl::traits::handle(obj).opencl_handle();
+    }
+
+    /** @brief Generic helper routine for extracting the OpenCL handle of a ViennaCL object. Const version. */
+    template <typename T>
+    viennacl::ocl::handle<cl_mem> const & opencl_handle(T const & obj)
+    {
+      return viennacl::traits::handle(obj).opencl_handle();
+    }
+
+    inline cl_char   opencl_handle(char            val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_short  opencl_handle(short           val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_int    opencl_handle(int             val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_long   opencl_handle(long            val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_uchar  opencl_handle(unsigned char   val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_ushort opencl_handle(unsigned short  val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_uint   opencl_handle(unsigned int    val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline cl_ulong  opencl_handle(unsigned long   val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline float     opencl_handle(float           val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+    inline double    opencl_handle(double          val) { return val; }  //for unification purposes when passing CPU-scalars to kernels
+
+
+#endif
+
+
+
+    //
+    // Active handle ID
+    //
+    /** @brief Returns an ID for the currently active memory domain of an object */
+    template <typename T>
+    viennacl::memory_types active_handle_id(T const & obj)
+    {
+      return handle(obj).get_active_handle_id();
+    }
+
+    /** \cond */
+    template <typename T>
+    viennacl::memory_types active_handle_id(circulant_matrix<T> const &) { return OPENCL_MEMORY; }
+
+    template <typename T>
+    viennacl::memory_types active_handle_id(hankel_matrix<T> const &) { return OPENCL_MEMORY; }
+
+    template <typename T>
+    viennacl::memory_types active_handle_id(toeplitz_matrix<T> const &) { return OPENCL_MEMORY; }
+
+    template <typename T>
+    viennacl::memory_types active_handle_id(vandermonde_matrix<T> const &) { return OPENCL_MEMORY; }
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::memory_types active_handle_id(viennacl::vector_expression<LHS, RHS, OP> const &);
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::memory_types active_handle_id(viennacl::scalar_expression<LHS, RHS, OP> const & obj)
+    {
+      return active_handle_id(obj.lhs());
+    }
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::memory_types active_handle_id(viennacl::vector_expression<LHS, RHS, OP> const & obj)
+    {
+      return active_handle_id(obj.lhs());
+    }
+
+    template <typename LHS, typename RHS, typename OP>
+    viennacl::memory_types active_handle_id(viennacl::matrix_expression<LHS, RHS, OP> const & obj)
+    {
+      return active_handle_id(obj.lhs());
+    }
+    /** \endcond */
+
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/size.hpp b/viennacl/traits/size.hpp
index 59b3f1f..4c8bd08 100644
--- a/viennacl/traits/size.hpp
+++ b/viennacl/traits/size.hpp
@@ -1,225 +1,320 @@
-#ifndef VIENNACL_TRAITS_SIZE_HPP_
-#define VIENNACL_TRAITS_SIZE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file size.hpp
-    @brief Generic size and resize functionality for different vector and matrix types
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-#include "viennacl/meta/result_of.hpp"
-
-#ifdef VIENNACL_HAVE_UBLAS  
-#include <boost/numeric/ublas/matrix_sparse.hpp>
-#include <boost/numeric/ublas/matrix.hpp>
-#endif
-
-#ifdef VIENNACL_HAVE_EIGEN  
-#include <Eigen/Core>
-#include <Eigen/Sparse>
-#endif
-
-#ifdef VIENNACL_HAVE_MTL4
-#include <boost/numeric/mtl/mtl.hpp>
-#endif
-
-#include <vector>
-#include <map>
-
-namespace viennacl
-{
-
-  namespace traits
-  {
-    //
-    // Resize: Change the size of vectors and matrices
-    //
-    template <typename MatrixType>
-    void resize(MatrixType & matrix, size_t rows, size_t cols)
-    {
-      matrix.resize(rows, cols); 
-    }
-    
-    template <typename VectorType>
-    void resize(VectorType & vec, size_t new_size)
-    {
-      vec.resize(new_size); 
-    }
-    
-    #ifdef VIENNACL_HAVE_UBLAS  
-    //ublas needs separate treatment:
-    template <typename ScalarType>
-    void resize(boost::numeric::ublas::compressed_matrix<ScalarType> & matrix,
-                size_t rows,
-                size_t cols)
-    {
-      matrix.resize(rows, cols, false); //Note: omitting third parameter leads to compile time error (not implemented in ublas <= 1.42) 
-    }
-    #endif  
-    
-    
-    #ifdef VIENNACL_HAVE_MTL4
-    template <typename ScalarType>
-    void resize(mtl::compressed2D<ScalarType> & matrix,
-                size_t rows,
-                size_t cols)
-    {
-      matrix.change_dim(rows, cols);
-    }
-    
-    template <typename ScalarType>
-    void resize(mtl::dense_vector<ScalarType> & vec,
-                size_t new_size)
-    {
-      vec.change_dim(new_size);
-    }
-    #endif      
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline void resize(Eigen::MatrixXf & m,
-                       std::size_t new_rows,
-                       std::size_t new_cols)
-    {
-      m.resize(new_rows, new_cols);
-    }
-    
-    inline void resize(Eigen::MatrixXd & m,
-                       std::size_t new_rows,
-                       std::size_t new_cols)
-    {
-      m.resize(new_rows, new_cols);
-    }
-    
-    template <typename T, int options>
-    inline void resize(Eigen::SparseMatrix<T, options> & m,
-                       std::size_t new_rows,
-                       std::size_t new_cols)
-    {
-      m.resize(new_rows, new_cols);
-    }    
-    
-    inline void resize(Eigen::VectorXf & v,
-                       std::size_t new_size)
-    {
-      v.resize(new_size);
-    }
-    
-    inline void resize(Eigen::VectorXd & v,
-                       std::size_t new_size)
-    {
-      v.resize(new_size);
-    }
-    #endif
-
-
-    //
-    // size: Returns the length of vectors
-    //
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type size(VectorType const & vec)
-    {
-      return vec.size(); 
-    }
-
-    #ifdef VIENNACL_HAVE_MTL4
-    template <typename ScalarType>
-    typename result_of::size_type< mtl::dense_vector<ScalarType> >::type
-    size(mtl::dense_vector<ScalarType> const & vec) { return vec.used_memory(); }
-    #endif
-    
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline std::size_t size(Eigen::VectorXf const & v) { return v.rows(); }
-    inline std::size_t size(Eigen::VectorXd const & v) { return v.rows(); }
-    #endif
-
-    //
-    // size1: No. of rows for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    size1(MatrixType const & mat) { return mat.size1(); }
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline std::size_t size1(Eigen::MatrixXf const & m) { return m.rows(); }
-    inline std::size_t size1(Eigen::MatrixXd const & m) { return m.rows(); }
-    template <typename T, int options>
-    inline std::size_t size1(Eigen::SparseMatrix<T, options> & m) { return m.rows(); }    
-    #endif
-
-    //
-    // size2: No. of columns for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    size2(MatrixType const & mat) { return mat.size2(); }
- 
-    #ifdef VIENNACL_HAVE_EIGEN
-    inline std::size_t size2(Eigen::MatrixXf const & m) { return m.cols(); }
-    inline std::size_t size2(Eigen::MatrixXd const & m) { return m.cols(); }
-    template <typename T, int options>
-    inline std::size_t size2(Eigen::SparseMatrix<T, options> & m) { return m.cols(); }    
-    #endif
- 
-    //
-    // internal_size: Returns the internal (padded) length of vectors
-    //
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type 
-    internal_size(VectorType const & vec)
-    {
-      return vec.internal_size(); 
-    }
-
-    template <typename VectorType>
-    typename result_of::size_type<VectorType>::type 
-    internal_size(viennacl::vector_range<VectorType> const & vec)
-    {
-      return vec.get().internal_size(); 
-    }
-
-    //
-    // internal_size1: No. of internal (padded) rows for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size1(MatrixType const & mat) { return mat.internal_size1(); }
-
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size1(viennacl::matrix_range<MatrixType> const & mat) { return mat.get().internal_size1(); }
-
-    //
-    // internal_size2: No. of internal (padded) columns for matrices
-    //
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size2(MatrixType const & mat) { return mat.internal_size2(); }
- 
-    template <typename MatrixType>
-    typename result_of::size_type<MatrixType>::type
-    internal_size2(viennacl::matrix_range<MatrixType> const & mat) { return mat.get().internal_size2(); }
-
- 
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_TRAITS_SIZE_HPP_
+#define VIENNACL_TRAITS_SIZE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/size.hpp
+    @brief Generic size and resize functionality for different vector and matrix types
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/predicate.hpp"
+
+#ifdef VIENNACL_WITH_UBLAS
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+#include <Eigen/Core>
+#include <Eigen/Sparse>
+#endif
+
+#ifdef VIENNACL_WITH_MTL4
+#include <boost/numeric/mtl/mtl.hpp>
+#endif
+
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+
+  namespace traits
+  {
+    //
+    // Resize: Change the size of vectors and matrices
+    //
+    /** @brief Generic resize routine for resizing a matrix (ViennaCL, uBLAS, etc.) to a new size/dimension */
+    template <typename MatrixType>
+    void resize(MatrixType & matrix, vcl_size_t rows, vcl_size_t cols)
+    {
+      matrix.resize(rows, cols);
+    }
+
+    /** @brief Generic resize routine for resizing a vector (ViennaCL, uBLAS, etc.) to a new size */
+    template <typename VectorType>
+    void resize(VectorType & vec, vcl_size_t new_size)
+    {
+      vec.resize(new_size);
+    }
+
+    /** \cond */
+    #ifdef VIENNACL_WITH_UBLAS
+    //ublas needs separate treatment:
+    template <typename ScalarType>
+    void resize(boost::numeric::ublas::compressed_matrix<ScalarType> & matrix,
+                vcl_size_t rows,
+                vcl_size_t cols)
+    {
+      matrix.resize(rows, cols, false); //Note: omitting third parameter leads to compile time error (not implemented in ublas <= 1.42)
+    }
+    #endif
+
+
+    #ifdef VIENNACL_WITH_MTL4
+    template <typename ScalarType>
+    void resize(mtl::compressed2D<ScalarType> & matrix,
+                vcl_size_t rows,
+                vcl_size_t cols)
+    {
+      matrix.change_dim(rows, cols);
+    }
+
+    template <typename ScalarType>
+    void resize(mtl::dense_vector<ScalarType> & vec,
+                vcl_size_t new_size)
+    {
+      vec.change_dim(new_size);
+    }
+    #endif
+
+    #ifdef VIENNACL_WITH_EIGEN
+    inline void resize(Eigen::MatrixXf & m,
+                       vcl_size_t new_rows,
+                       vcl_size_t new_cols)
+    {
+      m.resize(new_rows, new_cols);
+    }
+
+    inline void resize(Eigen::MatrixXd & m,
+                       vcl_size_t new_rows,
+                       vcl_size_t new_cols)
+    {
+      m.resize(new_rows, new_cols);
+    }
+
+    template <typename T, int options>
+    inline void resize(Eigen::SparseMatrix<T, options> & m,
+                       vcl_size_t new_rows,
+                       vcl_size_t new_cols)
+    {
+      m.resize(new_rows, new_cols);
+    }
+
+    inline void resize(Eigen::VectorXf & v,
+                       vcl_size_t new_size)
+    {
+      v.resize(new_size);
+    }
+
+    inline void resize(Eigen::VectorXd & v,
+                       vcl_size_t new_size)
+    {
+      v.resize(new_size);
+    }
+    #endif
+    /** \endcond */
+
+
+    //
+    // size: Returns the length of vectors
+    //
+    /** @brief Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) */
+    template <typename VectorType>
+    vcl_size_t size(VectorType const & vec)
+    {
+      return vec.size();
+    }
+
+    /** \cond */
+    template <typename SparseMatrixType, typename VectorType>
+    typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+                                  vcl_size_t >::type
+    size(vector_expression<const SparseMatrixType, const VectorType, op_prod> const & proxy)
+    {
+      return proxy.lhs().size1();
+    }
+
+    template <typename T, unsigned int A, typename VectorType>
+    vcl_size_t size(vector_expression<const circulant_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+
+    template <typename T, unsigned int A, typename VectorType>
+    vcl_size_t size(vector_expression<const hankel_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+
+    template <typename T, unsigned int A, typename VectorType>
+    vcl_size_t size(vector_expression<const toeplitz_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+
+    template <typename T, unsigned int A, typename VectorType>
+    vcl_size_t size(vector_expression<const vandermonde_matrix<T, A>, const VectorType, op_prod> const & proxy) { return proxy.lhs().size1();  }
+
+    template <typename NumericT, typename F>
+    vcl_size_t size(vector_expression<const matrix_base<NumericT, F>, const vector_base<NumericT>, op_prod> const & proxy)  //matrix-vector product
+    {
+      return proxy.lhs().size1();
+    }
+
+    template <typename NumericT, typename F>
+    vcl_size_t size(vector_expression<const matrix_expression<const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans>,
+                                      const vector_base<NumericT>,
+                                      op_prod> const & proxy)  //transposed matrix-vector product
+    {
+      return proxy.lhs().lhs().size2();
+    }
+
+
+    #ifdef VIENNACL_WITH_MTL4
+    template <typename ScalarType>
+    vcl_size_t size(mtl::dense_vector<ScalarType> const & vec) { return vec.used_memory(); }
+    #endif
+
+    #ifdef VIENNACL_WITH_EIGEN
+    inline vcl_size_t size(Eigen::VectorXf const & v) { return v.rows(); }
+    inline vcl_size_t size(Eigen::VectorXd const & v) { return v.rows(); }
+    #endif
+
+    template <typename LHS, typename RHS, typename OP>
+    vcl_size_t size(vector_expression<LHS, RHS, OP> const & proxy)
+    {
+      return size(proxy.lhs());
+    }
+
+    template <typename LHS, typename RHS>
+    vcl_size_t size(vector_expression<LHS, const vector_tuple<RHS>, op_inner_prod> const & proxy)
+    {
+      return proxy.rhs().const_size();
+    }
+
+    /** \endcond */
+
+
+    //
+    // size1: No. of rows for matrices
+    //
+    /** @brief Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.) */
+    template <typename MatrixType>
+    vcl_size_t
+    size1(MatrixType const & mat) { return mat.size1(); }
+
+    /** \cond */
+    template <typename RowType>
+    vcl_size_t
+    size1(std::vector< RowType > const & mat) { return mat.size(); }
+
+    #ifdef VIENNACL_WITH_EIGEN
+    inline vcl_size_t size1(Eigen::MatrixXf const & m) { return static_cast<vcl_size_t>(m.rows()); }
+    inline vcl_size_t size1(Eigen::MatrixXd const & m) { return static_cast<vcl_size_t>(m.rows()); }
+    template <typename T, int options>
+    inline vcl_size_t size1(Eigen::SparseMatrix<T, options> & m) { return static_cast<vcl_size_t>(m.rows()); }
+    #endif
+
+#ifdef VIENNACL_WITH_MTL4
+    template <typename SCALARTYPE, typename T>
+    vcl_size_t size1(mtl::dense2D<SCALARTYPE, T> const & m) { return static_cast<vcl_size_t>(m.num_rows()); }
+    template <typename SCALARTYPE>
+    vcl_size_t size1(mtl::compressed2D<SCALARTYPE> const & m) { return static_cast<vcl_size_t>(m.num_rows()); }
+#endif
+
+    /** \endcond */
+
+    //
+    // size2: No. of columns for matrices
+    //
+    /** @brief Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.) */
+    template <typename MatrixType>
+    typename result_of::size_type<MatrixType>::type
+    size2(MatrixType const & mat) { return mat.size2(); }
+
+    /** \cond */
+    #ifdef VIENNACL_WITH_EIGEN
+    inline vcl_size_t size2(Eigen::MatrixXf const & m) { return m.cols(); }
+    inline vcl_size_t size2(Eigen::MatrixXd const & m) { return m.cols(); }
+    template <typename T, int options>
+    inline vcl_size_t size2(Eigen::SparseMatrix<T, options> & m) { return m.cols(); }
+    #endif
+
+#ifdef VIENNACL_WITH_MTL4
+    template <typename SCALARTYPE, typename T>
+    vcl_size_t size2(mtl::dense2D<SCALARTYPE, T> const & m) { return static_cast<vcl_size_t>(m.num_cols()); }
+    template <typename SCALARTYPE>
+    vcl_size_t size2(mtl::compressed2D<SCALARTYPE> const & m) { return static_cast<vcl_size_t>(m.num_cols()); }
+#endif
+    /** \endcond */
+
+    //
+    // internal_size: Returns the internal (padded) length of vectors
+    //
+    /** @brief Helper routine for obtaining the buffer length of a ViennaCL vector  */
+    template <typename NumericT>
+    vcl_size_t internal_size(vector_base<NumericT> const & vec)
+    {
+      return vec.internal_size();
+    }
+
+
+    //
+    // internal_size1: No. of internal (padded) rows for matrices
+    //
+    /** @brief Helper routine for obtaining the internal (padded) number of rows of a ViennaCL matrix  */
+    template <typename NumericT, typename F>
+    vcl_size_t internal_size1(matrix_base<NumericT, F> const & mat) { return mat.internal_size1(); }
+
+
+    //
+    // internal_size2: No. of internal (padded) columns for matrices
+    //
+    /** @brief Helper routine for obtaining the internal (padded) number of columns of a ViennaCL matrix  */
+    template <typename NumericT, typename F>
+    vcl_size_t internal_size2(matrix_base<NumericT, F> const & mat) { return mat.internal_size2(); }
+
+
+    template <typename LHS>
+    vcl_size_t size(vector_expression<LHS, const int, op_matrix_diag> const & proxy)
+    {
+      int k = proxy.rhs();
+      int A_size1 = static_cast<int>(size1(proxy.lhs()));
+      int A_size2 = static_cast<int>(size2(proxy.lhs()));
+
+      int row_depth = std::min(A_size1, A_size1 + k);
+      int col_depth = std::min(A_size2, A_size2 - k);
+
+      return std::min(row_depth, col_depth);
+    }
+
+    template <typename LHS>
+    vcl_size_t size(vector_expression<LHS, const unsigned int, op_row> const & proxy)
+    {
+      return size2(proxy.lhs());
+    }
+
+    template <typename LHS>
+    vcl_size_t size(vector_expression<LHS, const unsigned int, op_column> const & proxy)
+    {
+      return size1(proxy.lhs());
+    }
+
+
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/start.hpp b/viennacl/traits/start.hpp
index f2364fb..168f596 100644
--- a/viennacl/traits/start.hpp
+++ b/viennacl/traits/start.hpp
@@ -1,97 +1,101 @@
-#ifndef VIENNACL_TRAITS_START_HPP_
-#define VIENNACL_TRAITS_START_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file start.hpp
-    @brief Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc.
-*/
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include "viennacl/forwards.h"
-
-namespace viennacl
-{
-  namespace traits
-  {
-    //
-    // start: Mostly for vectors
-    //
-    
-    // Default: Try to get the start index from the .start() member function
-    template <typename T>
-    typename result_of::size_type<T>::type
-    start(T const & obj)
-    {
-      return obj.start();
-    }
-    
-    //ViennaCL vector leads to start index 0:
-    template <typename ScalarType, unsigned int ALIGNMENT>
-    typename result_of::size_type<viennacl::vector<ScalarType, ALIGNMENT> >::type
-    start(viennacl::vector<ScalarType, ALIGNMENT> const & v)
-    {
-      return 0; 
-    }
-
-    //
-    // start1: Row start index
-    //
-    
-    // Default: Try to get the start index from the .start1() member function
-    template <typename T>
-    typename result_of::size_type<T>::type
-    start1(T const & obj)
-    {
-      return obj.start1();
-    }
-
-    //ViennaCL matrix leads to start index 0:
-    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
-    start1(viennacl::matrix<ScalarType, F, ALIGNMENT> const & v)
-    {
-      return 0; 
-    }
-
-
-    //
-    // start2: Column start index
-    //
-    template <typename T>
-    typename result_of::size_type<T>::type
-    start2(T const & obj)
-    {
-      return obj.start2();
-    }
-
-    //ViennaCL matrix leads to start index 0:
-    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
-    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
-    start2(viennacl::matrix<ScalarType, F, ALIGNMENT> const & v)
-    {
-      return 0; 
-    }
-    
-
-  } //namespace traits
-} //namespace viennacl
-    
-
-#endif
+#ifndef VIENNACL_TRAITS_START_HPP_
+#define VIENNACL_TRAITS_START_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/start.hpp
+    @brief Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc.
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+  namespace traits
+  {
+    //
+    // start: Mostly for vectors
+    //
+
+    // Default: Try to get the start index from the .start() member function
+    template <typename T>
+    typename result_of::size_type<T>::type
+    start(T const & obj)
+    {
+      return obj.start();
+    }
+
+    //ViennaCL vector leads to start index 0:
+    template <typename ScalarType, unsigned int ALIGNMENT>
+    typename result_of::size_type<viennacl::vector<ScalarType, ALIGNMENT> >::type
+    start(viennacl::vector<ScalarType, ALIGNMENT> const &)
+    {
+      return 0;
+    }
+
+
+    //
+    // start1: Row start index
+    //
+
+    // Default: Try to get the start index from the .start1() member function
+    template <typename T>
+    typename result_of::size_type<T>::type
+    start1(T const & obj)
+    {
+      return obj.start1();
+    }
+
+    //ViennaCL matrix leads to start index 0:
+    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
+    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
+    start1(viennacl::matrix<ScalarType, F, ALIGNMENT> const &)
+    {
+      return 0;
+    }
+
+
+    //
+    // start2: Column start index
+    //
+    template <typename T>
+    typename result_of::size_type<T>::type
+    start2(T const & obj)
+    {
+      return obj.start2();
+    }
+
+    //ViennaCL matrix leads to start index 0:
+    template <typename ScalarType, typename F, unsigned int ALIGNMENT>
+    typename result_of::size_type<viennacl::matrix<ScalarType, F, ALIGNMENT> >::type
+    start2(viennacl::matrix<ScalarType, F, ALIGNMENT> const &)
+    {
+      return 0;
+    }
+
+
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/traits/stride.hpp b/viennacl/traits/stride.hpp
new file mode 100644
index 0000000..1b37507
--- /dev/null
+++ b/viennacl/traits/stride.hpp
@@ -0,0 +1,75 @@
+#ifndef VIENNACL_TRAITS_STRIDE_HPP_
+#define VIENNACL_TRAITS_STRIDE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/traits/stride.hpp
+    @brief Determines row and column increments for matrices and matrix proxies
+*/
+
+#include <string>
+#include <fstream>
+#include <sstream>
+#include "viennacl/forwards.h"
+#include "viennacl/meta/result_of.hpp"
+
+
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+
+  namespace traits
+  {
+
+    //
+    // inc: Increment for vectors. Defaults to 1
+    //
+    template <typename T>
+    typename result_of::size_type< viennacl::vector_base<T> >::type
+    stride(viennacl::vector_base<T> const & s) { return s.stride(); }
+
+    //
+    // inc1: Row increment for matrices. Defaults to 1
+    //
+    //template <typename MatrixType>
+    //typename result_of::size_type<MatrixType>::type
+    //stride1(MatrixType const &) { return 1; }
+
+    template <typename NumericT, typename F>
+    typename result_of::size_type< matrix_base<NumericT, F> >::type
+    stride1(matrix_base<NumericT, F> const & s) { return s.stride1(); }
+
+    //
+    // inc2: Column increment for matrices. Defaults to 1
+    //
+    //template <typename MatrixType>
+    //typename result_of::size_type<MatrixType>::type
+    //stride2(MatrixType const &) { return 1; }
+
+    template <typename NumericT, typename F>
+    typename result_of::size_type< matrix_base<NumericT, F> >::type
+    stride2(matrix_base<NumericT, F> const & s) { return s.stride2(); }
+
+
+  } //namespace traits
+} //namespace viennacl
+
+
+#endif
diff --git a/viennacl/vandermonde_matrix.hpp b/viennacl/vandermonde_matrix.hpp
index d97929b..015a321 100644
--- a/viennacl/vandermonde_matrix.hpp
+++ b/viennacl/vandermonde_matrix.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_VANDERMONDE_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -20,12 +21,12 @@
 #include <cmath>
 
 /** @file vandermonde_matrix.hpp
-    @brief Implementation of the vandermonde_matrix class for efficient manipulation of Vandermonde matrices.  Experimental in 1.2.x.
+    @brief Implementation of the vandermonde_matrix class for efficient manipulation of Vandermonde matrices.  Experimental.
 */
 
 #include "viennacl/forwards.h"
 #include "viennacl/vector.hpp"
-#include "viennacl/ocl/context.hpp"
+#include "viennacl/ocl/backend.hpp"
 
 #include "viennacl/fft.hpp"
 
@@ -41,14 +42,14 @@ namespace viennacl {
     class vandermonde_matrix
     {
       public:
+        typedef viennacl::backend::mem_handle                                                              handle_type;
+        typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
+
         /**
          * @brief The default constructor. Does not allocate any memory.
          *
          */
-        explicit vandermonde_matrix()
-        {
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
-        }
+        explicit vandermonde_matrix() {}
 
         /**
          * @brief         Creates the matrix with the given size
@@ -56,10 +57,10 @@ namespace viennacl {
          * @param rows      Number of rows of the matrix
          * @param cols      Number of columns of the matrix
          */
-        explicit vandermonde_matrix(std::size_t rows, std::size_t cols) : elements_(rows)
+        explicit vandermonde_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows)
         {
-          assert(rows == cols && "Vandermonde matrix must be square in this release!");
-          viennacl::linalg::kernels::fft<SCALARTYPE, 1>::init();
+          assert(rows == cols && bool("Vandermonde matrix must be square in this release!"));
+          (void)cols;  // avoid 'unused parameter' warning in optimized builds
         }
 
         /** @brief Resizes the matrix.
@@ -68,7 +69,7 @@ namespace viennacl {
         * @param sz         New size of matrix
         * @param preserve   If true, existing values are preserved.
         */
-        void resize(std::size_t sz, bool preserve = true) {
+        void resize(vcl_size_t sz, bool preserve = true) {
             elements_.resize(sz, preserve);
         }
 
@@ -76,7 +77,7 @@ namespace viennacl {
         *
         *   @return OpenCL handle
         */
-        viennacl::ocl::handle<cl_mem> handle() const { return elements_.handle(); }
+        handle_type const & handle() const { return elements_.handle(); }
 
         /**
          * @brief Returns an internal viennacl::vector, which represents a Vandermonde matrix elements
@@ -88,19 +89,19 @@ namespace viennacl {
         /**
          * @brief Returns the number of rows of the matrix
          */
-        std::size_t size1() const { return elements_.size(); }
-        
+        vcl_size_t size1() const { return elements_.size(); }
+
         /**
          * @brief Returns the number of columns of the matrix
          */
-        std::size_t size2() const { return elements_.size(); }
+        vcl_size_t size2() const { return elements_.size(); }
 
         /** @brief Returns the internal size of matrix representtion.
         *   Usually required for launching OpenCL kernels only
         *
         *   @return Internal size of matrix representation
         */
-        std::size_t internal_size() const { return elements_.internal_size(); }
+        vcl_size_t internal_size() const { return elements_.internal_size(); }
 
         /**
          * @brief Read-write access to a base element of the matrix
@@ -108,7 +109,7 @@ namespace viennacl {
          * @param row_index  Row index of accessed element
          * @return Proxy for matrix entry
          */
-        entry_proxy<SCALARTYPE> operator()(std::size_t row_index)
+        entry_proxy<SCALARTYPE> operator()(vcl_size_t row_index)
         {
             return elements_[row_index];
         }
@@ -120,17 +121,17 @@ namespace viennacl {
          * @param col_index  Column index of accessed element
          * @return Proxy for matrix entry
          */
-        SCALARTYPE operator()(std::size_t row_index, std::size_t col_index) const
+        SCALARTYPE operator()(vcl_size_t row_index, vcl_size_t col_index) const
         {
-            assert(row_index < size1() && col_index < size2() && "Invalid access");
-            
+            assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
             return pow(elements_[row_index], static_cast<int>(col_index));
         }
 
     private:
-        vandermonde_matrix(vandermonde_matrix const & t) {}
-        vandermonde_matrix & operator=(vandermonde_matrix const & t) {}
-        
+        vandermonde_matrix(vandermonde_matrix const &) {}
+        vandermonde_matrix & operator=(vandermonde_matrix const & t);
+
         viennacl::vector<SCALARTYPE, ALIGNMENT> elements_;
     };
 
@@ -143,7 +144,7 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(std::vector<SCALARTYPE>& cpu_vec, vandermonde_matrix<SCALARTYPE, ALIGNMENT>& gpu_mat)
     {
-        assert(cpu_vec.size() == gpu_mat.size1()  && "Size mismatch");
+        assert(cpu_vec.size() == gpu_mat.size1()  && bool("Size mismatch"));
         copy(cpu_vec, gpu_mat.elements());
     }
 
@@ -156,7 +157,7 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(vandermonde_matrix<SCALARTYPE, ALIGNMENT>& gpu_mat, std::vector<SCALARTYPE>& cpu_vec)
     {
-        assert(cpu_vec.size() == gpu_mat.size1() && "Size mismatch");
+        assert(cpu_vec.size() == gpu_mat.size1() && bool("Size mismatch"));
         copy(gpu_mat.elements(), cpu_vec);
     }
 
@@ -169,19 +170,20 @@ namespace viennacl {
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
     void copy(vandermonde_matrix<SCALARTYPE, ALIGNMENT>& vander_src, MATRIXTYPE& com_dst)
     {
-        std::size_t size = vander_src.size1();
-        assert(size == com_dst.size1() && "Size mismatch");
-        assert(size == com_dst.size2() && "Size mismatch");
+        assert(vander_src.size1() == viennacl::traits::size1(com_dst) && bool("Size mismatch"));
+        assert(vander_src.size2() == viennacl::traits::size2(com_dst) && bool("Size mismatch"));
+
+        vcl_size_t size = vander_src.size1();
         std::vector<SCALARTYPE> tmp(size);
         copy(vander_src, tmp);
 
-        for(std::size_t i = 0; i < size; i++) {
-            for(std::size_t j = 0; j < size; j++) {
-                com_dst(i, j) = pow(tmp[i], static_cast<int>(j));
+        for(vcl_size_t i = 0; i < size; i++) {
+            for(vcl_size_t j = 0; j < size; j++) {
+                com_dst(i, j) = std::pow(tmp[i], static_cast<int>(j));
             }
         }
     }
-    
+
     /** @brief Copies a the matrix-like object to the Vandermonde matrix from the OpenCL device (either GPU or multi-core CPU)
     *
     *
@@ -189,14 +191,15 @@ namespace viennacl {
     * @param vander_dst   A vandermonde_matrix from ViennaCL
     */
     template <typename SCALARTYPE, unsigned int ALIGNMENT, typename MATRIXTYPE>
-    void copy(MATRIXTYPE& com_src, vandermonde_matrix<SCALARTYPE, ALIGNMENT>& vander_dst) 
+    void copy(MATRIXTYPE& com_src, vandermonde_matrix<SCALARTYPE, ALIGNMENT>& vander_dst)
     {
-        std::size_t size = vander_dst.size1();
-        assert(size == com_src.size1() && "Size mismatch");
-        assert(size == com_src.size2() && "Size mismatch");
+        assert( (vander_dst.size1() == 0 || vander_dst.size1() == viennacl::traits::size1(com_src)) && bool("Size mismatch"));
+        assert( (vander_dst.size2() == 0 || vander_dst.size2() == viennacl::traits::size2(com_src)) && bool("Size mismatch"));
+
+        vcl_size_t size = vander_dst.size1();
         std::vector<SCALARTYPE> tmp(size);
 
-        for(std::size_t i = 0; i < size; i++)
+        for(vcl_size_t i = 0; i < size; i++)
             tmp[i] = com_src(i, 1);
 
         copy(tmp, vander_dst);
@@ -219,15 +222,15 @@ namespace viennacl {
     template<class SCALARTYPE, unsigned int ALIGNMENT>
     std::ostream & operator<<(std::ostream& s, vandermonde_matrix<SCALARTYPE, ALIGNMENT>& gpu_matrix)
     {
-        std::size_t size = gpu_matrix.size1();
+        vcl_size_t size = gpu_matrix.size1();
         std::vector<SCALARTYPE> tmp(size);
         copy(gpu_matrix, tmp);
         s << "[" << size << "," << size << "](\n";
 
-        for(std::size_t i = 0; i < size; i++) {
+        for(vcl_size_t i = 0; i < size; i++) {
             s << "(";
-            for(std::size_t j = 0; j < size; j++) {
-                s << pow(tmp[i], j);
+            for(vcl_size_t j = 0; j < size; j++) {
+                s << pow(tmp[i], static_cast<SCALARTYPE>(j));
                 if(j < (size - 1)) s << ",";
             }
             s << ")";
@@ -236,6 +239,99 @@ namespace viennacl {
         return s;
     }
 
+
+    //
+    // Specify available operations:
+    //
+
+    /** \cond */
+
+    namespace linalg
+    {
+      namespace detail
+      {
+        // x = A * y
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const vandermonde_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const vandermonde_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              // check for the special case x = A * x
+              if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+              {
+                viennacl::vector<T> temp(lhs);
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+                lhs = temp;
+              }
+              else
+                viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vandermonde_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const vandermonde_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs += temp;
+            }
+        };
+
+        template <typename T, unsigned int A>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vandermonde_matrix<T, A>, const vector_base<T>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const vandermonde_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+              lhs -= temp;
+            }
+        };
+
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_assign, vector_expression<const vandermonde_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const vandermonde_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vandermonde_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const vandermonde_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs += temp_result;
+            }
+        };
+
+        // x = A * vec_op
+        template <typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+        struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vandermonde_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+        {
+            static void apply(vector_base<T> & lhs, vector_expression<const vandermonde_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+            {
+              viennacl::vector<T> temp(rhs.rhs());
+              viennacl::vector<T> temp_result(lhs);
+              viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+              lhs -= temp_result;
+            }
+        };
+
+      } // namespace detail
+    } // namespace linalg
+
+    /** \endcond */
 }
 
-#endif // _VIENNACL_VANDERMONDE_MATRIX_HPP
+#endif // VIENNACL_VANDERMONDE_MATRIX_HPP
diff --git a/viennacl/vector.hpp b/viennacl/vector.hpp
index 8ae4981..23e4906 100644
--- a/viennacl/vector.hpp
+++ b/viennacl/vector.hpp
@@ -1,1748 +1,3240 @@
-#ifndef VIENNACL_VECTOR_HPP_
-#define VIENNACL_VECTOR_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file vector.hpp
-    @brief The vector type with operator-overloads and proxy classes is defined here. 
-           Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp
-*/
-
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/tools/entry_proxy.hpp"
-#include "viennacl/linalg/vector_operations.hpp"
-
-namespace viennacl
-{
-    
-    /** @brief An expression template class that represents a binary operation that yields a vector
-    *
-    * In contrast to full expression templates as introduced by Veldhuizen, ViennaCL does not allow nested expressions.
-    * The reason is that this requires automated GPU viennacl::ocl::kernel generation, which then has to be compiles just-in-time.
-    * For performance-critical applications, one better writes the appropriate viennacl::ocl::kernels by hand.
-    *
-    * Assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix = 2)
-    *
-    * @tparam LHS   left hand side operand
-    * @tparam RHS   right hand side operand
-    * @tparam OP    the operator
-    */
-    template <typename LHS, typename RHS, typename OP>
-    class vector_expression
-    {
-      public:
-        /** @brief Extracts the vector type from the two operands.
-        */
-        typedef typename viennacl::tools::VECTOR_EXTRACTOR<LHS, RHS>::ResultType    VectorType;
-      
-        vector_expression(LHS & lhs, RHS & rhs) : _lhs(lhs), _rhs(rhs) {}
-        
-        /** @brief Get left hand side operand
-        */
-        LHS & lhs() const { return _lhs; }
-        /** @brief Get right hand side operand
-        */
-        RHS & rhs() const { return _rhs; }
-        
-        /** @brief Returns the size of the result vector */
-        std::size_t size() const { return viennacl::tools::VECTOR_SIZE_DEDUCER<LHS, RHS, OP>::size(_lhs, _rhs); }
-        
-      private:
-        /** @brief The left hand side operand */
-        LHS & _lhs;
-        /** @brief The right hand side operand */
-        RHS & _rhs;
-    };
-    
-    /** @brief A STL-type const-iterator for vector elements. Elements can be accessed, but cannot be manipulated. VERY SLOW!!
-    *
-    * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20.000 dereferences take one second.
-    * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators is as fast as for CPU types.
-    * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
-    * std::vector<float> temp;
-    * copy(gpu_vector, temp);
-    * for (std::vector<float>::const_iterator iter = temp.begin();
-    *      iter != temp.end();
-    *      ++iter)
-    * {
-    *   //do something
-    * }
-    * Note that you may obtain inconsistent data if entries of gpu_vector are manipulated elsewhere in the meanwhile.
-    *
-    * @tparam SCALARTYPE  The underlying floating point type (either float or double)
-    * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class const_vector_iterator
-    {
-        typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>    self_type;
-      public:
-        typedef scalar<SCALARTYPE>            value_type;
-        typedef long                          difference_type;
-        
-        const_vector_iterator() {};
-        /** @brief Constructor
-        *   @param vec    The vector over which to iterate
-        *   @param index  The starting index of the iterator
-        */        
-        const_vector_iterator(vector<SCALARTYPE, ALIGNMENT> const & vec,      cl_uint index)  : elements_(vec.handle()), index_(index) {};
-        const_vector_iterator(viennacl::ocl::handle<cl_mem> const & elements, cl_uint index)  : elements_(elements), index_(index) {};
-
-        
-        value_type operator*(void) const 
-        { 
-           value_type result;
-           result = entry_proxy<SCALARTYPE>(index_, elements_);
-           return result;
-        }
-        self_type operator++(void) { ++index_; return *this; }
-        self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
-        
-        bool operator==(self_type const & other) const { return index_ == other.index_; }
-        bool operator!=(self_type const & other) const { return index_ != other.index_; }
-        
-//        self_type & operator=(self_type const & other)
-//        {
-//           _index = other._index;
-//           elements_ = other._elements;
-//           return *this;
-//        }   
-
-        difference_type operator-(self_type const & other) const { difference_type result = index_; return result - other.index_; }
-        self_type operator+(difference_type diff) const { return self_type(elements_, index_ + diff); }
-        
-        std::size_t index() const { return index_; }
-        viennacl::ocl::handle<cl_mem> const & handle() const { return elements_; }
-
-      protected:
-        /** @brief  The index of the entry the iterator is currently pointing to */
-        viennacl::ocl::handle<cl_mem> elements_;
-        std::size_t index_;
-    };
-    
-
-    /** @brief A STL-type iterator for vector elements. Elements can be accessed and manipulated. VERY SLOW!!
-    *
-    * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20.000 dereferences take one second.
-    * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators is as fast as for CPU types.
-    * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
-    * std::vector<float> temp;
-    * copy(gpu_vector, temp);
-    * for (std::vector<float>::const_iterator iter = temp.begin();
-    *      iter != temp.end();
-    *      ++iter)
-    * {
-    *   //do something
-    * }
-    * copy(temp, gpu_vector);
-    * Note that you may obtain inconsistent data if you manipulate entries of gpu_vector in the meanwhile.
-    *
-    * @tparam SCALARTYPE  The underlying floating point type (either float or double)
-    * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class vector_iterator : public const_vector_iterator<SCALARTYPE, ALIGNMENT>
-    {
-        typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>  base_type;
-        typedef vector_iterator<SCALARTYPE, ALIGNMENT>        self_type;
-      public:
-        vector_iterator() : base_type(){};
-        vector_iterator(viennacl::ocl::handle<cl_mem> const & elements, std::size_t index)  : base_type(elements, index) {};
-        /** @brief Constructor
-        *   @param vec    The vector over which to iterate
-        *   @param index  The starting index of the iterator
-        */        
-        vector_iterator(vector<SCALARTYPE, ALIGNMENT> & vec, cl_uint index) : base_type(vec, index) {};
-        vector_iterator(base_type const & b) : base_type(b) {};
-
-        typename base_type::value_type operator*(void)  
-        { 
-           typename base_type::value_type result;
-           result = entry_proxy<SCALARTYPE>(base_type::index_, base_type::elements_); 
-           return result;
-        }
-        
-        viennacl::ocl::handle<cl_mem> handle() { return base_type::elements_; }
-        
-        operator base_type() const
-        {
-          return base_type(base_type::elements_, base_type::index_);
-        }
-    };
-
-    // forward definition in VCLForwards.h!
-    /** @brief A vector class representing a linear memory sequence on the GPU. Inspired by boost::numeric::ublas::vector
-    *
-    *  This is the basic vector type of ViennaCL. It is similar to std::vector and boost::numeric::ublas::vector and supports various linear algebra operations.
-    * By default, the internal length of the vector is padded to a multiple of 'ALIGNMENT' in order to speed up several GPU viennacl::ocl::kernels.
-    *
-    * @tparam SCALARTYPE  The floating point type, either 'float' or 'double'
-    * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values or usually 4, 8 or 16, higher values are usually a waste of memory.
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    class vector
-    {
-      
-    public:
-      typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<SCALARTYPE>::ResultType>   value_type;
-      typedef vcl_size_t                                        size_type;
-      typedef vcl_ptrdiff_t                                     difference_type;
-      typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>      const_iterator;
-      typedef vector_iterator<SCALARTYPE, ALIGNMENT>            iterator;
-      
-      static const int alignment = ALIGNMENT;
-
-      /** @brief Default constructor in order to be compatible with various containers.
-      */
-      vector() : size_(0) { viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::init();  }
-
-      /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT')
-      *
-      * @param vec_size   The length (i.e. size) of the vector.
-      */
-      explicit vector(size_type vec_size) : size_(vec_size)
-      {
-        viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::init(); 
-        
-        if (size_ > 0)
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-        
-        //force entries above size_ to zero:
-        if (size_ < internal_size())
-        {
-          std::vector<SCALARTYPE> temp(internal_size() - size_);
-          cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), elements_.get(), CL_TRUE, sizeof(SCALARTYPE)*size_, sizeof(SCALARTYPE)*(internal_size() - size_), &(temp[0]), 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
-        }
-      }
-
-      /** @brief Create a vector from existing OpenCL memory
-      *
-      * Note: The provided memory must take an eventual ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
-      * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
-      *
-      * @param existing_mem   An OpenCL handle representing the memory
-      * @param vec_size       The size of the vector. 
-      */
-      explicit vector(cl_mem existing_mem, size_type vec_size) : size_(vec_size),  elements_(existing_mem)
-      {
-        elements_.inc();  //prevents that the user-provided memory is deleted once the vector object is destroyed.
-      }
-      
-      template <typename LHS, typename RHS, typename OP>
-      vector(vector_expression<LHS, RHS, OP> const & other) : size_(other.size())
-      {
-        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*other.size());
-        *this = other;
-      }
-      
-      /** @brief The copy constructor
-      *
-      * Entries of 'vec' are directly copied to this vector.
-      */
-      vector(const vector<SCALARTYPE, ALIGNMENT> & vec) :
-        size_(vec.size())
-      {
-        viennacl::linalg::kernels::vector<SCALARTYPE, 1>::init(); 
-        
-        if (size() != 0)
-        {
-          elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
-          cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-          //assert(err == CL_SUCCESS);
-          VIENNACL_ERR_CHECK(err);
-        }
-      }
-
-      /** @brief Assignment operator. This vector is resized if 'vec' is of a different size.
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector<SCALARTYPE, ALIGNMENT> & vec)
-      {
-        resize(vec.size());
-        if (size() != 0)
-        {
-          cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
-          VIENNACL_ERR_CHECK(err);
-        }
-        return *this;
-      }
-
-
-      /** @brief Implementation of the operation v1 = alpha * v2, where alpha is a GPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      vector<SCALARTYPE, ALIGNMENT> & operator = (const vector_expression< VectorType,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::mult(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Implementation of the operation v1 = alpha * v2, where alpha is a CPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      vector<SCALARTYPE, ALIGNMENT> & operator = (const vector_expression< VectorType,
-                                                                           const SCALARTYPE,
-                                                                           op_prod> & proxy)
-      {
-        resize(proxy.lhs().size());
-        viennacl::linalg::mult(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Implementation of the operation v1 = v2 / alpha, where alpha is a GPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      vector<SCALARTYPE, ALIGNMENT> & operator = (const vector_expression< VectorType,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_div> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::divide(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-
-      /** @brief Implementation of the operation v1 = v2 / alpha, where alpha is a CPU scalar
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      template <typename VectorType>   //use template to cover const/non-const of VectorType:
-      vector<SCALARTYPE, ALIGNMENT> & operator = (const vector_expression< VectorType,
-                                                                           const SCALARTYPE,
-                                                                           op_div> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::mult(proxy.lhs(), static_cast<SCALARTYPE>(1.0) / proxy.rhs(), *this);
-        return *this;
-      }
-
-      //v1 = v2 + v3; 
-      /** @brief Implementation of the operation v1 = v2 + v3
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator = (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                           vector<SCALARTYPE, ALIGNMENT>,
-                                                                           op_add> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      //v1 = v2 - v3; 
-      /** @brief Implementation of the operation v1 = v2 - v3
-      *
-      * @param proxy  An expression template proxy class.
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator = (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                           vector<SCALARTYPE, ALIGNMENT>,
-                                                                           op_sub> & proxy)
-      {
-        resize(proxy.lhs().size());
-        //std::cout << "vector::operator=(vec_times_scalar_proxy)" << std::endl; 
-        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      
-      ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
-
-      //Note: The following operator overloads are defined in matrix_operations.hpp, compressed_matrix_operations.hpp and coordinate_matrix_operations.hpp
-      //This is certainly not the nicest approach and will most likely by changed in the future, but it works :-)
-      
-      //matrix<>
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      //transposed_matrix_proxy:
-      /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                  const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                  op_trans >,
-                                                                         const vector<SCALARTYPE, ALIGNMENT>,
-                                                                         op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                   const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                   op_trans >,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                   const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                   op_trans >,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                op_trans >,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <typename F, unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                op_trans >,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-                                                                       
-                                                                       
-      //                                                                 
-      //////////// compressed_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type compressed_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      //
-      // coordinate_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type coordinate_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      //
-      // circulant_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const circulant_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-
-      //
-      // hankel_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const hankel_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      //
-      // toeplitz_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const toeplitz_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      
-      //
-      // vandermonde_matrix<>
-      //
-      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                const vector<SCALARTYPE, ALIGNMENT>,
-                                                op_prod> & proxy);
-
-      /** @brief Operator overload for v1 += A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator+=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-                                                
-      /** @brief Operator overload for v1 -= A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> & operator-=(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                          const vector<SCALARTYPE, ALIGNMENT>,
-                                                                          op_prod> & proxy);
-
-      /** @brief Operator overload for v1 + A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator+(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      /** @brief Operator overload for v1 - A * v2, where v1, v2 are vectors and A is a sparse matrix of type circulant_matrix.
-      *
-      * @param proxy An expression template proxy class
-      */
-      template <unsigned int MAT_ALIGNMENT>
-      vector<SCALARTYPE, ALIGNMENT> operator-(const vector_expression< const vandermonde_matrix<SCALARTYPE, MAT_ALIGNMENT>,
-                                                                       const vector<SCALARTYPE, ALIGNMENT>,
-                                                                       op_prod> & proxy);
-
-      
-      
-      ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
-
-      //enlarge or reduce allocated memory and set unused memory to zero
-      /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
-      *
-      *  @param new_size  The new size of the vector
-      *  @param preserve  If true, old entries of the vector are preserved, otherwise eventually discarded.
-      */
-      void resize(size_type new_size, bool preserve = true)
-      {
-        assert(new_size > 0);
-        
-        if (new_size != size_)
-        {
-          std::size_t new_internal_size = viennacl::tools::roundUpToNextMultiple<std::size_t>(new_size, ALIGNMENT);
-        
-          std::vector<SCALARTYPE> temp(size_);
-          if (preserve && size_ > 0)
-            fast_copy(*this, temp);
-          temp.resize(new_size);  //drop all entries above new_size
-          temp.resize(new_internal_size); //enlarge to fit new internal size
-          
-          if (new_internal_size != internal_size())
-          {
-            elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*new_internal_size);
-          }
-          
-          fast_copy(temp, *this);
-          size_ = new_size;
-        }
-        
-      }
-      
-
-      //read-write access to an element of the vector
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<SCALARTYPE> operator()(size_type index)
-      {
-        return entry_proxy<SCALARTYPE>(index, elements_);
-      }
-
-      /** @brief Read-write access to a single element of the vector
-      */
-      entry_proxy<SCALARTYPE> operator[](size_type index)
-      {
-        return entry_proxy<SCALARTYPE>(index, elements_);
-      }
-
-
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<SCALARTYPE> operator()(size_type index) const
-      {
-        scalar<SCALARTYPE> tmp;
-        cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), elements_, tmp.handle(), sizeof(SCALARTYPE)*index, 0, sizeof(SCALARTYPE), 0, NULL, NULL);
-        //assert(err == CL_SUCCESS);
-        VIENNACL_ERR_CHECK(err);
-        return tmp;
-      }
-      
-      /** @brief Read access to a single element of the vector
-      */
-      scalar<SCALARTYPE> operator[](size_type index) const
-      {
-        return operator()(index);
-      }
-      
-      /** @brief Inplace addition of a vector
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator += (const vector<SCALARTYPE, ALIGNMENT> & vec)
-      {
-        viennacl::linalg::inplace_add(*this, vec);
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator += (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator += (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator += (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const SCALARTYPE,
-                                                                           op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator += (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const SCALARTYPE,
-                                                                           op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace addition of a scaled vector, i.e. v1 += alpha * v2, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator += (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_div> & proxy)
-      {
-        viennacl::linalg::inplace_div_add(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-
-
-      /** @brief Inplace subtraction of a vector
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator -= (const vector<SCALARTYPE, ALIGNMENT> & vec)
-      {
-        viennacl::linalg::inplace_sub(*this, vec);
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator -= (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_sub(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator -= (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                           const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_sub(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator -= (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                            const SCALARTYPE,
-                                                                            op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), -proxy.rhs());
-        return *this;
-      }
-
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator -= (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                            const SCALARTYPE,
-                                                                            op_prod> & proxy)
-      {
-        viennacl::linalg::inplace_mul_add(*this, proxy.lhs(), -proxy.rhs());
-        return *this;
-      }
-      
-      /** @brief Inplace subtraction of a scaled vector, i.e. v1 -= alpha * v2, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator -= (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                            const scalar<SCALARTYPE>,
-                                                                            op_div> & proxy)
-      {
-        viennacl::linalg::inplace_div_sub(*this, proxy.lhs(), proxy.rhs());
-        return *this;
-      }
-      
-      
-      
-
-      /** @brief Scales this vector by a CPU scalar value
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator *= (SCALARTYPE val)
-      {
-        viennacl::linalg::inplace_mult(*this, val);
-        return *this;
-      }
-
-      /** @brief Scales this vector by a GPU scalar value
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator *= (scalar<SCALARTYPE> const & gpu_val)
-      {
-        viennacl::linalg::inplace_mult(*this, gpu_val);
-        return *this;
-      }
-
-      /** @brief Scales this vector by a CPU scalar value
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator /= (SCALARTYPE val)
-      {
-        viennacl::linalg::inplace_mult(*this, static_cast<SCALARTYPE>(1) / val);
-        return *this;
-      }
-      
-      /** @brief Scales this vector by a CPU scalar value
-      */
-      vector<SCALARTYPE, ALIGNMENT> & operator /= (scalar<SCALARTYPE> const & gpu_val)
-      {
-        viennacl::linalg::inplace_divide(*this, gpu_val);
-        return *this;
-      }
-      
-      
-      
-      // free addition
-      
-      /** @brief Adds up two vectors
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator + (const vector<SCALARTYPE, ALIGNMENT> & vec) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(internal_size());
-        viennacl::linalg::add(*this, vec, result);
-        return result;
-      }
-      
-      /** @brief Adds up two vectors, i.e. result = v1 + v2 * alpha, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator + (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 + v2 * alpha, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator + (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 + v2 * alpha, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator + (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const SCALARTYPE,
-                                                                         op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 + v2 * alpha, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator + (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const SCALARTYPE,
-                                                                         op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::mul_add(proxy.lhs(), proxy.rhs(), *this, result);
-        return result;
-      }
-
-
-      //free subtraction:
-      /** @brief Implementation of    result = v1 - v2
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator - (const vector<SCALARTYPE, ALIGNMENT> & vec) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        viennacl::linalg::sub(*this, vec, result);
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 - v2 * alpha, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator - (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        result = *this;
-        viennacl::linalg::inplace_mul_sub(result, proxy.lhs(), proxy.rhs());
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 - v2 * alpha, where alpha is a GPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator - (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const scalar<SCALARTYPE>,
-                                                                           op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        result = *this;
-        viennacl::linalg::inplace_mul_sub(result, proxy.lhs(), proxy.rhs());
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 - v2 * alpha, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator - (const vector_expression< vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const SCALARTYPE,
-                                                                         op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        result = *this;
-        viennacl::linalg::inplace_mul_add(result, proxy.lhs(), -proxy.rhs());
-        return result;
-      }
-
-      /** @brief Adds up two vectors, i.e. result = v1 - v2 * alpha, where alpha is a CPU scalar
-      */
-      vector<SCALARTYPE, ALIGNMENT> operator - (const vector_expression< const vector<SCALARTYPE, ALIGNMENT>,
-                                                                         const SCALARTYPE,
-                                                                         op_prod> & proxy) const
-      {
-        vector<SCALARTYPE, ALIGNMENT> result(size_);
-        result = *this;
-        viennacl::linalg::inplace_mul_add(result, proxy.lhs(), -proxy.rhs());
-        return result;
-      }
-
-      
-      //free multiplication
-      /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_prod> 
-      operator * (SCALARTYPE value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_prod>(*this, value);
-      }
-
-      /** @brief Scales the vector by a GPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_prod> 
-      operator * (scalar<SCALARTYPE> const & value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_prod>(*this, value);
-      }
-
-      //free division
-      /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_div> 
-      operator / (SCALARTYPE value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const SCALARTYPE, op_div>(*this, value);
-      }
-
-      /** @brief Scales the vector by a GPU scalar 'alpha' and returns an expression template
-      */
-      vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_div> 
-      operator / (scalar<SCALARTYPE> const & value) const
-      {
-        return vector_expression< const vector<SCALARTYPE, ALIGNMENT>, const scalar<SCALARTYPE>, op_div>(*this, value);
-      }
-      
-      
-      //// iterators:
-      /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
-      iterator begin()
-      {
-        return iterator(*this, 0);
-      }
-
-      /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
-      iterator end()
-      {
-        return iterator(*this, size());
-      }
-
-      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
-      const_iterator begin() const
-      {
-        return const_iterator(*this, 0);
-      }
-
-      /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
-      const_iterator end() const
-      {
-        return const_iterator(*this, size());
-      }
-
-      /** @brief Swaps the entries of the two vectors
-      */
-      vector<SCALARTYPE, ALIGNMENT> & swap(vector<SCALARTYPE, ALIGNMENT> & other)
-      {
-        swap(*this, other);
-        return *this;
-      };
-      
-      /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
-      */ 
-      vector<SCALARTYPE, ALIGNMENT> & fast_swap(vector<SCALARTYPE, ALIGNMENT> & other) 
-      { 
-        assert(this->size_ == other.size_); 
-        this->elements_.swap(other.elements_); 
-        return *this; 
-      };       
-      
-      /** @brief Returns the length of the vector (cf. std::vector)
-      */
-      size_type size() const { return size_; }
-      
-      /** @brief Returns the maximum possible size of the vector, which is given by 128 MByte due to limitations by OpenCL.
-      */
-      size_type max_size() const
-      {
-        return (128*1024*1024) / sizeof(SCALARTYPE);  //128 MB is maximum size of memory chunks in OpenCL!
-      }
-      /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'ALIGNMENT'
-      */
-      size_type internal_size() const { return viennacl::tools::roundUpToNextMultiple<size_type>(size_, ALIGNMENT); }
-      
-      /** @brief Returns true is the size is zero */
-      bool empty() { return size_ == 0; }
-      
-      /** @brief Returns the OpenCL memory viennacl::ocl::handle. Typically used for launching compute viennacl::ocl::kernels */
-      const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
-
-      /** @brief Resets all entries to zero. Does not change the size of the vector.
-      */
-      void clear()
-      {
-        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "clear");
-        
-        viennacl::ocl::enqueue(k(elements_,
-                                 cl_uint(0),
-                                 cl_uint(internal_size()))
-                              );
-      }
-      //void swap(vector & other){}
-      
-
-      //TODO: Think about implementing the following public member functions
-      //void insert_element(unsigned int i, SCALARTYPE val){}
-      //void erase_element(unsigned int i){}
-      
-    private:
-      cl_uint size_;
-      viennacl::ocl::handle<cl_mem> elements_;
-    }; //vector
-    
-
-    //
-    //////////////////// Copy from GPU to CPU //////////////////////////////////
-    //
-    
-    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
-    *
-    * @param gpu_begin  GPU constant iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_end    GPU constant iterator pointing to the end of the vector (STL-like)
-    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
-              const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
-              CPU_ITERATOR cpu_begin )
-    {
-      assert(gpu_end - gpu_begin >= 0);
-      if (gpu_end - gpu_begin != 0)
-      {
-        std::vector<SCALARTYPE> temp_buffer(gpu_end - gpu_begin);
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_begin.handle().get(), CL_TRUE, 0, 
-                                         sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
-                                         &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-        
-        //now copy entries to cpu_vec:
-        std::copy(temp_buffer.begin(), temp_buffer.end(), cpu_begin);
-      }
-    }
-
-    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
-    *
-    * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
-    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
-              const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
-              CPU_ITERATOR cpu_begin )
-
-    {
-      copy(const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin),
-           const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_end),
-           cpu_begin);
-    }
-    
-    /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::linalg::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    *
-    * @param gpu_vec    A gpu vector
-    * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void copy(vector<SCALARTYPE, ALIGNMENT> const & gpu_vec,
-              CPUVECTOR & cpu_vec )
-    {
-      viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    }
-
-    //from gpu to cpu. Type assumption: cpu_vec lies in a linear memory chunk
-    /** @brief STL-like transfer of a GPU vector to the CPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
-    *
-    * This method is faster than the plain copy() function, because entries are
-    * directly written to the cpu vector, starting with &(*cpu.begin()) However,
-    * keep in mind that the cpu type MUST represent a linear piece of
-    * memory, otherwise you will run into undefined behavior.
-    *
-    * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
-    * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
-                   const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
-                   CPU_ITERATOR cpu_begin )
-    {
-      if (gpu_begin != gpu_end)
-      {
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                         gpu_begin.handle().get(), CL_TRUE, 0,
-                                         sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
-                                         &(*cpu_begin), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-        viennacl::ocl::get_queue().finish();
-      }
-    }
-
-    /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::linalg::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    *
-    * @param gpu_vec    A gpu vector.
-    * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void fast_copy(vector<SCALARTYPE, ALIGNMENT> const & gpu_vec,
-                   CPUVECTOR & cpu_vec )
-    {
-      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
-    }
-
-
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <unsigned int ALIGNMENT>
-    void copy(vector<float, ALIGNMENT> const & gpu_vec,
-              Eigen::VectorXf & eigen_vec)
-    {
-      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
-    }
-    
-    template <unsigned int ALIGNMENT>
-    void copy(vector<double, ALIGNMENT> & gpu_vec,
-              Eigen::VectorXd & eigen_vec)
-    {
-      viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
-    }
-    #endif
-
-
-    //
-    //////////////////// Copy from CPU to GPU //////////////////////////////////
-    //
-
-    //from cpu to gpu. Safe assumption: cpu_vector does not necessarily occupy a linear memory segment, but is not larger than the allocated memory on the GPU
-    /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
-    *
-    * @param cpu_begin  CPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(CPU_ITERATOR const & cpu_begin,
-              CPU_ITERATOR const & cpu_end,
-              vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
-    {
-      assert(cpu_end - cpu_begin > 0);
-      if (cpu_begin != cpu_end)
-      {
-        //we require that the size of the gpu_vector is larger or equal to the cpu-size
-        std::vector<SCALARTYPE> temp_buffer(cpu_end - cpu_begin);
-        std::copy(cpu_begin, cpu_end, temp_buffer.begin());
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.index(),
-                                          sizeof(SCALARTYPE)*(cpu_end - cpu_begin),
-                                          &(temp_buffer[0]), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-
-    // for things like copy(std_vec.begin(), std_vec.end(), vcl_vec.begin() + 1);
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void copy(CPU_ITERATOR const & cpu_begin,
-              CPU_ITERATOR const & cpu_end,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
-    {
-      copy(cpu_begin, cpu_end, vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin));
-    }
-
-    /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::linalg::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    *
-    * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
-    * @param gpu_vec    The gpu vector.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void copy(const CPUVECTOR & cpu_vec, vector<SCALARTYPE, ALIGNMENT> & gpu_vec)
-    {
-      viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    }
-
-    /** @brief STL-like transfer of a CPU vector to the GPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
-    *
-    * This method is faster than the plain copy() function, because entries are
-    * directly read from the cpu vector, starting with &(*cpu.begin()). However,
-    * keep in mind that the cpu type MUST represent a linear piece of
-    * memory, otherwise you will run into undefined behavior.
-    *
-    * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
-    * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
-    void fast_copy(CPU_ITERATOR const & cpu_begin,
-                   CPU_ITERATOR const & cpu_end,
-                   vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
-    {
-      if (cpu_begin != cpu_end)
-      {
-        //we require that the size of the gpu_vector is larger or equal to the cpu-size
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), 
-                                          gpu_begin.handle().get(), CL_TRUE, 0, 
-                                          sizeof(SCALARTYPE)*(cpu_end - cpu_begin), &(*cpu_begin), 0, NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-
-
-    /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::linalg::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    *
-    * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
-    * @param gpu_vec    The gpu vector.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPUVECTOR>
-    void fast_copy(const CPUVECTOR & cpu_vec, vector<SCALARTYPE, ALIGNMENT> & gpu_vec)
-    {
-      viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
-    }
-
-    #ifdef VIENNACL_HAVE_EIGEN
-    template <unsigned int ALIGNMENT>
-    void copy(Eigen::VectorXf const & eigen_vec,
-              vector<float, ALIGNMENT> & gpu_vec)
-    {
-      std::vector<float> entries(eigen_vec.size());
-      for (size_t i = 0; i<entries.size(); ++i)
-        entries[i] = eigen_vec(i);
-      viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
-    }
-    
-    template <unsigned int ALIGNMENT>
-    void copy(Eigen::VectorXd const & eigen_vec,
-              vector<double, ALIGNMENT> & gpu_vec)
-    {
-      std::vector<double> entries(eigen_vec.size());
-      for (size_t i = 0; i<entries.size(); ++i)
-        entries[i] = eigen_vec(i);
-      viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
-    }
-    #endif
-    
-
-
-    //
-    //////////////////// Copy from GPU to GPU //////////////////////////////////
-    //
-    /** @brief Copy (parts of a) GPU vector to another GPU vector
-    *
-    * @param gpu_src_begin    GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_src_end      GPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_dest_begin   Output iterator for the gpu vector. The gpu_dest vector must be at least as long as the gpu_src vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
-    void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
-              vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
-    {
-      assert(gpu_src_end - gpu_src_begin >= 0);
-      if (gpu_src_begin != gpu_src_end)
-      {
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
-                                          gpu_src_begin.handle().get(),  //src handle
-                                          gpu_dest_begin.handle().get(), //dest handle
-                                          sizeof(SCALARTYPE) * gpu_src_begin.index(), //src offset
-                                          sizeof(SCALARTYPE) * gpu_dest_begin.index(), //dest offset
-                                          sizeof(SCALARTYPE) * (gpu_src_end.index() - gpu_src_begin.index()), //data length
-                                          0, //don't know -> check!! (something related to increment?)
-                                          NULL, NULL);
-        VIENNACL_ERR_CHECK(err);
-      }
-    }
-
-    /** @brief Copy (parts of a) GPU vector to another GPU vector
-    *
-    * @param gpu_src_begin   GPU iterator pointing to the beginning of the gpu vector (STL-like)
-    * @param gpu_src_end     GPU iterator pointing to the end of the vector (STL-like)
-    * @param gpu_dest_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
-    void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
-              const_vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
-    {
-      copy(gpu_src_begin, gpu_src_end, vector_iterator<SCALARTYPE, ALIGNMENT_DEST>(gpu_dest_begin));
-    }
-
-    /** @brief Transfer from a ViennaCL vector to another ViennaCL vector. Convenience wrapper for viennacl::linalg::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
-    *
-    * @param gpu_src_vec    A gpu vector
-    * @param gpu_dest_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
-    void copy(vector<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_vec,
-              vector<SCALARTYPE, ALIGNMENT_DEST> & gpu_dest_vec )
-    {
-      viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
-    } 
-
-
-    
-    
-    
-
-    //global functions for handling vectors:
-    /** @brief Output stream. Output format is ublas compatible.
-    * @param s    STL output stream
-    * @param val  The vector that should be printed
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    std::ostream & operator<<(std::ostream & s, vector<SCALARTYPE,ALIGNMENT> const & val)
-    {
-      viennacl::ocl::get_queue().finish();
-      std::vector<SCALARTYPE> tmp(val.size());
-      copy(val.begin(), val.end(), tmp.begin());
-      std::cout << "[" << val.size() << "](";
-      for (typename std::vector<SCALARTYPE>::size_type i=0; i<val.size(); ++i)
-      {
-        if (i > 0)
-          s << ",";
-        s << tmp[i];
-      }
-      std::cout << ")";
-      return s;
-    }
-
-    /** @brief Swaps the contents of two vectors, data is copied
-    *
-    * @param vec1   The first vector
-    * @param vec2   The second vector
-    */
-    template<class SCALARTYPE, unsigned int ALIGNMENT>
-    void swap(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
-              viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
-    {
-      assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2)
-             && "Incompatible vector sizes in swap()");
-
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "swap");
-
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(vec1), cl_uint(viennacl::traits::start(vec1)), cl_uint(viennacl::traits::size(vec1)),
-                               viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)))
-                            );
-    }
-    
-    /** @brief Swaps the content of two vectors by swapping OpenCL handles only, NO data is copied
-    *
-    * @param v1   The first vector
-    * @param v2   The second vector
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    vector<SCALARTYPE, ALIGNMENT> & fast_swap(vector<SCALARTYPE, ALIGNMENT> & v1,
-                                              vector<SCALARTYPE, ALIGNMENT> & v2) 
-    { 
-      return v1.fast_swap(v2);
-    }       
-    
-    
-    
-    ////////// operations /////////////
-    /** @brief Operator overload for the expression alpha * v1, where alpha is a host scalar (float or double) and v1 is a ViennaCL vector.
-    *
-    * @param value   The host scalar (float or double)
-    * @param vec     A ViennaCL vector
-    */
-    template <typename SCALARTYPE, unsigned int A>
-    vector_expression< const vector<SCALARTYPE, A>, const SCALARTYPE, op_prod> operator * (SCALARTYPE const & value, vector<SCALARTYPE, A> const & vec)
-    {
-      return vector_expression< const vector<SCALARTYPE, A>, const SCALARTYPE, op_prod>(vec, value);
-    }
-
-    /** @brief Operator overload for the expression alpha * v1, where alpha is a ViennaCL scalar (float or double) and v1 is a ViennaCL vector.
-    *
-    * @param value   The ViennaCL scalar
-    * @param vec     A ViennaCL vector
-    */
-    template <typename SCALARTYPE, unsigned int A>
-    vector_expression< const vector<SCALARTYPE, A>, const scalar<SCALARTYPE>, op_prod> operator * (scalar<SCALARTYPE> const & value, vector<SCALARTYPE, A> const & vec)
-    {
-        return vector_expression< const vector<SCALARTYPE, A>, const scalar<SCALARTYPE>, op_prod>(vec, value);
-    }
-
-
-    //addition and subtraction of two vector_expressions:
-    /** @brief Operator overload for the addition of two vector expressions.
-    *
-    * @param proxy1  Left hand side vector expression
-    * @param proxy2  Right hand side vector expression
-    */
-    template <typename LHS1, typename RHS1, typename OP1,
-              typename LHS2, typename RHS2, typename OP2>
-    typename vector_expression< LHS1, RHS1, OP1>::VectorType
-    operator + (vector_expression< LHS1, RHS1, OP1> const & proxy1,
-                vector_expression< LHS2, RHS2, OP2> const & proxy2)
-    {
-      assert(proxy1.size() == proxy2.size());
-      typename vector_expression< LHS1, RHS1, OP1>::VectorType result(proxy1.size());
-      result = proxy1;
-      result += proxy2;
-      return result;
-    }
-
-    /** @brief Operator overload for the subtraction of two vector expressions.
-    *
-    * @param proxy1  Left hand side vector expression
-    * @param proxy2  Right hand side vector expression
-    */
-    template <typename LHS1, typename RHS1, typename OP1,
-              typename LHS2, typename RHS2, typename OP2>
-    typename vector_expression< LHS1, RHS1, OP1>::VectorType
-    operator - (vector_expression< LHS1, RHS1, OP1> const & proxy1,
-                vector_expression< LHS2, RHS2, OP2> const & proxy2)
-    {
-      assert(proxy1.size() == proxy2.size());
-      typename vector_expression< LHS1, RHS1, OP1>::VectorType result(proxy1.size());
-      result = proxy1;
-      result -= proxy2;
-      return result;
-    }
-    
-    //////////// one vector expression from left /////////////////////////////////////////
-    
-    /** @brief Operator overload for the addition of a vector expression from the left, e.g. alpha * vec1 + vec2. Here, alpha * vec1 is wrapped into a vector_expression and then added to vec2.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param vec     Right hand side vector
-    */
-    template <typename SCALARTYPE, unsigned int A, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE, A> operator + (vector_expression< LHS, RHS, OP> const & proxy,
-                                      vector<SCALARTYPE, A> const & vec)
-    {
-      assert(proxy.size() == vec.size());
-      vector<SCALARTYPE, A> result(vec.size());
-      result = proxy;
-      result += vec;
-      return result;
-    }
-
-    /** @brief Operator overload for the subtraction of a vector expression from the left, e.g. alpha * vec1 + vec2. Here, alpha * vec1 is wrapped into a vector_expression and then added to vec2.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param vec     Right hand side vector
-    */
-    template <typename SCALARTYPE, unsigned int A, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE, A> operator - (vector_expression< LHS, RHS, OP> const & proxy,
-                                      vector<SCALARTYPE, A> const & vec)
-    {
-      assert(proxy.size() == vec.size());
-      vector<SCALARTYPE, A> result(vec.size());
-      result = proxy;
-      result -= vec;
-      return result;
-    }
-
-
-    /** @brief Operator overload for the multiplication of a vector expression with a scalar from the right, e.g. (beta * vec1) * alpha. Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the right.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param val     Right hand side scalar
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE> operator * (vector_expression< LHS, RHS, OP> const & proxy,
-                                   scalar<SCALARTYPE> const & val)
-    {
-      vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result *= val;
-      return result;
-    }
-
-    /** @brief Operator overload for the division of a vector expression by a scalar from the right, e.g. (beta * vec1) / alpha. Here, beta * vec1 is wrapped into a vector_expression and then divided by alpha.
-    *
-    * @param proxy   Left hand side vector expression
-    * @param val     Right hand side scalar
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE> operator / (vector_expression< LHS, RHS, OP> const & proxy,
-                                      scalar<SCALARTYPE> const & val)
-    {
-      vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result /= val;
-      return result;
-    }
-
-
-    //////////// one vector expression from right (on scalar) ///////////////////////
-    
-    /** @brief Operator overload for the multiplication of a vector expression with a ViennaCL scalar from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
-    *
-    * @param val     Right hand side scalar
-    * @param proxy   Left hand side vector expression
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    vector<SCALARTYPE> operator * (scalar<SCALARTYPE> const & val,
-                                   vector_expression< LHS, RHS, OP> const & proxy)
-    {
-      vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result *= val;
-      return result;
-    }
-    
-    /** @brief Operator overload for the multiplication of a vector expression with a host scalar (float or double) from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
-    *
-    * @param val     Right hand side scalar
-    * @param proxy   Left hand side vector expression
-    */
-    template <typename SCALARTYPE, typename LHS, typename RHS, typename OP>
-    viennacl::vector<SCALARTYPE> operator * (SCALARTYPE val,
-                                   viennacl::vector_expression< LHS, RHS, OP> const & proxy)
-    {
-      viennacl::vector<SCALARTYPE> result(proxy.size());
-      result = proxy;
-      result *= val;
-      return result;
-    }
-
-}
-
-#endif
+#ifndef VIENNACL_VECTOR_HPP_
+#define VIENNACL_VECTOR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/vector.hpp
+    @brief The vector type with operator-overloads and proxy classes is defined here.
+           Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/backend/memory.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+#include "viennacl/linalg/detail/op_executor.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/meta/result_of.hpp"
+//#include "viennacl/rand/utils.hpp"
+#include "viennacl/context.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+
+  /** @brief Common base class for representing vectors where the entries are not all stored explicitly.
+    *
+    * Typical examples are zero_vector or scalar_vector.
+    */
+  template<typename SCALARTYPE>
+  class implicit_vector_base
+  {
+    protected:
+      typedef vcl_size_t        size_type;
+      implicit_vector_base(size_type s, vcl_size_t i, std::pair<SCALARTYPE, bool> v, viennacl::context ctx) : size_(s), index_(std::make_pair(true,i)), value_(v), ctx_(ctx){ }
+      implicit_vector_base(size_type s, std::pair<SCALARTYPE, bool> v, viennacl::context ctx) : size_(s), index_(std::make_pair(false,0)), value_(v), ctx_(ctx){ }
+
+    public:
+      typedef SCALARTYPE const & const_reference;
+      typedef SCALARTYPE cpu_value_type;
+
+      viennacl::context context() const { return ctx_; }
+
+      size_type size() const { return size_; }
+
+      cpu_value_type  value() const { return value_.first; }
+
+      bool is_value_static() const { return value_.second; }
+
+      vcl_size_t index() const { return index_.second; }
+
+      bool has_index() const { return index_.first; }
+
+      cpu_value_type operator()(size_type i) const {
+        if(index_.first)
+          return (i==index_.second)?value_.first:0;
+        return value_.first;
+      }
+
+      cpu_value_type operator[](size_type i) const {
+        if(index_.first)
+          return (i==index_.second)?value_.first:0;
+        return
+            value_.first;
+      }
+
+    protected:
+      size_type size_;
+      std::pair<bool, vcl_size_t> index_;
+      std::pair<SCALARTYPE, bool> value_;
+      viennacl::context ctx_;
+  };
+
+  /** @brief Represents a vector consisting of 1 at a given index and zeros otherwise.*/
+  template <typename SCALARTYPE>
+  class unit_vector : public implicit_vector_base<SCALARTYPE>
+  {
+      typedef implicit_vector_base<SCALARTYPE> base_type;
+    public:
+      typedef typename base_type::size_type size_type;
+      unit_vector(size_type s, size_type ind, viennacl::context ctx = viennacl::context()) : base_type(s, ind, std::make_pair(SCALARTYPE(1),true), ctx)
+      {
+        assert( (ind < s) && bool("Provided index out of range!") );
+      }
+  };
+
+
+  /** @brief Represents a vector consisting of zeros only. */
+  template <typename SCALARTYPE>
+  class zero_vector : public implicit_vector_base<SCALARTYPE>
+  {
+      typedef implicit_vector_base<SCALARTYPE> base_type;
+    public:
+      typedef typename base_type::size_type size_type;
+      typedef SCALARTYPE        const_reference;
+      zero_vector(size_type s, viennacl::context ctx = viennacl::context()) : base_type(s, std::make_pair(SCALARTYPE(0),true), ctx) {}
+  };
+
+  /** @brief Represents a vector consisting of ones only. */
+  template <typename SCALARTYPE>
+  class one_vector : public implicit_vector_base<SCALARTYPE>
+  {
+      typedef implicit_vector_base<SCALARTYPE> base_type;
+    public:
+      typedef typename base_type::size_type size_type;
+      typedef SCALARTYPE        const_reference;
+      one_vector(size_type s, viennacl::context ctx = viennacl::context()) : base_type(s, std::make_pair(SCALARTYPE(1),true), ctx) {}
+  };
+
+
+  /** @brief Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for viennacl::vector, vector_range, or vector_slize only. */
+  template <typename SCALARTYPE>
+  class scalar_vector : public implicit_vector_base<SCALARTYPE>
+  {
+      typedef implicit_vector_base<SCALARTYPE> base_type;
+    public:
+      typedef typename base_type::size_type size_type;
+      typedef SCALARTYPE const & const_reference;
+
+      scalar_vector(size_type s, SCALARTYPE val, viennacl::context ctx = viennacl::context()) : base_type(s, std::make_pair(val,false), ctx) {}
+  };
+
+
+//#ifdef VIENNACL_WITH_OPENCL
+//  template<class SCALARTYPE, class DISTRIBUTION>
+//  rand::random_vector_t<SCALARTYPE, DISTRIBUTION> random_vector(unsigned int size, DISTRIBUTION const & distribution){
+//      return rand::random_vector_t<SCALARTYPE,DISTRIBUTION>(size,distribution);
+//  }
+//#endif
+
+
+  //
+  // Vector expression
+  //
+
+  /** @brief An expression template class that represents a binary operation that yields a vector
+  *
+  * In contrast to full expression templates as introduced by Veldhuizen, ViennaCL does not allow nested expressions.
+  * The reason is that this requires automated GPU viennacl::ocl::kernel generation, which then has to be compiles just-in-time.
+  * For performance-critical applications, one better writes the appropriate viennacl::ocl::kernels by hand.
+  *
+  * Assumption: dim(LHS) >= dim(RHS), where dim(scalar) = 0, dim(vector) = 1 and dim(matrix = 2)
+  *
+  * @tparam LHS   left hand side operand
+  * @tparam RHS   right hand side operand
+  * @tparam OP    the operator
+  */
+  template <typename LHS, typename RHS, typename OP>
+  class vector_expression
+  {
+      typedef typename viennacl::result_of::reference_if_nonscalar<LHS>::type     lhs_reference_type;
+      typedef typename viennacl::result_of::reference_if_nonscalar<RHS>::type     rhs_reference_type;
+
+    public:
+      enum { alignment = 1 };
+
+      /** @brief Extracts the vector type from the two operands.
+      */
+      typedef vcl_size_t       size_type;
+
+      vector_expression(LHS & l, RHS & r) : lhs_(l), rhs_(r) {}
+
+      /** @brief Get left hand side operand
+      */
+      lhs_reference_type lhs() const { return lhs_; }
+      /** @brief Get right hand side operand
+      */
+      rhs_reference_type rhs() const { return rhs_; }
+
+      /** @brief Returns the size of the result vector */
+      size_type size() const { return viennacl::traits::size(*this); }
+
+    private:
+      /** @brief The left hand side operand */
+      lhs_reference_type lhs_;
+      /** @brief The right hand side operand */
+      rhs_reference_type rhs_;
+  };
+
+  /** @brief A STL-type const-iterator for vector elements. Elements can be accessed, but cannot be manipulated. VERY SLOW!!
+  *
+  * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20.000 dereferences take one second.
+  * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators is as fast as for CPU types.
+  * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
+  * std::vector<float> temp;
+  * copy(gpu_vector, temp);
+  * for (std::vector<float>::const_iterator iter = temp.begin();
+  *      iter != temp.end();
+  *      ++iter)
+  * {
+  *   //do something
+  * }
+  * Note that you may obtain inconsistent data if entries of gpu_vector are manipulated elsewhere in the meanwhile.
+  *
+  * @tparam SCALARTYPE  The underlying floating point type (either float or double)
+  * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
+  */
+  template<class SCALARTYPE, unsigned int ALIGNMENT>
+  class const_vector_iterator
+  {
+      typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>    self_type;
+    public:
+      typedef scalar<SCALARTYPE>            value_type;
+      typedef vcl_ptrdiff_t                 difference_type;
+      typedef viennacl::backend::mem_handle handle_type;
+
+      //const_vector_iterator() {}
+
+      /** @brief Constructor
+      *   @param vec    The vector over which to iterate
+      *   @param index  The starting index of the iterator
+      *   @param start  First index of the element in the vector pointed to be the iterator (for vector_range and vector_slice)
+      *   @param stride Stride for the support of vector_slice
+      */
+      const_vector_iterator(vector_base<SCALARTYPE> const & vec,
+                            vcl_size_t index,
+                            vcl_size_t start = 0,
+                            vcl_ptrdiff_t stride = 1) : elements_(vec.handle()), index_(index), start_(start), stride_(stride) {}
+
+      /** @brief Constructor for vector-like treatment of arbitrary buffers
+      *   @param elements  The buffer over which to iterate
+      *   @param index     The starting index of the iterator
+      *   @param start     First index of the element in the vector pointed to be the iterator (for vector_range and vector_slice)
+      *   @param stride    Stride for the support of vector_slice
+      */
+      const_vector_iterator(handle_type const & elements,
+                            vcl_size_t index,
+                            vcl_size_t start = 0,
+                            vcl_ptrdiff_t stride = 1) : elements_(elements), index_(index), start_(start), stride_(stride) {}
+
+      /** @brief Dereferences the iterator and returns the value of the element. For convenience only, performance is poor due to OpenCL overhead! */
+      value_type operator*(void) const
+      {
+          value_type result;
+          result = const_entry_proxy<SCALARTYPE>(start_ + index_ * stride_, elements_);
+          return result;
+      }
+      self_type operator++(void) { index_ += stride_; return *this; }
+      self_type operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
+
+      bool operator==(self_type const & other) const { return index_ == other.index_; }
+      bool operator!=(self_type const & other) const { return index_ != other.index_; }
+
+//        self_type & operator=(self_type const & other)
+//        {
+//           index_ = other._index;
+//           elements_ = other._elements;
+//           return *this;
+//        }
+
+      difference_type operator-(self_type const & other) const
+      {
+        assert( (other.start_ == start_) && (other.stride_ == stride_) && bool("Iterators are not from the same vector (proxy)!"));
+        return static_cast<difference_type>(index_) - static_cast<difference_type>(other.index_);
+      }
+      self_type operator+(difference_type diff) const { return self_type(elements_, index_ + diff * stride_, start_, stride_); }
+
+      //vcl_size_t index() const { return index_; }
+      /** @brief Offset of the current element index with respect to the beginning of the buffer */
+      vcl_size_t offset() const { return start_ + index_ * stride_; }
+
+      /** @brief Index increment in the underlying buffer when incrementing the iterator to the next element */
+      vcl_size_t stride() const { return stride_; }
+      handle_type const & handle() const { return elements_; }
+
+    protected:
+      /** @brief  The index of the entry the iterator is currently pointing to */
+      handle_type const & elements_;
+      vcl_size_t index_;  //offset from the beginning of elements_
+      vcl_size_t start_;
+      vcl_ptrdiff_t stride_;
+  };
+
+
+  /** @brief A STL-type iterator for vector elements. Elements can be accessed and manipulated. VERY SLOW!!
+  *
+  * Every dereference operation initiates a transfer from the GPU to the CPU. The overhead of such a transfer is around 50us, so 20.000 dereferences take one second.
+  * This is four orders of magnitude slower than similar dereferences on the CPU. However, increments and comparisons of iterators is as fast as for CPU types.
+  * If you need a fast iterator, copy the whole vector to the CPU first and iterate over the CPU object, e.g.
+  * std::vector<float> temp;
+  * copy(gpu_vector, temp);
+  * for (std::vector<float>::const_iterator iter = temp.begin();
+  *      iter != temp.end();
+  *      ++iter)
+  * {
+  *   //do something
+  * }
+  * copy(temp, gpu_vector);
+  * Note that you may obtain inconsistent data if you manipulate entries of gpu_vector in the meanwhile.
+  *
+  * @tparam SCALARTYPE  The underlying floating point type (either float or double)
+  * @tparam ALIGNMENT   Alignment of the underlying vector, @see vector
+  */
+  template<class SCALARTYPE, unsigned int ALIGNMENT>
+  class vector_iterator : public const_vector_iterator<SCALARTYPE, ALIGNMENT>
+  {
+      typedef const_vector_iterator<SCALARTYPE, ALIGNMENT>  base_type;
+      typedef vector_iterator<SCALARTYPE, ALIGNMENT>        self_type;
+    public:
+      typedef typename base_type::handle_type               handle_type;
+      typedef typename base_type::difference_type           difference_type;
+
+      vector_iterator() : base_type(), elements_(NULL) {}
+      vector_iterator(handle_type & elements,
+                      vcl_size_t index,
+                      vcl_size_t start = 0,
+                      vcl_ptrdiff_t stride = 1)  : base_type(elements, index, start, stride), elements_(elements) {}
+      /** @brief Constructor
+      *   @param vec    The vector over which to iterate
+      *   @param index  The starting index of the iterator
+      *   @param start  Offset from the beginning of the underlying vector (for ranges and slices)
+      *   @param stride Stride for slices
+      */
+      vector_iterator(vector_base<SCALARTYPE> & vec,
+                      vcl_size_t index,
+                      vcl_size_t start = 0,
+                      vcl_ptrdiff_t stride = 1) : base_type(vec, index, start, stride), elements_(vec.handle()) {}
+      //vector_iterator(base_type const & b) : base_type(b) {}
+
+      typename base_type::value_type operator*(void)
+      {
+          typename base_type::value_type result;
+          result = entry_proxy<SCALARTYPE>(base_type::start_ + base_type::index_ * base_type::stride_, elements_);
+          return result;
+      }
+
+      difference_type operator-(self_type const & other) const { difference_type result = base_type::index_; return (result - static_cast<difference_type>(other.index_)); }
+      self_type operator+(difference_type diff) const { return self_type(elements_, base_type::index_ + diff * base_type::stride_, base_type::start_, base_type::stride_); }
+
+      handle_type       & handle()       { return elements_; }
+      handle_type const & handle() const { return base_type::elements_; }
+
+      //operator base_type() const
+      //{
+      //  return base_type(base_type::elements_, base_type::index_, base_type::start_, base_type::stride_);
+      //}
+    private:
+      handle_type & elements_;
+  };
+
+
  /** @brief Common base class for dense vectors, vector ranges, and vector slices.
    *
    * @tparam SCALARTYPE    The floating point type, either 'float' or 'double'
    * @tparam SizeType      Integer type used for sizes and indices (see forwards.h for the default type)
    * @tparam DistanceType  Signed integer type used for strides and iterator differences (see forwards.h for the default type)
    */
  template<class SCALARTYPE, typename SizeType /* see forwards.h for default type */, typename DistanceType /* see forwards.h for default type */>
  class vector_base
  {
      typedef vector_base<SCALARTYPE>         self_type;

    public:
      typedef scalar<SCALARTYPE>                                value_type;
      typedef SCALARTYPE                                        cpu_value_type;
      typedef viennacl::backend::mem_handle                     handle_type;
      typedef SizeType                                          size_type;
      typedef DistanceType                                      difference_type;
      typedef const_vector_iterator<SCALARTYPE, 1>              const_iterator;
      typedef vector_iterator<SCALARTYPE, 1>                    iterator;

      // Internal buffer lengths (in elements) are padded up to a multiple of this value:
      static const size_type alignment = 128;

      /** @brief Default constructor in order to be compatible with various containers.
      */
      explicit vector_base() : size_(0), start_(0), stride_(1), internal_size_(0) { /* Note: One must not call ::init() here because a vector might have been created globally before the backend has become available */ }

      /** @brief An explicit constructor for wrapping an existing vector into a vector_range or vector_slice.
       *
       *
       *
       * @param h          The existing memory handle from a vector/vector_range/vector_slice
       * @param vec_size   The length (i.e. size) of the buffer
       * @param vec_start  The offset from the beginning of the buffer identified by 'h'
       * @param vec_stride Increment between two elements in the original buffer (in multiples of SCALARTYPE)
      */
      explicit vector_base(viennacl::backend::mem_handle & h,
                           size_type vec_size, size_type vec_start, difference_type vec_stride)
        : size_(vec_size), start_(vec_start), stride_(vec_stride), internal_size_(vec_size), elements_(h) {}

      /** @brief Creates a vector and allocates the necessary memory */
      explicit vector_base(size_type vec_size, viennacl::context ctx = viennacl::context())
        : size_(vec_size), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple<size_type>(size_, alignment))
      {
        if (size_ > 0)
        {
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), ctx);
          clear();  // zero-initialize including the padding region
        }
      }

      // CUDA or host memory:
      /** @brief Wraps user-provided CUDA or host memory without copying it.
      *
      * The handle's reference count is incremented so the user-provided memory is not freed when this object is destroyed.
      * NOTE(review): mem_type == viennacl::OPENCL_MEMORY is not handled here and would leave the handle in its default state — confirm callers only pass CUDA_MEMORY or MAIN_MEMORY.
      *
      * @param ptr_to_mem  Pointer to the existing memory
      * @param mem_type    Either viennacl::CUDA_MEMORY or viennacl::MAIN_MEMORY
      * @param vec_size    The length (i.e. size) of the vector
      * @param start       Offset (in elements) of the first vector entry within the buffer
      * @param stride      Increment between two elements (in multiples of SCALARTYPE)
      */
      explicit vector_base(SCALARTYPE * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start = 0, difference_type stride = 1)
        : size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size)
      {
        if (mem_type == viennacl::CUDA_MEMORY)
        {
#ifdef VIENNACL_WITH_CUDA
          elements_.switch_active_handle_id(viennacl::CUDA_MEMORY);
          elements_.cuda_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
          elements_.cuda_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed.
#else
          throw cuda_not_available_exception();
#endif
        }
        else if (mem_type == viennacl::MAIN_MEMORY)
        {
          elements_.switch_active_handle_id(viennacl::MAIN_MEMORY);
          elements_.ram_handle().reset(reinterpret_cast<char*>(ptr_to_mem));
          elements_.ram_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed.
        }

        elements_.raw_size(sizeof(SCALARTYPE) * vec_size);

      }

#ifdef VIENNACL_WITH_OPENCL
      /** @brief Create a vector from existing OpenCL memory
      *
      * Note: The provided memory must take an eventual ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
      * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
      *
      * @param existing_mem   An OpenCL handle representing the memory
      * @param vec_size       The size of the vector.
      */
      explicit vector_base(cl_mem existing_mem, size_type vec_size, size_type start = 0, difference_type stride = 1, viennacl::context ctx = viennacl::context())
        : size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size)
      {
        elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
        elements_.opencl_handle() = existing_mem;
        elements_.opencl_handle().inc();  //prevents that the user-provided memory is deleted once the vector object is destroyed.
        elements_.opencl_handle().context(ctx.opencl_context());
        elements_.raw_size(sizeof(SCALARTYPE) * vec_size);
      }
#endif

      /** @brief Creates the vector from the supplied random vector. */
      /*template<class DISTRIBUTION>
      vector(rand::random_vector_t<SCALARTYPE, DISTRIBUTION> v) : size_(v.size)
      {
        if(size_ > 0)
        {
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size());
          rand::buffer_dumper<SCALARTYPE, DISTRIBUTION>::dump(elements_,v.distribution,0,size_);
        }
      } */

      /** @brief Creates the vector by evaluating the supplied expression template (allocates memory, then delegates to operator=). */
      template <typename LHS, typename RHS, typename OP>
      explicit vector_base(vector_expression<const LHS, const RHS, OP> const & proxy)
        : size_(viennacl::traits::size(proxy)), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple<size_type>(size_, alignment))
      {
        if (size_ > 0)
        {
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
          clear();
        }
        self_type::operator=(proxy);
      }


      //
      // operator=
      //


      /** @brief Assignment operator. Other vector needs to be of the same size, or this vector is not yet initialized.
      */
      self_type & operator=(const self_type & vec)
      {
        assert( ( (vec.size() == size()) || (size() == 0) )
                && bool("Incompatible vector sizes!"));

        if (vec.size() > 0)
        {
          // lazily allocate the buffer on first assignment to an empty vector:
          if (size_ == 0)
          {
            size_ = vec.size();
            internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
            elements_.switch_active_handle_id(vec.handle().get_active_handle_id());
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(vec));
            pad();
          }

          // copy implemented as *this = 1.0 * vec:
          viennacl::linalg::av(*this,
                               vec, cpu_value_type(1.0), 1, false, false);
        }

        return *this;
      }


      /** @brief Implementation of the operation v1 = v2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
      *
      * @param proxy  An expression template proxy class.
      */
      template <typename LHS, typename RHS, typename OP>
      self_type & operator=(const vector_expression<const LHS, const RHS, OP> & proxy)
      {
        assert( ( (viennacl::traits::size(proxy) == size()) || (size() == 0) )
                && bool("Incompatible vector sizes!"));

        // initialize the necessary buffer
        if (size() == 0)
        {
          size_ = viennacl::traits::size(proxy);
          internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
          viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(proxy));
          pad();
        }

        // dispatch to the operation-specific executor (selected at compile time):
        linalg::detail::op_executor<self_type, op_assign, vector_expression<const LHS, const RHS, OP> >::apply(*this, proxy);

        return *this;
      }

      // assign vector range or vector slice
      template <typename T>
      self_type &
      operator = (const vector_base<T> & v1)
      {
        assert( ( (v1.size() == size()) || (size() == 0) )
                && bool("Incompatible vector sizes!"));

        if (size() == 0)
        {
          size_ = v1.size();
          if (size_ > 0)
          {
            internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), viennacl::traits::context(v1));
            pad();
          }
        }

        viennacl::linalg::av(*this,
                             v1, SCALARTYPE(1.0), 1, false, false);

        return *this;
      }

      /** @brief Creates the vector from the supplied unit vector. */
      self_type & operator = (unit_vector<SCALARTYPE> const & v)
      {
        assert( ( (v.size() == size()) || (size() == 0) )
                && bool("Incompatible vector sizes!"));

        if (size() == 0)
        {
          size_ = v.size();
          internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
          if (size_ > 0)
          {
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), v.context());
            clear();
          }
        }
        else
          viennacl::linalg::vector_assign(*this, SCALARTYPE(0));  // reset existing entries first

        // set the single non-zero entry:
        if (size_ > 0)
          this->operator()(v.index()) = SCALARTYPE(1);

        return *this;
      }

      /** @brief Creates the vector from the supplied zero vector. */
      self_type & operator = (zero_vector<SCALARTYPE> const & v)
      {
        assert( ( (v.size() == size()) || (size() == 0) )
                && bool("Incompatible vector sizes!"));

        if (size() == 0)
        {
          size_ = v.size();
          internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
          if (size_ > 0)
          {
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), v.context());
            clear();
          }
        }
        else
          viennacl::linalg::vector_assign(*this, SCALARTYPE(0));

        return *this;
      }

      /** @brief Creates the vector from the supplied scalar vector. */
      self_type & operator = (scalar_vector<SCALARTYPE> const & v)
      {
        assert( ( (v.size() == size()) || (size() == 0) )
                && bool("Incompatible vector sizes!"));

        if (size() == 0)
        {
          size_ = v.size();
          internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
          if (size_ > 0)
          {
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*internal_size(), v.context());
            pad();
          }
        }

        if (size_ > 0)
          viennacl::linalg::vector_assign(*this, v[0]);  // all entries carry the same value v[0]

        return *this;
      }



      ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////

      //Note: The following operator overloads are defined in matrix_operations.hpp, compressed_matrix_operations.hpp and coordinate_matrix_operations.hpp
      //This is certainly not the nicest approach and will most likely by changed in the future, but it works :-)

      //matrix<>
      /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
      *
      * @param proxy An expression template proxy class
      */
      template <typename F>
      self_type & operator=(const viennacl::vector_expression< const matrix_base<SCALARTYPE, F>, const vector_base<SCALARTYPE>, viennacl::op_prod> & proxy)
      {
        assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed for v1 = A * v2: size1(A) != size(v1)"));

        // check for the special case x = A * x
        if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
        {
          // aliasing: compute into a temporary, then assign
          viennacl::vector<SCALARTYPE> result(viennacl::traits::size1(proxy.lhs()));
          viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
          *this = result;
        }
        else
        {
          viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
        }
        return *this;
      }


      //transposed_matrix_proxy:
      /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
      *
      * @param proxy An expression template proxy class
      */
      template <typename F>
      self_type & operator=(const vector_expression< const matrix_expression< const matrix_base<SCALARTYPE, F>, const matrix_base<SCALARTYPE, F>, op_trans >,
                                                     const vector_base<SCALARTYPE>,
                                                     op_prod> & proxy)
      {
        assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed in v1 = trans(A) * v2: size2(A) != size(v1)"));

        // check for the special case x = trans(A) * x
        if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
        {
          // aliasing: compute into a temporary, then assign
          viennacl::vector<SCALARTYPE> result(viennacl::traits::size1(proxy.lhs()));
          viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
          *this = result;
        }
        else
        {
          viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
        }
        return *this;
      }

      ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////


      //read-write access to an element of the vector
      /** @brief Read-write access to a single element of the vector
      */
      entry_proxy<SCALARTYPE> operator()(size_type index)
      {
        assert( (size() > 0)  && bool("Cannot apply operator() to vector of size zero!"));
        assert( index < size() && bool("Index out of bounds!") );

        return entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
      }

      /** @brief Read-write access to a single element of the vector
      */
      entry_proxy<SCALARTYPE> operator[](size_type index)
      {
        assert( (size() > 0)  && bool("Cannot apply operator() to vector of size zero!"));
        assert( index < size() && bool("Index out of bounds!") );

        return entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
      }


      /** @brief Read access to a single element of the vector
      */
      const_entry_proxy<SCALARTYPE> operator()(size_type index) const
      {
        assert( (size() > 0)  && bool("Cannot apply operator() to vector of size zero!"));
        assert( index < size() && bool("Index out of bounds!") );

        return const_entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
      }

      /** @brief Read access to a single element of the vector
      */
      const_entry_proxy<SCALARTYPE> operator[](size_type index) const
      {
        assert( (size() > 0)  && bool("Cannot apply operator() to vector of size zero!"));
        assert( index < size() && bool("Index out of bounds!") );

        return const_entry_proxy<SCALARTYPE>(start_ + stride_ * index, elements_);
      }

      //
      // Operator overloads with implicit conversion (thus cannot be made global without introducing additional headache)
      //
      /** @brief In-place addition of another vector of the same size. */
      self_type & operator += (const self_type & vec)
      {
        assert(vec.size() == size() && bool("Incompatible vector sizes!"));

        if (size() > 0)
          viennacl::linalg::avbv(*this,
                                  *this, SCALARTYPE(1.0), 1, false, false,
                                  vec,   SCALARTYPE(1.0), 1, false, false);
        return *this;
      }

      /** @brief In-place subtraction of another vector of the same size. */
      self_type & operator -= (const self_type & vec)
      {
        assert(vec.size() == size() && bool("Incompatible vector sizes!"));

        if (size() > 0)
          viennacl::linalg::avbv(*this,
                                  *this, SCALARTYPE(1.0),  1, false, false,
                                  vec,   SCALARTYPE(-1.0), 1, false, false);
        return *this;
      }

      /** @brief In-place addition of the result of an expression template. The vector must already be initialized. */
      template <typename LHS, typename RHS, typename OP>
      self_type & operator += (const vector_expression<const LHS, const RHS, OP> & proxy)
      {
        assert( (viennacl::traits::size(proxy) == size()) && bool("Incompatible vector sizes!"));
        assert( (size() > 0) && bool("Vector not yet initialized!") );

        linalg::detail::op_executor<self_type, op_inplace_add, vector_expression<const LHS, const RHS, OP> >::apply(*this, proxy);

        return *this;
      }

      /** @brief In-place subtraction of the result of an expression template. The vector must already be initialized. */
      template <typename LHS, typename RHS, typename OP>
      self_type & operator -= (const vector_expression<const LHS, const RHS, OP> & proxy)
      {
        assert( (viennacl::traits::size(proxy) == size()) && bool("Incompatible vector sizes!"));
        assert( (size() > 0) && bool("Vector not yet initialized!") );

        linalg::detail::op_executor<self_type, op_inplace_sub, vector_expression<const LHS, const RHS, OP> >::apply(*this, proxy);

        return *this;
      }

      /** @brief Scales a vector (or proxy) by a CPU scalar value
      */
      self_type & operator *= (SCALARTYPE val)
      {
        if (size() > 0)
          viennacl::linalg::av(*this,
                                *this, val, 1, false, false);
        return *this;
      }

      /** @brief Scales this vector by a CPU scalar value
      */
      self_type & operator /= (SCALARTYPE val)
      {
        if (size() > 0)
          viennacl::linalg::av(*this,
                               *this, val, 1, true, false);  // 'true' flag selects division instead of multiplication
        return *this;
      }


      /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
      */
      vector_expression< const self_type, const SCALARTYPE, op_mult>
      operator * (SCALARTYPE value) const
      {
        return vector_expression< const self_type, const SCALARTYPE, op_mult>(*this, value);
      }


      /** @brief Scales the vector by a CPU scalar 'alpha' and returns an expression template
      */
      vector_expression< const self_type, const SCALARTYPE, op_div>
      operator / (SCALARTYPE value) const
      {
        return vector_expression< const self_type, const SCALARTYPE, op_div>(*this, value);
      }


      /** @brief Sign flip for the vector. Emulated to be equivalent to -1.0 * vector */
      vector_expression<const self_type, const SCALARTYPE, op_mult> operator-() const
      {
        return vector_expression<const self_type, const SCALARTYPE, op_mult>(*this, SCALARTYPE(-1.0));
      }

      //
      //// iterators:
      //

      /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
      iterator begin()
      {
        return iterator(*this, 0, start_, stride_);
      }

      /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
      iterator end()
      {
        return iterator(*this, size(), start_, stride_);
      }

      /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
      const_iterator begin() const
      {
        return const_iterator(*this, 0, start_, stride_);
      }

      /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
      const_iterator end() const
      {
        return const_iterator(*this, size(), start_, stride_);
      }

      /** @brief Swaps the entries of the two vectors
      */
      self_type & swap(self_type & other)
      {
        viennacl::linalg::vector_swap(*this, other);
        return *this;
      };


      /** @brief Returns the length of the vector (cf. std::vector)
      */
      size_type size() const { return size_; }

      /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'ALIGNMENT'
      */
      size_type internal_size() const { return internal_size_; }

      /** @brief Returns the offset within the buffer
      */
      size_type start() const { return start_; }

      /** @brief Returns the stride within the buffer (in multiples of sizeof(SCALARTYPE))
      */
      size_type stride() const { return stride_; }


      /** @brief Returns true is the size is zero */
      bool empty() const { return size_ == 0; }

      /** @brief Returns the memory handle. */
      const handle_type & handle() const { return elements_; }

      /** @brief Returns the memory handle. */
      handle_type & handle() { return elements_; }

      /** @brief Resets all entries to zero. Does not change the size of the vector.
      */
      void clear()
      {
        viennacl::linalg::vector_assign(*this, cpu_value_type(0.0), true);
      }

      /** @brief Returns the memory domain (OpenCL, CUDA, or main memory) the vector data currently resides in. */
      viennacl::memory_types memory_domain() const
      {
        return elements_.get_active_handle_id();
      }

    protected:

      /** @brief Replaces the internal memory handle (shallow; no data copy). */
      void set_handle(viennacl::backend::mem_handle const & h)
      {
        elements_ = h;
      }

      /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
      */
      self_type & fast_swap(self_type & other)
      {
        assert(this->size_ == other.size_ && bool("Vector size mismatch"));
        this->elements_.swap(other.elements_);
        return *this;
      }

      /** @brief Pads vectors with alignment > 1 with trailing zeros if the internal size is larger than the visible size */
      void pad()
      {
        if (internal_size() != size())
        {
          // write zeros into the padding region [size(), internal_size()):
          std::vector<SCALARTYPE> pad(internal_size() - size());
          viennacl::backend::memory_write(elements_, sizeof(SCALARTYPE) * size(), sizeof(SCALARTYPE) * pad.size(), &(pad[0]));
        }
      }

      /** @brief Migrates the vector data to a different memory context (e.g. from main memory to an OpenCL device). */
      void switch_memory_context(viennacl::context new_ctx)
      {
        viennacl::backend::switch_memory_context<SCALARTYPE>(elements_, new_ctx);
      }

      //TODO: Think about implementing the following public member functions
      //void insert_element(unsigned int i, SCALARTYPE val){}
      //void erase_element(unsigned int i){}

      //enlarge or reduce allocated memory and set unused memory to zero
      /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
      *
      *  @param new_size  The new size of the vector
      *  @param preserve  If true, old entries of the vector are preserved, otherwise eventually discarded.
      */
      void resize(size_type new_size, bool preserve = true)
      {
        resize_impl(new_size, viennacl::traits::context(*this), preserve);
      }

      /** @brief Resizes the allocated memory for the vector. Convenience function for setting an OpenCL context in case reallocation is needed
      *
      *  @param new_size  The new size of the vector
      *  @param ctx       The context within which the new memory should be allocated
      *  @param preserve  If true, old entries of the vector are preserved, otherwise eventually discarded.
      */
      void resize(size_type new_size, viennacl::context ctx, bool preserve = true)
      {
        resize_impl(new_size, ctx, preserve);
      }

    private:

      /** @brief Shared implementation of the two resize() overloads. Resizing goes through a temporary host buffer: copy out (if preserving), reallocate device memory if the padded size changed, then copy back. */
      void resize_impl(size_type new_size, viennacl::context ctx, bool preserve = true)
      {
        assert(new_size > 0 && bool("Positive size required when resizing vector!"));

        if (new_size != size_)
        {
          vcl_size_t new_internal_size = viennacl::tools::align_to_multiple<vcl_size_t>(new_size, alignment);

          std::vector<SCALARTYPE> temp(size_);
          if (preserve && size_ > 0)
            fast_copy(*this, temp);
          temp.resize(new_size);  //drop all entries above new_size
          temp.resize(new_internal_size); //enlarge to fit new internal size

          if (new_internal_size != internal_size())
          {
            // reallocation discards old device contents; temp already holds the data to restore
            viennacl::backend::memory_create(elements_, sizeof(SCALARTYPE)*new_internal_size, ctx, NULL);
          }

          fast_copy(temp, *this);
          size_ = new_size;
          internal_size_ = viennacl::tools::align_to_multiple<size_type>(size_, alignment);
          pad();
        }

      }

      size_type       size_;           // visible number of elements
      size_type       start_;          // offset of the first element within the buffer
      difference_type stride_;         // increment between two consecutive elements
      size_type       internal_size_;  // padded buffer length (multiple of 'alignment')
      handle_type elements_;           // backend memory handle holding the data
  }; //vector_base
+
+
+
+  // forward definition in forwards.h!
+  /** @brief A vector class representing a linear memory sequence on the GPU. Inspired by boost::numeric::ublas::vector
+  *
+  *  This is the basic vector type of ViennaCL. It is similar to std::vector and boost::numeric::ublas::vector and supports various linear algebra operations.
+  * By default, the internal length of the vector is padded to a multiple of 'ALIGNMENT' in order to speed up several GPU viennacl::ocl::kernels.
+  *
+  * @tparam SCALARTYPE  The floating point type, either 'float' or 'double'
+  * @tparam ALIGNMENT   The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16, higher values are usually a waste of memory.
+  */
+  template<class SCALARTYPE, unsigned int ALIGNMENT>
+  class vector : public vector_base<SCALARTYPE>
+  {
+    typedef vector<SCALARTYPE, ALIGNMENT>         self_type;
+    typedef vector_base<SCALARTYPE>               base_type;
+
+  public:
+    typedef typename base_type::size_type                  size_type;
+    typedef typename base_type::difference_type            difference_type;
+
+    /** @brief Default constructor in order to be compatible with various containers.
+    */
+    explicit vector() : base_type() { /* Note: One must not call ::init() here because the vector might have been created globally before the backend has become available */ }
+
+    /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT')
+    *
+    * @param vec_size   The length (i.e. size) of the vector.
+    */
+    explicit vector(size_type vec_size) : base_type(vec_size) {}
+
+    /** @brief Constructor allocating memory of the given size within the given (generic) context. */
+    explicit vector(size_type vec_size, viennacl::context ctx) : base_type(vec_size, ctx) {}
+
+    /** @brief Wraps user-provided memory of the given memory type; no allocation or copy is performed. */
+    explicit vector(SCALARTYPE * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, size_type start = 0, difference_type stride = 1)
+        : base_type(ptr_to_mem, mem_type, vec_size, start, stride) {}
+
+#ifdef VIENNACL_WITH_OPENCL
+    /** @brief Create a vector from existing OpenCL memory
+    *
+    * Note: The provided memory must take an eventual ALIGNMENT into account, i.e. existing_mem must be at least of size internal_size()!
+    * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
+    *
+    * @param existing_mem   An OpenCL handle representing the memory
+    * @param vec_size       The size of the vector.
+    */
+    explicit vector(cl_mem existing_mem, size_type vec_size, size_type start = 0, difference_type stride = 1) : base_type(existing_mem, vec_size, start, stride) {}
+
+    /** @brief An explicit constructor for the vector, allocating the given amount of memory (plus a padding specified by 'ALIGNMENT') and the OpenCL context provided
+    *
+    * @param vec_size   The length (i.e. size) of the vector.
+    * @param ctx        The context
+    */
+    explicit vector(size_type vec_size, viennacl::ocl::context const & ctx) : base_type(vec_size, ctx) {}
+#endif
+
+    /** @brief Constructs the vector as the result of evaluating a vector expression. */
+    template <typename LHS, typename RHS, typename OP>
+    vector(vector_expression<const LHS, const RHS, OP> const & proxy) : base_type(proxy) {}
+
+    /** @brief Copy-construction from any vector_base, allocating in the source's context. */
+    vector(const base_type & v) : base_type(v.size(), viennacl::traits::context(v))
+    {
+      if (v.size() > 0)
+        base_type::operator=(v);
+    }
+
+    /** @brief Copy constructor; allocates in the source's context, then assigns the data. */
+    vector(const self_type & v) : base_type(v.size(), viennacl::traits::context(v))
+    {
+      if (v.size() > 0)
+        base_type::operator=(v);
+    }
+
+    /** @brief Creates the vector from the supplied unit vector. */
+    vector(unit_vector<SCALARTYPE> const & v) : base_type(v.size())
+    {
+      // NOTE(review): unlike the zero_vector/scalar_vector constructors below, no context
+      // is forwarded here -- confirm whether unit_vector provides one before changing.
+      if (v.size() > 0)
+        this->operator()(v.index()) = SCALARTYPE(1);
+    }
+
+    /** @brief Creates the vector from the supplied zero vector. */
+    vector(zero_vector<SCALARTYPE> const & v) : base_type(v.size(), v.context())
+    {
+      if (v.size() > 0)
+        viennacl::linalg::vector_assign(*this, SCALARTYPE(0.0));
+    }
+
+    /** @brief Creates the vector from the supplied scalar vector. */
+    vector(scalar_vector<SCALARTYPE> const & v) : base_type(v.size(), v.context())
+    {
+      if (v.size() > 0)
+        viennacl::linalg::vector_assign(*this, v[0]);
+    }
+
+    // the following is used to circumvent an issue with Clang 3.0 when 'using base_type::operator=;' directly
+    template <typename T>
+    self_type & operator=(T const & other)
+    {
+      base_type::operator=(other);
+      return *this;
+    }
+
+    using base_type::operator+=;
+    using base_type::operator-=;
+
+    //enlarge or reduce allocated memory and set unused memory to zero
+    /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'ALIGNMENT'
+    *
+    *  @param new_size  The new size of the vector
+    *  @param preserve  If true, old entries of the vector are preserved, otherwise eventually discarded.
+    */
+    void resize(size_type new_size, bool preserve = true)
+    {
+      base_type::resize(new_size, preserve);
+    }
+
+    /** @brief Resizes the vector, allocating any new memory within the provided context. */
+    void resize(size_type new_size, viennacl::context ctx, bool preserve = true)
+    {
+      base_type::resize(new_size, ctx, preserve);
+    }
+
+    /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy
+    */
+    self_type & fast_swap(self_type & other)
+    {
+      base_type::fast_swap(other);
+      return *this;
+    }
+
+    /** @brief Migrates the vector's data to the provided context. */
+    void switch_memory_context(viennacl::context new_ctx)
+    {
+      base_type::switch_memory_context(new_ctx);
+    }
+
+  }; //vector
+
+  /** @brief Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from viennacl::tie().
+  *
+  *  Two parallel pointer arrays are kept: const_vectors_ always holds all vectors,
+  *  while non_const_vectors_ is only populated when the tuple was built from mutable
+  *  vectors. Hence size() reports 0 for a const-built tuple; use const_size() there.
+  */
+  template <typename ScalarT>
+  class vector_tuple
+  {
+    typedef vector_base<ScalarT>   VectorType;
+
+  public:
+      // 2 vectors
+
+      vector_tuple(VectorType const & v0, VectorType const & v1) : const_vectors_(2), non_const_vectors_()
+      {
+        const_vectors_[0] = &v0;
+        const_vectors_[1] = &v1;
+      }
+      vector_tuple(VectorType       & v0, VectorType       & v1) : const_vectors_(2), non_const_vectors_(2)
+      {
+        const_vectors_[0] = &v0; non_const_vectors_[0] = &v0;
+        const_vectors_[1] = &v1; non_const_vectors_[1] = &v1;
+      }
+
+      // 3 vectors
+
+      vector_tuple(VectorType const & v0, VectorType const & v1, VectorType const & v2) : const_vectors_(3), non_const_vectors_()
+      {
+        const_vectors_[0] = &v0;
+        const_vectors_[1] = &v1;
+        const_vectors_[2] = &v2;
+      }
+      vector_tuple(VectorType       & v0, VectorType       & v1, VectorType       & v2) : const_vectors_(3), non_const_vectors_(3)
+      {
+        const_vectors_[0] = &v0; non_const_vectors_[0] = &v0;
+        const_vectors_[1] = &v1; non_const_vectors_[1] = &v1;
+        const_vectors_[2] = &v2; non_const_vectors_[2] = &v2;
+      }
+
+      // 4 vectors
+
+      vector_tuple(VectorType const & v0, VectorType const & v1, VectorType const & v2, VectorType const & v3) : const_vectors_(4), non_const_vectors_()
+      {
+        const_vectors_[0] = &v0;
+        const_vectors_[1] = &v1;
+        const_vectors_[2] = &v2;
+        const_vectors_[3] = &v3;
+      }
+      vector_tuple(VectorType       & v0, VectorType       & v1, VectorType       & v2, VectorType       & v3) : const_vectors_(4), non_const_vectors_(4)
+      {
+        const_vectors_[0] = &v0; non_const_vectors_[0] = &v0;
+        const_vectors_[1] = &v1; non_const_vectors_[1] = &v1;
+        const_vectors_[2] = &v2; non_const_vectors_[2] = &v2;
+        const_vectors_[3] = &v3; non_const_vectors_[3] = &v3;
+      }
+
+      // add more overloads here
+
+      // generic interface:
+
+      vector_tuple(std::vector<VectorType const *> const & vecs) : const_vectors_(vecs.size()), non_const_vectors_()
+      {
+        for (vcl_size_t i=0; i<vecs.size(); ++i)
+          const_vectors_[i] = vecs[i];
+      }
+
+      vector_tuple(std::vector<VectorType *> const & vecs) : const_vectors_(vecs.size()), non_const_vectors_(vecs.size())
+      {
+        for (vcl_size_t i=0; i<vecs.size(); ++i)
+        {
+              const_vectors_[i] = vecs[i];
+          non_const_vectors_[i] = vecs[i];
+        }
+      }
+
+      // number of mutable vectors (0 for a tuple built from const references)
+      vcl_size_t size()       const { return non_const_vectors_.size(); }
+      // total number of vectors held, mutable or not
+      vcl_size_t const_size() const { return     const_vectors_.size(); }
+
+      // at() throws std::out_of_range (via std::vector::at) for invalid indices
+      VectorType       &       at(vcl_size_t i) const { return *(non_const_vectors_.at(i)); }
+      VectorType const & const_at(vcl_size_t i) const { return     *(const_vectors_.at(i)); }
+
+  private:
+    std::vector<VectorType const *>   const_vectors_;
+    std::vector<VectorType *>         non_const_vectors_;
+  };
+
+  // Convenience tie() overloads: bundle 2-4 vectors into a vector_tuple.
+  // The const overloads yield tuples with size() == 0 but const_size() == N,
+  // since only the const pointer array is populated.
+  // 2 args
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0, vector_base<ScalarT> const & v1) { return vector_tuple<ScalarT>(v0, v1); }
+
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT>       & v0, vector_base<ScalarT>       & v1) { return vector_tuple<ScalarT>(v0, v1); }
+
+  // 3 args
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0, vector_base<ScalarT> const & v1, vector_base<ScalarT> const & v2) { return vector_tuple<ScalarT>(v0, v1, v2); }
+
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT>       & v0, vector_base<ScalarT>       & v1, vector_base<ScalarT>       & v2) { return vector_tuple<ScalarT>(v0, v1, v2); }
+
+  // 4 args
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0, vector_base<ScalarT> const & v1, vector_base<ScalarT> const & v2, vector_base<ScalarT> const & v3)
+  {
+    return vector_tuple<ScalarT>(v0, v1, v2, v3);
+  }
+
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT>       & v0, vector_base<ScalarT>       & v1, vector_base<ScalarT>       & v2, vector_base<ScalarT>       & v3)
+  {
+    return vector_tuple<ScalarT>(v0, v1, v2, v3);
+  }
+
+  // 5 args
+  /** @brief Bundles five const vectors via the generic pointer-vector constructor of vector_tuple. */
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT> const & v0,
+                            vector_base<ScalarT> const & v1,
+                            vector_base<ScalarT> const & v2,
+                            vector_base<ScalarT> const & v3,
+                            vector_base<ScalarT> const & v4)
+  {
+    typedef vector_base<ScalarT> const *       VectorPointerType;
+    std::vector<VectorPointerType> vec(5);
+    vec[0] = &v0;
+    vec[1] = &v1;
+    vec[2] = &v2;
+    vec[3] = &v3;
+    vec[4] = &v4;
+    return vector_tuple<ScalarT>(vec);
+  }
+
+  /** @brief Bundles five mutable vectors via the generic pointer-vector constructor of vector_tuple. */
+  template <typename ScalarT>
+  vector_tuple<ScalarT> tie(vector_base<ScalarT> & v0,
+                            vector_base<ScalarT> & v1,
+                            vector_base<ScalarT> & v2,
+                            vector_base<ScalarT> & v3,
+                            vector_base<ScalarT> & v4)
+  {
+    typedef vector_base<ScalarT> *       VectorPointerType;
+    std::vector<VectorPointerType> vec(5);
+    vec[0] = &v0;
+    vec[1] = &v1;
+    vec[2] = &v2;
+    vec[3] = &v3;
+    vec[4] = &v4;
+    return vector_tuple<ScalarT>(vec);
+  }
+
+  // TODO: Add more arguments to tie() here. Maybe use some preprocessor magic to accomplish this.
+
+  //
+  //////////////////// Copy from GPU to CPU //////////////////////////////////
+  //
+
+
+  /** @brief STL-like transfer of a GPU vector to the CPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
+  *
+  * This method is faster than the plain copy() function, because entries are
+  * directly written to the cpu vector, starting with &(*cpu.begin()). However,
+  * keep in mind that the cpu type MUST represent a linear piece of
+  * memory, otherwise you will run into undefined behavior.
+  *
+  * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
+  * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
+  * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+                  const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+                  CPU_ITERATOR cpu_begin )
+  {
+    if (gpu_begin != gpu_end)
+    {
+      if (gpu_begin.stride() == 1)
+      {
+        // Contiguous case: one bulk read straight into the destination memory.
+        viennacl::backend::memory_read(gpu_begin.handle(),
+                                      sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                      sizeof(SCALARTYPE)*gpu_begin.stride() * (gpu_end - gpu_begin),
+                                      &(*cpu_begin));
+      }
+      else
+      {
+        // Strided case: read the whole covered range into a host buffer, then gather
+        // every stride-th entry into the destination.
+        vcl_size_t gpu_size = (gpu_end - gpu_begin);
+        std::vector<SCALARTYPE> temp_buffer(gpu_begin.stride() * gpu_size);
+        viennacl::backend::memory_read(gpu_begin.handle(), sizeof(SCALARTYPE)*gpu_begin.offset(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+
+        for (vcl_size_t i=0; i<gpu_size; ++i)
+        {
+          (&(*cpu_begin))[i] = temp_buffer[i * gpu_begin.stride()];
+        }
+      }
+    }
+  }
+
+  /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  *
+  * @param gpu_vec    A gpu vector.
+  * @param cpu_vec    The cpu vector. Type requirements: Output iterator pointing to entries linear in memory can be obtained via member function .begin()
+  */
+  template <typename NumericT, typename CPUVECTOR>
+  void fast_copy(vector_base<NumericT> const & gpu_vec, CPUVECTOR & cpu_vec )
+  {
+    viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  }
+
+
+  /** @brief Asynchronous version of fast_copy(), copying data from device to host. The host iterator cpu_begin needs to reside in a linear piece of memory, such as e.g. for std::vector.
+  *
+  * This method allows for overlapping data transfer with host computation and returns immediately if the gpu vector has a unit-stride.
+  * In order to wait for the transfer to complete, use viennacl::backend::finish().
+  * Note that data pointed to by cpu_begin must not be modified prior to completion of the transfer.
+  *
+  * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
+  * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
+  * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void async_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+                  const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+                  CPU_ITERATOR cpu_begin )
+  {
+    if (gpu_begin != gpu_end)
+    {
+      if (gpu_begin.stride() == 1)
+      {
+        // Trailing 'true' flag requests a non-blocking read from the backend.
+        viennacl::backend::memory_read(gpu_begin.handle(),
+                                       sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                       sizeof(SCALARTYPE)*gpu_begin.stride() * (gpu_end - gpu_begin),
+                                       &(*cpu_begin),
+                                       true);
+      }
+      else // no async copy possible, so fall-back to fast_copy
+        fast_copy(gpu_begin, gpu_end, cpu_begin);
+    }
+  }
+
+  /** @brief Asynchronous transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::async_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  *
+  * @param gpu_vec    A gpu vector.
+  * @param cpu_vec    The cpu vector. Type requirements: Output iterator pointing to entries linear in memory can be obtained via member function .begin()
+  */
+  template <typename NumericT, typename CPUVECTOR>
+  void async_copy(vector_base<NumericT> const & gpu_vec, CPUVECTOR & cpu_vec )
+  {
+    viennacl::async_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  }
+
+
+  /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
+  *
+  * @param gpu_begin  GPU constant iterator pointing to the beginning of the gpu vector (STL-like)
+  * @param gpu_end    GPU constant iterator pointing to the end of the vector (STL-like)
+  * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+            const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+            CPU_ITERATOR cpu_begin )
+  {
+    assert(gpu_end - gpu_begin >= 0 && bool("Iterators incompatible"));
+    if (gpu_end - gpu_begin != 0)
+    {
+      // Stage the device data in a contiguous host buffer first, since cpu_begin
+      // may point into non-linear memory (only std::copy semantics are required).
+      std::vector<SCALARTYPE> temp_buffer(gpu_end - gpu_begin);
+      fast_copy(gpu_begin, gpu_end, temp_buffer.begin());
+
+      //now copy entries to cpu_vec:
+      std::copy(temp_buffer.begin(), temp_buffer.end(), cpu_begin);
+    }
+  }
+
+  /** @brief STL-like transfer for the entries of a GPU vector to the CPU. The cpu type does not need to lie in a linear piece of memory.
+  *
+  * @param gpu_begin  GPU iterator pointing to the beginning of the gpu vector (STL-like)
+  * @param gpu_end    GPU iterator pointing to the end of the vector (STL-like)
+  * @param cpu_begin  Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void copy(const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+            const vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+            CPU_ITERATOR cpu_begin )
+
+  {
+    // Delegate to the const-iterator overload above.
+    viennacl::copy(const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_begin),
+                    const_vector_iterator<SCALARTYPE, ALIGNMENT>(gpu_end),
+                    cpu_begin);
+  }
+
+  /** @brief Transfer from a gpu vector to a cpu vector. Convenience wrapper for viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  *
+  * @param gpu_vec    A gpu vector
+  * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
+  */
+  template <typename NumericT, typename CPUVECTOR>
+  void copy(vector_base<NumericT> const & gpu_vec, CPUVECTOR & cpu_vec )
+  {
+    viennacl::copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  }
+
+
+
+  #ifdef VIENNACL_WITH_EIGEN
+  /** @brief Transfer from a ViennaCL vector to an Eigen float vector. Eigen storage is linear, so fast_copy() applies. */
+  template <unsigned int ALIGNMENT>
+  void copy(vector<float, ALIGNMENT> const & gpu_vec,
+            Eigen::VectorXf & eigen_vec)
+  {
+    viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
+  }
+
+  /** @brief Transfer from a ViennaCL vector to an Eigen double vector.
+  *
+  * Fix: the source vector is now taken by const reference, matching the float
+  * overload above; previously a const gpu vector could not be copied to Eigen.
+  */
+  template <unsigned int ALIGNMENT>
+  void copy(vector<double, ALIGNMENT> const & gpu_vec,
+            Eigen::VectorXd & eigen_vec)
+  {
+    viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), &(eigen_vec[0]));
+  }
+  #endif
+
+
+  //
+  //////////////////// Copy from CPU to GPU //////////////////////////////////
+  //
+
+  /** @brief STL-like transfer of a CPU vector to the GPU. The cpu type is assumed to reside in a linear piece of memory, such as e.g. for std::vector.
+  *
+  * This method is faster than the plain copy() function, because entries are
+  * directly read from the cpu vector, starting with &(*cpu.begin()). However,
+  * keep in mind that the cpu type MUST represent a linear piece of
+  * memory, otherwise you will run into undefined behavior.
+  *
+  * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
+  * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
+  * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
+  */
+  template <typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+  void fast_copy(CPU_ITERATOR const & cpu_begin,
+                  CPU_ITERATOR const & cpu_end,
+                  vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
+  {
+    if (cpu_end - cpu_begin > 0)
+    {
+      if (gpu_begin.stride() == 1)
+      {
+        // Contiguous case: one bulk write from the source memory.
+        viennacl::backend::memory_write(gpu_begin.handle(),
+                                        sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                        sizeof(SCALARTYPE)*gpu_begin.stride() * (cpu_end - cpu_begin), &(*cpu_begin));
+      }
+      else //writing to slice:
+      {
+        // Read-modify-write: fetch the whole covered range so that the entries
+        // between the stride positions are preserved, scatter, then write back.
+        vcl_size_t cpu_size = (cpu_end - cpu_begin);
+        std::vector<SCALARTYPE> temp_buffer(gpu_begin.stride() * cpu_size);
+
+        viennacl::backend::memory_read(gpu_begin.handle(), sizeof(SCALARTYPE)*gpu_begin.offset(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+
+        for (vcl_size_t i=0; i<cpu_size; ++i)
+          temp_buffer[i * gpu_begin.stride()] = (&(*cpu_begin))[i];
+
+        viennacl::backend::memory_write(gpu_begin.handle(), sizeof(SCALARTYPE)*gpu_begin.offset(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+      }
+    }
+  }
+
+
+  /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  *
+  * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+  * @param gpu_vec    The gpu vector.
+  */
+  template <typename CPUVECTOR, typename NumericT>
+  void fast_copy(const CPUVECTOR & cpu_vec, vector_base<NumericT> & gpu_vec)
+  {
+    viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  }
+
+  /** @brief Asynchronous version of fast_copy(), copying data from host to device. The host iterator cpu_begin needs to reside in a linear piece of memory, such as e.g. for std::vector.
+  *
+  * This method allows for overlapping data transfer with host computation and returns immediately if the gpu vector has a unit-stride.
+  * In order to wait for the transfer to complete, use viennacl::backend::finish().
+  * Note that data pointed to by cpu_begin must not be modified prior to completion of the transfer.
+  *
+  * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
+  * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
+  * @param gpu_begin  Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
+  */
+  template <typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+  void async_copy(CPU_ITERATOR const & cpu_begin,
+                  CPU_ITERATOR const & cpu_end,
+                  vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
+  {
+    if (cpu_end - cpu_begin > 0)
+    {
+      if (gpu_begin.stride() == 1)
+      {
+        // Trailing 'true' flag requests a non-blocking write from the backend.
+        viennacl::backend::memory_write(gpu_begin.handle(),
+                                        sizeof(SCALARTYPE)*gpu_begin.offset(),
+                                        sizeof(SCALARTYPE)*gpu_begin.stride() * (cpu_end - cpu_begin), &(*cpu_begin),
+                                        true);
+      }
+      else // fallback to blocking copy. There's nothing we can do to prevent this
+        fast_copy(cpu_begin, cpu_end, gpu_begin);
+    }
+  }
+
+
+  /** @brief Asynchronous transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::async_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  *
+  * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+  * @param gpu_vec    The gpu vector.
+  */
+  template <typename CPUVECTOR, typename NumericT>
+  void async_copy(const CPUVECTOR & cpu_vec, vector_base<NumericT> & gpu_vec)
+  {
+    viennacl::async_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  }
+
+  //from cpu to gpu. Safe assumption: cpu_vector does not necessarily occupy a linear memory segment, but is not larger than the allocated memory on the GPU
+  /** @brief STL-like transfer for the entries of a CPU vector to the GPU. The cpu type does not need to lie in a linear piece of memory.
+  *
+  * @param cpu_begin  CPU iterator pointing to the beginning of the cpu vector (STL-like)
+  * @param cpu_end    CPU iterator pointing to the end of the vector (STL-like)
+  * @param gpu_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+  void copy(CPU_ITERATOR const & cpu_begin,
+            CPU_ITERATOR const & cpu_end,
+            vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin)
+  {
+    // Fix: assert '>= 0' (instead of '> 0') so that a legal empty range does not
+    // trip the assertion in debug builds; the guard below already handles it, and
+    // the GPU->CPU counterpart of this function uses the same '>= 0' check.
+    assert(cpu_end - cpu_begin >= 0 && bool("Iterators incompatible"));
+    if (cpu_begin != cpu_end)
+    {
+      //we require that the size of the gpu_vector is larger or equal to the cpu-size
+      std::vector<SCALARTYPE> temp_buffer(cpu_end - cpu_begin);
+      std::copy(cpu_begin, cpu_end, temp_buffer.begin());
+      viennacl::fast_copy(temp_buffer.begin(), temp_buffer.end(), gpu_begin);
+    }
+  }
+
+  // for things like copy(std_vec.begin(), std_vec.end(), vcl_vec.begin() + 1);
+
+  /** @brief Transfer from a cpu vector to a gpu vector. Convenience wrapper for viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  *
+  * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+  * @param gpu_vec    The gpu vector.
+  */
+  template <typename CPUVECTOR, typename T>
+  void copy(const CPUVECTOR & cpu_vec, vector_base<T> & gpu_vec)
+  {
+    viennacl::copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  }
+
+
+  #ifdef VIENNACL_WITH_EIGEN
+  /** @brief Transfer from an Eigen float vector to a ViennaCL vector. Entries are staged in a std::vector first. */
+  template <unsigned int ALIGNMENT>
+  void copy(Eigen::VectorXf const & eigen_vec,
+            vector<float, ALIGNMENT> & gpu_vec)
+  {
+    std::vector<float> entries(eigen_vec.size());
+    for (vcl_size_t i = 0; i<entries.size(); ++i)
+      entries[i] = eigen_vec(i);
+    viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
+  }
+
+  /** @brief Transfer from an Eigen double vector to a ViennaCL vector. Entries are staged in a std::vector first. */
+  template <unsigned int ALIGNMENT>
+  void copy(Eigen::VectorXd const & eigen_vec,
+            vector<double, ALIGNMENT> & gpu_vec)
+  {
+    std::vector<double> entries(eigen_vec.size());
+    for (vcl_size_t i = 0; i<entries.size(); ++i)
+      entries[i] = eigen_vec(i);
+    viennacl::fast_copy(entries.begin(), entries.end(), gpu_vec.begin());
+  }
+  #endif
+
+
+
+  //
+  //////////////////// Copy from GPU to GPU //////////////////////////////////
+  //
+  /** @brief Copy (parts of a) GPU vector to another GPU vector
+  *
+  * Only unit-stride ranges are supported: the transfer is a single device-to-device
+  * memory_copy whose byte count is derived from the source iterators' offsets.
+  *
+  * @param gpu_src_begin    GPU iterator pointing to the beginning of the gpu vector (STL-like)
+  * @param gpu_src_end      GPU iterator pointing to the end of the vector (STL-like)
+  * @param gpu_dest_begin   Output iterator for the gpu vector. The gpu_dest vector must be at least as long as the gpu_src vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+  void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+            const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+            vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
+  {
+    assert(gpu_src_end - gpu_src_begin >= 0);
+    // NOTE(review): only the source stride is asserted here; a strided destination
+    // falls through to the assert(false) branch below at runtime.
+    assert(gpu_src_begin.stride() == 1 && bool("ViennaCL ERROR: copy() for GPU->GPU not implemented for slices! Use operator= instead for the moment."));
+
+    if (gpu_src_begin.stride() == 1 && gpu_dest_begin.stride() == 1)
+    {
+      if (gpu_src_begin != gpu_src_end)
+        viennacl::backend::memory_copy(gpu_src_begin.handle(), gpu_dest_begin.handle(),
+                                        sizeof(SCALARTYPE) * gpu_src_begin.offset(),
+                                        sizeof(SCALARTYPE) * gpu_dest_begin.offset(),
+                                        sizeof(SCALARTYPE) * (gpu_src_end.offset() - gpu_src_begin.offset()));
+    }
+    else
+    {
+      assert( false && bool("not implemented yet"));
+    }
+  }
+
+  /** @brief Copy (parts of a) GPU vector to another GPU vector
+  *
+  * @param gpu_src_begin   GPU iterator pointing to the beginning of the gpu vector (STL-like)
+  * @param gpu_src_end     GPU iterator pointing to the end of the vector (STL-like)
+  * @param gpu_dest_begin  Output iterator for the gpu vector. The gpu vector must be at least as long as the cpu vector!
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+  void copy(vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+            vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+            vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin)
+  {
+    // Delegate to the const-iterator overload above.
+    viennacl::copy(static_cast<const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> >(gpu_src_begin),
+                    static_cast<const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> >(gpu_src_end),
+                    gpu_dest_begin);
+  }
+
+  /** @brief Transfer from a ViennaCL vector to another ViennaCL vector. Convenience wrapper for viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
+  *
+  * @param gpu_src_vec    A gpu vector
+  * @param gpu_dest_vec   The destination gpu vector. Must be at least as long as the source vector.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+  void copy(vector<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_vec,
+            vector<SCALARTYPE, ALIGNMENT_DEST> & gpu_dest_vec )
+  {
+    viennacl::copy(gpu_src_vec.begin(), gpu_src_vec.end(), gpu_dest_vec.begin());
+  }
+
+
+
+
+
+
+  //global functions for handling vectors:
+  /** @brief Output stream. Output format is ublas compatible.
+  *
+  * Note: the whole vector is transferred to the host before printing.
+  *
+  * @param os   STL output stream
+  * @param val  The vector that should be printed
+  */
+  template <typename T>
+  std::ostream & operator<<(std::ostream & os, vector_base<T> const & val)
+  {
+    std::vector<T> tmp(val.size());
+    viennacl::copy(val.begin(), val.end(), tmp.begin());
+    os << "[" << val.size() << "](";
+    for (typename std::vector<T>::size_type i=0; i<val.size(); ++i)
+    {
+      if (i > 0)
+        os << ",";
+      os << tmp[i];
+    }
+    os << ")";
+    return os;
+  }
+
+  /** @brief Output stream for vector expressions: the expression is evaluated into a temporary vector, which is then printed. */
+  template <typename LHS, typename RHS, typename OP>
+  std::ostream & operator<<(std::ostream & os, vector_expression<LHS, RHS, OP> const & proxy)
+
+  {
+    typedef typename viennacl::result_of::cpu_value_type<typename LHS::value_type>::type ScalarType;
+    viennacl::vector<ScalarType> result = proxy;  // forces evaluation of the expression
+    os << result;
+    return os;
+  }
+
+  /** @brief Swaps the contents of two vectors, data is copied
+  *
+  * @param vec1   The first vector
+  * @param vec2   The second vector
+  */
+  template <typename T>
+  void swap(vector_base<T> & vec1, vector_base<T> & vec2)
+  {
+    viennacl::linalg::vector_swap(vec1, vec2);
+  }
+
+  /** @brief Swaps the content of two vectors by swapping OpenCL handles only, NO data is copied
+  *
+  * @param v1   The first vector
+  * @param v2   The second vector
+  * @return     Reference to v1 after the swap
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  vector<SCALARTYPE, ALIGNMENT> & fast_swap(vector<SCALARTYPE, ALIGNMENT> & v1,
+                                            vector<SCALARTYPE, ALIGNMENT> & v2)
+  {
+    return v1.fast_swap(v2);
+  }
+
+
+
+
+
+  //
+  //
+  ////////// operations /////////////////////////////////////////////////////////////////////////////////
+  //
+  //
+
+
+  //
+  // operator *=
+  //
+
+  /** @brief Scales this vector by a scalar value (S1 is constrained via is_any_scalar, so host and device scalars are accepted)
+  *
+  * @param v1       The vector to be scaled in place
+  * @param gpu_val  The scaling factor
+  * @return         Reference to v1
+  */
+  template <typename T, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                vector_base<T> &
+                              >::type
+  operator *= (vector_base<T> & v1, S1 const & gpu_val)
+  {
+    if (v1.size() > 0)
+      viennacl::linalg::av(v1,
+                           v1, gpu_val, 1, false, (viennacl::is_flip_sign_scalar<S1>::value ? true : false));
+    return v1;
+  }
+
+
+  //
+  // operator /=
+  //
+
+
+  /** @brief Scales this vector by the reciprocal of a scalar value (S1 is constrained via is_any_scalar)
+  *
+  * The 'true' flag passed to av() requests division by gpu_val (cf. operator*=, which passes false).
+  *
+  * @param v1       The vector to be scaled in place
+  * @param gpu_val  The divisor
+  * @return         Reference to v1
+  */
+  template <typename T, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                vector_base<T> &
+                              >::type
+  operator /= (vector_base<T> & v1, S1 const & gpu_val)
+  {
+    if (v1.size() > 0)
+      viennacl::linalg::av(v1,
+                           v1, gpu_val, 1, true, (viennacl::is_flip_sign_scalar<S1>::value ? true : false));
+    return v1;
+  }
+
+
+  //
+  // operator +
+  //
+
+
+  /** @brief Operator overload for the addition of two vector expressions.
+  *
+  * @param proxy1  Left hand side vector expression
+  * @param proxy2  Right hand side vector expression
+  * @return        An expression template node representing proxy1 + proxy2 (no evaluation happens here)
+  */
+  template <typename LHS1, typename RHS1, typename OP1,
+            typename LHS2, typename RHS2, typename OP2>
+  vector_expression< const vector_expression< LHS1, RHS1, OP1>,
+                     const vector_expression< LHS2, RHS2, OP2>,
+                     viennacl::op_add>
+  operator + (vector_expression<LHS1, RHS1, OP1> const & proxy1,
+              vector_expression<LHS2, RHS2, OP2> const & proxy2)
+  {
+    assert(proxy1.size() == proxy2.size() && bool("Incompatible vector sizes!"));
+    return   vector_expression< const vector_expression<LHS1, RHS1, OP1>,
+                                const vector_expression<LHS2, RHS2, OP2>,
+                                viennacl::op_add>(proxy1, proxy2);
+  }
+
+  /** @brief Operator overload for the addition of a vector expression with a vector or another vector expression. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+  *
+  * @param proxy   Left hand side vector expression
+  * @param vec     Right hand side vector (also -range and -slice is allowed)
+  * @return        An expression template node representing proxy + vec
+  */
+  template <typename LHS, typename RHS, typename OP, typename T>
+  vector_expression< const vector_expression<LHS, RHS, OP>,
+                     const vector_base<T>,
+                     viennacl::op_add>
+  operator + (vector_expression<LHS, RHS, OP> const & proxy,
+              vector_base<T> const & vec)
+  {
+    assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+    return vector_expression< const vector_expression<LHS, RHS, OP>,
+                              const vector_base<T>,
+                              viennacl::op_add>(proxy, vec);
+  }
+
+  /** @brief Operator overload for the addition of a vector with a vector expression. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+  *
+  * @param vec     Left hand side vector (also -range and -slice is allowed)
+  * @param proxy   Right hand side vector expression
+  * @return        An expression template node representing vec + proxy
+  */
+  template <typename T, typename LHS, typename RHS, typename OP>
+  vector_expression< const vector_base<T>,
+                     const vector_expression<LHS, RHS, OP>,
+                     viennacl::op_add>
+  operator + (vector_base<T> const & vec,
+              vector_expression<LHS, RHS, OP> const & proxy)
+  {
+    assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+    return vector_expression< const vector_base<T>,
+                              const vector_expression<LHS, RHS, OP>,
+                              viennacl::op_add>(vec, proxy);
+  }
+
+  /** @brief Returns an expression template object for adding up two vectors, i.e. v1 + v2
+  *
+  * @param v1  Left hand side vector
+  * @param v2  Right hand side vector
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const vector_base<T>, op_add>
+  operator + (const vector_base<T> & v1, const vector_base<T> & v2)
+  {
+    return vector_expression< const vector_base<T>, const vector_base<T>, op_add>(v1, v2);
+  }
+
+
+
+  //
+  // operator -
+  //
+
+  /** @brief Operator overload for the subtraction of two vector expressions.
+  *
+  * @param proxy1  Left hand side vector expression
+  * @param proxy2  Right hand side vector expression
+  * @return        An expression template node representing proxy1 - proxy2 (no evaluation happens here)
+  */
+  template <typename LHS1, typename RHS1, typename OP1,
+            typename LHS2, typename RHS2, typename OP2>
+  vector_expression< const vector_expression< LHS1, RHS1, OP1>,
+                     const vector_expression< LHS2, RHS2, OP2>,
+                     viennacl::op_sub>
+  operator - (vector_expression<LHS1, RHS1, OP1> const & proxy1,
+              vector_expression<LHS2, RHS2, OP2> const & proxy2)
+  {
+    assert(proxy1.size() == proxy2.size() && bool("Incompatible vector sizes!"));
+    return   vector_expression< const vector_expression<LHS1, RHS1, OP1>,
+                                const vector_expression<LHS2, RHS2, OP2>,
+                                viennacl::op_sub>(proxy1, proxy2);
+  }
+
+
+  /** @brief Operator overload for the subtraction of a vector expression with a vector or another vector expression. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+  *
+  * @param proxy   Left hand side vector expression
+  * @param vec     Right hand side vector (also -range and -slice is allowed)
+  * @return        An expression template node representing proxy - vec
+  */
+  template <typename LHS, typename RHS, typename OP, typename T>
+  vector_expression< const vector_expression<LHS, RHS, OP>,
+                     const vector_base<T>,
+                     viennacl::op_sub>
+  operator - (vector_expression<LHS, RHS, OP> const & proxy,
+              vector_base<T> const & vec)
+  {
+    assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+    return vector_expression< const vector_expression<LHS, RHS, OP>,
+                              const vector_base<T>,
+                              viennacl::op_sub>(proxy, vec);
+  }
+
+  /** @brief Operator overload for the subtraction of a vector expression from a vector. This is the default implementation for all cases that are too complex in order to be covered within a single kernel, hence a temporary vector is created.
+  *
+  * @param vec     Left hand side vector (also -range and -slice is allowed)
+  * @param proxy   Right hand side vector expression
+  * @return        An expression template node representing vec - proxy
+  */
+  template <typename T, typename LHS, typename RHS, typename OP>
+  vector_expression< const vector_base<T>,
+                     const vector_expression<LHS, RHS, OP>,
+                     viennacl::op_sub>
+  operator - (vector_base<T> const & vec,
+              vector_expression<LHS, RHS, OP> const & proxy)
+  {
+    assert(proxy.size() == vec.size() && bool("Incompatible vector sizes!"));
+    return vector_expression< const vector_base<T>,
+                              const vector_expression<LHS, RHS, OP>,
+                              viennacl::op_sub>(vec, proxy);
+  }
+
+  /** @brief Returns an expression template object for subtracting two vectors, i.e. v1 - v2
+  *
+  * @param v1  Left hand side vector
+  * @param v2  Right hand side vector
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const vector_base<T>, op_sub>
+  operator - (const vector_base<T> & v1, const vector_base<T> & v2)
+  {
+    return vector_expression< const vector_base<T>, const vector_base<T>, op_sub>(v1, v2);
+  }
+
+
+  //
+  // operator *
+  //
+
+
+  /** @brief Operator overload for the expression alpha * v1, where alpha is a host or device scalar (S1 constrained via is_any_scalar) and v1 is a ViennaCL vector.
+  *
+  * @param value   The scalar factor
+  * @param vec     A ViennaCL vector
+  * @return        An expression template node representing the scaled vector
+  */
+  template <typename S1, typename T>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                vector_expression< const vector_base<T>, const S1, op_mult> >::type
+  operator * (S1 const & value, vector_base<T> const & vec)
+  {
+    return vector_expression< const vector_base<T>, const S1, op_mult>(vec, value);
+  }
+
+  /** @brief Operator overload for the expression alpha * v1, where alpha is a char
+  *
+  * @param value   The host scalar (char); converted to the vector's value type T
+  * @param vec     A ViennaCL vector
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const T, op_mult>
+  operator * (char value, vector_base<T> const & vec)
+  {
+    return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+  }
+
+  /** @brief Operator overload for the expression alpha * v1, where alpha is a short
+  *
+  * @param value   The host scalar (short); converted to the vector's value type T
+  * @param vec     A ViennaCL vector
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const T, op_mult>
+  operator * (short value, vector_base<T> const & vec)
+  {
+    return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+  }
+
+  /** @brief Operator overload for the expression alpha * v1, where alpha is a int
+  *
+  * @param value   The host scalar (int); converted to the vector's value type T
+  * @param vec     A ViennaCL vector
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const T, op_mult>
+  operator * (int value, vector_base<T> const & vec)
+  {
+    return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+  }
+
+  /** @brief Operator overload for the expression alpha * v1, where alpha is a long
+  *
+  * @param value   The host scalar (long); converted to the vector's value type T
+  * @param vec     A ViennaCL vector
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const T, op_mult>
+  operator * (long value, vector_base<T> const & vec)
+  {
+    return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+  }
+
+
+
+
+  /** @brief Operator overload for the expression alpha * v1, where alpha is a scalar expression and v1 is a ViennaCL vector.
+  *
+  * @param expr    The scalar expression
+  * @param vec     A ViennaCL vector
+  * @return        An expression template node; the scalar expression is not evaluated here
+  */
+  template <typename LHS, typename RHS, typename OP, typename T>
+  vector_expression< const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult>
+  operator * (scalar_expression<LHS, RHS, OP> const & expr, vector_base<T> const & vec)
+  {
+    return vector_expression< const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult>(vec, expr);
+  }
+
+  /** @brief Scales the vector by a scalar 'alpha' and returns an expression template
+  *
+  * @param vec     A ViennaCL vector
+  * @param value   The scalar factor (host or device scalar, constrained via is_any_scalar)
+  */
+  template <typename T, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                vector_expression< const vector_base<T>, const S1, op_mult> >::type
+  operator * (vector_base<T> const & vec, S1 const & value)
+  {
+    return vector_expression< const vector_base<T>, const S1, op_mult>(vec, value);
+  }
+
+  /** @brief Scales the vector by a host scalar of the vector's own value type T and returns an expression template
+  *
+  * @param vec     A ViennaCL vector
+  * @param value   The scalar factor
+  */
+  template <typename T>
+  vector_expression< const vector_base<T>, const T, op_mult>
+  operator * (vector_base<T> const & vec, T const & value)
+  {
+    return vector_expression< const vector_base<T>, const T, op_mult>(vec, value);
+  }
+
+  /** @brief Operator overload for the multiplication of a vector expression with a scalar from the right, e.g. (beta * vec1) * alpha. Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the right.
+  *
+  * @param proxy   Left hand side vector expression
+  * @param val     Right hand side scalar
+  */
+  template <typename LHS, typename RHS, typename OP, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>  >::type
+  operator * (vector_expression< LHS, RHS, OP> const & proxy,
+              S1 const & val)
+  {
+    return viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+  }
+
+  /** @brief Operator overload for the multiplication of a vector expression with a ViennaCL scalar from the left, e.g. alpha * (beta * vec1). Here, beta * vec1 is wrapped into a vector_expression and then multiplied with alpha from the left.
+  *
+  * @param val     Left hand side scalar
+  * @param proxy   Right hand side vector expression
+  */
+  template <typename S1, typename LHS, typename RHS, typename OP>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>  >::type
+  operator * (S1 const & val,
+              vector_expression<LHS, RHS, OP> const & proxy)
+  {
+    return viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_mult>(proxy, val);
+  }
+
+  //
+  // operator /
+  //
+
+  /** @brief Operator overload for the division of a vector expression by a scalar from the right, e.g. (beta * vec1) / alpha. Here, beta * vec1 is wrapped into a vector_expression and then divided by alpha.
+  *
+  * @param proxy   Left hand side vector expression
+  * @param val     Right hand side scalar (divisor)
+  */
+  template <typename S1, typename LHS, typename RHS, typename OP>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_div>  >::type
+  operator / (vector_expression< LHS, RHS, OP> const & proxy,
+              S1 const & val)
+  {
+    return viennacl::vector_expression<const vector_expression<LHS, RHS, OP>, const S1, op_div>(proxy, val);
+  }
+
+
+  /** @brief Returns an expression template for scaling the vector by a GPU scalar 'alpha'
+  *
+  * @param v1  The vector to be divided
+  * @param s1  The scalar divisor (host or device scalar, constrained via is_any_scalar)
+  */
+  template <typename T, typename S1>
+  typename viennacl::enable_if< viennacl::is_any_scalar<S1>::value,
+                                vector_expression< const vector_base<T>, const S1, op_div> >::type
+  operator / (vector_base<T> const & v1, S1 const & s1)
+  {
+    return vector_expression<const vector_base<T>, const S1, op_div>(v1, s1);
+  }
+
+
+
+  //
+  // Specify available operations:
+  //
+
+  /** \cond */
+
+  namespace linalg
+  {
+    namespace detail
+    {
+      // x = y
+      /** @brief Executor for plain vector assignment x = y (maps to av() with unit scaling factor). */
+      template <typename T>
+      struct op_executor<vector_base<T>, op_assign, vector_base<T> >
+      {
+        static void apply(vector_base<T> & lhs, vector_base<T> const & rhs)
+        {
+          viennacl::linalg::av(lhs, rhs, T(1), 1, false, false);
+        }
+      };
+
+      // x = inner_prod(z, {y0, y1, ...})
+      /** @brief Executor for computing inner products of one vector with a tuple of vectors; the results are written into lhs. */
+      template <typename T>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_base<T>, const vector_tuple<T>, op_inner_prod> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_tuple<T>, op_inner_prod> const & rhs)
+        {
+          viennacl::linalg::inner_prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+        }
+      };
+
+      // x += y
+      /** @brief Executor for in-place vector addition x += y (avbv() with both scaling factors 1). */
+      template <typename T>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_base<T> >
+      {
+        static void apply(vector_base<T> & lhs, vector_base<T> const & rhs)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, false);
+        }
+      };
+
+      // x -= y
+      /** @brief Executor for in-place vector subtraction x -= y (final 'true' flag flips the sign of the rhs contribution, cf. x += y above). */
+      template <typename T>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_base<T> >
+      {
+        static void apply(vector_base<T> & lhs, vector_base<T> const & rhs)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, rhs, T(1), 1, false, true);
+        }
+      };
+
+      ///////////// x  OP  y * alpha ////////////////////////
+
+
+      // x = alpha * y
+      /** @brief Executor for x = alpha * y. Three overloads: a generic scalar_expression (evaluated into a host value first),
+      *         a device scalar<T>, and a host value of type T; all forward to av(). */
+      template <typename T, typename ScalarType>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_base<T>, const ScalarType, op_mult> >
+      {
+        // generic case: ScalarType is a scalar expression
+        template <typename LHS, typename RHS, typename OP>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult> const & proxy)
+        {
+          T alpha = proxy.rhs();  // evaluate the scalar expression once
+          viennacl::linalg::av(lhs, proxy.lhs(), alpha, 1, false, false);
+        }
+
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar<T>, op_mult> const & proxy)
+        {
+          viennacl::linalg::av(lhs, proxy.lhs(), proxy.rhs(), 1, false, false);
+        }
+
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const T, op_mult> const & proxy)
+        {
+          viennacl::linalg::av(lhs, proxy.lhs(), proxy.rhs(), 1, false, false);
+        }
+      };
+
+      // x += alpha * y
+      /** @brief Executor for x += alpha * y. Same three scalar-type overloads as the op_assign variant; all forward to avbv(). */
+      template <typename T, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_base<T>, const ScalarType, op_mult> >
+      {
+        // generic case: ScalarType is a scalar expression
+        template <typename LHS, typename RHS, typename OP>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult> const & proxy)
+        {
+          T alpha = proxy.rhs();  // evaluate the scalar expression once
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), alpha, 1, false, false);
+        }
+
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar<T>, op_mult> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, false);
+        }
+
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const T, op_mult> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, false);
+        }
+      };
+
+      // x -= alpha * y
+      /** @brief Executor for x -= alpha * y. Like the op_inplace_add variant, but the final 'true' flag flips the sign of the rhs term. */
+      template <typename T, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_base<T>, const ScalarType, op_mult> >
+      {
+        // generic case: ScalarType is a scalar expression
+        template <typename LHS, typename RHS, typename OP>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar_expression<LHS, RHS, OP>, op_mult> const & proxy)
+        {
+          T alpha = proxy.rhs();  // evaluate the scalar expression once
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), alpha, 1, false, true);
+        }
+
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const scalar<T>, op_mult> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, true);
+        }
+
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const T, op_mult> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, false, true);
+        }
+      };
+
+
+      ///////////// x  OP  vec_expr * alpha ////////////////////////
+
+      // x = alpha * vec_expr
+      /** @brief Executor for x = alpha * vec_expr: the nested expression is evaluated into a temporary vector first, then the scaled assignment is re-dispatched. */
+      template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+      {
+          static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+          {
+            vector<T> temp(proxy.lhs());
+            lhs = temp * proxy.rhs();
+          }
+      };
+
+      // x += alpha * vec_expr
+      /** @brief Executor for x += alpha * vec_expr: evaluates the nested expression into a temporary, then re-dispatches the scaled in-place addition. */
+      template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+      {
+          static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+          {
+            vector<T> temp(proxy.lhs());
+            lhs += temp * proxy.rhs();
+          }
+      };
+
+      // x -= alpha * vec_expr
+      /** @brief Executor for x -= alpha * vec_expr: evaluates the nested expression into a temporary, then re-dispatches the scaled in-place subtraction. */
+      template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> >
+      {
+          static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_mult> const & proxy)
+          {
+            vector<T> temp(proxy.lhs());
+            lhs -= temp * proxy.rhs();
+          }
+      };
+
+
+      ///////////// x  OP  y / alpha ////////////////////////
+
+      // x = y / alpha
+      /** @brief Executor for x = y / alpha; the 'true' flag to av() requests division by the scalar (cf. x = alpha * y, which passes false). */
+      template <typename T, typename ScalarType>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_base<T>, const ScalarType, op_div> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const ScalarType, op_div> const & proxy)
+        {
+          viennacl::linalg::av(lhs, proxy.lhs(), proxy.rhs(), 1, true, false);
+        }
+      };
+
+      // x += y / alpha
+      /** @brief Executor for x += y / alpha (avbv() with the division flag set for the rhs term). */
+      template <typename T, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_base<T>, const ScalarType, op_div> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const ScalarType, op_div> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, false);
+        }
+      };
+
+      // x -= y / alpha
+      /** @brief Executor for x -= y / alpha (division flag plus sign-flip flag on the rhs term). */
+      template <typename T, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_base<T>, const ScalarType, op_div> >
+      {
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const ScalarType, op_div> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs, lhs, T(1), 1, false, false, proxy.lhs(), proxy.rhs(), 1, true, true);
+        }
+      };
+
+
+      ///////////// x  OP  vec_expr / alpha ////////////////////////
+
+      // x = vec_expr / alpha
+      /** @brief Executor for x = vec_expr / alpha: evaluates the nested expression into a temporary, then re-dispatches the scaled assignment. */
+      template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+      {
+          static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+          {
+            vector<T> temp(proxy.lhs());
+            lhs = temp / proxy.rhs();
+          }
+      };
+
+      // x += vec_expr / alpha
+      /** @brief Executor for x += vec_expr / alpha: evaluates the nested expression into a temporary, then re-dispatches the in-place addition. */
+      template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+      {
+          static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+          {
+            vector<T> temp(proxy.lhs());
+            lhs += temp / proxy.rhs();
+          }
+      };
+
+      // x -= vec_expr / alpha
+      /** @brief Executor for x -= vec_expr / alpha: evaluates the nested expression into a temporary, then re-dispatches the in-place subtraction. */
+      template <typename T, typename LHS, typename RHS, typename OP, typename ScalarType>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> >
+      {
+          static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS, const RHS, OP>, const ScalarType, op_div> const & proxy)
+          {
+            vector<T> temp(proxy.lhs());
+            lhs -= temp / proxy.rhs();
+          }
+      };
+
+
+
+      // generic x = vec_expr1 + vec_expr2:
+      /** @brief Executor for x = (lhs_expr + rhs_expr).
+      *
+      * The templated overload handles arbitrary sub-expressions by splitting the work into
+      * "x = lhs_expr" followed by "x += rhs_expr"; if x aliases either operand, the left
+      * operand is evaluated into a temporary first to avoid clobbering inputs.
+      * The non-template overloads below cover the shapes that map directly onto a single
+      * avbv() call: y + z and all combinations of (alpha*y | y/alpha) + (beta*z | z/beta),
+      * where the fifth/tenth flag selects division instead of multiplication.
+      */
+      template <typename T, typename LHS, typename RHS>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_add> >
+      {
+        // generic x = vec_expr1 + vec_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_add> const & proxy)
+        {
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            // destination overlaps an operand: evaluate into a temporary, then assign
+            vector_base<T> temp(proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_add, RHS>::apply(temp, proxy.rhs());
+            lhs = temp;
+          }
+          else
+          {
+            op_executor<vector_base<T>, op_assign, LHS>::apply(lhs, proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x = y + z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x = alpha * y + z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                  const vector_base<T>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x = y / alpha + z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                  const vector_base<T>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x = y + beta * z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x = y + z / beta
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x = alpha * y + beta * z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x = alpha * y + z / beta
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x = y / alpha + beta * z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const T, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x = y / alpha + z / beta
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const T, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+      };
+
+
+      // generic x += vec_expr1 + vec_expr2:
+      //
+      // Specialization of op_executor dispatching 'x += (e1 + e2)' for vector_base<T>.
+      // The templated apply() handles arbitrary sub-expressions by recursive dispatch;
+      // the non-template overloads below are preferred by overload resolution for the
+      // flat vector / (scalar * vector) / (vector / scalar) operand shapes and fold
+      // the whole update into a single avbv_v() call.
+      template <typename T, typename LHS, typename RHS>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_add> >
+      {
+        // generic x += vec_expr1 + vec_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_add> const & proxy)
+        {
+          // If 'lhs' appears inside either operand expression, accumulating in place
+          // would read partially-updated data, so evaluate (e1 + e2) into a temporary.
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            vector_base<T> temp(proxy.lhs());   // temp = e1
+            op_executor<vector_base<T>, op_inplace_add, RHS>::apply(temp, proxy.rhs());   // temp += e2
+            lhs += temp;
+          }
+          else
+          {
+            // No aliasing: accumulate both operands directly into 'lhs'.
+            op_executor<vector_base<T>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // Each operand of avbv_v() below is passed as (vec, coeff, coeff_len, flag, flag).
+        // Comparing the op_div and subtraction variants across this file, the two flags
+        // appear to be (use-reciprocal-of-coeff, flip-sign) -- NOTE(review): confirm
+        // against the avbv_v declaration, which is outside this hunk.
+
+        // x += y + z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x += alpha * y + z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  const vector_base<T>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x += y / alpha + z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  const vector_base<T>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x += y + beta * z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x += y + z / beta
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x += alpha * y + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x += alpha * y + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x += y / alpha + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x += y / alpha + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+      };
+
+
+
+      // generic x -= vec_expr1 + vec_expr2:
+      //
+      // Specialization of op_executor dispatching 'x -= (e1 + e2)' for vector_base<T>.
+      // The templated apply() handles arbitrary sub-expressions; the non-template
+      // overloads below match the flat operand shapes and map the update onto a
+      // single avbv_v() call, with the fifth per-operand argument set to true to
+      // negate each contribution.
+      template <typename T, typename LHS, typename RHS>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_add> >
+      {
+        // generic x -= vec_expr1 + vec_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_add> const & proxy)
+        {
+          // Guard against 'lhs' occurring inside the operand expressions.
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            vector_base<T> temp(proxy.lhs());   // temp = e1
+            op_executor<vector_base<T>, op_inplace_add, RHS>::apply(temp, proxy.rhs());   // temp = e1 + e2
+            lhs -= temp;
+          }
+          else
+          {
+            // No aliasing: x -= (e1 + e2) distributes to x -= e1; x -= e2.
+            op_executor<vector_base<T>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x -= y + z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x -= alpha * y + z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  const vector_base<T>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x -= y / alpha + z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  const vector_base<T>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x -= y + beta * z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x -= y + z / beta
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x -= alpha * y + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x -= alpha * y + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x -= y / alpha + beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x -= y / alpha + z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_add> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+      };
+
+
+
+      ///////////////////////
+
+
+
+      // generic x = vec_expr1 - vec_expr2:
+      //
+      // Specialization of op_executor dispatching 'x = (e1 - e2)' for vector_base<T>.
+      // The templated apply() handles arbitrary sub-expressions; the non-template
+      // overloads match the flat operand shapes and map the assignment onto a single
+      // avbv() call, negating the second operand via its fifth argument.
+      template <typename T, typename LHS, typename RHS>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_sub> >
+      {
+        // generic x = vec_expr1 - vec_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_sub> const & proxy)
+        {
+          // Guard against 'lhs' occurring inside the operand expressions.
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            vector_base<T> temp(proxy.lhs());   // temp = e1
+            op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());   // temp = e1 - e2
+            lhs = temp;
+          }
+          else
+          {
+            // No aliasing: x = e1, then x -= e2.
+            op_executor<vector_base<T>, op_assign, LHS>::apply(lhs, proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x = y - z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x = alpha * y - z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  const vector_base<T>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x = y / alpha - z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  const vector_base<T>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x = y - beta * z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x = y - z / beta
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs(), T(1), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x = alpha * y - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x = alpha * y - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x = y / alpha - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x = y / alpha - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv(lhs,
+                                 proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                 proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+      };
+
+
+      // generic x += vec_expr1 - vec_expr2:
+      //
+      // Specialization of op_executor dispatching 'x += (e1 - e2)' for vector_base<T>.
+      // The templated apply() handles arbitrary sub-expressions; the non-template
+      // overloads match the flat operand shapes and fold the update into a single
+      // avbv_v() call, negating the second operand via its fifth argument.
+      template <typename T, typename LHS, typename RHS>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_sub> >
+      {
+        // generic x += vec_expr1 - vec_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_sub> const & proxy)
+        {
+          // Guard against 'lhs' occurring inside the operand expressions.
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs());
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            vector_base<T> temp(proxy.lhs());   // temp = e1
+            op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());   // temp = e1 - e2
+            lhs += temp;
+          }
+          else
+          {
+            // No aliasing: x += (e1 - e2) distributes to x += e1; x -= e2.
+            op_executor<vector_base<T>, op_inplace_add, LHS>::apply(lhs, proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x += y - z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x += alpha * y - z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  const vector_base<T>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x += y / alpha - z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  const vector_base<T>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs(), T(1), 1, false, true);
+        }
+
+        // x += y - beta * z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x += y - z / beta
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x += alpha * y - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x += alpha * y - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+
+        // x += y / alpha - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, true);
+        }
+
+        // x += y / alpha - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, false,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, true);
+        }
+      };
+
+
+
+      // generic x -= vec_expr1 - vec_expr2:
+      //
+      // Specialization of op_executor dispatching 'x -= (e1 - e2)' for vector_base<T>.
+      // The templated apply() handles arbitrary sub-expressions; the non-template
+      // overloads match the flat operand shapes and fold the update into a single
+      // avbv_v() call. Note the sign pattern is inverted relative to the += case:
+      // the first operand is negated (fifth argument true) and the second is added.
+      template <typename T, typename LHS, typename RHS>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_sub> >
+      {
+        // generic x -= vec_expr1 - vec_expr2:
+        template <typename LHS1, typename RHS1>
+        static void apply(vector_base<T> & lhs, vector_expression<const LHS1, const RHS1, op_sub> const & proxy)
+        {
+          // Guard against 'lhs' occurring inside the operand expressions.
+          bool op_aliasing_lhs = op_aliasing(lhs, proxy.lhs())
+          bool op_aliasing_rhs = op_aliasing(lhs, proxy.rhs());
+
+          if (op_aliasing_lhs || op_aliasing_rhs)
+          {
+            vector_base<T> temp(proxy.lhs());   // temp = e1
+            op_executor<vector_base<T>, op_inplace_sub, RHS>::apply(temp, proxy.rhs());   // temp = e1 - e2
+            lhs -= temp;
+          }
+          else
+          {
+            // No aliasing: x -= (e1 - e2) distributes to x -= e1; x += e2.
+            op_executor<vector_base<T>, op_inplace_sub, LHS>::apply(lhs, proxy.lhs());
+            op_executor<vector_base<T>, op_inplace_add, RHS>::apply(lhs, proxy.rhs());
+          }
+        }
+
+        // x -= y - z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x -= alpha * y - z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  const vector_base<T>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x -= y / alpha - z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  const vector_base<T>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs(), T(1), 1, false, false);
+        }
+
+        // x -= y - beta * z
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x -= y - z / beta
+        template <typename ScalarType>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs(), T(1), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x -= alpha * y - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x -= alpha * y - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_mult>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, false, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+
+        // x -= y / alpha - beta * z
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_mult>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, false, false);
+        }
+
+        // x -= y / alpha - z / beta
+        template <typename ScalarType1, typename ScalarType2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const vector_base<T>, const ScalarType1, op_div>,
+                                                                  const vector_expression<const vector_base<T>, const ScalarType2, op_div>,
+                                                                  op_sub> const & proxy)
+        {
+          viennacl::linalg::avbv_v(lhs,
+                                   proxy.lhs().lhs(), proxy.lhs().rhs(), 1, true, true,
+                                   proxy.rhs().lhs(), proxy.rhs().rhs(), 1, true, false);
+        }
+      };
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+      //////////////////// Element-wise operations ////////////////////////////////////////
+
+      // generic x = vec_expr1 .* vec_expr2:
+      // Dispatcher for assigning an element-wise binary operation OP (e.g.
+      // element-wise product or division) to a vector. The four overloads cover
+      // all combinations of plain vectors vs. nested expressions; nested
+      // sub-expressions are evaluated into temporaries first, so the backend
+      // element_op() kernel only ever sees plain vector_base operands.
+      template <typename T, typename LHS, typename RHS, typename OP>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_element_binary<OP> > >
+      {
+        // x = y .* z  or  x = y ./ z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+        {
+          viennacl::linalg::element_op(lhs, proxy);
+        }
+
+        // x = y .* vec_expr  or  x = y ./ vec_expr
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.rhs());  // evaluate nested right-hand expression
+          viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(proxy.lhs(), temp));
+        }
+
+        // x = vec_expr .* z  or  x = vec_expr ./ z
+        template <typename LHS1, typename RHS1, typename OP1>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.lhs());  // evaluate nested left-hand expression
+          viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp, proxy.rhs()));
+        }
+
+        // x = vec_expr .* vec_expr  or  x = vec_expr ./ vec_expr
+        template <typename LHS1, typename RHS1, typename OP1,
+                  typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>,
+                                                                  const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp1(proxy.lhs());
+          vector<T> temp2(proxy.rhs());
+          viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp1, temp2));
+        }
+      };
+
+      // generic x += vec_expr1 .* vec_expr2:
+      // Accumulating variant of the element-wise dispatcher above: the
+      // element-wise result is first materialized in a temporary vector and
+      // then added to lhs via operator+=.
+      template <typename T, typename LHS, typename RHS, typename OP>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_element_binary<OP> > >
+      {
+        // x += y .* z  or  x += y ./ z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+        {
+          viennacl::vector<T> temp(proxy);
+          lhs += temp;
+        }
+
+        // x += y .* vec_expr  or  x += y ./ vec_expr
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_expression<const LHS2, const RHS2, OP2>,  op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.rhs());   // evaluate nested right-hand expression
+          vector<T> temp2(temp.size());  // receives the element-wise result
+          viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(proxy.lhs(), temp));
+          lhs += temp2;
+        }
+
+        // x += vec_expr .* z  or  x += vec_expr ./ z
+        template <typename LHS1, typename RHS1, typename OP1>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.lhs());   // evaluate nested left-hand expression
+          vector<T> temp2(temp.size());  // receives the element-wise result
+          viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp, proxy.rhs()));
+          lhs += temp2;
+        }
+
+        // x += vec_expr .* vec_expr  or  x += vec_expr ./ vec_expr
+        template <typename LHS1, typename RHS1, typename OP1,
+                  typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>,
+                                                                  const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp1(proxy.lhs());
+          vector<T> temp2(proxy.rhs());
+          vector<T> temp3(temp1.size());
+          viennacl::linalg::element_op(temp3, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp1, temp2));
+          lhs += temp3;
+        }
+      };
+
+      // generic x -= vec_expr1 .* vec_expr2:
+      // Subtracting variant: the element-wise result is materialized in a
+      // temporary vector and then subtracted from lhs via operator-=.
+      template <typename T, typename LHS, typename RHS, typename OP>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_element_binary<OP> > >
+      {
+
+        // x -= y .* z  or  x -= y ./ z
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+        {
+          viennacl::vector<T> temp(proxy);
+          lhs -= temp;
+        }
+
+        // x -= y .* vec_expr  or  x -= y ./ vec_expr
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_expression<const LHS2, const RHS2, OP2>, op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.rhs());   // evaluate nested right-hand expression
+          vector<T> temp2(temp.size());  // receives the element-wise result
+          viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(proxy.lhs(), temp));
+          lhs -= temp2;
+        }
+
+        // x -= vec_expr .* z  or  x -= vec_expr ./ z
+        template <typename LHS1, typename RHS1, typename OP1>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.lhs());   // evaluate nested left-hand expression
+          vector<T> temp2(temp.size());  // receives the element-wise result
+          viennacl::linalg::element_op(temp2, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp, proxy.rhs()));
+          lhs -= temp2;
+        }
+
+        // x -= vec_expr .* vec_expr  or  x -= vec_expr ./ vec_expr
+        template <typename LHS1, typename RHS1, typename OP1,
+                  typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS1, const RHS1, OP1>,
+                                                                  const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_binary<OP> > const & proxy)
+        {
+          vector<T> temp1(proxy.lhs());
+          vector<T> temp2(proxy.rhs());
+          vector<T> temp3(temp1.size());
+          viennacl::linalg::element_op(temp3, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> >(temp1, temp2));
+          lhs -= temp3;
+        }
+      };
+
+      //////////////// unary expressions
+
+      // Dispatch of x = OP(y) for element-wise unary operations.
+      // Note: as the nested-expression overload below shows, a unary
+      // vector_expression carries its operand in both the LHS and the RHS slot
+      // of the expression type; only proxy.rhs() is actually evaluated.
+      template <typename T, typename LHS, typename RHS, typename OP>
+      struct op_executor<vector_base<T>, op_assign, vector_expression<const LHS, const RHS, op_element_unary<OP> > >
+      {
+        // x = OP(y)
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+        {
+          viennacl::linalg::element_op(lhs, proxy);
+        }
+
+        // x = OP(vec_expr)
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_unary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.rhs());  // evaluate the nested expression once
+          viennacl::linalg::element_op(lhs, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> >(temp, temp));
+        }
+      };
+
+      // Dispatch of x += OP(y): the unary result is materialized in a
+      // temporary and then added to lhs.
+      template <typename T, typename LHS, typename RHS, typename OP>
+      struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const LHS, const RHS, op_element_unary<OP> > >
+      {
+        // x += OP(y)
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy);
+          lhs += temp;
+        }
+
+        // x += OP(vec_expr)
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_unary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.rhs());  // evaluate the nested expression once
+          viennacl::linalg::element_op(temp, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+          lhs += temp;
+        }
+      };
+
+      // Dispatch of x -= OP(y): the unary result is materialized in a
+      // temporary and then subtracted from lhs.
+      template <typename T, typename LHS, typename RHS, typename OP>
+      struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const LHS, const RHS, op_element_unary<OP> > >
+      {
+        // x -= OP(y)
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy);
+          lhs -= temp;
+        }
+
+        // x -= OP(vec_expr)
+        template <typename LHS2, typename RHS2, typename OP2>
+        static void apply(vector_base<T> & lhs, vector_expression<const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  const vector_expression<const LHS2, const RHS2, OP2>,
+                                                                  op_element_unary<OP> > const & proxy)
+        {
+          vector<T> temp(proxy.rhs());  // evaluate the nested expression once
+          viennacl::linalg::element_op(temp, viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> >(temp, temp)); // inplace operation is safe here
+          lhs -= temp;
+        }
+      };
+
+    } // namespace detail
+
+  } // namespace linalg
+
+  /** \endcond */
+
+} // namespace viennacl
+
+#endif
diff --git a/viennacl/vector_proxy.hpp b/viennacl/vector_proxy.hpp
index d138472..a7f2cfa 100644
--- a/viennacl/vector_proxy.hpp
+++ b/viennacl/vector_proxy.hpp
@@ -2,16 +2,17 @@
 #define VIENNACL_VECTOR_PROXY_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
+   Copyright (c) 2010-2014, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
 
                             -----------------
                   ViennaCL - The Vienna Computing Library
                             -----------------
 
    Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
-               
+
    (A list of authors and contributors can be found in the PDF manual)
 
    License:         MIT (X11), see file LICENSE in the base directory
@@ -23,128 +24,252 @@
 
 #include "viennacl/forwards.h"
 #include "viennacl/range.hpp"
+#include "viennacl/slice.hpp"
 #include "viennacl/vector.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
 
 namespace viennacl
 {
-
+  /** @brief Class for representing non-strided subvectors of a bigger vector x.
+    *
+    * In MATLAB notation, this could for example refer to the subvector x(3:8) of a vector x.
+    */
   template <typename VectorType>
-  class vector_range
+  class vector_range : public vector_base<typename VectorType::cpu_value_type>
   {
-      typedef vector_range<VectorType>            self_type;
-    
+      typedef vector_range<VectorType>             self_type;
+      typedef vector_base<typename VectorType::cpu_value_type> base_type;
+
     public:
-      typedef typename VectorType::value_type     value_type;
-      typedef range::size_type                    size_type;
-      typedef range::difference_type              difference_type;
-      typedef value_type                          reference;
-      typedef const value_type &                  const_reference;
-      
+      typedef typename VectorType::value_type      value_type;
+      typedef range::size_type                     size_type;
+      typedef range::difference_type               difference_type;
+      typedef value_type                           reference;
+      typedef const value_type &                   const_reference;
+      typedef typename VectorType::const_iterator  const_iterator;
+      typedef typename VectorType::iterator        iterator;
+
+      typedef typename VectorType::cpu_value_type    cpu_value_type;
+
       static const int alignment = VectorType::alignment;
-      
-      vector_range(VectorType & v, 
-                   range const & entry_range) : v_(v), entry_range_(entry_range) {}
-                   
-      size_type start() const { return entry_range_.start(); }
-      size_type size() const { return entry_range_.size(); }
-
-      template <typename LHS, typename RHS, typename OP>
-      self_type & operator = (const vector_expression< LHS,
-                                                       RHS,
-                                                       OP > & proxy) 
-      {
-        assert( false && "Not implemented!");
-        return *this;
-      }      
-      
-      self_type & operator += (self_type const & other)
-      {
-        viennacl::linalg::inplace_add(*this, other);
-        return *this;
-      }
-      
-
-      //const_reference operator()(size_type i, size_type j) const { return A_(start1() + i, start2() + i); }
-      //reference operator()(size_type i, size_type j) { return A_(start1() + i, start2() + i); }
-
-      VectorType & get() { return v_; }
-      const VectorType & get() const { return v_; }
-
-    private:
-      VectorType & v_;
-      range entry_range_;
+
+      vector_range(VectorType & v, range const & entry_range)
+       : base_type(v.handle(), entry_range.size(), v.start() + v.stride() * entry_range.start(), v.stride()) {}
+
+
+      using base_type::operator=;
+
   };
 
-  
-  template<typename VectorType>
-  std::ostream & operator<<(std::ostream & s, vector_range<VectorType> const & proxy)
-  {
-    typedef typename VectorType::value_type   ScalarType;
-    std::vector<ScalarType> temp(proxy.size());
-    viennacl::copy(proxy, temp);
-    
-    //instead of printing 'temp' directly, let's reuse the existing functionality for viennacl::vector. It certainly adds overhead, but printing a vector is typically not about performance...
-    VectorType temp2(temp.size());
-    viennacl::copy(temp, temp2);
-    s << temp2;
-    return s;
-  }
-  
-  
-  
-  
+
+
   /////////////////////////////////////////////////////////////
   ///////////////////////// CPU to GPU ////////////////////////
   /////////////////////////////////////////////////////////////
-  
-  //row_major:
+
   template <typename VectorType, typename SCALARTYPE>
   void copy(const VectorType & cpu_vector,
             vector_range<vector<SCALARTYPE> > & gpu_vector_range )
   {
-    assert(cpu_vector.end() - cpu_vector.begin() >= 0);
-    
+    assert(cpu_vector.end() - cpu_vector.begin() >= 0 && bool("Range must have nonnegative length!"));
+
     if (cpu_vector.end() - cpu_vector.begin() > 0)
     {
       //we require that the size of the gpu_vector is larger or equal to the cpu-size
       std::vector<SCALARTYPE> temp_buffer(cpu_vector.end() - cpu_vector.begin());
       std::copy(cpu_vector.begin(), cpu_vector.end(), temp_buffer.begin());
-      cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        gpu_vector_range.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_range.start(),
-                                        sizeof(SCALARTYPE)*temp_buffer.size(),
-                                        &(temp_buffer[0]), 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
+      viennacl::backend::memory_write(gpu_vector_range.handle(), sizeof(SCALARTYPE)*gpu_vector_range.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
     }
   }
-  
+
+
+  /** @brief Transfer from a cpu vector to a gpu vector range. Convenience wrapper for viennacl::linalg::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  *
+  * @param cpu_vec    A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
+  * @param gpu_vec    The gpu vector range.
+  */
+  template <typename CPUVECTOR, typename VectorType>
+  void fast_copy(const CPUVECTOR & cpu_vec, vector_range<VectorType> & gpu_vec)
+  {
+    // NOTE(review): presumably gpu_vec must provide at least as many entries
+    // as cpu_vec — no size check is performed here; verify against fast_copy.
+    viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
+  }
 
   /////////////////////////////////////////////////////////////
   ///////////////////////// GPU to CPU ////////////////////////
   /////////////////////////////////////////////////////////////
-  
 
-  template <typename VectorType, typename SCALARTYPE>
+
+  template <typename SCALARTYPE, typename VectorType>
   void copy(vector_range<vector<SCALARTYPE> > const & gpu_vector_range,
             VectorType & cpu_vector)
   {
-    assert(cpu_vector.end() - cpu_vector.begin() >= 0);
-    
+    assert(cpu_vector.end() - cpu_vector.begin() >= 0 && bool("Range must have nonnegative length!"));
+
     if (cpu_vector.end() > cpu_vector.begin())
     {
       std::vector<SCALARTYPE> temp_buffer(cpu_vector.end() - cpu_vector.begin());
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
-                                        gpu_vector_range.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_range.start(), 
-                                        sizeof(SCALARTYPE)*temp_buffer.size(),
-                                        &(temp_buffer[0]), 0, NULL, NULL);
-      VIENNACL_ERR_CHECK(err);
-      viennacl::ocl::get_queue().finish();
-      
+      viennacl::backend::memory_read(gpu_vector_range.handle(), sizeof(SCALARTYPE)*gpu_vector_range.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+
       //now copy entries to cpu_vec:
       std::copy(temp_buffer.begin(), temp_buffer.end(), cpu_vector.begin());
     }
   }
 
 
+  /** @brief Transfer from a GPU vector range to a CPU vector. Convenience wrapper for viennacl::linalg::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  *
+  * @param gpu_vec    A gpu vector range.
+  * @param cpu_vec    The cpu vector. Type requirements: Output iterator can be obtained via member function .begin()
+  */
+  template <typename VectorType, typename CPUVECTOR>
+  void fast_copy(vector_range< VectorType > const & gpu_vec,
+                 CPUVECTOR & cpu_vec )
+  {
+    // NOTE(review): cpu_vec must presumably already hold gpu_vec.size()
+    // entries, since only an output iterator is used — confirm with callers.
+    viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
+  }
+
+
+
+  //
+  // Convenience function
+  //
+  /** @brief Returns a vector_range proxy referring to the entries vec[r1.start()] .. vec[r1.start()+r1.size()-1]. */
+  template <typename VectorType>
+  vector_range<VectorType> project(VectorType & vec, viennacl::range const & r1)
+  {
+    return vector_range<VectorType>(vec, r1);
+  }
+
+  /** @brief Returns a vector_range proxy for a sub-range of an existing vector_range (range of a range). */
+  template <typename VectorType>
+  vector_range<VectorType> project(viennacl::vector_range<VectorType> & vec, viennacl::range const & r1)
+  {
+    assert(r1.size() <= vec.size() && bool("Size of range invalid!"));
+    // NOTE(review): the range passed on already includes vec.start(), while the
+    // vector_range constructor adds v.start() to the range start as well —
+    // verify the offset is not applied twice for nested ranges.
+    return vector_range<VectorType>(vec, viennacl::range(vec.start() + r1.start(), vec.start() + r1.start() + r1.size()));
+  }
+
+//
+//
+//
+/////////////////////////////// Slice /////////////////////////////////////////////
+//
+//
+//
+
+
+
+  /** @brief Class for representing strided subvectors of a bigger vector x.
+    *
+    * In MATLAB notation, this could for example refer to the subvector x(3:2:8) of a vector x.
+    */
+  template <typename VectorType>
+  class vector_slice : public vector_base<typename VectorType::cpu_value_type>
+  {
+      typedef vector_slice<VectorType>             self_type;
+      typedef vector_base<typename VectorType::cpu_value_type> base_type;
+
+    public:
+      typedef typename VectorType::value_type      value_type;
+      typedef slice::size_type                     size_type;
+      typedef slice::difference_type               difference_type;
+      typedef value_type                           reference;
+      typedef const value_type &                   const_reference;
+      typedef typename VectorType::const_iterator  const_iterator;
+      typedef typename VectorType::iterator        iterator;
+
+      typedef typename VectorType::cpu_value_type  cpu_value_type;
+
+      static const int alignment = VectorType::alignment;
+
+      // The proxy shares v's memory buffer and composes with any striding
+      // already present in v:
+      //   start  = v.start() + v.stride() * entry_slice.start()
+      //   stride = v.stride() * entry_slice.stride()
+      vector_slice(VectorType & v, slice const & entry_slice)
+          : base_type(v.handle(), entry_slice.size(), v.start() + v.stride() * entry_slice.start(), v.stride() * entry_slice.stride()) {}
+
+
+      // Inherit all assignment operators from vector_base so that vectors and
+      // vector expressions can be assigned directly to the slice.
+      using base_type::operator=;
+
+  };
+
+
+  /////////////////////////////////////////////////////////////
+  ///////////////////////// CPU to GPU ////////////////////////
+  /////////////////////////////////////////////////////////////
+
+  /** @brief Transfer from a host vector to a strided GPU vector slice. */
+  template <typename VectorType, typename SCALARTYPE>
+  void copy(const VectorType & cpu_vector,
+            vector_slice<vector<SCALARTYPE> > & gpu_vector_slice )
+  {
+    if (cpu_vector.size() > 0)
+    {
+      // Read-modify-write: fetch the contiguous region covering the slice from
+      // the GPU, overwrite only the strided entries belonging to the slice,
+      // then write the whole region back. This preserves the GPU entries that
+      // lie between consecutive slice elements.
+      std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
+
+      viennacl::backend::memory_read(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+
+      for (vcl_size_t i=0; i<cpu_vector.size(); ++i)
+        temp_buffer[i * gpu_vector_slice.stride()] = cpu_vector[i];
+
+      viennacl::backend::memory_write(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+    }
+  }
+
+
+
+  /////////////////////////////////////////////////////////////
+  ///////////////////////// GPU to CPU ////////////////////////
+  /////////////////////////////////////////////////////////////
+
+
+  /** @brief Transfer from a strided GPU vector slice to a host vector. */
+  template <typename VectorType, typename SCALARTYPE>
+  void copy(vector_slice<vector<SCALARTYPE> > const & gpu_vector_slice,
+            VectorType & cpu_vector)
+  {
+    assert(gpu_vector_slice.end() - gpu_vector_slice.begin() >= 0 && bool("Range must have nonnegative length!"));
+
+    if (gpu_vector_slice.end() - gpu_vector_slice.begin() > 0)
+    {
+      // Read the contiguous region covering the slice in one transfer, then
+      // gather every stride-th entry into the host vector.
+      std::vector<SCALARTYPE> temp_buffer(gpu_vector_slice.stride() * gpu_vector_slice.size());
+      viennacl::backend::memory_read(gpu_vector_slice.handle(), sizeof(SCALARTYPE)*gpu_vector_slice.start(), sizeof(SCALARTYPE)*temp_buffer.size(), &(temp_buffer[0]));
+
+      for (vcl_size_t i=0; i<cpu_vector.size(); ++i)
+        cpu_vector[i] = temp_buffer[i * gpu_vector_slice.stride()];
+    }
+  }
+
+
+
+
+
+  //
+  // Convenience functions
+  //
+  /** @brief Returns a vector_slice proxy for the strided selection s1 of vec. */
+  template <typename VectorType>
+  vector_slice<VectorType> project(VectorType & vec, viennacl::slice const & s1)
+  {
+    assert(s1.size() <= vec.size() && bool("Size of slice larger than vector size!"));
+    return vector_slice<VectorType>(vec, s1);
+  }
+
+  /** @brief Returns a vector_slice proxy for a slice of an existing vector_slice (strides multiply). */
+  template <typename VectorType>
+  vector_slice<VectorType> project(viennacl::vector_slice<VectorType> & vec, viennacl::slice const & s1)
+  {
+    assert(s1.size() <= vec.size() && bool("Size of slice larger than vector proxy!"));
+    // NOTE(review): the slice passed on already includes vec.start() and
+    // vec.stride(), while the vector_slice constructor applies v.start() and
+    // v.stride() again — verify the composition is not applied twice.
+    return vector_slice<VectorType>(vec, viennacl::slice(vec.start() + s1.start(), vec.stride() * s1.stride(), s1.size()));
+  }
+
+  // interaction with range and vector_range:
+
+  /** @brief Returns a vector_slice proxy for a contiguous sub-range of an existing vector_slice (keeps vec's stride). */
+  template <typename VectorType>
+  vector_slice<VectorType> project(viennacl::vector_slice<VectorType> & vec, viennacl::range const & r1)
+  {
+    assert(r1.size() <= vec.size() && bool("Size of slice larger than vector proxy!"));
+    // NOTE(review): as with the other proxy-of-proxy overloads, the start and
+    // stride already folded into the slice here may be applied again by the
+    // vector_slice constructor — verify.
+    return vector_slice<VectorType>(vec, viennacl::slice(vec.start() + r1.start(), vec.stride(), r1.size()));
+  }
+
+  /** @brief Returns a vector_slice proxy for a strided sub-selection of an existing vector_range.
+    *
+    * @param vec  The vector_range to select from.
+    * @param s1   Slice (start, stride, size) relative to vec.
+    */
+  template <typename VectorType>
+  vector_slice<VectorType> project(viennacl::vector_range<VectorType> & vec, viennacl::slice const & s1)
+  {
+    assert(s1.size() <= vec.size() && bool("Size of slice larger than vector proxy!"));
+    // Bug fix: the three arguments (start, stride, size) describe a slice, not
+    // a range — viennacl::range has no stride parameter, so the original
+    // viennacl::range(...) call could not express the intended strided
+    // selection (and does not match any range constructor).
+    return vector_slice<VectorType>(vec, viennacl::slice(vec.start() + s1.start(), s1.stride(), s1.size()));
+  }
+
+
 }
 
-#endif
\ No newline at end of file
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/viennacl.git



More information about the debian-science-commits mailing list